| distributed init (rank 0): env://, gpu 0
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 7): env://, gpu 7
Namespace(batch_size=128, epochs=300, update_freq=4, model='nano', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.2, cutmix=0.3, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_nano_1.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f7523e0b190>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(16, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
          (norm): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(32, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.003)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(32, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.006)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(32, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.008)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(64, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(64, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.014)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(64, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.017)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(64, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.019)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(64, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.022)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.025)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.028)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.031)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.033)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.036)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.039)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.042)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(128, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.044)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.047)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.050)
    )
  )
  (head): ConvX(
    (conv): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 13409964
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 4:11:22  lr: 0.000000  min_lr: 0.000000  loss: 6.9301 (6.9301)  weight_decay: 0.0500 (0.0500)  time: 12.0566  data: 3.2497  max mem: 18117
Epoch: [0]  [ 200/1251]  eta: 0:05:15  lr: 0.000032  min_lr: 0.000032  loss: 6.9460 (6.9550)  weight_decay: 0.0500 (0.0500)  grad_norm: 26.2354 (nan)  time: 0.2404  data: 0.0005  max mem: 18117
Epoch: [0]  [ 400/1251]  eta: 0:03:50  lr: 0.000064  min_lr: 0.000064  loss: 6.8136 (6.9187)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.2570 (nan)  time: 0.2398  data: 0.0005  max mem: 18117
Epoch: [0]  [ 600/1251]  eta: 0:02:49  lr: 0.000096  min_lr: 0.000096  loss: 6.6431 (6.8573)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3731 (nan)  time: 0.2398  data: 0.0006  max mem: 18117
Epoch: [0]  [ 800/1251]  eta: 0:01:55  lr: 0.000128  min_lr: 0.000128  loss: 6.5325 (6.7934)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.8111 (nan)  time: 0.2417  data: 0.0006  max mem: 18117
Epoch: [0]  [1000/1251]  eta: 0:01:03  lr: 0.000160  min_lr: 0.000160  loss: 6.3428 (6.7297)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.5060 (nan)  time: 0.2395  data: 0.0006  max mem: 18117
Epoch: [0]  [1200/1251]  eta: 0:00:12  lr: 0.000192  min_lr: 0.000192  loss: 6.3392 (6.6633)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3329 (nan)  time: 0.2399  data: 0.0005  max mem: 18117
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.1574 (6.6488)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7587 (nan)  time: 0.1967  data: 0.0007  max mem: 18117
Epoch: [0] Total time: 0:05:12 (0.2497 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.1574 (6.6504)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7587 (nan)
Test:  [ 0/25]  eta: 0:04:36  loss: 5.3009 (5.3009)  acc1: 8.0000 (8.0000)  acc5: 22.4000 (22.4000)  time: 11.0698  data: 7.2500  max mem: 18117
Test:  [10/25]  eta: 0:00:16  loss: 5.3009 (5.3135)  acc1: 4.0000 (4.9091)  acc5: 18.0000 (18.1818)  time: 1.1048  data: 0.6593  max mem: 18117
Test:  [20/25]  eta: 0:00:03  loss: 5.4644 (5.4304)  acc1: 4.4000 (5.2952)  acc5: 17.2000 (17.5238)  time: 0.1082  data: 0.0002  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 5.5096 (5.3931)  acc1: 5.2000 (5.9520)  acc5: 17.2000 (18.6560)  time: 0.1082  data: 0.0001  max mem: 18117
Test: Total time: 0:00:13 (0.5499 s / it)
* Acc@1 6.308 Acc@5 18.516 loss 5.396
Accuracy of the model on the 50000 test images: 6.3%
Max accuracy: 6.31%
Epoch: [1]  [   0/1251]  eta: 1:01:45  lr: 0.000200  min_lr: 0.000200  loss: 6.5118 (6.5118)  weight_decay: 0.0500 (0.0500)  time: 2.9617  data: 1.8091  max mem: 18117
Epoch: [1]  [ 200/1251]  eta: 0:04:25  lr: 0.000232  min_lr: 0.000232  loss: 6.0750 (6.2253)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1690 (4.4738)  time: 0.2416  data: 0.0005  max mem: 18117
Epoch: [1]  [ 400/1251]  eta: 0:03:29  lr: 0.000264  min_lr: 0.000264  loss: 5.8774 (6.1727)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1022 (4.4080)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [1]  [ 600/1251]  eta: 0:02:38  lr: 0.000296  min_lr: 0.000296  loss: 5.9245 (6.1242)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3660 (4.4092)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [1]  [ 800/1251]  eta: 0:01:49  lr: 0.000328  min_lr: 0.000328  loss: 5.7696 (6.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2330 (4.4059)  time: 0.2403  data: 0.0003  max mem: 18117
Epoch: [1]  [1000/1251]  eta: 0:01:00  lr: 0.000360  min_lr: 0.000360  loss: 5.4293 (6.0346)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1883 (4.4142)  time: 0.2341  data: 0.0004  max mem: 18117
Epoch: [1]  [1200/1251]  eta: 0:00:12  lr: 0.000392  min_lr: 0.000392  loss: 5.7223 (5.9899)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2680 (4.4113)  time: 0.2413  data: 0.0005  max mem: 18117
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 5.4390 (5.9786)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2485 (4.3982)  time: 0.1954  data: 0.0005  max mem: 18117
Epoch: [1] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 5.4390 (5.9699)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2485 (4.3982)
Test:  [ 0/25]  eta: 0:02:04  loss: 4.0290 (4.0290)  acc1: 16.8000 (16.8000)  acc5: 46.0000 (46.0000)  time: 4.9645  data: 4.8078  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 3.8153 (3.9393)  acc1: 19.6000 (20.0727)  acc5: 43.6000 (44.1091)  time: 0.6563  data: 0.5326  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 4.3140 (4.1684)  acc1: 17.6000 (18.6095)  acc5: 38.0000 (40.6095)  time: 0.2013  data: 0.0870  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 4.3298 (4.1393)  acc1: 18.0000 (19.2160)  acc5: 38.0000 (41.2480)  time: 0.2292  data: 0.1180  max mem: 18117
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 19.062 Acc@5 41.376 loss 4.133
Accuracy of the model on the 50000 test images: 19.1%
Max accuracy: 19.06%
Epoch: [2]  [   0/1251]  eta: 1:02:17  lr: 0.000400  min_lr: 0.000400  loss: 5.1551 (5.1551)  weight_decay: 0.0500 (0.0500)  time: 2.9879  data: 2.6963  max mem: 18117
Epoch: [2]  [ 200/1251]  eta: 0:04:26  lr: 0.000432  min_lr: 0.000432  loss: 5.5320 (5.6293)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9954 (4.2236)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [2]  [ 400/1251]  eta: 0:03:29  lr: 0.000464  min_lr: 0.000464  loss: 5.3896 (5.6233)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.4228 (4.1979)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [2]  [ 600/1251]  eta: 0:02:37  lr: 0.000496  min_lr: 0.000496  loss: 5.8175 (5.5794)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0735 (4.2727)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [2]  [ 800/1251]  eta: 0:01:48  lr: 0.000528  min_lr: 0.000528  loss: 5.1068 (5.5384)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8140 (4.1794)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [2]  [1000/1251]  eta: 0:01:00  lr: 0.000560  min_lr: 0.000560  loss: 4.8695 (5.4951)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9983 (4.1322)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [2]  [1200/1251]  eta: 0:00:12  lr: 0.000592  min_lr: 0.000592  loss: 5.3680 (5.4745)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3143 (4.1393)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 4.7778 (5.4618)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7826 (4.1305)  time: 0.1956  data: 0.0005  max mem: 18117
Epoch: [2] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 4.7778 (5.4690)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7826 (4.1305)
Test:  [ 0/25]  eta: 0:02:29  loss: 3.1610 (3.1610)  acc1: 35.6000 (35.6000)  acc5: 61.2000 (61.2000)  time: 5.9826  data: 5.8555  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 3.0654 (3.1231)  acc1: 35.6000 (34.1091)  acc5: 64.0000 (62.3273)  time: 0.7686  data: 0.6546  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 3.5592 (3.3863)  acc1: 30.0000 (31.5429)  acc5: 53.6000 (56.9524)  time: 0.1948  data: 0.0845  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 3.5701 (3.3737)  acc1: 30.0000 (31.8880)  acc5: 53.6000 (57.2800)  time: 0.1924  data: 0.0844  max mem: 18117
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 31.068 Acc@5 57.172 loss 3.370
Accuracy of the model on the 50000 test images: 31.1%
Max accuracy: 31.07%
Epoch: [3]  [   0/1251]  eta: 1:14:10  lr: 0.000600  min_lr: 0.000600  loss: 5.2757 (5.2757)  weight_decay: 0.0500 (0.0500)  time: 3.5575  data: 3.3093  max mem: 18117
Epoch: [3]  [ 200/1251]  eta: 0:04:29  lr: 0.000632  min_lr: 0.000632  loss: 4.9805 (5.2311)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8403 (4.2292)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [3]  [ 400/1251]  eta: 0:03:31  lr: 0.000664  min_lr: 0.000664  loss: 4.6533 (5.2134)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8176 (4.0552)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [3]  [ 600/1251]  eta: 0:02:39  lr: 0.000696  min_lr: 0.000696  loss: 5.2732 (5.1767)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5268 (3.9825)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [3]  [ 800/1251]  eta: 0:01:49  lr: 0.000728  min_lr: 0.000728  loss: 4.9888 (5.1477)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1482 (3.8898)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [3]  [1000/1251]  eta: 0:01:00  lr: 0.000760  min_lr: 0.000760  loss: 4.7904 (5.1195)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6276 (3.8337)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [3]  [1200/1251]  eta: 0:00:12  lr: 0.000792  min_lr: 0.000792  loss: 4.7148 (5.1027)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6867 (3.8443)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.2786 (5.0996)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3050 (3.8322)  time: 0.1964  data: 0.0006  max mem: 18117
Epoch: [3] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.2786 (5.1063)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3050 (3.8322)
Test:  [ 0/25]  eta: 0:02:13  loss: 2.5059 (2.5059)  acc1: 47.2000 (47.2000)  acc5: 69.6000 (69.6000)  time: 5.3317  data: 5.1714  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 2.4526 (2.5777)  acc1: 46.4000 (43.2000)  acc5: 70.4000 (71.0182)  time: 0.7542  data: 0.6375  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 3.0726 (2.8973)  acc1: 33.6000 (38.3619)  acc5: 61.6000 (65.4095)  time: 0.2298  data: 0.1185  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 3.1532 (2.9009)  acc1: 33.6000 (38.7360)  acc5: 61.2000 (65.1680)  time: 0.2292  data: 0.1182  max mem: 18117
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 39.056 Acc@5 65.726 loss 2.876
Accuracy of the model on the 50000 test images: 39.1%
Max accuracy: 39.06%
Epoch: [4]  [   0/1251]  eta: 1:04:03  lr: 0.000800  min_lr: 0.000800  loss: 3.9435 (3.9435)  weight_decay: 0.0500 (0.0500)  time: 3.0724  data: 2.7154  max mem: 18117
Epoch: [4]  [ 200/1251]  eta: 0:04:26  lr: 0.000832  min_lr: 0.000832  loss: 4.7406 (5.0161)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4415 (3.6337)  time: 0.2374  data: 0.0003  max mem: 18117
Epoch: [4]  [ 400/1251]  eta: 0:03:29  lr: 0.000864  min_lr: 0.000864  loss: 4.6348 (4.8907)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8438 (3.5945)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [4]  [ 600/1251]  eta: 0:02:38  lr: 0.000896  min_lr: 0.000896  loss: 4.8662 (4.8918)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2831 (3.6080)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [4]  [ 800/1251]  eta: 0:01:49  lr: 0.000928  min_lr: 0.000928  loss: 4.9629 (4.8871)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1989 (3.5935)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [4]  [1000/1251]  eta: 0:01:00  lr: 0.000960  min_lr: 0.000960  loss: 4.9961 (4.8629)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4120 (3.5346)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [4]  [1200/1251]  eta: 0:00:12  lr: 0.000992  min_lr: 0.000992  loss: 4.2598 (4.8346)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1674 (3.5251)  time: 0.2434  data: 0.0004  max mem: 18117
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 4.8602 (4.8297)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0042 (3.5096)  time: 0.1957  data: 0.0007  max mem: 18117
Epoch: [4] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 4.8602 (4.8364)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0042 (3.5096)
Test:  [ 0/25]  eta: 0:02:19  loss: 2.2143 (2.2143)  acc1: 52.0000 (52.0000)  acc5: 77.6000 (77.6000)  time: 5.5857  data: 5.4604  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 2.2367 (2.3010)  acc1: 50.0000 (49.6000)  acc5: 77.6000 (77.2364)  time: 0.7163  data: 0.6046  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 2.7889 (2.5908)  acc1: 40.8000 (44.5905)  acc5: 69.6000 (71.5810)  time: 0.1893  data: 0.0800  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 2.8299 (2.5829)  acc1: 40.8000 (44.7520)  acc5: 67.6000 (71.6640)  time: 0.1932  data: 0.0846  max mem: 18117
Test: Total time: 0:00:10 (0.4002 s / it)
* Acc@1 45.120 Acc@5 72.006 loss 2.567
Accuracy of the model on the 50000 test images: 45.1%
Max accuracy: 45.12%
Epoch: [5]  [   0/1251]  eta: 0:55:16  lr: 0.001000  min_lr: 0.001000  loss: 5.7534 (5.7534)  weight_decay: 0.0500 (0.0500)  time: 2.6513  data: 2.2866  max mem: 18117
Epoch: [5]  [ 200/1251]  eta: 0:04:24  lr: 0.001032  min_lr: 0.001032  loss: 4.8340 (4.6573)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3390 (3.3384)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [5]  [ 400/1251]  eta: 0:03:28  lr: 0.001064  min_lr: 0.001064  loss: 5.3072 (4.7355)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7131 (3.1606)  time: 0.2410  data: 0.0003  max mem: 18117
Epoch: [5]  [ 600/1251]  eta: 0:02:38  lr: 0.001096  min_lr: 0.001096  loss: 4.6448 (4.7105)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1331 (3.1257)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [5]  [ 800/1251]  eta: 0:01:49  lr: 0.001128  min_lr: 0.001128  loss: 4.3553 (4.6961)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3735 (3.1165)  time: 0.2373  data: 0.0005  max mem: 18117
Epoch: [5]  [1000/1251]  eta: 0:01:00  lr: 0.001160  min_lr: 0.001160  loss: 3.8874 (4.6785)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5952 (3.0851)  time: 0.2369  data: 0.0003  max mem: 18117
Epoch: [5]  [1200/1251]  eta: 0:00:12  lr: 0.001192  min_lr: 0.001192  loss: 4.9503 (4.6618)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4529 (3.0504)  time: 0.2384  data: 0.0005  max mem: 18117
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 5.2296 (4.6668)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2096 (3.0402)  time: 0.1957  data: 0.0007  max mem: 18117
Epoch: [5] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 5.2296 (4.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2096 (3.0402)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.9271 (1.9271)  acc1: 62.0000 (62.0000)  acc5: 82.4000 (82.4000)  time: 5.5857  data: 5.4611  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 2.0145 (2.0973)  acc1: 56.4000 (55.6727)  acc5: 82.4000 (81.3455)  time: 0.7564  data: 0.6429  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 2.5090 (2.4341)  acc1: 45.6000 (49.6952)  acc5: 71.2000 (75.5429)  time: 0.2155  data: 0.1041  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 2.7073 (2.4385)  acc1: 45.6000 (49.6480)  acc5: 70.8000 (75.3920)  time: 0.2138  data: 0.1040  max mem: 18117
Test: Total time: 0:00:10 (0.4162 s / it)
* Acc@1 49.336 Acc@5 75.392 loss 2.432
Accuracy of the model on the 50000 test images: 49.3%
Max accuracy: 49.34%
Epoch: [6]  [   0/1251]  eta: 1:09:11  lr: 0.001200  min_lr: 0.001200  loss: 5.3721 (5.3721)  weight_decay: 0.0500 (0.0500)  time: 3.3188  data: 3.0429  max mem: 18117
Epoch: [6]  [ 200/1251]  eta: 0:04:24  lr: 0.001232  min_lr: 0.001232  loss: 3.9038 (4.4911)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6913 (2.9218)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [6]  [ 400/1251]  eta: 0:03:28  lr: 0.001264  min_lr: 0.001264  loss: 4.4220 (4.5012)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6891 (2.8726)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [6]  [ 600/1251]  eta: 0:02:38  lr: 0.001296  min_lr: 0.001296  loss: 4.8802 (4.5169)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6714 (2.8419)  time: 0.2443  data: 0.0004  max mem: 18117
Epoch: [6]  [ 800/1251]  eta: 0:01:49  lr: 0.001328  min_lr: 0.001328  loss: 4.5188 (4.5108)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4279 (2.7835)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [6]  [1000/1251]  eta: 0:01:00  lr: 0.001360  min_lr: 0.001360  loss: 4.2721 (4.4944)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2189 (2.7491)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [6]  [1200/1251]  eta: 0:00:12  lr: 0.001393  min_lr: 0.001393  loss: 3.9670 (4.4781)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3272 (2.7448)  time: 0.2431  data: 0.0004  max mem: 18117
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 5.0189 (4.4842)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1795 (2.7229)  time: 0.1955  data: 0.0006  max mem: 18117
Epoch: [6] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 5.0189 (4.4852)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1795 (2.7229)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.6130 (1.6130)  acc1: 67.2000 (67.2000)  acc5: 87.2000 (87.2000)  time: 5.7059  data: 5.5794  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.8097 (1.8781)  acc1: 62.0000 (59.4182)  acc5: 85.6000 (84.3636)  time: 0.7447  data: 0.6322  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 2.3025 (2.1823)  acc1: 49.6000 (53.7524)  acc5: 74.4000 (78.7429)  time: 0.2011  data: 0.0915  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 2.4186 (2.1851)  acc1: 49.6000 (53.3440)  acc5: 73.6000 (78.4960)  time: 0.1997  data: 0.0915  max mem: 18117
Test: Total time: 0:00:10 (0.4098 s / it)
* Acc@1 53.296 Acc@5 78.588 loss 2.179
Accuracy of the model on the 50000 test images: 53.3%
Max accuracy: 53.30%
Epoch: [7]  [   0/1251]  eta: 1:03:43  lr: 0.001400  min_lr: 0.001400  loss: 3.8184 (3.8184)  weight_decay: 0.0500 (0.0500)  time: 3.0565  data: 2.7555  max mem: 18117
Epoch: [7]  [ 200/1251]  eta: 0:04:26  lr: 0.001432  min_lr: 0.001432  loss: 4.5829 (4.3458)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4312 (2.6132)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [7]  [ 400/1251]  eta: 0:03:29  lr: 0.001464  min_lr: 0.001464  loss: 4.6277 (4.3718)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1023 (2.5635)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [7]  [ 600/1251]  eta: 0:02:38  lr: 0.001496  min_lr: 0.001496  loss: 4.6799 (4.3662)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1134 (2.4992)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [7]  [ 800/1251]  eta: 0:01:49  lr: 0.001528  min_lr: 0.001528  loss: 3.8760 (4.3560)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1566 (2.4784)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [7]  [1000/1251]  eta: 0:01:00  lr: 0.001561  min_lr: 0.001561  loss: 5.1612 (4.3473)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3963 (2.4772)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [7]  [1200/1251]  eta: 0:00:12  lr: 0.001593  min_lr: 0.001593  loss: 4.4734 (4.3383)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3085 (2.4668)  time: 0.2382  data: 0.0005  max mem: 18117
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 4.5967 (4.3398)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0963 (2.4457)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [7] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 4.5967 (4.3508)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0963 (2.4457)
Test:  [ 0/25]  eta: 0:02:26  loss: 1.5855 (1.5855)  acc1: 70.0000 (70.0000)  acc5: 86.0000 (86.0000)  time: 5.8757  data: 5.7242  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.6162 (1.7546)  acc1: 58.4000 (61.1273)  acc5: 88.4000 (85.5636)  time: 0.7126  data: 0.5986  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 2.1484 (2.0747)  acc1: 51.6000 (55.9619)  acc5: 77.2000 (80.4952)  time: 0.1798  data: 0.0706  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 2.3448 (2.0874)  acc1: 51.2000 (55.7600)  acc5: 76.0000 (80.1440)  time: 0.1957  data: 0.0866  max mem: 18117
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 56.168 Acc@5 80.426 loss 2.084
Accuracy of the model on the 50000 test images: 56.2%
Max accuracy: 56.17%
Epoch: [8]  [   0/1251]  eta: 1:00:30  lr: 0.001600  min_lr: 0.001600  loss: 4.0193 (4.0193)  weight_decay: 0.0500 (0.0500)  time: 2.9023  data: 2.5708  max mem: 18117
Epoch: [8]  [ 200/1251]  eta: 0:04:25  lr: 0.001632  min_lr: 0.001632  loss: 3.9414 (4.2566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9250 (2.0782)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [8]  [ 400/1251]  eta: 0:03:29  lr: 0.001664  min_lr: 0.001664  loss: 3.6739 (4.2498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9402 (2.1227)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [8]  [ 600/1251]  eta: 0:02:38  lr: 0.001696  min_lr: 0.001696  loss: 3.4928 (4.2373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9593 (2.1954)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [8]  [ 800/1251]  eta: 0:01:49  lr: 0.001728  min_lr: 0.001728  loss: 4.1537 (4.2187)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3049 (2.2127)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [8]  [1000/1251]  eta: 0:01:00  lr: 0.001761  min_lr: 0.001761  loss: 4.8219 (4.2312)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4171 (2.2297)  time: 0.2413  data: 0.0004  max mem: 18117
Epoch: [8]  [1200/1251]  eta: 0:00:12  lr: 0.001793  min_lr: 0.001793  loss: 3.6967 (4.2275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9140 (2.1777)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 3.6429 (4.2261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9932 (2.1738)  time: 0.1963  data: 0.0005  max mem: 18117
Epoch: [8] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 3.6429 (4.2643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9932 (2.1738)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.4737 (1.4737)  acc1: 72.0000 (72.0000)  acc5: 88.8000 (88.8000)  time: 5.7862  data: 5.6420  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.5713 (1.6566)  acc1: 65.2000 (64.1818)  acc5: 89.6000 (87.6364)  time: 0.7516  data: 0.6379  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 2.0615 (1.9636)  acc1: 53.6000 (58.5524)  acc5: 80.4000 (82.5714)  time: 0.1973  data: 0.0876  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 2.1553 (1.9591)  acc1: 54.0000 (58.4960)  acc5: 78.0000 (82.3680)  time: 0.2008  data: 0.0914  max mem: 18117
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 58.394 Acc@5 82.086 loss 1.965
Accuracy of the model on the 50000 test images: 58.4%
Max accuracy: 58.39%
Epoch: [9]  [   0/1251]  eta: 0:53:04  lr: 0.001800  min_lr: 0.001800  loss: 4.3430 (4.3430)  weight_decay: 0.0500 (0.0500)  time: 2.5456  data: 1.9588  max mem: 18117
Epoch: [9]  [ 200/1251]  eta: 0:04:24  lr: 0.001832  min_lr: 0.001832  loss: 3.7696 (4.2369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7118 (1.9566)  time: 0.2384  data: 0.0003  max mem: 18117
Epoch: [9]  [ 400/1251]  eta: 0:03:28  lr: 0.001864  min_lr: 0.001864  loss: 3.9725 (4.1966)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0308 (2.0281)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [9]  [ 600/1251]  eta: 0:02:37  lr: 0.001896  min_lr: 0.001896  loss: 3.9253 (4.2169)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0283 (2.0844)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [9]  [ 800/1251]  eta: 0:01:49  lr: 0.001929  min_lr: 0.001929  loss: 3.9107 (4.2097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6900 (2.0343)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [9]  [1000/1251]  eta: 0:01:00  lr: 0.001961  min_lr: 0.001961  loss: 3.4404 (4.1913)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5710 (1.9874)  time: 0.2546  data: 0.0005  max mem: 18117
Epoch: [9]  [1200/1251]  eta: 0:00:12  lr: 0.001993  min_lr: 0.001993  loss: 4.0242 (4.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6822 (1.9652)  time: 0.2408  data: 0.0004  max mem: 18117
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 4.5907 (4.1945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5296 (1.9525)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [9] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 4.5907 (4.1788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5296 (1.9525)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.5475 (1.5475)  acc1: 72.4000 (72.4000)  acc5: 87.6000 (87.6000)  time: 5.6714  data: 5.5241  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.5475 (1.6845)  acc1: 66.0000 (65.5273)  acc5: 89.6000 (87.7091)  time: 0.7635  data: 0.6490  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 2.0158 (1.9437)  acc1: 55.6000 (59.7905)  acc5: 82.0000 (82.8381)  time: 0.2205  data: 0.1086  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 2.0904 (1.9443)  acc1: 54.4000 (59.7280)  acc5: 78.8000 (82.7680)  time: 0.2196  data: 0.1085  max mem: 18117
Test: Total time: 0:00:10 (0.4243 s / it)
* Acc@1 59.656 Acc@5 82.950 loss 1.945
Accuracy of the model on the 50000 test images: 59.7%
Max accuracy: 59.66%
Epoch: [10]  [   0/1251]  eta: 1:00:02  lr: 0.002000  min_lr: 0.002000  loss: 3.1442 (3.1442)  weight_decay: 0.0500 (0.0500)  time: 2.8794  data: 2.5593  max mem: 18117
Epoch: [10]  [ 200/1251]  eta: 0:04:24  lr: 0.002032  min_lr: 0.002032  loss: 3.4928 (4.1880)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7952 (1.9321)  time: 0.2375  data: 0.0005  max mem: 18117
Epoch: [10]  [ 400/1251]  eta: 0:03:28  lr: 0.002064  min_lr: 0.002064  loss: 4.5012 (4.1555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6589 (1.9151)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [10]  [ 600/1251]  eta: 0:02:38  lr: 0.002096  min_lr: 0.002096  loss: 4.1122 (4.1509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6262 (1.8046)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [10]  [ 800/1251]  eta: 0:01:49  lr: 0.002129  min_lr: 0.002129  loss: 3.4454 (4.1703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5147 (1.7799)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [10]  [1000/1251]  eta: 0:01:00  lr: 0.002161  min_lr: 0.002161  loss: 3.8581 (4.1777)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7613 (1.7735)  time: 0.2439  data: 0.0003  max mem: 18117
Epoch: [10]  [1200/1251]  eta: 0:00:12  lr: 0.002193  min_lr: 0.002193  loss: 4.0659 (4.1624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5723 (1.7616)  time: 0.2362  data: 0.0003  max mem: 18117
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 3.7400 (4.1539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5732 (1.7570)  time: 0.1958  data: 0.0005  max mem: 18117
Epoch: [10] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 3.7400 (4.1123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5732 (1.7570)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.4084 (1.4084)  acc1: 72.8000 (72.8000)  acc5: 89.2000 (89.2000)  time: 5.7625  data: 5.6104  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.4551 (1.4859)  acc1: 67.2000 (67.9273)  acc5: 90.0000 (89.2000)  time: 0.7479  data: 0.6337  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.8133 (1.7724)  acc1: 58.0000 (62.0571)  acc5: 82.0000 (84.7619)  time: 0.2013  data: 0.0921  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.9785 (1.7758)  acc1: 57.6000 (61.8720)  acc5: 80.8000 (84.5440)  time: 0.2010  data: 0.0920  max mem: 18117
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 61.212 Acc@5 84.342 loss 1.783
Accuracy of the model on the 50000 test images: 61.2%
Max accuracy: 61.21%
Epoch: [11]  [   0/1251]  eta: 1:02:29  lr: 0.002200  min_lr: 0.002200  loss: 5.0987 (5.0987)  weight_decay: 0.0500 (0.0500)  time: 2.9975  data: 2.7119  max mem: 18117
Epoch: [11]  [ 200/1251]  eta: 0:04:24  lr: 0.002232  min_lr: 0.002232  loss: 3.3974 (4.1332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7825 (1.8567)  time: 0.2389  data: 0.0005  max mem: 18117
Epoch: [11]  [ 400/1251]  eta: 0:03:28  lr: 0.002264  min_lr: 0.002264  loss: 3.4682 (4.0993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4947 (1.6846)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [11]  [ 600/1251]  eta: 0:02:37  lr: 0.002297  min_lr: 0.002297  loss: 3.4449 (4.0398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6634 (1.6787)  time: 0.2360  data: 0.0005  max mem: 18117
Epoch: [11]  [ 800/1251]  eta: 0:01:48  lr: 0.002329  min_lr: 0.002329  loss: 3.8481 (4.0271)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4383 (1.6422)  time: 0.2366  data: 0.0006  max mem: 18117
Epoch: [11]  [1000/1251]  eta: 0:01:00  lr: 0.002361  min_lr: 0.002361  loss: 4.7219 (4.0277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8157 (1.6491)  time: 0.2406  data: 0.0005  max mem: 18117
Epoch: [11]  [1200/1251]  eta: 0:00:12  lr: 0.002393  min_lr: 0.002393  loss: 3.2141 (4.0144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5760 (1.6203)  time: 0.2348  data: 0.0005  max mem: 18117
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 3.6974 (4.0106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2929 (1.6081)  time: 0.1955  data: 0.0008  max mem: 18117
Epoch: [11] Total time: 0:05:00 (0.2402 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 3.6974 (4.0498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2929 (1.6081)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.2791 (1.2791)  acc1: 74.0000 (74.0000)  acc5: 90.0000 (90.0000)  time: 5.7838  data: 5.6361  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.3552 (1.4639)  acc1: 67.2000 (67.9273)  acc5: 91.2000 (90.2909)  time: 0.7490  data: 0.6346  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.7774 (1.7735)  acc1: 58.8000 (62.2667)  acc5: 82.8000 (85.3714)  time: 0.1919  data: 0.0823  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.9945 (1.7663)  acc1: 57.6000 (62.3200)  acc5: 81.6000 (85.2160)  time: 0.1916  data: 0.0822  max mem: 18117
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 62.310 Acc@5 84.976 loss 1.761
Accuracy of the model on the 50000 test images: 62.3%
Max accuracy: 62.31%
Epoch: [12]  [   0/1251]  eta: 1:10:03  lr: 0.002400  min_lr: 0.002400  loss: 3.3123 (3.3123)  weight_decay: 0.0500 (0.0500)  time: 3.3601  data: 3.0991  max mem: 18117
Epoch: [12]  [ 200/1251]  eta: 0:04:25  lr: 0.002432  min_lr: 0.002432  loss: 3.5039 (4.0348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5925 (1.6559)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [12]  [ 400/1251]  eta: 0:03:28  lr: 0.002464  min_lr: 0.002464  loss: 4.0037 (4.0595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4149 (1.5978)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [12]  [ 600/1251]  eta: 0:02:38  lr: 0.002497  min_lr: 0.002497  loss: 4.3534 (4.0438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6457 (1.5828)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [12]  [ 800/1251]  eta: 0:01:48  lr: 0.002529  min_lr: 0.002529  loss: 3.5558 (4.0336)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4665 (1.5456)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [12]  [1000/1251]  eta: 0:01:00  lr: 0.002561  min_lr: 0.002561  loss: 3.5622 (4.0154)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5109 (1.5401)  time: 0.2423  data: 0.0003  max mem: 18117
Epoch: [12]  [1200/1251]  eta: 0:00:12  lr: 0.002593  min_lr: 0.002593  loss: 3.8243 (4.0069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3140 (1.5447)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 3.3499 (3.9992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1172 (1.5304)  time: 0.1956  data: 0.0005  max mem: 18117
Epoch: [12] Total time: 0:05:01 (0.2406 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 3.3499 (4.0051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1172 (1.5304)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.2630 (1.2630)  acc1: 74.4000 (74.4000)  acc5: 91.6000 (91.6000)  time: 5.7687  data: 5.6415  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.2630 (1.3939)  acc1: 68.8000 (69.0182)  acc5: 91.6000 (90.5455)  time: 0.6941  data: 0.5816  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.7477 (1.6783)  acc1: 60.8000 (63.8857)  acc5: 84.4000 (86.2667)  time: 0.1766  data: 0.0671  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.8481 (1.6848)  acc1: 60.8000 (63.5360)  acc5: 83.6000 (86.1280)  time: 0.1884  data: 0.0797  max mem: 18117
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 63.184 Acc@5 85.760 loss 1.683
Accuracy of the model on the 50000 test images: 63.2%
Max accuracy: 63.18%
Epoch: [13]  [   0/1251]  eta: 1:02:16  lr: 0.002600  min_lr: 0.002600  loss: 5.0031 (5.0031)  weight_decay: 0.0500 (0.0500)  time: 2.9869  data: 2.5988  max mem: 18117
Epoch: [13]  [ 200/1251]  eta: 0:04:30  lr: 0.002632  min_lr: 0.002632  loss: 3.3072 (4.0111)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4039 (1.3058)  time: 0.2444  data: 0.0004  max mem: 18117
Epoch: [13]  [ 400/1251]  eta: 0:03:31  lr: 0.002665  min_lr: 0.002665  loss: 3.8107 (3.9872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1710 (1.2973)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [13]  [ 600/1251]  eta: 0:02:39  lr: 0.002697  min_lr: 0.002697  loss: 4.6677 (4.0081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.2974)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [13]  [ 800/1251]  eta: 0:01:49  lr: 0.002729  min_lr: 0.002729  loss: 3.4867 (3.9893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4497 (1.3016)  time: 0.2388  data: 0.0005  max mem: 18117
Epoch: [13]  [1000/1251]  eta: 0:01:00  lr: 0.002761  min_lr: 0.002761  loss: 3.2537 (3.9643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2214 (1.3135)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [13]  [1200/1251]  eta: 0:00:12  lr: 0.002793  min_lr: 0.002793  loss: 3.0931 (3.9559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1566 (1.3020)  time: 0.2360  data: 0.0003  max mem: 18117
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.1441 (3.9601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.2972)  time: 0.1951  data: 0.0006  max mem: 18117
Epoch: [13] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.1441 (3.9491)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.2972)
Test:  [ 0/25]  eta: 0:02:29  loss: 1.1754 (1.1754)  acc1: 76.4000 (76.4000)  acc5: 92.8000 (92.8000)  time: 5.9987  data: 5.8742  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.2135 (1.3820)  acc1: 73.6000 (69.6727)  acc5: 92.0000 (91.2000)  time: 0.6930  data: 0.5798  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.8002 (1.6432)  acc1: 59.6000 (64.5524)  acc5: 83.2000 (86.8191)  time: 0.1837  data: 0.0731  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.8214 (1.6548)  acc1: 59.6000 (64.0800)  acc5: 82.8000 (86.5120)  time: 0.2100  data: 0.1013  max mem: 18117
Test: Total time: 0:00:10 (0.4300 s / it)
* Acc@1 64.016 Acc@5 86.460 loss 1.641
Accuracy of the model on the 50000 test images: 64.0%
Max accuracy: 64.02%
Epoch: [14]  [   0/1251]  eta: 1:10:38  lr: 0.002800  min_lr: 0.002800  loss: 4.2341 (4.2341)  weight_decay: 0.0500 (0.0500)  time: 3.3878  data: 3.1070  max mem: 18117
Epoch: [14]  [ 200/1251]  eta: 0:04:29  lr: 0.002833  min_lr: 0.002833  loss: 3.6093 (3.8266)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1856 (1.1885)  time: 0.2414  data: 0.0005  max mem: 18117
Epoch: [14]  [ 400/1251]  eta: 0:03:31  lr: 0.002865  min_lr: 0.002865  loss: 3.4196 (3.8587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2637 (1.1897)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [14]  [ 600/1251]  eta: 0:02:39  lr: 0.002897  min_lr: 0.002897  loss: 3.9755 (3.8510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1601 (1.1764)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [14]  [ 800/1251]  eta: 0:01:49  lr: 0.002929  min_lr: 0.002929  loss: 4.0604 (3.8667)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1554 (1.1904)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [14]  [1000/1251]  eta: 0:01:00  lr: 0.002961  min_lr: 0.002961  loss: 3.5232 (3.8613)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0664 (1.2020)  time: 0.2412  data: 0.0005  max mem: 18117
Epoch: [14]  [1200/1251]  eta: 0:00:12  lr: 0.002993  min_lr: 0.002993  loss: 3.3631 (3.8782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0669 (1.1813)  time: 0.2455  data: 0.0004  max mem: 18117
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 4.4186 (3.8838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0151 (1.1756)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [14] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 4.4186 (3.9121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0151 (1.1756)
Test:  [ 0/25]  eta: 0:02:25  loss: 1.2359 (1.2359)  acc1: 76.8000 (76.8000)  acc5: 91.2000 (91.2000)  time: 5.8330  data: 5.7086  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.3036 (1.3863)  acc1: 69.2000 (69.7818)  acc5: 91.2000 (90.6909)  time: 0.7543  data: 0.6426  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.7957 (1.6558)  acc1: 60.4000 (64.5905)  acc5: 83.6000 (86.2286)  time: 0.2055  data: 0.0957  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.8878 (1.6677)  acc1: 60.4000 (64.0160)  acc5: 82.4000 (86.1120)  time: 0.2048  data: 0.0956  max mem: 18117
Test: Total time: 0:00:10 (0.4187 s / it)
* Acc@1 64.112 Acc@5 86.176 loss 1.663
Accuracy of the model on the 50000 test images: 64.1%
Max accuracy: 64.11%
Epoch: [15]  [   0/1251]  eta: 1:09:42  lr: 0.003000  min_lr: 0.003000  loss: 3.4151 (3.4151)  weight_decay: 0.0500 (0.0500)  time: 3.3436  data: 3.0963  max mem: 18117
Epoch: [15]  [ 200/1251]  eta: 0:04:26  lr: 0.003033  min_lr: 0.003033  loss: 3.8336 (3.9316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1917 (1.2537)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [15]  [ 400/1251]  eta: 0:03:28  lr: 0.003065  min_lr: 0.003065  loss: 3.4952 (3.8914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1011 (1.1843)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [15]  [ 600/1251]  eta: 0:02:38  lr: 0.003097  min_lr: 0.003097  loss: 4.1661 (3.9085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1353 (1.1626)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [15]  [ 800/1251]  eta: 0:01:49  lr: 0.003129  min_lr: 0.003129  loss: 3.7515 (3.8872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8928 (1.1191)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [15]  [1000/1251]  eta: 0:01:00  lr: 0.003161  min_lr: 0.003161  loss: 4.1685 (3.8844)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0854 (1.1022)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [15]  [1200/1251]  eta: 0:00:12  lr: 0.003193  min_lr: 0.003193  loss: 3.0770 (3.8743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9972 (1.0901)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 4.4381 (3.8831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9420 (1.0856)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [15] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 4.4381 (3.8746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9420 (1.0856)
Test:  [ 0/25]  eta: 0:01:27  loss: 1.2523 (1.2523)  acc1: 75.2000 (75.2000)  acc5: 92.8000 (92.8000)  time: 3.4945  data: 3.3670  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 1.3033 (1.4204)  acc1: 71.2000 (70.4727)  acc5: 92.4000 (91.7455)  time: 0.6260  data: 0.5137  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.7468 (1.6725)  acc1: 61.6000 (65.5429)  acc5: 86.0000 (87.5238)  time: 0.2786  data: 0.1691  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.8167 (1.6788)  acc1: 61.2000 (65.0880)  acc5: 85.2000 (87.4560)  time: 0.1998  data: 0.0916  max mem: 18117
Test: Total time: 0:00:09 (0.3969 s / it)
* Acc@1 65.402 Acc@5 87.164 loss 1.677
Accuracy of the model on the 50000 test images: 65.4%
Max accuracy: 65.40%
Epoch: [16]  [   0/1251]  eta: 1:14:25  lr: 0.003201  min_lr: 0.003201  loss: 4.3721 (4.3721)  weight_decay: 0.0500 (0.0500)  time: 3.5694  data: 3.3175  max mem: 18117
Epoch: [16]  [ 200/1251]  eta: 0:04:27  lr: 0.003233  min_lr: 0.003233  loss: 3.7358 (3.8567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8902 (1.0095)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [16]  [ 400/1251]  eta: 0:03:29  lr: 0.003265  min_lr: 0.003265  loss: 3.7896 (3.8509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8133 (1.0060)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [16]  [ 600/1251]  eta: 0:02:38  lr: 0.003297  min_lr: 0.003297  loss: 3.9332 (3.8780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9093 (1.0017)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [16]  [ 800/1251]  eta: 0:01:49  lr: 0.003329  min_lr: 0.003329  loss: 3.8820 (3.8935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1853 (1.0319)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [16]  [1000/1251]  eta: 0:01:00  lr: 0.003361  min_lr: 0.003361  loss: 3.3997 (3.8837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8244 (1.0185)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [16]  [1200/1251]  eta: 0:00:12  lr: 0.003393  min_lr: 0.003393  loss: 4.2574 (3.8952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9277 (1.0190)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 3.9909 (3.8970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (1.0201)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [16] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 3.9909 (3.8675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (1.0201)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.0929 (1.0929)  acc1: 77.6000 (77.6000)  acc5: 93.2000 (93.2000)  time: 5.6658  data: 5.5433  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.2030 (1.3076)  acc1: 73.6000 (71.2364)  acc5: 94.0000 (92.0000)  time: 0.7389  data: 0.6279  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.6370 (1.5727)  acc1: 63.6000 (65.6191)  acc5: 85.6000 (87.8667)  time: 0.2024  data: 0.0917  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.7543 (1.5828)  acc1: 62.0000 (65.4720)  acc5: 84.4000 (87.5840)  time: 0.2103  data: 0.1005  max mem: 18117
Test: Total time: 0:00:10 (0.4161 s / it)
* Acc@1 65.864 Acc@5 87.378 loss 1.583
Accuracy of the model on the 50000 test images: 65.9%
Max accuracy: 65.86%
Epoch: [17]  [   0/1251]  eta: 0:59:22  lr: 0.003401  min_lr: 0.003401  loss: 4.3734 (4.3734)  weight_decay: 0.0500 (0.0500)  time: 2.8474  data: 2.5249  max mem: 18117
Epoch: [17]  [ 200/1251]  eta: 0:04:24  lr: 0.003433  min_lr: 0.003433  loss: 3.3908 (3.8284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0726 (1.0934)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [17]  [ 400/1251]  eta: 0:03:27  lr: 0.003465  min_lr: 0.003465  loss: 3.3811 (3.8228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8957 (1.1062)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [17]  [ 600/1251]  eta: 0:02:37  lr: 0.003497  min_lr: 0.003497  loss: 3.2573 (3.8342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (1.0436)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [17]  [ 800/1251]  eta: 0:01:48  lr: 0.003529  min_lr: 0.003529  loss: 3.6684 (3.8293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8308 (1.0207)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [17]  [1000/1251]  eta: 0:01:00  lr: 0.003561  min_lr: 0.003561  loss: 3.0349 (3.8145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8782 (1.0011)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [17]  [1200/1251]  eta: 0:00:12  lr: 0.003593  min_lr: 0.003593  loss: 3.8093 (3.8118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7694 (0.9636)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 3.2807 (3.8035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8203 (0.9604)  time: 0.1955  data: 0.0005  max mem: 18117
Epoch: [17] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 3.2807 (3.8296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8203 (0.9604)
Test:  [ 0/25]  eta: 0:02:09  loss: 1.0392 (1.0392)  acc1: 80.8000 (80.8000)  acc5: 94.0000 (94.0000)  time: 5.1719  data: 5.0162  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.1656 (1.2790)  acc1: 71.2000 (70.6909)  acc5: 92.4000 (91.4182)  time: 0.6887  data: 0.5748  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.6165 (1.5353)  acc1: 63.2000 (65.9429)  acc5: 87.2000 (87.8476)  time: 0.1979  data: 0.0889  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.7063 (1.5467)  acc1: 61.6000 (65.5840)  acc5: 84.0000 (87.6960)  time: 0.2262  data: 0.1179  max mem: 18117
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 66.220 Acc@5 87.804 loss 1.530
Accuracy of the model on the 50000 test images: 66.2%
Max accuracy: 66.22%
Epoch: [18]  [   0/1251]  eta: 1:02:32  lr: 0.003601  min_lr: 0.003601  loss: 3.0559 (3.0559)  weight_decay: 0.0500 (0.0500)  time: 2.9997  data: 2.7396  max mem: 18117
Epoch: [18]  [ 200/1251]  eta: 0:04:25  lr: 0.003633  min_lr: 0.003633  loss: 3.6424 (3.8421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9438 (0.8676)  time: 0.2358  data: 0.0005  max mem: 18117
Epoch: [18]  [ 400/1251]  eta: 0:03:28  lr: 0.003665  min_lr: 0.003665  loss: 3.1328 (3.8343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9477 (0.9169)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [18]  [ 600/1251]  eta: 0:02:38  lr: 0.003697  min_lr: 0.003697  loss: 3.1820 (3.8444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8252 (0.8959)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [18]  [ 800/1251]  eta: 0:01:49  lr: 0.003729  min_lr: 0.003729  loss: 3.9230 (3.8173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8681 (0.9012)  time: 0.2437  data: 0.0004  max mem: 18117
Epoch: [18]  [1000/1251]  eta: 0:01:00  lr: 0.003761  min_lr: 0.003761  loss: 3.4144 (3.8016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.9025)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [18]  [1200/1251]  eta: 0:00:12  lr: 0.003793  min_lr: 0.003793  loss: 3.1341 (3.8042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8681 (0.9142)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 3.4704 (3.8067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (0.9172)  time: 0.1964  data: 0.0006  max mem: 18117
Epoch: [18] Total time: 0:05:00 (0.2405 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 3.4704 (3.8085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (0.9172)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.1877 (1.1877)  acc1: 77.6000 (77.6000)  acc5: 91.6000 (91.6000)  time: 5.7100  data: 5.5867  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.2231 (1.3041)  acc1: 70.4000 (71.2727)  acc5: 92.0000 (91.4546)  time: 0.7660  data: 0.6533  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.7709 (1.5508)  acc1: 63.2000 (66.4381)  acc5: 85.2000 (87.9048)  time: 0.2011  data: 0.0913  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.7709 (1.5613)  acc1: 63.2000 (66.1760)  acc5: 84.8000 (87.6800)  time: 0.2000  data: 0.0912  max mem: 18117
Test: Total time: 0:00:10 (0.4098 s / it)
* Acc@1 66.314 Acc@5 87.824 loss 1.552
Accuracy of the model on the 50000 test images: 66.3%
Max accuracy: 66.31%
Epoch: [19]  [   0/1251]  eta: 1:05:50  lr: 0.003801  min_lr: 0.003801  loss: 3.4669 (3.4669)  weight_decay: 0.0500 (0.0500)  time: 3.1578  data: 2.8542  max mem: 18117
Epoch: [19]  [ 200/1251]  eta: 0:04:23  lr: 0.003833  min_lr: 0.003833  loss: 3.1022 (3.6942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (0.9433)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [19]  [ 400/1251]  eta: 0:03:27  lr: 0.003865  min_lr: 0.003865  loss: 3.9137 (3.7450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9135 (0.8871)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [19]  [ 600/1251]  eta: 0:02:37  lr: 0.003897  min_lr: 0.003897  loss: 3.1439 (3.7552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.8602)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [19]  [ 800/1251]  eta: 0:01:48  lr: 0.003929  min_lr: 0.003929  loss: 3.9684 (3.7549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7863 (0.8647)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [19]  [1000/1251]  eta: 0:01:00  lr: 0.003961  min_lr: 0.003961  loss: 3.1790 (3.7469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8246 (0.8582)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [19]  [1200/1251]  eta: 0:00:12  lr: 0.003993  min_lr: 0.003993  loss: 3.2497 (3.7357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8924 (0.8615)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.2621 (3.7436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8507 (0.8585)  time: 0.1958  data: 0.0006  max mem: 18117
Epoch: [19] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.2621 (3.7829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8507 (0.8585)
Test:  [ 0/25]  eta: 0:02:27  loss: 1.0672 (1.0672)  acc1: 79.6000 (79.6000)  acc5: 94.0000 (94.0000)  time: 5.8848  data: 5.7432  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.2559 (1.2966)  acc1: 71.6000 (71.9273)  acc5: 92.8000 (92.2182)  time: 0.7524  data: 0.6381  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.6768 (1.5477)  acc1: 63.6000 (66.8952)  acc5: 86.4000 (88.7810)  time: 0.1910  data: 0.0807  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.7538 (1.5569)  acc1: 63.2000 (66.7680)  acc5: 86.4000 (88.6720)  time: 0.1904  data: 0.0807  max mem: 18117
Test: Total time: 0:00:10 (0.4095 s / it)
* Acc@1 66.550 Acc@5 88.238 loss 1.559
Accuracy of the model on the 50000 test images: 66.6%
Max accuracy: 66.55%
Epoch: [20]  [   0/1251]  eta: 1:07:55  lr: 0.004000  min_lr: 0.004000  loss: 4.2231 (4.2231)  weight_decay: 0.0500 (0.0500)  time: 3.2579  data: 2.9532  max mem: 18117
Epoch: [20]  [ 200/1251]  eta: 0:04:27  lr: 0.004000  min_lr: 0.004000  loss: 3.0772 (3.6639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.8513)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [20]  [ 400/1251]  eta: 0:03:29  lr: 0.004000  min_lr: 0.004000  loss: 3.7599 (3.7287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8822 (0.8955)  time: 0.2364  data: 0.0003  max mem: 18117
Epoch: [20]  [ 600/1251]  eta: 0:02:38  lr: 0.004000  min_lr: 0.004000  loss: 3.1066 (3.7681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7063 (0.8464)  time: 0.2423  data: 0.0004  max mem: 18117
Epoch: [20]  [ 800/1251]  eta: 0:01:49  lr: 0.004000  min_lr: 0.004000  loss: 4.5304 (3.7767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.8251)  time: 0.2478  data: 0.0004  max mem: 18117
Epoch: [20]  [1000/1251]  eta: 0:01:00  lr: 0.004000  min_lr: 0.004000  loss: 3.0949 (3.7930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8124)  time: 0.2384  data: 0.0005  max mem: 18117
Epoch: [20]  [1200/1251]  eta: 0:00:12  lr: 0.004000  min_lr: 0.004000  loss: 3.5640 (3.7998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.8025)  time: 0.2387  data: 0.0005  max mem: 18117
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.0783 (3.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.8005)  time: 0.1970  data: 0.0012  max mem: 18117
Epoch: [20] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.0783 (3.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.8005)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.0793 (1.0793)  acc1: 78.8000 (78.8000)  acc5: 92.0000 (92.0000)  time: 5.5077  data: 5.3816  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.1993 (1.2294)  acc1: 75.2000 (72.8727)  acc5: 93.2000 (92.4727)  time: 0.7251  data: 0.6114  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.6009 (1.4892)  acc1: 64.0000 (67.6952)  acc5: 88.4000 (88.9333)  time: 0.2027  data: 0.0926  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.6606 (1.4986)  acc1: 64.0000 (67.4240)  acc5: 86.4000 (88.8320)  time: 0.2130  data: 0.1050  max mem: 18117
Test: Total time: 0:00:10 (0.4136 s / it)
* Acc@1 67.502 Acc@5 88.696 loss 1.493
Accuracy of the model on the 50000 test images: 67.5%
Max accuracy: 67.50%
Epoch: [21]  [   0/1251]  eta: 1:00:10  lr: 0.004000  min_lr: 0.004000  loss: 2.8239 (2.8239)  weight_decay: 0.0500 (0.0500)  time: 2.8857  data: 2.5499  max mem: 18117
Epoch: [21]  [ 200/1251]  eta: 0:04:23  lr: 0.004000  min_lr: 0.004000  loss: 3.6539 (3.7039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8105 (0.7805)  time: 0.2348  data: 0.0004  max mem: 18117
Epoch: [21]  [ 400/1251]  eta: 0:03:28  lr: 0.004000  min_lr: 0.004000  loss: 3.6301 (3.7100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7539 (0.7878)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [21]  [ 600/1251]  eta: 0:02:37  lr: 0.004000  min_lr: 0.004000  loss: 3.3392 (3.6715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9325 (0.8340)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [21]  [ 800/1251]  eta: 0:01:48  lr: 0.004000  min_lr: 0.004000  loss: 4.2820 (3.6729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8027 (0.8215)  time: 0.2406  data: 0.0004  max mem: 18117
Epoch: [21]  [1000/1251]  eta: 0:01:00  lr: 0.004000  min_lr: 0.004000  loss: 3.5416 (3.6859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7004 (0.8076)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [21]  [1200/1251]  eta: 0:00:12  lr: 0.004000  min_lr: 0.004000  loss: 3.1184 (3.6901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5676 (0.7941)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 3.6977 (3.6929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7258 (0.7915)  time: 0.1956  data: 0.0010  max mem: 18117
Epoch: [21] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 3.6977 (3.7133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7258 (0.7915)
Test:  [ 0/25]  eta: 0:02:16  loss: 1.0154 (1.0154)  acc1: 77.6000 (77.6000)  acc5: 93.2000 (93.2000)  time: 5.4565  data: 5.3282  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0814 (1.1370)  acc1: 72.0000 (73.0545)  acc5: 93.2000 (93.2000)  time: 0.7578  data: 0.6447  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.4846 (1.3959)  acc1: 64.0000 (68.6286)  acc5: 88.4000 (89.3714)  time: 0.2316  data: 0.1217  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.6165 (1.4013)  acc1: 64.0000 (68.2720)  acc5: 85.6000 (89.1840)  time: 0.2298  data: 0.1216  max mem: 18117
Test: Total time: 0:00:10 (0.4241 s / it)
* Acc@1 68.214 Acc@5 89.040 loss 1.413
Accuracy of the model on the 50000 test images: 68.2%
Max accuracy: 68.21%
Epoch: [22]  [   0/1251]  eta: 1:04:03  lr: 0.003999  min_lr: 0.003999  loss: 2.8451 (2.8451)  weight_decay: 0.0500 (0.0500)  time: 3.0720  data: 2.7210  max mem: 18117
Epoch: [22]  [ 200/1251]  eta: 0:04:27  lr: 0.003999  min_lr: 0.003999  loss: 3.5134 (3.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6778 (nan)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [22]  [ 400/1251]  eta: 0:03:29  lr: 0.003999  min_lr: 0.003999  loss: 2.8950 (3.7298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7106 (nan)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [22]  [ 600/1251]  eta: 0:02:38  lr: 0.003999  min_lr: 0.003999  loss: 4.2257 (3.7333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7276 (nan)  time: 0.2377  data: 0.0005  max mem: 18117
Epoch: [22]  [ 800/1251]  eta: 0:01:49  lr: 0.003999  min_lr: 0.003999  loss: 4.3176 (3.7458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6959 (nan)  time: 0.2355  data: 0.0004  max mem: 18117
Epoch: [22]  [1000/1251]  eta: 0:01:00  lr: 0.003999  min_lr: 0.003999  loss: 3.3842 (3.7101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (nan)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [22]  [1200/1251]  eta: 0:00:12  lr: 0.003999  min_lr: 0.003999  loss: 4.3055 (3.7023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7611 (nan)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 3.1534 (3.6998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7611 (nan)  time: 0.1955  data: 0.0005  max mem: 18117
Epoch: [22] Total time: 0:05:00 (0.2402 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 3.1534 (3.6956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7611 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.0164 (1.0164)  acc1: 76.8000 (76.8000)  acc5: 94.0000 (94.0000)  time: 5.4992  data: 5.3729  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0997 (1.1682)  acc1: 74.0000 (73.0545)  acc5: 94.0000 (93.4182)  time: 0.7383  data: 0.6255  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.4695 (1.4077)  acc1: 64.8000 (68.3238)  acc5: 89.2000 (89.4857)  time: 0.2080  data: 0.0982  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.5851 (1.4106)  acc1: 64.8000 (67.9200)  acc5: 86.4000 (89.4560)  time: 0.2063  data: 0.0981  max mem: 18117
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 68.368 Acc@5 89.160 loss 1.414
Accuracy of the model on the 50000 test images: 68.4%
Max accuracy: 68.37%
Epoch: [23]  [   0/1251]  eta: 1:10:51  lr: 0.003999  min_lr: 0.003999  loss: 3.6329 (3.6329)  weight_decay: 0.0500 (0.0500)  time: 3.3988  data: 3.1492  max mem: 18117
Epoch: [23]  [ 200/1251]  eta: 0:04:29  lr: 0.003999  min_lr: 0.003999  loss: 3.3690 (3.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.7274)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [23]  [ 400/1251]  eta: 0:03:32  lr: 0.003999  min_lr: 0.003999  loss: 4.1076 (3.6868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.7584)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [23]  [ 600/1251]  eta: 0:02:40  lr: 0.003998  min_lr: 0.003998  loss: 3.2805 (3.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7211 (0.7670)  time: 0.2413  data: 0.0005  max mem: 18117
Epoch: [23]  [ 800/1251]  eta: 0:01:50  lr: 0.003998  min_lr: 0.003998  loss: 3.1026 (3.6794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6236 (0.7476)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [23]  [1000/1251]  eta: 0:01:01  lr: 0.003998  min_lr: 0.003998  loss: 3.9464 (3.6626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.7295)  time: 0.2364  data: 0.0005  max mem: 18117
Epoch: [23]  [1200/1251]  eta: 0:00:12  lr: 0.003998  min_lr: 0.003998  loss: 3.5771 (3.6679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5822 (0.7278)  time: 0.2422  data: 0.0004  max mem: 18117
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 4.4642 (3.6737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6047 (0.7234)  time: 0.1967  data: 0.0010  max mem: 18117
Epoch: [23] Total time: 0:05:03 (0.2428 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 4.4642 (3.6589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6047 (0.7234)
Test:  [ 0/25]  eta: 0:02:23  loss: 1.0600 (1.0600)  acc1: 80.8000 (80.8000)  acc5: 95.6000 (95.6000)  time: 5.7527  data: 5.6269  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.1745 (1.2626)  acc1: 75.2000 (73.2727)  acc5: 93.2000 (92.9818)  time: 0.7531  data: 0.6401  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.6235 (1.5079)  acc1: 63.6000 (68.4952)  acc5: 88.4000 (89.3524)  time: 0.2249  data: 0.1149  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.6839 (1.5183)  acc1: 64.0000 (68.4160)  acc5: 86.4000 (89.2480)  time: 0.2230  data: 0.1149  max mem: 18117
Test: Total time: 0:00:10 (0.4306 s / it)
* Acc@1 68.370 Acc@5 89.306 loss 1.524
Accuracy of the model on the 50000 test images: 68.4%
Max accuracy: 68.37%
Epoch: [24]  [   0/1251]  eta: 1:03:57  lr: 0.003998  min_lr: 0.003998  loss: 4.5964 (4.5964)  weight_decay: 0.0500 (0.0500)  time: 3.0672  data: 2.7574  max mem: 18117
Epoch: [24]  [ 200/1251]  eta: 0:04:27  lr: 0.003998  min_lr: 0.003998  loss: 3.2795 (3.6870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.6590)  time: 0.2381  data: 0.0005  max mem: 18117
Epoch: [24]  [ 400/1251]  eta: 0:03:29  lr: 0.003998  min_lr: 0.003998  loss: 4.0582 (3.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6958 (0.6918)  time: 0.2360  data: 0.0005  max mem: 18117
Epoch: [24]  [ 600/1251]  eta: 0:02:39  lr: 0.003997  min_lr: 0.003997  loss: 4.0434 (3.6941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.7157)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [24]  [ 800/1251]  eta: 0:01:49  lr: 0.003997  min_lr: 0.003997  loss: 2.9021 (3.6970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5474 (0.7047)  time: 0.2411  data: 0.0005  max mem: 18117
Epoch: [24]  [1000/1251]  eta: 0:01:00  lr: 0.003997  min_lr: 0.003997  loss: 3.4472 (3.7056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6155 (0.7029)  time: 0.2375  data: 0.0005  max mem: 18117
Epoch: [24]  [1200/1251]  eta: 0:00:12  lr: 0.003997  min_lr: 0.003997  loss: 3.4298 (3.6910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7765 (0.7060)  time: 0.2375  data: 0.0005  max mem: 18117
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 3.6181 (3.6916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6785 (0.7082)  time: 0.1957  data: 0.0007  max mem: 18117
Epoch: [24] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 3.6181 (3.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6785 (0.7082)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9843 (0.9843)  acc1: 82.0000 (82.0000)  acc5: 93.6000 (93.6000)  time: 5.5846  data: 5.4585  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.1339 (1.1663)  acc1: 75.6000 (74.3636)  acc5: 93.6000 (93.0546)  time: 0.7543  data: 0.6413  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.5062 (1.3867)  acc1: 66.4000 (69.9238)  acc5: 87.6000 (89.5238)  time: 0.2163  data: 0.1060  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.5283 (1.4023)  acc1: 66.0000 (69.3120)  acc5: 87.2000 (89.5360)  time: 0.2155  data: 0.1059  max mem: 18117
Test: Total time: 0:00:10 (0.4177 s / it)
* Acc@1 69.446 Acc@5 89.540 loss 1.402
Accuracy of the model on the 50000 test images: 69.4%
Max accuracy: 69.45%
Epoch: [25]  [   0/1251]  eta: 1:08:13  lr: 0.003997  min_lr: 0.003997  loss: 4.4999 (4.4999)  weight_decay: 0.0500 (0.0500)  time: 3.2725  data: 3.0245  max mem: 18117
Epoch: [25]  [ 200/1251]  eta: 0:04:27  lr: 0.003997  min_lr: 0.003997  loss: 3.8709 (3.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6390 (0.6710)  time: 0.2398  data: 0.0003  max mem: 18117
Epoch: [25]  [ 400/1251]  eta: 0:03:29  lr: 0.003996  min_lr: 0.003996  loss: 3.5781 (3.6066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5956 (0.6947)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [25]  [ 600/1251]  eta: 0:02:38  lr: 0.003996  min_lr: 0.003996  loss: 3.2030 (3.6118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6743 (0.6916)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [25]  [ 800/1251]  eta: 0:01:49  lr: 0.003996  min_lr: 0.003996  loss: 3.2204 (3.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.7053)  time: 0.2392  data: 0.0003  max mem: 18117
Epoch: [25]  [1000/1251]  eta: 0:01:00  lr: 0.003996  min_lr: 0.003996  loss: 3.9078 (3.6358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7016 (0.7026)  time: 0.2392  data: 0.0003  max mem: 18117
Epoch: [25]  [1200/1251]  eta: 0:00:12  lr: 0.003996  min_lr: 0.003996  loss: 3.9515 (3.6404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.6970)  time: 0.2414  data: 0.0003  max mem: 18117
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 4.2495 (3.6384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.6943)  time: 0.1954  data: 0.0005  max mem: 18117
Epoch: [25] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 4.2495 (3.6260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.6943)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.0124 (1.0124)  acc1: 78.8000 (78.8000)  acc5: 96.0000 (96.0000)  time: 5.6162  data: 5.4890  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.1185 (1.2123)  acc1: 78.0000 (74.1091)  acc5: 94.4000 (93.4545)  time: 0.7567  data: 0.6451  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.6075 (1.4414)  acc1: 64.0000 (69.3333)  acc5: 88.0000 (90.0191)  time: 0.2267  data: 0.1171  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.6075 (1.4480)  acc1: 64.8000 (69.2960)  acc5: 86.8000 (89.8400)  time: 0.2258  data: 0.1170  max mem: 18117
Test: Total time: 0:00:10 (0.4267 s / it)
* Acc@1 69.506 Acc@5 89.792 loss 1.443
Accuracy of the model on the 50000 test images: 69.5%
Max accuracy: 69.51%
Epoch: [26]  [   0/1251]  eta: 1:06:15  lr: 0.003995  min_lr: 0.003995  loss: 4.1769 (4.1769)  weight_decay: 0.0500 (0.0500)  time: 3.1776  data: 2.8728  max mem: 18117
Epoch: [26]  [ 200/1251]  eta: 0:04:25  lr: 0.003995  min_lr: 0.003995  loss: 2.8621 (3.4681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7080 (0.7628)  time: 0.2355  data: 0.0007  max mem: 18117
Epoch: [26]  [ 400/1251]  eta: 0:03:29  lr: 0.003995  min_lr: 0.003995  loss: 3.0907 (3.5583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6111 (0.7083)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [26]  [ 600/1251]  eta: 0:02:38  lr: 0.003995  min_lr: 0.003995  loss: 3.0654 (3.5649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5934 (0.6903)  time: 0.2449  data: 0.0004  max mem: 18117
Epoch: [26]  [ 800/1251]  eta: 0:01:49  lr: 0.003994  min_lr: 0.003994  loss: 3.2765 (3.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7037 (0.6833)  time: 0.2448  data: 0.0004  max mem: 18117
Epoch: [26]  [1000/1251]  eta: 0:01:00  lr: 0.003994  min_lr: 0.003994  loss: 3.5126 (3.5614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.7038)  time: 0.2354  data: 0.0004  max mem: 18117
Epoch: [26]  [1200/1251]  eta: 0:00:12  lr: 0.003994  min_lr: 0.003994  loss: 3.0977 (3.5722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6593 (0.6945)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 3.3576 (3.5713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6539 (0.6932)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [26] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 3.3576 (3.6075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6539 (0.6932)
Test:  [ 0/25]  eta: 0:02:14  loss: 1.0352 (1.0352)  acc1: 79.2000 (79.2000)  acc5: 94.0000 (94.0000)  time: 5.3959  data: 5.2432  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.2274 (1.2011)  acc1: 74.0000 (74.8364)  acc5: 94.0000 (93.7818)  time: 0.7595  data: 0.6429  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.4662 (1.4303)  acc1: 66.4000 (69.7524)  acc5: 88.4000 (89.6952)  time: 0.2254  data: 0.1148  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.6041 (1.4325)  acc1: 66.4000 (69.3920)  acc5: 87.6000 (89.8560)  time: 0.2244  data: 0.1147  max mem: 18117
Test: Total time: 0:00:10 (0.4169 s / it)
* Acc@1 69.374 Acc@5 89.832 loss 1.427
Accuracy of the model on the 50000 test images: 69.4%
Max accuracy: 69.51%
Epoch: [27]  [   0/1251]  eta: 1:02:00  lr: 0.003994  min_lr: 0.003994  loss: 2.8341 (2.8341)  weight_decay: 0.0500 (0.0500)  time: 2.9740  data: 2.2731  max mem: 18117
Epoch: [27]  [ 200/1251]  eta: 0:04:25  lr: 0.003994  min_lr: 0.003994  loss: 2.8121 (3.5744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.6572)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [27]  [ 400/1251]  eta: 0:03:29  lr: 0.003993  min_lr: 0.003993  loss: 3.8504 (3.5722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6683 (0.6753)  time: 0.2420  data: 0.0004  max mem: 18117
Epoch: [27]  [ 600/1251]  eta: 0:02:38  lr: 0.003993  min_lr: 0.003993  loss: 3.2423 (3.5856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6939 (0.6996)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [27]  [ 800/1251]  eta: 0:01:49  lr: 0.003993  min_lr: 0.003993  loss: 3.3483 (3.5779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.6988)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [27]  [1000/1251]  eta: 0:01:00  lr: 0.003992  min_lr: 0.003992  loss: 3.5901 (3.5746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5801 (0.6917)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [27]  [1200/1251]  eta: 0:00:12  lr: 0.003992  min_lr: 0.003992  loss: 3.7750 (3.5803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6027 (0.6826)  time: 0.2383  data: 0.0005  max mem: 18117
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 3.1619 (3.5819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6553 (0.6852)  time: 0.1953  data: 0.0008  max mem: 18117
Epoch: [27] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 3.1619 (3.5838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6553 (0.6852)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.8996 (0.8996)  acc1: 81.2000 (81.2000)  acc5: 94.4000 (94.4000)  time: 5.8448  data: 5.6937  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0990 (1.1352)  acc1: 75.6000 (75.3455)  acc5: 94.4000 (93.5636)  time: 0.7517  data: 0.6366  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.4359 (1.3456)  acc1: 68.8000 (70.9905)  acc5: 90.0000 (90.1143)  time: 0.1957  data: 0.0858  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4748 (1.3581)  acc1: 68.4000 (70.4960)  acc5: 87.2000 (90.0800)  time: 0.1952  data: 0.0857  max mem: 18117
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 70.280 Acc@5 90.230 loss 1.352
Accuracy of the model on the 50000 test images: 70.3%
Max accuracy: 70.28%
Epoch: [28]  [   0/1251]  eta: 1:03:28  lr: 0.003992  min_lr: 0.003992  loss: 4.5205 (4.5205)  weight_decay: 0.0500 (0.0500)  time: 3.0443  data: 2.6579  max mem: 18117
Epoch: [28]  [ 200/1251]  eta: 0:04:26  lr: 0.003992  min_lr: 0.003992  loss: 3.0592 (3.6092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7090 (0.7721)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [28]  [ 400/1251]  eta: 0:03:29  lr: 0.003991  min_lr: 0.003991  loss: 3.3611 (3.5935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6196 (0.7127)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [28]  [ 600/1251]  eta: 0:02:39  lr: 0.003991  min_lr: 0.003991  loss: 4.2415 (3.5720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.6915)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [28]  [ 800/1251]  eta: 0:01:49  lr: 0.003991  min_lr: 0.003991  loss: 3.6947 (3.5859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6844 (0.6926)  time: 0.2380  data: 0.0005  max mem: 18117
Epoch: [28]  [1000/1251]  eta: 0:01:00  lr: 0.003990  min_lr: 0.003990  loss: 3.7390 (3.6000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8616 (0.7002)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [28]  [1200/1251]  eta: 0:00:12  lr: 0.003990  min_lr: 0.003990  loss: 3.2450 (3.5832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5673 (0.6882)  time: 0.2460  data: 0.0004  max mem: 18117
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 3.5224 (3.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7824 (0.6948)  time: 0.1982  data: 0.0005  max mem: 18117
Epoch: [28] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 3.5224 (3.5738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7824 (0.6948)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9687 (0.9687)  acc1: 78.8000 (78.8000)  acc5: 94.8000 (94.8000)  time: 5.5148  data: 5.3902  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.1601 (1.1359)  acc1: 75.6000 (75.3091)  acc5: 94.8000 (93.6000)  time: 0.7415  data: 0.6294  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.4385 (1.3527)  acc1: 67.2000 (70.8762)  acc5: 88.4000 (90.2286)  time: 0.2028  data: 0.0914  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.5267 (1.3704)  acc1: 66.4000 (70.0320)  acc5: 87.6000 (90.1120)  time: 0.2206  data: 0.1101  max mem: 18117
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 69.906 Acc@5 90.126 loss 1.368
Accuracy of the model on the 50000 test images: 69.9%
Max accuracy: 70.28%
Epoch: [29]  [   0/1251]  eta: 1:06:30  lr: 0.003990  min_lr: 0.003990  loss: 2.9961 (2.9961)  weight_decay: 0.0500 (0.0500)  time: 3.1897  data: 2.7800  max mem: 18117
Epoch: [29]  [ 200/1251]  eta: 0:04:26  lr: 0.003989  min_lr: 0.003989  loss: 3.4906 (3.4972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6769 (0.6793)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [29]  [ 400/1251]  eta: 0:03:29  lr: 0.003989  min_lr: 0.003989  loss: 3.0528 (3.5078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6289 (0.6789)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [29]  [ 600/1251]  eta: 0:02:38  lr: 0.003989  min_lr: 0.003989  loss: 3.0559 (3.5215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5322 (0.6451)  time: 0.2354  data: 0.0004  max mem: 18117
Epoch: [29]  [ 800/1251]  eta: 0:01:49  lr: 0.003988  min_lr: 0.003988  loss: 2.8941 (3.5275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5739 (0.6504)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [29]  [1000/1251]  eta: 0:01:00  lr: 0.003988  min_lr: 0.003988  loss: 3.9179 (3.5332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7361 (0.6778)  time: 0.2408  data: 0.0004  max mem: 18117
Epoch: [29]  [1200/1251]  eta: 0:00:12  lr: 0.003988  min_lr: 0.003988  loss: 3.0931 (3.5390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6462 (0.6803)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 3.5471 (3.5444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.6782)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [29] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 3.5471 (3.5526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.6782)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.9405 (0.9405)  acc1: 79.2000 (79.2000)  acc5: 96.0000 (96.0000)  time: 5.4276  data: 5.3031  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.0730 (1.1068)  acc1: 77.2000 (74.7636)  acc5: 95.2000 (93.8182)  time: 0.6837  data: 0.5719  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.4690 (1.3353)  acc1: 66.4000 (70.0762)  acc5: 89.2000 (90.3619)  time: 0.1964  data: 0.0870  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4917 (1.3491)  acc1: 66.8000 (69.8400)  acc5: 87.6000 (90.2560)  time: 0.2012  data: 0.0929  max mem: 18117
Test: Total time: 0:00:10 (0.4014 s / it)
* Acc@1 70.260 Acc@5 90.174 loss 1.340
Accuracy of the model on the 50000 test images: 70.3%
Max accuracy: 70.28%
Epoch: [30]  [   0/1251]  eta: 1:06:14  lr: 0.003987  min_lr: 0.003987  loss: 4.2908 (4.2908)  weight_decay: 0.0500 (0.0500)  time: 3.1774  data: 2.4608  max mem: 18117
Epoch: [30]  [ 200/1251]  eta: 0:04:27  lr: 0.003987  min_lr: 0.003987  loss: 3.4848 (3.4998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5144 (0.5649)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [30]  [ 400/1251]  eta: 0:03:29  lr: 0.003987  min_lr: 0.003987  loss: 3.0977 (3.5073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6498 (0.6318)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [30]  [ 600/1251]  eta: 0:02:38  lr: 0.003986  min_lr: 0.003986  loss: 2.8142 (3.5164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5740 (0.6481)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [30]  [ 800/1251]  eta: 0:01:49  lr: 0.003986  min_lr: 0.003986  loss: 2.8466 (3.4831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6747 (0.6716)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [30]  [1000/1251]  eta: 0:01:00  lr: 0.003985  min_lr: 0.003985  loss: 3.5950 (3.4797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7559 (0.6781)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [30]  [1200/1251]  eta: 0:00:12  lr: 0.003985  min_lr: 0.003985  loss: 3.2125 (3.4991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6031 (0.6824)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.1173 (3.5043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6060 (0.6831)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [30] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.1173 (3.5179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6060 (0.6831)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.0025 (1.0025)  acc1: 80.0000 (80.0000)  acc5: 96.4000 (96.4000)  time: 5.7150  data: 5.5905  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0956 (1.0974)  acc1: 76.4000 (75.2727)  acc5: 95.2000 (94.1818)  time: 0.7649  data: 0.6507  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3283 (1.3112)  acc1: 67.2000 (70.9524)  acc5: 89.6000 (90.9333)  time: 0.2020  data: 0.0886  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.5280 (1.3232)  acc1: 66.8000 (70.5600)  acc5: 88.0000 (90.7520)  time: 0.2004  data: 0.0885  max mem: 18117
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 71.052 Acc@5 90.776 loss 1.320
Accuracy of the model on the 50000 test images: 71.1%
Max accuracy: 71.05%
Epoch: [31]  [   0/1251]  eta: 1:09:17  lr: 0.003985  min_lr: 0.003985  loss: 4.1763 (4.1763)  weight_decay: 0.0500 (0.0500)  time: 3.3235  data: 3.0465  max mem: 18117
Epoch: [31]  [ 200/1251]  eta: 0:04:26  lr: 0.003984  min_lr: 0.003984  loss: 3.3280 (3.4426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5034 (0.6060)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [31]  [ 400/1251]  eta: 0:03:29  lr: 0.003984  min_lr: 0.003984  loss: 3.1430 (3.4206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.6727)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [31]  [ 600/1251]  eta: 0:02:38  lr: 0.003983  min_lr: 0.003983  loss: 2.8765 (3.4426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6327 (0.6790)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [31]  [ 800/1251]  eta: 0:01:48  lr: 0.003983  min_lr: 0.003983  loss: 3.0775 (3.4554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5999 (0.6747)  time: 0.2345  data: 0.0004  max mem: 18117
Epoch: [31]  [1000/1251]  eta: 0:01:00  lr: 0.003982  min_lr: 0.003982  loss: 2.9397 (3.4845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6326 (0.6842)  time: 0.2429  data: 0.0004  max mem: 18117
Epoch: [31]  [1200/1251]  eta: 0:00:12  lr: 0.003982  min_lr: 0.003982  loss: 2.9634 (3.4978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6044 (0.6817)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 2.9811 (3.5040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.6847)  time: 0.1953  data: 0.0005  max mem: 18117
Epoch: [31] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 2.9811 (3.5265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.6847)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.9128 (0.9128)  acc1: 81.6000 (81.6000)  acc5: 95.2000 (95.2000)  time: 5.3452  data: 5.2168  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0625 (1.0768)  acc1: 74.4000 (75.8909)  acc5: 94.8000 (94.0727)  time: 0.7471  data: 0.6329  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3410 (1.3154)  acc1: 68.4000 (71.1429)  acc5: 90.8000 (90.8571)  time: 0.2157  data: 0.1051  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4648 (1.3247)  acc1: 68.4000 (70.7360)  acc5: 88.4000 (90.8160)  time: 0.1985  data: 0.0902  max mem: 18117
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 71.176 Acc@5 90.698 loss 1.317
Accuracy of the model on the 50000 test images: 71.2%
Max accuracy: 71.18%
Epoch: [32]  [   0/1251]  eta: 1:10:48  lr: 0.003982  min_lr: 0.003982  loss: 2.6314 (2.6314)  weight_decay: 0.0500 (0.0500)  time: 3.3959  data: 3.0985  max mem: 18117
Epoch: [32]  [ 200/1251]  eta: 0:04:27  lr: 0.003981  min_lr: 0.003981  loss: 3.8021 (3.5024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8185 (0.7537)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [32]  [ 400/1251]  eta: 0:03:29  lr: 0.003981  min_lr: 0.003981  loss: 3.3225 (3.5316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.7419)  time: 0.2424  data: 0.0004  max mem: 18117
Epoch: [32]  [ 600/1251]  eta: 0:02:39  lr: 0.003980  min_lr: 0.003980  loss: 3.5663 (3.5177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.7032)  time: 0.2425  data: 0.0003  max mem: 18117
Epoch: [32]  [ 800/1251]  eta: 0:01:49  lr: 0.003980  min_lr: 0.003980  loss: 2.9003 (3.5142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7371 (0.7197)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [32]  [1000/1251]  eta: 0:01:00  lr: 0.003979  min_lr: 0.003979  loss: 4.1519 (3.5184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5296 (0.7148)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [32]  [1200/1251]  eta: 0:00:12  lr: 0.003979  min_lr: 0.003979  loss: 3.3912 (3.5329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5041 (0.6954)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 3.4594 (3.5324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.6927)  time: 0.1960  data: 0.0006  max mem: 18117
Epoch: [32] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 3.4594 (3.5179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.6927)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8687 (0.8687)  acc1: 81.6000 (81.6000)  acc5: 94.4000 (94.4000)  time: 5.5741  data: 5.4243  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9949 (1.0406)  acc1: 77.2000 (75.8182)  acc5: 94.4000 (94.5091)  time: 0.7177  data: 0.6030  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2460 (1.2650)  acc1: 69.6000 (71.6000)  acc5: 91.2000 (90.8762)  time: 0.1950  data: 0.0849  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4639 (1.2831)  acc1: 68.0000 (71.0400)  acc5: 88.0000 (90.7200)  time: 0.2072  data: 0.0972  max mem: 18117
Test: Total time: 0:00:10 (0.4098 s / it)
* Acc@1 71.400 Acc@5 90.866 loss 1.272
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.40%
Epoch: [33]  [   0/1251]  eta: 1:01:20  lr: 0.003979  min_lr: 0.003979  loss: 3.2588 (3.2588)  weight_decay: 0.0500 (0.0500)  time: 2.9424  data: 2.6156  max mem: 18117
Epoch: [33]  [ 200/1251]  eta: 0:04:26  lr: 0.003978  min_lr: 0.003978  loss: 2.8470 (3.4749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6216 (0.6753)  time: 0.2462  data: 0.0004  max mem: 18117
Epoch: [33]  [ 400/1251]  eta: 0:03:30  lr: 0.003978  min_lr: 0.003978  loss: 4.2308 (3.4992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5712 (0.6699)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [33]  [ 600/1251]  eta: 0:02:39  lr: 0.003977  min_lr: 0.003977  loss: 3.6629 (3.4943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.6954)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [33]  [ 800/1251]  eta: 0:01:49  lr: 0.003977  min_lr: 0.003977  loss: 3.5956 (3.4767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6388 (0.6876)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [33]  [1000/1251]  eta: 0:01:00  lr: 0.003976  min_lr: 0.003976  loss: 3.4605 (3.4884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6682 (0.6911)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [33]  [1200/1251]  eta: 0:00:12  lr: 0.003976  min_lr: 0.003976  loss: 2.9374 (3.4974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6005 (0.6889)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 3.2371 (3.4919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7598 (0.6922)  time: 0.2008  data: 0.0006  max mem: 18117
Epoch: [33] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 3.2371 (3.5014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7598 (0.6922)
Test:  [ 0/25]  eta: 0:01:38  loss: 0.9099 (0.9099)  acc1: 81.2000 (81.2000)  acc5: 97.6000 (97.6000)  time: 3.9530  data: 3.7884  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 1.0710 (1.1195)  acc1: 76.0000 (76.0364)  acc5: 93.6000 (93.9273)  time: 0.6545  data: 0.5398  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3969 (1.3343)  acc1: 68.8000 (71.4476)  acc5: 90.8000 (90.9714)  time: 0.2482  data: 0.1392  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4610 (1.3450)  acc1: 68.8000 (71.2960)  acc5: 89.2000 (90.8480)  time: 0.1944  data: 0.0854  max mem: 18117
Test: Total time: 0:00:09 (0.3965 s / it)
* Acc@1 71.368 Acc@5 90.782 loss 1.334
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.40%
Epoch: [34]  [   0/1251]  eta: 1:07:18  lr: 0.003975  min_lr: 0.003975  loss: 2.6976 (2.6976)  weight_decay: 0.0500 (0.0500)  time: 3.2279  data: 1.6348  max mem: 18117
Epoch: [34]  [ 200/1251]  eta: 0:04:26  lr: 0.003975  min_lr: 0.003975  loss: 2.8488 (3.4204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6998 (0.7060)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [34]  [ 400/1251]  eta: 0:03:29  lr: 0.003974  min_lr: 0.003974  loss: 4.1160 (3.4420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7467 (0.7302)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [34]  [ 600/1251]  eta: 0:02:38  lr: 0.003974  min_lr: 0.003974  loss: 3.5754 (3.4729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.6994)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [34]  [ 800/1251]  eta: 0:01:48  lr: 0.003973  min_lr: 0.003973  loss: 3.0649 (3.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (0.7065)  time: 0.2346  data: 0.0004  max mem: 18117
Epoch: [34]  [1000/1251]  eta: 0:01:00  lr: 0.003972  min_lr: 0.003972  loss: 3.0634 (3.4655)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [34]  [1200/1251]  eta: 0:00:12  lr: 0.003972  min_lr: 0.003972  loss: 3.5300 (3.4698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6670 (nan)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 2.7758 (3.4649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6066 (nan)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [34] Total time: 0:05:00 (0.2400 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 2.7758 (3.4920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6066 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8010 (0.8010)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 5.5928  data: 5.4662  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9369 (1.0132)  acc1: 77.6000 (76.5455)  acc5: 95.2000 (94.3636)  time: 0.7713  data: 0.6582  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3400 (1.2474)  acc1: 66.8000 (71.4286)  acc5: 89.6000 (90.9333)  time: 0.2296  data: 0.1196  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4462 (1.2631)  acc1: 66.0000 (71.0400)  acc5: 88.4000 (90.9280)  time: 0.2277  data: 0.1195  max mem: 18117
Test: Total time: 0:00:10 (0.4279 s / it)
* Acc@1 71.912 Acc@5 91.054 loss 1.252
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.91%
Epoch: [35]  [   0/1251]  eta: 1:04:01  lr: 0.003972  min_lr: 0.003972  loss: 2.5354 (2.5354)  weight_decay: 0.0500 (0.0500)  time: 3.0710  data: 2.7435  max mem: 18117
Epoch: [35]  [ 200/1251]  eta: 0:04:26  lr: 0.003971  min_lr: 0.003971  loss: 3.1581 (3.4475)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [35]  [ 400/1251]  eta: 0:03:29  lr: 0.003971  min_lr: 0.003971  loss: 3.8211 (3.4560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (nan)  time: 0.2359  data: 0.0003  max mem: 18117
Epoch: [35]  [ 600/1251]  eta: 0:02:38  lr: 0.003970  min_lr: 0.003970  loss: 2.7638 (3.4533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6530 (nan)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [35]  [ 800/1251]  eta: 0:01:49  lr: 0.003969  min_lr: 0.003969  loss: 3.2501 (3.4772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5984 (nan)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [35]  [1000/1251]  eta: 0:01:00  lr: 0.003969  min_lr: 0.003969  loss: 3.0297 (3.4970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6280 (nan)  time: 0.2352  data: 0.0004  max mem: 18117
Epoch: [35]  [1200/1251]  eta: 0:00:12  lr: 0.003968  min_lr: 0.003968  loss: 3.7993 (3.5007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7741 (nan)  time: 0.2420  data: 0.0004  max mem: 18117
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 3.4374 (3.5003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8050 (nan)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [35] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 3.4374 (3.4606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8050 (nan)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.9153 (0.9153)  acc1: 81.6000 (81.6000)  acc5: 95.6000 (95.6000)  time: 3.3419  data: 3.2150  max mem: 18117
Test:  [10/25]  eta: 0:00:08  loss: 1.0354 (1.0783)  acc1: 75.6000 (76.2909)  acc5: 94.8000 (94.2909)  time: 0.5986  data: 0.4866  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3644 (1.2988)  acc1: 69.2000 (71.6571)  acc5: 90.4000 (91.0857)  time: 0.2869  data: 0.1776  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4681 (1.3123)  acc1: 68.0000 (71.4560)  acc5: 89.2000 (90.9440)  time: 0.2310  data: 0.1228  max mem: 18117
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 71.716 Acc@5 90.966 loss 1.307
Accuracy of the model on the 50000 test images: 71.7%
Max accuracy: 71.91%
Epoch: [36]  [   0/1251]  eta: 1:11:04  lr: 0.003968  min_lr: 0.003968  loss: 4.3361 (4.3361)  weight_decay: 0.0500 (0.0500)  time: 3.4089  data: 2.7948  max mem: 18117
Epoch: [36]  [ 200/1251]  eta: 0:04:27  lr: 0.003967  min_lr: 0.003967  loss: 2.8370 (3.4411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7484 (0.7015)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [36]  [ 400/1251]  eta: 0:03:29  lr: 0.003967  min_lr: 0.003967  loss: 3.6252 (3.4080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8461 (0.7402)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [36]  [ 600/1251]  eta: 0:02:38  lr: 0.003966  min_lr: 0.003966  loss: 3.5655 (3.4206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (0.7104)  time: 0.2376  data: 0.0003  max mem: 18117
Epoch: [36]  [ 800/1251]  eta: 0:01:49  lr: 0.003965  min_lr: 0.003965  loss: 3.2566 (3.4291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.7155)  time: 0.2481  data: 0.0004  max mem: 18117
Epoch: [36]  [1000/1251]  eta: 0:01:01  lr: 0.003965  min_lr: 0.003965  loss: 2.9042 (3.4428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8111 (0.7237)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [36]  [1200/1251]  eta: 0:00:12  lr: 0.003964  min_lr: 0.003964  loss: 4.1100 (3.4672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6401 (0.7183)  time: 0.2381  data: 0.0003  max mem: 18117
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 3.2366 (3.4714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6721 (0.7197)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [36] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 3.2366 (3.4653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6721 (0.7197)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8433 (0.8433)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 5.5561  data: 5.4276  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.0284 (1.0612)  acc1: 77.2000 (76.5818)  acc5: 95.6000 (94.1818)  time: 0.7168  data: 0.6031  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3640 (1.2939)  acc1: 69.2000 (71.9810)  acc5: 89.6000 (91.2000)  time: 0.1944  data: 0.0842  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4432 (1.2998)  acc1: 69.2000 (71.7280)  acc5: 88.8000 (91.1040)  time: 0.1924  data: 0.0841  max mem: 18117
Test: Total time: 0:00:09 (0.3990 s / it)
* Acc@1 71.776 Acc@5 91.032 loss 1.301
Accuracy of the model on the 50000 test images: 71.8%
Max accuracy: 71.91%
Epoch: [37]  [   0/1251]  eta: 1:10:00  lr: 0.003964  min_lr: 0.003964  loss: 3.3764 (3.3764)  weight_decay: 0.0500 (0.0500)  time: 3.3579  data: 2.9838  max mem: 18117
Epoch: [37]  [ 200/1251]  eta: 0:04:26  lr: 0.003963  min_lr: 0.003963  loss: 2.7694 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.7275)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [37]  [ 400/1251]  eta: 0:03:29  lr: 0.003962  min_lr: 0.003962  loss: 3.2173 (3.3862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7798 (0.7425)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [37]  [ 600/1251]  eta: 0:02:38  lr: 0.003962  min_lr: 0.003962  loss: 2.8295 (3.4089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8648 (0.7500)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [37]  [ 800/1251]  eta: 0:01:49  lr: 0.003961  min_lr: 0.003961  loss: 2.8413 (3.4109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5298 (0.7237)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [37]  [1000/1251]  eta: 0:01:00  lr: 0.003960  min_lr: 0.003960  loss: 2.8828 (3.4214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7498 (0.7355)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [37]  [1200/1251]  eta: 0:00:12  lr: 0.003960  min_lr: 0.003960  loss: 3.2855 (3.4336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.7366)  time: 0.2372  data: 0.0010  max mem: 18117
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 3.8275 (3.4391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5498 (0.7295)  time: 0.1963  data: 0.0006  max mem: 18117
Epoch: [37] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 3.8275 (3.4464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5498 (0.7295)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8539 (0.8539)  acc1: 85.2000 (85.2000)  acc5: 96.0000 (96.0000)  time: 5.6892  data: 5.5631  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0011 (1.0694)  acc1: 76.8000 (76.0727)  acc5: 94.8000 (94.4364)  time: 0.7456  data: 0.6339  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3051 (1.2954)  acc1: 68.4000 (71.8667)  acc5: 90.8000 (91.2381)  time: 0.1945  data: 0.0847  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4857 (1.3026)  acc1: 68.4000 (71.5040)  acc5: 89.2000 (91.2640)  time: 0.1946  data: 0.0846  max mem: 18117
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 71.990 Acc@5 91.322 loss 1.300
Accuracy of the model on the 50000 test images: 72.0%
Max accuracy: 71.99%
Epoch: [38]  [   0/1251]  eta: 1:03:39  lr: 0.003959  min_lr: 0.003959  loss: 3.7340 (3.7340)  weight_decay: 0.0500 (0.0500)  time: 3.0530  data: 2.7585  max mem: 18117
Epoch: [38]  [ 200/1251]  eta: 0:04:26  lr: 0.003959  min_lr: 0.003959  loss: 2.7728 (3.3890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7286 (0.8452)  time: 0.2373  data: 0.0005  max mem: 18117
Epoch: [38]  [ 400/1251]  eta: 0:03:30  lr: 0.003958  min_lr: 0.003958  loss: 2.7807 (3.4045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7018 (0.7760)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [38]  [ 600/1251]  eta: 0:02:39  lr: 0.003957  min_lr: 0.003957  loss: 3.5510 (3.4357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7788 (0.7618)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [38]  [ 800/1251]  eta: 0:01:49  lr: 0.003956  min_lr: 0.003956  loss: 3.7235 (3.4391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6133 (0.7502)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [38]  [1000/1251]  eta: 0:01:00  lr: 0.003956  min_lr: 0.003956  loss: 2.8952 (3.4260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7821 (0.7507)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [38]  [1200/1251]  eta: 0:00:12  lr: 0.003955  min_lr: 0.003955  loss: 3.6289 (3.4249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6616 (0.7390)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 4.0447 (3.4309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7327 (0.7433)  time: 0.1959  data: 0.0007  max mem: 18117
Epoch: [38] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 4.0447 (3.4434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7327 (0.7433)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.8637 (0.8637)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 5.8863  data: 5.7401  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0330 (1.0441)  acc1: 78.0000 (76.0364)  acc5: 96.0000 (94.8727)  time: 0.7501  data: 0.6375  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3305 (1.2587)  acc1: 69.2000 (72.3429)  acc5: 92.0000 (91.6381)  time: 0.1932  data: 0.0845  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4350 (1.2645)  acc1: 69.2000 (71.9680)  acc5: 88.8000 (91.5520)  time: 0.1926  data: 0.0845  max mem: 18117
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 72.254 Acc@5 91.414 loss 1.260
Accuracy of the model on the 50000 test images: 72.3%
Max accuracy: 72.25%
Epoch: [39]  [   0/1251]  eta: 1:02:05  lr: 0.003955  min_lr: 0.003955  loss: 2.6657 (2.6657)  weight_decay: 0.0500 (0.0500)  time: 2.9778  data: 2.6315  max mem: 18117
Epoch: [39]  [ 200/1251]  eta: 0:04:26  lr: 0.003954  min_lr: 0.003954  loss: 2.8132 (3.4713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6737 (0.7220)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [39]  [ 400/1251]  eta: 0:03:29  lr: 0.003953  min_lr: 0.003953  loss: 2.8617 (3.4452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6963 (0.7451)  time: 0.2422  data: 0.0004  max mem: 18117
Epoch: [39]  [ 600/1251]  eta: 0:02:38  lr: 0.003952  min_lr: 0.003952  loss: 3.4515 (3.4638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5228 (0.7374)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [39]  [ 800/1251]  eta: 0:01:49  lr: 0.003952  min_lr: 0.003952  loss: 2.9809 (3.4420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6255 (0.7538)  time: 0.2374  data: 0.0005  max mem: 18117
Epoch: [39]  [1000/1251]  eta: 0:01:00  lr: 0.003951  min_lr: 0.003951  loss: 2.8229 (3.4261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.7508)  time: 0.2376  data: 0.0003  max mem: 18117
Epoch: [39]  [1200/1251]  eta: 0:00:12  lr: 0.003950  min_lr: 0.003950  loss: 2.6655 (3.4105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6175 (0.7454)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 3.1430 (3.4113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.7464)  time: 0.1957  data: 0.0007  max mem: 18117
Epoch: [39] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 3.1430 (3.4232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.7464)
Test:  [ 0/25]  eta: 0:01:45  loss: 0.8546 (0.8546)  acc1: 81.6000 (81.6000)  acc5: 95.6000 (95.6000)  time: 4.2040  data: 4.0537  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9696 (1.0115)  acc1: 78.4000 (76.8727)  acc5: 95.2000 (94.4727)  time: 0.7122  data: 0.5923  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2332 (1.2227)  acc1: 70.4000 (73.0095)  acc5: 90.8000 (91.2952)  time: 0.2591  data: 0.1465  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3576 (1.2317)  acc1: 70.4000 (72.7360)  acc5: 90.4000 (91.2960)  time: 0.2454  data: 0.1339  max mem: 18117
Test: Total time: 0:00:10 (0.4003 s / it)
* Acc@1 72.466 Acc@5 91.436 loss 1.223
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.47%
Epoch: [40]  [   0/1251]  eta: 1:05:02  lr: 0.003950  min_lr: 0.003950  loss: 2.6158 (2.6158)  weight_decay: 0.0500 (0.0500)  time: 3.1198  data: 2.7876  max mem: 18117
Epoch: [40]  [ 200/1251]  eta: 0:04:26  lr: 0.003949  min_lr: 0.003949  loss: 3.4524 (3.4553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.7672)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [40]  [ 400/1251]  eta: 0:03:29  lr: 0.003948  min_lr: 0.003948  loss: 3.7458 (3.4241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5034 (0.7696)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [40]  [ 600/1251]  eta: 0:02:38  lr: 0.003947  min_lr: 0.003947  loss: 2.8979 (3.4152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6054 (0.7372)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [40]  [ 800/1251]  eta: 0:01:49  lr: 0.003947  min_lr: 0.003947  loss: 2.7784 (3.4188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.7580)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [40]  [1000/1251]  eta: 0:01:00  lr: 0.003946  min_lr: 0.003946  loss: 3.6306 (3.4302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6350 (0.7511)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [40]  [1200/1251]  eta: 0:00:12  lr: 0.003945  min_lr: 0.003945  loss: 4.2184 (3.4453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.7676)  time: 0.2395  data: 0.0005  max mem: 18117
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 3.0803 (3.4408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6630 (0.7666)  time: 0.1950  data: 0.0007  max mem: 18117
Epoch: [40] Total time: 0:05:01 (0.2407 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 3.0803 (3.4168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6630 (0.7666)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7712 (0.7712)  acc1: 82.8000 (82.8000)  acc5: 95.6000 (95.6000)  time: 5.7200  data: 5.5778  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0013 (0.9551)  acc1: 75.2000 (77.3818)  acc5: 95.6000 (94.6545)  time: 0.7532  data: 0.6373  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1294 (1.1696)  acc1: 72.0000 (73.0095)  acc5: 90.8000 (91.8667)  time: 0.2044  data: 0.0925  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3168 (1.1916)  acc1: 70.0000 (72.3200)  acc5: 90.0000 (91.7760)  time: 0.2033  data: 0.0924  max mem: 18117
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 72.814 Acc@5 91.716 loss 1.175
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 72.81%
Epoch: [41]  [   0/1251]  eta: 1:04:49  lr: 0.003945  min_lr: 0.003945  loss: 2.6237 (2.6237)  weight_decay: 0.0500 (0.0500)  time: 3.1095  data: 2.7442  max mem: 18117
Epoch: [41]  [ 200/1251]  eta: 0:04:25  lr: 0.003944  min_lr: 0.003944  loss: 4.0584 (3.3674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.7078)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [41]  [ 400/1251]  eta: 0:03:28  lr: 0.003943  min_lr: 0.003943  loss: 2.5553 (3.3700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6059 (0.7033)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [41]  [ 600/1251]  eta: 0:02:38  lr: 0.003942  min_lr: 0.003942  loss: 3.9227 (3.4148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5968 (0.7066)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [41]  [ 800/1251]  eta: 0:01:48  lr: 0.003941  min_lr: 0.003941  loss: 2.8340 (3.4102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.7431)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [41]  [1000/1251]  eta: 0:01:00  lr: 0.003940  min_lr: 0.003940  loss: 2.8068 (3.4076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6705 (0.7568)  time: 0.2421  data: 0.0005  max mem: 18117
Epoch: [41]  [1200/1251]  eta: 0:00:12  lr: 0.003940  min_lr: 0.003940  loss: 3.2096 (3.4001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.7738)  time: 0.2374  data: 0.0005  max mem: 18117
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 2.8880 (3.3972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (0.7746)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [41] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 2.8880 (3.4082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (0.7746)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8356 (0.8356)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 5.8240  data: 5.6789  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0002 (0.9935)  acc1: 76.8000 (77.0909)  acc5: 94.8000 (95.0545)  time: 0.7605  data: 0.6460  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2039 (1.2024)  acc1: 70.4000 (72.6667)  acc5: 91.6000 (92.1333)  time: 0.2054  data: 0.0956  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3761 (1.2183)  acc1: 70.0000 (72.2560)  acc5: 90.4000 (91.9680)  time: 0.2046  data: 0.0955  max mem: 18117
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 72.666 Acc@5 91.598 loss 1.219
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.81%
Epoch: [42]  [   0/1251]  eta: 1:05:48  lr: 0.003939  min_lr: 0.003939  loss: 4.7982 (4.7982)  weight_decay: 0.0500 (0.0500)  time: 3.1561  data: 2.8027  max mem: 18117
Epoch: [42]  [ 200/1251]  eta: 0:04:26  lr: 0.003939  min_lr: 0.003939  loss: 2.7387 (3.3178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5187 (0.7629)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [42]  [ 400/1251]  eta: 0:03:30  lr: 0.003938  min_lr: 0.003938  loss: 3.1726 (3.3070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7133 (0.7477)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [42]  [ 600/1251]  eta: 0:02:39  lr: 0.003937  min_lr: 0.003937  loss: 3.1775 (3.3188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.7597)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [42]  [ 800/1251]  eta: 0:01:49  lr: 0.003936  min_lr: 0.003936  loss: 3.3547 (3.3239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8801 (0.7747)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [42]  [1000/1251]  eta: 0:01:00  lr: 0.003935  min_lr: 0.003935  loss: 3.1787 (3.3519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6007 (0.7705)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [42]  [1200/1251]  eta: 0:00:12  lr: 0.003934  min_lr: 0.003934  loss: 2.9063 (3.3606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8745 (0.7762)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 2.6650 (3.3533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7944 (0.7772)  time: 0.1959  data: 0.0008  max mem: 18117
Epoch: [42] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 2.6650 (3.4111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7944 (0.7772)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.7928 (0.7928)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 4.6093  data: 4.4599  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9401 (0.9801)  acc1: 80.4000 (78.0727)  acc5: 95.2000 (94.6545)  time: 0.7137  data: 0.5978  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2102 (1.1878)  acc1: 70.8000 (73.6381)  acc5: 91.6000 (91.6952)  time: 0.2415  data: 0.1310  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3224 (1.1947)  acc1: 70.8000 (73.4240)  acc5: 89.2000 (91.6320)  time: 0.2233  data: 0.1141  max mem: 18117
Test: Total time: 0:00:10 (0.4003 s / it)
* Acc@1 73.008 Acc@5 91.836 loss 1.198
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.01%
Epoch: [43]  [   0/1251]  eta: 1:05:51  lr: 0.003934  min_lr: 0.003934  loss: 2.3988 (2.3988)  weight_decay: 0.0500 (0.0500)  time: 3.1587  data: 2.8442  max mem: 18117
Epoch: [43]  [ 200/1251]  eta: 0:04:25  lr: 0.003933  min_lr: 0.003933  loss: 3.7155 (3.3341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5790 (0.6822)  time: 0.2390  data: 0.0005  max mem: 18117
Epoch: [43]  [ 400/1251]  eta: 0:03:31  lr: 0.003932  min_lr: 0.003932  loss: 2.7422 (3.3810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8621 (0.7230)  time: 0.2474  data: 0.0004  max mem: 18117
Epoch: [43]  [ 600/1251]  eta: 0:02:39  lr: 0.003931  min_lr: 0.003931  loss: 3.9936 (3.3875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.7296)  time: 0.2474  data: 0.0003  max mem: 18117
Epoch: [43]  [ 800/1251]  eta: 0:01:50  lr: 0.003930  min_lr: 0.003930  loss: 2.9952 (3.4112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.7486)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [43]  [1000/1251]  eta: 0:01:00  lr: 0.003929  min_lr: 0.003929  loss: 3.0733 (3.4257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7377)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [43]  [1200/1251]  eta: 0:00:12  lr: 0.003928  min_lr: 0.003928  loss: 3.1161 (3.4153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6981 (0.7426)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 4.0813 (3.4173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6385 (0.7384)  time: 0.1959  data: 0.0011  max mem: 18117
Epoch: [43] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 4.0813 (3.3788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6385 (0.7384)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.9079 (0.9079)  acc1: 82.4000 (82.4000)  acc5: 96.4000 (96.4000)  time: 5.8811  data: 5.7434  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0276 (1.0509)  acc1: 78.8000 (77.0545)  acc5: 94.4000 (94.8364)  time: 0.7564  data: 0.6425  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2805 (1.2804)  acc1: 70.0000 (72.3429)  acc5: 92.0000 (91.5429)  time: 0.1995  data: 0.0866  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4237 (1.2928)  acc1: 70.0000 (72.0640)  acc5: 89.6000 (91.3600)  time: 0.2011  data: 0.0891  max mem: 18117
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 72.832 Acc@5 91.624 loss 1.281
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 73.01%
Epoch: [44]  [   0/1251]  eta: 1:11:59  lr: 0.003928  min_lr: 0.003928  loss: 4.2692 (4.2692)  weight_decay: 0.0500 (0.0500)  time: 3.4527  data: 3.0278  max mem: 18117
Epoch: [44]  [ 200/1251]  eta: 0:04:29  lr: 0.003927  min_lr: 0.003927  loss: 3.1629 (3.4901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8039 (0.7361)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [44]  [ 400/1251]  eta: 0:03:30  lr: 0.003926  min_lr: 0.003926  loss: 3.5041 (3.4423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5526 (0.7696)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [44]  [ 600/1251]  eta: 0:02:39  lr: 0.003925  min_lr: 0.003925  loss: 3.5065 (3.4412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.7581)  time: 0.2393  data: 0.0005  max mem: 18117
Epoch: [44]  [ 800/1251]  eta: 0:01:49  lr: 0.003924  min_lr: 0.003924  loss: 2.9068 (3.4107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6547 (0.7668)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [44]  [1000/1251]  eta: 0:01:00  lr: 0.003923  min_lr: 0.003923  loss: 3.1864 (3.4023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5932 (0.7486)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [44]  [1200/1251]  eta: 0:00:12  lr: 0.003922  min_lr: 0.003922  loss: 3.2751 (3.4106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5986 (0.7671)  time: 0.2413  data: 0.0004  max mem: 18117
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 2.7156 (3.4075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.7692)  time: 0.1949  data: 0.0007  max mem: 18117
Epoch: [44] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 2.7156 (3.3917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.7692)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.7827 (0.7827)  acc1: 82.0000 (82.0000)  acc5: 97.2000 (97.2000)  time: 5.9023  data: 5.7759  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9721 (0.9725)  acc1: 78.0000 (77.8545)  acc5: 94.8000 (94.6909)  time: 0.7280  data: 0.6158  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1595 (1.1920)  acc1: 71.2000 (73.3714)  acc5: 91.2000 (91.6000)  time: 0.1827  data: 0.0726  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4036 (1.2039)  acc1: 70.4000 (72.9920)  acc5: 89.2000 (91.6800)  time: 0.1884  data: 0.0794  max mem: 18117
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 73.208 Acc@5 91.840 loss 1.198
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.21%
Epoch: [45]  [   0/1251]  eta: 1:12:14  lr: 0.003922  min_lr: 0.003922  loss: 4.5778 (4.5778)  weight_decay: 0.0500 (0.0500)  time: 3.4648  data: 3.2094  max mem: 18117
Epoch: [45]  [ 200/1251]  eta: 0:04:25  lr: 0.003921  min_lr: 0.003921  loss: 3.5426 (3.4818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5918 (0.7735)  time: 0.2344  data: 0.0004  max mem: 18117
Epoch: [45]  [ 400/1251]  eta: 0:03:29  lr: 0.003920  min_lr: 0.003920  loss: 3.2746 (3.5025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.8081)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [45]  [ 600/1251]  eta: 0:02:38  lr: 0.003919  min_lr: 0.003919  loss: 2.8812 (3.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5843 (0.7818)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [45]  [ 800/1251]  eta: 0:01:49  lr: 0.003918  min_lr: 0.003918  loss: 2.7812 (3.4426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.7933)  time: 0.2344  data: 0.0003  max mem: 18117
Epoch: [45]  [1000/1251]  eta: 0:01:00  lr: 0.003917  min_lr: 0.003917  loss: 3.2955 (3.4320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8044)  time: 0.2411  data: 0.0003  max mem: 18117
Epoch: [45]  [1200/1251]  eta: 0:00:12  lr: 0.003916  min_lr: 0.003916  loss: 2.7101 (3.4116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (0.7876)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.6813 (3.4169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.7904)  time: 0.1968  data: 0.0006  max mem: 18117
Epoch: [45] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.6813 (3.3852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.7904)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8989 (0.8989)  acc1: 84.8000 (84.8000)  acc5: 96.0000 (96.0000)  time: 5.7506  data: 5.6003  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0174 (0.9989)  acc1: 78.0000 (77.7091)  acc5: 95.6000 (95.2364)  time: 0.7783  data: 0.6641  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2428 (1.2189)  acc1: 70.0000 (72.8381)  acc5: 92.8000 (92.0191)  time: 0.2153  data: 0.1059  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3682 (1.2268)  acc1: 68.0000 (72.6240)  acc5: 90.0000 (91.9360)  time: 0.2148  data: 0.1058  max mem: 18117
Test: Total time: 0:00:10 (0.4228 s / it)
* Acc@1 72.988 Acc@5 91.930 loss 1.228
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.21%
Epoch: [46]  [   0/1251]  eta: 1:02:34  lr: 0.003916  min_lr: 0.003916  loss: 2.4596 (2.4596)  weight_decay: 0.0500 (0.0500)  time: 3.0010  data: 2.5634  max mem: 18117
Epoch: [46]  [ 200/1251]  eta: 0:04:27  lr: 0.003914  min_lr: 0.003914  loss: 3.8014 (3.3923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7887 (0.8864)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [46]  [ 400/1251]  eta: 0:03:29  lr: 0.003913  min_lr: 0.003913  loss: 3.0560 (3.4007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.8149)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [46]  [ 600/1251]  eta: 0:02:39  lr: 0.003912  min_lr: 0.003912  loss: 3.0003 (3.4028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7409 (0.8129)  time: 0.2406  data: 0.0005  max mem: 18117
Epoch: [46]  [ 800/1251]  eta: 0:01:49  lr: 0.003911  min_lr: 0.003911  loss: 3.3755 (3.4007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8381 (0.8078)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [46]  [1000/1251]  eta: 0:01:00  lr: 0.003910  min_lr: 0.003910  loss: 3.2307 (3.3977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5985 (0.7904)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [46]  [1200/1251]  eta: 0:00:12  lr: 0.003909  min_lr: 0.003909  loss: 3.1188 (3.4174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (0.7874)  time: 0.2400  data: 0.0005  max mem: 18117
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 2.8383 (3.4116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (0.7801)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [46] Total time: 0:05:03 (0.2424 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 2.8383 (3.4097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (0.7801)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8535 (0.8535)  acc1: 84.4000 (84.4000)  acc5: 95.6000 (95.6000)  time: 5.8232  data: 5.6972  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0046 (0.9859)  acc1: 78.4000 (78.3273)  acc5: 94.4000 (94.7636)  time: 0.7557  data: 0.6422  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1578 (1.2049)  acc1: 70.4000 (73.5429)  acc5: 92.0000 (92.0191)  time: 0.1935  data: 0.0817  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3635 (1.2147)  acc1: 69.6000 (72.8960)  acc5: 89.6000 (91.9200)  time: 0.1926  data: 0.0816  max mem: 18117
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 72.986 Acc@5 91.870 loss 1.206
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.21%
Epoch: [47]  [   0/1251]  eta: 1:02:12  lr: 0.003909  min_lr: 0.003909  loss: 2.3430 (2.3430)  weight_decay: 0.0500 (0.0500)  time: 2.9833  data: 1.5755  max mem: 18117
Epoch: [47]  [ 200/1251]  eta: 0:04:26  lr: 0.003908  min_lr: 0.003908  loss: 4.0658 (3.2932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8844 (0.8846)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [47]  [ 400/1251]  eta: 0:03:28  lr: 0.003907  min_lr: 0.003907  loss: 2.8639 (3.3662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (0.8602)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [47]  [ 600/1251]  eta: 0:02:38  lr: 0.003906  min_lr: 0.003906  loss: 2.6607 (3.3693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.8460)  time: 0.2447  data: 0.0005  max mem: 18117
Epoch: [47]  [ 800/1251]  eta: 0:01:49  lr: 0.003905  min_lr: 0.003905  loss: 3.5111 (3.3694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.8463)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [47]  [1000/1251]  eta: 0:01:00  lr: 0.003904  min_lr: 0.003904  loss: 3.8843 (3.3845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5896 (0.8111)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [47]  [1200/1251]  eta: 0:00:12  lr: 0.003902  min_lr: 0.003902  loss: 3.9713 (3.3803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9093 (0.8194)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 3.2592 (3.3822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0048 (0.8240)  time: 0.1961  data: 0.0010  max mem: 18117
Epoch: [47] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 3.2592 (3.3744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0048 (0.8240)
Test:  [ 0/25]  eta: 0:01:45  loss: 0.8022 (0.8022)  acc1: 83.6000 (83.6000)  acc5: 96.0000 (96.0000)  time: 4.2135  data: 4.0501  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.0888 (1.0132)  acc1: 76.4000 (77.4909)  acc5: 95.2000 (94.8364)  time: 0.6769  data: 0.5610  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.3034 (1.2312)  acc1: 70.8000 (73.3143)  acc5: 90.8000 (91.6191)  time: 0.2541  data: 0.1445  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3944 (1.2402)  acc1: 70.4000 (72.9120)  acc5: 89.6000 (91.4720)  time: 0.2386  data: 0.1291  max mem: 18117
Test: Total time: 0:00:10 (0.4251 s / it)
* Acc@1 73.472 Acc@5 91.960 loss 1.229
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.47%
Epoch: [48]  [   0/1251]  eta: 1:01:11  lr: 0.003902  min_lr: 0.003902  loss: 3.9975 (3.9975)  weight_decay: 0.0500 (0.0500)  time: 2.9352  data: 2.6157  max mem: 18117
Epoch: [48]  [ 200/1251]  eta: 0:04:24  lr: 0.003901  min_lr: 0.003901  loss: 3.5255 (3.3731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8806 (0.9283)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [48]  [ 400/1251]  eta: 0:03:28  lr: 0.003900  min_lr: 0.003900  loss: 3.9073 (3.4101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6336 (0.7930)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [48]  [ 600/1251]  eta: 0:02:38  lr: 0.003899  min_lr: 0.003899  loss: 3.7030 (3.4200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.7965)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [48]  [ 800/1251]  eta: 0:01:49  lr: 0.003898  min_lr: 0.003898  loss: 3.7014 (3.4118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6351 (0.8063)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [48]  [1000/1251]  eta: 0:01:00  lr: 0.003897  min_lr: 0.003897  loss: 3.2634 (3.3967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6834 (0.8017)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [48]  [1200/1251]  eta: 0:00:12  lr: 0.003895  min_lr: 0.003895  loss: 4.0253 (3.4076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7173 (0.8039)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 2.7104 (3.4065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.8022)  time: 0.1965  data: 0.0014  max mem: 18117
Epoch: [48] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 2.7104 (3.3635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.8022)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7800 (0.7800)  acc1: 83.6000 (83.6000)  acc5: 97.2000 (97.2000)  time: 5.8030  data: 5.6765  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8936 (0.9622)  acc1: 80.0000 (77.4909)  acc5: 95.6000 (95.2364)  time: 0.7360  data: 0.6233  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2002 (1.1731)  acc1: 69.6000 (73.5048)  acc5: 92.4000 (92.4381)  time: 0.1925  data: 0.0819  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3502 (1.1924)  acc1: 69.6000 (73.0880)  acc5: 90.0000 (92.0800)  time: 0.1930  data: 0.0818  max mem: 18117
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 73.616 Acc@5 92.058 loss 1.189
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.62%
Epoch: [49]  [   0/1251]  eta: 1:05:59  lr: 0.003895  min_lr: 0.003895  loss: 3.2078 (3.2078)  weight_decay: 0.0500 (0.0500)  time: 3.1654  data: 2.8320  max mem: 18117
Epoch: [49]  [ 200/1251]  eta: 0:04:25  lr: 0.003894  min_lr: 0.003894  loss: 2.7037 (3.2889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5038 (0.8051)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [49]  [ 400/1251]  eta: 0:03:28  lr: 0.003893  min_lr: 0.003893  loss: 3.5042 (3.3225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8491)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [49]  [ 600/1251]  eta: 0:02:38  lr: 0.003892  min_lr: 0.003892  loss: 3.1925 (3.3295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (0.8303)  time: 0.2416  data: 0.0005  max mem: 18117
Epoch: [49]  [ 800/1251]  eta: 0:01:49  lr: 0.003890  min_lr: 0.003890  loss: 3.1647 (3.3384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6781 (nan)  time: 0.2403  data: 0.0005  max mem: 18117
Epoch: [49]  [1000/1251]  eta: 0:01:00  lr: 0.003889  min_lr: 0.003889  loss: 2.9725 (3.3453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5556 (nan)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [49]  [1200/1251]  eta: 0:00:12  lr: 0.003888  min_lr: 0.003888  loss: 3.5878 (3.3492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (nan)  time: 0.2553  data: 0.0003  max mem: 18117
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 2.7795 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (nan)  time: 0.1950  data: 0.0006  max mem: 18117
Epoch: [49] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 2.7795 (3.3487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (nan)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7763 (0.7763)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.2271  data: 5.0664  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9758 (0.9855)  acc1: 78.4000 (77.8545)  acc5: 95.6000 (95.0909)  time: 0.7175  data: 0.6021  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2085 (1.1961)  acc1: 70.8000 (73.5238)  acc5: 91.6000 (92.3810)  time: 0.2142  data: 0.1046  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2998 (1.2076)  acc1: 70.8000 (73.3120)  acc5: 89.6000 (92.1440)  time: 0.2109  data: 0.1027  max mem: 18117
Test: Total time: 0:00:10 (0.4009 s / it)
* Acc@1 73.412 Acc@5 92.170 loss 1.197
Accuracy of the model on the 50000 test images: 73.4%
Max accuracy: 73.62%
Epoch: [50]  [   0/1251]  eta: 1:10:54  lr: 0.003888  min_lr: 0.003888  loss: 2.3249 (2.3249)  weight_decay: 0.0500 (0.0500)  time: 3.4006  data: 2.7067  max mem: 18117
Epoch: [50]  [ 200/1251]  eta: 0:04:29  lr: 0.003887  min_lr: 0.003887  loss: 3.3851 (3.3832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.6898)  time: 0.2378  data: 0.0006  max mem: 18117
Epoch: [50]  [ 400/1251]  eta: 0:03:30  lr: 0.003885  min_lr: 0.003885  loss: 3.2203 (3.3870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0226 (0.7933)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [50]  [ 600/1251]  eta: 0:02:39  lr: 0.003884  min_lr: 0.003884  loss: 3.8633 (3.3999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6156 (0.7735)  time: 0.2383  data: 0.0005  max mem: 18117
Epoch: [50]  [ 800/1251]  eta: 0:01:49  lr: 0.003883  min_lr: 0.003883  loss: 3.5272 (3.3877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9326 (0.8063)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [50]  [1000/1251]  eta: 0:01:00  lr: 0.003882  min_lr: 0.003882  loss: 2.8398 (3.3789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6035 (0.7947)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [50]  [1200/1251]  eta: 0:00:12  lr: 0.003881  min_lr: 0.003881  loss: 2.6020 (3.3758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8607 (0.8082)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 3.0239 (3.3727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (0.8026)  time: 0.1968  data: 0.0008  max mem: 18117
Epoch: [50] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 3.0239 (3.3617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (0.8026)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7952 (0.7952)  acc1: 85.6000 (85.6000)  acc5: 95.6000 (95.6000)  time: 5.7335  data: 5.5906  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9282 (0.9620)  acc1: 79.2000 (78.4727)  acc5: 95.6000 (95.2364)  time: 0.7052  data: 0.5911  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1808 (1.1740)  acc1: 72.0000 (73.7714)  acc5: 92.4000 (92.4952)  time: 0.1817  data: 0.0714  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3286 (1.1879)  acc1: 70.8000 (73.5200)  acc5: 90.4000 (92.4160)  time: 0.1848  data: 0.0745  max mem: 18117
Test: Total time: 0:00:09 (0.3991 s / it)
* Acc@1 74.010 Acc@5 92.380 loss 1.180
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.01%
Epoch: [51]  [   0/1251]  eta: 0:58:05  lr: 0.003880  min_lr: 0.003880  loss: 3.6587 (3.6587)  weight_decay: 0.0500 (0.0500)  time: 2.7860  data: 2.4695  max mem: 18117
Epoch: [51]  [ 200/1251]  eta: 0:04:24  lr: 0.003879  min_lr: 0.003879  loss: 3.8691 (3.2880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7337 (0.8622)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [51]  [ 400/1251]  eta: 0:03:29  lr: 0.003878  min_lr: 0.003878  loss: 2.8264 (3.3048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6958 (0.8051)  time: 0.2432  data: 0.0005  max mem: 18117
Epoch: [51]  [ 600/1251]  eta: 0:02:38  lr: 0.003877  min_lr: 0.003877  loss: 3.7529 (3.3416)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2380  data: 0.0003  max mem: 18117
Epoch: [51]  [ 800/1251]  eta: 0:01:49  lr: 0.003875  min_lr: 0.003875  loss: 2.5533 (3.3335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (nan)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [51]  [1000/1251]  eta: 0:01:00  lr: 0.003874  min_lr: 0.003874  loss: 2.7946 (3.3109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7007 (nan)  time: 0.2378  data: 0.0005  max mem: 18117
Epoch: [51]  [1200/1251]  eta: 0:00:12  lr: 0.003873  min_lr: 0.003873  loss: 3.0299 (3.3151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (nan)  time: 0.2370  data: 0.0003  max mem: 18117
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 3.1080 (3.3136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (nan)  time: 0.1955  data: 0.0008  max mem: 18117
Epoch: [51] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 3.1080 (3.3534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (nan)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8411 (0.8411)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.8167  data: 5.6727  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0007 (1.0012)  acc1: 78.4000 (78.4727)  acc5: 95.2000 (94.8727)  time: 0.7636  data: 0.6499  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2172 (1.2142)  acc1: 72.8000 (73.9619)  acc5: 91.2000 (92.2286)  time: 0.2067  data: 0.0973  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.4179 (1.2261)  acc1: 70.4000 (73.4720)  acc5: 90.4000 (92.0480)  time: 0.2063  data: 0.0972  max mem: 18117
Test: Total time: 0:00:10 (0.4185 s / it)
* Acc@1 73.570 Acc@5 92.098 loss 1.224
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 74.01%
Epoch: [52]  [   0/1251]  eta: 1:08:03  lr: 0.003873  min_lr: 0.003873  loss: 4.8025 (4.8025)  weight_decay: 0.0500 (0.0500)  time: 3.2644  data: 1.8139  max mem: 18117
Epoch: [52]  [ 200/1251]  eta: 0:04:27  lr: 0.003871  min_lr: 0.003871  loss: 3.1717 (3.4079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7745 (0.8910)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [52]  [ 400/1251]  eta: 0:03:29  lr: 0.003870  min_lr: 0.003870  loss: 3.0790 (3.3821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6152 (0.7906)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [52]  [ 600/1251]  eta: 0:02:38  lr: 0.003869  min_lr: 0.003869  loss: 2.8332 (3.3411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7886 (0.7822)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [52]  [ 800/1251]  eta: 0:01:49  lr: 0.003867  min_lr: 0.003867  loss: 2.7522 (3.3180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (0.7948)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [52]  [1000/1251]  eta: 0:01:00  lr: 0.003866  min_lr: 0.003866  loss: 3.3573 (3.3343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9350 (0.8088)  time: 0.2369  data: 0.0005  max mem: 18117
Epoch: [52]  [1200/1251]  eta: 0:00:12  lr: 0.003865  min_lr: 0.003865  loss: 2.9103 (3.3434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5584 (0.8015)  time: 0.2418  data: 0.0005  max mem: 18117
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 2.7470 (3.3369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6070 (0.8034)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [52] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 2.7470 (3.3483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6070 (0.8034)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7685 (0.7685)  acc1: 84.0000 (84.0000)  acc5: 95.6000 (95.6000)  time: 5.7877  data: 5.6407  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9403 (0.9358)  acc1: 80.0000 (78.7273)  acc5: 95.6000 (95.0909)  time: 0.7304  data: 0.6162  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1853 (1.1571)  acc1: 71.6000 (74.0952)  acc5: 91.6000 (92.0381)  time: 0.1912  data: 0.0817  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3084 (1.1603)  acc1: 70.8000 (73.8400)  acc5: 90.0000 (92.0000)  time: 0.2085  data: 0.0992  max mem: 18117
Test: Total time: 0:00:10 (0.4191 s / it)
* Acc@1 74.048 Acc@5 92.184 loss 1.153
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.05%
Epoch: [53]  [   0/1251]  eta: 1:00:21  lr: 0.003865  min_lr: 0.003865  loss: 3.4999 (3.4999)  weight_decay: 0.0500 (0.0500)  time: 2.8946  data: 2.5697  max mem: 18117
Epoch: [53]  [ 200/1251]  eta: 0:04:25  lr: 0.003863  min_lr: 0.003863  loss: 2.9671 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6628 (0.8223)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [53]  [ 400/1251]  eta: 0:03:30  lr: 0.003862  min_lr: 0.003862  loss: 3.7085 (3.3037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.7980)  time: 0.2441  data: 0.0004  max mem: 18117
Epoch: [53]  [ 600/1251]  eta: 0:02:39  lr: 0.003861  min_lr: 0.003861  loss: 2.9234 (3.2918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7387 (0.7977)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [53]  [ 800/1251]  eta: 0:01:49  lr: 0.003859  min_lr: 0.003859  loss: 2.9009 (3.3130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6146 (0.7981)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [53]  [1000/1251]  eta: 0:01:00  lr: 0.003858  min_lr: 0.003858  loss: 2.7864 (3.2990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7270 (0.8216)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [53]  [1200/1251]  eta: 0:00:12  lr: 0.003857  min_lr: 0.003857  loss: 2.8939 (3.3139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6641 (0.8081)  time: 0.2389  data: 0.0003  max mem: 18117
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 2.8224 (3.3103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.8051)  time: 0.1960  data: 0.0006  max mem: 18117
Epoch: [53] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 2.8224 (3.3207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.8051)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8815 (0.8815)  acc1: 83.2000 (83.2000)  acc5: 95.6000 (95.6000)  time: 5.6896  data: 5.5631  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9453 (0.9952)  acc1: 79.2000 (78.5091)  acc5: 95.2000 (94.8364)  time: 0.7379  data: 0.6248  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2207 (1.2072)  acc1: 71.6000 (74.0000)  acc5: 92.0000 (92.2476)  time: 0.2044  data: 0.0945  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3901 (1.2159)  acc1: 69.6000 (73.7920)  acc5: 90.4000 (92.1440)  time: 0.2030  data: 0.0948  max mem: 18117
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 73.842 Acc@5 92.236 loss 1.209
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 74.05%
Epoch: [54]  [   0/1251]  eta: 1:06:38  lr: 0.003856  min_lr: 0.003856  loss: 2.9662 (2.9662)  weight_decay: 0.0500 (0.0500)  time: 3.1966  data: 2.4782  max mem: 18117
Epoch: [54]  [ 200/1251]  eta: 0:04:25  lr: 0.003855  min_lr: 0.003855  loss: 3.6955 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7965 (0.8564)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [54]  [ 400/1251]  eta: 0:03:28  lr: 0.003854  min_lr: 0.003854  loss: 2.7268 (3.2487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.8294)  time: 0.2446  data: 0.0004  max mem: 18117
Epoch: [54]  [ 600/1251]  eta: 0:02:38  lr: 0.003852  min_lr: 0.003852  loss: 2.7730 (3.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6926 (0.8134)  time: 0.2484  data: 0.0004  max mem: 18117
Epoch: [54]  [ 800/1251]  eta: 0:01:49  lr: 0.003851  min_lr: 0.003851  loss: 3.3604 (3.3089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5302 (0.7950)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [54]  [1000/1251]  eta: 0:01:00  lr: 0.003849  min_lr: 0.003849  loss: 4.1237 (3.3427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (0.8176)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [54]  [1200/1251]  eta: 0:00:12  lr: 0.003848  min_lr: 0.003848  loss: 3.0723 (3.3457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.8022)  time: 0.2420  data: 0.0004  max mem: 18117
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 3.6858 (3.3461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.8024)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [54] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 3.6858 (3.3303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.8024)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.9106 (0.9106)  acc1: 83.6000 (83.6000)  acc5: 97.6000 (97.6000)  time: 3.3561  data: 3.1975  max mem: 18117
Test:  [10/25]  eta: 0:00:08  loss: 1.0758 (1.0698)  acc1: 78.0000 (77.9636)  acc5: 95.6000 (95.1273)  time: 0.5801  data: 0.4642  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2614 (1.2497)  acc1: 71.6000 (73.6952)  acc5: 91.2000 (92.3429)  time: 0.2836  data: 0.1737  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3886 (1.2601)  acc1: 70.8000 (73.4880)  acc5: 90.0000 (92.1760)  time: 0.2351  data: 0.1254  max mem: 18117
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 73.788 Acc@5 92.368 loss 1.251
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 74.05%
Epoch: [55]  [   0/1251]  eta: 1:06:47  lr: 0.003848  min_lr: 0.003848  loss: 2.6268 (2.6268)  weight_decay: 0.0500 (0.0500)  time: 3.2036  data: 2.8013  max mem: 18117
Epoch: [55]  [ 200/1251]  eta: 0:04:27  lr: 0.003846  min_lr: 0.003846  loss: 2.8152 (3.3116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6476 (0.7587)  time: 0.2388  data: 0.0005  max mem: 18117
Epoch: [55]  [ 400/1251]  eta: 0:03:29  lr: 0.003845  min_lr: 0.003845  loss: 3.3477 (3.2563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6052 (0.7652)  time: 0.2338  data: 0.0004  max mem: 18117
Epoch: [55]  [ 600/1251]  eta: 0:02:38  lr: 0.003844  min_lr: 0.003844  loss: 3.8038 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6257 (0.7779)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [55]  [ 800/1251]  eta: 0:01:49  lr: 0.003842  min_lr: 0.003842  loss: 3.6212 (3.2849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6291 (0.7909)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [55]  [1000/1251]  eta: 0:01:00  lr: 0.003841  min_lr: 0.003841  loss: 3.6174 (3.2934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6248 (0.7849)  time: 0.2430  data: 0.0005  max mem: 18117
Epoch: [55]  [1200/1251]  eta: 0:00:12  lr: 0.003839  min_lr: 0.003839  loss: 3.6227 (3.2959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (0.7859)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 2.7147 (3.2896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.7833)  time: 0.1951  data: 0.0007  max mem: 18117
Epoch: [55] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 2.7147 (3.3180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.7833)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.8009 (0.8009)  acc1: 81.6000 (81.6000)  acc5: 96.8000 (96.8000)  time: 5.2914  data: 5.1416  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9956 (0.9718)  acc1: 78.8000 (77.6364)  acc5: 95.6000 (95.3091)  time: 0.6990  data: 0.5843  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1327 (1.1820)  acc1: 71.2000 (73.5048)  acc5: 92.0000 (92.4381)  time: 0.2056  data: 0.0959  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3370 (1.1889)  acc1: 71.2000 (73.1200)  acc5: 90.8000 (92.3680)  time: 0.2147  data: 0.1035  max mem: 18117
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 73.774 Acc@5 92.310 loss 1.179
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 74.05%
Epoch: [56]  [   0/1251]  eta: 1:09:52  lr: 0.003839  min_lr: 0.003839  loss: 2.6544 (2.6544)  weight_decay: 0.0500 (0.0500)  time: 3.3511  data: 2.2073  max mem: 18117
Epoch: [56]  [ 200/1251]  eta: 0:04:27  lr: 0.003838  min_lr: 0.003838  loss: 3.3695 (3.3213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8660 (0.8928)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [56]  [ 400/1251]  eta: 0:03:29  lr: 0.003836  min_lr: 0.003836  loss: 2.6316 (3.3119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5966 (0.8637)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [56]  [ 600/1251]  eta: 0:02:38  lr: 0.003835  min_lr: 0.003835  loss: 3.5543 (3.2879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6208 (0.8106)  time: 0.2408  data: 0.0004  max mem: 18117
Epoch: [56]  [ 800/1251]  eta: 0:01:49  lr: 0.003833  min_lr: 0.003833  loss: 3.6708 (3.3256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.8229)  time: 0.2483  data: 0.0004  max mem: 18117
Epoch: [56]  [1000/1251]  eta: 0:01:00  lr: 0.003832  min_lr: 0.003832  loss: 3.3527 (3.3431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7703 (0.8191)  time: 0.2457  data: 0.0004  max mem: 18117
Epoch: [56]  [1200/1251]  eta: 0:00:12  lr: 0.003831  min_lr: 0.003831  loss: 2.8165 (3.3536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7238 (0.8295)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 2.6953 (3.3447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.8262)  time: 0.2021  data: 0.0007  max mem: 18117
Epoch: [56] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 2.6953 (3.3253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.8262)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7481 (0.7481)  acc1: 84.0000 (84.0000)  acc5: 96.0000 (96.0000)  time: 5.6575  data: 5.5318  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9889 (0.9320)  acc1: 75.6000 (77.9636)  acc5: 96.0000 (95.1636)  time: 0.7539  data: 0.6429  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1896 (1.1440)  acc1: 72.0000 (73.9048)  acc5: 91.2000 (91.9810)  time: 0.2065  data: 0.0977  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2790 (1.1554)  acc1: 72.0000 (73.5680)  acc5: 90.8000 (91.9040)  time: 0.2057  data: 0.0977  max mem: 18117
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 73.972 Acc@5 92.458 loss 1.138
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.05%
Epoch: [57]  [   0/1251]  eta: 1:10:05  lr: 0.003830  min_lr: 0.003830  loss: 2.5180 (2.5180)  weight_decay: 0.0500 (0.0500)  time: 3.3617  data: 2.3218  max mem: 18117
Epoch: [57]  [ 200/1251]  eta: 0:04:27  lr: 0.003829  min_lr: 0.003829  loss: 3.5580 (3.3335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.7749)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [57]  [ 400/1251]  eta: 0:03:29  lr: 0.003827  min_lr: 0.003827  loss: 3.6799 (3.3312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8841 (0.8185)  time: 0.2378  data: 0.0003  max mem: 18117
Epoch: [57]  [ 600/1251]  eta: 0:02:38  lr: 0.003826  min_lr: 0.003826  loss: 2.8178 (3.3232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (0.8166)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [57]  [ 800/1251]  eta: 0:01:49  lr: 0.003824  min_lr: 0.003824  loss: 2.6909 (3.3162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6724 (0.8293)  time: 0.2449  data: 0.0004  max mem: 18117
Epoch: [57]  [1000/1251]  eta: 0:01:00  lr: 0.003823  min_lr: 0.003823  loss: 3.8651 (3.3282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7034 (0.8341)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [57]  [1200/1251]  eta: 0:00:12  lr: 0.003821  min_lr: 0.003821  loss: 3.0470 (3.3242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6065 (0.8055)  time: 0.2433  data: 0.0004  max mem: 18117
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 2.6649 (3.3271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6068 (0.8027)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [57] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 2.6649 (3.3205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6068 (0.8027)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7594 (0.7594)  acc1: 82.4000 (82.4000)  acc5: 96.0000 (96.0000)  time: 5.6834  data: 5.5573  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9332 (0.9202)  acc1: 78.4000 (78.0727)  acc5: 95.6000 (95.1636)  time: 0.7664  data: 0.6542  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1555 (1.1219)  acc1: 72.0000 (74.4000)  acc5: 92.0000 (92.2857)  time: 0.2090  data: 0.0996  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2269 (1.1323)  acc1: 71.6000 (74.1280)  acc5: 90.8000 (92.2560)  time: 0.2082  data: 0.1002  max mem: 18117
Test: Total time: 0:00:10 (0.4158 s / it)
* Acc@1 74.432 Acc@5 92.446 loss 1.122
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.43%
Epoch: [58]  [   0/1251]  eta: 1:02:44  lr: 0.003821  min_lr: 0.003821  loss: 2.2361 (2.2361)  weight_decay: 0.0500 (0.0500)  time: 3.0090  data: 2.7129  max mem: 18117
Epoch: [58]  [ 200/1251]  eta: 0:04:24  lr: 0.003820  min_lr: 0.003820  loss: 3.0769 (3.1641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5779 (0.7369)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [58]  [ 400/1251]  eta: 0:03:28  lr: 0.003818  min_lr: 0.003818  loss: 3.3776 (3.2639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6213 (0.6983)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [58]  [ 600/1251]  eta: 0:02:37  lr: 0.003817  min_lr: 0.003817  loss: 3.4796 (3.2840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6515 (0.7531)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [58]  [ 800/1251]  eta: 0:01:48  lr: 0.003815  min_lr: 0.003815  loss: 2.9266 (3.3079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9530 (0.7611)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [58]  [1000/1251]  eta: 0:01:00  lr: 0.003813  min_lr: 0.003813  loss: 3.5024 (3.3220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.7522)  time: 0.2355  data: 0.0004  max mem: 18117
Epoch: [58]  [1200/1251]  eta: 0:00:12  lr: 0.003812  min_lr: 0.003812  loss: 3.0058 (3.3143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5972 (0.7556)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 3.6947 (3.3167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6548 (0.7580)  time: 0.1952  data: 0.0005  max mem: 18117
Epoch: [58] Total time: 0:05:01 (0.2406 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 3.6947 (3.3135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6548 (0.7580)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7643 (0.7643)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 5.5387  data: 5.3924  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9594 (0.9481)  acc1: 78.0000 (78.2545)  acc5: 96.0000 (95.2364)  time: 0.7545  data: 0.6408  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1493 (1.1538)  acc1: 70.8000 (74.0191)  acc5: 92.0000 (92.6476)  time: 0.2340  data: 0.1244  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2984 (1.1723)  acc1: 70.8000 (73.7440)  acc5: 90.8000 (92.4320)  time: 0.2338  data: 0.1243  max mem: 18117
Test: Total time: 0:00:10 (0.4294 s / it)
* Acc@1 74.208 Acc@5 92.520 loss 1.159
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.43%
Epoch: [59]  [   0/1251]  eta: 1:05:40  lr: 0.003812  min_lr: 0.003812  loss: 3.2652 (3.2652)  weight_decay: 0.0500 (0.0500)  time: 3.1496  data: 2.5161  max mem: 18117
Epoch: [59]  [ 200/1251]  eta: 0:04:27  lr: 0.003810  min_lr: 0.003810  loss: 3.7387 (3.4037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6273 (0.7422)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [59]  [ 400/1251]  eta: 0:03:29  lr: 0.003809  min_lr: 0.003809  loss: 3.4318 (3.3531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8408 (0.7962)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [59]  [ 600/1251]  eta: 0:02:38  lr: 0.003807  min_lr: 0.003807  loss: 3.1828 (3.3569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6648 (0.7805)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [59]  [ 800/1251]  eta: 0:01:49  lr: 0.003805  min_lr: 0.003805  loss: 3.5504 (3.3271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (0.7942)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [59]  [1000/1251]  eta: 0:01:00  lr: 0.003804  min_lr: 0.003804  loss: 3.4660 (3.3329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7514 (0.8047)  time: 0.2406  data: 0.0004  max mem: 18117
Epoch: [59]  [1200/1251]  eta: 0:00:12  lr: 0.003802  min_lr: 0.003802  loss: 2.9957 (3.3232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6124 (0.8001)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 3.0987 (3.3236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6384 (0.7946)  time: 0.2007  data: 0.0006  max mem: 18117
Epoch: [59] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 3.0987 (3.3128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6384 (0.7946)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7630 (0.7630)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 5.5259  data: 5.4014  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9532 (0.9135)  acc1: 78.0000 (79.1636)  acc5: 96.0000 (95.4909)  time: 0.7574  data: 0.6446  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0804 (1.1257)  acc1: 72.4000 (74.3810)  acc5: 93.2000 (92.6286)  time: 0.2176  data: 0.1077  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3082 (1.1436)  acc1: 72.0000 (73.9520)  acc5: 90.8000 (92.4960)  time: 0.2159  data: 0.1076  max mem: 18117
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 74.376 Acc@5 92.570 loss 1.133
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.43%
Epoch: [60]  [   0/1251]  eta: 1:09:23  lr: 0.003802  min_lr: 0.003802  loss: 4.7685 (4.7685)  weight_decay: 0.0500 (0.0500)  time: 3.3280  data: 1.8855  max mem: 18117
Epoch: [60]  [ 200/1251]  eta: 0:04:31  lr: 0.003800  min_lr: 0.003800  loss: 2.9023 (3.2108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (0.7908)  time: 0.2348  data: 0.0004  max mem: 18117
Epoch: [60]  [ 400/1251]  eta: 0:03:31  lr: 0.003799  min_lr: 0.003799  loss: 2.7101 (3.2660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.7234)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [60]  [ 600/1251]  eta: 0:02:39  lr: 0.003797  min_lr: 0.003797  loss: 2.7272 (3.2897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6890 (0.7383)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [60]  [ 800/1251]  eta: 0:01:49  lr: 0.003796  min_lr: 0.003796  loss: 3.0484 (3.2867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (0.7563)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [60]  [1000/1251]  eta: 0:01:00  lr: 0.003794  min_lr: 0.003794  loss: 2.8662 (3.2924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.7589)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [60]  [1200/1251]  eta: 0:00:12  lr: 0.003793  min_lr: 0.003793  loss: 3.3423 (3.2980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5572 (0.7627)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 3.4685 (3.3019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8444 (0.7670)  time: 0.1968  data: 0.0010  max mem: 18117
Epoch: [60] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 3.4685 (3.3020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8444 (0.7670)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8476 (0.8476)  acc1: 84.0000 (84.0000)  acc5: 95.6000 (95.6000)  time: 5.6475  data: 5.5250  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.0266 (1.0005)  acc1: 77.6000 (78.2545)  acc5: 95.6000 (95.5273)  time: 0.7131  data: 0.6007  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2023 (1.1838)  acc1: 71.6000 (74.6667)  acc5: 91.6000 (92.6857)  time: 0.2064  data: 0.0967  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3377 (1.1975)  acc1: 71.6000 (74.1440)  acc5: 90.8000 (92.3520)  time: 0.2080  data: 0.0984  max mem: 18117
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 74.260 Acc@5 92.528 loss 1.188
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.43%
Epoch: [61]  [   0/1251]  eta: 1:06:55  lr: 0.003792  min_lr: 0.003792  loss: 3.9709 (3.9709)  weight_decay: 0.0500 (0.0500)  time: 3.2100  data: 1.6426  max mem: 18117
Epoch: [61]  [ 200/1251]  eta: 0:04:27  lr: 0.003791  min_lr: 0.003791  loss: 2.7116 (3.2506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6784 (0.7997)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [61]  [ 400/1251]  eta: 0:03:29  lr: 0.003789  min_lr: 0.003789  loss: 3.8489 (3.3277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.8333)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [61]  [ 600/1251]  eta: 0:02:38  lr: 0.003787  min_lr: 0.003787  loss: 2.9560 (3.3364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6002 (0.8244)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [61]  [ 800/1251]  eta: 0:01:49  lr: 0.003786  min_lr: 0.003786  loss: 2.9740 (3.3452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (0.8192)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [61]  [1000/1251]  eta: 0:01:00  lr: 0.003784  min_lr: 0.003784  loss: 3.1818 (3.3547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6896 (0.8414)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [61]  [1200/1251]  eta: 0:00:12  lr: 0.003782  min_lr: 0.003782  loss: 3.9334 (3.3438)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 3.0950 (3.3393)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1963  data: 0.0006  max mem: 18117
Epoch: [61] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 3.0950 (3.3161)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7381 (0.7381)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.4276  data: 5.2989  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9841 (0.9602)  acc1: 78.0000 (79.0182)  acc5: 96.0000 (95.0545)  time: 0.7259  data: 0.6134  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2343 (1.1750)  acc1: 72.0000 (74.5143)  acc5: 90.4000 (92.3619)  time: 0.2139  data: 0.1039  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3212 (1.1833)  acc1: 72.0000 (74.4320)  acc5: 89.6000 (92.2880)  time: 0.2081  data: 0.0984  max mem: 18117
Test: Total time: 0:00:10 (0.4095 s / it)
* Acc@1 74.306 Acc@5 92.426 loss 1.176
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.43%
Epoch: [62]  [   0/1251]  eta: 1:06:09  lr: 0.003782  min_lr: 0.003782  loss: 2.5486 (2.5486)  weight_decay: 0.0500 (0.0500)  time: 3.1727  data: 2.5300  max mem: 18117
Epoch: [62]  [ 200/1251]  eta: 0:04:27  lr: 0.003780  min_lr: 0.003780  loss: 3.3303 (3.3209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6918 (0.7762)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [62]  [ 400/1251]  eta: 0:03:30  lr: 0.003779  min_lr: 0.003779  loss: 3.5780 (3.3401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (0.7449)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [62]  [ 600/1251]  eta: 0:02:38  lr: 0.003777  min_lr: 0.003777  loss: 3.1679 (3.3296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5402 (0.7193)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [62]  [ 800/1251]  eta: 0:01:49  lr: 0.003775  min_lr: 0.003775  loss: 3.0091 (3.3023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (0.7155)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [62]  [1000/1251]  eta: 0:01:00  lr: 0.003774  min_lr: 0.003774  loss: 2.9150 (3.2909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6447 (0.7283)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [62]  [1200/1251]  eta: 0:00:12  lr: 0.003772  min_lr: 0.003772  loss: 3.7486 (3.3044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7238 (0.7440)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 2.9244 (3.3039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (0.7449)  time: 0.1964  data: 0.0007  max mem: 18117
Epoch: [62] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 2.9244 (3.2897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (0.7449)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7826 (0.7826)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.5515  data: 5.3969  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0407 (1.0141)  acc1: 78.4000 (78.2909)  acc5: 95.6000 (95.1273)  time: 0.7686  data: 0.6518  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1629 (1.2066)  acc1: 70.8000 (73.7714)  acc5: 91.6000 (92.4000)  time: 0.2248  data: 0.1140  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3733 (1.2168)  acc1: 70.0000 (73.4880)  acc5: 91.2000 (92.3200)  time: 0.2238  data: 0.1139  max mem: 18117
Test: Total time: 0:00:10 (0.4225 s / it)
* Acc@1 73.930 Acc@5 92.412 loss 1.209
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 74.43%
Epoch: [63]  [   0/1251]  eta: 1:10:06  lr: 0.003772  min_lr: 0.003772  loss: 4.2468 (4.2468)  weight_decay: 0.0500 (0.0500)  time: 3.3624  data: 2.6965  max mem: 18117
Epoch: [63]  [ 200/1251]  eta: 0:04:28  lr: 0.003770  min_lr: 0.003770  loss: 3.4870 (3.2583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6522 (0.7118)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [63]  [ 400/1251]  eta: 0:03:31  lr: 0.003768  min_lr: 0.003768  loss: 3.3252 (3.2734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6152 (0.6936)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [63]  [ 600/1251]  eta: 0:02:40  lr: 0.003767  min_lr: 0.003767  loss: 2.7909 (3.2748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6565 (0.7352)  time: 0.2483  data: 0.0004  max mem: 18117
Epoch: [63]  [ 800/1251]  eta: 0:01:50  lr: 0.003765  min_lr: 0.003765  loss: 2.9915 (3.2628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6599 (0.7404)  time: 0.2413  data: 0.0005  max mem: 18117
Epoch: [63]  [1000/1251]  eta: 0:01:01  lr: 0.003763  min_lr: 0.003763  loss: 2.5494 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.7638)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [63]  [1200/1251]  eta: 0:00:12  lr: 0.003762  min_lr: 0.003762  loss: 3.6257 (3.2834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7421 (0.7802)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 3.0110 (3.2853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5431 (0.7732)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [63] Total time: 0:05:03 (0.2426 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 3.0110 (3.2914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5431 (0.7732)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8134 (0.8134)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.4858  data: 5.3539  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0418 (0.9961)  acc1: 79.2000 (78.1818)  acc5: 95.6000 (95.3091)  time: 0.7686  data: 0.6548  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.2369 (1.1775)  acc1: 71.2000 (74.1143)  acc5: 92.4000 (92.8952)  time: 0.2143  data: 0.1037  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3003 (1.1917)  acc1: 71.2000 (73.8080)  acc5: 91.2000 (92.5600)  time: 0.2131  data: 0.1036  max mem: 18117
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 74.300 Acc@5 92.484 loss 1.181
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.43%
Epoch: [64]  [   0/1251]  eta: 1:10:32  lr: 0.003761  min_lr: 0.003761  loss: 3.8615 (3.8615)  weight_decay: 0.0500 (0.0500)  time: 3.3836  data: 2.4921  max mem: 18117
Epoch: [64]  [ 200/1251]  eta: 0:04:28  lr: 0.003760  min_lr: 0.003760  loss: 3.3703 (3.2265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9446 (0.8154)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [64]  [ 400/1251]  eta: 0:03:30  lr: 0.003758  min_lr: 0.003758  loss: 2.9835 (3.2813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.8009)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [64]  [ 600/1251]  eta: 0:02:39  lr: 0.003756  min_lr: 0.003756  loss: 3.4327 (3.3118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7813)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [64]  [ 800/1251]  eta: 0:01:49  lr: 0.003754  min_lr: 0.003754  loss: 3.9489 (3.3151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7041 (0.7873)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [64]  [1000/1251]  eta: 0:01:00  lr: 0.003753  min_lr: 0.003753  loss: 3.4238 (3.3112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7837 (0.7960)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [64]  [1200/1251]  eta: 0:00:12  lr: 0.003751  min_lr: 0.003751  loss: 3.0554 (3.3128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.7933)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 3.0461 (3.3151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6751 (0.7875)  time: 0.1953  data: 0.0008  max mem: 18117
Epoch: [64] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 3.0461 (3.2895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6751 (0.7875)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7765 (0.7765)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.6342  data: 5.5081  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.0033 (0.9722)  acc1: 79.2000 (78.6182)  acc5: 96.4000 (95.4182)  time: 0.7140  data: 0.6020  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1487 (1.1649)  acc1: 72.4000 (74.5714)  acc5: 92.4000 (92.7810)  time: 0.1967  data: 0.0873  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2953 (1.1691)  acc1: 72.4000 (74.2240)  acc5: 91.2000 (92.6880)  time: 0.2208  data: 0.1126  max mem: 18117
Test: Total time: 0:00:10 (0.4236 s / it)
* Acc@1 74.528 Acc@5 92.684 loss 1.174
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.53%
Epoch: [65]  [   0/1251]  eta: 1:03:06  lr: 0.003751  min_lr: 0.003751  loss: 2.1522 (2.1522)  weight_decay: 0.0500 (0.0500)  time: 3.0265  data: 2.7680  max mem: 18117
Epoch: [65]  [ 200/1251]  eta: 0:04:25  lr: 0.003749  min_lr: 0.003749  loss: 3.0069 (3.2439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.7426)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [65]  [ 400/1251]  eta: 0:03:28  lr: 0.003747  min_lr: 0.003747  loss: 2.7959 (3.3156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.7245)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [65]  [ 600/1251]  eta: 0:02:37  lr: 0.003745  min_lr: 0.003745  loss: 2.7063 (3.3054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7056 (0.7289)  time: 0.2374  data: 0.0005  max mem: 18117
Epoch: [65]  [ 800/1251]  eta: 0:01:49  lr: 0.003744  min_lr: 0.003744  loss: 3.3982 (3.2806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7672 (0.7693)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [65]  [1000/1251]  eta: 0:01:00  lr: 0.003742  min_lr: 0.003742  loss: 2.6968 (3.2828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5645 (0.7582)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [65]  [1200/1251]  eta: 0:00:12  lr: 0.003740  min_lr: 0.003740  loss: 3.5071 (3.2911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7025 (0.7625)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 2.9470 (3.2918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8565 (0.7643)  time: 0.1961  data: 0.0007  max mem: 18117
Epoch: [65] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 2.9470 (3.2922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8565 (0.7643)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8542 (0.8542)  acc1: 81.6000 (81.6000)  acc5: 95.6000 (95.6000)  time: 5.6505  data: 5.4973  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9921 (0.9406)  acc1: 77.2000 (78.7273)  acc5: 95.6000 (95.1273)  time: 0.7537  data: 0.6395  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1473 (1.1457)  acc1: 71.6000 (74.1524)  acc5: 92.8000 (92.5333)  time: 0.1990  data: 0.0896  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2993 (1.1617)  acc1: 71.2000 (73.8080)  acc5: 90.8000 (92.2720)  time: 0.1988  data: 0.0896  max mem: 18117
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 74.286 Acc@5 92.516 loss 1.147
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.53%
Epoch: [66]  [   0/1251]  eta: 1:07:23  lr: 0.003740  min_lr: 0.003740  loss: 4.3268 (4.3268)  weight_decay: 0.0500 (0.0500)  time: 3.2325  data: 1.6121  max mem: 18117
Epoch: [66]  [ 200/1251]  eta: 0:04:27  lr: 0.003738  min_lr: 0.003738  loss: 2.5781 (3.3113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6228 (0.6806)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [66]  [ 400/1251]  eta: 0:03:29  lr: 0.003736  min_lr: 0.003736  loss: 2.6506 (3.2823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5496 (0.6957)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [66]  [ 600/1251]  eta: 0:02:38  lr: 0.003734  min_lr: 0.003734  loss: 3.4050 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7610 (0.7275)  time: 0.2344  data: 0.0004  max mem: 18117
Epoch: [66]  [ 800/1251]  eta: 0:01:49  lr: 0.003732  min_lr: 0.003732  loss: 2.6093 (3.2583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8850 (0.7546)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [66]  [1000/1251]  eta: 0:01:00  lr: 0.003731  min_lr: 0.003731  loss: 3.4716 (3.2752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.7453)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [66]  [1200/1251]  eta: 0:00:12  lr: 0.003729  min_lr: 0.003729  loss: 3.4501 (3.2727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6160 (0.7446)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 2.7196 (3.2709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6121 (0.7451)  time: 0.1969  data: 0.0010  max mem: 18117
Epoch: [66] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 2.7196 (3.2812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6121 (0.7451)
Test:  [ 0/25]  eta: 0:01:36  loss: 0.7972 (0.7972)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 3.8660  data: 3.7017  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9957 (0.9678)  acc1: 79.2000 (79.7091)  acc5: 96.0000 (95.4545)  time: 0.6697  data: 0.5527  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1433 (1.1794)  acc1: 72.8000 (74.7429)  acc5: 92.4000 (92.5143)  time: 0.2612  data: 0.1510  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2969 (1.1895)  acc1: 72.0000 (74.3840)  acc5: 91.2000 (92.5280)  time: 0.2093  data: 0.1011  max mem: 18117
Test: Total time: 0:00:10 (0.4163 s / it)
* Acc@1 74.596 Acc@5 92.712 loss 1.174
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.60%
Epoch: [67]  [   0/1251]  eta: 1:03:09  lr: 0.003728  min_lr: 0.003728  loss: 3.9069 (3.9069)  weight_decay: 0.0500 (0.0500)  time: 3.0293  data: 2.6748  max mem: 18117
Epoch: [67]  [ 200/1251]  eta: 0:04:27  lr: 0.003727  min_lr: 0.003727  loss: 2.8279 (3.2865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5747 (0.7825)  time: 0.2424  data: 0.0003  max mem: 18117
Epoch: [67]  [ 400/1251]  eta: 0:03:30  lr: 0.003725  min_lr: 0.003725  loss: 3.9049 (3.3206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7316 (0.7910)  time: 0.2408  data: 0.0003  max mem: 18117
Epoch: [67]  [ 600/1251]  eta: 0:02:38  lr: 0.003723  min_lr: 0.003723  loss: 3.4317 (3.3005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7772)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [67]  [ 800/1251]  eta: 0:01:49  lr: 0.003721  min_lr: 0.003721  loss: 2.8610 (3.2858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6019 (0.7596)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [67]  [1000/1251]  eta: 0:01:00  lr: 0.003719  min_lr: 0.003719  loss: 3.7683 (3.3035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8045 (0.7825)  time: 0.2348  data: 0.0004  max mem: 18117
Epoch: [67]  [1200/1251]  eta: 0:00:12  lr: 0.003717  min_lr: 0.003717  loss: 2.7888 (3.3224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6162 (0.7618)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 3.5861 (3.3257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5810 (0.7584)  time: 0.1958  data: 0.0006  max mem: 18117
Epoch: [67] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 3.5861 (3.2867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5810 (0.7584)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8717 (0.8717)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 5.8162  data: 5.6891  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0016 (0.9920)  acc1: 79.6000 (79.6727)  acc5: 96.0000 (95.5636)  time: 0.7751  data: 0.6628  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1757 (1.2016)  acc1: 72.8000 (74.9524)  acc5: 92.4000 (92.9524)  time: 0.2152  data: 0.1056  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3527 (1.2116)  acc1: 73.2000 (74.8000)  acc5: 90.8000 (92.9120)  time: 0.2138  data: 0.1055  max mem: 18117
Test: Total time: 0:00:10 (0.4256 s / it)
* Acc@1 74.430 Acc@5 92.700 loss 1.202
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.60%
Epoch: [68]  [   0/1251]  eta: 1:07:28  lr: 0.003717  min_lr: 0.003717  loss: 2.3921 (2.3921)  weight_decay: 0.0500 (0.0500)  time: 3.2364  data: 2.9358  max mem: 18117
Epoch: [68]  [ 200/1251]  eta: 0:04:27  lr: 0.003715  min_lr: 0.003715  loss: 2.8131 (3.1196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6059 (0.8331)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [68]  [ 400/1251]  eta: 0:03:29  lr: 0.003713  min_lr: 0.003713  loss: 3.0663 (3.1734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.7721)  time: 0.2409  data: 0.0005  max mem: 18117
Epoch: [68]  [ 600/1251]  eta: 0:02:39  lr: 0.003711  min_lr: 0.003711  loss: 2.9172 (3.2197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7571)  time: 0.2429  data: 0.0006  max mem: 18117
Epoch: [68]  [ 800/1251]  eta: 0:01:49  lr: 0.003710  min_lr: 0.003710  loss: 3.3905 (3.2412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6397 (0.7657)  time: 0.2422  data: 0.0005  max mem: 18117
Epoch: [68]  [1000/1251]  eta: 0:01:00  lr: 0.003708  min_lr: 0.003708  loss: 2.6119 (3.2636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7741)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [68]  [1200/1251]  eta: 0:00:12  lr: 0.003706  min_lr: 0.003706  loss: 3.7931 (3.2708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6217 (0.7635)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.3086 (3.2670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (0.7700)  time: 0.1959  data: 0.0007  max mem: 18117
Epoch: [68] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.3086 (3.2644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (0.7700)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.7084 (0.7084)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 6.0310  data: 5.8805  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9356 (0.9068)  acc1: 77.2000 (79.0909)  acc5: 96.0000 (95.3818)  time: 0.7758  data: 0.6622  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1371 (1.1138)  acc1: 71.2000 (74.3619)  acc5: 92.0000 (92.4571)  time: 0.2003  data: 0.0913  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2618 (1.1228)  acc1: 70.4000 (74.1120)  acc5: 90.0000 (92.4160)  time: 0.2000  data: 0.0912  max mem: 18117
Test: Total time: 0:00:10 (0.4222 s / it)
* Acc@1 74.740 Acc@5 92.736 loss 1.105
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.74%
Epoch: [69]  [   0/1251]  eta: 0:59:37  lr: 0.003705  min_lr: 0.003705  loss: 3.2768 (3.2768)  weight_decay: 0.0500 (0.0500)  time: 2.8596  data: 2.5525  max mem: 18117
Epoch: [69]  [ 200/1251]  eta: 0:04:26  lr: 0.003703  min_lr: 0.003703  loss: 3.2584 (3.3258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6523 (0.8009)  time: 0.2393  data: 0.0005  max mem: 18117
Epoch: [69]  [ 400/1251]  eta: 0:03:29  lr: 0.003702  min_lr: 0.003702  loss: 2.8602 (3.2917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6790 (0.7434)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [69]  [ 600/1251]  eta: 0:02:38  lr: 0.003700  min_lr: 0.003700  loss: 3.7157 (3.2781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (0.7594)  time: 0.2411  data: 0.0005  max mem: 18117
Epoch: [69]  [ 800/1251]  eta: 0:01:49  lr: 0.003698  min_lr: 0.003698  loss: 2.6947 (3.2793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5861 (0.7391)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [69]  [1000/1251]  eta: 0:01:00  lr: 0.003696  min_lr: 0.003696  loss: 3.9095 (3.2896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5976 (0.7198)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [69]  [1200/1251]  eta: 0:00:12  lr: 0.003694  min_lr: 0.003694  loss: 3.6636 (3.3052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6585 (0.7327)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 2.8689 (3.2995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6109 (0.7310)  time: 0.1969  data: 0.0009  max mem: 18117
Epoch: [69] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 2.8689 (3.2775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6109 (0.7310)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7934 (0.7934)  acc1: 85.2000 (85.2000)  acc5: 96.0000 (96.0000)  time: 5.2902  data: 5.1639  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9139 (0.9650)  acc1: 79.6000 (78.2182)  acc5: 96.0000 (95.2364)  time: 0.7541  data: 0.6407  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1703 (1.1678)  acc1: 72.0000 (74.3048)  acc5: 92.8000 (92.6476)  time: 0.2245  data: 0.1141  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3348 (1.1758)  acc1: 72.0000 (74.1440)  acc5: 91.2000 (92.5760)  time: 0.2234  data: 0.1140  max mem: 18117
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 74.738 Acc@5 92.754 loss 1.165
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.74%
Epoch: [70]  [   0/1251]  eta: 1:04:44  lr: 0.003694  min_lr: 0.003694  loss: 2.2452 (2.2452)  weight_decay: 0.0500 (0.0500)  time: 3.1054  data: 2.7628  max mem: 18117
Epoch: [70]  [ 200/1251]  eta: 0:04:32  lr: 0.003692  min_lr: 0.003692  loss: 3.1274 (3.2561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6350 (0.7767)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [70]  [ 400/1251]  eta: 0:03:32  lr: 0.003690  min_lr: 0.003690  loss: 3.7747 (3.2624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7896 (0.8141)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [70]  [ 600/1251]  eta: 0:02:40  lr: 0.003688  min_lr: 0.003688  loss: 3.1982 (3.2424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6295 (0.7747)  time: 0.2363  data: 0.0005  max mem: 18117
Epoch: [70]  [ 800/1251]  eta: 0:01:50  lr: 0.003686  min_lr: 0.003686  loss: 3.3081 (3.2368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5855 (0.7546)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [70]  [1000/1251]  eta: 0:01:01  lr: 0.003684  min_lr: 0.003684  loss: 2.7679 (3.2361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.7493)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [70]  [1200/1251]  eta: 0:00:12  lr: 0.003682  min_lr: 0.003682  loss: 3.4872 (3.2437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (0.7489)  time: 0.2378  data: 0.0005  max mem: 18117
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 3.3211 (3.2462)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1951  data: 0.0007  max mem: 18117
Epoch: [70] Total time: 0:05:03 (0.2429 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 3.3211 (3.2758)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8118 (0.8118)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.5280  data: 5.3991  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8726 (0.9487)  acc1: 79.2000 (79.2000)  acc5: 96.4000 (95.7818)  time: 0.7687  data: 0.6558  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1710 (1.1577)  acc1: 72.0000 (74.6667)  acc5: 92.4000 (93.0286)  time: 0.2211  data: 0.1111  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3376 (1.1666)  acc1: 71.6000 (74.6400)  acc5: 90.8000 (92.8800)  time: 0.2202  data: 0.1110  max mem: 18117
Test: Total time: 0:00:10 (0.4184 s / it)
* Acc@1 74.902 Acc@5 92.850 loss 1.163
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.90%
Epoch: [71]  [   0/1251]  eta: 1:05:30  lr: 0.003681  min_lr: 0.003681  loss: 3.7814 (3.7814)  weight_decay: 0.0500 (0.0500)  time: 3.1420  data: 2.8252  max mem: 18117
Epoch: [71]  [ 200/1251]  eta: 0:04:25  lr: 0.003680  min_lr: 0.003680  loss: 2.8932 (3.2622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6059 (0.8248)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [71]  [ 400/1251]  eta: 0:03:29  lr: 0.003678  min_lr: 0.003678  loss: 2.8431 (3.2887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7603 (0.7987)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [71]  [ 600/1251]  eta: 0:02:38  lr: 0.003676  min_lr: 0.003676  loss: 3.6636 (3.2580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6757 (0.7724)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [71]  [ 800/1251]  eta: 0:01:49  lr: 0.003674  min_lr: 0.003674  loss: 2.6915 (3.2610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6659 (0.7660)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [71]  [1000/1251]  eta: 0:01:00  lr: 0.003672  min_lr: 0.003672  loss: 2.8492 (3.2603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5578 (0.7508)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [71]  [1200/1251]  eta: 0:00:12  lr: 0.003670  min_lr: 0.003670  loss: 3.8007 (3.2774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6107 (0.7523)  time: 0.2354  data: 0.0004  max mem: 18117
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.0937 (3.2802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (0.7573)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [71] Total time: 0:05:00 (0.2404 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.0937 (3.2639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (0.7573)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7637 (0.7637)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.7695  data: 5.6417  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8957 (0.9287)  acc1: 78.8000 (79.1636)  acc5: 96.0000 (95.2000)  time: 0.7713  data: 0.6587  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1195 (1.1448)  acc1: 72.4000 (74.8191)  acc5: 92.4000 (92.3429)  time: 0.2154  data: 0.1049  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3039 (1.1533)  acc1: 71.2000 (74.6080)  acc5: 91.6000 (92.4640)  time: 0.2149  data: 0.1049  max mem: 18117
Test: Total time: 0:00:10 (0.4242 s / it)
* Acc@1 74.628 Acc@5 92.664 loss 1.140
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.90%
Epoch: [72]  [   0/1251]  eta: 1:10:33  lr: 0.003669  min_lr: 0.003669  loss: 3.6600 (3.6600)  weight_decay: 0.0500 (0.0500)  time: 3.3840  data: 3.0905  max mem: 18117
Epoch: [72]  [ 200/1251]  eta: 0:04:27  lr: 0.003667  min_lr: 0.003667  loss: 2.5533 (3.0800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5984 (0.6519)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [72]  [ 400/1251]  eta: 0:03:30  lr: 0.003665  min_lr: 0.003665  loss: 2.6552 (3.1268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5331 (0.6649)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [72]  [ 600/1251]  eta: 0:02:39  lr: 0.003663  min_lr: 0.003663  loss: 2.7006 (3.1913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6725 (0.6720)  time: 0.2398  data: 0.0005  max mem: 18117
Epoch: [72]  [ 800/1251]  eta: 0:01:49  lr: 0.003661  min_lr: 0.003661  loss: 2.9896 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6273 (0.6812)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [72]  [1000/1251]  eta: 0:01:00  lr: 0.003659  min_lr: 0.003659  loss: 3.6571 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6745 (0.6901)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [72]  [1200/1251]  eta: 0:00:12  lr: 0.003657  min_lr: 0.003657  loss: 2.7946 (3.2003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6195 (0.6852)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 2.7470 (3.2090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.6911)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [72] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 2.7470 (3.2460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.6911)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7814 (0.7814)  acc1: 84.8000 (84.8000)  acc5: 96.0000 (96.0000)  time: 5.8715  data: 5.7451  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9334 (0.9507)  acc1: 78.4000 (78.6182)  acc5: 95.6000 (95.6000)  time: 0.7456  data: 0.6328  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1988 (1.1634)  acc1: 72.0000 (74.3048)  acc5: 92.8000 (92.8381)  time: 0.1884  data: 0.0771  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3004 (1.1702)  acc1: 70.4000 (74.2080)  acc5: 90.8000 (92.7840)  time: 0.2044  data: 0.0942  max mem: 18117
Test: Total time: 0:00:10 (0.4200 s / it)
* Acc@1 74.856 Acc@5 92.738 loss 1.149
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.90%
Epoch: [73]  [   0/1251]  eta: 1:05:38  lr: 0.003657  min_lr: 0.003657  loss: 2.3720 (2.3720)  weight_decay: 0.0500 (0.0500)  time: 3.1486  data: 2.7603  max mem: 18117
Epoch: [73]  [ 200/1251]  eta: 0:04:26  lr: 0.003655  min_lr: 0.003655  loss: 3.8011 (3.2445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5987 (0.6884)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [73]  [ 400/1251]  eta: 0:03:29  lr: 0.003653  min_lr: 0.003653  loss: 2.9390 (3.2405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.7344)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [73]  [ 600/1251]  eta: 0:02:39  lr: 0.003651  min_lr: 0.003651  loss: 2.7526 (3.1972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7073 (0.7388)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [73]  [ 800/1251]  eta: 0:01:50  lr: 0.003649  min_lr: 0.003649  loss: 3.2210 (3.2200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6423 (0.7418)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [73]  [1000/1251]  eta: 0:01:01  lr: 0.003647  min_lr: 0.003647  loss: 2.5300 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5679 (0.7129)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [73]  [1200/1251]  eta: 0:00:12  lr: 0.003645  min_lr: 0.003645  loss: 3.1388 (3.2359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7341 (0.7288)  time: 0.2391  data: 0.0003  max mem: 18117
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 2.8586 (3.2338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.7291)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [73] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 2.8586 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.7291)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7186 (0.7186)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.5900  data: 5.4456  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9135 (0.9026)  acc1: 78.0000 (79.4909)  acc5: 96.4000 (95.6000)  time: 0.7420  data: 0.6265  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0717 (1.0988)  acc1: 72.8000 (75.1048)  acc5: 92.4000 (92.8571)  time: 0.2019  data: 0.0898  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2480 (1.1123)  acc1: 72.8000 (74.8960)  acc5: 90.8000 (92.6880)  time: 0.2010  data: 0.0897  max mem: 18117
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 74.882 Acc@5 92.828 loss 1.107
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.90%
Epoch: [74]  [   0/1251]  eta: 1:08:34  lr: 0.003644  min_lr: 0.003644  loss: 2.7521 (2.7521)  weight_decay: 0.0500 (0.0500)  time: 3.2890  data: 2.9472  max mem: 18117
Epoch: [74]  [ 200/1251]  eta: 0:04:27  lr: 0.003642  min_lr: 0.003642  loss: 3.4993 (3.2323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.7813)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [74]  [ 400/1251]  eta: 0:03:29  lr: 0.003640  min_lr: 0.003640  loss: 3.4608 (3.2278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7151 (0.7349)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [74]  [ 600/1251]  eta: 0:02:38  lr: 0.003638  min_lr: 0.003638  loss: 3.2295 (3.2502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5793 (0.7038)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [74]  [ 800/1251]  eta: 0:01:49  lr: 0.003636  min_lr: 0.003636  loss: 2.6909 (3.2544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6750 (0.7067)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [74]  [1000/1251]  eta: 0:01:00  lr: 0.003634  min_lr: 0.003634  loss: 2.5932 (3.2658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5110 (0.7222)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [74]  [1200/1251]  eta: 0:00:12  lr: 0.003632  min_lr: 0.003632  loss: 3.3027 (3.2716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.7194)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 2.7418 (3.2655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5863 (0.7187)  time: 0.1963  data: 0.0007  max mem: 18117
Epoch: [74] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 2.7418 (3.2474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5863 (0.7187)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6784 (0.6784)  acc1: 86.8000 (86.8000)  acc5: 96.8000 (96.8000)  time: 5.4088  data: 5.2509  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9263 (0.8851)  acc1: 79.2000 (79.2727)  acc5: 96.4000 (95.5636)  time: 0.7439  data: 0.6296  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0608 (1.0939)  acc1: 74.0000 (75.3333)  acc5: 92.0000 (92.8000)  time: 0.2213  data: 0.1123  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2384 (1.1022)  acc1: 73.2000 (75.0720)  acc5: 90.4000 (92.7040)  time: 0.2209  data: 0.1122  max mem: 18117
Test: Total time: 0:00:10 (0.4145 s / it)
* Acc@1 75.164 Acc@5 92.922 loss 1.100
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.16%
Epoch: [75]  [   0/1251]  eta: 1:04:36  lr: 0.003631  min_lr: 0.003631  loss: 2.9762 (2.9762)  weight_decay: 0.0500 (0.0500)  time: 3.0985  data: 2.7417  max mem: 18117
Epoch: [75]  [ 200/1251]  eta: 0:04:26  lr: 0.003629  min_lr: 0.003629  loss: 2.7792 (3.3020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6239 (0.7490)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [75]  [ 400/1251]  eta: 0:03:28  lr: 0.003627  min_lr: 0.003627  loss: 3.2768 (3.2201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6276 (0.7397)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [75]  [ 600/1251]  eta: 0:02:38  lr: 0.003625  min_lr: 0.003625  loss: 2.7228 (3.2014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6357 (0.7209)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [75]  [ 800/1251]  eta: 0:01:49  lr: 0.003623  min_lr: 0.003623  loss: 3.2522 (3.2132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6113 (0.7214)  time: 0.2347  data: 0.0004  max mem: 18117
Epoch: [75]  [1000/1251]  eta: 0:01:00  lr: 0.003621  min_lr: 0.003621  loss: 2.6561 (3.2396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7186)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [75]  [1200/1251]  eta: 0:00:12  lr: 0.003619  min_lr: 0.003619  loss: 2.6673 (3.2362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5489 (0.7115)  time: 0.2372  data: 0.0003  max mem: 18117
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 2.5591 (3.2290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6892 (0.7241)  time: 0.1953  data: 0.0006  max mem: 18117
Epoch: [75] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 2.5591 (3.2587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6892 (0.7241)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7359 (0.7359)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.3598  data: 5.2343  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8918 (0.8806)  acc1: 79.6000 (79.7091)  acc5: 95.6000 (95.4182)  time: 0.7476  data: 0.6349  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0790 (1.0866)  acc1: 74.8000 (75.4667)  acc5: 92.8000 (92.7238)  time: 0.2191  data: 0.1090  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2413 (1.1091)  acc1: 73.2000 (74.8320)  acc5: 91.2000 (92.4640)  time: 0.2198  data: 0.1114  max mem: 18117
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 75.052 Acc@5 92.862 loss 1.091
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.16%
Epoch: [76]  [   0/1251]  eta: 1:07:46  lr: 0.003618  min_lr: 0.003618  loss: 3.2619 (3.2619)  weight_decay: 0.0500 (0.0500)  time: 3.2510  data: 2.1223  max mem: 18117
Epoch: [76]  [ 200/1251]  eta: 0:04:28  lr: 0.003616  min_lr: 0.003616  loss: 3.5496 (3.2853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.7217)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [76]  [ 400/1251]  eta: 0:03:30  lr: 0.003614  min_lr: 0.003614  loss: 3.7608 (3.2645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (0.6822)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [76]  [ 600/1251]  eta: 0:02:38  lr: 0.003612  min_lr: 0.003612  loss: 3.4539 (3.2739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6834 (0.7138)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [76]  [ 800/1251]  eta: 0:01:49  lr: 0.003610  min_lr: 0.003610  loss: 2.7369 (3.2768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6084 (0.7157)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [76]  [1000/1251]  eta: 0:01:00  lr: 0.003607  min_lr: 0.003607  loss: 2.9621 (3.2726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6145 (0.7067)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [76]  [1200/1251]  eta: 0:00:12  lr: 0.003605  min_lr: 0.003605  loss: 2.5702 (3.2711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.7130)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 3.3361 (3.2724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7131)  time: 0.1957  data: 0.0005  max mem: 18117
Epoch: [76] Total time: 0:05:03 (0.2422 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 3.3361 (3.2488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7131)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7741 (0.7741)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.3847  data: 5.2290  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9128 (0.9327)  acc1: 79.6000 (79.0909)  acc5: 95.6000 (95.6000)  time: 0.7615  data: 0.6447  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1652 (1.1275)  acc1: 73.2000 (74.5333)  acc5: 91.6000 (92.8381)  time: 0.2220  data: 0.1111  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2660 (1.1443)  acc1: 71.2000 (74.0480)  acc5: 90.8000 (92.5760)  time: 0.2221  data: 0.1110  max mem: 18117
Test: Total time: 0:00:10 (0.4141 s / it)
* Acc@1 74.768 Acc@5 92.780 loss 1.126
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 75.16%
Epoch: [77]  [   0/1251]  eta: 0:59:28  lr: 0.003605  min_lr: 0.003605  loss: 2.0670 (2.0670)  weight_decay: 0.0500 (0.0500)  time: 2.8526  data: 1.7624  max mem: 18117
Epoch: [77]  [ 200/1251]  eta: 0:04:29  lr: 0.003603  min_lr: 0.003603  loss: 2.6403 (3.2073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5207 (0.6204)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [77]  [ 400/1251]  eta: 0:03:30  lr: 0.003601  min_lr: 0.003601  loss: 2.9325 (3.2177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5457 (0.6252)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [77]  [ 600/1251]  eta: 0:02:38  lr: 0.003598  min_lr: 0.003598  loss: 3.3840 (3.2644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6956 (0.6800)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [77]  [ 800/1251]  eta: 0:01:49  lr: 0.003596  min_lr: 0.003596  loss: 2.9161 (3.2326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7034)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [77]  [1000/1251]  eta: 0:01:00  lr: 0.003594  min_lr: 0.003594  loss: 3.7999 (3.2350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5448 (0.7006)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [77]  [1200/1251]  eta: 0:00:12  lr: 0.003592  min_lr: 0.003592  loss: 3.7324 (3.2550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6690 (0.6929)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 2.6925 (3.2504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (0.7036)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [77] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 2.6925 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (0.7036)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7307 (0.7307)  acc1: 82.4000 (82.4000)  acc5: 97.2000 (97.2000)  time: 5.6887  data: 5.5626  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8994 (0.8876)  acc1: 80.0000 (79.7091)  acc5: 96.0000 (95.3091)  time: 0.7131  data: 0.6012  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1439 (1.1014)  acc1: 73.2000 (75.0095)  acc5: 92.8000 (92.7810)  time: 0.2044  data: 0.0951  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2562 (1.1155)  acc1: 72.0000 (74.6240)  acc5: 90.8000 (92.7680)  time: 0.2052  data: 0.0950  max mem: 18117
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 74.850 Acc@5 92.800 loss 1.110
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 75.16%
Epoch: [78]  [   0/1251]  eta: 0:59:17  lr: 0.003591  min_lr: 0.003591  loss: 3.4211 (3.4211)  weight_decay: 0.0500 (0.0500)  time: 2.8434  data: 2.2058  max mem: 18117
Epoch: [78]  [ 200/1251]  eta: 0:04:29  lr: 0.003589  min_lr: 0.003589  loss: 3.4971 (3.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5212 (0.7175)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [78]  [ 400/1251]  eta: 0:03:30  lr: 0.003587  min_lr: 0.003587  loss: 3.0664 (3.2883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5741 (0.6879)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [78]  [ 600/1251]  eta: 0:02:39  lr: 0.003585  min_lr: 0.003585  loss: 2.8314 (3.2712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5871 (0.6687)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [78]  [ 800/1251]  eta: 0:01:49  lr: 0.003583  min_lr: 0.003583  loss: 2.5568 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6609 (0.6604)  time: 0.2412  data: 0.0005  max mem: 18117
Epoch: [78]  [1000/1251]  eta: 0:01:00  lr: 0.003580  min_lr: 0.003580  loss: 3.5789 (3.2730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7065 (0.6760)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [78]  [1200/1251]  eta: 0:00:12  lr: 0.003578  min_lr: 0.003578  loss: 3.3434 (3.2612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7197 (0.6906)  time: 0.2424  data: 0.0004  max mem: 18117
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 2.9325 (3.2628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6027 (0.6894)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [78] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 2.9325 (3.2371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6027 (0.6894)
Test:  [ 0/25]  eta: 0:01:48  loss: 0.7383 (0.7383)  acc1: 86.8000 (86.8000)  acc5: 96.8000 (96.8000)  time: 4.3285  data: 4.2018  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9216 (0.9372)  acc1: 78.8000 (79.9273)  acc5: 95.2000 (95.3818)  time: 0.6934  data: 0.5835  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1279 (1.1443)  acc1: 72.8000 (75.2000)  acc5: 93.2000 (92.8952)  time: 0.2504  data: 0.1414  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3265 (1.1576)  acc1: 72.8000 (74.8480)  acc5: 92.4000 (92.8320)  time: 0.1973  data: 0.0883  max mem: 18117
Test: Total time: 0:00:09 (0.3952 s / it)
* Acc@1 75.072 Acc@5 92.892 loss 1.148
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.16%
Epoch: [79]  [   0/1251]  eta: 1:03:22  lr: 0.003578  min_lr: 0.003578  loss: 3.4829 (3.4829)  weight_decay: 0.0500 (0.0500)  time: 3.0392  data: 2.5165  max mem: 18117
Epoch: [79]  [ 200/1251]  eta: 0:04:26  lr: 0.003575  min_lr: 0.003575  loss: 3.6708 (3.2618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5589 (nan)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [79]  [ 400/1251]  eta: 0:03:28  lr: 0.003573  min_lr: 0.003573  loss: 2.9098 (3.2481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (nan)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [79]  [ 600/1251]  eta: 0:02:37  lr: 0.003571  min_lr: 0.003571  loss: 3.2822 (3.2526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6708 (nan)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [79]  [ 800/1251]  eta: 0:01:48  lr: 0.003569  min_lr: 0.003569  loss: 2.9703 (3.2328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (nan)  time: 0.2367  data: 0.0005  max mem: 18117
Epoch: [79]  [1000/1251]  eta: 0:01:00  lr: 0.003567  min_lr: 0.003567  loss: 2.4874 (3.2089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6116 (nan)  time: 0.2352  data: 0.0004  max mem: 18117
Epoch: [79]  [1200/1251]  eta: 0:00:12  lr: 0.003564  min_lr: 0.003564  loss: 3.1136 (3.2088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5802 (nan)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 3.2598 (3.2125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (nan)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [79] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 3.2598 (3.2283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7194 (0.7194)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.7379  data: 5.5870  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9482 (0.9270)  acc1: 78.4000 (79.3818)  acc5: 96.0000 (95.7455)  time: 0.7594  data: 0.6447  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1160 (1.1346)  acc1: 72.8000 (75.3905)  acc5: 92.8000 (92.9524)  time: 0.2019  data: 0.0918  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2819 (1.1448)  acc1: 72.8000 (75.1360)  acc5: 90.4000 (92.8000)  time: 0.2129  data: 0.1036  max mem: 18117
Test: Total time: 0:00:10 (0.4210 s / it)
* Acc@1 75.076 Acc@5 92.880 loss 1.139
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.16%
Epoch: [80]  [   0/1251]  eta: 1:06:51  lr: 0.003564  min_lr: 0.003564  loss: 2.8052 (2.8052)  weight_decay: 0.0500 (0.0500)  time: 3.2070  data: 1.6073  max mem: 18117
Epoch: [80]  [ 200/1251]  eta: 0:04:32  lr: 0.003562  min_lr: 0.003562  loss: 2.6326 (3.1379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6099 (0.7407)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [80]  [ 400/1251]  eta: 0:03:32  lr: 0.003559  min_lr: 0.003559  loss: 2.7638 (3.1633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6237 (0.7089)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [80]  [ 600/1251]  eta: 0:02:40  lr: 0.003557  min_lr: 0.003557  loss: 3.7645 (3.1988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6638 (0.7458)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [80]  [ 800/1251]  eta: 0:01:50  lr: 0.003555  min_lr: 0.003555  loss: 2.9282 (3.2008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6001 (0.7184)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [80]  [1000/1251]  eta: 0:01:01  lr: 0.003553  min_lr: 0.003553  loss: 3.1924 (3.2098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5588 (0.7111)  time: 0.2385  data: 0.0006  max mem: 18117
Epoch: [80]  [1200/1251]  eta: 0:00:12  lr: 0.003550  min_lr: 0.003550  loss: 3.2263 (3.2396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7004 (0.7105)  time: 0.2395  data: 0.0005  max mem: 18117
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 2.5127 (3.2224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (0.7136)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [80] Total time: 0:05:04 (0.2433 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 2.5127 (3.2425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (0.7136)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6596 (0.6596)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.8675  data: 5.7424  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8864 (0.8729)  acc1: 79.6000 (79.7091)  acc5: 96.4000 (95.7091)  time: 0.6824  data: 0.5722  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1384 (1.0819)  acc1: 73.2000 (75.1810)  acc5: 92.0000 (93.0095)  time: 0.1692  data: 0.0608  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2152 (1.0934)  acc1: 73.2000 (74.9600)  acc5: 92.0000 (92.8640)  time: 0.1690  data: 0.0607  max mem: 18117
Test: Total time: 0:00:09 (0.3920 s / it)
* Acc@1 75.218 Acc@5 92.948 loss 1.083
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.22%
Epoch: [81]  [   0/1251]  eta: 1:06:06  lr: 0.003550  min_lr: 0.003550  loss: 4.1055 (4.1055)  weight_decay: 0.0500 (0.0500)  time: 3.1708  data: 2.8487  max mem: 18117
Epoch: [81]  [ 200/1251]  eta: 0:04:25  lr: 0.003547  min_lr: 0.003547  loss: 2.5527 (3.2406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5677 (0.5713)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [81]  [ 400/1251]  eta: 0:03:28  lr: 0.003545  min_lr: 0.003545  loss: 2.6438 (3.2141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6736 (0.6386)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [81]  [ 600/1251]  eta: 0:02:38  lr: 0.003543  min_lr: 0.003543  loss: 2.9949 (3.1959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6667 (0.6569)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [81]  [ 800/1251]  eta: 0:01:49  lr: 0.003541  min_lr: 0.003541  loss: 2.6683 (3.2210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.6673)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [81]  [1000/1251]  eta: 0:01:00  lr: 0.003538  min_lr: 0.003538  loss: 2.9113 (3.2290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7022 (0.6870)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [81]  [1200/1251]  eta: 0:00:12  lr: 0.003536  min_lr: 0.003536  loss: 2.7387 (3.2189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6191 (0.6875)  time: 0.2417  data: 0.0005  max mem: 18117
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 2.5834 (3.2221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (0.6874)  time: 0.1966  data: 0.0005  max mem: 18117
Epoch: [81] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 2.5834 (3.2234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (0.6874)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8676 (0.8676)  acc1: 81.2000 (81.2000)  acc5: 96.8000 (96.8000)  time: 5.5179  data: 5.3884  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9552 (0.9350)  acc1: 80.8000 (79.7091)  acc5: 96.4000 (96.0364)  time: 0.7411  data: 0.6269  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1264 (1.1381)  acc1: 74.0000 (75.4667)  acc5: 92.8000 (93.0857)  time: 0.2102  data: 0.0984  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2841 (1.1496)  acc1: 72.4000 (75.1360)  acc5: 90.8000 (92.8800)  time: 0.2082  data: 0.0983  max mem: 18117
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 75.244 Acc@5 92.920 loss 1.141
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.24%
Epoch: [82]  [   0/1251]  eta: 1:00:10  lr: 0.003535  min_lr: 0.003535  loss: 2.1971 (2.1971)  weight_decay: 0.0500 (0.0500)  time: 2.8862  data: 2.6203  max mem: 18117
Epoch: [82]  [ 200/1251]  eta: 0:04:24  lr: 0.003533  min_lr: 0.003533  loss: 2.5171 (3.1797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6248 (0.6821)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [82]  [ 400/1251]  eta: 0:03:28  lr: 0.003531  min_lr: 0.003531  loss: 3.5625 (3.1715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.6681)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [82]  [ 600/1251]  eta: 0:02:38  lr: 0.003528  min_lr: 0.003528  loss: 3.3911 (3.1685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6777 (0.6721)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [82]  [ 800/1251]  eta: 0:01:49  lr: 0.003526  min_lr: 0.003526  loss: 3.3231 (3.1848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6016 (0.6758)  time: 0.2398  data: 0.0005  max mem: 18117
Epoch: [82]  [1000/1251]  eta: 0:01:00  lr: 0.003524  min_lr: 0.003524  loss: 2.7660 (3.1866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6004 (0.6789)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [82]  [1200/1251]  eta: 0:00:12  lr: 0.003521  min_lr: 0.003521  loss: 2.7778 (3.1877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5818 (0.6744)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 2.5624 (3.1788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (0.6871)  time: 0.1967  data: 0.0007  max mem: 18117
Epoch: [82] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 2.5624 (3.2356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (0.6871)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7014 (0.7014)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.5468  data: 5.4203  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8665 (0.8604)  acc1: 81.2000 (80.4000)  acc5: 95.6000 (95.4909)  time: 0.6862  data: 0.5731  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0735 (1.0587)  acc1: 73.6000 (75.7524)  acc5: 92.0000 (92.9143)  time: 0.1867  data: 0.0759  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1593 (1.0707)  acc1: 73.6000 (75.5040)  acc5: 91.2000 (92.7680)  time: 0.2009  data: 0.0920  max mem: 18117
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 75.310 Acc@5 92.878 loss 1.068
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.31%
Epoch: [83]  [   0/1251]  eta: 1:02:05  lr: 0.003521  min_lr: 0.003521  loss: 3.7753 (3.7753)  weight_decay: 0.0500 (0.0500)  time: 2.9784  data: 2.6882  max mem: 18117
Epoch: [83]  [ 200/1251]  eta: 0:04:25  lr: 0.003519  min_lr: 0.003519  loss: 2.6730 (3.1651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5644 (0.7141)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [83]  [ 400/1251]  eta: 0:03:28  lr: 0.003516  min_lr: 0.003516  loss: 3.5653 (3.1837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5564 (0.6720)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [83]  [ 600/1251]  eta: 0:02:38  lr: 0.003514  min_lr: 0.003514  loss: 3.5549 (3.1699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5443 (0.6756)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [83]  [ 800/1251]  eta: 0:01:49  lr: 0.003512  min_lr: 0.003512  loss: 2.7376 (3.2086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6164 (0.6764)  time: 0.2454  data: 0.0005  max mem: 18117
Epoch: [83]  [1000/1251]  eta: 0:01:01  lr: 0.003509  min_lr: 0.003509  loss: 3.0173 (3.2222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7087 (0.6770)  time: 0.2423  data: 0.0004  max mem: 18117
Epoch: [83]  [1200/1251]  eta: 0:00:12  lr: 0.003507  min_lr: 0.003507  loss: 2.7913 (3.2264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6918 (0.6873)  time: 0.2447  data: 0.0003  max mem: 18117
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 2.7838 (3.2231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6110 (0.6841)  time: 0.1969  data: 0.0006  max mem: 18117
Epoch: [83] Total time: 0:05:03 (0.2429 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 2.7838 (3.2245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6110 (0.6841)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8712 (0.8712)  acc1: 85.2000 (85.2000)  acc5: 96.4000 (96.4000)  time: 5.8257  data: 5.6696  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8849 (0.9599)  acc1: 81.6000 (80.6909)  acc5: 96.4000 (95.8909)  time: 0.7795  data: 0.6645  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1315 (1.1529)  acc1: 73.2000 (75.9810)  acc5: 93.2000 (93.2571)  time: 0.2088  data: 0.0993  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2539 (1.1646)  acc1: 72.4000 (75.3760)  acc5: 91.2000 (93.1520)  time: 0.2081  data: 0.0992  max mem: 18117
Test: Total time: 0:00:10 (0.4207 s / it)
* Acc@1 75.310 Acc@5 93.134 loss 1.160
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.31%
Epoch: [84]  [   0/1251]  eta: 1:03:24  lr: 0.003506  min_lr: 0.003506  loss: 2.5402 (2.5402)  weight_decay: 0.0500 (0.0500)  time: 3.0411  data: 2.5340  max mem: 18117
Epoch: [84]  [ 200/1251]  eta: 0:04:26  lr: 0.003504  min_lr: 0.003504  loss: 2.7760 (3.2262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5883 (0.6679)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [84]  [ 400/1251]  eta: 0:03:29  lr: 0.003502  min_lr: 0.003502  loss: 3.0427 (3.2059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6475 (0.6711)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [84]  [ 600/1251]  eta: 0:02:38  lr: 0.003499  min_lr: 0.003499  loss: 3.1222 (3.2232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5755 (0.6928)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [84]  [ 800/1251]  eta: 0:01:49  lr: 0.003497  min_lr: 0.003497  loss: 2.6261 (3.2086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.6895)  time: 0.2365  data: 0.0005  max mem: 18117
Epoch: [84]  [1000/1251]  eta: 0:01:00  lr: 0.003494  min_lr: 0.003494  loss: 3.4089 (3.2297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6201 (0.7010)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [84]  [1200/1251]  eta: 0:00:12  lr: 0.003492  min_lr: 0.003492  loss: 3.4382 (3.2308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.6975)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 2.7703 (3.2315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.6982)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [84] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 2.7703 (3.2263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.6982)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7736 (0.7736)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 5.6096  data: 5.4875  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9353 (0.8991)  acc1: 81.6000 (80.1818)  acc5: 96.4000 (95.8545)  time: 0.7602  data: 0.6484  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1056 (1.1050)  acc1: 72.4000 (75.4286)  acc5: 92.8000 (92.9714)  time: 0.2133  data: 0.1039  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2151 (1.1162)  acc1: 71.6000 (75.0400)  acc5: 92.0000 (92.8480)  time: 0.2131  data: 0.1038  max mem: 18117
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 75.430 Acc@5 93.082 loss 1.100
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.43%
Epoch: [85]  [   0/1251]  eta: 1:05:31  lr: 0.003491  min_lr: 0.003491  loss: 2.4709 (2.4709)  weight_decay: 0.0500 (0.0500)  time: 3.1423  data: 2.8261  max mem: 18117
Epoch: [85]  [ 200/1251]  eta: 0:04:25  lr: 0.003489  min_lr: 0.003489  loss: 2.8620 (3.1589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5259 (0.6912)  time: 0.2361  data: 0.0005  max mem: 18117
Epoch: [85]  [ 400/1251]  eta: 0:03:28  lr: 0.003487  min_lr: 0.003487  loss: 2.6861 (3.1803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5995 (0.6920)  time: 0.2400  data: 0.0005  max mem: 18117
Epoch: [85]  [ 600/1251]  eta: 0:02:38  lr: 0.003484  min_lr: 0.003484  loss: 3.1590 (3.1777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6531 (0.7130)  time: 0.2364  data: 0.0005  max mem: 18117
Epoch: [85]  [ 800/1251]  eta: 0:01:49  lr: 0.003482  min_lr: 0.003482  loss: 3.2703 (3.1998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.7069)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [85]  [1000/1251]  eta: 0:01:00  lr: 0.003479  min_lr: 0.003479  loss: 2.7196 (3.2267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5934 (0.7028)  time: 0.2417  data: 0.0004  max mem: 18117
Epoch: [85]  [1200/1251]  eta: 0:00:12  lr: 0.003477  min_lr: 0.003477  loss: 2.4903 (3.2101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4996 (0.6871)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 2.8122 (3.2082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5756 (0.6887)  time: 0.1951  data: 0.0007  max mem: 18117
Epoch: [85] Total time: 0:05:00 (0.2405 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 2.8122 (3.2129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5756 (0.6887)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7009 (0.7009)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.7621  data: 5.6344  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9388 (0.8846)  acc1: 78.8000 (79.0182)  acc5: 96.8000 (96.5091)  time: 0.7557  data: 0.6433  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0738 (1.0894)  acc1: 72.8000 (74.8952)  acc5: 94.4000 (93.3333)  time: 0.1981  data: 0.0881  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2184 (1.0987)  acc1: 72.8000 (74.8480)  acc5: 90.8000 (93.1040)  time: 0.2169  data: 0.1080  max mem: 18117
Test: Total time: 0:00:10 (0.4254 s / it)
* Acc@1 75.230 Acc@5 93.116 loss 1.101
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.43%
Epoch: [86]  [   0/1251]  eta: 1:09:50  lr: 0.003476  min_lr: 0.003476  loss: 4.0157 (4.0157)  weight_decay: 0.0500 (0.0500)  time: 3.3495  data: 2.6835  max mem: 18117
Epoch: [86]  [ 200/1251]  eta: 0:04:29  lr: 0.003474  min_lr: 0.003474  loss: 3.7929 (3.2211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5519 (0.6202)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [86]  [ 400/1251]  eta: 0:03:30  lr: 0.003472  min_lr: 0.003472  loss: 3.9595 (3.2256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6492 (0.6352)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [86]  [ 600/1251]  eta: 0:02:39  lr: 0.003469  min_lr: 0.003469  loss: 3.1101 (3.2063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6404 (0.6699)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [86]  [ 800/1251]  eta: 0:01:49  lr: 0.003467  min_lr: 0.003467  loss: 3.6670 (3.2152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6499 (0.6671)  time: 0.2455  data: 0.0004  max mem: 18117
Epoch: [86]  [1000/1251]  eta: 0:01:01  lr: 0.003464  min_lr: 0.003464  loss: 2.5018 (3.1931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5815 (0.6811)  time: 0.2462  data: 0.0004  max mem: 18117
Epoch: [86]  [1200/1251]  eta: 0:00:12  lr: 0.003462  min_lr: 0.003462  loss: 3.2997 (3.2077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5477 (0.6687)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 2.6298 (3.2067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6000 (0.6714)  time: 0.1960  data: 0.0008  max mem: 18117
Epoch: [86] Total time: 0:05:04 (0.2432 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 2.6298 (3.2068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6000 (0.6714)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6605 (0.6605)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.4763  data: 5.3515  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8672 (0.8430)  acc1: 79.2000 (79.6364)  acc5: 96.8000 (95.8909)  time: 0.7362  data: 0.6233  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0647 (1.0601)  acc1: 75.2000 (75.5810)  acc5: 92.4000 (92.9905)  time: 0.2160  data: 0.1060  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1710 (1.0727)  acc1: 74.8000 (75.2640)  acc5: 91.6000 (92.8800)  time: 0.2158  data: 0.1060  max mem: 18117
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 75.436 Acc@5 92.978 loss 1.064
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.44%
Epoch: [87]  [   0/1251]  eta: 1:00:11  lr: 0.003461  min_lr: 0.003461  loss: 3.7469 (3.7469)  weight_decay: 0.0500 (0.0500)  time: 2.8867  data: 2.5388  max mem: 18117
Epoch: [87]  [ 200/1251]  eta: 0:04:26  lr: 0.003459  min_lr: 0.003459  loss: 2.7788 (3.2146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6336 (0.6907)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [87]  [ 400/1251]  eta: 0:03:30  lr: 0.003456  min_lr: 0.003456  loss: 2.8542 (3.1786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.6790)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [87]  [ 600/1251]  eta: 0:02:39  lr: 0.003454  min_lr: 0.003454  loss: 3.0065 (3.1734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5747 (0.6808)  time: 0.2385  data: 0.0003  max mem: 18117
Epoch: [87]  [ 800/1251]  eta: 0:01:49  lr: 0.003451  min_lr: 0.003451  loss: 2.4448 (3.1669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5743 (0.6774)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [87]  [1000/1251]  eta: 0:01:00  lr: 0.003449  min_lr: 0.003449  loss: 2.7162 (3.1716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6121 (0.6744)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [87]  [1200/1251]  eta: 0:00:12  lr: 0.003446  min_lr: 0.003446  loss: 2.4441 (3.1690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7320 (0.6843)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 2.9811 (3.1742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.6862)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [87] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 2.9811 (3.2106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.6862)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8819 (0.8819)  acc1: 82.0000 (82.0000)  acc5: 96.8000 (96.8000)  time: 5.5862  data: 5.4391  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9327 (0.9682)  acc1: 79.6000 (78.7273)  acc5: 96.0000 (95.4909)  time: 0.7386  data: 0.6234  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1870 (1.1606)  acc1: 73.2000 (74.5143)  acc5: 92.4000 (92.8571)  time: 0.2024  data: 0.0918  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2779 (1.1626)  acc1: 73.2000 (74.4160)  acc5: 92.0000 (92.9440)  time: 0.2075  data: 0.0964  max mem: 18117
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 75.180 Acc@5 92.942 loss 1.152
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.44%
Epoch: [88]  [   0/1251]  eta: 1:06:19  lr: 0.003446  min_lr: 0.003446  loss: 3.0089 (3.0089)  weight_decay: 0.0500 (0.0500)  time: 3.1807  data: 1.7176  max mem: 18117
Epoch: [88]  [ 200/1251]  eta: 0:04:29  lr: 0.003443  min_lr: 0.003443  loss: 2.5956 (3.1483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5663 (0.6163)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [88]  [ 400/1251]  eta: 0:03:30  lr: 0.003441  min_lr: 0.003441  loss: 2.8421 (3.1694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5590 (0.6404)  time: 0.2388  data: 0.0005  max mem: 18117
Epoch: [88]  [ 600/1251]  eta: 0:02:39  lr: 0.003438  min_lr: 0.003438  loss: 2.6895 (3.1767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.6564)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [88]  [ 800/1251]  eta: 0:01:49  lr: 0.003436  min_lr: 0.003436  loss: 2.9871 (3.1871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6838 (0.6510)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [88]  [1000/1251]  eta: 0:01:00  lr: 0.003433  min_lr: 0.003433  loss: 3.2726 (3.1888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6103 (0.6887)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [88]  [1200/1251]  eta: 0:00:12  lr: 0.003431  min_lr: 0.003431  loss: 2.6720 (3.1857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6501 (0.6819)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 3.7492 (3.1901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6457 (0.6801)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [88] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 3.7492 (3.2106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6457 (0.6801)
Test:  [ 0/25]  eta: 0:01:48  loss: 0.8249 (0.8249)  acc1: 86.4000 (86.4000)  acc5: 95.6000 (95.6000)  time: 4.3306  data: 4.2021  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 1.0082 (1.0079)  acc1: 80.8000 (79.8545)  acc5: 95.6000 (95.6000)  time: 0.6839  data: 0.5715  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1948 (1.1973)  acc1: 73.2000 (75.5048)  acc5: 93.2000 (93.1810)  time: 0.2468  data: 0.1374  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2825 (1.1993)  acc1: 73.2000 (75.3760)  acc5: 91.6000 (93.0720)  time: 0.2182  data: 0.1101  max mem: 18117
Test: Total time: 0:00:10 (0.4220 s / it)
* Acc@1 75.320 Acc@5 93.136 loss 1.185
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.44%
Epoch: [89]  [   0/1251]  eta: 1:07:53  lr: 0.003430  min_lr: 0.003430  loss: 2.8261 (2.8261)  weight_decay: 0.0500 (0.0500)  time: 3.2562  data: 2.4664  max mem: 18117
Epoch: [89]  [ 200/1251]  eta: 0:04:28  lr: 0.003428  min_lr: 0.003428  loss: 2.4282 (3.1646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6304 (0.6899)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [89]  [ 400/1251]  eta: 0:03:30  lr: 0.003425  min_lr: 0.003425  loss: 2.7915 (3.1808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6919 (0.6859)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [89]  [ 600/1251]  eta: 0:02:39  lr: 0.003423  min_lr: 0.003423  loss: 2.9404 (3.1774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5836 (0.6716)  time: 0.2407  data: 0.0003  max mem: 18117
Epoch: [89]  [ 800/1251]  eta: 0:01:49  lr: 0.003420  min_lr: 0.003420  loss: 2.5764 (3.1837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5767 (0.6705)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [89]  [1000/1251]  eta: 0:01:00  lr: 0.003418  min_lr: 0.003418  loss: 2.9476 (3.1830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5873 (0.6985)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [89]  [1200/1251]  eta: 0:00:12  lr: 0.003415  min_lr: 0.003415  loss: 3.0829 (3.1802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5541 (0.6774)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.1857 (3.1822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.6804)  time: 0.1961  data: 0.0008  max mem: 18117
Epoch: [89] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.1857 (3.2054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.6804)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8193 (0.8193)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.6694  data: 5.5205  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0606 (0.9919)  acc1: 79.6000 (79.2727)  acc5: 95.6000 (95.2364)  time: 0.7729  data: 0.6608  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1856 (1.1787)  acc1: 74.0000 (75.3524)  acc5: 92.8000 (92.6476)  time: 0.2075  data: 0.0992  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2847 (1.1875)  acc1: 72.8000 (75.0720)  acc5: 91.6000 (92.6400)  time: 0.2107  data: 0.1026  max mem: 18117
Test: Total time: 0:00:10 (0.4162 s / it)
* Acc@1 75.376 Acc@5 92.950 loss 1.174
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.44%
Epoch: [90]  [   0/1251]  eta: 1:01:58  lr: 0.003414  min_lr: 0.003414  loss: 4.2779 (4.2779)  weight_decay: 0.0500 (0.0500)  time: 2.9721  data: 2.6019  max mem: 18117
Epoch: [90]  [ 200/1251]  eta: 0:04:29  lr: 0.003412  min_lr: 0.003412  loss: 3.5614 (3.1710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5913 (0.8626)  time: 0.2373  data: 0.0003  max mem: 18117
Epoch: [90]  [ 400/1251]  eta: 0:03:30  lr: 0.003409  min_lr: 0.003409  loss: 3.0893 (3.2118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.7532)  time: 0.2354  data: 0.0004  max mem: 18117
Epoch: [90]  [ 600/1251]  eta: 0:02:39  lr: 0.003407  min_lr: 0.003407  loss: 2.6553 (3.2007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5368 (0.7065)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [90]  [ 800/1251]  eta: 0:01:50  lr: 0.003404  min_lr: 0.003404  loss: 2.8820 (3.1954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.6979)  time: 0.2408  data: 0.0004  max mem: 18117
Epoch: [90]  [1000/1251]  eta: 0:01:01  lr: 0.003402  min_lr: 0.003402  loss: 3.1964 (3.1986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6370 (0.6888)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [90]  [1200/1251]  eta: 0:00:12  lr: 0.003399  min_lr: 0.003399  loss: 3.4480 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5919 (0.6834)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 3.2359 (3.2000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6973 (0.6846)  time: 0.1979  data: 0.0006  max mem: 18117
Epoch: [90] Total time: 0:05:03 (0.2427 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 3.2359 (3.2001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6973 (0.6846)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7235 (0.7235)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.4941  data: 5.3697  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9215 (0.9142)  acc1: 79.2000 (78.9455)  acc5: 96.0000 (95.5636)  time: 0.7416  data: 0.6304  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1135 (1.1080)  acc1: 72.8000 (74.9333)  acc5: 92.8000 (92.9524)  time: 0.2068  data: 0.0978  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2240 (1.1122)  acc1: 72.8000 (75.0880)  acc5: 92.8000 (93.0880)  time: 0.2060  data: 0.0977  max mem: 18117
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 75.542 Acc@5 93.206 loss 1.108
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.54%
Epoch: [91]  [   0/1251]  eta: 0:52:37  lr: 0.003398  min_lr: 0.003398  loss: 3.5641 (3.5641)  weight_decay: 0.0500 (0.0500)  time: 2.5242  data: 2.1605  max mem: 18117
Epoch: [91]  [ 200/1251]  eta: 0:04:25  lr: 0.003396  min_lr: 0.003396  loss: 3.0571 (3.2049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6347 (0.6894)  time: 0.2429  data: 0.0004  max mem: 18117
Epoch: [91]  [ 400/1251]  eta: 0:03:28  lr: 0.003393  min_lr: 0.003393  loss: 2.7198 (3.1752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5897 (0.6741)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [91]  [ 600/1251]  eta: 0:02:38  lr: 0.003391  min_lr: 0.003391  loss: 3.9084 (3.1818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.6715)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [91]  [ 800/1251]  eta: 0:01:49  lr: 0.003388  min_lr: 0.003388  loss: 3.7726 (3.2031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6161 (0.6838)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [91]  [1000/1251]  eta: 0:01:00  lr: 0.003385  min_lr: 0.003385  loss: 3.0741 (3.2241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5797 (0.6751)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [91]  [1200/1251]  eta: 0:00:12  lr: 0.003383  min_lr: 0.003383  loss: 2.5838 (3.2071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5329 (0.6739)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 3.3065 (3.2121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6309 (0.6735)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [91] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 3.3065 (3.1958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6309 (0.6735)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7828 (0.7828)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.6522  data: 5.5272  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 1.0014 (1.0014)  acc1: 81.2000 (79.4182)  acc5: 96.0000 (95.7455)  time: 0.7460  data: 0.6339  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1948 (1.1968)  acc1: 72.4000 (75.0857)  acc5: 92.8000 (93.0095)  time: 0.1919  data: 0.0825  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3767 (1.2132)  acc1: 72.0000 (74.6720)  acc5: 91.6000 (92.8320)  time: 0.1905  data: 0.0824  max mem: 18117
Test: Total time: 0:00:10 (0.4005 s / it)
* Acc@1 75.018 Acc@5 93.110 loss 1.200
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.54%
Epoch: [92]  [   0/1251]  eta: 1:08:14  lr: 0.003382  min_lr: 0.003382  loss: 3.0194 (3.0194)  weight_decay: 0.0500 (0.0500)  time: 3.2733  data: 2.3147  max mem: 18117
Epoch: [92]  [ 200/1251]  eta: 0:04:25  lr: 0.003380  min_lr: 0.003380  loss: 3.6093 (3.1483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5477 (0.6782)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [92]  [ 400/1251]  eta: 0:03:28  lr: 0.003377  min_lr: 0.003377  loss: 2.6449 (3.1709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5910 (0.6988)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [92]  [ 600/1251]  eta: 0:02:38  lr: 0.003374  min_lr: 0.003374  loss: 2.7761 (3.1704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7073 (0.7033)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [92]  [ 800/1251]  eta: 0:01:49  lr: 0.003372  min_lr: 0.003372  loss: 3.0246 (3.1544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5815 (0.6859)  time: 0.2452  data: 0.0003  max mem: 18117
Epoch: [92]  [1000/1251]  eta: 0:01:00  lr: 0.003369  min_lr: 0.003369  loss: 2.6417 (3.1542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6490 (0.6865)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [92]  [1200/1251]  eta: 0:00:12  lr: 0.003367  min_lr: 0.003367  loss: 3.0203 (3.1546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5958 (0.6830)  time: 0.2373  data: 0.0003  max mem: 18117
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 3.0027 (3.1544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5687 (0.6785)  time: 0.1957  data: 0.0007  max mem: 18117
Epoch: [92] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 3.0027 (3.1927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5687 (0.6785)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7750 (0.7750)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.5881  data: 5.4629  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8914 (0.9488)  acc1: 82.0000 (79.9273)  acc5: 96.4000 (95.6727)  time: 0.7325  data: 0.6218  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1069 (1.1270)  acc1: 73.6000 (76.0762)  acc5: 92.8000 (93.2000)  time: 0.1967  data: 0.0879  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1815 (1.1328)  acc1: 73.6000 (75.8720)  acc5: 92.8000 (93.3120)  time: 0.1948  data: 0.0865  max mem: 18117
Test: Total time: 0:00:10 (0.4022 s / it)
* Acc@1 75.790 Acc@5 93.174 loss 1.125
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.79%
Epoch: [93]  [   0/1251]  eta: 1:00:43  lr: 0.003366  min_lr: 0.003366  loss: 4.0643 (4.0643)  weight_decay: 0.0500 (0.0500)  time: 2.9126  data: 2.4370  max mem: 18117
Epoch: [93]  [ 200/1251]  eta: 0:04:25  lr: 0.003363  min_lr: 0.003363  loss: 3.2766 (3.1871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5842 (0.5991)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [93]  [ 400/1251]  eta: 0:03:28  lr: 0.003361  min_lr: 0.003361  loss: 3.8812 (3.2346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6487 (0.6494)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [93]  [ 600/1251]  eta: 0:02:38  lr: 0.003358  min_lr: 0.003358  loss: 2.6347 (3.2317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6263 (0.6594)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [93]  [ 800/1251]  eta: 0:01:49  lr: 0.003355  min_lr: 0.003355  loss: 2.8041 (3.2066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5509 (0.6432)  time: 0.2452  data: 0.0004  max mem: 18117
Epoch: [93]  [1000/1251]  eta: 0:01:00  lr: 0.003353  min_lr: 0.003353  loss: 3.6201 (3.2035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.6482)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [93]  [1200/1251]  eta: 0:00:12  lr: 0.003350  min_lr: 0.003350  loss: 2.6335 (3.1874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6397 (0.6481)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 2.4244 (3.1823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6793 (0.6546)  time: 0.1967  data: 0.0007  max mem: 18117
Epoch: [93] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 2.4244 (3.1963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6793 (0.6546)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7543 (0.7543)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.8521  data: 5.7109  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8278 (0.8780)  acc1: 82.0000 (79.8182)  acc5: 95.6000 (95.2000)  time: 0.7595  data: 0.6456  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1029 (1.0679)  acc1: 73.2000 (76.0000)  acc5: 92.8000 (93.0476)  time: 0.2053  data: 0.0956  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1612 (1.0766)  acc1: 72.8000 (75.6160)  acc5: 92.8000 (93.1200)  time: 0.2049  data: 0.0956  max mem: 18117
Test: Total time: 0:00:10 (0.4188 s / it)
* Acc@1 75.562 Acc@5 93.100 loss 1.075
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.79%
Epoch: [94]  [   0/1251]  eta: 1:03:16  lr: 0.003350  min_lr: 0.003350  loss: 2.9759 (2.9759)  weight_decay: 0.0500 (0.0500)  time: 3.0349  data: 2.2787  max mem: 18117
Epoch: [94]  [ 200/1251]  eta: 0:04:27  lr: 0.003347  min_lr: 0.003347  loss: 2.9972 (3.0852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5213 (0.6504)  time: 0.2406  data: 0.0005  max mem: 18117
Epoch: [94]  [ 400/1251]  eta: 0:03:30  lr: 0.003344  min_lr: 0.003344  loss: 2.4430 (3.1449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5094 (0.6472)  time: 0.2413  data: 0.0004  max mem: 18117
Epoch: [94]  [ 600/1251]  eta: 0:02:39  lr: 0.003342  min_lr: 0.003342  loss: 2.6816 (3.1460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5708 (0.6707)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [94]  [ 800/1251]  eta: 0:01:49  lr: 0.003339  min_lr: 0.003339  loss: 3.7723 (3.1828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (0.6675)  time: 0.2398  data: 0.0005  max mem: 18117
Epoch: [94]  [1000/1251]  eta: 0:01:00  lr: 0.003336  min_lr: 0.003336  loss: 3.3136 (3.1848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6235 (0.6679)  time: 0.2366  data: 0.0005  max mem: 18117
Epoch: [94]  [1200/1251]  eta: 0:00:12  lr: 0.003334  min_lr: 0.003334  loss: 3.4199 (3.1911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6462 (0.6915)  time: 0.2454  data: 0.0005  max mem: 18117
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 2.5856 (3.1885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6185 (0.6899)  time: 0.1950  data: 0.0007  max mem: 18117
Epoch: [94] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 2.5856 (3.1897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6185 (0.6899)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7688 (0.7688)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.7008  data: 5.5714  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8314 (0.8910)  acc1: 81.2000 (79.7091)  acc5: 96.0000 (95.9273)  time: 0.7136  data: 0.6014  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0519 (1.0826)  acc1: 74.0000 (75.9429)  acc5: 93.6000 (93.2571)  time: 0.1813  data: 0.0717  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1630 (1.0915)  acc1: 74.4000 (75.6000)  acc5: 91.6000 (93.1840)  time: 0.1829  data: 0.0717  max mem: 18117
Test: Total time: 0:00:09 (0.3951 s / it)
* Acc@1 75.808 Acc@5 93.400 loss 1.078
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.81%
Epoch: [95]  [   0/1251]  eta: 1:08:06  lr: 0.003333  min_lr: 0.003333  loss: 2.3771 (2.3771)  weight_decay: 0.0500 (0.0500)  time: 3.2668  data: 2.9846  max mem: 18117
Epoch: [95]  [ 200/1251]  eta: 0:04:26  lr: 0.003330  min_lr: 0.003330  loss: 2.6654 (3.1608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5891 (0.6185)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [95]  [ 400/1251]  eta: 0:03:30  lr: 0.003327  min_lr: 0.003327  loss: 2.9282 (3.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6708 (0.6411)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [95]  [ 600/1251]  eta: 0:02:39  lr: 0.003325  min_lr: 0.003325  loss: 3.3171 (3.1746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5717 (0.6587)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [95]  [ 800/1251]  eta: 0:01:49  lr: 0.003322  min_lr: 0.003322  loss: 2.8860 (3.1898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6016 (0.6545)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [95]  [1000/1251]  eta: 0:01:01  lr: 0.003319  min_lr: 0.003319  loss: 3.6142 (3.2127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6045 (0.6528)  time: 0.2436  data: 0.0005  max mem: 18117
Epoch: [95]  [1200/1251]  eta: 0:00:12  lr: 0.003317  min_lr: 0.003317  loss: 3.2562 (3.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.6476)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 3.6356 (3.2188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6349 (0.6509)  time: 0.1962  data: 0.0009  max mem: 18117
Epoch: [95] Total time: 0:05:03 (0.2428 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 3.6356 (3.1924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6349 (0.6509)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7432 (0.7432)  acc1: 86.8000 (86.8000)  acc5: 96.0000 (96.0000)  time: 5.5538  data: 5.4236  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9744 (0.9440)  acc1: 83.2000 (80.6545)  acc5: 96.0000 (95.9636)  time: 0.7230  data: 0.6114  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1285 (1.1478)  acc1: 72.0000 (75.8667)  acc5: 93.6000 (93.1238)  time: 0.1986  data: 0.0892  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2728 (1.1605)  acc1: 72.0000 (75.2800)  acc5: 91.2000 (93.0080)  time: 0.2012  data: 0.0917  max mem: 18117
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 75.560 Acc@5 93.260 loss 1.153
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.81%
Epoch: [96]  [   0/1251]  eta: 1:09:19  lr: 0.003316  min_lr: 0.003316  loss: 2.9101 (2.9101)  weight_decay: 0.0500 (0.0500)  time: 3.3252  data: 1.7342  max mem: 18117
Epoch: [96]  [ 200/1251]  eta: 0:04:27  lr: 0.003313  min_lr: 0.003313  loss: 3.1830 (3.1381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7299 (nan)  time: 0.2406  data: 0.0004  max mem: 18117
Epoch: [96]  [ 400/1251]  eta: 0:03:29  lr: 0.003311  min_lr: 0.003311  loss: 2.5075 (3.1446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5428 (nan)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [96]  [ 600/1251]  eta: 0:02:39  lr: 0.003308  min_lr: 0.003308  loss: 3.2638 (3.1572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5763 (nan)  time: 0.2459  data: 0.0004  max mem: 18117
Epoch: [96]  [ 800/1251]  eta: 0:01:49  lr: 0.003305  min_lr: 0.003305  loss: 3.0509 (3.1518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6705 (nan)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [96]  [1000/1251]  eta: 0:01:01  lr: 0.003302  min_lr: 0.003302  loss: 2.7359 (3.1513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6015 (nan)  time: 0.2465  data: 0.0004  max mem: 18117
Epoch: [96]  [1200/1251]  eta: 0:00:12  lr: 0.003300  min_lr: 0.003300  loss: 3.1508 (3.1741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5278 (nan)  time: 0.2380  data: 0.0003  max mem: 18117
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 3.3885 (3.1764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5769 (nan)  time: 0.2019  data: 0.0006  max mem: 18117
Epoch: [96] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 3.3885 (3.1876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5769 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7822 (0.7822)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 5.6174  data: 5.4893  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9569 (0.9374)  acc1: 80.4000 (79.7818)  acc5: 95.2000 (95.5273)  time: 0.7283  data: 0.6148  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1617 (1.1471)  acc1: 73.6000 (75.3143)  acc5: 92.8000 (92.7429)  time: 0.1974  data: 0.0872  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.3194 (1.1552)  acc1: 73.2000 (75.0560)  acc5: 91.2000 (92.7040)  time: 0.2185  data: 0.1103  max mem: 18117
Test: Total time: 0:00:10 (0.4216 s / it)
* Acc@1 75.708 Acc@5 93.244 loss 1.135
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.81%
Epoch: [97]  [   0/1251]  eta: 1:08:22  lr: 0.003299  min_lr: 0.003299  loss: 2.2545 (2.2545)  weight_decay: 0.0500 (0.0500)  time: 3.2795  data: 1.7339  max mem: 18117
Epoch: [97]  [ 200/1251]  eta: 0:04:29  lr: 0.003296  min_lr: 0.003296  loss: 2.7532 (3.1910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6093 (0.5982)  time: 0.2426  data: 0.0005  max mem: 18117
Epoch: [97]  [ 400/1251]  eta: 0:03:30  lr: 0.003294  min_lr: 0.003294  loss: 2.8293 (3.2266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6035 (0.6161)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [97]  [ 600/1251]  eta: 0:02:39  lr: 0.003291  min_lr: 0.003291  loss: 2.7091 (3.2173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (0.6326)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [97]  [ 800/1251]  eta: 0:01:49  lr: 0.003288  min_lr: 0.003288  loss: 2.7122 (3.2145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.6713)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [97]  [1000/1251]  eta: 0:01:00  lr: 0.003285  min_lr: 0.003285  loss: 3.2199 (3.2061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6767 (0.6650)  time: 0.2389  data: 0.0005  max mem: 18117
Epoch: [97]  [1200/1251]  eta: 0:00:12  lr: 0.003283  min_lr: 0.003283  loss: 3.8222 (3.1938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6211 (0.6576)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.4381 (3.1948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5996 (0.6598)  time: 0.1957  data: 0.0013  max mem: 18117
Epoch: [97] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.4381 (3.1837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5996 (0.6598)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7660 (0.7660)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.7505  data: 5.6249  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9777 (0.9413)  acc1: 80.8000 (79.1273)  acc5: 96.4000 (95.8182)  time: 0.7544  data: 0.6423  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1634 (1.1341)  acc1: 72.4000 (75.1810)  acc5: 92.8000 (93.3143)  time: 0.2027  data: 0.0933  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2767 (1.1466)  acc1: 72.8000 (75.1680)  acc5: 91.2000 (93.2320)  time: 0.2022  data: 0.0932  max mem: 18117
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 75.874 Acc@5 93.346 loss 1.142
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.87%
Epoch: [98]  [   0/1251]  eta: 1:07:46  lr: 0.003282  min_lr: 0.003282  loss: 4.1764 (4.1764)  weight_decay: 0.0500 (0.0500)  time: 3.2505  data: 2.9533  max mem: 18117
Epoch: [98]  [ 200/1251]  eta: 0:04:25  lr: 0.003279  min_lr: 0.003279  loss: 2.3800 (3.1401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5187 (0.6156)  time: 0.2368  data: 0.0007  max mem: 18117
Epoch: [98]  [ 400/1251]  eta: 0:03:29  lr: 0.003276  min_lr: 0.003276  loss: 2.8870 (3.1607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6692 (0.6707)  time: 0.2419  data: 0.0003  max mem: 18117
Epoch: [98]  [ 600/1251]  eta: 0:02:39  lr: 0.003274  min_lr: 0.003274  loss: 3.2717 (3.1187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.6684)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [98]  [ 800/1251]  eta: 0:01:49  lr: 0.003271  min_lr: 0.003271  loss: 2.9610 (3.1094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6020 (0.6619)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [98]  [1000/1251]  eta: 0:01:00  lr: 0.003268  min_lr: 0.003268  loss: 2.7608 (3.1092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5398 (0.6471)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [98]  [1200/1251]  eta: 0:00:12  lr: 0.003265  min_lr: 0.003265  loss: 3.5588 (3.1254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.6688)  time: 0.2433  data: 0.0004  max mem: 18117
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 3.4793 (3.1309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6246 (0.6678)  time: 0.1962  data: 0.0009  max mem: 18117
Epoch: [98] Total time: 0:05:02 (0.2422 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 3.4793 (3.1720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6246 (0.6678)
Test:  [ 0/25]  eta: 0:01:22  loss: 0.6606 (0.6606)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 3.2957  data: 3.1467  max mem: 18117
Test:  [10/25]  eta: 0:00:08  loss: 0.8834 (0.8787)  acc1: 80.8000 (80.2545)  acc5: 96.4000 (95.7091)  time: 0.5585  data: 0.4382  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1233 (1.0738)  acc1: 73.6000 (76.0952)  acc5: 92.8000 (93.3524)  time: 0.2779  data: 0.1649  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1950 (1.0801)  acc1: 73.6000 (75.9680)  acc5: 92.0000 (93.2800)  time: 0.2286  data: 0.1196  max mem: 18117
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 75.806 Acc@5 93.296 loss 1.076
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.87%
Epoch: [99]  [   0/1251]  eta: 1:10:39  lr: 0.003265  min_lr: 0.003265  loss: 4.0747 (4.0747)  weight_decay: 0.0500 (0.0500)  time: 3.3885  data: 2.5529  max mem: 18117
Epoch: [99]  [ 200/1251]  eta: 0:04:29  lr: 0.003262  min_lr: 0.003262  loss: 3.2940 (3.1235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4923 (0.5798)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [99]  [ 400/1251]  eta: 0:03:30  lr: 0.003259  min_lr: 0.003259  loss: 2.8284 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5917 (0.6557)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [99]  [ 600/1251]  eta: 0:02:39  lr: 0.003256  min_lr: 0.003256  loss: 3.4277 (3.1408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6785 (0.6530)  time: 0.2380  data: 0.0009  max mem: 18117
Epoch: [99]  [ 800/1251]  eta: 0:01:49  lr: 0.003253  min_lr: 0.003253  loss: 3.0468 (3.1653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6057 (0.6522)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [99]  [1000/1251]  eta: 0:01:00  lr: 0.003251  min_lr: 0.003251  loss: 2.5920 (3.1574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5705 (0.6562)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [99]  [1200/1251]  eta: 0:00:12  lr: 0.003248  min_lr: 0.003248  loss: 3.2122 (3.1710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5978 (0.6541)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 2.9501 (3.1765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6267 (0.6551)  time: 0.1953  data: 0.0006  max mem: 18117
Epoch: [99] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 2.9501 (3.1830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6267 (0.6551)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8504 (0.8504)  acc1: 83.2000 (83.2000)  acc5: 96.4000 (96.4000)  time: 5.5833  data: 5.4553  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9357 (0.9323)  acc1: 81.6000 (80.4000)  acc5: 95.6000 (95.4182)  time: 0.7607  data: 0.6474  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1315 (1.1150)  acc1: 74.0000 (76.4000)  acc5: 92.8000 (93.0286)  time: 0.2266  data: 0.1166  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2094 (1.1263)  acc1: 72.8000 (75.7440)  acc5: 92.8000 (93.0560)  time: 0.2247  data: 0.1165  max mem: 18117
Test: Total time: 0:00:10 (0.4255 s / it)
* Acc@1 75.874 Acc@5 93.386 loss 1.112
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.87%
Epoch: [100]  [   0/1251]  eta: 1:08:30  lr: 0.003247  min_lr: 0.003247  loss: 2.2888 (2.2888)  weight_decay: 0.0500 (0.0500)  time: 3.2859  data: 2.9353  max mem: 18117
Epoch: [100]  [ 200/1251]  eta: 0:04:29  lr: 0.003244  min_lr: 0.003244  loss: 3.8466 (3.1836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6232 (0.6191)  time: 0.2559  data: 0.0004  max mem: 18117
Epoch: [100]  [ 400/1251]  eta: 0:03:30  lr: 0.003242  min_lr: 0.003242  loss: 2.6701 (3.1708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5009 (0.6657)  time: 0.2432  data: 0.0004  max mem: 18117
Epoch: [100]  [ 600/1251]  eta: 0:02:39  lr: 0.003239  min_lr: 0.003239  loss: 3.5515 (3.1711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5236 (0.6531)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [100]  [ 800/1251]  eta: 0:01:49  lr: 0.003236  min_lr: 0.003236  loss: 3.3158 (3.1837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6613 (0.6779)  time: 0.2372  data: 0.0003  max mem: 18117
Epoch: [100]  [1000/1251]  eta: 0:01:00  lr: 0.003233  min_lr: 0.003233  loss: 2.6222 (3.1887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5703 (0.6738)  time: 0.2419  data: 0.0003  max mem: 18117
Epoch: [100]  [1200/1251]  eta: 0:00:12  lr: 0.003230  min_lr: 0.003230  loss: 2.5786 (3.2028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5873 (0.6752)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 2.5908 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5873 (0.6709)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [100] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 2.5908 (3.1803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5873 (0.6709)
Test:  [ 0/25]  eta: 0:02:02  loss: 0.7215 (0.7215)  acc1: 84.4000 (84.4000)  acc5: 96.4000 (96.4000)  time: 4.8900  data: 4.7541  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.8568 (0.8581)  acc1: 81.2000 (80.9455)  acc5: 95.6000 (95.8182)  time: 0.6646  data: 0.5471  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0753 (1.0613)  acc1: 74.0000 (76.2476)  acc5: 92.8000 (93.5048)  time: 0.2082  data: 0.0951  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1396 (1.0700)  acc1: 74.0000 (75.8400)  acc5: 92.4000 (93.5520)  time: 0.2116  data: 0.1017  max mem: 18117
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 76.006 Acc@5 93.396 loss 1.060
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.01%
Epoch: [101]  [   0/1251]  eta: 1:05:54  lr: 0.003230  min_lr: 0.003230  loss: 2.1473 (2.1473)  weight_decay: 0.0500 (0.0500)  time: 3.1607  data: 2.8645  max mem: 18117
Epoch: [101]  [ 200/1251]  eta: 0:04:26  lr: 0.003227  min_lr: 0.003227  loss: 2.4553 (3.1021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6232 (0.5974)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [101]  [ 400/1251]  eta: 0:03:29  lr: 0.003224  min_lr: 0.003224  loss: 3.1692 (3.0915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6287 (0.6241)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [101]  [ 600/1251]  eta: 0:02:38  lr: 0.003221  min_lr: 0.003221  loss: 3.0720 (3.1116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.6491)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [101]  [ 800/1251]  eta: 0:01:49  lr: 0.003218  min_lr: 0.003218  loss: 3.5303 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6504 (0.6615)  time: 0.2354  data: 0.0004  max mem: 18117
Epoch: [101]  [1000/1251]  eta: 0:01:00  lr: 0.003215  min_lr: 0.003215  loss: 3.1803 (3.1578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6436 (0.6634)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [101]  [1200/1251]  eta: 0:00:12  lr: 0.003212  min_lr: 0.003212  loss: 2.7301 (3.1564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6010 (0.6572)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 3.5197 (3.1625)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [101] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 3.5197 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7563 (0.7563)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.8458  data: 5.7078  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9386 (0.9455)  acc1: 82.0000 (80.1091)  acc5: 95.6000 (95.4909)  time: 0.7630  data: 0.6490  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1522 (1.1442)  acc1: 72.4000 (75.7714)  acc5: 92.8000 (93.0476)  time: 0.1995  data: 0.0877  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2610 (1.1551)  acc1: 72.0000 (75.3760)  acc5: 92.8000 (92.9760)  time: 0.1992  data: 0.0876  max mem: 18117
Test: Total time: 0:00:10 (0.4141 s / it)
* Acc@1 75.796 Acc@5 93.260 loss 1.150
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.01%
Epoch: [102]  [   0/1251]  eta: 1:07:40  lr: 0.003212  min_lr: 0.003212  loss: 2.4241 (2.4241)  weight_decay: 0.0500 (0.0500)  time: 3.2459  data: 2.3205  max mem: 18117
Epoch: [102]  [ 200/1251]  eta: 0:04:27  lr: 0.003209  min_lr: 0.003209  loss: 3.0214 (3.1193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6264 (0.6698)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [102]  [ 400/1251]  eta: 0:03:29  lr: 0.003206  min_lr: 0.003206  loss: 2.9889 (3.1569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5624 (0.6646)  time: 0.2371  data: 0.0003  max mem: 18117
Epoch: [102]  [ 600/1251]  eta: 0:02:38  lr: 0.003203  min_lr: 0.003203  loss: 3.0047 (3.1974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6742 (0.6862)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [102]  [ 800/1251]  eta: 0:01:49  lr: 0.003200  min_lr: 0.003200  loss: 3.8473 (3.1939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (0.6837)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [102]  [1000/1251]  eta: 0:01:00  lr: 0.003197  min_lr: 0.003197  loss: 2.5981 (3.1890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5476 (0.6716)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [102]  [1200/1251]  eta: 0:00:12  lr: 0.003195  min_lr: 0.003195  loss: 2.9371 (3.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (0.6734)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 2.6928 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (0.6733)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [102] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 2.6928 (3.1778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (0.6733)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.8297 (0.8297)  acc1: 84.8000 (84.8000)  acc5: 96.4000 (96.4000)  time: 5.2567  data: 5.1051  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9866 (0.9487)  acc1: 80.0000 (80.3636)  acc5: 96.4000 (95.9273)  time: 0.7556  data: 0.6377  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1237 (1.1475)  acc1: 74.8000 (76.0000)  acc5: 93.2000 (93.4095)  time: 0.2243  data: 0.1123  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2661 (1.1566)  acc1: 73.2000 (75.4240)  acc5: 91.6000 (93.2800)  time: 0.2160  data: 0.1065  max mem: 18117
Test: Total time: 0:00:10 (0.4102 s / it)
* Acc@1 75.676 Acc@5 93.250 loss 1.154
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 76.01%
Epoch: [103]  [   0/1251]  eta: 1:08:23  lr: 0.003194  min_lr: 0.003194  loss: 4.1378 (4.1378)  weight_decay: 0.0500 (0.0500)  time: 3.2799  data: 1.7936  max mem: 18117
Epoch: [103]  [ 200/1251]  eta: 0:04:28  lr: 0.003191  min_lr: 0.003191  loss: 3.2948 (3.2408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6574 (0.6735)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [103]  [ 400/1251]  eta: 0:03:30  lr: 0.003188  min_lr: 0.003188  loss: 3.4400 (3.2124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5060 (0.7425)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [103]  [ 600/1251]  eta: 0:02:38  lr: 0.003185  min_lr: 0.003185  loss: 2.7841 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.7055)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [103]  [ 800/1251]  eta: 0:01:49  lr: 0.003182  min_lr: 0.003182  loss: 2.4966 (3.1981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6714 (0.6871)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [103]  [1000/1251]  eta: 0:01:00  lr: 0.003179  min_lr: 0.003179  loss: 3.3255 (3.2030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6339 (0.6902)  time: 0.2436  data: 0.0004  max mem: 18117
Epoch: [103]  [1200/1251]  eta: 0:00:12  lr: 0.003176  min_lr: 0.003176  loss: 2.5101 (3.2011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6357 (0.6862)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 2.6381 (3.1977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.6891)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [103] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 2.6381 (3.1704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.6891)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7564 (0.7564)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.7872  data: 5.6623  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9240 (0.8748)  acc1: 80.0000 (81.3091)  acc5: 96.0000 (96.0000)  time: 0.7681  data: 0.6558  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0846 (1.0770)  acc1: 76.4000 (76.8000)  acc5: 93.2000 (93.3143)  time: 0.2123  data: 0.1028  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2490 (1.0919)  acc1: 72.8000 (76.2560)  acc5: 92.0000 (93.1680)  time: 0.2118  data: 0.1027  max mem: 18117
Test: Total time: 0:00:10 (0.4219 s / it)
* Acc@1 76.156 Acc@5 93.360 loss 1.087
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.16%
Epoch: [104]  [   0/1251]  eta: 1:04:15  lr: 0.003176  min_lr: 0.003176  loss: 2.2305 (2.2305)  weight_decay: 0.0500 (0.0500)  time: 3.0821  data: 2.7354  max mem: 18117
Epoch: [104]  [ 200/1251]  eta: 0:04:24  lr: 0.003173  min_lr: 0.003173  loss: 2.9461 (3.1394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5865 (0.5954)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [104]  [ 400/1251]  eta: 0:03:28  lr: 0.003170  min_lr: 0.003170  loss: 3.7714 (3.1596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5494 (0.6166)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [104]  [ 600/1251]  eta: 0:02:38  lr: 0.003167  min_lr: 0.003167  loss: 2.6897 (3.1805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6283 (0.6263)  time: 0.2379  data: 0.0003  max mem: 18117
Epoch: [104]  [ 800/1251]  eta: 0:01:48  lr: 0.003164  min_lr: 0.003164  loss: 3.1401 (3.1966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5605 (0.6286)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [104]  [1000/1251]  eta: 0:01:00  lr: 0.003161  min_lr: 0.003161  loss: 3.2450 (3.2087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6149 (0.6454)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [104]  [1200/1251]  eta: 0:00:12  lr: 0.003158  min_lr: 0.003158  loss: 3.0856 (3.2034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6065 (0.6409)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 2.3953 (3.1950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5839 (0.6391)  time: 0.1958  data: 0.0005  max mem: 18117
Epoch: [104] Total time: 0:05:00 (0.2405 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 2.3953 (3.1676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5839 (0.6391)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6974 (0.6974)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.8838  data: 5.7623  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8139 (0.8663)  acc1: 81.2000 (80.4727)  acc5: 97.2000 (96.3636)  time: 0.7353  data: 0.6209  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1106 (1.0617)  acc1: 73.6000 (76.2476)  acc5: 94.0000 (93.3905)  time: 0.1977  data: 0.0868  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1818 (1.0739)  acc1: 72.4000 (75.8080)  acc5: 91.6000 (93.2800)  time: 0.1974  data: 0.0867  max mem: 18117
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 76.164 Acc@5 93.438 loss 1.061
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.16%
Epoch: [105]  [   0/1251]  eta: 1:05:49  lr: 0.003158  min_lr: 0.003158  loss: 2.2413 (2.2413)  weight_decay: 0.0500 (0.0500)  time: 3.1568  data: 2.8029  max mem: 18117
Epoch: [105]  [ 200/1251]  eta: 0:04:25  lr: 0.003155  min_lr: 0.003155  loss: 2.8489 (3.1523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6136 (0.6798)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [105]  [ 400/1251]  eta: 0:03:29  lr: 0.003152  min_lr: 0.003152  loss: 3.3433 (3.1737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5882 (0.6722)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [105]  [ 600/1251]  eta: 0:02:38  lr: 0.003149  min_lr: 0.003149  loss: 3.0481 (3.1861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6783 (0.6886)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [105]  [ 800/1251]  eta: 0:01:49  lr: 0.003146  min_lr: 0.003146  loss: 2.5383 (3.1672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5535 (0.6666)  time: 0.2399  data: 0.0005  max mem: 18117
Epoch: [105]  [1000/1251]  eta: 0:01:00  lr: 0.003143  min_lr: 0.003143  loss: 2.4698 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7530 (0.6681)  time: 0.2399  data: 0.0005  max mem: 18117
Epoch: [105]  [1200/1251]  eta: 0:00:12  lr: 0.003140  min_lr: 0.003140  loss: 3.7762 (3.1656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6936 (0.6804)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 2.9593 (3.1621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6058 (0.6821)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [105] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 2.9593 (3.1711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6058 (0.6821)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.8032 (0.8032)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 5.8404  data: 5.7144  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8684 (0.9030)  acc1: 82.4000 (79.9636)  acc5: 96.4000 (96.0000)  time: 0.7359  data: 0.6225  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1253 (1.1049)  acc1: 72.8000 (75.6571)  acc5: 92.4000 (93.2191)  time: 0.1926  data: 0.0824  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1758 (1.1076)  acc1: 72.8000 (75.5200)  acc5: 92.0000 (93.1520)  time: 0.2023  data: 0.0929  max mem: 18117
Test: Total time: 0:00:10 (0.4179 s / it)
* Acc@1 75.924 Acc@5 93.344 loss 1.096
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.16%
Epoch: [106]  [   0/1251]  eta: 1:09:07  lr: 0.003139  min_lr: 0.003139  loss: 2.5585 (2.5585)  weight_decay: 0.0500 (0.0500)  time: 3.3151  data: 2.4251  max mem: 18117
Epoch: [106]  [ 200/1251]  eta: 0:04:27  lr: 0.003136  min_lr: 0.003136  loss: 3.2287 (3.2169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6109 (0.6562)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [106]  [ 400/1251]  eta: 0:03:30  lr: 0.003133  min_lr: 0.003133  loss: 2.9701 (3.1645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6714 (0.6721)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [106]  [ 600/1251]  eta: 0:02:39  lr: 0.003130  min_lr: 0.003130  loss: 2.6231 (3.1647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6149 (0.6579)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [106]  [ 800/1251]  eta: 0:01:49  lr: 0.003127  min_lr: 0.003127  loss: 2.9202 (3.1631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6186 (0.6618)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [106]  [1000/1251]  eta: 0:01:00  lr: 0.003124  min_lr: 0.003124  loss: 2.6512 (3.1709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5669 (0.6591)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [106]  [1200/1251]  eta: 0:00:12  lr: 0.003121  min_lr: 0.003121  loss: 3.5248 (3.1754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.6563)  time: 0.2473  data: 0.0004  max mem: 18117
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 2.5077 (3.1679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6225 (0.6588)  time: 0.1954  data: 0.0008  max mem: 18117
Epoch: [106] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 2.5077 (3.1628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6225 (0.6588)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7001 (0.7001)  acc1: 84.0000 (84.0000)  acc5: 96.4000 (96.4000)  time: 5.7095  data: 5.5851  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8407 (0.8362)  acc1: 81.6000 (80.3636)  acc5: 96.4000 (96.1818)  time: 0.7330  data: 0.6174  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0555 (1.0244)  acc1: 75.6000 (76.3619)  acc5: 92.8000 (93.7333)  time: 0.1915  data: 0.0779  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1338 (1.0287)  acc1: 72.4000 (76.0960)  acc5: 92.4000 (93.7600)  time: 0.1903  data: 0.0778  max mem: 18117
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 76.370 Acc@5 93.516 loss 1.031
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.37%
Epoch: [107]  [   0/1251]  eta: 0:55:31  lr: 0.003121  min_lr: 0.003121  loss: 2.4061 (2.4061)  weight_decay: 0.0500 (0.0500)  time: 2.6627  data: 2.3014  max mem: 18117
Epoch: [107]  [ 200/1251]  eta: 0:04:26  lr: 0.003118  min_lr: 0.003118  loss: 3.3978 (3.0677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.6664)  time: 0.2448  data: 0.0006  max mem: 18117
Epoch: [107]  [ 400/1251]  eta: 0:03:29  lr: 0.003115  min_lr: 0.003115  loss: 2.8518 (3.0803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6665 (0.6906)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [107]  [ 600/1251]  eta: 0:02:38  lr: 0.003112  min_lr: 0.003112  loss: 2.7571 (3.0977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5604 (0.6988)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [107]  [ 800/1251]  eta: 0:01:49  lr: 0.003109  min_lr: 0.003109  loss: 2.8516 (3.1278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5114 (0.6731)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [107]  [1000/1251]  eta: 0:01:00  lr: 0.003106  min_lr: 0.003106  loss: 3.1838 (3.1357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5075 (0.6569)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [107]  [1200/1251]  eta: 0:00:12  lr: 0.003103  min_lr: 0.003103  loss: 3.0039 (3.1465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7034 (0.6715)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 2.5160 (3.1457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6039 (0.6688)  time: 0.1953  data: 0.0009  max mem: 18117
Epoch: [107] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 2.5160 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6039 (0.6688)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7732 (0.7732)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.5409  data: 5.4150  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8993 (0.8996)  acc1: 80.8000 (81.2000)  acc5: 96.8000 (96.4000)  time: 0.7482  data: 0.6358  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0973 (1.0955)  acc1: 75.6000 (76.3810)  acc5: 94.0000 (93.8476)  time: 0.2109  data: 0.1011  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2490 (1.1095)  acc1: 72.4000 (76.0640)  acc5: 92.4000 (93.6000)  time: 0.2095  data: 0.1010  max mem: 18117
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 76.132 Acc@5 93.352 loss 1.098
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.37%
Epoch: [108]  [   0/1251]  eta: 1:04:33  lr: 0.003102  min_lr: 0.003102  loss: 3.2912 (3.2912)  weight_decay: 0.0500 (0.0500)  time: 3.0961  data: 1.6066  max mem: 18117
Epoch: [108]  [ 200/1251]  eta: 0:04:27  lr: 0.003099  min_lr: 0.003099  loss: 3.3661 (3.1858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5818 (0.6427)  time: 0.2442  data: 0.0004  max mem: 18117
Epoch: [108]  [ 400/1251]  eta: 0:03:30  lr: 0.003096  min_lr: 0.003096  loss: 3.8458 (3.2070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6461 (0.6443)  time: 0.2398  data: 0.0005  max mem: 18117
Epoch: [108]  [ 600/1251]  eta: 0:02:39  lr: 0.003093  min_lr: 0.003093  loss: 3.0562 (3.1672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (0.6661)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [108]  [ 800/1251]  eta: 0:01:49  lr: 0.003090  min_lr: 0.003090  loss: 2.4588 (3.1821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (0.6524)  time: 0.2405  data: 0.0003  max mem: 18117
Epoch: [108]  [1000/1251]  eta: 0:01:00  lr: 0.003087  min_lr: 0.003087  loss: 3.3360 (3.1793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6216 (0.6586)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [108]  [1200/1251]  eta: 0:00:12  lr: 0.003084  min_lr: 0.003084  loss: 2.8528 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6501 (0.6619)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 2.4640 (3.1905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5809 (0.6586)  time: 0.1955  data: 0.0010  max mem: 18117
Epoch: [108] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 2.4640 (3.1471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5809 (0.6586)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7183 (0.7183)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.6304  data: 5.5016  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8898 (0.8881)  acc1: 80.4000 (80.2182)  acc5: 96.4000 (96.1818)  time: 0.7811  data: 0.6648  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1050 (1.0910)  acc1: 73.2000 (75.8095)  acc5: 93.2000 (93.6952)  time: 0.2243  data: 0.1124  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2400 (1.0972)  acc1: 72.8000 (75.7760)  acc5: 92.4000 (93.5200)  time: 0.2441  data: 0.1334  max mem: 18117
Test: Total time: 0:00:11 (0.4417 s / it)
* Acc@1 76.160 Acc@5 93.382 loss 1.084
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.37%
Epoch: [109]  [   0/1251]  eta: 1:07:56  lr: 0.003083  min_lr: 0.003083  loss: 2.3925 (2.3925)  weight_decay: 0.0500 (0.0500)  time: 3.2588  data: 2.3616  max mem: 18117
Epoch: [109]  [ 200/1251]  eta: 0:04:27  lr: 0.003080  min_lr: 0.003080  loss: 3.0736 (3.1541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.6644)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [109]  [ 400/1251]  eta: 0:03:30  lr: 0.003077  min_lr: 0.003077  loss: 3.1974 (3.1490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (0.6662)  time: 0.2394  data: 0.0005  max mem: 18117
Epoch: [109]  [ 600/1251]  eta: 0:02:39  lr: 0.003074  min_lr: 0.003074  loss: 2.4822 (3.1553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6461 (0.6618)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [109]  [ 800/1251]  eta: 0:01:49  lr: 0.003071  min_lr: 0.003071  loss: 2.5887 (3.1445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5588 (0.6635)  time: 0.2419  data: 0.0004  max mem: 18117
Epoch: [109]  [1000/1251]  eta: 0:01:00  lr: 0.003068  min_lr: 0.003068  loss: 2.7654 (3.1479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5908 (0.6531)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [109]  [1200/1251]  eta: 0:00:12  lr: 0.003065  min_lr: 0.003065  loss: 3.2160 (3.1489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6132 (0.6515)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 3.1193 (3.1534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6386 (0.6532)  time: 0.1955  data: 0.0008  max mem: 18117
Epoch: [109] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 3.1193 (3.1522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6386 (0.6532)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6751 (0.6751)  acc1: 88.8000 (88.8000)  acc5: 97.6000 (97.6000)  time: 5.7174  data: 5.5930  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9494 (0.8994)  acc1: 81.6000 (80.9818)  acc5: 96.8000 (96.4000)  time: 0.7512  data: 0.6364  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1235 (1.1156)  acc1: 74.4000 (76.2667)  acc5: 93.2000 (93.3143)  time: 0.2135  data: 0.1025  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2598 (1.1313)  acc1: 72.0000 (75.7600)  acc5: 92.0000 (93.2800)  time: 0.2121  data: 0.1025  max mem: 18117
Test: Total time: 0:00:10 (0.4200 s / it)
* Acc@1 76.188 Acc@5 93.410 loss 1.110
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.37%
Epoch: [110]  [   0/1251]  eta: 1:05:25  lr: 0.003064  min_lr: 0.003064  loss: 2.2575 (2.2575)  weight_decay: 0.0500 (0.0500)  time: 3.1378  data: 2.7085  max mem: 18117
Epoch: [110]  [ 200/1251]  eta: 0:04:26  lr: 0.003061  min_lr: 0.003061  loss: 3.7188 (3.1625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6496 (0.6900)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [110]  [ 400/1251]  eta: 0:03:29  lr: 0.003058  min_lr: 0.003058  loss: 3.0099 (3.1639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5549 (0.6861)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [110]  [ 600/1251]  eta: 0:02:39  lr: 0.003055  min_lr: 0.003055  loss: 2.5951 (3.1464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6109 (0.6725)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [110]  [ 800/1251]  eta: 0:01:49  lr: 0.003052  min_lr: 0.003052  loss: 2.6390 (3.1557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6125 (0.6826)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [110]  [1000/1251]  eta: 0:01:00  lr: 0.003049  min_lr: 0.003049  loss: 2.6569 (3.1498)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [110]  [1200/1251]  eta: 0:00:12  lr: 0.003046  min_lr: 0.003046  loss: 2.9639 (3.1589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (nan)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 2.5420 (3.1593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (nan)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [110] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 2.5420 (3.1490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7382 (0.7382)  acc1: 84.8000 (84.8000)  acc5: 96.0000 (96.0000)  time: 5.4973  data: 5.3505  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8900 (0.8793)  acc1: 81.2000 (80.5091)  acc5: 96.0000 (96.0364)  time: 0.7030  data: 0.5879  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0654 (1.0693)  acc1: 75.2000 (76.4762)  acc5: 93.6000 (93.3714)  time: 0.1964  data: 0.0864  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1649 (1.0771)  acc1: 73.6000 (76.0320)  acc5: 92.4000 (93.3280)  time: 0.2082  data: 0.0989  max mem: 18117
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 76.252 Acc@5 93.556 loss 1.066
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.37%
Epoch: [111]  [   0/1251]  eta: 1:07:42  lr: 0.003045  min_lr: 0.003045  loss: 2.2874 (2.2874)  weight_decay: 0.0500 (0.0500)  time: 3.2473  data: 2.9205  max mem: 18117
Epoch: [111]  [ 200/1251]  eta: 0:04:25  lr: 0.003042  min_lr: 0.003042  loss: 3.1056 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5841 (0.6138)  time: 0.2350  data: 0.0005  max mem: 18117
Epoch: [111]  [ 400/1251]  eta: 0:03:28  lr: 0.003039  min_lr: 0.003039  loss: 2.7474 (3.1300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6918 (0.6566)  time: 0.2362  data: 0.0003  max mem: 18117
Epoch: [111]  [ 600/1251]  eta: 0:02:37  lr: 0.003036  min_lr: 0.003036  loss: 3.5508 (3.1456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5053 (0.6451)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [111]  [ 800/1251]  eta: 0:01:48  lr: 0.003033  min_lr: 0.003033  loss: 2.4609 (3.1395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6341 (0.6537)  time: 0.2388  data: 0.0003  max mem: 18117
Epoch: [111]  [1000/1251]  eta: 0:01:00  lr: 0.003030  min_lr: 0.003030  loss: 2.8634 (3.1340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6185 (0.6509)  time: 0.2434  data: 0.0004  max mem: 18117
Epoch: [111]  [1200/1251]  eta: 0:00:12  lr: 0.003027  min_lr: 0.003027  loss: 3.8720 (3.1448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6030 (0.6653)  time: 0.2349  data: 0.0004  max mem: 18117
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 2.7080 (3.1380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (0.6666)  time: 0.1955  data: 0.0010  max mem: 18117
Epoch: [111] Total time: 0:04:59 (0.2397 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 2.7080 (3.1472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (0.6666)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6219 (0.6219)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.5095  data: 5.3620  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7879 (0.8378)  acc1: 80.4000 (80.3636)  acc5: 96.8000 (96.3636)  time: 0.7023  data: 0.5878  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0368 (1.0552)  acc1: 74.0000 (76.1905)  acc5: 93.6000 (93.5810)  time: 0.1868  data: 0.0772  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1459 (1.0759)  acc1: 73.6000 (75.6480)  acc5: 91.2000 (93.3120)  time: 0.1912  data: 0.0823  max mem: 18117
Test: Total time: 0:00:09 (0.3954 s / it)
* Acc@1 76.118 Acc@5 93.348 loss 1.052
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.37%
Epoch: [112]  [   0/1251]  eta: 1:07:44  lr: 0.003026  min_lr: 0.003026  loss: 4.1612 (4.1612)  weight_decay: 0.0500 (0.0500)  time: 3.2488  data: 2.4881  max mem: 18117
Epoch: [112]  [ 200/1251]  eta: 0:04:28  lr: 0.003023  min_lr: 0.003023  loss: 2.7644 (3.1624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7286 (0.7181)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [112]  [ 400/1251]  eta: 0:03:30  lr: 0.003020  min_lr: 0.003020  loss: 2.8358 (3.1535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5761 (0.7080)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [112]  [ 600/1251]  eta: 0:02:39  lr: 0.003017  min_lr: 0.003017  loss: 2.7256 (3.1310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6455 (0.7065)  time: 0.2411  data: 0.0005  max mem: 18117
Epoch: [112]  [ 800/1251]  eta: 0:01:49  lr: 0.003014  min_lr: 0.003014  loss: 3.0339 (3.1174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5471 (0.6838)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [112]  [1000/1251]  eta: 0:01:00  lr: 0.003011  min_lr: 0.003011  loss: 3.4346 (3.1455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.6748)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [112]  [1200/1251]  eta: 0:00:12  lr: 0.003007  min_lr: 0.003007  loss: 2.8316 (3.1532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6226 (0.6775)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 2.9419 (3.1543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6197 (0.6835)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [112] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 2.9419 (3.1481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6197 (0.6835)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.7375 (0.7375)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.9035  data: 5.7770  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8812 (0.8931)  acc1: 81.6000 (81.3091)  acc5: 96.8000 (96.2545)  time: 0.7087  data: 0.5957  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1050 (1.0944)  acc1: 74.4000 (76.5905)  acc5: 92.4000 (93.8286)  time: 0.1765  data: 0.0666  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1994 (1.1034)  acc1: 73.2000 (76.2880)  acc5: 91.6000 (93.7440)  time: 0.1748  data: 0.0665  max mem: 18117
Test: Total time: 0:00:09 (0.3990 s / it)
* Acc@1 76.164 Acc@5 93.556 loss 1.095
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.37%
Epoch: [113]  [   0/1251]  eta: 1:06:46  lr: 0.003007  min_lr: 0.003007  loss: 2.8876 (2.8876)  weight_decay: 0.0500 (0.0500)  time: 3.2025  data: 2.2739  max mem: 18117
Epoch: [113]  [ 200/1251]  eta: 0:04:26  lr: 0.003004  min_lr: 0.003004  loss: 2.9740 (3.1314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6040 (0.6697)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [113]  [ 400/1251]  eta: 0:03:28  lr: 0.003000  min_lr: 0.003000  loss: 3.7692 (3.1334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6893 (0.6660)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [113]  [ 600/1251]  eta: 0:02:37  lr: 0.002997  min_lr: 0.002997  loss: 3.1392 (3.1477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7328 (0.6836)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [113]  [ 800/1251]  eta: 0:01:49  lr: 0.002994  min_lr: 0.002994  loss: 3.5668 (3.1480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6156 (0.6859)  time: 0.2470  data: 0.0004  max mem: 18117
Epoch: [113]  [1000/1251]  eta: 0:01:00  lr: 0.002991  min_lr: 0.002991  loss: 2.8027 (3.1470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5206 (0.6868)  time: 0.2431  data: 0.0004  max mem: 18117
Epoch: [113]  [1200/1251]  eta: 0:00:12  lr: 0.002988  min_lr: 0.002988  loss: 2.7046 (3.1504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6260 (0.6811)  time: 0.2466  data: 0.0004  max mem: 18117
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 2.6509 (3.1471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6435 (0.6786)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [113] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 2.6509 (3.1501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6435 (0.6786)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7235 (0.7235)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.6321  data: 5.5030  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8876 (0.8925)  acc1: 80.0000 (80.7273)  acc5: 96.8000 (96.0000)  time: 0.7426  data: 0.6299  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1187 (1.0778)  acc1: 74.4000 (76.1333)  acc5: 92.8000 (93.2762)  time: 0.1953  data: 0.0857  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1449 (1.0854)  acc1: 72.8000 (75.7920)  acc5: 92.4000 (93.2320)  time: 0.2045  data: 0.0962  max mem: 18117
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 76.122 Acc@5 93.452 loss 1.074
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.37%
Epoch: [114]  [   0/1251]  eta: 1:12:34  lr: 0.002987  min_lr: 0.002987  loss: 3.1137 (3.1137)  weight_decay: 0.0500 (0.0500)  time: 3.4807  data: 2.9816  max mem: 18117
Epoch: [114]  [ 200/1251]  eta: 0:04:30  lr: 0.002984  min_lr: 0.002984  loss: 2.7681 (3.0508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5884 (0.6272)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [114]  [ 400/1251]  eta: 0:03:30  lr: 0.002981  min_lr: 0.002981  loss: 3.3537 (3.1120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5953 (0.6450)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [114]  [ 600/1251]  eta: 0:02:39  lr: 0.002978  min_lr: 0.002978  loss: 3.4497 (3.0802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6250 (0.6441)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [114]  [ 800/1251]  eta: 0:01:49  lr: 0.002975  min_lr: 0.002975  loss: 2.8998 (3.0913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5478 (0.6469)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [114]  [1000/1251]  eta: 0:01:00  lr: 0.002972  min_lr: 0.002972  loss: 3.8269 (3.1251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5953 (0.6520)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [114]  [1200/1251]  eta: 0:00:12  lr: 0.002968  min_lr: 0.002968  loss: 2.9403 (3.1312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5771 (0.6542)  time: 0.2352  data: 0.0004  max mem: 18117
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 3.6615 (3.1292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5940 (0.6548)  time: 0.1955  data: 0.0010  max mem: 18117
Epoch: [114] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 3.6615 (3.1324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5940 (0.6548)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.7767 (0.7767)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.9592  data: 5.8132  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.9381 (0.9464)  acc1: 81.2000 (80.7636)  acc5: 96.0000 (96.0727)  time: 0.7521  data: 0.6385  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1112 (1.1312)  acc1: 74.0000 (76.3619)  acc5: 94.0000 (93.8095)  time: 0.1962  data: 0.0868  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2050 (1.1390)  acc1: 74.0000 (76.1280)  acc5: 92.8000 (93.8080)  time: 0.1959  data: 0.0868  max mem: 18117
Test: Total time: 0:00:10 (0.4163 s / it)
* Acc@1 76.238 Acc@5 93.556 loss 1.138
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.37%
Epoch: [115]  [   0/1251]  eta: 1:05:09  lr: 0.002968  min_lr: 0.002968  loss: 2.3666 (2.3666)  weight_decay: 0.0500 (0.0500)  time: 3.1251  data: 2.2161  max mem: 18117
Epoch: [115]  [ 200/1251]  eta: 0:04:27  lr: 0.002965  min_lr: 0.002965  loss: 3.8832 (3.1917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5978 (0.6540)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [115]  [ 400/1251]  eta: 0:03:30  lr: 0.002961  min_lr: 0.002961  loss: 3.2541 (3.1421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7620 (0.6657)  time: 0.2406  data: 0.0004  max mem: 18117
Epoch: [115]  [ 600/1251]  eta: 0:02:39  lr: 0.002958  min_lr: 0.002958  loss: 3.4551 (3.1292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5604 (0.6898)  time: 0.2410  data: 0.0005  max mem: 18117
Epoch: [115]  [ 800/1251]  eta: 0:01:49  lr: 0.002955  min_lr: 0.002955  loss: 2.6583 (3.1107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5924 (0.6790)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [115]  [1000/1251]  eta: 0:01:00  lr: 0.002952  min_lr: 0.002952  loss: 2.9088 (3.1174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6502 (0.6675)  time: 0.2367  data: 0.0003  max mem: 18117
Epoch: [115]  [1200/1251]  eta: 0:00:12  lr: 0.002949  min_lr: 0.002949  loss: 2.7288 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6253 (0.6642)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 2.7253 (3.1192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6250 (0.6666)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [115] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 2.7253 (3.1383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6250 (0.6666)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6900 (0.6900)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.6307  data: 5.5030  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8115 (0.8436)  acc1: 82.0000 (80.9818)  acc5: 96.4000 (96.2909)  time: 0.7743  data: 0.6614  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0940 (1.0485)  acc1: 74.4000 (76.7238)  acc5: 92.0000 (93.4476)  time: 0.2143  data: 0.1032  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1941 (1.0582)  acc1: 72.8000 (76.2240)  acc5: 91.6000 (93.3600)  time: 0.2141  data: 0.1031  max mem: 18117
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 76.386 Acc@5 93.450 loss 1.052
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.39%
Epoch: [116]  [   0/1251]  eta: 1:04:31  lr: 0.002948  min_lr: 0.002948  loss: 3.9307 (3.9307)  weight_decay: 0.0500 (0.0500)  time: 3.0950  data: 2.7691  max mem: 18117
Epoch: [116]  [ 200/1251]  eta: 0:04:26  lr: 0.002945  min_lr: 0.002945  loss: 3.4306 (3.1462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6551 (0.6788)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [116]  [ 400/1251]  eta: 0:03:29  lr: 0.002942  min_lr: 0.002942  loss: 2.5689 (3.1437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6299 (0.6719)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [116]  [ 600/1251]  eta: 0:02:38  lr: 0.002938  min_lr: 0.002938  loss: 2.8127 (3.0883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6115 (0.6922)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [116]  [ 800/1251]  eta: 0:01:49  lr: 0.002935  min_lr: 0.002935  loss: 2.6817 (3.1059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5730 (0.6733)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [116]  [1000/1251]  eta: 0:01:00  lr: 0.002932  min_lr: 0.002932  loss: 3.1823 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.6597)  time: 0.2359  data: 0.0005  max mem: 18117
Epoch: [116]  [1200/1251]  eta: 0:00:12  lr: 0.002929  min_lr: 0.002929  loss: 2.7596 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6190 (0.6557)  time: 0.2449  data: 0.0004  max mem: 18117
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 2.8902 (3.1151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5611 (0.6513)  time: 0.2026  data: 0.0006  max mem: 18117
Epoch: [116] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 2.8902 (3.1272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5611 (0.6513)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7074 (0.7074)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.5330  data: 5.3784  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8649 (0.8704)  acc1: 82.4000 (80.5091)  acc5: 96.8000 (96.2182)  time: 0.7580  data: 0.6423  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0588 (1.0665)  acc1: 73.2000 (76.4000)  acc5: 94.4000 (93.6381)  time: 0.2289  data: 0.1189  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2369 (1.0799)  acc1: 73.2000 (76.0000)  acc5: 91.6000 (93.5680)  time: 0.2270  data: 0.1188  max mem: 18117
Test: Total time: 0:00:10 (0.4248 s / it)
* Acc@1 76.352 Acc@5 93.746 loss 1.071
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.39%
Epoch: [117]  [   0/1251]  eta: 1:05:47  lr: 0.002928  min_lr: 0.002928  loss: 2.9341 (2.9341)  weight_decay: 0.0500 (0.0500)  time: 3.1553  data: 2.7134  max mem: 18117
Epoch: [117]  [ 200/1251]  eta: 0:04:29  lr: 0.002925  min_lr: 0.002925  loss: 2.5244 (3.0119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5544 (0.6288)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [117]  [ 400/1251]  eta: 0:03:31  lr: 0.002922  min_lr: 0.002922  loss: 3.3725 (3.0753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7009 (0.6790)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [117]  [ 600/1251]  eta: 0:02:39  lr: 0.002919  min_lr: 0.002919  loss: 2.5436 (3.0841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (0.6625)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [117]  [ 800/1251]  eta: 0:01:49  lr: 0.002915  min_lr: 0.002915  loss: 2.5720 (3.0933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7144 (0.6665)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [117]  [1000/1251]  eta: 0:01:00  lr: 0.002912  min_lr: 0.002912  loss: 3.0728 (3.1046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5424 (0.6638)  time: 0.2422  data: 0.0003  max mem: 18117
Epoch: [117]  [1200/1251]  eta: 0:00:12  lr: 0.002909  min_lr: 0.002909  loss: 3.4798 (3.1093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7455 (0.6879)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 2.9039 (3.1094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6841 (0.6861)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [117] Total time: 0:05:03 (0.2422 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 2.9039 (3.1313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6841 (0.6861)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7097 (0.7097)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.3267  data: 5.1788  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8933 (0.8907)  acc1: 79.6000 (80.3273)  acc5: 96.0000 (96.2182)  time: 0.7288  data: 0.6153  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0737 (1.0901)  acc1: 74.4000 (76.4000)  acc5: 92.4000 (93.4095)  time: 0.2172  data: 0.1080  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2329 (1.1042)  acc1: 73.6000 (75.9040)  acc5: 91.6000 (93.2480)  time: 0.2162  data: 0.1079  max mem: 18117
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 76.194 Acc@5 93.534 loss 1.095
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.39%
Epoch: [118]  [   0/1251]  eta: 1:08:37  lr: 0.002908  min_lr: 0.002908  loss: 3.6919 (3.6919)  weight_decay: 0.0500 (0.0500)  time: 3.2913  data: 1.8618  max mem: 18117
Epoch: [118]  [ 200/1251]  eta: 0:04:26  lr: 0.002905  min_lr: 0.002905  loss: 2.5363 (3.0632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6320 (0.6201)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [118]  [ 400/1251]  eta: 0:03:29  lr: 0.002902  min_lr: 0.002902  loss: 2.4886 (3.1322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6007 (0.6726)  time: 0.2386  data: 0.0003  max mem: 18117
Epoch: [118]  [ 600/1251]  eta: 0:02:38  lr: 0.002899  min_lr: 0.002899  loss: 3.4483 (3.1562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5669 (0.6782)  time: 0.2358  data: 0.0005  max mem: 18117
Epoch: [118]  [ 800/1251]  eta: 0:01:49  lr: 0.002895  min_lr: 0.002895  loss: 3.3844 (3.1532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6513 (0.6724)  time: 0.2411  data: 0.0005  max mem: 18117
Epoch: [118]  [1000/1251]  eta: 0:01:00  lr: 0.002892  min_lr: 0.002892  loss: 3.2024 (3.1412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5879 (0.6586)  time: 0.2364  data: 0.0005  max mem: 18117
Epoch: [118]  [1200/1251]  eta: 0:00:12  lr: 0.002889  min_lr: 0.002889  loss: 2.9067 (3.1561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6959 (0.6552)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 3.5364 (3.1521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (0.6607)  time: 0.1961  data: 0.0006  max mem: 18117
Epoch: [118] Total time: 0:05:01 (0.2406 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 3.5364 (3.1303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (0.6607)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6877 (0.6877)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.7633  data: 5.6120  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9313 (0.8920)  acc1: 81.2000 (80.0000)  acc5: 96.4000 (96.3273)  time: 0.7060  data: 0.5909  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1189 (1.0777)  acc1: 74.0000 (76.0000)  acc5: 93.6000 (93.7143)  time: 0.1806  data: 0.0708  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2015 (1.0878)  acc1: 72.8000 (75.7760)  acc5: 92.4000 (93.5680)  time: 0.2031  data: 0.0936  max mem: 18117
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 76.306 Acc@5 93.500 loss 1.076
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.39%
Epoch: [119]  [   0/1251]  eta: 1:06:07  lr: 0.002888  min_lr: 0.002888  loss: 4.6865 (4.6865)  weight_decay: 0.0500 (0.0500)  time: 3.1718  data: 2.5336  max mem: 18117
Epoch: [119]  [ 200/1251]  eta: 0:04:25  lr: 0.002885  min_lr: 0.002885  loss: 2.4203 (3.1286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5158 (0.6179)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [119]  [ 400/1251]  eta: 0:03:28  lr: 0.002882  min_lr: 0.002882  loss: 2.5065 (3.0804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6836 (0.6652)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [119]  [ 600/1251]  eta: 0:02:38  lr: 0.002879  min_lr: 0.002879  loss: 2.9822 (3.0972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5737 (0.6467)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [119]  [ 800/1251]  eta: 0:01:49  lr: 0.002875  min_lr: 0.002875  loss: 3.5601 (3.1155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7153 (0.6688)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [119]  [1000/1251]  eta: 0:01:00  lr: 0.002872  min_lr: 0.002872  loss: 2.7228 (3.1220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6257 (0.6673)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [119]  [1200/1251]  eta: 0:00:12  lr: 0.002869  min_lr: 0.002869  loss: 2.7830 (3.1073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5428 (0.6720)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 3.2196 (3.1187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5514 (0.6706)  time: 0.1972  data: 0.0008  max mem: 18117
Epoch: [119] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 3.2196 (3.1216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5514 (0.6706)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7077 (0.7077)  acc1: 84.8000 (84.8000)  acc5: 96.4000 (96.4000)  time: 5.7351  data: 5.5846  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8835 (0.8743)  acc1: 81.6000 (80.1091)  acc5: 96.0000 (95.8545)  time: 0.7432  data: 0.6281  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0900 (1.0761)  acc1: 73.2000 (76.6286)  acc5: 92.8000 (93.4095)  time: 0.2035  data: 0.0932  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1711 (1.0869)  acc1: 72.8000 (76.3040)  acc5: 92.4000 (93.3760)  time: 0.2057  data: 0.0962  max mem: 18117
Test: Total time: 0:00:10 (0.4154 s / it)
* Acc@1 76.508 Acc@5 93.678 loss 1.078
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.51%
Epoch: [120]  [   0/1251]  eta: 1:08:09  lr: 0.002868  min_lr: 0.002868  loss: 4.1285 (4.1285)  weight_decay: 0.0500 (0.0500)  time: 3.2691  data: 2.9484  max mem: 18117
Epoch: [120]  [ 200/1251]  eta: 0:04:28  lr: 0.002865  min_lr: 0.002865  loss: 2.5993 (3.0891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6544 (0.6169)  time: 0.2420  data: 0.0003  max mem: 18117
Epoch: [120]  [ 400/1251]  eta: 0:03:30  lr: 0.002862  min_lr: 0.002862  loss: 2.9819 (3.0629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.6217)  time: 0.2434  data: 0.0004  max mem: 18117
Epoch: [120]  [ 600/1251]  eta: 0:02:39  lr: 0.002858  min_lr: 0.002858  loss: 3.0908 (3.0865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7493 (0.6569)  time: 0.2473  data: 0.0005  max mem: 18117
Epoch: [120]  [ 800/1251]  eta: 0:01:50  lr: 0.002855  min_lr: 0.002855  loss: 2.4855 (3.1020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5498 (0.6445)  time: 0.2419  data: 0.0005  max mem: 18117
Epoch: [120]  [1000/1251]  eta: 0:01:01  lr: 0.002852  min_lr: 0.002852  loss: 2.8443 (3.1106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6461 (0.6620)  time: 0.2372  data: 0.0005  max mem: 18117
Epoch: [120]  [1200/1251]  eta: 0:00:12  lr: 0.002849  min_lr: 0.002849  loss: 2.6746 (3.1040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.6587)  time: 0.2407  data: 0.0003  max mem: 18117
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 3.4516 (3.1053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5800 (0.6558)  time: 0.1959  data: 0.0007  max mem: 18117
Epoch: [120] Total time: 0:05:04 (0.2430 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 3.4516 (3.1204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5800 (0.6558)
Test:  [ 0/25]  eta: 0:01:31  loss: 0.7399 (0.7399)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 3.6544  data: 3.5261  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.8447 (0.8799)  acc1: 83.2000 (80.4727)  acc5: 96.4000 (96.2182)  time: 0.6247  data: 0.5128  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0637 (1.0683)  acc1: 74.8000 (76.4571)  acc5: 93.6000 (93.8286)  time: 0.2696  data: 0.1605  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1756 (1.0803)  acc1: 74.0000 (76.0320)  acc5: 93.2000 (93.6160)  time: 0.2134  data: 0.1054  max mem: 18117
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 76.592 Acc@5 93.710 loss 1.071
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.59%
Epoch: [121]  [   0/1251]  eta: 1:09:29  lr: 0.002848  min_lr: 0.002848  loss: 3.2057 (3.2057)  weight_decay: 0.0500 (0.0500)  time: 3.3331  data: 3.0730  max mem: 18117
Epoch: [121]  [ 200/1251]  eta: 0:04:27  lr: 0.002845  min_lr: 0.002845  loss: 2.7120 (3.0580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6548 (0.6832)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [121]  [ 400/1251]  eta: 0:03:29  lr: 0.002841  min_lr: 0.002841  loss: 2.3708 (3.0856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5668 (0.6646)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [121]  [ 600/1251]  eta: 0:02:38  lr: 0.002838  min_lr: 0.002838  loss: 2.6106 (3.0768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6153 (0.6609)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [121]  [ 800/1251]  eta: 0:01:49  lr: 0.002835  min_lr: 0.002835  loss: 3.5358 (3.1064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7590 (0.7051)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [121]  [1000/1251]  eta: 0:01:00  lr: 0.002831  min_lr: 0.002831  loss: 2.4890 (3.1112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5896 (0.7019)  time: 0.2373  data: 0.0005  max mem: 18117
Epoch: [121]  [1200/1251]  eta: 0:00:12  lr: 0.002828  min_lr: 0.002828  loss: 2.4284 (3.0944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6110 (0.6924)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 2.9027 (3.0948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5573 (0.6874)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [121] Total time: 0:05:00 (0.2401 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 2.9027 (3.1144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5573 (0.6874)
Test:  [ 0/25]  eta: 0:01:30  loss: 0.6523 (0.6523)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 3.6217  data: 3.4733  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8594 (0.8467)  acc1: 80.4000 (80.3273)  acc5: 96.4000 (96.0727)  time: 0.6718  data: 0.5593  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1083 (1.0435)  acc1: 74.4000 (76.1905)  acc5: 93.2000 (93.6571)  time: 0.2849  data: 0.1757  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1054 (1.0478)  acc1: 74.4000 (76.0800)  acc5: 93.6000 (93.6640)  time: 0.2157  data: 0.1066  max mem: 18117
Test: Total time: 0:00:09 (0.3957 s / it)
* Acc@1 76.706 Acc@5 93.680 loss 1.040
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.71%
Epoch: [122]  [   0/1251]  eta: 0:59:07  lr: 0.002827  min_lr: 0.002827  loss: 2.7528 (2.7528)  weight_decay: 0.0500 (0.0500)  time: 2.8355  data: 2.5199  max mem: 18117
Epoch: [122]  [ 200/1251]  eta: 0:04:27  lr: 0.002824  min_lr: 0.002824  loss: 2.5559 (3.1242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6102 (0.6417)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [122]  [ 400/1251]  eta: 0:03:29  lr: 0.002821  min_lr: 0.002821  loss: 2.6135 (3.1051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6345 (0.6662)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [122]  [ 600/1251]  eta: 0:02:38  lr: 0.002818  min_lr: 0.002818  loss: 3.0142 (3.0897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5815 (0.6453)  time: 0.2428  data: 0.0004  max mem: 18117
Epoch: [122]  [ 800/1251]  eta: 0:01:49  lr: 0.002814  min_lr: 0.002814  loss: 2.7785 (3.0962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6330 (0.6415)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [122]  [1000/1251]  eta: 0:01:00  lr: 0.002811  min_lr: 0.002811  loss: 3.1458 (3.1161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5564 (0.6449)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [122]  [1200/1251]  eta: 0:00:12  lr: 0.002808  min_lr: 0.002808  loss: 2.4715 (3.1300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7167 (0.6422)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 2.8075 (3.1310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6009 (0.6394)  time: 0.1959  data: 0.0007  max mem: 18117
Epoch: [122] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 2.8075 (3.1130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6009 (0.6394)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6690 (0.6690)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.5490  data: 5.4245  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8633 (0.8423)  acc1: 80.4000 (80.7636)  acc5: 96.4000 (96.3636)  time: 0.7800  data: 0.6652  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0210 (1.0386)  acc1: 74.0000 (76.9333)  acc5: 94.4000 (93.9238)  time: 0.2176  data: 0.1059  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1609 (1.0496)  acc1: 74.0000 (76.5280)  acc5: 92.0000 (93.9040)  time: 0.2197  data: 0.1094  max mem: 18117
Test: Total time: 0:00:10 (0.4195 s / it)
* Acc@1 76.774 Acc@5 93.738 loss 1.049
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.77%
Epoch: [123]  [   0/1251]  eta: 1:03:58  lr: 0.002807  min_lr: 0.002807  loss: 3.6785 (3.6785)  weight_decay: 0.0500 (0.0500)  time: 3.0680  data: 2.7425  max mem: 18117
Epoch: [123]  [ 200/1251]  eta: 0:04:27  lr: 0.002804  min_lr: 0.002804  loss: 2.6849 (3.1428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6493 (0.7330)  time: 0.2436  data: 0.0004  max mem: 18117
Epoch: [123]  [ 400/1251]  eta: 0:03:30  lr: 0.002800  min_lr: 0.002800  loss: 3.2171 (3.1368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6120 (0.6833)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [123]  [ 600/1251]  eta: 0:02:38  lr: 0.002797  min_lr: 0.002797  loss: 3.2204 (3.1480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5694 (0.6673)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [123]  [ 800/1251]  eta: 0:01:49  lr: 0.002794  min_lr: 0.002794  loss: 2.6286 (3.1385)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [123]  [1000/1251]  eta: 0:01:00  lr: 0.002790  min_lr: 0.002790  loss: 3.3659 (3.1380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6687 (nan)  time: 0.2348  data: 0.0004  max mem: 18117
Epoch: [123]  [1200/1251]  eta: 0:00:12  lr: 0.002787  min_lr: 0.002787  loss: 3.7232 (3.1344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (nan)  time: 0.2364  data: 0.0005  max mem: 18117
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 2.5736 (3.1312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6080 (nan)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [123] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 2.5736 (3.1199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6080 (nan)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6584 (0.6584)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.3761  data: 5.2457  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7486 (0.8120)  acc1: 82.4000 (81.3455)  acc5: 96.8000 (96.5818)  time: 0.6866  data: 0.5742  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9948 (1.0123)  acc1: 74.8000 (76.7619)  acc5: 93.2000 (93.7524)  time: 0.1918  data: 0.0824  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1680 (1.0263)  acc1: 73.2000 (76.3520)  acc5: 92.0000 (93.6000)  time: 0.2105  data: 0.1020  max mem: 18117
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 76.780 Acc@5 93.678 loss 1.022
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.78%
Epoch: [124]  [   0/1251]  eta: 0:57:47  lr: 0.002786  min_lr: 0.002786  loss: 3.3815 (3.3815)  weight_decay: 0.0500 (0.0500)  time: 2.7717  data: 2.4207  max mem: 18117
Epoch: [124]  [ 200/1251]  eta: 0:04:24  lr: 0.002783  min_lr: 0.002783  loss: 3.4336 (3.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5903 (0.6401)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [124]  [ 400/1251]  eta: 0:03:29  lr: 0.002780  min_lr: 0.002780  loss: 3.0188 (3.1762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5990 (0.6525)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [124]  [ 600/1251]  eta: 0:02:38  lr: 0.002776  min_lr: 0.002776  loss: 3.4042 (3.1344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6118 (0.7002)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [124]  [ 800/1251]  eta: 0:01:49  lr: 0.002773  min_lr: 0.002773  loss: 2.5616 (3.1096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6181 (0.6758)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [124]  [1000/1251]  eta: 0:01:00  lr: 0.002770  min_lr: 0.002770  loss: 3.3264 (3.1146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (0.6705)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [124]  [1200/1251]  eta: 0:00:12  lr: 0.002766  min_lr: 0.002766  loss: 2.5887 (3.1045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7199 (0.6857)  time: 0.2403  data: 0.0003  max mem: 18117
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 3.1884 (3.1024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5764 (0.6836)  time: 0.1970  data: 0.0005  max mem: 18117
Epoch: [124] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 3.1884 (3.1134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5764 (0.6836)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6668 (0.6668)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.7174  data: 5.5906  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8311 (0.8479)  acc1: 81.2000 (81.2000)  acc5: 96.4000 (96.2182)  time: 0.7850  data: 0.6727  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0546 (1.0390)  acc1: 74.8000 (77.2000)  acc5: 93.2000 (93.6952)  time: 0.2193  data: 0.1089  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1343 (1.0531)  acc1: 74.4000 (76.7680)  acc5: 92.8000 (93.6640)  time: 0.2178  data: 0.1088  max mem: 18117
Test: Total time: 0:00:10 (0.4247 s / it)
* Acc@1 76.742 Acc@5 93.716 loss 1.051
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.78%
Epoch: [125]  [   0/1251]  eta: 1:07:27  lr: 0.002766  min_lr: 0.002766  loss: 2.1655 (2.1655)  weight_decay: 0.0500 (0.0500)  time: 3.2350  data: 2.2551  max mem: 18117
Epoch: [125]  [ 200/1251]  eta: 0:04:29  lr: 0.002762  min_lr: 0.002762  loss: 3.4385 (3.1170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6655 (0.6437)  time: 0.2426  data: 0.0004  max mem: 18117
Epoch: [125]  [ 400/1251]  eta: 0:03:30  lr: 0.002759  min_lr: 0.002759  loss: 2.9743 (3.0473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5444 (0.6734)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [125]  [ 600/1251]  eta: 0:02:39  lr: 0.002756  min_lr: 0.002756  loss: 2.8674 (3.0597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5829 (0.6667)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [125]  [ 800/1251]  eta: 0:01:49  lr: 0.002752  min_lr: 0.002752  loss: 3.4912 (3.0609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6447 (0.6646)  time: 0.2466  data: 0.0004  max mem: 18117
Epoch: [125]  [1000/1251]  eta: 0:01:00  lr: 0.002749  min_lr: 0.002749  loss: 2.6158 (3.0630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5681 (0.6579)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [125]  [1200/1251]  eta: 0:00:12  lr: 0.002746  min_lr: 0.002746  loss: 2.6497 (3.0706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6022 (0.6617)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 2.5665 (3.0726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6022 (0.6605)  time: 0.1951  data: 0.0006  max mem: 18117
Epoch: [125] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 2.5665 (3.1066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6022 (0.6605)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6840 (0.6840)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.7834  data: 5.6322  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8638 (0.8589)  acc1: 81.2000 (81.2364)  acc5: 96.0000 (96.1455)  time: 0.7684  data: 0.6535  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0736 (1.0507)  acc1: 74.8000 (77.0095)  acc5: 93.2000 (93.6571)  time: 0.2169  data: 0.1072  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1699 (1.0605)  acc1: 72.8000 (76.4160)  acc5: 92.4000 (93.6000)  time: 0.2164  data: 0.1071  max mem: 18117
Test: Total time: 0:00:10 (0.4254 s / it)
* Acc@1 76.894 Acc@5 93.828 loss 1.050
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.89%
Epoch: [126]  [   0/1251]  eta: 1:04:25  lr: 0.002745  min_lr: 0.002745  loss: 3.5516 (3.5516)  weight_decay: 0.0500 (0.0500)  time: 3.0903  data: 2.7455  max mem: 18117
Epoch: [126]  [ 200/1251]  eta: 0:04:27  lr: 0.002742  min_lr: 0.002742  loss: 2.4796 (3.1269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5645 (0.6193)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [126]  [ 400/1251]  eta: 0:03:30  lr: 0.002738  min_lr: 0.002738  loss: 2.8056 (3.1501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.6680)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [126]  [ 600/1251]  eta: 0:02:39  lr: 0.002735  min_lr: 0.002735  loss: 3.1102 (3.1309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.6803)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [126]  [ 800/1251]  eta: 0:01:49  lr: 0.002732  min_lr: 0.002732  loss: 2.5193 (3.1094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5714 (0.6770)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [126]  [1000/1251]  eta: 0:01:00  lr: 0.002728  min_lr: 0.002728  loss: 3.4668 (3.1235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5690 (0.6728)  time: 0.2401  data: 0.0005  max mem: 18117
Epoch: [126]  [1200/1251]  eta: 0:00:12  lr: 0.002725  min_lr: 0.002725  loss: 3.2223 (3.1255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6131 (0.6679)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 2.5210 (3.1216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6404 (0.6697)  time: 0.1965  data: 0.0005  max mem: 18117
Epoch: [126] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 2.5210 (3.1078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6404 (0.6697)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7328 (0.7328)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 5.6042  data: 5.4558  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8302 (0.8841)  acc1: 81.6000 (80.8000)  acc5: 96.8000 (96.3636)  time: 0.7520  data: 0.6369  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0840 (1.0829)  acc1: 74.0000 (76.5524)  acc5: 93.2000 (93.6762)  time: 0.2163  data: 0.1056  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2049 (1.0933)  acc1: 72.4000 (76.2080)  acc5: 92.4000 (93.5360)  time: 0.2157  data: 0.1055  max mem: 18117
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 76.506 Acc@5 93.812 loss 1.085
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.89%
Epoch: [127]  [   0/1251]  eta: 1:04:34  lr: 0.002724  min_lr: 0.002724  loss: 2.1353 (2.1353)  weight_decay: 0.0500 (0.0500)  time: 3.0973  data: 2.6923  max mem: 18117
Epoch: [127]  [ 200/1251]  eta: 0:04:29  lr: 0.002721  min_lr: 0.002721  loss: 3.2298 (3.0759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5497 (0.6360)  time: 0.2475  data: 0.0004  max mem: 18117
Epoch: [127]  [ 400/1251]  eta: 0:03:31  lr: 0.002717  min_lr: 0.002717  loss: 3.0247 (3.0786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5917 (0.6506)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [127]  [ 600/1251]  eta: 0:02:39  lr: 0.002714  min_lr: 0.002714  loss: 2.6629 (3.0956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5885 (0.6434)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [127]  [ 800/1251]  eta: 0:01:49  lr: 0.002711  min_lr: 0.002711  loss: 3.5165 (3.0951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6429 (0.6402)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [127]  [1000/1251]  eta: 0:01:00  lr: 0.002707  min_lr: 0.002707  loss: 2.6195 (3.1013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5499 (0.6480)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [127]  [1200/1251]  eta: 0:00:12  lr: 0.002704  min_lr: 0.002704  loss: 3.2742 (3.1085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7186 (0.6620)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 2.8293 (3.1109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.6596)  time: 0.1987  data: 0.0005  max mem: 18117
Epoch: [127] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 2.8293 (3.0944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.6596)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.7283 (0.7283)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.9123  data: 5.7879  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8792 (0.8981)  acc1: 81.2000 (80.7273)  acc5: 96.8000 (95.9636)  time: 0.7583  data: 0.6461  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1647 (1.1039)  acc1: 73.2000 (76.6857)  acc5: 92.8000 (93.3905)  time: 0.1996  data: 0.0894  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2325 (1.1131)  acc1: 72.4000 (76.2880)  acc5: 91.6000 (93.3920)  time: 0.1988  data: 0.0894  max mem: 18117
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 76.688 Acc@5 93.764 loss 1.095
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.89%
Epoch: [128]  [   0/1251]  eta: 1:06:57  lr: 0.002703  min_lr: 0.002703  loss: 2.8814 (2.8814)  weight_decay: 0.0500 (0.0500)  time: 3.2114  data: 1.8414  max mem: 18117
Epoch: [128]  [ 200/1251]  eta: 0:04:27  lr: 0.002700  min_lr: 0.002700  loss: 2.7585 (3.0708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6343 (0.6644)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [128]  [ 400/1251]  eta: 0:03:29  lr: 0.002696  min_lr: 0.002696  loss: 2.6281 (3.0782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6394 (0.6659)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [128]  [ 600/1251]  eta: 0:02:38  lr: 0.002693  min_lr: 0.002693  loss: 3.6739 (3.1029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5363 (0.6480)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [128]  [ 800/1251]  eta: 0:01:49  lr: 0.002690  min_lr: 0.002690  loss: 3.6043 (3.1090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6123 (0.6698)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [128]  [1000/1251]  eta: 0:01:00  lr: 0.002686  min_lr: 0.002686  loss: 3.4534 (3.1275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6176 (0.6640)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [128]  [1200/1251]  eta: 0:00:12  lr: 0.002683  min_lr: 0.002683  loss: 2.5826 (3.1226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5461 (0.6669)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 2.6327 (3.1219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6214 (0.6707)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [128] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 2.6327 (3.1110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6214 (0.6707)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6827 (0.6827)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.5906  data: 5.4662  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8400 (0.8592)  acc1: 81.6000 (81.6364)  acc5: 96.0000 (96.2182)  time: 0.7727  data: 0.6589  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0603 (1.0596)  acc1: 75.6000 (77.3524)  acc5: 92.8000 (93.4286)  time: 0.2206  data: 0.1099  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.2099 (1.0720)  acc1: 74.0000 (76.8320)  acc5: 92.0000 (93.2800)  time: 0.2191  data: 0.1098  max mem: 18117
Test: Total time: 0:00:10 (0.4207 s / it)
* Acc@1 76.872 Acc@5 93.718 loss 1.055
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.89%
Epoch: [129]  [   0/1251]  eta: 1:08:52  lr: 0.002682  min_lr: 0.002682  loss: 2.1212 (2.1212)  weight_decay: 0.0500 (0.0500)  time: 3.3032  data: 2.5542  max mem: 18117
Epoch: [129]  [ 200/1251]  eta: 0:04:29  lr: 0.002679  min_lr: 0.002679  loss: 2.5244 (3.1016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5887 (0.5957)  time: 0.2391  data: 0.0008  max mem: 18117
Epoch: [129]  [ 400/1251]  eta: 0:03:30  lr: 0.002675  min_lr: 0.002675  loss: 3.3914 (3.1032)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [129]  [ 600/1251]  eta: 0:02:39  lr: 0.002672  min_lr: 0.002672  loss: 3.0812 (3.0992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6701 (nan)  time: 0.2475  data: 0.0005  max mem: 18117
Epoch: [129]  [ 800/1251]  eta: 0:01:49  lr: 0.002668  min_lr: 0.002668  loss: 3.0570 (3.1029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (nan)  time: 0.2377  data: 0.0003  max mem: 18117
Epoch: [129]  [1000/1251]  eta: 0:01:00  lr: 0.002665  min_lr: 0.002665  loss: 3.0594 (3.1146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6241 (nan)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [129]  [1200/1251]  eta: 0:00:12  lr: 0.002662  min_lr: 0.002662  loss: 3.7300 (3.1152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7351 (nan)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 2.5756 (3.1155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (nan)  time: 0.1951  data: 0.0006  max mem: 18117
Epoch: [129] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 2.5756 (3.1018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6974 (0.6974)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.5654  data: 5.4068  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8819 (0.8528)  acc1: 82.8000 (81.0909)  acc5: 97.2000 (96.4000)  time: 0.7509  data: 0.6378  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0487 (1.0460)  acc1: 74.0000 (77.2571)  acc5: 94.4000 (93.6762)  time: 0.2231  data: 0.1148  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1420 (1.0535)  acc1: 73.2000 (76.8320)  acc5: 92.0000 (93.6480)  time: 0.2228  data: 0.1147  max mem: 18117
Test: Total time: 0:00:10 (0.4215 s / it)
* Acc@1 76.912 Acc@5 93.764 loss 1.046
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.91%
Epoch: [130]  [   0/1251]  eta: 1:02:10  lr: 0.002661  min_lr: 0.002661  loss: 2.1582 (2.1582)  weight_decay: 0.0500 (0.0500)  time: 2.9816  data: 2.6359  max mem: 18117
Epoch: [130]  [ 200/1251]  eta: 0:04:24  lr: 0.002657  min_lr: 0.002657  loss: 2.9316 (3.1134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7065)  time: 0.2367  data: 0.0005  max mem: 18117
Epoch: [130]  [ 400/1251]  eta: 0:03:28  lr: 0.002654  min_lr: 0.002654  loss: 3.1113 (3.1019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5666 (0.6585)  time: 0.2470  data: 0.0004  max mem: 18117
Epoch: [130]  [ 600/1251]  eta: 0:02:38  lr: 0.002651  min_lr: 0.002651  loss: 2.8145 (3.1273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6216 (0.6729)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [130]  [ 800/1251]  eta: 0:01:49  lr: 0.002647  min_lr: 0.002647  loss: 3.3662 (3.1077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5922 (0.6731)  time: 0.2456  data: 0.0005  max mem: 18117
Epoch: [130]  [1000/1251]  eta: 0:01:00  lr: 0.002644  min_lr: 0.002644  loss: 2.6654 (3.1170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6859 (0.6656)  time: 0.2422  data: 0.0005  max mem: 18117
Epoch: [130]  [1200/1251]  eta: 0:00:12  lr: 0.002640  min_lr: 0.002640  loss: 2.6289 (3.0996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6271 (0.6684)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 2.8479 (3.1025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5871 (0.6669)  time: 0.1968  data: 0.0006  max mem: 18117
Epoch: [130] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 2.8479 (3.0910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5871 (0.6669)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6870 (0.6870)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.6165  data: 5.4879  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8157 (0.8546)  acc1: 82.8000 (81.8909)  acc5: 97.2000 (96.5818)  time: 0.6946  data: 0.5826  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0360 (1.0546)  acc1: 75.6000 (77.5048)  acc5: 93.6000 (93.9238)  time: 0.1862  data: 0.0771  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1765 (1.0663)  acc1: 75.2000 (77.1840)  acc5: 93.6000 (93.9840)  time: 0.1939  data: 0.0849  max mem: 18117
Test: Total time: 0:00:10 (0.4011 s / it)
* Acc@1 77.256 Acc@5 93.934 loss 1.070
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.26%
Epoch: [131]  [   0/1251]  eta: 1:00:42  lr: 0.002640  min_lr: 0.002640  loss: 3.1161 (3.1161)  weight_decay: 0.0500 (0.0500)  time: 2.9114  data: 2.5784  max mem: 18117
Epoch: [131]  [ 200/1251]  eta: 0:04:26  lr: 0.002636  min_lr: 0.002636  loss: 2.9676 (3.1269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5950 (0.6708)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [131]  [ 400/1251]  eta: 0:03:28  lr: 0.002633  min_lr: 0.002633  loss: 3.5554 (3.0896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6507 (0.6807)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [131]  [ 600/1251]  eta: 0:02:38  lr: 0.002629  min_lr: 0.002629  loss: 3.5650 (3.1042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6140 (0.6762)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [131]  [ 800/1251]  eta: 0:01:49  lr: 0.002626  min_lr: 0.002626  loss: 2.8822 (3.1021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6511 (0.6749)  time: 0.2419  data: 0.0004  max mem: 18117
Epoch: [131]  [1000/1251]  eta: 0:01:00  lr: 0.002623  min_lr: 0.002623  loss: 3.0559 (3.1111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5523 (0.6821)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [131]  [1200/1251]  eta: 0:00:12  lr: 0.002619  min_lr: 0.002619  loss: 2.6983 (3.0924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.6845)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 2.8858 (3.0968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.6839)  time: 0.1952  data: 0.0005  max mem: 18117
Epoch: [131] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 2.8858 (3.0945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.6839)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6810 (0.6810)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.6416  data: 5.4930  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9153 (0.8723)  acc1: 80.0000 (81.1636)  acc5: 96.8000 (96.2546)  time: 0.7276  data: 0.6148  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0717 (1.0658)  acc1: 75.6000 (77.1048)  acc5: 93.6000 (93.8667)  time: 0.1969  data: 0.0883  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1934 (1.0749)  acc1: 74.0000 (76.8000)  acc5: 92.4000 (93.8400)  time: 0.1965  data: 0.0882  max mem: 18117
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 77.226 Acc@5 93.874 loss 1.066
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.26%
Epoch: [132]  [   0/1251]  eta: 1:10:40  lr: 0.002618  min_lr: 0.002618  loss: 3.9031 (3.9031)  weight_decay: 0.0500 (0.0500)  time: 3.3893  data: 3.0518  max mem: 18117
Epoch: [132]  [ 200/1251]  eta: 0:04:30  lr: 0.002615  min_lr: 0.002615  loss: 3.2079 (3.0842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6186 (0.6202)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [132]  [ 400/1251]  eta: 0:03:30  lr: 0.002612  min_lr: 0.002612  loss: 3.5767 (3.1148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6562 (0.6529)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [132]  [ 600/1251]  eta: 0:02:39  lr: 0.002608  min_lr: 0.002608  loss: 3.5781 (3.1017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6950 (0.6525)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [132]  [ 800/1251]  eta: 0:01:49  lr: 0.002605  min_lr: 0.002605  loss: 3.2190 (3.1067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7079 (0.6869)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [132]  [1000/1251]  eta: 0:01:00  lr: 0.002601  min_lr: 0.002601  loss: 3.3170 (3.1081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5816 (0.6782)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [132]  [1200/1251]  eta: 0:00:12  lr: 0.002598  min_lr: 0.002598  loss: 2.5257 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.6827)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.0157 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7043 (0.6821)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [132] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.0157 (3.0878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7043 (0.6821)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7157 (0.7157)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.7568  data: 5.6142  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8790 (0.8884)  acc1: 80.0000 (80.6546)  acc5: 96.8000 (96.7636)  time: 0.7099  data: 0.5984  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0244 (1.0681)  acc1: 75.2000 (76.7619)  acc5: 93.6000 (94.0381)  time: 0.1843  data: 0.0762  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1121 (1.0792)  acc1: 73.6000 (76.4160)  acc5: 92.4000 (93.8400)  time: 0.1841  data: 0.0761  max mem: 18117
Test: Total time: 0:00:09 (0.3993 s / it)
* Acc@1 77.016 Acc@5 93.930 loss 1.068
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.26%
Epoch: [133]  [   0/1251]  eta: 1:09:25  lr: 0.002597  min_lr: 0.002597  loss: 4.1711 (4.1711)  weight_decay: 0.0500 (0.0500)  time: 3.3297  data: 2.8787  max mem: 18117
Epoch: [133]  [ 200/1251]  eta: 0:04:26  lr: 0.002594  min_lr: 0.002594  loss: 2.8737 (3.1047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5804 (0.5948)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [133]  [ 400/1251]  eta: 0:03:30  lr: 0.002590  min_lr: 0.002590  loss: 2.4179 (3.0834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5530 (0.6554)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [133]  [ 600/1251]  eta: 0:02:39  lr: 0.002587  min_lr: 0.002587  loss: 2.4575 (3.0443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (0.6531)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [133]  [ 800/1251]  eta: 0:01:49  lr: 0.002583  min_lr: 0.002583  loss: 2.7397 (3.0496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5914 (0.6535)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [133]  [1000/1251]  eta: 0:01:00  lr: 0.002580  min_lr: 0.002580  loss: 2.7108 (3.0720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5691 (0.6570)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [133]  [1200/1251]  eta: 0:00:12  lr: 0.002576  min_lr: 0.002576  loss: 3.0311 (3.0668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5796 (0.6544)  time: 0.2457  data: 0.0004  max mem: 18117
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 3.4616 (3.0698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.6548)  time: 0.1953  data: 0.0006  max mem: 18117
Epoch: [133] Total time: 0:05:03 (0.2422 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 3.4616 (3.0793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.6548)
Test:  [ 0/25]  eta: 0:01:39  loss: 0.7102 (0.7102)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 3.9791  data: 3.8197  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.8776 (0.8837)  acc1: 80.4000 (81.2364)  acc5: 96.4000 (96.2909)  time: 0.6229  data: 0.5080  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1190 (1.0765)  acc1: 74.8000 (77.1048)  acc5: 94.4000 (93.8667)  time: 0.2588  data: 0.1497  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1191 (1.0836)  acc1: 75.2000 (76.8640)  acc5: 91.6000 (93.6480)  time: 0.2247  data: 0.1168  max mem: 18117
Test: Total time: 0:00:10 (0.4154 s / it)
* Acc@1 76.948 Acc@5 93.808 loss 1.078
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.26%
Epoch: [134]  [   0/1251]  eta: 1:10:39  lr: 0.002576  min_lr: 0.002576  loss: 2.4532 (2.4532)  weight_decay: 0.0500 (0.0500)  time: 3.3889  data: 2.4124  max mem: 18117
Epoch: [134]  [ 200/1251]  eta: 0:04:28  lr: 0.002572  min_lr: 0.002572  loss: 2.8115 (2.9384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6149 (0.6891)  time: 0.2379  data: 0.0003  max mem: 18117
Epoch: [134]  [ 400/1251]  eta: 0:03:29  lr: 0.002569  min_lr: 0.002569  loss: 2.7201 (3.0403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6352 (0.6904)  time: 0.2391  data: 0.0003  max mem: 18117
Epoch: [134]  [ 600/1251]  eta: 0:02:38  lr: 0.002565  min_lr: 0.002565  loss: 2.6907 (3.0625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5882 (0.6734)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [134]  [ 800/1251]  eta: 0:01:49  lr: 0.002562  min_lr: 0.002562  loss: 3.0633 (3.0786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6211 (0.6816)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [134]  [1000/1251]  eta: 0:01:00  lr: 0.002558  min_lr: 0.002558  loss: 2.8795 (3.0811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5473 (0.6679)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [134]  [1200/1251]  eta: 0:00:12  lr: 0.002555  min_lr: 0.002555  loss: 2.9190 (3.0838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6075 (0.6730)  time: 0.2390  data: 0.0006  max mem: 18117
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 2.4588 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6292 (0.6708)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [134] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 2.4588 (3.0762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6292 (0.6708)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6101 (0.6101)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.3939  data: 5.2660  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7934 (0.8068)  acc1: 84.4000 (81.3818)  acc5: 96.4000 (96.2545)  time: 0.7602  data: 0.6464  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0193 (0.9975)  acc1: 74.4000 (77.1619)  acc5: 92.8000 (93.5429)  time: 0.2224  data: 0.1117  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1144 (1.0086)  acc1: 72.8000 (76.7520)  acc5: 92.4000 (93.5840)  time: 0.2210  data: 0.1117  max mem: 18117
Test: Total time: 0:00:10 (0.4141 s / it)
* Acc@1 77.134 Acc@5 93.872 loss 1.001
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.26%
Epoch: [135]  [   0/1251]  eta: 1:08:18  lr: 0.002554  min_lr: 0.002554  loss: 2.0797 (2.0797)  weight_decay: 0.0500 (0.0500)  time: 3.2758  data: 2.2451  max mem: 18117
Epoch: [135]  [ 200/1251]  eta: 0:04:27  lr: 0.002551  min_lr: 0.002551  loss: 2.9316 (3.0707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6546 (0.6649)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [135]  [ 400/1251]  eta: 0:03:30  lr: 0.002547  min_lr: 0.002547  loss: 3.4644 (3.0544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7303 (0.7056)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [135]  [ 600/1251]  eta: 0:02:39  lr: 0.002544  min_lr: 0.002544  loss: 2.4708 (3.0521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7357 (0.7088)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [135]  [ 800/1251]  eta: 0:01:49  lr: 0.002540  min_lr: 0.002540  loss: 3.2330 (3.0572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6253 (0.6976)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [135]  [1000/1251]  eta: 0:01:00  lr: 0.002537  min_lr: 0.002537  loss: 3.0371 (3.0869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5645 (0.6795)  time: 0.2389  data: 0.0005  max mem: 18117
Epoch: [135]  [1200/1251]  eta: 0:00:12  lr: 0.002533  min_lr: 0.002533  loss: 3.2566 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6779 (0.6751)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 2.8940 (3.0960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6120 (0.6753)  time: 0.1963  data: 0.0007  max mem: 18117
Epoch: [135] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 2.8940 (3.0877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6120 (0.6753)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6645 (0.6645)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.4397  data: 5.2746  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8467 (0.8282)  acc1: 83.6000 (82.1091)  acc5: 97.2000 (96.5818)  time: 0.7344  data: 0.6189  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0188 (1.0182)  acc1: 76.0000 (77.5810)  acc5: 93.2000 (93.9810)  time: 0.2120  data: 0.1022  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0783 (1.0261)  acc1: 74.4000 (77.2320)  acc5: 92.8000 (93.8560)  time: 0.2118  data: 0.1021  max mem: 18117
Test: Total time: 0:00:10 (0.4080 s / it)
* Acc@1 77.108 Acc@5 93.988 loss 1.026
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.26%
Epoch: [136]  [   0/1251]  eta: 1:10:01  lr: 0.002532  min_lr: 0.002532  loss: 2.2239 (2.2239)  weight_decay: 0.0500 (0.0500)  time: 3.3589  data: 2.5957  max mem: 18117
Epoch: [136]  [ 200/1251]  eta: 0:04:28  lr: 0.002529  min_lr: 0.002529  loss: 2.8869 (3.1046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8150 (0.7255)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [136]  [ 400/1251]  eta: 0:03:29  lr: 0.002526  min_lr: 0.002526  loss: 2.7509 (3.0829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6051 (0.7003)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [136]  [ 600/1251]  eta: 0:02:38  lr: 0.002522  min_lr: 0.002522  loss: 2.8484 (3.0657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6070 (0.6818)  time: 0.2420  data: 0.0012  max mem: 18117
Epoch: [136]  [ 800/1251]  eta: 0:01:49  lr: 0.002519  min_lr: 0.002519  loss: 2.5801 (3.0659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.6897)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [136]  [1000/1251]  eta: 0:01:00  lr: 0.002515  min_lr: 0.002515  loss: 3.1096 (3.0744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6162 (0.6840)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [136]  [1200/1251]  eta: 0:00:12  lr: 0.002512  min_lr: 0.002512  loss: 2.9516 (3.0697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5666 (0.6767)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 2.8923 (3.0715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6067 (0.6757)  time: 0.1956  data: 0.0008  max mem: 18117
Epoch: [136] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 2.8923 (3.0854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6067 (0.6757)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6842 (0.6842)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.8082  data: 5.6812  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8327 (0.8856)  acc1: 83.2000 (81.7091)  acc5: 96.8000 (96.7273)  time: 0.7668  data: 0.6531  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0616 (1.0778)  acc1: 74.4000 (76.8191)  acc5: 94.4000 (94.1524)  time: 0.1998  data: 0.0896  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1503 (1.0875)  acc1: 74.4000 (76.5280)  acc5: 92.8000 (94.0000)  time: 0.1977  data: 0.0895  max mem: 18117
Test: Total time: 0:00:10 (0.4132 s / it)
* Acc@1 76.858 Acc@5 93.746 loss 1.080
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.26%
Epoch: [137]  [   0/1251]  eta: 1:03:42  lr: 0.002511  min_lr: 0.002511  loss: 2.6294 (2.6294)  weight_decay: 0.0500 (0.0500)  time: 3.0559  data: 2.6609  max mem: 18117
Epoch: [137]  [ 200/1251]  eta: 0:04:30  lr: 0.002507  min_lr: 0.002507  loss: 3.4744 (3.2028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6570 (0.6771)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [137]  [ 400/1251]  eta: 0:03:31  lr: 0.002504  min_lr: 0.002504  loss: 2.5784 (3.1273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5873 (0.6837)  time: 0.2458  data: 0.0004  max mem: 18117
Epoch: [137]  [ 600/1251]  eta: 0:02:39  lr: 0.002500  min_lr: 0.002500  loss: 2.7625 (3.1106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5708 (0.6581)  time: 0.2358  data: 0.0005  max mem: 18117
Epoch: [137]  [ 800/1251]  eta: 0:01:49  lr: 0.002497  min_lr: 0.002497  loss: 2.8267 (3.0851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6712 (0.6601)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [137]  [1000/1251]  eta: 0:01:00  lr: 0.002493  min_lr: 0.002493  loss: 3.4940 (3.0957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6739 (0.6802)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [137]  [1200/1251]  eta: 0:00:12  lr: 0.002490  min_lr: 0.002490  loss: 2.4815 (3.0871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7027 (0.6876)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 2.6026 (3.0848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6900 (0.6875)  time: 0.1966  data: 0.0006  max mem: 18117
Epoch: [137] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 2.6026 (3.0623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6900 (0.6875)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6523 (0.6523)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.4570  data: 5.3088  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7957 (0.8372)  acc1: 82.0000 (81.2000)  acc5: 96.4000 (96.0364)  time: 0.7543  data: 0.6404  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0037 (1.0226)  acc1: 74.4000 (77.1048)  acc5: 93.2000 (93.7905)  time: 0.2167  data: 0.1074  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1263 (1.0324)  acc1: 73.2000 (76.7200)  acc5: 92.8000 (93.7600)  time: 0.2162  data: 0.1073  max mem: 18117
Test: Total time: 0:00:10 (0.4126 s / it)
* Acc@1 77.046 Acc@5 93.976 loss 1.025
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.26%
Epoch: [138]  [   0/1251]  eta: 1:05:25  lr: 0.002489  min_lr: 0.002489  loss: 3.5587 (3.5587)  weight_decay: 0.0500 (0.0500)  time: 3.1382  data: 2.2848  max mem: 18117
Epoch: [138]  [ 200/1251]  eta: 0:04:28  lr: 0.002486  min_lr: 0.002486  loss: 3.2076 (2.9768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (0.6564)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [138]  [ 400/1251]  eta: 0:03:29  lr: 0.002482  min_lr: 0.002482  loss: 2.5895 (2.9784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6187 (0.6413)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [138]  [ 600/1251]  eta: 0:02:39  lr: 0.002479  min_lr: 0.002479  loss: 3.2011 (3.0156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6695 (0.6817)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [138]  [ 800/1251]  eta: 0:01:49  lr: 0.002475  min_lr: 0.002475  loss: 2.6296 (3.0189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6114 (0.6798)  time: 0.2426  data: 0.0004  max mem: 18117
Epoch: [138]  [1000/1251]  eta: 0:01:00  lr: 0.002472  min_lr: 0.002472  loss: 2.3483 (3.0331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6151 (nan)  time: 0.2381  data: 0.0003  max mem: 18117
Epoch: [138]  [1200/1251]  eta: 0:00:12  lr: 0.002468  min_lr: 0.002468  loss: 3.2390 (3.0492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5323 (nan)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 2.4794 (3.0506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (nan)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [138] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 2.4794 (3.0685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6732 (0.6732)  acc1: 87.2000 (87.2000)  acc5: 96.8000 (96.8000)  time: 5.7165  data: 5.5908  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8346 (0.8557)  acc1: 80.8000 (81.1273)  acc5: 96.8000 (96.5818)  time: 0.6992  data: 0.5870  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0871 (1.0559)  acc1: 75.6000 (76.8762)  acc5: 93.6000 (93.9429)  time: 0.1822  data: 0.0727  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1598 (1.0658)  acc1: 74.0000 (76.4800)  acc5: 92.4000 (93.8560)  time: 0.1808  data: 0.0726  max mem: 18117
Test: Total time: 0:00:09 (0.3955 s / it)
* Acc@1 77.290 Acc@5 93.912 loss 1.053
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [139]  [   0/1251]  eta: 1:00:46  lr: 0.002467  min_lr: 0.002467  loss: 3.1453 (3.1453)  weight_decay: 0.0500 (0.0500)  time: 2.9145  data: 2.5746  max mem: 18117
Epoch: [139]  [ 200/1251]  eta: 0:04:27  lr: 0.002464  min_lr: 0.002464  loss: 2.8139 (3.0885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6209 (0.6220)  time: 0.2380  data: 0.0005  max mem: 18117
Epoch: [139]  [ 400/1251]  eta: 0:03:29  lr: 0.002460  min_lr: 0.002460  loss: 3.3893 (3.0694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5892 (0.6606)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [139]  [ 600/1251]  eta: 0:02:38  lr: 0.002457  min_lr: 0.002457  loss: 2.4402 (3.0907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6936 (0.6815)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [139]  [ 800/1251]  eta: 0:01:49  lr: 0.002453  min_lr: 0.002453  loss: 3.6181 (3.0665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5928 (0.6693)  time: 0.2394  data: 0.0005  max mem: 18117
Epoch: [139]  [1000/1251]  eta: 0:01:00  lr: 0.002450  min_lr: 0.002450  loss: 3.1494 (3.0838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7005 (0.6831)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [139]  [1200/1251]  eta: 0:00:12  lr: 0.002446  min_lr: 0.002446  loss: 3.0579 (3.1099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6197 (0.6809)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 2.4984 (3.1035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6112 (0.6801)  time: 0.1962  data: 0.0007  max mem: 18117
Epoch: [139] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 2.4984 (3.0638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6112 (0.6801)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.6686 (0.6686)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 4.6167  data: 4.4842  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7998 (0.8262)  acc1: 80.8000 (81.4909)  acc5: 96.8000 (96.5091)  time: 0.7031  data: 0.5886  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0398 (1.0030)  acc1: 76.0000 (77.8667)  acc5: 93.2000 (93.9048)  time: 0.2462  data: 0.1353  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0616 (1.0096)  acc1: 76.4000 (77.6800)  acc5: 92.8000 (93.8880)  time: 0.2182  data: 0.1095  max mem: 18117
Test: Total time: 0:00:10 (0.4141 s / it)
* Acc@1 77.334 Acc@5 93.966 loss 1.003
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.33%
Epoch: [140]  [   0/1251]  eta: 0:54:15  lr: 0.002445  min_lr: 0.002445  loss: 3.8156 (3.8156)  weight_decay: 0.0500 (0.0500)  time: 2.6022  data: 2.2623  max mem: 18117
Epoch: [140]  [ 200/1251]  eta: 0:04:26  lr: 0.002442  min_lr: 0.002442  loss: 3.0082 (3.0774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6715 (0.6342)  time: 0.2359  data: 0.0005  max mem: 18117
Epoch: [140]  [ 400/1251]  eta: 0:03:28  lr: 0.002438  min_lr: 0.002438  loss: 2.9132 (3.1227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6104 (0.6541)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [140]  [ 600/1251]  eta: 0:02:38  lr: 0.002435  min_lr: 0.002435  loss: 2.4816 (3.1173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6898 (0.6585)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [140]  [ 800/1251]  eta: 0:01:49  lr: 0.002431  min_lr: 0.002431  loss: 3.3776 (3.1257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6422 (0.6567)  time: 0.2450  data: 0.0004  max mem: 18117
Epoch: [140]  [1000/1251]  eta: 0:01:00  lr: 0.002428  min_lr: 0.002428  loss: 2.9575 (3.1151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.6800)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [140]  [1200/1251]  eta: 0:00:12  lr: 0.002424  min_lr: 0.002424  loss: 2.3016 (3.1052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5782 (0.6853)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 2.8113 (3.1042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5734 (0.6828)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [140] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 2.8113 (3.0681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5734 (0.6828)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6843 (0.6843)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.9228  data: 5.7961  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8788 (0.8369)  acc1: 82.0000 (81.6727)  acc5: 96.4000 (96.4364)  time: 0.7149  data: 0.6020  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0678 (1.0184)  acc1: 76.4000 (77.7333)  acc5: 94.4000 (93.9619)  time: 0.1806  data: 0.0708  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1385 (1.0340)  acc1: 74.4000 (77.3280)  acc5: 93.2000 (93.8880)  time: 0.1790  data: 0.0707  max mem: 18117
Test: Total time: 0:00:10 (0.4020 s / it)
* Acc@1 77.224 Acc@5 94.064 loss 1.031
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.33%
Epoch: [141]  [   0/1251]  eta: 1:07:07  lr: 0.002424  min_lr: 0.002424  loss: 2.1400 (2.1400)  weight_decay: 0.0500 (0.0500)  time: 3.2193  data: 2.8323  max mem: 18117
Epoch: [141]  [ 200/1251]  eta: 0:04:26  lr: 0.002420  min_lr: 0.002420  loss: 2.6434 (3.0539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.7129)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [141]  [ 400/1251]  eta: 0:03:28  lr: 0.002417  min_lr: 0.002417  loss: 2.6421 (3.0448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5962 (0.6749)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [141]  [ 600/1251]  eta: 0:02:38  lr: 0.002413  min_lr: 0.002413  loss: 2.8151 (3.0421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5879 (0.6753)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [141]  [ 800/1251]  eta: 0:01:49  lr: 0.002409  min_lr: 0.002409  loss: 3.0777 (3.0642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6281 (0.6798)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [141]  [1000/1251]  eta: 0:01:00  lr: 0.002406  min_lr: 0.002406  loss: 2.8196 (3.0615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.6812)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [141]  [1200/1251]  eta: 0:00:12  lr: 0.002402  min_lr: 0.002402  loss: 2.4267 (3.0642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6783 (0.6823)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 2.4280 (3.0670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.6862)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [141] Total time: 0:05:00 (0.2401 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 2.4280 (3.0652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.6862)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6664 (0.6664)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.5718  data: 5.4419  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7747 (0.7968)  acc1: 83.6000 (81.4909)  acc5: 96.4000 (96.1091)  time: 0.7673  data: 0.6546  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0114 (0.9861)  acc1: 75.6000 (77.3143)  acc5: 93.2000 (93.6381)  time: 0.2087  data: 0.0982  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1264 (1.0000)  acc1: 73.6000 (76.9760)  acc5: 92.4000 (93.6000)  time: 0.2105  data: 0.1013  max mem: 18117
Test: Total time: 0:00:10 (0.4137 s / it)
* Acc@1 77.424 Acc@5 93.968 loss 0.988
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.42%
Epoch: [142]  [   0/1251]  eta: 1:07:46  lr: 0.002402  min_lr: 0.002402  loss: 2.2651 (2.2651)  weight_decay: 0.0500 (0.0500)  time: 3.2509  data: 2.9977  max mem: 18117
Epoch: [142]  [ 200/1251]  eta: 0:04:26  lr: 0.002398  min_lr: 0.002398  loss: 2.2652 (3.0750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5971 (nan)  time: 0.2454  data: 0.0004  max mem: 18117
Epoch: [142]  [ 400/1251]  eta: 0:03:30  lr: 0.002395  min_lr: 0.002395  loss: 2.4405 (3.0338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6213 (nan)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [142]  [ 600/1251]  eta: 0:02:39  lr: 0.002391  min_lr: 0.002391  loss: 2.6926 (3.0509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (nan)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [142]  [ 800/1251]  eta: 0:01:49  lr: 0.002387  min_lr: 0.002387  loss: 2.7994 (3.0607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (nan)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [142]  [1000/1251]  eta: 0:01:00  lr: 0.002384  min_lr: 0.002384  loss: 2.6286 (3.0607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7699 (nan)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [142]  [1200/1251]  eta: 0:00:12  lr: 0.002380  min_lr: 0.002380  loss: 3.3414 (3.0711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5641 (nan)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 3.0841 (3.0757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6324 (nan)  time: 0.1966  data: 0.0007  max mem: 18117
Epoch: [142] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 3.0841 (3.0674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6324 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7363 (0.7363)  acc1: 85.2000 (85.2000)  acc5: 98.4000 (98.4000)  time: 5.6702  data: 5.5456  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8616 (0.8710)  acc1: 80.8000 (81.2000)  acc5: 95.6000 (96.2546)  time: 0.7795  data: 0.6675  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0039 (1.0501)  acc1: 75.2000 (77.4095)  acc5: 94.0000 (93.9810)  time: 0.2244  data: 0.1137  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1719 (1.0629)  acc1: 73.2000 (77.0080)  acc5: 92.0000 (93.8080)  time: 0.2235  data: 0.1136  max mem: 18117
Test: Total time: 0:00:10 (0.4275 s / it)
* Acc@1 77.358 Acc@5 93.948 loss 1.051
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.42%
Epoch: [143]  [   0/1251]  eta: 1:06:46  lr: 0.002380  min_lr: 0.002380  loss: 4.0459 (4.0459)  weight_decay: 0.0500 (0.0500)  time: 3.2026  data: 2.5066  max mem: 18117
Epoch: [143]  [ 200/1251]  eta: 0:04:28  lr: 0.002376  min_lr: 0.002376  loss: 2.3950 (3.0317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.6645)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [143]  [ 400/1251]  eta: 0:03:30  lr: 0.002373  min_lr: 0.002373  loss: 3.1768 (3.0582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6026 (0.6766)  time: 0.2382  data: 0.0005  max mem: 18117
Epoch: [143]  [ 600/1251]  eta: 0:02:38  lr: 0.002369  min_lr: 0.002369  loss: 2.3386 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6626 (0.6743)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [143]  [ 800/1251]  eta: 0:01:49  lr: 0.002365  min_lr: 0.002365  loss: 2.9156 (3.0530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6553 (0.6854)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [143]  [1000/1251]  eta: 0:01:00  lr: 0.002362  min_lr: 0.002362  loss: 2.4345 (3.0551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6247 (0.6831)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [143]  [1200/1251]  eta: 0:00:12  lr: 0.002358  min_lr: 0.002358  loss: 2.4406 (3.0448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6787 (0.6912)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 2.8535 (3.0468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6787 (0.6931)  time: 0.1955  data: 0.0008  max mem: 18117
Epoch: [143] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 2.8535 (3.0460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6787 (0.6931)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6705 (0.6705)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.7081  data: 5.5812  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8637 (0.8542)  acc1: 81.6000 (81.3091)  acc5: 96.8000 (96.1818)  time: 0.7506  data: 0.6374  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0672 (1.0385)  acc1: 76.8000 (77.7143)  acc5: 93.2000 (93.7333)  time: 0.2094  data: 0.0990  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1082 (1.0557)  acc1: 75.6000 (77.0240)  acc5: 92.0000 (93.6480)  time: 0.2080  data: 0.0989  max mem: 18117
Test: Total time: 0:00:10 (0.4164 s / it)
* Acc@1 77.168 Acc@5 93.818 loss 1.051
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.42%
Epoch: [144]  [   0/1251]  eta: 1:06:43  lr: 0.002358  min_lr: 0.002358  loss: 2.2517 (2.2517)  weight_decay: 0.0500 (0.0500)  time: 3.2004  data: 2.3868  max mem: 18117
Epoch: [144]  [ 200/1251]  eta: 0:04:28  lr: 0.002354  min_lr: 0.002354  loss: 3.3988 (3.0620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7917 (0.7344)  time: 0.2387  data: 0.0005  max mem: 18117
Epoch: [144]  [ 400/1251]  eta: 0:03:30  lr: 0.002350  min_lr: 0.002350  loss: 2.8315 (3.0672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.7350)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [144]  [ 600/1251]  eta: 0:02:39  lr: 0.002347  min_lr: 0.002347  loss: 2.3867 (3.0498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6412 (0.7195)  time: 0.2441  data: 0.0004  max mem: 18117
Epoch: [144]  [ 800/1251]  eta: 0:01:49  lr: 0.002343  min_lr: 0.002343  loss: 2.5297 (3.0421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6273 (0.7187)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [144]  [1000/1251]  eta: 0:01:00  lr: 0.002340  min_lr: 0.002340  loss: 2.5287 (3.0515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6299 (0.7073)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [144]  [1200/1251]  eta: 0:00:12  lr: 0.002336  min_lr: 0.002336  loss: 3.5045 (3.0514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (0.7033)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.0539 (3.0599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.7011)  time: 0.1951  data: 0.0008  max mem: 18117
Epoch: [144] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.0539 (3.0623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.7011)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6734 (0.6734)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 5.6383  data: 5.5139  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8577 (0.8310)  acc1: 82.0000 (81.9636)  acc5: 96.4000 (96.4727)  time: 0.7619  data: 0.6492  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9945 (1.0317)  acc1: 74.8000 (77.6381)  acc5: 94.4000 (94.1905)  time: 0.2056  data: 0.0959  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1477 (1.0426)  acc1: 74.4000 (77.2800)  acc5: 93.2000 (94.1280)  time: 0.2038  data: 0.0958  max mem: 18117
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 77.512 Acc@5 94.098 loss 1.031
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.51%
Epoch: [145]  [   0/1251]  eta: 0:56:05  lr: 0.002335  min_lr: 0.002335  loss: 2.8347 (2.8347)  weight_decay: 0.0500 (0.0500)  time: 2.6903  data: 2.3244  max mem: 18117
Epoch: [145]  [ 200/1251]  eta: 0:04:24  lr: 0.002332  min_lr: 0.002332  loss: 2.9637 (3.1052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6417 (0.6852)  time: 0.2380  data: 0.0005  max mem: 18117
Epoch: [145]  [ 400/1251]  eta: 0:03:27  lr: 0.002328  min_lr: 0.002328  loss: 2.4678 (3.1053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6968 (0.7561)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [145]  [ 600/1251]  eta: 0:02:37  lr: 0.002325  min_lr: 0.002325  loss: 3.5609 (3.0867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6065 (0.7148)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [145]  [ 800/1251]  eta: 0:01:48  lr: 0.002321  min_lr: 0.002321  loss: 2.5214 (3.0585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6232 (0.6918)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [145]  [1000/1251]  eta: 0:01:00  lr: 0.002318  min_lr: 0.002318  loss: 2.7264 (3.0567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5753 (0.6876)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [145]  [1200/1251]  eta: 0:00:12  lr: 0.002314  min_lr: 0.002314  loss: 2.6269 (3.0399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6315 (0.6824)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 2.6356 (3.0396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (0.6809)  time: 0.1960  data: 0.0005  max mem: 18117
Epoch: [145] Total time: 0:04:59 (0.2393 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 2.6356 (3.0474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (0.6809)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7207 (0.7207)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.5493  data: 5.4203  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8602 (0.8600)  acc1: 82.8000 (82.1455)  acc5: 96.8000 (96.4727)  time: 0.7467  data: 0.6330  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0378 (1.0469)  acc1: 76.4000 (77.8667)  acc5: 94.0000 (94.1905)  time: 0.2042  data: 0.0931  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1340 (1.0583)  acc1: 74.8000 (77.3600)  acc5: 94.0000 (94.0960)  time: 0.2026  data: 0.0930  max mem: 18117
Test: Total time: 0:00:10 (0.4085 s / it)
* Acc@1 77.422 Acc@5 94.102 loss 1.047
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.51%
Epoch: [146]  [   0/1251]  eta: 1:10:27  lr: 0.002313  min_lr: 0.002313  loss: 3.8855 (3.8855)  weight_decay: 0.0500 (0.0500)  time: 3.3792  data: 2.3944  max mem: 18117
Epoch: [146]  [ 200/1251]  eta: 0:04:30  lr: 0.002310  min_lr: 0.002310  loss: 3.4574 (3.0204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7484 (0.7231)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [146]  [ 400/1251]  eta: 0:03:30  lr: 0.002306  min_lr: 0.002306  loss: 2.7363 (3.0312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6792 (0.6982)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [146]  [ 600/1251]  eta: 0:02:39  lr: 0.002303  min_lr: 0.002303  loss: 2.8492 (3.0617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.7160)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [146]  [ 800/1251]  eta: 0:01:49  lr: 0.002299  min_lr: 0.002299  loss: 2.3481 (3.0360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6416 (0.7070)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [146]  [1000/1251]  eta: 0:01:00  lr: 0.002296  min_lr: 0.002296  loss: 3.0774 (3.0514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5811 (0.6920)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [146]  [1200/1251]  eta: 0:00:12  lr: 0.002292  min_lr: 0.002292  loss: 2.3486 (3.0509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.6891)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 2.5144 (3.0460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.6854)  time: 0.1949  data: 0.0007  max mem: 18117
Epoch: [146] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 2.5144 (3.0586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.6854)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6657 (0.6657)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.7182  data: 5.5876  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8708 (0.8212)  acc1: 84.0000 (82.2545)  acc5: 96.4000 (96.7273)  time: 0.7151  data: 0.6025  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0352 (1.0199)  acc1: 75.6000 (77.6381)  acc5: 93.2000 (94.2667)  time: 0.1905  data: 0.0811  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1173 (1.0291)  acc1: 75.2000 (77.4720)  acc5: 92.8000 (94.1120)  time: 0.2154  data: 0.1071  max mem: 18117
Test: Total time: 0:00:10 (0.4229 s / it)
* Acc@1 77.542 Acc@5 94.106 loss 1.019
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.54%
Epoch: [147]  [   0/1251]  eta: 1:05:30  lr: 0.002291  min_lr: 0.002291  loss: 3.4405 (3.4405)  weight_decay: 0.0500 (0.0500)  time: 3.1417  data: 2.8044  max mem: 18117
Epoch: [147]  [ 200/1251]  eta: 0:04:28  lr: 0.002288  min_lr: 0.002288  loss: 3.2399 (2.9333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7095 (0.7351)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [147]  [ 400/1251]  eta: 0:03:30  lr: 0.002284  min_lr: 0.002284  loss: 3.4722 (3.0342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5615 (0.7083)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [147]  [ 600/1251]  eta: 0:02:39  lr: 0.002280  min_lr: 0.002280  loss: 2.5353 (3.0471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6613 (0.7020)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [147]  [ 800/1251]  eta: 0:01:50  lr: 0.002277  min_lr: 0.002277  loss: 2.7474 (3.0589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.6971)  time: 0.2436  data: 0.0005  max mem: 18117
Epoch: [147]  [1000/1251]  eta: 0:01:00  lr: 0.002273  min_lr: 0.002273  loss: 3.0704 (3.0406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6058 (0.6902)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [147]  [1200/1251]  eta: 0:00:12  lr: 0.002270  min_lr: 0.002270  loss: 2.6465 (3.0446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6258 (0.6975)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 3.1205 (3.0461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.6980)  time: 0.1968  data: 0.0005  max mem: 18117
Epoch: [147] Total time: 0:05:03 (0.2424 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 3.1205 (3.0443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.6980)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7092 (0.7092)  acc1: 82.4000 (82.4000)  acc5: 97.2000 (97.2000)  time: 5.6710  data: 5.5458  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8642 (0.8594)  acc1: 82.4000 (80.5455)  acc5: 96.4000 (96.1818)  time: 0.7566  data: 0.6451  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0813 (1.0413)  acc1: 74.4000 (76.9143)  acc5: 93.6000 (93.9619)  time: 0.2101  data: 0.1010  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1181 (1.0497)  acc1: 74.4000 (76.6720)  acc5: 92.8000 (93.9680)  time: 0.2091  data: 0.1009  max mem: 18117
Test: Total time: 0:00:10 (0.4160 s / it)
* Acc@1 77.192 Acc@5 94.078 loss 1.040
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.54%
Epoch: [148]  [   0/1251]  eta: 1:10:39  lr: 0.002269  min_lr: 0.002269  loss: 1.9663 (1.9663)  weight_decay: 0.0500 (0.0500)  time: 3.3889  data: 2.5211  max mem: 18117
Epoch: [148]  [ 200/1251]  eta: 0:04:27  lr: 0.002265  min_lr: 0.002265  loss: 3.0568 (3.0011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6058 (0.6933)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [148]  [ 400/1251]  eta: 0:03:30  lr: 0.002262  min_lr: 0.002262  loss: 3.1194 (2.9754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.7215)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [148]  [ 600/1251]  eta: 0:02:39  lr: 0.002258  min_lr: 0.002258  loss: 2.9781 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (0.7379)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [148]  [ 800/1251]  eta: 0:01:49  lr: 0.002255  min_lr: 0.002255  loss: 2.8372 (3.0214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7820 (0.7345)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [148]  [1000/1251]  eta: 0:01:00  lr: 0.002251  min_lr: 0.002251  loss: 2.8875 (3.0082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6166 (0.7343)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [148]  [1200/1251]  eta: 0:00:12  lr: 0.002248  min_lr: 0.002248  loss: 2.6335 (3.0147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6355 (0.7324)  time: 0.2495  data: 0.0004  max mem: 18117
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 2.4035 (3.0058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5834 (0.7261)  time: 0.1964  data: 0.0007  max mem: 18117
Epoch: [148] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 2.4035 (3.0429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5834 (0.7261)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6368 (0.6368)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.5694  data: 5.4435  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8342 (0.8207)  acc1: 80.4000 (81.4182)  acc5: 96.4000 (96.4727)  time: 0.7764  data: 0.6626  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0099 (1.0013)  acc1: 76.4000 (77.9810)  acc5: 93.6000 (94.0762)  time: 0.2180  data: 0.1069  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9951 (1.0081)  acc1: 76.4000 (77.6320)  acc5: 93.6000 (93.9840)  time: 0.2160  data: 0.1068  max mem: 18117
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 77.640 Acc@5 94.196 loss 1.000
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.64%
Epoch: [149]  [   0/1251]  eta: 1:05:54  lr: 0.002247  min_lr: 0.002247  loss: 3.8519 (3.8519)  weight_decay: 0.0500 (0.0500)  time: 3.1609  data: 2.8300  max mem: 18117
Epoch: [149]  [ 200/1251]  eta: 0:04:27  lr: 0.002243  min_lr: 0.002243  loss: 3.3895 (3.1316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6530 (0.7625)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [149]  [ 400/1251]  eta: 0:03:30  lr: 0.002240  min_lr: 0.002240  loss: 2.6951 (3.0715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6694 (0.7217)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [149]  [ 600/1251]  eta: 0:02:39  lr: 0.002236  min_lr: 0.002236  loss: 2.5106 (3.0330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6039 (0.7067)  time: 0.2393  data: 0.0003  max mem: 18117
Epoch: [149]  [ 800/1251]  eta: 0:01:49  lr: 0.002232  min_lr: 0.002232  loss: 3.1053 (3.0358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (0.7025)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [149]  [1000/1251]  eta: 0:01:00  lr: 0.002229  min_lr: 0.002229  loss: 2.5094 (3.0353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.7054)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [149]  [1200/1251]  eta: 0:00:12  lr: 0.002225  min_lr: 0.002225  loss: 2.4901 (3.0377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5903 (0.6949)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 3.1084 (3.0427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5758 (0.6905)  time: 0.1959  data: 0.0007  max mem: 18117
Epoch: [149] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 3.1084 (3.0396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5758 (0.6905)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7389 (0.7389)  acc1: 85.6000 (85.6000)  acc5: 99.2000 (99.2000)  time: 5.6254  data: 5.4874  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.8862 (0.8829)  acc1: 82.4000 (81.8182)  acc5: 97.2000 (96.8000)  time: 0.6452  data: 0.5315  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0678 (1.0694)  acc1: 77.6000 (78.1333)  acc5: 94.4000 (94.4000)  time: 0.1814  data: 0.0717  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1516 (1.0797)  acc1: 74.8000 (77.6320)  acc5: 94.0000 (94.4000)  time: 0.2003  data: 0.0910  max mem: 18117
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 77.672 Acc@5 94.086 loss 1.080
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.67%
Epoch: [150]  [   0/1251]  eta: 0:56:36  lr: 0.002224  min_lr: 0.002224  loss: 2.2730 (2.2730)  weight_decay: 0.0500 (0.0500)  time: 2.7149  data: 2.3774  max mem: 18117
Epoch: [150]  [ 200/1251]  eta: 0:04:25  lr: 0.002221  min_lr: 0.002221  loss: 2.5592 (2.9759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6590 (0.6502)  time: 0.2420  data: 0.0003  max mem: 18117
Epoch: [150]  [ 400/1251]  eta: 0:03:29  lr: 0.002217  min_lr: 0.002217  loss: 3.2785 (2.9953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5919 (0.6554)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [150]  [ 600/1251]  eta: 0:02:38  lr: 0.002214  min_lr: 0.002214  loss: 2.4841 (2.9908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7841 (0.6945)  time: 0.2446  data: 0.0004  max mem: 18117
Epoch: [150]  [ 800/1251]  eta: 0:01:49  lr: 0.002210  min_lr: 0.002210  loss: 3.4669 (3.0001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6243 (0.6836)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [150]  [1000/1251]  eta: 0:01:00  lr: 0.002207  min_lr: 0.002207  loss: 3.1006 (3.0073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6193 (0.6877)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [150]  [1200/1251]  eta: 0:00:12  lr: 0.002203  min_lr: 0.002203  loss: 2.7320 (3.0122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6207 (0.6910)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 3.0521 (3.0100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6615 (0.6891)  time: 0.1950  data: 0.0009  max mem: 18117
Epoch: [150] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 3.0521 (3.0277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6615 (0.6891)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7711 (0.7711)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.6900  data: 5.5518  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8916 (0.8752)  acc1: 83.6000 (81.8909)  acc5: 96.8000 (96.4364)  time: 0.7490  data: 0.6357  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1139 (1.0551)  acc1: 76.4000 (77.8476)  acc5: 94.4000 (94.2286)  time: 0.2218  data: 0.1124  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1193 (1.0599)  acc1: 75.6000 (77.5680)  acc5: 94.0000 (94.3040)  time: 0.2208  data: 0.1124  max mem: 18117
Test: Total time: 0:00:10 (0.4255 s / it)
* Acc@1 77.638 Acc@5 94.128 loss 1.053
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.67%
Epoch: [151]  [   0/1251]  eta: 1:05:34  lr: 0.002202  min_lr: 0.002202  loss: 2.9128 (2.9128)  weight_decay: 0.0500 (0.0500)  time: 3.1450  data: 2.4554  max mem: 18117
Epoch: [151]  [ 200/1251]  eta: 0:04:26  lr: 0.002198  min_lr: 0.002198  loss: 3.2255 (3.0012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6626 (0.6857)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [151]  [ 400/1251]  eta: 0:03:28  lr: 0.002195  min_lr: 0.002195  loss: 3.0846 (2.9850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.6814)  time: 0.2380  data: 0.0009  max mem: 18117
Epoch: [151]  [ 600/1251]  eta: 0:02:38  lr: 0.002191  min_lr: 0.002191  loss: 3.0489 (3.0262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.6920)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [151]  [ 800/1251]  eta: 0:01:49  lr: 0.002188  min_lr: 0.002188  loss: 2.7943 (3.0320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.6967)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [151]  [1000/1251]  eta: 0:01:00  lr: 0.002184  min_lr: 0.002184  loss: 3.0732 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6478 (0.6929)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [151]  [1200/1251]  eta: 0:00:12  lr: 0.002181  min_lr: 0.002181  loss: 3.0965 (3.0416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8323 (0.7027)  time: 0.2355  data: 0.0004  max mem: 18117
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 3.3551 (3.0407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.7064)  time: 0.1966  data: 0.0007  max mem: 18117
Epoch: [151] Total time: 0:05:00 (0.2404 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 3.3551 (3.0349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.7064)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7130 (0.7130)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.8644  data: 5.7379  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8874 (0.8849)  acc1: 80.8000 (81.0545)  acc5: 96.8000 (96.5455)  time: 0.7702  data: 0.6577  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0616 (1.0584)  acc1: 74.4000 (77.1810)  acc5: 93.2000 (94.2857)  time: 0.2110  data: 0.1003  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1571 (1.0653)  acc1: 74.4000 (76.9120)  acc5: 93.2000 (94.2080)  time: 0.2094  data: 0.1001  max mem: 18117
Test: Total time: 0:00:10 (0.4240 s / it)
* Acc@1 77.098 Acc@5 94.152 loss 1.058
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.67%
Epoch: [152]  [   0/1251]  eta: 1:07:50  lr: 0.002180  min_lr: 0.002180  loss: 2.0021 (2.0021)  weight_decay: 0.0500 (0.0500)  time: 3.2539  data: 2.5170  max mem: 18117
Epoch: [152]  [ 200/1251]  eta: 0:04:30  lr: 0.002176  min_lr: 0.002176  loss: 2.3529 (2.9144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6498 (0.6714)  time: 0.2429  data: 0.0004  max mem: 18117
Epoch: [152]  [ 400/1251]  eta: 0:03:31  lr: 0.002173  min_lr: 0.002173  loss: 3.0595 (2.9537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5885 (0.6801)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [152]  [ 600/1251]  eta: 0:02:39  lr: 0.002169  min_lr: 0.002169  loss: 2.6040 (2.9727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6743 (0.7007)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [152]  [ 800/1251]  eta: 0:01:49  lr: 0.002165  min_lr: 0.002165  loss: 3.4689 (3.0012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5878 (0.6926)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [152]  [1000/1251]  eta: 0:01:00  lr: 0.002162  min_lr: 0.002162  loss: 2.9637 (2.9963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6276 (0.6833)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [152]  [1200/1251]  eta: 0:00:12  lr: 0.002158  min_lr: 0.002158  loss: 2.4294 (3.0139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6049 (0.6841)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 3.1784 (3.0176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6190 (0.6820)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [152] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 3.1784 (3.0198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6190 (0.6820)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7113 (0.7113)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.7895  data: 5.6398  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8471 (0.8752)  acc1: 81.6000 (81.0182)  acc5: 96.4000 (96.2909)  time: 0.7728  data: 0.6581  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.1034 (1.0526)  acc1: 74.0000 (77.2762)  acc5: 93.2000 (93.9810)  time: 0.2073  data: 0.0977  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1293 (1.0572)  acc1: 74.0000 (76.9760)  acc5: 92.8000 (93.9040)  time: 0.2066  data: 0.0976  max mem: 18117
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 77.646 Acc@5 94.178 loss 1.043
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.67%
Epoch: [153]  [   0/1251]  eta: 1:09:02  lr: 0.002157  min_lr: 0.002157  loss: 2.5701 (2.5701)  weight_decay: 0.0500 (0.0500)  time: 3.3114  data: 2.9534  max mem: 18117
Epoch: [153]  [ 200/1251]  eta: 0:04:27  lr: 0.002154  min_lr: 0.002154  loss: 3.5268 (3.0850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6993 (0.7477)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [153]  [ 400/1251]  eta: 0:03:29  lr: 0.002150  min_lr: 0.002150  loss: 3.1819 (3.0614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.7532)  time: 0.2369  data: 0.0003  max mem: 18117
Epoch: [153]  [ 600/1251]  eta: 0:02:38  lr: 0.002147  min_lr: 0.002147  loss: 2.8695 (3.0543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6492 (0.7299)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [153]  [ 800/1251]  eta: 0:01:49  lr: 0.002143  min_lr: 0.002143  loss: 3.3337 (3.0524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6530 (0.7157)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [153]  [1000/1251]  eta: 0:01:00  lr: 0.002139  min_lr: 0.002139  loss: 2.6082 (3.0476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7173 (0.7189)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [153]  [1200/1251]  eta: 0:00:12  lr: 0.002136  min_lr: 0.002136  loss: 2.8630 (3.0391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6123 (0.7128)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 2.6496 (3.0363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6345 (0.7111)  time: 0.1959  data: 0.0009  max mem: 18117
Epoch: [153] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 2.6496 (3.0252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6345 (0.7111)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6554 (0.6554)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.5858  data: 5.4614  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8425 (0.8061)  acc1: 82.0000 (82.1091)  acc5: 96.4000 (96.3273)  time: 0.7414  data: 0.6293  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9791 (0.9875)  acc1: 76.8000 (78.0571)  acc5: 93.6000 (94.4000)  time: 0.2025  data: 0.0929  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1249 (1.0013)  acc1: 76.4000 (77.6800)  acc5: 93.2000 (94.2560)  time: 0.2012  data: 0.0929  max mem: 18117
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 77.918 Acc@5 94.316 loss 0.992
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.92%
Epoch: [154]  [   0/1251]  eta: 0:54:00  lr: 0.002135  min_lr: 0.002135  loss: 1.9269 (1.9269)  weight_decay: 0.0500 (0.0500)  time: 2.5900  data: 1.7308  max mem: 18117
Epoch: [154]  [ 200/1251]  eta: 0:04:28  lr: 0.002131  min_lr: 0.002131  loss: 2.9732 (2.9892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5714 (0.6304)  time: 0.2455  data: 0.0004  max mem: 18117
Epoch: [154]  [ 400/1251]  eta: 0:03:30  lr: 0.002128  min_lr: 0.002128  loss: 2.3486 (3.0167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.7100)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [154]  [ 600/1251]  eta: 0:02:39  lr: 0.002124  min_lr: 0.002124  loss: 2.2782 (3.0209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6125 (0.6889)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [154]  [ 800/1251]  eta: 0:01:49  lr: 0.002121  min_lr: 0.002121  loss: 2.7055 (3.0248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7112 (0.6982)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [154]  [1000/1251]  eta: 0:01:00  lr: 0.002117  min_lr: 0.002117  loss: 3.1258 (3.0357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6802 (0.6989)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [154]  [1200/1251]  eta: 0:00:12  lr: 0.002113  min_lr: 0.002113  loss: 3.6978 (3.0402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6375 (0.6973)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 2.6109 (3.0333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6344 (0.6960)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [154] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 2.6109 (3.0153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6344 (0.6960)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6399 (0.6399)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 5.7095  data: 5.5700  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7860 (0.8037)  acc1: 83.6000 (82.3273)  acc5: 96.8000 (96.8000)  time: 0.7175  data: 0.6042  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0042 (0.9758)  acc1: 76.8000 (78.2476)  acc5: 93.6000 (94.5333)  time: 0.1923  data: 0.0829  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0444 (0.9876)  acc1: 74.0000 (77.7760)  acc5: 93.2000 (94.3520)  time: 0.1966  data: 0.0867  max mem: 18117
Test: Total time: 0:00:10 (0.4061 s / it)
* Acc@1 77.786 Acc@5 94.340 loss 0.980
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.92%
Epoch: [155]  [   0/1251]  eta: 1:08:15  lr: 0.002113  min_lr: 0.002113  loss: 3.6083 (3.6083)  weight_decay: 0.0500 (0.0500)  time: 3.2737  data: 2.3902  max mem: 18117
Epoch: [155]  [ 200/1251]  eta: 0:04:29  lr: 0.002109  min_lr: 0.002109  loss: 2.4249 (2.9832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5632 (0.6292)  time: 0.2365  data: 0.0005  max mem: 18117
Epoch: [155]  [ 400/1251]  eta: 0:03:31  lr: 0.002105  min_lr: 0.002105  loss: 3.3582 (3.0127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.7009)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [155]  [ 600/1251]  eta: 0:02:39  lr: 0.002102  min_lr: 0.002102  loss: 2.2535 (3.0116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6316 (0.6993)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [155]  [ 800/1251]  eta: 0:01:49  lr: 0.002098  min_lr: 0.002098  loss: 3.3301 (3.0196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.7072)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [155]  [1000/1251]  eta: 0:01:00  lr: 0.002095  min_lr: 0.002095  loss: 3.5424 (3.0249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6607 (0.7076)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [155]  [1200/1251]  eta: 0:00:12  lr: 0.002091  min_lr: 0.002091  loss: 2.6167 (3.0266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6321 (0.7085)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 2.8202 (3.0276)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [155] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 2.8202 (3.0230)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7524 (0.7524)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.7522  data: 5.6267  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8792 (0.8639)  acc1: 81.6000 (81.2364)  acc5: 96.4000 (96.5091)  time: 0.7628  data: 0.6507  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0427 (1.0460)  acc1: 75.6000 (77.5429)  acc5: 93.6000 (94.3619)  time: 0.2250  data: 0.1150  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1328 (1.0575)  acc1: 76.0000 (77.2480)  acc5: 92.4000 (94.2080)  time: 0.2236  data: 0.1149  max mem: 18117
Test: Total time: 0:00:10 (0.4311 s / it)
* Acc@1 77.688 Acc@5 94.364 loss 1.042
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.92%
Epoch: [156]  [   0/1251]  eta: 1:05:40  lr: 0.002090  min_lr: 0.002090  loss: 2.2482 (2.2482)  weight_decay: 0.0500 (0.0500)  time: 3.1498  data: 1.7370  max mem: 18117
Epoch: [156]  [ 200/1251]  eta: 0:04:27  lr: 0.002087  min_lr: 0.002087  loss: 3.2644 (3.0888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.6984)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [156]  [ 400/1251]  eta: 0:03:29  lr: 0.002083  min_lr: 0.002083  loss: 2.5001 (3.0703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6585 (0.7081)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [156]  [ 600/1251]  eta: 0:02:39  lr: 0.002079  min_lr: 0.002079  loss: 2.4155 (3.0782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6912 (0.6920)  time: 0.2413  data: 0.0004  max mem: 18117
Epoch: [156]  [ 800/1251]  eta: 0:01:49  lr: 0.002076  min_lr: 0.002076  loss: 3.3701 (3.0524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6525 (0.6970)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [156]  [1000/1251]  eta: 0:01:00  lr: 0.002072  min_lr: 0.002072  loss: 2.9070 (3.0467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6812 (0.6996)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [156]  [1200/1251]  eta: 0:00:12  lr: 0.002069  min_lr: 0.002069  loss: 3.1874 (3.0480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.7059)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 2.8127 (3.0435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.7081)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [156] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 2.8127 (3.0075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.7081)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6743 (0.6743)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.4714  data: 5.3162  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7608 (0.8198)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.3636)  time: 0.7586  data: 0.6427  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9852 (0.9918)  acc1: 76.4000 (78.1143)  acc5: 94.0000 (94.5333)  time: 0.2289  data: 0.1187  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0752 (1.0024)  acc1: 75.6000 (77.8080)  acc5: 93.6000 (94.3520)  time: 0.2281  data: 0.1186  max mem: 18117
Test: Total time: 0:00:10 (0.4228 s / it)
* Acc@1 77.944 Acc@5 94.250 loss 0.988
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.94%
Epoch: [157]  [   0/1251]  eta: 1:02:09  lr: 0.002068  min_lr: 0.002068  loss: 3.1150 (3.1150)  weight_decay: 0.0500 (0.0500)  time: 2.9811  data: 2.7189  max mem: 18117
Epoch: [157]  [ 200/1251]  eta: 0:04:25  lr: 0.002064  min_lr: 0.002064  loss: 2.4064 (3.0298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6130 (0.6681)  time: 0.2440  data: 0.0004  max mem: 18117
Epoch: [157]  [ 400/1251]  eta: 0:03:29  lr: 0.002061  min_lr: 0.002061  loss: 2.6132 (2.9846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6663 (0.6997)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [157]  [ 600/1251]  eta: 0:02:38  lr: 0.002057  min_lr: 0.002057  loss: 2.6379 (2.9333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6353 (0.7059)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [157]  [ 800/1251]  eta: 0:01:49  lr: 0.002053  min_lr: 0.002053  loss: 3.4205 (2.9859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6337 (0.7086)  time: 0.2359  data: 0.0003  max mem: 18117
Epoch: [157]  [1000/1251]  eta: 0:01:00  lr: 0.002050  min_lr: 0.002050  loss: 3.3891 (2.9704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6958 (0.7145)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [157]  [1200/1251]  eta: 0:00:12  lr: 0.002046  min_lr: 0.002046  loss: 3.6417 (2.9866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8147 (0.7239)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 2.4021 (2.9816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7110 (0.7228)  time: 0.1960  data: 0.0006  max mem: 18117
Epoch: [157] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 2.4021 (3.0069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7110 (0.7228)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6577 (0.6577)  acc1: 88.4000 (88.4000)  acc5: 97.2000 (97.2000)  time: 5.6014  data: 5.4767  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8325 (0.8230)  acc1: 82.0000 (81.7818)  acc5: 95.6000 (96.0000)  time: 0.7550  data: 0.6427  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0015 (0.9931)  acc1: 77.6000 (78.3810)  acc5: 94.0000 (94.1143)  time: 0.2086  data: 0.0990  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1231 (1.0053)  acc1: 76.0000 (78.0000)  acc5: 93.2000 (94.0320)  time: 0.2148  data: 0.1065  max mem: 18117
Test: Total time: 0:00:10 (0.4173 s / it)
* Acc@1 77.828 Acc@5 94.264 loss 0.997
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.94%
Epoch: [158]  [   0/1251]  eta: 1:10:06  lr: 0.002045  min_lr: 0.002045  loss: 2.4406 (2.4406)  weight_decay: 0.0500 (0.0500)  time: 3.3627  data: 1.9810  max mem: 18117
Epoch: [158]  [ 200/1251]  eta: 0:04:27  lr: 0.002042  min_lr: 0.002042  loss: 3.1263 (3.0635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6316 (0.6931)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [158]  [ 400/1251]  eta: 0:03:29  lr: 0.002038  min_lr: 0.002038  loss: 3.3957 (3.0448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (0.6833)  time: 0.2406  data: 0.0005  max mem: 18117
Epoch: [158]  [ 600/1251]  eta: 0:02:38  lr: 0.002035  min_lr: 0.002035  loss: 3.4184 (3.0404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6909 (0.6889)  time: 0.2381  data: 0.0005  max mem: 18117
Epoch: [158]  [ 800/1251]  eta: 0:01:49  lr: 0.002031  min_lr: 0.002031  loss: 3.3034 (3.0599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.6915)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [158]  [1000/1251]  eta: 0:01:00  lr: 0.002027  min_lr: 0.002027  loss: 3.3461 (3.0586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6513 (0.6960)  time: 0.2342  data: 0.0005  max mem: 18117
Epoch: [158]  [1200/1251]  eta: 0:00:12  lr: 0.002024  min_lr: 0.002024  loss: 3.6023 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6495 (0.6996)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 3.2189 (3.0398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.6972)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [158] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 3.2189 (3.0095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.6972)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6670 (0.6670)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6272  data: 5.4714  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7879 (0.8254)  acc1: 82.4000 (81.4182)  acc5: 96.8000 (96.8000)  time: 0.7515  data: 0.6354  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0096 (0.9946)  acc1: 75.2000 (77.6952)  acc5: 94.0000 (94.6286)  time: 0.2155  data: 0.1052  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0764 (1.0008)  acc1: 76.8000 (77.6800)  acc5: 92.8000 (94.3200)  time: 0.2148  data: 0.1052  max mem: 18117
Test: Total time: 0:00:10 (0.4183 s / it)
* Acc@1 77.938 Acc@5 94.230 loss 0.993
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.94%
Epoch: [159]  [   0/1251]  eta: 1:08:15  lr: 0.002023  min_lr: 0.002023  loss: 4.2685 (4.2685)  weight_decay: 0.0500 (0.0500)  time: 3.2739  data: 2.5784  max mem: 18117
Epoch: [159]  [ 200/1251]  eta: 0:04:27  lr: 0.002019  min_lr: 0.002019  loss: 2.4174 (2.9051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6709 (0.7144)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [159]  [ 400/1251]  eta: 0:03:29  lr: 0.002016  min_lr: 0.002016  loss: 2.3072 (2.9369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.7058)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [159]  [ 600/1251]  eta: 0:02:38  lr: 0.002012  min_lr: 0.002012  loss: 2.3588 (2.9699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6660 (0.7273)  time: 0.2383  data: 0.0005  max mem: 18117
Epoch: [159]  [ 800/1251]  eta: 0:01:49  lr: 0.002009  min_lr: 0.002009  loss: 2.7455 (2.9706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.7108)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [159]  [1000/1251]  eta: 0:01:00  lr: 0.002005  min_lr: 0.002005  loss: 2.6854 (2.9783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6836 (0.7120)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [159]  [1200/1251]  eta: 0:00:12  lr: 0.002001  min_lr: 0.002001  loss: 3.3603 (2.9871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6230 (0.7039)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 2.6201 (2.9821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6259 (0.7006)  time: 0.1952  data: 0.0005  max mem: 18117
Epoch: [159] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 2.6201 (3.0105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6259 (0.7006)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6017 (0.6017)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.6838  data: 5.5573  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7660 (0.8107)  acc1: 83.2000 (81.9636)  acc5: 96.8000 (96.8000)  time: 0.7527  data: 0.6405  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9999 (0.9879)  acc1: 76.4000 (78.3619)  acc5: 94.0000 (94.4762)  time: 0.2084  data: 0.0989  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0966 (0.9991)  acc1: 75.6000 (77.9360)  acc5: 92.8000 (94.3200)  time: 0.2071  data: 0.0989  max mem: 18117
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 78.172 Acc@5 94.326 loss 0.991
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.17%
Epoch: [160]  [   0/1251]  eta: 1:10:45  lr: 0.002001  min_lr: 0.002001  loss: 4.3627 (4.3627)  weight_decay: 0.0500 (0.0500)  time: 3.3934  data: 3.0836  max mem: 18117
Epoch: [160]  [ 200/1251]  eta: 0:04:25  lr: 0.001997  min_lr: 0.001997  loss: 2.5538 (2.9332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8163 (0.7601)  time: 0.2349  data: 0.0004  max mem: 18117
Epoch: [160]  [ 400/1251]  eta: 0:03:28  lr: 0.001993  min_lr: 0.001993  loss: 3.2900 (2.9737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6067 (0.7701)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [160]  [ 600/1251]  eta: 0:02:38  lr: 0.001990  min_lr: 0.001990  loss: 3.0659 (2.9609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.7377)  time: 0.2394  data: 0.0005  max mem: 18117
Epoch: [160]  [ 800/1251]  eta: 0:01:49  lr: 0.001986  min_lr: 0.001986  loss: 2.6394 (2.9938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (0.7279)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [160]  [1000/1251]  eta: 0:01:00  lr: 0.001983  min_lr: 0.001983  loss: 2.5724 (2.9887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.7178)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [160]  [1200/1251]  eta: 0:00:12  lr: 0.001979  min_lr: 0.001979  loss: 3.0894 (2.9807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6472 (0.7265)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 2.6034 (2.9837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6236 (0.7223)  time: 0.1963  data: 0.0005  max mem: 18117
Epoch: [160] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 2.6034 (3.0056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6236 (0.7223)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6634 (0.6634)  acc1: 84.8000 (84.8000)  acc5: 98.8000 (98.8000)  time: 5.8457  data: 5.7047  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8160 (0.8250)  acc1: 83.6000 (81.9636)  acc5: 96.4000 (96.8000)  time: 0.6856  data: 0.5741  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0128 (1.0047)  acc1: 76.4000 (78.0762)  acc5: 94.4000 (94.3619)  time: 0.1908  data: 0.0824  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1314 (1.0132)  acc1: 75.2000 (77.6960)  acc5: 93.6000 (94.2720)  time: 0.1906  data: 0.0824  max mem: 18117
Test: Total time: 0:00:10 (0.4121 s / it)
* Acc@1 78.172 Acc@5 94.370 loss 1.001
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.17%
Epoch: [161]  [   0/1251]  eta: 1:06:44  lr: 0.001978  min_lr: 0.001978  loss: 2.5049 (2.5049)  weight_decay: 0.0500 (0.0500)  time: 3.2013  data: 2.8855  max mem: 18117
Epoch: [161]  [ 200/1251]  eta: 0:04:26  lr: 0.001974  min_lr: 0.001974  loss: 2.7024 (2.9582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6827 (0.7302)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [161]  [ 400/1251]  eta: 0:03:28  lr: 0.001971  min_lr: 0.001971  loss: 3.4825 (2.9587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6744 (0.6991)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [161]  [ 600/1251]  eta: 0:02:38  lr: 0.001967  min_lr: 0.001967  loss: 2.6395 (2.9506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6482 (0.7268)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [161]  [ 800/1251]  eta: 0:01:48  lr: 0.001964  min_lr: 0.001964  loss: 3.0083 (2.9726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7135 (0.7243)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [161]  [1000/1251]  eta: 0:01:00  lr: 0.001960  min_lr: 0.001960  loss: 2.6937 (2.9729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6373 (0.7161)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [161]  [1200/1251]  eta: 0:00:12  lr: 0.001956  min_lr: 0.001956  loss: 3.1686 (2.9916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6021 (0.7064)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 2.5526 (2.9927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6079 (0.7035)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [161] Total time: 0:05:00 (0.2405 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 2.5526 (2.9964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6079 (0.7035)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6015 (0.6015)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.4552  data: 5.3062  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7691 (0.8068)  acc1: 82.4000 (82.0364)  acc5: 97.2000 (96.7636)  time: 0.6950  data: 0.5815  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0116 (0.9824)  acc1: 75.6000 (78.2857)  acc5: 94.0000 (94.3429)  time: 0.1937  data: 0.0847  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0468 (0.9917)  acc1: 75.6000 (78.1120)  acc5: 93.2000 (94.2720)  time: 0.2127  data: 0.1038  max mem: 18117
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 78.260 Acc@5 94.442 loss 0.982
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.26%
Epoch: [162]  [   0/1251]  eta: 0:58:15  lr: 0.001956  min_lr: 0.001956  loss: 2.0187 (2.0187)  weight_decay: 0.0500 (0.0500)  time: 2.7942  data: 2.4186  max mem: 18117
Epoch: [162]  [ 200/1251]  eta: 0:04:24  lr: 0.001952  min_lr: 0.001952  loss: 2.6559 (2.9494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6736 (0.7515)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [162]  [ 400/1251]  eta: 0:03:28  lr: 0.001948  min_lr: 0.001948  loss: 2.5417 (2.9461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.7197)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [162]  [ 600/1251]  eta: 0:02:37  lr: 0.001945  min_lr: 0.001945  loss: 2.5101 (2.9249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6596 (0.7169)  time: 0.2388  data: 0.0003  max mem: 18117
Epoch: [162]  [ 800/1251]  eta: 0:01:48  lr: 0.001941  min_lr: 0.001941  loss: 2.7311 (2.9394)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2376  data: 0.0003  max mem: 18117
Epoch: [162]  [1000/1251]  eta: 0:01:00  lr: 0.001938  min_lr: 0.001938  loss: 2.8204 (2.9589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7223 (nan)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [162]  [1200/1251]  eta: 0:00:12  lr: 0.001934  min_lr: 0.001934  loss: 3.6307 (2.9795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (nan)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 2.2594 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8202 (nan)  time: 0.1962  data: 0.0010  max mem: 18117
Epoch: [162] Total time: 0:05:00 (0.2404 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 2.2594 (3.0029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8202 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6110 (0.6110)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.4775  data: 5.3513  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7423 (0.7773)  acc1: 82.8000 (82.2909)  acc5: 96.8000 (96.6182)  time: 0.7490  data: 0.6358  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9433 (0.9590)  acc1: 77.6000 (78.6476)  acc5: 94.0000 (94.3810)  time: 0.2280  data: 0.1176  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0150 (0.9668)  acc1: 76.4000 (78.2880)  acc5: 93.6000 (94.4320)  time: 0.2266  data: 0.1175  max mem: 18117
Test: Total time: 0:00:10 (0.4219 s / it)
* Acc@1 78.256 Acc@5 94.348 loss 0.963
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.26%
Epoch: [163]  [   0/1251]  eta: 1:08:29  lr: 0.001933  min_lr: 0.001933  loss: 4.3144 (4.3144)  weight_decay: 0.0500 (0.0500)  time: 3.2851  data: 2.9592  max mem: 18117
Epoch: [163]  [ 200/1251]  eta: 0:04:26  lr: 0.001930  min_lr: 0.001930  loss: 2.4364 (3.0108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.7130)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [163]  [ 400/1251]  eta: 0:03:30  lr: 0.001926  min_lr: 0.001926  loss: 2.3169 (2.9650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6351 (0.6848)  time: 0.2406  data: 0.0012  max mem: 18117
Epoch: [163]  [ 600/1251]  eta: 0:02:38  lr: 0.001922  min_lr: 0.001922  loss: 3.1657 (2.9505)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [163]  [ 800/1251]  eta: 0:01:49  lr: 0.001919  min_lr: 0.001919  loss: 2.5160 (2.9463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6195 (nan)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [163]  [1000/1251]  eta: 0:01:00  lr: 0.001915  min_lr: 0.001915  loss: 2.9276 (2.9591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (nan)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [163]  [1200/1251]  eta: 0:00:12  lr: 0.001912  min_lr: 0.001912  loss: 2.5153 (2.9627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (nan)  time: 0.2490  data: 0.0004  max mem: 18117
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 3.1589 (2.9728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (nan)  time: 0.1958  data: 0.0006  max mem: 18117
Epoch: [163] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 3.1589 (2.9897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6968 (0.6968)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.4845  data: 5.3390  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.9011 (0.8885)  acc1: 84.8000 (82.9818)  acc5: 97.2000 (96.6182)  time: 0.7333  data: 0.6191  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0874 (1.0653)  acc1: 76.8000 (78.5714)  acc5: 93.6000 (94.2095)  time: 0.2013  data: 0.0892  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1652 (1.0762)  acc1: 75.2000 (78.1120)  acc5: 92.4000 (94.0800)  time: 0.2004  data: 0.0891  max mem: 18117
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 77.920 Acc@5 94.326 loss 1.070
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 78.26%
Epoch: [164]  [   0/1251]  eta: 1:08:51  lr: 0.001911  min_lr: 0.001911  loss: 2.0669 (2.0669)  weight_decay: 0.0500 (0.0500)  time: 3.3023  data: 2.4155  max mem: 18117
Epoch: [164]  [ 200/1251]  eta: 0:04:29  lr: 0.001907  min_lr: 0.001907  loss: 2.4052 (2.9596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7384 (0.7070)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [164]  [ 400/1251]  eta: 0:03:31  lr: 0.001904  min_lr: 0.001904  loss: 2.5418 (2.9831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (0.7124)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [164]  [ 600/1251]  eta: 0:02:39  lr: 0.001900  min_lr: 0.001900  loss: 3.5280 (2.9953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.7266)  time: 0.2390  data: 0.0005  max mem: 18117
Epoch: [164]  [ 800/1251]  eta: 0:01:49  lr: 0.001896  min_lr: 0.001896  loss: 2.9297 (2.9833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7391 (0.7290)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [164]  [1000/1251]  eta: 0:01:00  lr: 0.001893  min_lr: 0.001893  loss: 3.6045 (2.9946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6763 (0.7316)  time: 0.2407  data: 0.0005  max mem: 18117
Epoch: [164]  [1200/1251]  eta: 0:00:12  lr: 0.001889  min_lr: 0.001889  loss: 2.4341 (2.9917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6901 (0.7373)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 2.9748 (2.9844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6746 (0.7381)  time: 0.1962  data: 0.0006  max mem: 18117
Epoch: [164] Total time: 0:05:02 (0.2422 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 2.9748 (2.9831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6746 (0.7381)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6308 (0.6308)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.7060  data: 5.5777  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8356 (0.8016)  acc1: 83.2000 (82.8364)  acc5: 96.8000 (96.7636)  time: 0.7082  data: 0.5961  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9969 (0.9720)  acc1: 76.0000 (78.7048)  acc5: 94.8000 (94.5714)  time: 0.1822  data: 0.0729  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0650 (0.9854)  acc1: 76.0000 (78.3360)  acc5: 93.2000 (94.4480)  time: 0.1949  data: 0.0866  max mem: 18117
Test: Total time: 0:00:10 (0.4076 s / it)
* Acc@1 78.242 Acc@5 94.416 loss 0.979
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.26%
Epoch: [165]  [   0/1251]  eta: 1:07:38  lr: 0.001888  min_lr: 0.001888  loss: 3.9990 (3.9990)  weight_decay: 0.0500 (0.0500)  time: 3.2440  data: 1.5984  max mem: 18117
Epoch: [165]  [ 200/1251]  eta: 0:04:25  lr: 0.001885  min_lr: 0.001885  loss: 3.4288 (3.0138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6375 (0.7720)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [165]  [ 400/1251]  eta: 0:03:27  lr: 0.001881  min_lr: 0.001881  loss: 3.5597 (2.9769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.7503)  time: 0.2398  data: 0.0005  max mem: 18117
Epoch: [165]  [ 600/1251]  eta: 0:02:38  lr: 0.001878  min_lr: 0.001878  loss: 3.2495 (2.9615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6637 (0.7453)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [165]  [ 800/1251]  eta: 0:01:48  lr: 0.001874  min_lr: 0.001874  loss: 2.4518 (2.9617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6335 (0.7286)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [165]  [1000/1251]  eta: 0:01:00  lr: 0.001870  min_lr: 0.001870  loss: 2.8402 (2.9691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7025 (0.7289)  time: 0.2401  data: 0.0005  max mem: 18117
Epoch: [165]  [1200/1251]  eta: 0:00:12  lr: 0.001867  min_lr: 0.001867  loss: 2.7877 (2.9573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6422 (0.7202)  time: 0.2420  data: 0.0004  max mem: 18117
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 3.0490 (2.9582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6370 (0.7197)  time: 0.1963  data: 0.0007  max mem: 18117
Epoch: [165] Total time: 0:05:01 (0.2407 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 3.0490 (2.9778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6370 (0.7197)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6066 (0.6066)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.5856  data: 5.4390  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8124 (0.8063)  acc1: 84.4000 (82.2909)  acc5: 96.8000 (96.5455)  time: 0.6886  data: 0.5764  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0059 (0.9885)  acc1: 76.0000 (78.1333)  acc5: 93.6000 (94.4762)  time: 0.1808  data: 0.0722  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1007 (1.0018)  acc1: 74.4000 (77.7600)  acc5: 92.8000 (94.3360)  time: 0.1894  data: 0.0802  max mem: 18117
Test: Total time: 0:00:09 (0.3964 s / it)
* Acc@1 78.010 Acc@5 94.400 loss 0.985
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.26%
Epoch: [166]  [   0/1251]  eta: 1:07:04  lr: 0.001866  min_lr: 0.001866  loss: 3.6744 (3.6744)  weight_decay: 0.0500 (0.0500)  time: 3.2172  data: 1.8078  max mem: 18117
Epoch: [166]  [ 200/1251]  eta: 0:04:28  lr: 0.001862  min_lr: 0.001862  loss: 3.0926 (2.9920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6270 (0.7610)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [166]  [ 400/1251]  eta: 0:03:30  lr: 0.001859  min_lr: 0.001859  loss: 3.2584 (3.0183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6827 (0.7411)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [166]  [ 600/1251]  eta: 0:02:38  lr: 0.001855  min_lr: 0.001855  loss: 2.8667 (3.0168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7128 (0.7518)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [166]  [ 800/1251]  eta: 0:01:49  lr: 0.001852  min_lr: 0.001852  loss: 3.2352 (3.0199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6047 (0.7451)  time: 0.2357  data: 0.0003  max mem: 18117
Epoch: [166]  [1000/1251]  eta: 0:01:00  lr: 0.001848  min_lr: 0.001848  loss: 2.4376 (2.9890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6654 (0.7419)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [166]  [1200/1251]  eta: 0:00:12  lr: 0.001844  min_lr: 0.001844  loss: 2.7691 (2.9927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6274 (0.7279)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 2.3792 (2.9870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6656 (0.7278)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [166] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 2.3792 (2.9851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6656 (0.7278)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6637 (0.6637)  acc1: 84.0000 (84.0000)  acc5: 98.8000 (98.8000)  time: 5.7991  data: 5.6572  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7492 (0.7863)  acc1: 84.0000 (82.3273)  acc5: 96.4000 (96.6182)  time: 0.7181  data: 0.6046  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9572 (0.9639)  acc1: 76.8000 (78.3238)  acc5: 94.4000 (94.3048)  time: 0.1796  data: 0.0702  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0414 (0.9730)  acc1: 75.6000 (77.9840)  acc5: 92.4000 (94.1760)  time: 0.1813  data: 0.0722  max mem: 18117
Test: Total time: 0:00:09 (0.3982 s / it)
* Acc@1 78.202 Acc@5 94.484 loss 0.961
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.26%
Epoch: [167]  [   0/1251]  eta: 1:07:15  lr: 0.001844  min_lr: 0.001844  loss: 2.3029 (2.3029)  weight_decay: 0.0500 (0.0500)  time: 3.2256  data: 2.8409  max mem: 18117
Epoch: [167]  [ 200/1251]  eta: 0:04:29  lr: 0.001840  min_lr: 0.001840  loss: 2.8082 (2.9128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6517 (0.7305)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [167]  [ 400/1251]  eta: 0:03:31  lr: 0.001836  min_lr: 0.001836  loss: 2.5047 (2.9068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7464 (0.7456)  time: 0.2503  data: 0.0004  max mem: 18117
Epoch: [167]  [ 600/1251]  eta: 0:02:39  lr: 0.001833  min_lr: 0.001833  loss: 2.9903 (2.9342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6362 (0.7400)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [167]  [ 800/1251]  eta: 0:01:50  lr: 0.001829  min_lr: 0.001829  loss: 2.7939 (2.9502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6979 (0.7301)  time: 0.2474  data: 0.0003  max mem: 18117
Epoch: [167]  [1000/1251]  eta: 0:01:01  lr: 0.001826  min_lr: 0.001826  loss: 2.4023 (2.9537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8194 (0.7318)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [167]  [1200/1251]  eta: 0:00:12  lr: 0.001822  min_lr: 0.001822  loss: 3.4460 (2.9531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6926 (0.7292)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 2.7708 (2.9552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6590 (0.7262)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [167] Total time: 0:05:03 (0.2426 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 2.7708 (2.9776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6590 (0.7262)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6402 (0.6402)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.2762  data: 5.1459  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8547 (0.8106)  acc1: 82.4000 (82.4000)  acc5: 96.4000 (96.8000)  time: 0.7306  data: 0.6145  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9944 (0.9872)  acc1: 75.6000 (78.7429)  acc5: 95.2000 (94.8191)  time: 0.2196  data: 0.1082  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1404 (0.9991)  acc1: 75.6000 (78.3040)  acc5: 93.6000 (94.6560)  time: 0.2176  data: 0.1081  max mem: 18117
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 78.234 Acc@5 94.450 loss 0.996
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.26%
Epoch: [168]  [   0/1251]  eta: 1:06:08  lr: 0.001821  min_lr: 0.001821  loss: 2.8398 (2.8398)  weight_decay: 0.0500 (0.0500)  time: 3.1720  data: 2.5173  max mem: 18117
Epoch: [168]  [ 200/1251]  eta: 0:04:26  lr: 0.001818  min_lr: 0.001818  loss: 3.5505 (2.9936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6124 (0.6406)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [168]  [ 400/1251]  eta: 0:03:29  lr: 0.001814  min_lr: 0.001814  loss: 2.5914 (2.9789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6930 (0.6763)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [168]  [ 600/1251]  eta: 0:02:38  lr: 0.001811  min_lr: 0.001811  loss: 3.2394 (3.0053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6513 (0.6818)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [168]  [ 800/1251]  eta: 0:01:49  lr: 0.001807  min_lr: 0.001807  loss: 3.5218 (3.0070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.7227)  time: 0.2423  data: 0.0004  max mem: 18117
Epoch: [168]  [1000/1251]  eta: 0:01:00  lr: 0.001803  min_lr: 0.001803  loss: 2.6629 (3.0115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.7230)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [168]  [1200/1251]  eta: 0:00:12  lr: 0.001800  min_lr: 0.001800  loss: 3.2826 (3.0107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.7321)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 2.4872 (3.0101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6916 (0.7356)  time: 0.1950  data: 0.0007  max mem: 18117
Epoch: [168] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 2.4872 (2.9889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6916 (0.7356)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7150 (0.7150)  acc1: 83.2000 (83.2000)  acc5: 98.0000 (98.0000)  time: 5.7088  data: 5.5835  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8059 (0.8138)  acc1: 83.6000 (82.4364)  acc5: 97.2000 (96.6546)  time: 0.7501  data: 0.6378  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0314 (0.9890)  acc1: 76.0000 (78.3238)  acc5: 94.8000 (94.6095)  time: 0.2146  data: 0.1050  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0880 (0.9951)  acc1: 75.6000 (77.8880)  acc5: 94.0000 (94.4960)  time: 0.2132  data: 0.1049  max mem: 18117
Test: Total time: 0:00:10 (0.4204 s / it)
* Acc@1 78.436 Acc@5 94.484 loss 0.998
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.44%
Epoch: [169]  [   0/1251]  eta: 1:10:49  lr: 0.001799  min_lr: 0.001799  loss: 2.8235 (2.8235)  weight_decay: 0.0500 (0.0500)  time: 3.3972  data: 3.0829  max mem: 18117
Epoch: [169]  [ 200/1251]  eta: 0:04:25  lr: 0.001795  min_lr: 0.001795  loss: 3.3502 (2.9790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7770 (0.8046)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [169]  [ 400/1251]  eta: 0:03:29  lr: 0.001792  min_lr: 0.001792  loss: 2.7572 (2.9795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6450 (0.7753)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [169]  [ 600/1251]  eta: 0:02:38  lr: 0.001788  min_lr: 0.001788  loss: 3.6928 (2.9968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.7453)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [169]  [ 800/1251]  eta: 0:01:49  lr: 0.001785  min_lr: 0.001785  loss: 2.9634 (3.0001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7035 (0.7433)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [169]  [1000/1251]  eta: 0:01:00  lr: 0.001781  min_lr: 0.001781  loss: 3.1240 (2.9979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.7502)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [169]  [1200/1251]  eta: 0:00:12  lr: 0.001777  min_lr: 0.001777  loss: 2.6202 (2.9768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.7584)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.2634 (2.9832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.7593)  time: 0.1963  data: 0.0007  max mem: 18117
Epoch: [169] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.2634 (2.9788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.7593)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7124 (0.7124)  acc1: 83.6000 (83.6000)  acc5: 98.8000 (98.8000)  time: 5.7678  data: 5.6446  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8185 (0.8392)  acc1: 83.2000 (81.7455)  acc5: 96.4000 (96.5455)  time: 0.7314  data: 0.6205  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0084 (1.0222)  acc1: 75.2000 (78.2667)  acc5: 94.4000 (94.4762)  time: 0.2010  data: 0.0921  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0528 (1.0278)  acc1: 75.2000 (78.0800)  acc5: 94.4000 (94.4480)  time: 0.2001  data: 0.0921  max mem: 18117
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 78.216 Acc@5 94.488 loss 1.020
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.44%
Epoch: [170]  [   0/1251]  eta: 1:04:08  lr: 0.001777  min_lr: 0.001777  loss: 3.5983 (3.5983)  weight_decay: 0.0500 (0.0500)  time: 3.0761  data: 2.6125  max mem: 18117
Epoch: [170]  [ 200/1251]  eta: 0:04:26  lr: 0.001773  min_lr: 0.001773  loss: 2.9845 (2.9247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6979 (0.7135)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [170]  [ 400/1251]  eta: 0:03:29  lr: 0.001769  min_lr: 0.001769  loss: 2.7775 (2.9616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7333 (0.7170)  time: 0.2374  data: 0.0005  max mem: 18117
Epoch: [170]  [ 600/1251]  eta: 0:02:38  lr: 0.001766  min_lr: 0.001766  loss: 2.7885 (2.9457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7250)  time: 0.2406  data: 0.0004  max mem: 18117
Epoch: [170]  [ 800/1251]  eta: 0:01:49  lr: 0.001762  min_lr: 0.001762  loss: 2.9272 (2.9589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6361 (0.7162)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [170]  [1000/1251]  eta: 0:01:00  lr: 0.001759  min_lr: 0.001759  loss: 2.9220 (2.9724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7699 (0.7215)  time: 0.2408  data: 0.0005  max mem: 18117
Epoch: [170]  [1200/1251]  eta: 0:00:12  lr: 0.001755  min_lr: 0.001755  loss: 2.8053 (2.9808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5814 (0.7305)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 3.6362 (2.9871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6141 (0.7289)  time: 0.2002  data: 0.0007  max mem: 18117
Epoch: [170] Total time: 0:05:03 (0.2422 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 3.6362 (2.9653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6141 (0.7289)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7324 (0.7324)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 5.4898  data: 5.3633  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8839 (0.8683)  acc1: 84.0000 (82.1091)  acc5: 96.8000 (96.6546)  time: 0.7637  data: 0.6509  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0619 (1.0509)  acc1: 76.0000 (78.1333)  acc5: 95.2000 (94.6476)  time: 0.2129  data: 0.1024  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0645 (1.0560)  acc1: 76.0000 (77.9840)  acc5: 93.6000 (94.5920)  time: 0.2142  data: 0.1048  max mem: 18117
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 78.218 Acc@5 94.574 loss 1.050
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.44%
Epoch: [171]  [   0/1251]  eta: 1:07:22  lr: 0.001754  min_lr: 0.001754  loss: 3.4848 (3.4848)  weight_decay: 0.0500 (0.0500)  time: 3.2317  data: 2.5606  max mem: 18117
Epoch: [171]  [ 200/1251]  eta: 0:04:27  lr: 0.001751  min_lr: 0.001751  loss: 2.5091 (3.0114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8596 (0.8114)  time: 0.2420  data: 0.0007  max mem: 18117
Epoch: [171]  [ 400/1251]  eta: 0:03:30  lr: 0.001747  min_lr: 0.001747  loss: 3.4020 (2.9952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7407 (0.7763)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [171]  [ 600/1251]  eta: 0:02:39  lr: 0.001744  min_lr: 0.001744  loss: 3.3563 (3.0148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.7470)  time: 0.2366  data: 0.0005  max mem: 18117
Epoch: [171]  [ 800/1251]  eta: 0:01:49  lr: 0.001740  min_lr: 0.001740  loss: 2.7564 (3.0112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.7491)  time: 0.2447  data: 0.0004  max mem: 18117
Epoch: [171]  [1000/1251]  eta: 0:01:00  lr: 0.001737  min_lr: 0.001737  loss: 2.5024 (3.0002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7656)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [171]  [1200/1251]  eta: 0:00:12  lr: 0.001733  min_lr: 0.001733  loss: 3.3307 (2.9913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6935 (0.7533)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 2.8033 (2.9860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5708 (0.7472)  time: 0.1955  data: 0.0006  max mem: 18117
Epoch: [171] Total time: 0:05:03 (0.2422 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 2.8033 (2.9686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5708 (0.7472)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6156 (0.6156)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.4059  data: 5.2782  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7659 (0.7870)  acc1: 84.0000 (82.0000)  acc5: 97.2000 (96.9455)  time: 0.7350  data: 0.6234  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0358 (0.9661)  acc1: 76.4000 (78.5714)  acc5: 94.4000 (94.7810)  time: 0.2096  data: 0.1005  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0570 (0.9796)  acc1: 76.4000 (78.2720)  acc5: 93.2000 (94.5440)  time: 0.2086  data: 0.1004  max mem: 18117
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 78.422 Acc@5 94.540 loss 0.972
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.44%
Epoch: [172]  [   0/1251]  eta: 1:02:17  lr: 0.001732  min_lr: 0.001732  loss: 3.7179 (3.7179)  weight_decay: 0.0500 (0.0500)  time: 2.9872  data: 2.1476  max mem: 18117
Epoch: [172]  [ 200/1251]  eta: 0:04:26  lr: 0.001729  min_lr: 0.001729  loss: 3.0579 (2.9994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6467 (0.6668)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [172]  [ 400/1251]  eta: 0:03:28  lr: 0.001725  min_lr: 0.001725  loss: 2.3198 (2.9478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6973 (0.6822)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [172]  [ 600/1251]  eta: 0:02:38  lr: 0.001721  min_lr: 0.001721  loss: 3.1141 (2.9525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7152 (0.7165)  time: 0.2378  data: 0.0005  max mem: 18117
Epoch: [172]  [ 800/1251]  eta: 0:01:49  lr: 0.001718  min_lr: 0.001718  loss: 3.6360 (2.9468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7067 (0.7238)  time: 0.2395  data: 0.0005  max mem: 18117
Epoch: [172]  [1000/1251]  eta: 0:01:00  lr: 0.001714  min_lr: 0.001714  loss: 3.5034 (2.9578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7452 (0.7313)  time: 0.2380  data: 0.0003  max mem: 18117
Epoch: [172]  [1200/1251]  eta: 0:00:12  lr: 0.001711  min_lr: 0.001711  loss: 2.6498 (2.9528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.7401)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 2.5019 (2.9504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7477 (0.7425)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [172] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 2.5019 (2.9475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7477 (0.7425)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6840 (0.6840)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.7939  data: 5.6474  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7962 (0.8171)  acc1: 84.8000 (82.2545)  acc5: 97.2000 (96.8000)  time: 0.7162  data: 0.6023  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0238 (0.9987)  acc1: 76.0000 (78.2476)  acc5: 94.4000 (94.5714)  time: 0.1828  data: 0.0720  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0424 (1.0040)  acc1: 76.0000 (77.9840)  acc5: 92.8000 (94.4640)  time: 0.1870  data: 0.0767  max mem: 18117
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 78.496 Acc@5 94.462 loss 0.987
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.50%
Epoch: [173]  [   0/1251]  eta: 1:06:47  lr: 0.001710  min_lr: 0.001710  loss: 3.1455 (3.1455)  weight_decay: 0.0500 (0.0500)  time: 3.2031  data: 2.8731  max mem: 18117
Epoch: [173]  [ 200/1251]  eta: 0:04:27  lr: 0.001706  min_lr: 0.001706  loss: 2.5946 (2.8948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6989 (0.7089)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [173]  [ 400/1251]  eta: 0:03:29  lr: 0.001703  min_lr: 0.001703  loss: 3.1979 (2.9593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7521 (0.7472)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [173]  [ 600/1251]  eta: 0:02:38  lr: 0.001699  min_lr: 0.001699  loss: 3.3654 (2.9943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6895 (0.7594)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [173]  [ 800/1251]  eta: 0:01:49  lr: 0.001696  min_lr: 0.001696  loss: 2.6723 (2.9912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7504 (0.7592)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [173]  [1000/1251]  eta: 0:01:00  lr: 0.001692  min_lr: 0.001692  loss: 3.5507 (3.0024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6788 (0.7549)  time: 0.2406  data: 0.0011  max mem: 18117
Epoch: [173]  [1200/1251]  eta: 0:00:12  lr: 0.001689  min_lr: 0.001689  loss: 3.2816 (3.0049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5989 (0.7541)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 2.3721 (2.9962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (0.7567)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [173] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 2.3721 (2.9773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (0.7567)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5902 (0.5902)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.7229  data: 5.5973  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8175 (0.7888)  acc1: 84.8000 (82.7273)  acc5: 96.8000 (96.4364)  time: 0.7549  data: 0.6407  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0318 (0.9671)  acc1: 76.8000 (78.6667)  acc5: 94.8000 (94.5143)  time: 0.2023  data: 0.0917  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0387 (0.9742)  acc1: 76.4000 (78.4160)  acc5: 93.6000 (94.4320)  time: 0.2009  data: 0.0916  max mem: 18117
Test: Total time: 0:00:10 (0.4113 s / it)
* Acc@1 78.486 Acc@5 94.558 loss 0.959
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.50%
Epoch: [174]  [   0/1251]  eta: 1:08:52  lr: 0.001688  min_lr: 0.001688  loss: 2.1766 (2.1766)  weight_decay: 0.0500 (0.0500)  time: 3.3036  data: 2.6515  max mem: 18117
Epoch: [174]  [ 200/1251]  eta: 0:04:29  lr: 0.001684  min_lr: 0.001684  loss: 2.3587 (2.9338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6566 (0.7640)  time: 0.2399  data: 0.0005  max mem: 18117
Epoch: [174]  [ 400/1251]  eta: 0:03:31  lr: 0.001681  min_lr: 0.001681  loss: 2.7406 (2.9611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.7449)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [174]  [ 600/1251]  eta: 0:02:39  lr: 0.001677  min_lr: 0.001677  loss: 2.2161 (2.9545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6885 (0.7321)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [174]  [ 800/1251]  eta: 0:01:49  lr: 0.001674  min_lr: 0.001674  loss: 2.3112 (2.9555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.7529)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [174]  [1000/1251]  eta: 0:01:01  lr: 0.001670  min_lr: 0.001670  loss: 2.4740 (2.9548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (0.7392)  time: 0.2433  data: 0.0004  max mem: 18117
Epoch: [174]  [1200/1251]  eta: 0:00:12  lr: 0.001666  min_lr: 0.001666  loss: 2.5519 (2.9643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6360 (0.7350)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.7868 (2.9641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7374)  time: 0.1955  data: 0.0006  max mem: 18117
Epoch: [174] Total time: 0:05:03 (0.2426 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.7868 (2.9543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7374)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6325 (0.6325)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 5.4642  data: 5.3398  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8108 (0.8312)  acc1: 83.2000 (82.6546)  acc5: 97.2000 (96.7636)  time: 0.7678  data: 0.6523  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0435 (1.0060)  acc1: 77.2000 (78.7619)  acc5: 93.6000 (94.5905)  time: 0.2187  data: 0.1068  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1246 (1.0163)  acc1: 76.4000 (78.3040)  acc5: 93.6000 (94.5120)  time: 0.2172  data: 0.1067  max mem: 18117
Test: Total time: 0:00:10 (0.4146 s / it)
* Acc@1 78.454 Acc@5 94.534 loss 1.012
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.50%
Epoch: [175]  [   0/1251]  eta: 1:09:50  lr: 0.001666  min_lr: 0.001666  loss: 2.0243 (2.0243)  weight_decay: 0.0500 (0.0500)  time: 3.3499  data: 1.6008  max mem: 18117
Epoch: [175]  [ 200/1251]  eta: 0:04:27  lr: 0.001662  min_lr: 0.001662  loss: 3.4002 (2.9473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6051 (0.6909)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [175]  [ 400/1251]  eta: 0:03:30  lr: 0.001658  min_lr: 0.001658  loss: 2.6663 (2.9581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7095 (0.7091)  time: 0.2426  data: 0.0005  max mem: 18117
Epoch: [175]  [ 600/1251]  eta: 0:02:39  lr: 0.001655  min_lr: 0.001655  loss: 2.9922 (2.9994)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2382  data: 0.0005  max mem: 18117
Epoch: [175]  [ 800/1251]  eta: 0:01:49  lr: 0.001651  min_lr: 0.001651  loss: 2.2865 (2.9855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7324 (nan)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [175]  [1000/1251]  eta: 0:01:00  lr: 0.001648  min_lr: 0.001648  loss: 2.2751 (2.9597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (nan)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [175]  [1200/1251]  eta: 0:00:12  lr: 0.001644  min_lr: 0.001644  loss: 3.4469 (2.9527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7299 (nan)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 2.6341 (2.9521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7949 (nan)  time: 0.1967  data: 0.0006  max mem: 18117
Epoch: [175] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 2.6341 (2.9434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7949 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6410 (0.6410)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.8477  data: 5.7215  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7995 (0.8186)  acc1: 83.6000 (82.2909)  acc5: 96.8000 (96.6546)  time: 0.7434  data: 0.6313  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0103 (0.9857)  acc1: 77.2000 (78.8191)  acc5: 94.0000 (94.6667)  time: 0.1832  data: 0.0737  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0506 (0.9905)  acc1: 75.2000 (78.5440)  acc5: 93.6000 (94.5440)  time: 0.1826  data: 0.0737  max mem: 18117
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 78.596 Acc@5 94.610 loss 0.991
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.60%
Epoch: [176]  [   0/1251]  eta: 1:09:53  lr: 0.001643  min_lr: 0.001643  loss: 3.6549 (3.6549)  weight_decay: 0.0500 (0.0500)  time: 3.3524  data: 3.0591  max mem: 18117
Epoch: [176]  [ 200/1251]  eta: 0:04:26  lr: 0.001640  min_lr: 0.001640  loss: 2.2953 (2.8531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.7633)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [176]  [ 400/1251]  eta: 0:03:29  lr: 0.001636  min_lr: 0.001636  loss: 2.5454 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6885 (0.7407)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [176]  [ 600/1251]  eta: 0:02:38  lr: 0.001633  min_lr: 0.001633  loss: 2.5617 (2.9241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.7396)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [176]  [ 800/1251]  eta: 0:01:49  lr: 0.001629  min_lr: 0.001629  loss: 2.9185 (2.9495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7355 (0.7393)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [176]  [1000/1251]  eta: 0:01:00  lr: 0.001626  min_lr: 0.001626  loss: 3.4257 (2.9487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6738 (0.7413)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [176]  [1200/1251]  eta: 0:00:12  lr: 0.001622  min_lr: 0.001622  loss: 3.0148 (2.9491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8334 (0.7587)  time: 0.2348  data: 0.0004  max mem: 18117
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 2.6381 (2.9474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (0.7564)  time: 0.1969  data: 0.0014  max mem: 18117
Epoch: [176] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 2.6381 (2.9484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (0.7564)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6688 (0.6688)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.7808  data: 5.6555  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7983 (0.8181)  acc1: 82.8000 (82.2182)  acc5: 97.2000 (96.8000)  time: 0.6684  data: 0.5570  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0566 (0.9862)  acc1: 76.8000 (78.8191)  acc5: 94.0000 (94.6286)  time: 0.1676  data: 0.0584  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1159 (0.9943)  acc1: 76.8000 (78.5120)  acc5: 94.0000 (94.5600)  time: 0.2065  data: 0.0982  max mem: 18117
Test: Total time: 0:00:10 (0.4175 s / it)
* Acc@1 78.520 Acc@5 94.494 loss 0.990
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.60%
Epoch: [177]  [   0/1251]  eta: 1:06:31  lr: 0.001621  min_lr: 0.001621  loss: 3.6192 (3.6192)  weight_decay: 0.0500 (0.0500)  time: 3.1904  data: 2.8164  max mem: 18117
Epoch: [177]  [ 200/1251]  eta: 0:04:28  lr: 0.001618  min_lr: 0.001618  loss: 3.5002 (2.9044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7477 (0.7544)  time: 0.2460  data: 0.0004  max mem: 18117
Epoch: [177]  [ 400/1251]  eta: 0:03:30  lr: 0.001614  min_lr: 0.001614  loss: 2.4840 (2.9449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6880 (0.7634)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [177]  [ 600/1251]  eta: 0:02:39  lr: 0.001611  min_lr: 0.001611  loss: 3.4492 (2.9718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6819 (0.7479)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [177]  [ 800/1251]  eta: 0:01:50  lr: 0.001607  min_lr: 0.001607  loss: 2.3478 (2.9565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.7490)  time: 0.2475  data: 0.0004  max mem: 18117
Epoch: [177]  [1000/1251]  eta: 0:01:01  lr: 0.001604  min_lr: 0.001604  loss: 3.0196 (2.9625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6811 (0.7475)  time: 0.2457  data: 0.0004  max mem: 18117
Epoch: [177]  [1200/1251]  eta: 0:00:12  lr: 0.001600  min_lr: 0.001600  loss: 2.8719 (2.9529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7010 (0.7516)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 2.8381 (2.9495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.7545)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [177] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 2.8381 (2.9427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.7545)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6848 (0.6848)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.7438  data: 5.5914  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8115 (0.8165)  acc1: 83.6000 (82.5091)  acc5: 96.8000 (96.5091)  time: 0.7505  data: 0.6359  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0372 (0.9804)  acc1: 76.8000 (78.7238)  acc5: 94.0000 (94.4000)  time: 0.2076  data: 0.0981  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0455 (0.9880)  acc1: 77.2000 (78.4640)  acc5: 92.8000 (94.3680)  time: 0.2162  data: 0.1071  max mem: 18117
Test: Total time: 0:00:10 (0.4236 s / it)
* Acc@1 78.452 Acc@5 94.462 loss 0.982
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.60%
Epoch: [178]  [   0/1251]  eta: 1:09:05  lr: 0.001599  min_lr: 0.001599  loss: 3.5613 (3.5613)  weight_decay: 0.0500 (0.0500)  time: 3.3134  data: 3.0056  max mem: 18117
Epoch: [178]  [ 200/1251]  eta: 0:04:27  lr: 0.001596  min_lr: 0.001596  loss: 3.3687 (3.0059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7437)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [178]  [ 400/1251]  eta: 0:03:29  lr: 0.001592  min_lr: 0.001592  loss: 2.7400 (2.9765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.7560)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [178]  [ 600/1251]  eta: 0:02:38  lr: 0.001589  min_lr: 0.001589  loss: 2.7704 (2.9500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (0.7384)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [178]  [ 800/1251]  eta: 0:01:49  lr: 0.001585  min_lr: 0.001585  loss: 2.7428 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7942 (0.7351)  time: 0.2435  data: 0.0004  max mem: 18117
Epoch: [178]  [1000/1251]  eta: 0:01:00  lr: 0.001582  min_lr: 0.001582  loss: 2.4672 (2.9290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.7397)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [178]  [1200/1251]  eta: 0:00:12  lr: 0.001578  min_lr: 0.001578  loss: 3.2012 (2.9359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.7303)  time: 0.2449  data: 0.0004  max mem: 18117
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 2.4692 (2.9280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6873 (0.7329)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [178] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 2.4692 (2.9383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6873 (0.7329)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6456 (0.6456)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.8645  data: 5.7361  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7938 (0.7869)  acc1: 84.4000 (82.8727)  acc5: 97.2000 (97.0182)  time: 0.7116  data: 0.5986  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0253 (0.9683)  acc1: 76.4000 (78.8381)  acc5: 93.6000 (94.6857)  time: 0.1864  data: 0.0741  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0529 (0.9751)  acc1: 76.4000 (78.6400)  acc5: 92.4000 (94.6560)  time: 0.1848  data: 0.0741  max mem: 18117
Test: Total time: 0:00:10 (0.4046 s / it)
* Acc@1 78.498 Acc@5 94.674 loss 0.968
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.60%
Epoch: [179]  [   0/1251]  eta: 1:09:37  lr: 0.001577  min_lr: 0.001577  loss: 3.6728 (3.6728)  weight_decay: 0.0500 (0.0500)  time: 3.3397  data: 2.9271  max mem: 18117
Epoch: [179]  [ 200/1251]  eta: 0:04:28  lr: 0.001574  min_lr: 0.001574  loss: 3.4858 (2.8768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.8248)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [179]  [ 400/1251]  eta: 0:03:30  lr: 0.001570  min_lr: 0.001570  loss: 3.5089 (2.9185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6968 (0.7959)  time: 0.2367  data: 0.0005  max mem: 18117
Epoch: [179]  [ 600/1251]  eta: 0:02:39  lr: 0.001567  min_lr: 0.001567  loss: 2.6217 (2.9131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7085 (0.7903)  time: 0.2379  data: 0.0006  max mem: 18117
Epoch: [179]  [ 800/1251]  eta: 0:01:49  lr: 0.001563  min_lr: 0.001563  loss: 2.5481 (2.9027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6449 (0.7792)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [179]  [1000/1251]  eta: 0:01:00  lr: 0.001560  min_lr: 0.001560  loss: 2.8740 (2.9434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7915 (0.7938)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [179]  [1200/1251]  eta: 0:00:12  lr: 0.001556  min_lr: 0.001556  loss: 2.5642 (2.9555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (0.7992)  time: 0.2405  data: 0.0003  max mem: 18117
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 2.4524 (2.9503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7303 (0.7969)  time: 0.1951  data: 0.0006  max mem: 18117
Epoch: [179] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 2.4524 (2.9362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7303 (0.7969)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6048 (0.6048)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.8240  data: 5.6966  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7849 (0.7435)  acc1: 84.0000 (83.6364)  acc5: 97.6000 (97.2727)  time: 0.6811  data: 0.5696  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9166 (0.9234)  acc1: 76.8000 (79.6381)  acc5: 94.0000 (95.0857)  time: 0.1749  data: 0.0647  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0537 (0.9381)  acc1: 76.0000 (78.9600)  acc5: 93.6000 (94.9280)  time: 0.1875  data: 0.0780  max mem: 18117
Test: Total time: 0:00:10 (0.4046 s / it)
* Acc@1 78.850 Acc@5 94.692 loss 0.935
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.85%
Epoch: [180]  [   0/1251]  eta: 1:06:11  lr: 0.001556  min_lr: 0.001556  loss: 3.8997 (3.8997)  weight_decay: 0.0500 (0.0500)  time: 3.1745  data: 2.8634  max mem: 18117
Epoch: [180]  [ 200/1251]  eta: 0:04:25  lr: 0.001552  min_lr: 0.001552  loss: 3.0644 (2.9556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.7479)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [180]  [ 400/1251]  eta: 0:03:28  lr: 0.001549  min_lr: 0.001549  loss: 3.3908 (2.9606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.7454)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [180]  [ 600/1251]  eta: 0:02:38  lr: 0.001545  min_lr: 0.001545  loss: 2.5479 (2.9431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7355 (0.7537)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [180]  [ 800/1251]  eta: 0:01:49  lr: 0.001542  min_lr: 0.001542  loss: 2.8498 (2.9381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6780 (0.7556)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [180]  [1000/1251]  eta: 0:01:00  lr: 0.001538  min_lr: 0.001538  loss: 2.8784 (2.9554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7494 (0.7604)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [180]  [1200/1251]  eta: 0:00:12  lr: 0.001535  min_lr: 0.001535  loss: 2.7150 (2.9416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6775 (0.7515)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 2.4987 (2.9345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7715 (0.7542)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [180] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 2.4987 (2.9289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7715 (0.7542)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5955 (0.5955)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6790  data: 5.5513  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7881 (0.7734)  acc1: 83.2000 (83.3818)  acc5: 97.2000 (97.0545)  time: 0.7272  data: 0.6155  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0016 (0.9477)  acc1: 76.4000 (79.4286)  acc5: 95.2000 (94.9143)  time: 0.1958  data: 0.0867  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0781 (0.9629)  acc1: 75.6000 (78.9920)  acc5: 93.6000 (94.7520)  time: 0.2119  data: 0.1036  max mem: 18117
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 78.794 Acc@5 94.676 loss 0.952
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.85%
Epoch: [181]  [   0/1251]  eta: 1:01:45  lr: 0.001534  min_lr: 0.001534  loss: 2.0458 (2.0458)  weight_decay: 0.0500 (0.0500)  time: 2.9621  data: 2.4170  max mem: 18117
Epoch: [181]  [ 200/1251]  eta: 0:04:28  lr: 0.001530  min_lr: 0.001530  loss: 2.5236 (2.9397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.7939)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [181]  [ 400/1251]  eta: 0:03:30  lr: 0.001527  min_lr: 0.001527  loss: 2.7940 (2.8949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6882 (0.7564)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [181]  [ 600/1251]  eta: 0:02:38  lr: 0.001523  min_lr: 0.001523  loss: 2.5790 (2.8937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6620 (0.7505)  time: 0.2349  data: 0.0004  max mem: 18117
Epoch: [181]  [ 800/1251]  eta: 0:01:49  lr: 0.001520  min_lr: 0.001520  loss: 2.9699 (2.9009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6772 (0.7480)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [181]  [1000/1251]  eta: 0:01:00  lr: 0.001516  min_lr: 0.001516  loss: 3.1478 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7148 (0.7695)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [181]  [1200/1251]  eta: 0:00:12  lr: 0.001513  min_lr: 0.001513  loss: 2.9998 (2.9233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8299 (0.7691)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 3.3837 (2.9290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (0.7678)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [181] Total time: 0:05:00 (0.2404 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 3.3837 (2.9295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (0.7678)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6573 (0.6573)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.6023  data: 5.4367  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8307 (0.8225)  acc1: 84.0000 (82.8727)  acc5: 96.8000 (96.6909)  time: 0.7297  data: 0.6142  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9897 (0.9895)  acc1: 76.4000 (79.2571)  acc5: 94.4000 (94.8381)  time: 0.2105  data: 0.1012  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1103 (1.0042)  acc1: 75.6000 (78.7840)  acc5: 94.0000 (94.7360)  time: 0.2092  data: 0.1011  max mem: 18117
Test: Total time: 0:00:10 (0.4144 s / it)
* Acc@1 78.768 Acc@5 94.686 loss 0.999
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.85%
Epoch: [182]  [   0/1251]  eta: 1:09:37  lr: 0.001512  min_lr: 0.001512  loss: 3.3846 (3.3846)  weight_decay: 0.0500 (0.0500)  time: 3.3390  data: 2.0488  max mem: 18117
Epoch: [182]  [ 200/1251]  eta: 0:04:27  lr: 0.001508  min_lr: 0.001508  loss: 2.3596 (2.9094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.6898)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [182]  [ 400/1251]  eta: 0:03:29  lr: 0.001505  min_lr: 0.001505  loss: 2.2720 (2.9024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7132)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [182]  [ 600/1251]  eta: 0:02:38  lr: 0.001501  min_lr: 0.001501  loss: 3.0937 (2.9032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (0.7320)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [182]  [ 800/1251]  eta: 0:01:49  lr: 0.001498  min_lr: 0.001498  loss: 2.3245 (2.9191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.7373)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [182]  [1000/1251]  eta: 0:01:00  lr: 0.001495  min_lr: 0.001495  loss: 2.5589 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6996 (0.7463)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [182]  [1200/1251]  eta: 0:00:12  lr: 0.001491  min_lr: 0.001491  loss: 2.3267 (2.9189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.7512)  time: 0.2427  data: 0.0004  max mem: 18117
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 3.0049 (2.9203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7679 (0.7538)  time: 0.1973  data: 0.0010  max mem: 18117
Epoch: [182] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 3.0049 (2.9327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7679 (0.7538)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6608 (0.6608)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.7874  data: 5.6365  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7822 (0.8079)  acc1: 84.4000 (83.0182)  acc5: 96.8000 (96.9455)  time: 0.7833  data: 0.6692  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0162 (0.9713)  acc1: 78.0000 (79.5048)  acc5: 94.8000 (95.0476)  time: 0.2213  data: 0.1120  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0529 (0.9813)  acc1: 76.8000 (79.0560)  acc5: 94.4000 (94.9440)  time: 0.2268  data: 0.1180  max mem: 18117
Test: Total time: 0:00:10 (0.4343 s / it)
* Acc@1 78.768 Acc@5 94.724 loss 0.978
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.85%
Epoch: [183]  [   0/1251]  eta: 1:05:56  lr: 0.001490  min_lr: 0.001490  loss: 2.4173 (2.4173)  weight_decay: 0.0500 (0.0500)  time: 3.1628  data: 2.5183  max mem: 18117
Epoch: [183]  [ 200/1251]  eta: 0:04:26  lr: 0.001487  min_lr: 0.001487  loss: 2.7146 (2.9327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7009 (0.7350)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [183]  [ 400/1251]  eta: 0:03:30  lr: 0.001483  min_lr: 0.001483  loss: 3.0951 (2.8933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (0.7425)  time: 0.2459  data: 0.0004  max mem: 18117
Epoch: [183]  [ 600/1251]  eta: 0:02:39  lr: 0.001480  min_lr: 0.001480  loss: 2.3427 (2.8899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8367 (0.7549)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [183]  [ 800/1251]  eta: 0:01:49  lr: 0.001476  min_lr: 0.001476  loss: 2.3462 (2.8861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.7521)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [183]  [1000/1251]  eta: 0:01:00  lr: 0.001473  min_lr: 0.001473  loss: 3.1761 (2.8988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6808 (0.7492)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [183]  [1200/1251]  eta: 0:00:12  lr: 0.001469  min_lr: 0.001469  loss: 2.4016 (2.9006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6827 (0.7443)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 2.9921 (2.9007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7447 (0.7513)  time: 0.1960  data: 0.0006  max mem: 18117
Epoch: [183] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 2.9921 (2.9185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7447 (0.7513)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5980 (0.5980)  acc1: 86.0000 (86.0000)  acc5: 99.2000 (99.2000)  time: 5.6566  data: 5.5308  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7979 (0.7782)  acc1: 82.8000 (82.2546)  acc5: 97.6000 (97.1636)  time: 0.7279  data: 0.6137  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9671 (0.9591)  acc1: 76.8000 (78.4952)  acc5: 94.0000 (94.7810)  time: 0.2026  data: 0.0921  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0133 (0.9689)  acc1: 77.2000 (78.2240)  acc5: 93.2000 (94.7040)  time: 0.2007  data: 0.0920  max mem: 18117
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 78.614 Acc@5 94.598 loss 0.957
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.85%
Epoch: [184]  [   0/1251]  eta: 1:02:20  lr: 0.001469  min_lr: 0.001469  loss: 3.5936 (3.5936)  weight_decay: 0.0500 (0.0500)  time: 2.9897  data: 1.6329  max mem: 18117
Epoch: [184]  [ 200/1251]  eta: 0:04:28  lr: 0.001465  min_lr: 0.001465  loss: 2.9521 (2.8167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.7552)  time: 0.2413  data: 0.0004  max mem: 18117
Epoch: [184]  [ 400/1251]  eta: 0:03:29  lr: 0.001462  min_lr: 0.001462  loss: 2.6738 (2.8727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8628 (0.7749)  time: 0.2403  data: 0.0003  max mem: 18117
Epoch: [184]  [ 600/1251]  eta: 0:02:39  lr: 0.001458  min_lr: 0.001458  loss: 2.3765 (2.8836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7265 (0.7752)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [184]  [ 800/1251]  eta: 0:01:49  lr: 0.001455  min_lr: 0.001455  loss: 2.1502 (2.9020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6553 (0.7688)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [184]  [1000/1251]  eta: 0:01:00  lr: 0.001451  min_lr: 0.001451  loss: 2.2312 (2.9063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (0.7646)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [184]  [1200/1251]  eta: 0:00:12  lr: 0.001448  min_lr: 0.001448  loss: 2.6789 (2.9192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7920 (0.7829)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 2.3131 (2.9130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8241 (0.7941)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [184] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 2.3131 (2.9155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8241 (0.7941)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6202 (0.6202)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.6984  data: 5.5687  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8074 (0.7986)  acc1: 82.8000 (82.7273)  acc5: 96.8000 (96.8000)  time: 0.7428  data: 0.6297  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0380 (0.9713)  acc1: 77.2000 (78.8762)  acc5: 94.0000 (94.8381)  time: 0.1952  data: 0.0854  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0429 (0.9799)  acc1: 76.8000 (78.3520)  acc5: 94.0000 (94.8800)  time: 0.1936  data: 0.0853  max mem: 18117
Test: Total time: 0:00:10 (0.4099 s / it)
* Acc@1 78.976 Acc@5 94.772 loss 0.969
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 78.98%
Epoch: [185]  [   0/1251]  eta: 1:10:53  lr: 0.001447  min_lr: 0.001447  loss: 2.2299 (2.2299)  weight_decay: 0.0500 (0.0500)  time: 3.4000  data: 3.1346  max mem: 18117
Epoch: [185]  [ 200/1251]  eta: 0:04:26  lr: 0.001444  min_lr: 0.001444  loss: 2.8028 (2.8680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7728 (0.7924)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [185]  [ 400/1251]  eta: 0:03:29  lr: 0.001440  min_lr: 0.001440  loss: 2.4998 (2.9141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7541 (0.7701)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [185]  [ 600/1251]  eta: 0:02:38  lr: 0.001437  min_lr: 0.001437  loss: 3.2536 (2.9060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.7636)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [185]  [ 800/1251]  eta: 0:01:49  lr: 0.001433  min_lr: 0.001433  loss: 2.6696 (2.9265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6863 (0.7632)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [185]  [1000/1251]  eta: 0:01:00  lr: 0.001430  min_lr: 0.001430  loss: 3.1833 (2.9351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6770 (0.7606)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [185]  [1200/1251]  eta: 0:00:12  lr: 0.001426  min_lr: 0.001426  loss: 2.3287 (2.9197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6970 (0.7637)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 2.9150 (2.9219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.7653)  time: 0.1971  data: 0.0009  max mem: 18117
Epoch: [185] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 2.9150 (2.9186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.7653)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6753 (0.6753)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.4261  data: 5.2568  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.8342 (0.8550)  acc1: 81.6000 (82.7273)  acc5: 97.6000 (96.9455)  time: 0.6523  data: 0.5353  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0500 (1.0091)  acc1: 78.4000 (79.3905)  acc5: 94.4000 (95.2571)  time: 0.1949  data: 0.0850  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0844 (1.0222)  acc1: 77.6000 (78.9280)  acc5: 94.0000 (95.0560)  time: 0.2093  data: 0.0997  max mem: 18117
Test: Total time: 0:00:10 (0.4080 s / it)
* Acc@1 78.898 Acc@5 94.778 loss 1.015
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.98%
Epoch: [186]  [   0/1251]  eta: 1:09:08  lr: 0.001425  min_lr: 0.001425  loss: 3.6403 (3.6403)  weight_decay: 0.0500 (0.0500)  time: 3.3158  data: 2.9197  max mem: 18117
Epoch: [186]  [ 200/1251]  eta: 0:04:28  lr: 0.001422  min_lr: 0.001422  loss: 3.2261 (2.8731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7434 (0.7757)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [186]  [ 400/1251]  eta: 0:03:30  lr: 0.001419  min_lr: 0.001419  loss: 2.2977 (2.8239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7087 (0.7754)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [186]  [ 600/1251]  eta: 0:02:39  lr: 0.001415  min_lr: 0.001415  loss: 2.4285 (2.8651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7789 (0.7907)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [186]  [ 800/1251]  eta: 0:01:49  lr: 0.001412  min_lr: 0.001412  loss: 2.7672 (2.8887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (0.8421)  time: 0.2445  data: 0.0004  max mem: 18117
Epoch: [186]  [1000/1251]  eta: 0:01:01  lr: 0.001408  min_lr: 0.001408  loss: 3.0392 (2.9191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6764 (0.8161)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [186]  [1200/1251]  eta: 0:00:12  lr: 0.001405  min_lr: 0.001405  loss: 2.3584 (2.9155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.8008)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 3.3394 (2.9168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7260 (0.8002)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [186] Total time: 0:05:03 (0.2426 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 3.3394 (2.9121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7260 (0.8002)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6635 (0.6635)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.6805  data: 5.5352  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8231 (0.8455)  acc1: 84.0000 (82.6182)  acc5: 96.8000 (96.5818)  time: 0.7421  data: 0.6277  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0932 (1.0023)  acc1: 77.2000 (79.2571)  acc5: 94.0000 (94.8381)  time: 0.1916  data: 0.0812  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0932 (1.0133)  acc1: 76.4000 (78.8640)  acc5: 94.0000 (94.7840)  time: 0.2051  data: 0.0956  max mem: 18117
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 78.826 Acc@5 94.690 loss 1.009
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.98%
Epoch: [187]  [   0/1251]  eta: 1:07:38  lr: 0.001404  min_lr: 0.001404  loss: 2.0679 (2.0679)  weight_decay: 0.0500 (0.0500)  time: 3.2441  data: 1.7820  max mem: 18117
Epoch: [187]  [ 200/1251]  eta: 0:04:26  lr: 0.001401  min_lr: 0.001401  loss: 2.3540 (2.8587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.7852)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [187]  [ 400/1251]  eta: 0:03:29  lr: 0.001397  min_lr: 0.001397  loss: 2.2595 (2.8854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7789)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [187]  [ 600/1251]  eta: 0:02:38  lr: 0.001394  min_lr: 0.001394  loss: 3.0664 (2.8798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.7683)  time: 0.2385  data: 0.0009  max mem: 18117
Epoch: [187]  [ 800/1251]  eta: 0:01:49  lr: 0.001390  min_lr: 0.001390  loss: 2.4909 (2.8847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7122 (0.7615)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [187]  [1000/1251]  eta: 0:01:00  lr: 0.001387  min_lr: 0.001387  loss: 3.4778 (2.9077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7403 (0.7654)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [187]  [1200/1251]  eta: 0:00:12  lr: 0.001383  min_lr: 0.001383  loss: 3.0649 (2.9121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (nan)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 2.6808 (2.9103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (nan)  time: 0.1955  data: 0.0009  max mem: 18117
Epoch: [187] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 2.6808 (2.9048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6258 (0.6258)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.6966  data: 5.5581  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8255 (0.8213)  acc1: 84.8000 (83.2000)  acc5: 96.8000 (96.6182)  time: 0.7564  data: 0.6428  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0565 (0.9859)  acc1: 76.4000 (79.4667)  acc5: 94.4000 (94.7429)  time: 0.2046  data: 0.0950  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0696 (0.9969)  acc1: 76.4000 (79.1040)  acc5: 94.0000 (94.7200)  time: 0.2041  data: 0.0949  max mem: 18117
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 79.168 Acc@5 94.712 loss 0.994
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.17%
Epoch: [188]  [   0/1251]  eta: 0:57:53  lr: 0.001383  min_lr: 0.001383  loss: 2.1815 (2.1815)  weight_decay: 0.0500 (0.0500)  time: 2.7769  data: 2.4366  max mem: 18117
Epoch: [188]  [ 200/1251]  eta: 0:04:25  lr: 0.001379  min_lr: 0.001379  loss: 2.7814 (2.9778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7426 (0.8378)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [188]  [ 400/1251]  eta: 0:03:29  lr: 0.001376  min_lr: 0.001376  loss: 3.3071 (2.9788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7054 (0.7773)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [188]  [ 600/1251]  eta: 0:02:38  lr: 0.001372  min_lr: 0.001372  loss: 3.0718 (2.9542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8525 (0.7782)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [188]  [ 800/1251]  eta: 0:01:49  lr: 0.001369  min_lr: 0.001369  loss: 2.2266 (2.9551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7326 (0.7804)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [188]  [1000/1251]  eta: 0:01:00  lr: 0.001366  min_lr: 0.001366  loss: 2.2096 (2.9499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7860)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [188]  [1200/1251]  eta: 0:00:12  lr: 0.001362  min_lr: 0.001362  loss: 2.5977 (2.9474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8420 (0.7867)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 2.9356 (2.9504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8420 (0.7873)  time: 0.1958  data: 0.0005  max mem: 18117
Epoch: [188] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 2.9356 (2.9090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8420 (0.7873)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6357 (0.6357)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6009  data: 5.4494  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8043 (0.8040)  acc1: 85.2000 (83.1636)  acc5: 96.8000 (96.7273)  time: 0.7607  data: 0.6469  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0158 (0.9685)  acc1: 77.2000 (79.6762)  acc5: 94.8000 (94.9905)  time: 0.2244  data: 0.1152  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0593 (0.9833)  acc1: 76.8000 (79.1040)  acc5: 94.8000 (94.9920)  time: 0.2242  data: 0.1152  max mem: 18117
Test: Total time: 0:00:10 (0.4244 s / it)
* Acc@1 79.062 Acc@5 94.818 loss 0.974
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.17%
Epoch: [189]  [   0/1251]  eta: 1:02:16  lr: 0.001361  min_lr: 0.001361  loss: 2.9442 (2.9442)  weight_decay: 0.0500 (0.0500)  time: 2.9865  data: 2.5344  max mem: 18117
Epoch: [189]  [ 200/1251]  eta: 0:04:27  lr: 0.001358  min_lr: 0.001358  loss: 2.3013 (2.8305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7669 (0.7292)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [189]  [ 400/1251]  eta: 0:03:29  lr: 0.001355  min_lr: 0.001355  loss: 2.4741 (2.8674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6653 (0.7535)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [189]  [ 600/1251]  eta: 0:02:38  lr: 0.001351  min_lr: 0.001351  loss: 2.4616 (2.8595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6597 (0.7399)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [189]  [ 800/1251]  eta: 0:01:49  lr: 0.001348  min_lr: 0.001348  loss: 2.8306 (2.8999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.7482)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [189]  [1000/1251]  eta: 0:01:00  lr: 0.001344  min_lr: 0.001344  loss: 3.4017 (2.9076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7975 (0.7556)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [189]  [1200/1251]  eta: 0:00:12  lr: 0.001341  min_lr: 0.001341  loss: 2.4121 (2.9045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8239 (0.7776)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 2.7614 (2.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7207 (0.7749)  time: 0.2004  data: 0.0006  max mem: 18117
Epoch: [189] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 2.7614 (2.9027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7207 (0.7749)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5513 (0.5513)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.6256  data: 5.4799  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7943 (0.7708)  acc1: 83.2000 (83.0545)  acc5: 96.8000 (96.9818)  time: 0.7377  data: 0.6234  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9845 (0.9400)  acc1: 77.6000 (79.2952)  acc5: 94.8000 (94.9143)  time: 0.2013  data: 0.0916  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0202 (0.9493)  acc1: 77.6000 (78.9440)  acc5: 94.0000 (94.8000)  time: 0.2004  data: 0.0916  max mem: 18117
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 79.028 Acc@5 94.828 loss 0.934
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.17%
Epoch: [190]  [   0/1251]  eta: 1:09:28  lr: 0.001340  min_lr: 0.001340  loss: 3.4775 (3.4775)  weight_decay: 0.0500 (0.0500)  time: 3.3322  data: 2.4401  max mem: 18117
Epoch: [190]  [ 200/1251]  eta: 0:04:29  lr: 0.001337  min_lr: 0.001337  loss: 2.2500 (2.8813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (0.7955)  time: 0.2427  data: 0.0004  max mem: 18117
Epoch: [190]  [ 400/1251]  eta: 0:03:30  lr: 0.001333  min_lr: 0.001333  loss: 2.7420 (2.8741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7124 (0.8052)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [190]  [ 600/1251]  eta: 0:02:38  lr: 0.001330  min_lr: 0.001330  loss: 2.4214 (2.8914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7682 (0.8017)  time: 0.2381  data: 0.0009  max mem: 18117
Epoch: [190]  [ 800/1251]  eta: 0:01:49  lr: 0.001327  min_lr: 0.001327  loss: 2.6153 (2.8896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.8022)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [190]  [1000/1251]  eta: 0:01:00  lr: 0.001323  min_lr: 0.001323  loss: 2.4994 (2.8768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8192 (0.8001)  time: 0.2514  data: 0.0005  max mem: 18117
Epoch: [190]  [1200/1251]  eta: 0:00:12  lr: 0.001320  min_lr: 0.001320  loss: 2.5121 (2.8781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7511 (0.8019)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 3.0570 (2.8871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.8007)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [190] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 3.0570 (2.8947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.8007)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6158 (0.6158)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.5128  data: 5.3880  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8122 (0.8237)  acc1: 84.8000 (83.3455)  acc5: 97.2000 (97.1273)  time: 0.7395  data: 0.6275  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0333 (0.9924)  acc1: 78.0000 (79.3905)  acc5: 94.0000 (94.7619)  time: 0.2095  data: 0.0997  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0139 (1.0013)  acc1: 77.2000 (78.9280)  acc5: 93.6000 (94.6400)  time: 0.2080  data: 0.0997  max mem: 18117
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 78.976 Acc@5 94.740 loss 0.994
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.17%
Epoch: [191]  [   0/1251]  eta: 1:09:35  lr: 0.001319  min_lr: 0.001319  loss: 3.6319 (3.6319)  weight_decay: 0.0500 (0.0500)  time: 3.3377  data: 2.9791  max mem: 18117
Epoch: [191]  [ 200/1251]  eta: 0:04:31  lr: 0.001316  min_lr: 0.001316  loss: 2.7415 (2.9105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7285 (0.7724)  time: 0.2419  data: 0.0003  max mem: 18117
Epoch: [191]  [ 400/1251]  eta: 0:03:32  lr: 0.001312  min_lr: 0.001312  loss: 2.2938 (2.8827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7235 (0.7615)  time: 0.2435  data: 0.0004  max mem: 18117
Epoch: [191]  [ 600/1251]  eta: 0:02:40  lr: 0.001309  min_lr: 0.001309  loss: 2.4596 (2.9108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.7868)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [191]  [ 800/1251]  eta: 0:01:50  lr: 0.001305  min_lr: 0.001305  loss: 2.5906 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7546 (0.8028)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [191]  [1000/1251]  eta: 0:01:01  lr: 0.001302  min_lr: 0.001302  loss: 2.9989 (2.9048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6648 (0.7843)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [191]  [1200/1251]  eta: 0:00:12  lr: 0.001299  min_lr: 0.001299  loss: 2.4708 (2.8903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6423 (0.7806)  time: 0.2426  data: 0.0004  max mem: 18117
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 2.6720 (2.8888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7073 (0.7799)  time: 0.1955  data: 0.0006  max mem: 18117
Epoch: [191] Total time: 0:05:03 (0.2427 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 2.6720 (2.8916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7073 (0.7799)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5779 (0.5779)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.8005  data: 5.6615  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7490 (0.7598)  acc1: 84.4000 (83.6000)  acc5: 97.2000 (96.9818)  time: 0.7702  data: 0.6574  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9365 (0.9285)  acc1: 77.6000 (79.9048)  acc5: 95.2000 (95.0667)  time: 0.2064  data: 0.0973  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0274 (0.9412)  acc1: 77.6000 (79.3120)  acc5: 94.4000 (94.8640)  time: 0.2057  data: 0.0972  max mem: 18117
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 79.276 Acc@5 94.886 loss 0.927
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.28%
Epoch: [192]  [   0/1251]  eta: 1:01:49  lr: 0.001298  min_lr: 0.001298  loss: 3.2673 (3.2673)  weight_decay: 0.0500 (0.0500)  time: 2.9649  data: 2.6501  max mem: 18117
Epoch: [192]  [ 200/1251]  eta: 0:04:26  lr: 0.001295  min_lr: 0.001295  loss: 2.2632 (2.8552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8033 (0.7942)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [192]  [ 400/1251]  eta: 0:03:29  lr: 0.001291  min_lr: 0.001291  loss: 3.0667 (2.8991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7852 (0.7908)  time: 0.2368  data: 0.0003  max mem: 18117
Epoch: [192]  [ 600/1251]  eta: 0:02:38  lr: 0.001288  min_lr: 0.001288  loss: 3.2310 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.7774)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [192]  [ 800/1251]  eta: 0:01:49  lr: 0.001284  min_lr: 0.001284  loss: 3.2880 (2.9184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9015 (0.7862)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [192]  [1000/1251]  eta: 0:01:00  lr: 0.001281  min_lr: 0.001281  loss: 2.7239 (2.9078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7357 (0.7808)  time: 0.2418  data: 0.0005  max mem: 18117
Epoch: [192]  [1200/1251]  eta: 0:00:12  lr: 0.001278  min_lr: 0.001278  loss: 2.8727 (2.9093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (0.7944)  time: 0.2418  data: 0.0005  max mem: 18117
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 2.3950 (2.9053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.7931)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [192] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 2.3950 (2.8990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.7931)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6497 (0.6497)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 5.3351  data: 5.2084  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7993 (0.8162)  acc1: 84.8000 (83.1636)  acc5: 97.6000 (96.9818)  time: 0.7494  data: 0.6367  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0160 (0.9886)  acc1: 76.4000 (79.4286)  acc5: 94.8000 (95.1238)  time: 0.2150  data: 0.1053  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0722 (1.0008)  acc1: 76.0000 (78.8960)  acc5: 94.0000 (94.8800)  time: 0.2134  data: 0.1052  max mem: 18117
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 79.162 Acc@5 94.844 loss 0.988
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.28%
Epoch: [193]  [   0/1251]  eta: 1:08:10  lr: 0.001277  min_lr: 0.001277  loss: 3.5439 (3.5439)  weight_decay: 0.0500 (0.0500)  time: 3.2697  data: 2.6535  max mem: 18117
Epoch: [193]  [ 200/1251]  eta: 0:04:27  lr: 0.001274  min_lr: 0.001274  loss: 2.8288 (2.8314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7168 (0.7361)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [193]  [ 400/1251]  eta: 0:03:30  lr: 0.001270  min_lr: 0.001270  loss: 2.5352 (2.8358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7967 (0.7853)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [193]  [ 600/1251]  eta: 0:02:39  lr: 0.001267  min_lr: 0.001267  loss: 2.6256 (2.8515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7409 (0.7810)  time: 0.2368  data: 0.0005  max mem: 18117
Epoch: [193]  [ 800/1251]  eta: 0:01:49  lr: 0.001264  min_lr: 0.001264  loss: 2.3493 (2.8524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.7766)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [193]  [1000/1251]  eta: 0:01:00  lr: 0.001260  min_lr: 0.001260  loss: 2.4719 (2.8556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.7862)  time: 0.2384  data: 0.0005  max mem: 18117
Epoch: [193]  [1200/1251]  eta: 0:00:12  lr: 0.001257  min_lr: 0.001257  loss: 2.2574 (2.8571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7328 (0.7848)  time: 0.2461  data: 0.0005  max mem: 18117
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 2.9276 (2.8557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8377 (0.7874)  time: 0.1954  data: 0.0008  max mem: 18117
Epoch: [193] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 2.9276 (2.8678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8377 (0.7874)
Test:  [ 0/25]  eta: 0:01:28  loss: 0.6349 (0.6349)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 3.5505  data: 3.4015  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.8370 (0.8200)  acc1: 83.6000 (83.4545)  acc5: 97.2000 (97.1636)  time: 0.6197  data: 0.5027  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0043 (0.9927)  acc1: 77.2000 (79.1429)  acc5: 94.4000 (94.8952)  time: 0.2616  data: 0.1486  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1338 (1.0069)  acc1: 76.4000 (78.6880)  acc5: 93.6000 (94.7840)  time: 0.2040  data: 0.0938  max mem: 18117
Test: Total time: 0:00:09 (0.3981 s / it)
* Acc@1 79.100 Acc@5 94.832 loss 1.003
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.28%
Epoch: [194]  [   0/1251]  eta: 1:04:56  lr: 0.001256  min_lr: 0.001256  loss: 1.8680 (1.8680)  weight_decay: 0.0500 (0.0500)  time: 3.1149  data: 1.6924  max mem: 18117
Epoch: [194]  [ 200/1251]  eta: 0:04:29  lr: 0.001253  min_lr: 0.001253  loss: 3.1241 (2.9072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (0.7806)  time: 0.2374  data: 0.0005  max mem: 18117
Epoch: [194]  [ 400/1251]  eta: 0:03:30  lr: 0.001249  min_lr: 0.001249  loss: 3.2417 (2.9104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6840 (0.7651)  time: 0.2377  data: 0.0005  max mem: 18117
Epoch: [194]  [ 600/1251]  eta: 0:02:39  lr: 0.001246  min_lr: 0.001246  loss: 2.8522 (2.8990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.7905)  time: 0.2365  data: 0.0003  max mem: 18117
Epoch: [194]  [ 800/1251]  eta: 0:01:49  lr: 0.001243  min_lr: 0.001243  loss: 2.2544 (2.8894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7311 (0.7941)  time: 0.2354  data: 0.0003  max mem: 18117
Epoch: [194]  [1000/1251]  eta: 0:01:00  lr: 0.001239  min_lr: 0.001239  loss: 3.2152 (2.8878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.7995)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [194]  [1200/1251]  eta: 0:00:12  lr: 0.001236  min_lr: 0.001236  loss: 2.4667 (2.8869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7934 (0.8020)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.2567 (2.8923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7863 (0.7999)  time: 0.1952  data: 0.0008  max mem: 18117
Epoch: [194] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.2567 (2.8851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7863 (0.7999)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6726 (0.6726)  acc1: 89.6000 (89.6000)  acc5: 97.2000 (97.2000)  time: 5.8822  data: 5.7421  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8330 (0.8518)  acc1: 83.2000 (83.0182)  acc5: 97.2000 (96.6546)  time: 0.7656  data: 0.6524  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0023 (1.0179)  acc1: 76.4000 (79.0857)  acc5: 94.0000 (94.7238)  time: 0.2110  data: 0.1018  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1084 (1.0278)  acc1: 76.0000 (78.5920)  acc5: 93.6000 (94.5280)  time: 0.2108  data: 0.1017  max mem: 18117
Test: Total time: 0:00:10 (0.4248 s / it)
* Acc@1 79.104 Acc@5 94.736 loss 1.014
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.28%
Epoch: [195]  [   0/1251]  eta: 1:04:10  lr: 0.001235  min_lr: 0.001235  loss: 2.0454 (2.0454)  weight_decay: 0.0500 (0.0500)  time: 3.0777  data: 2.4666  max mem: 18117
Epoch: [195]  [ 200/1251]  eta: 0:04:28  lr: 0.001232  min_lr: 0.001232  loss: 2.1658 (2.8637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (0.8424)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [195]  [ 400/1251]  eta: 0:03:30  lr: 0.001229  min_lr: 0.001229  loss: 3.0891 (2.8531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8812 (0.8456)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [195]  [ 600/1251]  eta: 0:02:39  lr: 0.001225  min_lr: 0.001225  loss: 2.7859 (2.8575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7147 (0.8121)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [195]  [ 800/1251]  eta: 0:01:49  lr: 0.001222  min_lr: 0.001222  loss: 2.7004 (2.8643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7049 (0.8099)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [195]  [1000/1251]  eta: 0:01:00  lr: 0.001219  min_lr: 0.001219  loss: 2.6886 (2.8622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (0.7959)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [195]  [1200/1251]  eta: 0:00:12  lr: 0.001215  min_lr: 0.001215  loss: 3.0569 (2.8725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.7946)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 2.6927 (2.8705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.7945)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [195] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 2.6927 (2.8794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.7945)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6835 (0.6835)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.7431  data: 5.6159  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8042 (0.8260)  acc1: 84.4000 (83.2727)  acc5: 97.2000 (96.9818)  time: 0.7378  data: 0.6266  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0055 (0.9933)  acc1: 77.2000 (79.3714)  acc5: 95.2000 (95.1429)  time: 0.1988  data: 0.0893  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1140 (1.0025)  acc1: 76.8000 (78.9600)  acc5: 94.4000 (95.0720)  time: 0.1983  data: 0.0892  max mem: 18117
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 79.176 Acc@5 94.890 loss 0.987
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.28%
Epoch: [196]  [   0/1251]  eta: 1:06:42  lr: 0.001215  min_lr: 0.001215  loss: 3.0602 (3.0602)  weight_decay: 0.0500 (0.0500)  time: 3.1997  data: 2.5925  max mem: 18117
Epoch: [196]  [ 200/1251]  eta: 0:04:26  lr: 0.001211  min_lr: 0.001211  loss: 2.9069 (2.8820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (0.8176)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [196]  [ 400/1251]  eta: 0:03:29  lr: 0.001208  min_lr: 0.001208  loss: 2.7981 (2.8894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (0.8465)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [196]  [ 600/1251]  eta: 0:02:38  lr: 0.001205  min_lr: 0.001205  loss: 2.8615 (2.8926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6594 (0.8071)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [196]  [ 800/1251]  eta: 0:01:49  lr: 0.001201  min_lr: 0.001201  loss: 2.3506 (2.8897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7528 (0.8118)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [196]  [1000/1251]  eta: 0:01:00  lr: 0.001198  min_lr: 0.001198  loss: 2.6441 (2.8654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.8167)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [196]  [1200/1251]  eta: 0:00:12  lr: 0.001195  min_lr: 0.001195  loss: 2.3112 (2.8699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8607 (0.8284)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 2.8776 (2.8697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.8280)  time: 0.1965  data: 0.0008  max mem: 18117
Epoch: [196] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 2.8776 (2.8810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.8280)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5785 (0.5785)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 5.4524  data: 5.3236  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7804 (0.7566)  acc1: 82.8000 (83.3091)  acc5: 97.6000 (97.0546)  time: 0.7497  data: 0.6364  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9162 (0.9177)  acc1: 76.4000 (79.6000)  acc5: 94.8000 (94.9333)  time: 0.2050  data: 0.0945  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0069 (0.9333)  acc1: 76.4000 (79.2160)  acc5: 93.2000 (94.7040)  time: 0.2103  data: 0.0996  max mem: 18117
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 79.348 Acc@5 94.722 loss 0.930
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.35%
Epoch: [197]  [   0/1251]  eta: 1:13:45  lr: 0.001194  min_lr: 0.001194  loss: 2.1223 (2.1223)  weight_decay: 0.0500 (0.0500)  time: 3.5379  data: 3.2931  max mem: 18117
Epoch: [197]  [ 200/1251]  eta: 0:04:27  lr: 0.001191  min_lr: 0.001191  loss: 2.9407 (2.9096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8109 (0.7968)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [197]  [ 400/1251]  eta: 0:03:29  lr: 0.001187  min_lr: 0.001187  loss: 3.1966 (2.9086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.8187)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [197]  [ 600/1251]  eta: 0:02:39  lr: 0.001184  min_lr: 0.001184  loss: 3.1989 (2.9028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7347 (0.8069)  time: 0.2450  data: 0.0004  max mem: 18117
Epoch: [197]  [ 800/1251]  eta: 0:01:49  lr: 0.001181  min_lr: 0.001181  loss: 2.7590 (2.8863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.8030)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [197]  [1000/1251]  eta: 0:01:00  lr: 0.001178  min_lr: 0.001178  loss: 2.5790 (2.8839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7333 (0.8080)  time: 0.2472  data: 0.0004  max mem: 18117
Epoch: [197]  [1200/1251]  eta: 0:00:12  lr: 0.001174  min_lr: 0.001174  loss: 2.8687 (2.8892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7189 (0.8024)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 2.4086 (2.8880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.8006)  time: 0.2025  data: 0.0006  max mem: 18117
Epoch: [197] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 2.4086 (2.8663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.8006)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6179 (0.6179)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.4555  data: 5.3342  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7772 (0.7662)  acc1: 84.4000 (83.0909)  acc5: 96.8000 (96.8000)  time: 0.6970  data: 0.5858  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9658 (0.9323)  acc1: 77.2000 (79.4095)  acc5: 94.4000 (94.8381)  time: 0.2096  data: 0.1006  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0642 (0.9416)  acc1: 76.4000 (79.1040)  acc5: 93.6000 (94.8000)  time: 0.2092  data: 0.1005  max mem: 18117
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 79.180 Acc@5 94.868 loss 0.931
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.35%
Epoch: [198]  [   0/1251]  eta: 1:05:45  lr: 0.001174  min_lr: 0.001174  loss: 2.0569 (2.0569)  weight_decay: 0.0500 (0.0500)  time: 3.1538  data: 2.6433  max mem: 18117
Epoch: [198]  [ 200/1251]  eta: 0:04:28  lr: 0.001170  min_lr: 0.001170  loss: 3.0584 (2.8099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.7932)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [198]  [ 400/1251]  eta: 0:03:30  lr: 0.001167  min_lr: 0.001167  loss: 2.5249 (2.8200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7206 (0.7982)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [198]  [ 600/1251]  eta: 0:02:39  lr: 0.001164  min_lr: 0.001164  loss: 2.3071 (2.8401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8073 (nan)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [198]  [ 800/1251]  eta: 0:01:49  lr: 0.001161  min_lr: 0.001161  loss: 2.8970 (2.8458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (nan)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [198]  [1000/1251]  eta: 0:01:00  lr: 0.001157  min_lr: 0.001157  loss: 2.8170 (2.8572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (nan)  time: 0.2377  data: 0.0003  max mem: 18117
Epoch: [198]  [1200/1251]  eta: 0:00:12  lr: 0.001154  min_lr: 0.001154  loss: 2.7962 (2.8607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (nan)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 2.5264 (2.8587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (nan)  time: 0.1969  data: 0.0007  max mem: 18117
Epoch: [198] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 2.5264 (2.8706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6041 (0.6041)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.6146  data: 5.4644  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7902 (0.7775)  acc1: 84.8000 (83.5636)  acc5: 97.2000 (96.8364)  time: 0.7273  data: 0.6126  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0574 (0.9487)  acc1: 78.0000 (79.5048)  acc5: 94.0000 (95.0286)  time: 0.1971  data: 0.0875  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0288 (0.9586)  acc1: 76.4000 (79.1840)  acc5: 94.0000 (94.9120)  time: 0.2002  data: 0.0915  max mem: 18117
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 79.508 Acc@5 94.918 loss 0.953
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.51%
Epoch: [199]  [   0/1251]  eta: 1:03:06  lr: 0.001153  min_lr: 0.001153  loss: 3.1907 (3.1907)  weight_decay: 0.0500 (0.0500)  time: 3.0265  data: 2.7253  max mem: 18117
Epoch: [199]  [ 200/1251]  eta: 0:04:26  lr: 0.001150  min_lr: 0.001150  loss: 2.2889 (2.8154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (0.8758)  time: 0.2375  data: 0.0005  max mem: 18117
Epoch: [199]  [ 400/1251]  eta: 0:03:29  lr: 0.001147  min_lr: 0.001147  loss: 2.7658 (2.8615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7658 (0.8679)  time: 0.2413  data: 0.0004  max mem: 18117
Epoch: [199]  [ 600/1251]  eta: 0:02:38  lr: 0.001143  min_lr: 0.001143  loss: 2.5285 (2.8664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8193 (0.8709)  time: 0.2395  data: 0.0005  max mem: 18117
Epoch: [199]  [ 800/1251]  eta: 0:01:49  lr: 0.001140  min_lr: 0.001140  loss: 2.4704 (2.8652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.8425)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [199]  [1000/1251]  eta: 0:01:00  lr: 0.001137  min_lr: 0.001137  loss: 3.4343 (2.8725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7327 (0.8290)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [199]  [1200/1251]  eta: 0:00:12  lr: 0.001134  min_lr: 0.001134  loss: 2.3323 (2.8729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8172 (0.8302)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 3.5325 (2.8738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8539 (0.8330)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [199] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 3.5325 (2.8604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8539 (0.8330)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6624 (0.6624)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.9350  data: 5.8061  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8380 (0.8173)  acc1: 85.2000 (84.4364)  acc5: 97.6000 (97.0545)  time: 0.7462  data: 0.6344  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0371 (1.0042)  acc1: 77.6000 (80.0000)  acc5: 94.4000 (94.9905)  time: 0.1904  data: 0.0813  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1112 (1.0138)  acc1: 76.8000 (79.6960)  acc5: 94.4000 (94.9920)  time: 0.1892  data: 0.0810  max mem: 18117
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 79.494 Acc@5 94.884 loss 1.003
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.51%
Epoch: [200]  [   0/1251]  eta: 1:06:07  lr: 0.001133  min_lr: 0.001133  loss: 2.0100 (2.0100)  weight_decay: 0.0500 (0.0500)  time: 3.1716  data: 2.3616  max mem: 18117
Epoch: [200]  [ 200/1251]  eta: 0:04:27  lr: 0.001130  min_lr: 0.001130  loss: 2.8451 (2.8268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7287 (0.7746)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [200]  [ 400/1251]  eta: 0:03:29  lr: 0.001126  min_lr: 0.001126  loss: 2.6645 (2.8314)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0416 (0.8385)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [200]  [ 600/1251]  eta: 0:02:38  lr: 0.001123  min_lr: 0.001123  loss: 2.4709 (2.8390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.8308)  time: 0.2355  data: 0.0005  max mem: 18117
Epoch: [200]  [ 800/1251]  eta: 0:01:49  lr: 0.001120  min_lr: 0.001120  loss: 2.9496 (2.8502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6843 (0.8243)  time: 0.2403  data: 0.0005  max mem: 18117
Epoch: [200]  [1000/1251]  eta: 0:01:00  lr: 0.001117  min_lr: 0.001117  loss: 2.4896 (2.8297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7663 (0.8168)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [200]  [1200/1251]  eta: 0:00:12  lr: 0.001114  min_lr: 0.001114  loss: 2.2762 (2.8345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8276 (0.8254)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.1670 (2.8340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.8267)  time: 0.1955  data: 0.0006  max mem: 18117
Epoch: [200] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.1670 (2.8538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.8267)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5845 (0.5845)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.7115  data: 5.5786  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7765 (0.7590)  acc1: 84.4000 (83.0909)  acc5: 97.2000 (96.9091)  time: 0.7115  data: 0.5973  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9347 (0.9199)  acc1: 77.6000 (79.4857)  acc5: 94.8000 (94.8762)  time: 0.1934  data: 0.0831  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9954 (0.9351)  acc1: 76.4000 (78.9280)  acc5: 93.6000 (94.7680)  time: 0.2048  data: 0.0963  max mem: 18117
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 79.378 Acc@5 94.976 loss 0.923
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.51%
Epoch: [201]  [   0/1251]  eta: 1:06:26  lr: 0.001113  min_lr: 0.001113  loss: 2.1378 (2.1378)  weight_decay: 0.0500 (0.0500)  time: 3.1865  data: 2.4328  max mem: 18117
Epoch: [201]  [ 200/1251]  eta: 0:04:28  lr: 0.001110  min_lr: 0.001110  loss: 3.0863 (2.9599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7952 (0.8544)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [201]  [ 400/1251]  eta: 0:03:31  lr: 0.001106  min_lr: 0.001106  loss: 2.4038 (2.8949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7459 (0.8685)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [201]  [ 600/1251]  eta: 0:02:40  lr: 0.001103  min_lr: 0.001103  loss: 2.5082 (2.8621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.8609)  time: 0.2439  data: 0.0004  max mem: 18117
Epoch: [201]  [ 800/1251]  eta: 0:01:50  lr: 0.001100  min_lr: 0.001100  loss: 2.3550 (2.8600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.8343)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [201]  [1000/1251]  eta: 0:01:01  lr: 0.001097  min_lr: 0.001097  loss: 2.7427 (2.8500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.8318)  time: 0.2370  data: 0.0005  max mem: 18117
Epoch: [201]  [1200/1251]  eta: 0:00:12  lr: 0.001094  min_lr: 0.001094  loss: 2.6868 (2.8433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8042 (0.8253)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 2.1979 (2.8438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.8249)  time: 0.1956  data: 0.0010  max mem: 18117
Epoch: [201] Total time: 0:05:02 (0.2422 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 2.1979 (2.8649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.8249)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5940 (0.5940)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.4913  data: 5.3346  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7917 (0.7986)  acc1: 84.4000 (83.5273)  acc5: 97.2000 (97.1636)  time: 0.7885  data: 0.6710  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9770 (0.9707)  acc1: 77.6000 (79.6952)  acc5: 94.4000 (95.3333)  time: 0.2245  data: 0.1128  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0929 (0.9821)  acc1: 76.8000 (79.2160)  acc5: 94.4000 (95.1520)  time: 0.2240  data: 0.1127  max mem: 18117
Test: Total time: 0:00:10 (0.4206 s / it)
* Acc@1 79.210 Acc@5 94.966 loss 0.969
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.51%
Epoch: [202]  [   0/1251]  eta: 1:09:29  lr: 0.001093  min_lr: 0.001093  loss: 3.6942 (3.6942)  weight_decay: 0.0500 (0.0500)  time: 3.3325  data: 2.8770  max mem: 18117
Epoch: [202]  [ 200/1251]  eta: 0:04:27  lr: 0.001090  min_lr: 0.001090  loss: 2.1812 (3.0459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8088 (0.8706)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [202]  [ 400/1251]  eta: 0:03:29  lr: 0.001086  min_lr: 0.001086  loss: 2.5602 (2.9569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.8202)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [202]  [ 600/1251]  eta: 0:02:38  lr: 0.001083  min_lr: 0.001083  loss: 2.5195 (2.9072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8420 (0.8085)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [202]  [ 800/1251]  eta: 0:01:49  lr: 0.001080  min_lr: 0.001080  loss: 2.5509 (2.8995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7505 (0.8195)  time: 0.2393  data: 0.0005  max mem: 18117
Epoch: [202]  [1000/1251]  eta: 0:01:00  lr: 0.001077  min_lr: 0.001077  loss: 2.6619 (2.8860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (0.8263)  time: 0.2361  data: 0.0003  max mem: 18117
Epoch: [202]  [1200/1251]  eta: 0:00:12  lr: 0.001074  min_lr: 0.001074  loss: 2.5616 (2.8709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7956 (0.8351)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 3.0007 (2.8698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.8337)  time: 0.1970  data: 0.0007  max mem: 18117
Epoch: [202] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 3.0007 (2.8549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.8337)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6130 (0.6130)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.6060  data: 5.4796  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7842 (0.7775)  acc1: 85.2000 (83.0909)  acc5: 97.6000 (97.0909)  time: 0.7456  data: 0.6322  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0395 (0.9433)  acc1: 78.0000 (79.8857)  acc5: 94.4000 (95.0476)  time: 0.2157  data: 0.1056  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0532 (0.9569)  acc1: 78.0000 (79.5520)  acc5: 94.0000 (94.8320)  time: 0.2137  data: 0.1055  max mem: 18117
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 79.444 Acc@5 94.918 loss 0.943
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.51%
Epoch: [203]  [   0/1251]  eta: 1:06:55  lr: 0.001073  min_lr: 0.001073  loss: 3.6329 (3.6329)  weight_decay: 0.0500 (0.0500)  time: 3.2101  data: 2.3372  max mem: 18117
Epoch: [203]  [ 200/1251]  eta: 0:04:27  lr: 0.001070  min_lr: 0.001070  loss: 3.1467 (2.8497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6821 (0.7966)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [203]  [ 400/1251]  eta: 0:03:29  lr: 0.001066  min_lr: 0.001066  loss: 2.8652 (2.8920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7173 (0.7986)  time: 0.2358  data: 0.0005  max mem: 18117
Epoch: [203]  [ 600/1251]  eta: 0:02:38  lr: 0.001063  min_lr: 0.001063  loss: 2.9404 (2.8876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8414 (0.8304)  time: 0.2380  data: 0.0005  max mem: 18117
Epoch: [203]  [ 800/1251]  eta: 0:01:49  lr: 0.001060  min_lr: 0.001060  loss: 2.7055 (2.8784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (0.8280)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [203]  [1000/1251]  eta: 0:01:00  lr: 0.001057  min_lr: 0.001057  loss: 3.2930 (2.8690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7025 (0.8222)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [203]  [1200/1251]  eta: 0:00:12  lr: 0.001054  min_lr: 0.001054  loss: 3.1786 (2.8684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7833 (0.8259)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 2.1704 (2.8652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.8245)  time: 0.1964  data: 0.0008  max mem: 18117
Epoch: [203] Total time: 0:05:01 (0.2407 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 2.1704 (2.8479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.8245)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6155 (0.6155)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.7490  data: 5.6221  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7248 (0.7645)  acc1: 85.6000 (83.4545)  acc5: 97.2000 (97.0909)  time: 0.7684  data: 0.6580  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0114 (0.9391)  acc1: 76.4000 (79.6000)  acc5: 94.8000 (94.9333)  time: 0.2160  data: 0.1073  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0502 (0.9492)  acc1: 76.4000 (79.3280)  acc5: 94.4000 (94.8320)  time: 0.2145  data: 0.1060  max mem: 18117
Test: Total time: 0:00:10 (0.4233 s / it)
* Acc@1 79.632 Acc@5 94.944 loss 0.928
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.63%
Epoch: [204]  [   0/1251]  eta: 1:04:41  lr: 0.001053  min_lr: 0.001053  loss: 2.1206 (2.1206)  weight_decay: 0.0500 (0.0500)  time: 3.1031  data: 2.8074  max mem: 18117
Epoch: [204]  [ 200/1251]  eta: 0:04:26  lr: 0.001050  min_lr: 0.001050  loss: 2.3447 (2.7721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (0.7970)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [204]  [ 400/1251]  eta: 0:03:29  lr: 0.001047  min_lr: 0.001047  loss: 2.8330 (2.8023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (0.8177)  time: 0.2410  data: 0.0005  max mem: 18117
Epoch: [204]  [ 600/1251]  eta: 0:02:38  lr: 0.001044  min_lr: 0.001044  loss: 3.0938 (2.8178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8234 (0.8334)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [204]  [ 800/1251]  eta: 0:01:49  lr: 0.001040  min_lr: 0.001040  loss: 2.2357 (2.7885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7565 (0.8218)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [204]  [1000/1251]  eta: 0:01:00  lr: 0.001037  min_lr: 0.001037  loss: 2.5683 (2.8021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.8294)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [204]  [1200/1251]  eta: 0:00:12  lr: 0.001034  min_lr: 0.001034  loss: 2.7135 (2.8172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8495 (0.8405)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 2.3432 (2.8136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8759 (0.8423)  time: 0.1952  data: 0.0005  max mem: 18117
Epoch: [204] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 2.3432 (2.8419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8759 (0.8423)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5984 (0.5984)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.7770  data: 5.6350  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7751 (0.7741)  acc1: 85.2000 (83.4546)  acc5: 97.2000 (97.0545)  time: 0.7399  data: 0.6262  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0462 (0.9421)  acc1: 76.8000 (79.8286)  acc5: 94.4000 (94.8952)  time: 0.1913  data: 0.0815  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0212 (0.9514)  acc1: 76.8000 (79.4880)  acc5: 94.0000 (94.8640)  time: 0.1993  data: 0.0892  max mem: 18117
Test: Total time: 0:00:10 (0.4118 s / it)
* Acc@1 79.674 Acc@5 94.984 loss 0.940
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.67%
Epoch: [205]  [   0/1251]  eta: 1:05:03  lr: 0.001033  min_lr: 0.001033  loss: 3.4608 (3.4608)  weight_decay: 0.0500 (0.0500)  time: 3.1201  data: 2.7731  max mem: 18117
Epoch: [205]  [ 200/1251]  eta: 0:04:24  lr: 0.001030  min_lr: 0.001030  loss: 2.2632 (2.7750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.8114)  time: 0.2365  data: 0.0003  max mem: 18117
Epoch: [205]  [ 400/1251]  eta: 0:03:28  lr: 0.001027  min_lr: 0.001027  loss: 2.6954 (2.8186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.7934)  time: 0.2356  data: 0.0003  max mem: 18117
Epoch: [205]  [ 600/1251]  eta: 0:02:38  lr: 0.001024  min_lr: 0.001024  loss: 2.7212 (2.8198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7887)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [205]  [ 800/1251]  eta: 0:01:49  lr: 0.001021  min_lr: 0.001021  loss: 2.3755 (2.8167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9345 (0.8296)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [205]  [1000/1251]  eta: 0:01:00  lr: 0.001018  min_lr: 0.001018  loss: 2.8222 (2.8304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9396 (nan)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [205]  [1200/1251]  eta: 0:00:12  lr: 0.001014  min_lr: 0.001014  loss: 2.5744 (2.8336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8357 (nan)  time: 0.2408  data: 0.0004  max mem: 18117
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 2.3505 (2.8328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8724 (nan)  time: 0.1962  data: 0.0007  max mem: 18117
Epoch: [205] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 2.3505 (2.8465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8724 (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6195 (0.6195)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.4116  data: 5.2527  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7673 (0.7715)  acc1: 83.6000 (83.2727)  acc5: 97.2000 (96.7636)  time: 0.7335  data: 0.6188  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0049 (0.9367)  acc1: 78.4000 (79.6381)  acc5: 94.0000 (94.8762)  time: 0.2145  data: 0.1052  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9775 (0.9423)  acc1: 78.0000 (79.4080)  acc5: 94.0000 (94.8480)  time: 0.2143  data: 0.1052  max mem: 18117
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 79.642 Acc@5 95.038 loss 0.925
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.67%
Epoch: [206]  [   0/1251]  eta: 1:06:44  lr: 0.001014  min_lr: 0.001014  loss: 2.0659 (2.0659)  weight_decay: 0.0500 (0.0500)  time: 3.2011  data: 1.6252  max mem: 18117
Epoch: [206]  [ 200/1251]  eta: 0:04:27  lr: 0.001011  min_lr: 0.001011  loss: 2.2998 (2.7937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7376 (0.7766)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [206]  [ 400/1251]  eta: 0:03:29  lr: 0.001007  min_lr: 0.001007  loss: 2.2693 (2.8349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9043 (0.8037)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [206]  [ 600/1251]  eta: 0:02:38  lr: 0.001004  min_lr: 0.001004  loss: 2.4618 (2.8364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7663 (0.8096)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [206]  [ 800/1251]  eta: 0:01:49  lr: 0.001001  min_lr: 0.001001  loss: 2.7102 (2.8465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8189 (0.8326)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [206]  [1000/1251]  eta: 0:01:00  lr: 0.000998  min_lr: 0.000998  loss: 2.5673 (2.8511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8318 (0.8441)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [206]  [1200/1251]  eta: 0:00:12  lr: 0.000995  min_lr: 0.000995  loss: 2.5992 (2.8450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.8484)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 2.2522 (2.8438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.8471)  time: 0.1950  data: 0.0008  max mem: 18117
Epoch: [206] Total time: 0:05:01 (0.2407 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 2.2522 (2.8408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.8471)
Test:  [ 0/25]  eta: 0:01:37  loss: 0.6037 (0.6037)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 3.9106  data: 3.7800  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7875 (0.7550)  acc1: 84.4000 (83.2727)  acc5: 96.8000 (97.1273)  time: 0.6699  data: 0.5576  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9429 (0.9212)  acc1: 77.2000 (79.5238)  acc5: 94.4000 (94.9524)  time: 0.2721  data: 0.1629  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9381 (0.9268)  acc1: 77.2000 (79.2800)  acc5: 94.0000 (94.9760)  time: 0.2234  data: 0.1149  max mem: 18117
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 79.556 Acc@5 95.030 loss 0.911
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.67%
Epoch: [207]  [   0/1251]  eta: 1:09:47  lr: 0.000994  min_lr: 0.000994  loss: 2.2212 (2.2212)  weight_decay: 0.0500 (0.0500)  time: 3.3471  data: 3.0493  max mem: 18117
Epoch: [207]  [ 200/1251]  eta: 0:04:26  lr: 0.000991  min_lr: 0.000991  loss: 2.5048 (2.7939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8157 (0.7832)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [207]  [ 400/1251]  eta: 0:03:29  lr: 0.000988  min_lr: 0.000988  loss: 3.0235 (2.8234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7992 (0.8058)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [207]  [ 600/1251]  eta: 0:02:38  lr: 0.000985  min_lr: 0.000985  loss: 2.9591 (2.8413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7515 (0.8066)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [207]  [ 800/1251]  eta: 0:01:49  lr: 0.000982  min_lr: 0.000982  loss: 2.3267 (2.8083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8013 (0.8189)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [207]  [1000/1251]  eta: 0:01:00  lr: 0.000979  min_lr: 0.000979  loss: 2.5563 (2.8011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.8269)  time: 0.2399  data: 0.0003  max mem: 18117
Epoch: [207]  [1200/1251]  eta: 0:00:12  lr: 0.000976  min_lr: 0.000976  loss: 3.2164 (2.8099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.8321)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 3.0078 (2.8109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8149 (0.8339)  time: 0.2001  data: 0.0008  max mem: 18117
Epoch: [207] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 3.0078 (2.8175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8149 (0.8339)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6587 (0.6587)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 5.6263  data: 5.4946  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8161 (0.7976)  acc1: 83.6000 (83.2364)  acc5: 97.2000 (97.2364)  time: 0.7359  data: 0.6227  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9585 (0.9603)  acc1: 77.2000 (79.8667)  acc5: 95.2000 (95.0857)  time: 0.2043  data: 0.0926  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0460 (0.9726)  acc1: 77.2000 (79.4880)  acc5: 94.0000 (94.9280)  time: 0.2073  data: 0.0971  max mem: 18117
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 79.750 Acc@5 95.058 loss 0.959
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.75%
Epoch: [208]  [   0/1251]  eta: 1:03:20  lr: 0.000975  min_lr: 0.000975  loss: 2.1334 (2.1334)  weight_decay: 0.0500 (0.0500)  time: 3.0379  data: 2.6853  max mem: 18117
Epoch: [208]  [ 200/1251]  eta: 0:04:27  lr: 0.000972  min_lr: 0.000972  loss: 2.3534 (2.8717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.8639)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [208]  [ 400/1251]  eta: 0:03:30  lr: 0.000969  min_lr: 0.000969  loss: 2.3063 (2.8718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7288 (0.8280)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [208]  [ 600/1251]  eta: 0:02:38  lr: 0.000966  min_lr: 0.000966  loss: 2.6799 (2.8514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7673 (0.8222)  time: 0.2387  data: 0.0005  max mem: 18117
Epoch: [208]  [ 800/1251]  eta: 0:01:49  lr: 0.000963  min_lr: 0.000963  loss: 2.6446 (2.8341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8304)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [208]  [1000/1251]  eta: 0:01:00  lr: 0.000960  min_lr: 0.000960  loss: 2.4552 (2.8394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (0.8377)  time: 0.2365  data: 0.0005  max mem: 18117
Epoch: [208]  [1200/1251]  eta: 0:00:12  lr: 0.000956  min_lr: 0.000956  loss: 2.3609 (2.8293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9799 (0.8502)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 3.0318 (2.8304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8061 (0.8516)  time: 0.1956  data: 0.0005  max mem: 18117
Epoch: [208] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 3.0318 (2.8243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8061 (0.8516)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6605 (0.6605)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.6412  data: 5.4945  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8545 (0.7994)  acc1: 84.4000 (83.4182)  acc5: 97.2000 (97.2727)  time: 0.7415  data: 0.6278  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9820 (0.9674)  acc1: 78.0000 (79.4857)  acc5: 94.8000 (95.0667)  time: 0.1889  data: 0.0797  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0514 (0.9802)  acc1: 75.6000 (79.0240)  acc5: 93.6000 (94.8800)  time: 0.1881  data: 0.0797  max mem: 18117
Test: Total time: 0:00:09 (0.3978 s / it)
* Acc@1 79.526 Acc@5 94.944 loss 0.965
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.75%
Epoch: [209]  [   0/1251]  eta: 1:10:33  lr: 0.000956  min_lr: 0.000956  loss: 3.7108 (3.7108)  weight_decay: 0.0500 (0.0500)  time: 3.3837  data: 3.0332  max mem: 18117
Epoch: [209]  [ 200/1251]  eta: 0:04:28  lr: 0.000953  min_lr: 0.000953  loss: 3.1934 (2.7894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7883 (0.8954)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [209]  [ 400/1251]  eta: 0:03:29  lr: 0.000950  min_lr: 0.000950  loss: 2.6955 (2.7730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7337 (0.8470)  time: 0.2383  data: 0.0005  max mem: 18117
Epoch: [209]  [ 600/1251]  eta: 0:02:39  lr: 0.000947  min_lr: 0.000947  loss: 2.3472 (2.8154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (0.8453)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [209]  [ 800/1251]  eta: 0:01:49  lr: 0.000944  min_lr: 0.000944  loss: 2.7879 (2.8120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8390 (0.8488)  time: 0.2450  data: 0.0004  max mem: 18117
Epoch: [209]  [1000/1251]  eta: 0:01:00  lr: 0.000940  min_lr: 0.000940  loss: 2.4433 (2.8200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8289 (0.8591)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [209]  [1200/1251]  eta: 0:00:12  lr: 0.000937  min_lr: 0.000937  loss: 2.7155 (2.8179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8716 (0.8671)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 2.1970 (2.8163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.8648)  time: 0.1958  data: 0.0010  max mem: 18117
Epoch: [209] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 2.1970 (2.8247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.8648)
Test:  [ 0/25]  eta: 0:01:32  loss: 0.5702 (0.5702)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 3.7093  data: 3.5547  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7370 (0.7284)  acc1: 83.6000 (83.6727)  acc5: 97.6000 (97.2727)  time: 0.6145  data: 0.4970  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9248 (0.9029)  acc1: 77.6000 (79.9048)  acc5: 94.8000 (95.1048)  time: 0.2713  data: 0.1603  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9586 (0.9142)  acc1: 77.2000 (79.3760)  acc5: 94.4000 (95.0080)  time: 0.2068  data: 0.0978  max mem: 18117
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 79.662 Acc@5 95.048 loss 0.901
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.75%
Epoch: [210]  [   0/1251]  eta: 1:08:53  lr: 0.000937  min_lr: 0.000937  loss: 3.5945 (3.5945)  weight_decay: 0.0500 (0.0500)  time: 3.3039  data: 2.6478  max mem: 18117
Epoch: [210]  [ 200/1251]  eta: 0:04:26  lr: 0.000934  min_lr: 0.000934  loss: 2.1593 (2.7081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8498 (0.8871)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [210]  [ 400/1251]  eta: 0:03:29  lr: 0.000931  min_lr: 0.000931  loss: 3.1557 (2.7698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8555 (0.8982)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [210]  [ 600/1251]  eta: 0:02:38  lr: 0.000928  min_lr: 0.000928  loss: 3.4405 (2.7867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8337 (0.8843)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [210]  [ 800/1251]  eta: 0:01:49  lr: 0.000925  min_lr: 0.000925  loss: 2.8117 (2.7890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7603 (0.8745)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [210]  [1000/1251]  eta: 0:01:00  lr: 0.000922  min_lr: 0.000922  loss: 3.0088 (2.8015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7524 (0.8651)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [210]  [1200/1251]  eta: 0:00:12  lr: 0.000918  min_lr: 0.000918  loss: 2.6604 (2.8112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8051 (0.8710)  time: 0.2433  data: 0.0004  max mem: 18117
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 2.4102 (2.8193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8499 (0.8769)  time: 0.1971  data: 0.0010  max mem: 18117
Epoch: [210] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 2.4102 (2.8295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8499 (0.8769)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.6213 (0.6213)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 4.6235  data: 4.4985  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7705 (0.7500)  acc1: 84.8000 (83.5273)  acc5: 97.2000 (97.3818)  time: 0.6339  data: 0.5179  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9436 (0.9151)  acc1: 77.2000 (79.7905)  acc5: 96.0000 (95.3524)  time: 0.2156  data: 0.1028  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0103 (0.9259)  acc1: 76.4000 (79.3120)  acc5: 94.0000 (95.2480)  time: 0.2417  data: 0.1323  max mem: 18117
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 79.754 Acc@5 95.148 loss 0.918
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.75%
Epoch: [211]  [   0/1251]  eta: 1:11:09  lr: 0.000918  min_lr: 0.000918  loss: 1.9434 (1.9434)  weight_decay: 0.0500 (0.0500)  time: 3.4127  data: 3.1386  max mem: 18117
Epoch: [211]  [ 200/1251]  eta: 0:04:28  lr: 0.000915  min_lr: 0.000915  loss: 2.6483 (2.7387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.8852)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [211]  [ 400/1251]  eta: 0:03:31  lr: 0.000912  min_lr: 0.000912  loss: 2.8432 (2.7821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8168 (0.8619)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [211]  [ 600/1251]  eta: 0:02:39  lr: 0.000909  min_lr: 0.000909  loss: 3.1949 (2.8267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8526)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [211]  [ 800/1251]  eta: 0:01:49  lr: 0.000906  min_lr: 0.000906  loss: 2.0893 (2.8023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8487 (0.8621)  time: 0.2350  data: 0.0004  max mem: 18117
Epoch: [211]  [1000/1251]  eta: 0:01:00  lr: 0.000903  min_lr: 0.000903  loss: 3.3409 (2.8076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.8786)  time: 0.2473  data: 0.0004  max mem: 18117
Epoch: [211]  [1200/1251]  eta: 0:00:12  lr: 0.000900  min_lr: 0.000900  loss: 2.6380 (2.8220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7723 (0.8671)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 3.2512 (2.8323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.8614)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [211] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 3.2512 (2.8111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.8614)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6800 (0.6800)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.7358  data: 5.6111  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8787 (0.8377)  acc1: 84.0000 (83.6000)  acc5: 97.2000 (96.9091)  time: 0.7513  data: 0.6400  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0094 (0.9898)  acc1: 77.2000 (79.8857)  acc5: 95.6000 (95.3333)  time: 0.2029  data: 0.0939  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0693 (0.9996)  acc1: 77.2000 (79.5680)  acc5: 94.8000 (95.2320)  time: 0.2039  data: 0.0957  max mem: 18117
Test: Total time: 0:00:10 (0.4150 s / it)
* Acc@1 79.648 Acc@5 95.182 loss 0.995
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.75%
Epoch: [212]  [   0/1251]  eta: 1:07:46  lr: 0.000899  min_lr: 0.000899  loss: 3.4366 (3.4366)  weight_decay: 0.0500 (0.0500)  time: 3.2509  data: 2.7181  max mem: 18117
Epoch: [212]  [ 200/1251]  eta: 0:04:27  lr: 0.000896  min_lr: 0.000896  loss: 2.2845 (2.6878)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [212]  [ 400/1251]  eta: 0:03:29  lr: 0.000893  min_lr: 0.000893  loss: 2.3686 (2.7447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8237 (nan)  time: 0.2370  data: 0.0003  max mem: 18117
Epoch: [212]  [ 600/1251]  eta: 0:02:38  lr: 0.000890  min_lr: 0.000890  loss: 2.2650 (2.7572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9041 (nan)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [212]  [ 800/1251]  eta: 0:01:49  lr: 0.000887  min_lr: 0.000887  loss: 3.0203 (2.7639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8381 (nan)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [212]  [1000/1251]  eta: 0:01:00  lr: 0.000884  min_lr: 0.000884  loss: 2.5724 (2.7741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (nan)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [212]  [1200/1251]  eta: 0:00:12  lr: 0.000881  min_lr: 0.000881  loss: 2.0165 (2.7708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8584 (nan)  time: 0.2422  data: 0.0004  max mem: 18117
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 3.1359 (2.7748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (nan)  time: 0.1964  data: 0.0011  max mem: 18117
Epoch: [212] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 3.1359 (2.8087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6173 (0.6173)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.7226  data: 5.5955  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7940 (0.7972)  acc1: 85.2000 (83.7455)  acc5: 97.2000 (97.2364)  time: 0.7353  data: 0.6214  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0437 (0.9651)  acc1: 77.6000 (80.0571)  acc5: 95.2000 (95.2762)  time: 0.2016  data: 0.0906  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0587 (0.9736)  acc1: 77.2000 (79.7440)  acc5: 94.8000 (95.2000)  time: 0.2020  data: 0.0905  max mem: 18117
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 79.922 Acc@5 95.080 loss 0.966
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.92%
Epoch: [213]  [   0/1251]  eta: 1:06:52  lr: 0.000880  min_lr: 0.000880  loss: 2.0586 (2.0586)  weight_decay: 0.0500 (0.0500)  time: 3.2075  data: 2.8987  max mem: 18117
Epoch: [213]  [ 200/1251]  eta: 0:04:26  lr: 0.000877  min_lr: 0.000877  loss: 2.2646 (2.8041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8080 (0.8648)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [213]  [ 400/1251]  eta: 0:03:28  lr: 0.000874  min_lr: 0.000874  loss: 2.8091 (2.8560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7751 (0.8300)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [213]  [ 600/1251]  eta: 0:02:38  lr: 0.000871  min_lr: 0.000871  loss: 2.3148 (2.8375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9867 (0.8811)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [213]  [ 800/1251]  eta: 0:01:49  lr: 0.000868  min_lr: 0.000868  loss: 2.5008 (2.8231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8456 (0.8935)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [213]  [1000/1251]  eta: 0:01:00  lr: 0.000865  min_lr: 0.000865  loss: 2.2699 (2.8134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8011 (0.8743)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [213]  [1200/1251]  eta: 0:00:12  lr: 0.000863  min_lr: 0.000863  loss: 2.7690 (2.8191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.8657)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 2.5092 (2.8210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.8627)  time: 0.1957  data: 0.0005  max mem: 18117
Epoch: [213] Total time: 0:05:01 (0.2408 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 2.5092 (2.8108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.8627)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6166 (0.6166)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.6163  data: 5.4907  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7754 (0.7629)  acc1: 83.6000 (84.1455)  acc5: 96.8000 (97.1273)  time: 0.7615  data: 0.6486  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9767 (0.9243)  acc1: 78.4000 (80.1333)  acc5: 94.8000 (95.3524)  time: 0.2128  data: 0.1025  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9913 (0.9311)  acc1: 76.8000 (79.7280)  acc5: 94.8000 (95.3440)  time: 0.2122  data: 0.1023  max mem: 18117
Test: Total time: 0:00:10 (0.4155 s / it)
* Acc@1 79.984 Acc@5 95.112 loss 0.924
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 79.98%
Epoch: [214]  [   0/1251]  eta: 1:04:30  lr: 0.000862  min_lr: 0.000862  loss: 3.8059 (3.8059)  weight_decay: 0.0500 (0.0500)  time: 3.0937  data: 2.7529  max mem: 18117
Epoch: [214]  [ 200/1251]  eta: 0:04:26  lr: 0.000859  min_lr: 0.000859  loss: 2.6226 (2.7971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8741 (0.9124)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [214]  [ 400/1251]  eta: 0:03:30  lr: 0.000856  min_lr: 0.000856  loss: 2.9829 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7919 (0.8655)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [214]  [ 600/1251]  eta: 0:02:39  lr: 0.000853  min_lr: 0.000853  loss: 3.0047 (2.8125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8041 (0.8712)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [214]  [ 800/1251]  eta: 0:01:49  lr: 0.000850  min_lr: 0.000850  loss: 2.6045 (2.8044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8162 (0.8615)  time: 0.2447  data: 0.0004  max mem: 18117
Epoch: [214]  [1000/1251]  eta: 0:01:00  lr: 0.000847  min_lr: 0.000847  loss: 2.4438 (2.8058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8650 (0.8728)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [214]  [1200/1251]  eta: 0:00:12  lr: 0.000844  min_lr: 0.000844  loss: 2.8142 (2.7990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7746 (0.8634)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 2.4873 (2.7950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (0.8629)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [214] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 2.4873 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (0.8629)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6252 (0.6252)  acc1: 86.8000 (86.8000)  acc5: 99.2000 (99.2000)  time: 5.6130  data: 5.4859  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7660 (0.7681)  acc1: 84.4000 (83.7091)  acc5: 97.2000 (97.2364)  time: 0.7662  data: 0.6522  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9798 (0.9280)  acc1: 77.2000 (79.8095)  acc5: 95.2000 (95.3714)  time: 0.2146  data: 0.1032  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0015 (0.9373)  acc1: 76.4000 (79.3760)  acc5: 94.4000 (95.2960)  time: 0.2125  data: 0.1031  max mem: 18117
Test: Total time: 0:00:10 (0.4175 s / it)
* Acc@1 79.902 Acc@5 95.216 loss 0.922
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.98%
Epoch: [215]  [   0/1251]  eta: 1:05:02  lr: 0.000843  min_lr: 0.000843  loss: 2.8500 (2.8500)  weight_decay: 0.0500 (0.0500)  time: 3.1193  data: 2.1551  max mem: 18117
Epoch: [215]  [ 200/1251]  eta: 0:04:27  lr: 0.000841  min_lr: 0.000841  loss: 2.4349 (2.8476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (1.0061)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [215]  [ 400/1251]  eta: 0:03:29  lr: 0.000838  min_lr: 0.000838  loss: 2.3452 (2.8050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8389 (0.9142)  time: 0.2377  data: 0.0003  max mem: 18117
Epoch: [215]  [ 600/1251]  eta: 0:02:38  lr: 0.000835  min_lr: 0.000835  loss: 2.4236 (2.7973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8160 (0.8857)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [215]  [ 800/1251]  eta: 0:01:49  lr: 0.000832  min_lr: 0.000832  loss: 2.7626 (2.7898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8082 (0.8744)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [215]  [1000/1251]  eta: 0:01:00  lr: 0.000829  min_lr: 0.000829  loss: 2.9871 (2.7876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8489 (0.8760)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [215]  [1200/1251]  eta: 0:00:12  lr: 0.000826  min_lr: 0.000826  loss: 2.1102 (2.7800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (0.8767)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 2.6396 (2.7802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7593 (0.8729)  time: 0.1966  data: 0.0007  max mem: 18117
Epoch: [215] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 2.6396 (2.7879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7593 (0.8729)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6401 (0.6401)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.3699  data: 5.2426  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8067 (0.7931)  acc1: 84.8000 (83.4182)  acc5: 97.2000 (97.1273)  time: 0.7544  data: 0.6430  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9924 (0.9479)  acc1: 76.4000 (79.7714)  acc5: 95.6000 (95.2000)  time: 0.2361  data: 0.1271  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0094 (0.9566)  acc1: 76.0000 (79.4400)  acc5: 94.0000 (95.1040)  time: 0.2352  data: 0.1270  max mem: 18117
Test: Total time: 0:00:10 (0.4252 s / it)
* Acc@1 79.904 Acc@5 95.130 loss 0.940
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.98%
Epoch: [216]  [   0/1251]  eta: 1:07:12  lr: 0.000825  min_lr: 0.000825  loss: 2.1248 (2.1248)  weight_decay: 0.0500 (0.0500)  time: 3.2235  data: 2.3391  max mem: 18117
Epoch: [216]  [ 200/1251]  eta: 0:04:27  lr: 0.000822  min_lr: 0.000822  loss: 2.4360 (2.7424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9590 (0.8861)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [216]  [ 400/1251]  eta: 0:03:29  lr: 0.000819  min_lr: 0.000819  loss: 2.5751 (2.7849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9010 (0.9218)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [216]  [ 600/1251]  eta: 0:02:38  lr: 0.000817  min_lr: 0.000817  loss: 2.2249 (2.7622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (0.9191)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [216]  [ 800/1251]  eta: 0:01:49  lr: 0.000814  min_lr: 0.000814  loss: 2.4032 (2.7721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8228 (0.9031)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [216]  [1000/1251]  eta: 0:01:00  lr: 0.000811  min_lr: 0.000811  loss: 2.4903 (2.7617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8449 (0.8876)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [216]  [1200/1251]  eta: 0:00:12  lr: 0.000808  min_lr: 0.000808  loss: 2.2496 (2.7580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (0.8907)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.8472 (2.7610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7449 (0.8870)  time: 0.1949  data: 0.0006  max mem: 18117
Epoch: [216] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.8472 (2.7956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7449 (0.8870)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6628 (0.6628)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.8320  data: 5.7054  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7988 (0.7904)  acc1: 84.0000 (83.9273)  acc5: 97.2000 (97.4182)  time: 0.6693  data: 0.5572  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9399 (0.9529)  acc1: 78.0000 (80.3429)  acc5: 95.6000 (95.4857)  time: 0.1622  data: 0.0527  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0635 (0.9673)  acc1: 77.2000 (79.7120)  acc5: 94.4000 (95.3440)  time: 0.1872  data: 0.0780  max mem: 18117
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 79.832 Acc@5 95.170 loss 0.957
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.98%
Epoch: [217]  [   0/1251]  eta: 1:02:54  lr: 0.000807  min_lr: 0.000807  loss: 3.9193 (3.9193)  weight_decay: 0.0500 (0.0500)  time: 3.0176  data: 1.6515  max mem: 18117
Epoch: [217]  [ 200/1251]  eta: 0:04:27  lr: 0.000804  min_lr: 0.000804  loss: 2.5017 (2.8685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8982 (0.9615)  time: 0.2409  data: 0.0003  max mem: 18117
Epoch: [217]  [ 400/1251]  eta: 0:03:29  lr: 0.000801  min_lr: 0.000801  loss: 2.9506 (2.7748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.9006)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [217]  [ 600/1251]  eta: 0:02:38  lr: 0.000799  min_lr: 0.000799  loss: 2.2072 (2.7644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9340 (0.9191)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [217]  [ 800/1251]  eta: 0:01:49  lr: 0.000796  min_lr: 0.000796  loss: 3.1111 (2.7688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8679 (0.9136)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [217]  [1000/1251]  eta: 0:01:00  lr: 0.000793  min_lr: 0.000793  loss: 2.2853 (2.7812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8649 (0.8963)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [217]  [1200/1251]  eta: 0:00:12  lr: 0.000790  min_lr: 0.000790  loss: 2.6694 (2.7717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.8904)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 2.1747 (2.7714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.8909)  time: 0.1957  data: 0.0010  max mem: 18117
Epoch: [217] Total time: 0:05:01 (0.2406 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 2.1747 (2.7863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.8909)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6344 (0.6344)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.9316  data: 5.7896  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7533 (0.7476)  acc1: 86.0000 (84.3636)  acc5: 97.2000 (97.0546)  time: 0.7401  data: 0.6248  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9596 (0.9045)  acc1: 80.0000 (80.5333)  acc5: 95.6000 (95.4667)  time: 0.1920  data: 0.0789  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9695 (0.9166)  acc1: 77.6000 (79.9680)  acc5: 95.2000 (95.3440)  time: 0.1911  data: 0.0788  max mem: 18117
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 79.998 Acc@5 95.240 loss 0.905
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.00%
Epoch: [218]  [   0/1251]  eta: 1:06:10  lr: 0.000789  min_lr: 0.000789  loss: 2.1861 (2.1861)  weight_decay: 0.0500 (0.0500)  time: 3.1742  data: 2.8552  max mem: 18117
Epoch: [218]  [ 200/1251]  eta: 0:04:30  lr: 0.000786  min_lr: 0.000786  loss: 2.7964 (2.6626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8068 (0.8568)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [218]  [ 400/1251]  eta: 0:03:31  lr: 0.000784  min_lr: 0.000784  loss: 2.7503 (2.6760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8698 (0.9188)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [218]  [ 600/1251]  eta: 0:02:39  lr: 0.000781  min_lr: 0.000781  loss: 2.0314 (2.7142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9245 (0.9115)  time: 0.2446  data: 0.0004  max mem: 18117
Epoch: [218]  [ 800/1251]  eta: 0:01:50  lr: 0.000778  min_lr: 0.000778  loss: 2.4406 (2.7442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.8961)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [218]  [1000/1251]  eta: 0:01:01  lr: 0.000775  min_lr: 0.000775  loss: 2.4574 (2.7612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8565 (0.9002)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [218]  [1200/1251]  eta: 0:00:12  lr: 0.000772  min_lr: 0.000772  loss: 3.3299 (2.7645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8700 (0.8996)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 2.9483 (2.7695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8913 (0.9015)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [218] Total time: 0:05:03 (0.2429 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 2.9483 (2.7911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8913 (0.9015)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6192 (0.6192)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5678  data: 5.4432  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8108 (0.7874)  acc1: 83.2000 (83.4182)  acc5: 97.2000 (97.2000)  time: 0.7387  data: 0.6272  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9707 (0.9428)  acc1: 78.4000 (80.2286)  acc5: 95.6000 (95.4667)  time: 0.2013  data: 0.0923  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0794 (0.9567)  acc1: 77.6000 (79.6960)  acc5: 95.2000 (95.3920)  time: 0.2167  data: 0.1080  max mem: 18117
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 79.996 Acc@5 95.358 loss 0.943
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.00%
Epoch: [219]  [   0/1251]  eta: 1:13:14  lr: 0.000771  min_lr: 0.000771  loss: 1.7735 (1.7735)  weight_decay: 0.0500 (0.0500)  time: 3.5129  data: 1.6225  max mem: 18117
Epoch: [219]  [ 200/1251]  eta: 0:04:27  lr: 0.000769  min_lr: 0.000769  loss: 2.4231 (2.8231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9101 (0.9018)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [219]  [ 400/1251]  eta: 0:03:29  lr: 0.000766  min_lr: 0.000766  loss: 2.2684 (2.7982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (0.8934)  time: 0.2365  data: 0.0005  max mem: 18117
Epoch: [219]  [ 600/1251]  eta: 0:02:38  lr: 0.000763  min_lr: 0.000763  loss: 2.9828 (2.7944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9292 (0.9068)  time: 0.2394  data: 0.0005  max mem: 18117
Epoch: [219]  [ 800/1251]  eta: 0:01:49  lr: 0.000760  min_lr: 0.000760  loss: 2.5710 (2.7804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (0.8920)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [219]  [1000/1251]  eta: 0:01:00  lr: 0.000757  min_lr: 0.000757  loss: 2.7187 (2.7690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8331 (0.8831)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [219]  [1200/1251]  eta: 0:00:12  lr: 0.000755  min_lr: 0.000755  loss: 3.3830 (2.7714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9019 (0.8842)  time: 0.2387  data: 0.0003  max mem: 18117
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.1425 (2.7642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (0.8848)  time: 0.1957  data: 0.0005  max mem: 18117
Epoch: [219] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.1425 (2.7869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (0.8848)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6274 (0.6274)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.5076  data: 5.3734  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7291 (0.7226)  acc1: 85.2000 (83.8545)  acc5: 97.6000 (97.1273)  time: 0.7135  data: 0.6001  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9243 (0.8839)  acc1: 79.2000 (80.1143)  acc5: 94.8000 (95.3333)  time: 0.1941  data: 0.0845  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9409 (0.8901)  acc1: 77.6000 (79.8400)  acc5: 94.4000 (95.2640)  time: 0.1931  data: 0.0844  max mem: 18117
Test: Total time: 0:00:09 (0.3961 s / it)
* Acc@1 80.180 Acc@5 95.262 loss 0.884
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.18%
Epoch: [220]  [   0/1251]  eta: 0:55:56  lr: 0.000754  min_lr: 0.000754  loss: 3.6520 (3.6520)  weight_decay: 0.0500 (0.0500)  time: 2.6833  data: 2.3749  max mem: 18117
Epoch: [220]  [ 200/1251]  eta: 0:04:24  lr: 0.000751  min_lr: 0.000751  loss: 2.2957 (2.8135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9014 (nan)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [220]  [ 400/1251]  eta: 0:03:28  lr: 0.000748  min_lr: 0.000748  loss: 2.8187 (2.7828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8114 (nan)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [220]  [ 600/1251]  eta: 0:02:38  lr: 0.000745  min_lr: 0.000745  loss: 2.7435 (2.7768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8578 (nan)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [220]  [ 800/1251]  eta: 0:01:49  lr: 0.000743  min_lr: 0.000743  loss: 2.9219 (2.7747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7932 (nan)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [220]  [1000/1251]  eta: 0:01:00  lr: 0.000740  min_lr: 0.000740  loss: 2.7321 (2.7943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7816 (nan)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [220]  [1200/1251]  eta: 0:00:12  lr: 0.000737  min_lr: 0.000737  loss: 2.5034 (2.7831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8526 (nan)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.5489 (2.7875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8653 (nan)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [220] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.5489 (2.7840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8653 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6045 (0.6045)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.5660  data: 5.4414  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7608 (0.7546)  acc1: 85.2000 (83.7818)  acc5: 97.2000 (97.3455)  time: 0.7077  data: 0.5949  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9625 (0.9107)  acc1: 77.2000 (80.0191)  acc5: 95.6000 (95.5238)  time: 0.1934  data: 0.0835  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0273 (0.9204)  acc1: 76.4000 (79.5680)  acc5: 94.8000 (95.3280)  time: 0.2022  data: 0.0940  max mem: 18117
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 80.076 Acc@5 95.264 loss 0.907
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.18%
Epoch: [221]  [   0/1251]  eta: 1:06:00  lr: 0.000736  min_lr: 0.000736  loss: 2.5157 (2.5157)  weight_decay: 0.0500 (0.0500)  time: 3.1656  data: 2.8018  max mem: 18117
Epoch: [221]  [ 200/1251]  eta: 0:04:30  lr: 0.000734  min_lr: 0.000734  loss: 3.3351 (2.7218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.9600)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [221]  [ 400/1251]  eta: 0:03:30  lr: 0.000731  min_lr: 0.000731  loss: 2.9588 (2.7837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8498 (0.9183)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [221]  [ 600/1251]  eta: 0:02:39  lr: 0.000728  min_lr: 0.000728  loss: 3.5068 (2.7797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8489 (0.9010)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [221]  [ 800/1251]  eta: 0:01:50  lr: 0.000725  min_lr: 0.000725  loss: 2.6836 (2.7590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.8762)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [221]  [1000/1251]  eta: 0:01:00  lr: 0.000722  min_lr: 0.000722  loss: 2.3505 (2.7464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8732 (0.8845)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [221]  [1200/1251]  eta: 0:00:12  lr: 0.000720  min_lr: 0.000720  loss: 3.0694 (2.7612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8207 (0.8805)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.1739 (2.7557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8191 (0.8801)  time: 0.1960  data: 0.0006  max mem: 18117
Epoch: [221] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.1739 (2.7702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8191 (0.8801)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5698 (0.5698)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.8785  data: 5.7521  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7395 (0.7376)  acc1: 85.6000 (83.7455)  acc5: 97.6000 (97.2000)  time: 0.7333  data: 0.6214  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9667 (0.8886)  acc1: 78.0000 (80.1905)  acc5: 94.8000 (95.4667)  time: 0.1918  data: 0.0825  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9786 (0.8960)  acc1: 77.6000 (79.9040)  acc5: 94.4000 (95.4240)  time: 0.2057  data: 0.0975  max mem: 18117
Test: Total time: 0:00:10 (0.4217 s / it)
* Acc@1 80.106 Acc@5 95.296 loss 0.888
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.18%
Epoch: [222]  [   0/1251]  eta: 1:08:21  lr: 0.000719  min_lr: 0.000719  loss: 2.8424 (2.8424)  weight_decay: 0.0500 (0.0500)  time: 3.2788  data: 2.9082  max mem: 18117
Epoch: [222]  [ 200/1251]  eta: 0:04:27  lr: 0.000716  min_lr: 0.000716  loss: 2.3204 (2.7003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9387 (0.8638)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [222]  [ 400/1251]  eta: 0:03:29  lr: 0.000714  min_lr: 0.000714  loss: 2.1754 (2.6802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8020 (0.8700)  time: 0.2380  data: 0.0003  max mem: 18117
Epoch: [222]  [ 600/1251]  eta: 0:02:39  lr: 0.000711  min_lr: 0.000711  loss: 2.2148 (2.6821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8502 (0.8987)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [222]  [ 800/1251]  eta: 0:01:49  lr: 0.000708  min_lr: 0.000708  loss: 2.8994 (2.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [222]  [1000/1251]  eta: 0:01:00  lr: 0.000705  min_lr: 0.000705  loss: 3.2183 (2.7515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (nan)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [222]  [1200/1251]  eta: 0:00:12  lr: 0.000703  min_lr: 0.000703  loss: 3.2378 (2.7572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9325 (nan)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.2160 (2.7581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8811 (nan)  time: 0.1953  data: 0.0005  max mem: 18117
Epoch: [222] Total time: 0:05:00 (0.2406 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.2160 (2.7774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8811 (nan)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5802 (0.5802)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.7977  data: 5.6477  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7595 (0.7557)  acc1: 85.2000 (84.1455)  acc5: 96.8000 (97.1273)  time: 0.7685  data: 0.6538  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9523 (0.9098)  acc1: 77.6000 (80.1905)  acc5: 95.2000 (95.3905)  time: 0.2065  data: 0.0969  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9841 (0.9183)  acc1: 76.8000 (79.8880)  acc5: 94.8000 (95.2800)  time: 0.2058  data: 0.0968  max mem: 18117
Test: Total time: 0:00:10 (0.4184 s / it)
* Acc@1 80.116 Acc@5 95.222 loss 0.911
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.18%
Epoch: [223]  [   0/1251]  eta: 1:11:34  lr: 0.000702  min_lr: 0.000702  loss: 1.8979 (1.8979)  weight_decay: 0.0500 (0.0500)  time: 3.4328  data: 2.8584  max mem: 18117
Epoch: [223]  [ 200/1251]  eta: 0:04:28  lr: 0.000699  min_lr: 0.000699  loss: 2.9880 (2.7193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9038 (0.9299)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [223]  [ 400/1251]  eta: 0:03:30  lr: 0.000696  min_lr: 0.000696  loss: 2.5415 (2.7061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7597 (0.8719)  time: 0.2455  data: 0.0004  max mem: 18117
Epoch: [223]  [ 600/1251]  eta: 0:02:39  lr: 0.000694  min_lr: 0.000694  loss: 2.1802 (2.7049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8425 (0.8800)  time: 0.2427  data: 0.0004  max mem: 18117
Epoch: [223]  [ 800/1251]  eta: 0:01:49  lr: 0.000691  min_lr: 0.000691  loss: 2.6651 (2.7352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9217 (0.8799)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [223]  [1000/1251]  eta: 0:01:00  lr: 0.000688  min_lr: 0.000688  loss: 2.2252 (2.7357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (0.8782)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [223]  [1200/1251]  eta: 0:00:12  lr: 0.000686  min_lr: 0.000686  loss: 2.2762 (2.7388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7813 (0.8852)  time: 0.2446  data: 0.0004  max mem: 18117
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 2.8515 (2.7422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8101 (0.8854)  time: 0.1954  data: 0.0008  max mem: 18117
Epoch: [223] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 2.8515 (2.7633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8101 (0.8854)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6413 (0.6413)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.9398  data: 5.7979  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7677 (0.7780)  acc1: 84.0000 (83.8545)  acc5: 97.2000 (97.1636)  time: 0.7758  data: 0.6615  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9694 (0.9358)  acc1: 77.2000 (80.0381)  acc5: 95.2000 (95.3524)  time: 0.1963  data: 0.0864  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0152 (0.9469)  acc1: 76.8000 (79.6160)  acc5: 94.8000 (95.3280)  time: 0.1958  data: 0.0863  max mem: 18117
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 80.078 Acc@5 95.302 loss 0.937
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.18%
Epoch: [224]  [   0/1251]  eta: 1:09:49  lr: 0.000685  min_lr: 0.000685  loss: 2.2172 (2.2172)  weight_decay: 0.0500 (0.0500)  time: 3.3490  data: 2.9930  max mem: 18117
Epoch: [224]  [ 200/1251]  eta: 0:04:26  lr: 0.000682  min_lr: 0.000682  loss: 3.1945 (2.7898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9226 (0.9256)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [224]  [ 400/1251]  eta: 0:03:28  lr: 0.000680  min_lr: 0.000680  loss: 2.7888 (2.7635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8460 (0.9014)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [224]  [ 600/1251]  eta: 0:02:38  lr: 0.000677  min_lr: 0.000677  loss: 2.7459 (2.7503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9038 (0.9311)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [224]  [ 800/1251]  eta: 0:01:49  lr: 0.000674  min_lr: 0.000674  loss: 2.9372 (2.7600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.9199)  time: 0.2341  data: 0.0004  max mem: 18117
Epoch: [224]  [1000/1251]  eta: 0:01:00  lr: 0.000671  min_lr: 0.000671  loss: 2.3606 (2.7626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8404 (0.9076)  time: 0.2464  data: 0.0004  max mem: 18117
Epoch: [224]  [1200/1251]  eta: 0:00:12  lr: 0.000669  min_lr: 0.000669  loss: 2.7853 (2.7530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8284 (0.8990)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.0299 (2.7503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (0.8984)  time: 0.1967  data: 0.0017  max mem: 18117
Epoch: [224] Total time: 0:05:01 (0.2407 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.0299 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (0.8984)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5746 (0.5746)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.7352  data: 5.5879  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7134 (0.7153)  acc1: 85.2000 (83.6364)  acc5: 98.0000 (97.4182)  time: 0.7310  data: 0.6186  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9326 (0.8687)  acc1: 77.2000 (80.1143)  acc5: 94.8000 (95.6762)  time: 0.1885  data: 0.0794  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9330 (0.8777)  acc1: 77.2000 (79.7280)  acc5: 94.4000 (95.5840)  time: 0.1883  data: 0.0794  max mem: 18117
Test: Total time: 0:00:10 (0.4019 s / it)
* Acc@1 80.366 Acc@5 95.372 loss 0.869
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.37%
Epoch: [225]  [   0/1251]  eta: 1:11:33  lr: 0.000668  min_lr: 0.000668  loss: 2.0285 (2.0285)  weight_decay: 0.0500 (0.0500)  time: 3.4320  data: 3.1224  max mem: 18117
Epoch: [225]  [ 200/1251]  eta: 0:04:27  lr: 0.000665  min_lr: 0.000665  loss: 2.5746 (2.7952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8397 (0.9307)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [225]  [ 400/1251]  eta: 0:03:30  lr: 0.000663  min_lr: 0.000663  loss: 2.4228 (2.7803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8146 (0.8979)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [225]  [ 600/1251]  eta: 0:02:39  lr: 0.000660  min_lr: 0.000660  loss: 2.8011 (2.7665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8700 (0.8920)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [225]  [ 800/1251]  eta: 0:01:49  lr: 0.000657  min_lr: 0.000657  loss: 2.9710 (2.7725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7917 (0.8866)  time: 0.2375  data: 0.0005  max mem: 18117
Epoch: [225]  [1000/1251]  eta: 0:01:00  lr: 0.000655  min_lr: 0.000655  loss: 2.7607 (2.7914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8271 (0.8872)  time: 0.2390  data: 0.0005  max mem: 18117
Epoch: [225]  [1200/1251]  eta: 0:00:12  lr: 0.000652  min_lr: 0.000652  loss: 2.4311 (2.7818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8718 (0.9010)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.5334 (2.7820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8833 (0.9005)  time: 0.1954  data: 0.0005  max mem: 18117
Epoch: [225] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.5334 (2.7663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8833 (0.9005)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5910 (0.5910)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.6960  data: 5.5418  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7988 (0.7749)  acc1: 84.8000 (84.6546)  acc5: 97.6000 (97.4182)  time: 0.7298  data: 0.6174  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9721 (0.9413)  acc1: 78.8000 (80.8191)  acc5: 94.8000 (95.5048)  time: 0.1990  data: 0.0905  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0424 (0.9549)  acc1: 77.6000 (80.2720)  acc5: 94.4000 (95.4720)  time: 0.1987  data: 0.0904  max mem: 18117
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 80.354 Acc@5 95.218 loss 0.951
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.37%
Epoch: [226]  [   0/1251]  eta: 1:06:56  lr: 0.000651  min_lr: 0.000651  loss: 1.9375 (1.9375)  weight_decay: 0.0500 (0.0500)  time: 3.2110  data: 2.5254  max mem: 18117
Epoch: [226]  [ 200/1251]  eta: 0:04:25  lr: 0.000649  min_lr: 0.000649  loss: 2.9558 (2.7584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8853 (0.9045)  time: 0.2440  data: 0.0004  max mem: 18117
Epoch: [226]  [ 400/1251]  eta: 0:03:28  lr: 0.000646  min_lr: 0.000646  loss: 3.1456 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8808 (0.9470)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [226]  [ 600/1251]  eta: 0:02:38  lr: 0.000644  min_lr: 0.000644  loss: 3.1514 (2.7106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9408 (0.9522)  time: 0.2429  data: 0.0004  max mem: 18117
Epoch: [226]  [ 800/1251]  eta: 0:01:49  lr: 0.000641  min_lr: 0.000641  loss: 3.0949 (2.7144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8817 (0.9508)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [226]  [1000/1251]  eta: 0:01:00  lr: 0.000638  min_lr: 0.000638  loss: 2.3390 (2.7411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8917 (0.9413)  time: 0.2348  data: 0.0004  max mem: 18117
Epoch: [226]  [1200/1251]  eta: 0:00:12  lr: 0.000636  min_lr: 0.000636  loss: 2.3176 (2.7456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9168 (0.9368)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 2.5597 (2.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8685 (0.9327)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [226] Total time: 0:05:00 (0.2406 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 2.5597 (2.7631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8685 (0.9327)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6129 (0.6129)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.3874  data: 5.2590  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7975 (0.7619)  acc1: 84.0000 (84.0364)  acc5: 97.2000 (97.1273)  time: 0.6636  data: 0.5520  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9532 (0.9195)  acc1: 78.8000 (80.6476)  acc5: 94.8000 (95.4286)  time: 0.1790  data: 0.0696  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0035 (0.9285)  acc1: 78.4000 (80.2400)  acc5: 94.4000 (95.3600)  time: 0.1826  data: 0.0741  max mem: 18117
Test: Total time: 0:00:09 (0.3831 s / it)
* Acc@1 80.376 Acc@5 95.312 loss 0.921
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.38%
Epoch: [227]  [   0/1251]  eta: 1:13:16  lr: 0.000635  min_lr: 0.000635  loss: 3.9423 (3.9423)  weight_decay: 0.0500 (0.0500)  time: 3.5147  data: 3.2595  max mem: 18117
Epoch: [227]  [ 200/1251]  eta: 0:04:28  lr: 0.000632  min_lr: 0.000632  loss: 2.4539 (2.7882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8911 (0.9183)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [227]  [ 400/1251]  eta: 0:03:30  lr: 0.000630  min_lr: 0.000630  loss: 2.2881 (2.7260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9418 (0.9285)  time: 0.2395  data: 0.0005  max mem: 18117
Epoch: [227]  [ 600/1251]  eta: 0:02:39  lr: 0.000627  min_lr: 0.000627  loss: 3.1367 (2.7238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8348 (0.9182)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [227]  [ 800/1251]  eta: 0:01:50  lr: 0.000625  min_lr: 0.000625  loss: 2.1844 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8658 (0.9083)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [227]  [1000/1251]  eta: 0:01:01  lr: 0.000622  min_lr: 0.000622  loss: 2.1115 (2.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9507 (0.9207)  time: 0.2467  data: 0.0004  max mem: 18117
Epoch: [227]  [1200/1251]  eta: 0:00:12  lr: 0.000619  min_lr: 0.000619  loss: 3.0396 (2.7388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8908 (0.9144)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 3.1661 (2.7418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.9176)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [227] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 3.1661 (2.7524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.9176)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7343 (0.7343)  acc1: 90.0000 (90.0000)  acc5: 97.6000 (97.6000)  time: 5.5366  data: 5.4012  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8718 (0.8660)  acc1: 85.2000 (84.6182)  acc5: 96.8000 (97.1273)  time: 0.7766  data: 0.6627  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0626 (1.0310)  acc1: 78.4000 (80.6476)  acc5: 94.8000 (95.3905)  time: 0.2282  data: 0.1183  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.1186 (1.0389)  acc1: 78.4000 (80.3680)  acc5: 94.8000 (95.2960)  time: 0.2269  data: 0.1182  max mem: 18117
Test: Total time: 0:00:10 (0.4251 s / it)
* Acc@1 80.282 Acc@5 95.226 loss 1.031
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.38%
Epoch: [228]  [   0/1251]  eta: 1:09:45  lr: 0.000619  min_lr: 0.000619  loss: 2.2147 (2.2147)  weight_decay: 0.0500 (0.0500)  time: 3.3457  data: 2.2743  max mem: 18117
Epoch: [228]  [ 200/1251]  eta: 0:04:31  lr: 0.000616  min_lr: 0.000616  loss: 2.5368 (2.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9599 (0.9725)  time: 0.2507  data: 0.0004  max mem: 18117
Epoch: [228]  [ 400/1251]  eta: 0:03:30  lr: 0.000614  min_lr: 0.000614  loss: 2.2152 (2.6542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8104 (0.9409)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [228]  [ 600/1251]  eta: 0:02:39  lr: 0.000611  min_lr: 0.000611  loss: 3.3045 (2.6983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8647 (0.9218)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [228]  [ 800/1251]  eta: 0:01:49  lr: 0.000608  min_lr: 0.000608  loss: 3.0314 (2.7298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8567 (0.9159)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [228]  [1000/1251]  eta: 0:01:00  lr: 0.000606  min_lr: 0.000606  loss: 2.4674 (2.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9325 (0.9138)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [228]  [1200/1251]  eta: 0:00:12  lr: 0.000603  min_lr: 0.000603  loss: 2.4614 (2.7281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9336 (0.9143)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 2.9201 (2.7309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (0.9167)  time: 0.1954  data: 0.0010  max mem: 18117
Epoch: [228] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 2.9201 (2.7602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (0.9167)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6584 (0.6584)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.7723  data: 5.6413  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7988 (0.7831)  acc1: 86.0000 (84.2909)  acc5: 96.8000 (97.1273)  time: 0.7775  data: 0.6644  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9712 (0.9424)  acc1: 78.0000 (80.4191)  acc5: 95.2000 (95.3905)  time: 0.2048  data: 0.0951  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0336 (0.9499)  acc1: 77.2000 (80.0640)  acc5: 94.4000 (95.3440)  time: 0.2031  data: 0.0950  max mem: 18117
Test: Total time: 0:00:10 (0.4153 s / it)
* Acc@1 80.430 Acc@5 95.364 loss 0.939
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.43%
Epoch: [229]  [   0/1251]  eta: 0:59:02  lr: 0.000603  min_lr: 0.000603  loss: 1.7373 (1.7373)  weight_decay: 0.0500 (0.0500)  time: 2.8319  data: 1.9264  max mem: 18117
Epoch: [229]  [ 200/1251]  eta: 0:04:23  lr: 0.000600  min_lr: 0.000600  loss: 2.6629 (2.7817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7770 (0.8283)  time: 0.2354  data: 0.0004  max mem: 18117
Epoch: [229]  [ 400/1251]  eta: 0:03:27  lr: 0.000597  min_lr: 0.000597  loss: 2.1012 (2.7618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8615 (0.8609)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [229]  [ 600/1251]  eta: 0:02:37  lr: 0.000595  min_lr: 0.000595  loss: 2.6340 (2.7742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8350 (0.9046)  time: 0.2354  data: 0.0003  max mem: 18117
Epoch: [229]  [ 800/1251]  eta: 0:01:48  lr: 0.000592  min_lr: 0.000592  loss: 2.9313 (2.7960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8468 (0.9003)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [229]  [1000/1251]  eta: 0:01:00  lr: 0.000590  min_lr: 0.000590  loss: 2.2695 (2.7815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8316 (0.8987)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [229]  [1200/1251]  eta: 0:00:12  lr: 0.000587  min_lr: 0.000587  loss: 2.7150 (2.7873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.9134)  time: 0.2352  data: 0.0004  max mem: 18117
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.5074 (2.7925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8701 (0.9129)  time: 0.1958  data: 0.0007  max mem: 18117
Epoch: [229] Total time: 0:04:58 (0.2387 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.5074 (2.7581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8701 (0.9129)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5973 (0.5973)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.7500  data: 5.6231  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7262 (0.7331)  acc1: 84.8000 (84.0727)  acc5: 97.6000 (97.2000)  time: 0.7563  data: 0.6432  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9276 (0.8837)  acc1: 78.8000 (80.4191)  acc5: 94.8000 (95.4857)  time: 0.1949  data: 0.0847  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9625 (0.8929)  acc1: 78.4000 (80.0960)  acc5: 94.4000 (95.3120)  time: 0.1935  data: 0.0847  max mem: 18117
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 80.562 Acc@5 95.364 loss 0.882
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.56%
Epoch: [230]  [   0/1251]  eta: 1:05:36  lr: 0.000587  min_lr: 0.000587  loss: 3.4839 (3.4839)  weight_decay: 0.0500 (0.0500)  time: 3.1468  data: 2.8563  max mem: 18117
Epoch: [230]  [ 200/1251]  eta: 0:04:26  lr: 0.000584  min_lr: 0.000584  loss: 2.8548 (2.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9899 (1.0243)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [230]  [ 400/1251]  eta: 0:03:29  lr: 0.000582  min_lr: 0.000582  loss: 3.3073 (2.6821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8723 (0.9679)  time: 0.2411  data: 0.0004  max mem: 18117
Epoch: [230]  [ 600/1251]  eta: 0:02:38  lr: 0.000579  min_lr: 0.000579  loss: 3.0525 (2.7021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9014 (0.9654)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [230]  [ 800/1251]  eta: 0:01:49  lr: 0.000577  min_lr: 0.000577  loss: 3.3266 (2.7269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7942 (0.9393)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [230]  [1000/1251]  eta: 0:01:00  lr: 0.000574  min_lr: 0.000574  loss: 2.1427 (2.7304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (0.9281)  time: 0.2401  data: 0.0005  max mem: 18117
Epoch: [230]  [1200/1251]  eta: 0:00:12  lr: 0.000571  min_lr: 0.000571  loss: 2.3999 (2.7406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9154 (0.9318)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 2.9798 (2.7370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9522 (0.9350)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [230] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 2.9798 (2.7420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9522 (0.9350)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6383 (0.6383)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.5625  data: 5.4297  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7613 (0.7685)  acc1: 83.6000 (84.0000)  acc5: 96.8000 (96.8727)  time: 0.6819  data: 0.5696  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0130 (0.9233)  acc1: 78.8000 (80.3048)  acc5: 94.8000 (95.2571)  time: 0.2045  data: 0.0947  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0129 (0.9324)  acc1: 78.8000 (80.0960)  acc5: 94.4000 (95.0880)  time: 0.2133  data: 0.1034  max mem: 18117
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 80.452 Acc@5 95.236 loss 0.921
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.56%
Epoch: [231]  [   0/1251]  eta: 1:02:37  lr: 0.000571  min_lr: 0.000571  loss: 2.9568 (2.9568)  weight_decay: 0.0500 (0.0500)  time: 3.0037  data: 2.3036  max mem: 18117
Epoch: [231]  [ 200/1251]  eta: 0:04:27  lr: 0.000568  min_lr: 0.000568  loss: 2.3529 (2.7549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8646 (0.9683)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [231]  [ 400/1251]  eta: 0:03:29  lr: 0.000566  min_lr: 0.000566  loss: 2.0220 (2.7300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (nan)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [231]  [ 600/1251]  eta: 0:02:38  lr: 0.000563  min_lr: 0.000563  loss: 2.7815 (2.7371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (nan)  time: 0.2499  data: 0.0004  max mem: 18117
Epoch: [231]  [ 800/1251]  eta: 0:01:49  lr: 0.000561  min_lr: 0.000561  loss: 2.0471 (2.7241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (nan)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [231]  [1000/1251]  eta: 0:01:00  lr: 0.000558  min_lr: 0.000558  loss: 2.6549 (2.7458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8548 (nan)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [231]  [1200/1251]  eta: 0:00:12  lr: 0.000556  min_lr: 0.000556  loss: 1.9870 (2.7385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9192 (nan)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 3.1489 (2.7402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8612 (nan)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [231] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 3.1489 (2.7450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8612 (nan)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6624 (0.6624)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.9986  data: 5.8536  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8312 (0.8030)  acc1: 84.8000 (83.7091)  acc5: 97.6000 (97.4182)  time: 0.7493  data: 0.6354  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9882 (0.9574)  acc1: 77.6000 (80.0952)  acc5: 95.2000 (95.7333)  time: 0.1918  data: 0.0823  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0079 (0.9654)  acc1: 77.6000 (79.9840)  acc5: 94.8000 (95.6320)  time: 0.1915  data: 0.0822  max mem: 18117
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 80.482 Acc@5 95.446 loss 0.957
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.56%
Epoch: [232]  [   0/1251]  eta: 1:07:49  lr: 0.000555  min_lr: 0.000555  loss: 3.3386 (3.3386)  weight_decay: 0.0500 (0.0500)  time: 3.2533  data: 2.2217  max mem: 18117
Epoch: [232]  [ 200/1251]  eta: 0:04:28  lr: 0.000553  min_lr: 0.000553  loss: 2.3631 (2.7921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8821 (1.0158)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [232]  [ 400/1251]  eta: 0:03:30  lr: 0.000550  min_lr: 0.000550  loss: 2.1036 (2.7542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8958 (0.9556)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [232]  [ 600/1251]  eta: 0:02:39  lr: 0.000548  min_lr: 0.000548  loss: 2.1477 (2.7623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9746 (0.9917)  time: 0.2417  data: 0.0004  max mem: 18117
Epoch: [232]  [ 800/1251]  eta: 0:01:49  lr: 0.000545  min_lr: 0.000545  loss: 2.5319 (2.7441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9868 (0.9854)  time: 0.2359  data: 0.0005  max mem: 18117
Epoch: [232]  [1000/1251]  eta: 0:01:00  lr: 0.000543  min_lr: 0.000543  loss: 2.3430 (2.7530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7976 (0.9671)  time: 0.2343  data: 0.0004  max mem: 18117
Epoch: [232]  [1200/1251]  eta: 0:00:12  lr: 0.000540  min_lr: 0.000540  loss: 2.6653 (2.7476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8913 (0.9585)  time: 0.2393  data: 0.0005  max mem: 18117
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.0962 (2.7446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8963 (0.9559)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [232] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.0962 (2.7346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8963 (0.9559)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6096 (0.6096)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 5.4194  data: 5.2900  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7875 (0.7623)  acc1: 83.6000 (83.9636)  acc5: 97.2000 (97.6000)  time: 0.6958  data: 0.5797  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9428 (0.9145)  acc1: 77.2000 (80.3810)  acc5: 95.2000 (95.6381)  time: 0.1968  data: 0.0853  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9706 (0.9263)  acc1: 77.2000 (79.9680)  acc5: 94.8000 (95.5040)  time: 0.2123  data: 0.1026  max mem: 18117
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 80.494 Acc@5 95.356 loss 0.917
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.56%
Epoch: [233]  [   0/1251]  eta: 1:06:08  lr: 0.000540  min_lr: 0.000540  loss: 3.8415 (3.8415)  weight_decay: 0.0500 (0.0500)  time: 3.1722  data: 2.3002  max mem: 18117
Epoch: [233]  [ 200/1251]  eta: 0:04:26  lr: 0.000537  min_lr: 0.000537  loss: 2.6385 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9266 (0.9191)  time: 0.2425  data: 0.0005  max mem: 18117
Epoch: [233]  [ 400/1251]  eta: 0:03:29  lr: 0.000535  min_lr: 0.000535  loss: 2.1768 (2.6773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9476 (0.9326)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [233]  [ 600/1251]  eta: 0:02:38  lr: 0.000533  min_lr: 0.000533  loss: 2.9550 (2.7028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8905 (0.9252)  time: 0.2386  data: 0.0005  max mem: 18117
Epoch: [233]  [ 800/1251]  eta: 0:01:49  lr: 0.000530  min_lr: 0.000530  loss: 2.5647 (2.7084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9172 (0.9224)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [233]  [1000/1251]  eta: 0:01:00  lr: 0.000528  min_lr: 0.000528  loss: 2.1077 (2.7198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (0.9165)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [233]  [1200/1251]  eta: 0:00:12  lr: 0.000525  min_lr: 0.000525  loss: 2.3354 (2.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9261 (0.9296)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 2.1441 (2.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9842 (0.9356)  time: 0.1958  data: 0.0006  max mem: 18117
Epoch: [233] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 2.1441 (2.7346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9842 (0.9356)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6033 (0.6033)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.7825  data: 5.6428  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7537 (0.7434)  acc1: 83.6000 (84.2182)  acc5: 97.2000 (97.4182)  time: 0.6632  data: 0.5502  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9527 (0.8934)  acc1: 77.6000 (80.5333)  acc5: 94.8000 (95.6571)  time: 0.1621  data: 0.0512  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9513 (0.9019)  acc1: 78.8000 (80.3520)  acc5: 94.4000 (95.5200)  time: 0.1935  data: 0.0823  max mem: 18117
Test: Total time: 0:00:10 (0.4076 s / it)
* Acc@1 80.684 Acc@5 95.380 loss 0.893
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.68%
Epoch: [234]  [   0/1251]  eta: 0:58:42  lr: 0.000525  min_lr: 0.000525  loss: 3.6619 (3.6619)  weight_decay: 0.0500 (0.0500)  time: 2.8155  data: 2.3446  max mem: 18117
Epoch: [234]  [ 200/1251]  eta: 0:04:24  lr: 0.000522  min_lr: 0.000522  loss: 2.4538 (2.6621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9419 (0.9769)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [234]  [ 400/1251]  eta: 0:03:29  lr: 0.000520  min_lr: 0.000520  loss: 2.8406 (2.7128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8823 (0.9438)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [234]  [ 600/1251]  eta: 0:02:38  lr: 0.000517  min_lr: 0.000517  loss: 3.0384 (2.7101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0164 (0.9642)  time: 0.2374  data: 0.0005  max mem: 18117
Epoch: [234]  [ 800/1251]  eta: 0:01:49  lr: 0.000515  min_lr: 0.000515  loss: 2.8003 (2.7141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9178 (0.9671)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [234]  [1000/1251]  eta: 0:01:00  lr: 0.000513  min_lr: 0.000513  loss: 2.2123 (2.7197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8700 (0.9603)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [234]  [1200/1251]  eta: 0:00:12  lr: 0.000510  min_lr: 0.000510  loss: 2.0589 (2.7232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9556 (0.9647)  time: 0.2364  data: 0.0011  max mem: 18117
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.4121 (2.7228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9956 (0.9653)  time: 0.1954  data: 0.0009  max mem: 18117
Epoch: [234] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.4121 (2.7314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9956 (0.9653)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5796 (0.5796)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.7952  data: 5.6596  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7790 (0.7587)  acc1: 84.0000 (84.2546)  acc5: 97.2000 (97.2000)  time: 0.7326  data: 0.6199  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9812 (0.9217)  acc1: 77.2000 (80.6476)  acc5: 95.2000 (95.4476)  time: 0.1855  data: 0.0746  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0257 (0.9337)  acc1: 77.2000 (80.2240)  acc5: 94.4000 (95.3280)  time: 0.2076  data: 0.0972  max mem: 18117
Test: Total time: 0:00:10 (0.4190 s / it)
* Acc@1 80.660 Acc@5 95.372 loss 0.921
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.68%
Epoch: [235]  [   0/1251]  eta: 1:05:16  lr: 0.000510  min_lr: 0.000510  loss: 2.1505 (2.1505)  weight_decay: 0.0500 (0.0500)  time: 3.1308  data: 2.6792  max mem: 18117
Epoch: [235]  [ 200/1251]  eta: 0:04:27  lr: 0.000507  min_lr: 0.000507  loss: 2.5057 (2.7051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8372 (0.8959)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [235]  [ 400/1251]  eta: 0:03:30  lr: 0.000505  min_lr: 0.000505  loss: 2.1226 (2.7376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9359 (0.9247)  time: 0.2385  data: 0.0003  max mem: 18117
Epoch: [235]  [ 600/1251]  eta: 0:02:38  lr: 0.000502  min_lr: 0.000502  loss: 3.0228 (2.7298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9688 (0.9460)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [235]  [ 800/1251]  eta: 0:01:49  lr: 0.000500  min_lr: 0.000500  loss: 3.0197 (2.7182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8516 (0.9388)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [235]  [1000/1251]  eta: 0:01:00  lr: 0.000498  min_lr: 0.000498  loss: 2.7560 (2.7199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8431 (0.9366)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [235]  [1200/1251]  eta: 0:00:12  lr: 0.000495  min_lr: 0.000495  loss: 2.0834 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9119 (0.9312)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 2.7827 (2.7253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8763 (0.9317)  time: 0.1960  data: 0.0010  max mem: 18117
Epoch: [235] Total time: 0:05:01 (0.2410 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 2.7827 (2.7282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8763 (0.9317)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6274 (0.6274)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.5706  data: 5.4423  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8012 (0.7871)  acc1: 85.6000 (84.5091)  acc5: 97.2000 (97.3818)  time: 0.7515  data: 0.6370  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9767 (0.9375)  acc1: 79.6000 (81.0476)  acc5: 95.6000 (95.7524)  time: 0.2052  data: 0.0922  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9997 (0.9452)  acc1: 78.4000 (80.7520)  acc5: 95.2000 (95.6320)  time: 0.2042  data: 0.0920  max mem: 18117
Test: Total time: 0:00:10 (0.4101 s / it)
* Acc@1 80.744 Acc@5 95.452 loss 0.939
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.74%
Epoch: [236]  [   0/1251]  eta: 1:04:51  lr: 0.000495  min_lr: 0.000495  loss: 3.5153 (3.5153)  weight_decay: 0.0500 (0.0500)  time: 3.1106  data: 2.8243  max mem: 18117
Epoch: [236]  [ 200/1251]  eta: 0:04:26  lr: 0.000492  min_lr: 0.000492  loss: 2.8217 (2.7319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9828 (0.9689)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [236]  [ 400/1251]  eta: 0:03:29  lr: 0.000490  min_lr: 0.000490  loss: 2.7586 (2.7800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9537 (0.9582)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [236]  [ 600/1251]  eta: 0:02:38  lr: 0.000488  min_lr: 0.000488  loss: 2.4045 (2.7685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9003 (0.9550)  time: 0.2376  data: 0.0003  max mem: 18117
Epoch: [236]  [ 800/1251]  eta: 0:01:49  lr: 0.000485  min_lr: 0.000485  loss: 2.2825 (2.7563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9179 (0.9463)  time: 0.2458  data: 0.0004  max mem: 18117
Epoch: [236]  [1000/1251]  eta: 0:01:00  lr: 0.000483  min_lr: 0.000483  loss: 2.2597 (2.7290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9221 (0.9479)  time: 0.2379  data: 0.0003  max mem: 18117
Epoch: [236]  [1200/1251]  eta: 0:00:12  lr: 0.000481  min_lr: 0.000481  loss: 2.3856 (2.7324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9174 (0.9476)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 2.4736 (2.7270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (0.9469)  time: 0.1958  data: 0.0005  max mem: 18117
Epoch: [236] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 2.4736 (2.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (0.9469)
Test:  [ 0/25]  eta: 0:01:43  loss: 0.5854 (0.5854)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 4.1228  data: 3.9859  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7776 (0.7505)  acc1: 83.2000 (84.0727)  acc5: 97.6000 (97.5273)  time: 0.6565  data: 0.5428  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9457 (0.9086)  acc1: 78.8000 (80.4762)  acc5: 95.2000 (95.6191)  time: 0.2484  data: 0.1387  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0201 (0.9181)  acc1: 78.4000 (80.1120)  acc5: 94.4000 (95.5040)  time: 0.2155  data: 0.1074  max mem: 18117
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 80.678 Acc@5 95.464 loss 0.906
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.74%
Epoch: [237]  [   0/1251]  eta: 1:08:16  lr: 0.000480  min_lr: 0.000480  loss: 1.9104 (1.9104)  weight_decay: 0.0500 (0.0500)  time: 3.2748  data: 1.8467  max mem: 18117
Epoch: [237]  [ 200/1251]  eta: 0:04:26  lr: 0.000478  min_lr: 0.000478  loss: 2.7717 (2.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9863 (1.0788)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [237]  [ 400/1251]  eta: 0:03:29  lr: 0.000475  min_lr: 0.000475  loss: 2.5569 (2.7630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8985 (1.0320)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [237]  [ 600/1251]  eta: 0:02:39  lr: 0.000473  min_lr: 0.000473  loss: 2.7802 (2.7451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8680 (1.0115)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [237]  [ 800/1251]  eta: 0:01:49  lr: 0.000471  min_lr: 0.000471  loss: 2.1666 (2.7357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0654 (1.0207)  time: 0.2397  data: 0.0003  max mem: 18117
Epoch: [237]  [1000/1251]  eta: 0:01:01  lr: 0.000468  min_lr: 0.000468  loss: 2.4471 (2.7299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9229 (1.0034)  time: 0.2389  data: 0.0003  max mem: 18117
Epoch: [237]  [1200/1251]  eta: 0:00:12  lr: 0.000466  min_lr: 0.000466  loss: 2.7718 (2.7394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8961 (0.9929)  time: 0.2477  data: 0.0003  max mem: 18117
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.9453 (2.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9543 (0.9908)  time: 0.1955  data: 0.0008  max mem: 18117
Epoch: [237] Total time: 0:05:03 (0.2428 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.9453 (2.7193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9543 (0.9908)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6383 (0.6383)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6644  data: 5.5396  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7847 (0.7738)  acc1: 84.8000 (84.1818)  acc5: 97.2000 (97.4182)  time: 0.7540  data: 0.6415  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9330 (0.9298)  acc1: 79.2000 (80.6857)  acc5: 95.6000 (95.6952)  time: 0.2069  data: 0.0966  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0098 (0.9377)  acc1: 78.0000 (80.3200)  acc5: 94.8000 (95.7280)  time: 0.2053  data: 0.0965  max mem: 18117
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 80.736 Acc@5 95.474 loss 0.930
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.74%
Epoch: [238]  [   0/1251]  eta: 1:07:05  lr: 0.000466  min_lr: 0.000466  loss: 2.3500 (2.3500)  weight_decay: 0.0500 (0.0500)  time: 3.2181  data: 2.3121  max mem: 18117
Epoch: [238]  [ 200/1251]  eta: 0:04:28  lr: 0.000463  min_lr: 0.000463  loss: 2.6162 (2.6431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9473 (1.0226)  time: 0.2369  data: 0.0007  max mem: 18117
Epoch: [238]  [ 400/1251]  eta: 0:03:30  lr: 0.000461  min_lr: 0.000461  loss: 2.5253 (2.6874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8950 (0.9793)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [238]  [ 600/1251]  eta: 0:02:39  lr: 0.000459  min_lr: 0.000459  loss: 2.5201 (2.7009)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [238]  [ 800/1251]  eta: 0:01:49  lr: 0.000456  min_lr: 0.000456  loss: 2.5155 (2.7095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9202 (nan)  time: 0.2375  data: 0.0003  max mem: 18117
Epoch: [238]  [1000/1251]  eta: 0:01:00  lr: 0.000454  min_lr: 0.000454  loss: 2.2020 (2.6982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8913 (nan)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [238]  [1200/1251]  eta: 0:00:12  lr: 0.000452  min_lr: 0.000452  loss: 2.9475 (2.7133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0498 (nan)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.1672 (2.7056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9650 (nan)  time: 0.1950  data: 0.0006  max mem: 18117
Epoch: [238] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.1672 (2.7124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9650 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5220 (0.5220)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.6099  data: 5.4813  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7376 (0.7101)  acc1: 83.2000 (84.1455)  acc5: 97.2000 (97.4909)  time: 0.7667  data: 0.6537  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9012 (0.8564)  acc1: 78.8000 (80.5714)  acc5: 95.6000 (95.6952)  time: 0.2421  data: 0.1324  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9509 (0.8668)  acc1: 77.6000 (80.1920)  acc5: 94.8000 (95.5840)  time: 0.2403  data: 0.1323  max mem: 18117
Test: Total time: 0:00:10 (0.4387 s / it)
* Acc@1 80.878 Acc@5 95.462 loss 0.855
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.88%
Epoch: [239]  [   0/1251]  eta: 1:07:41  lr: 0.000451  min_lr: 0.000451  loss: 1.8700 (1.8700)  weight_decay: 0.0500 (0.0500)  time: 3.2467  data: 2.9531  max mem: 18117
Epoch: [239]  [ 200/1251]  eta: 0:04:28  lr: 0.000449  min_lr: 0.000449  loss: 2.3446 (2.6119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9205 (0.9913)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [239]  [ 400/1251]  eta: 0:03:29  lr: 0.000447  min_lr: 0.000447  loss: 2.9654 (2.6325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (0.9853)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [239]  [ 600/1251]  eta: 0:02:39  lr: 0.000445  min_lr: 0.000445  loss: 2.3946 (2.6596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9318 (0.9722)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [239]  [ 800/1251]  eta: 0:01:49  lr: 0.000442  min_lr: 0.000442  loss: 2.6556 (2.6784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8931 (0.9601)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [239]  [1000/1251]  eta: 0:01:00  lr: 0.000440  min_lr: 0.000440  loss: 2.2862 (2.6951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9486 (0.9849)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [239]  [1200/1251]  eta: 0:00:12  lr: 0.000438  min_lr: 0.000438  loss: 2.3939 (2.7039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9418 (0.9761)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.9355 (2.7128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9775 (0.9789)  time: 0.1970  data: 0.0005  max mem: 18117
Epoch: [239] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.9355 (2.7219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9775 (0.9789)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6629 (0.6629)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.5368  data: 5.3886  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8290 (0.8093)  acc1: 84.4000 (84.1455)  acc5: 97.6000 (97.2727)  time: 0.7893  data: 0.6725  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0256 (0.9676)  acc1: 77.6000 (80.3619)  acc5: 95.2000 (95.4857)  time: 0.2395  data: 0.1274  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0515 (0.9766)  acc1: 76.8000 (79.9040)  acc5: 94.4000 (95.3440)  time: 0.2391  data: 0.1273  max mem: 18117
Test: Total time: 0:00:10 (0.4341 s / it)
* Acc@1 80.634 Acc@5 95.436 loss 0.962
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.88%
Epoch: [240]  [   0/1251]  eta: 1:04:40  lr: 0.000437  min_lr: 0.000437  loss: 1.8282 (1.8282)  weight_decay: 0.0500 (0.0500)  time: 3.1018  data: 1.8942  max mem: 18117
Epoch: [240]  [ 200/1251]  eta: 0:04:28  lr: 0.000435  min_lr: 0.000435  loss: 2.5495 (2.6336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9645 (0.9624)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [240]  [ 400/1251]  eta: 0:03:30  lr: 0.000433  min_lr: 0.000433  loss: 2.4846 (2.6880)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0363 (0.9904)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [240]  [ 600/1251]  eta: 0:02:39  lr: 0.000431  min_lr: 0.000431  loss: 2.5224 (2.6819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9669 (0.9889)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [240]  [ 800/1251]  eta: 0:01:49  lr: 0.000428  min_lr: 0.000428  loss: 2.1186 (2.6788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9982 (0.9859)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [240]  [1000/1251]  eta: 0:01:00  lr: 0.000426  min_lr: 0.000426  loss: 2.2286 (2.6960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9199 (0.9786)  time: 0.2375  data: 0.0005  max mem: 18117
Epoch: [240]  [1200/1251]  eta: 0:00:12  lr: 0.000424  min_lr: 0.000424  loss: 2.5616 (2.7036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8366 (0.9650)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.0200 (2.7046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8532 (0.9612)  time: 0.1962  data: 0.0007  max mem: 18117
Epoch: [240] Total time: 0:05:01 (0.2414 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.0200 (2.7069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8532 (0.9612)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5700 (0.5700)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6201  data: 5.4622  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7479 (0.7351)  acc1: 84.4000 (84.2546)  acc5: 97.2000 (97.2727)  time: 0.7658  data: 0.6498  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9457 (0.8855)  acc1: 76.8000 (80.2286)  acc5: 95.2000 (95.6191)  time: 0.2301  data: 0.1202  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9740 (0.8953)  acc1: 76.4000 (79.9200)  acc5: 95.2000 (95.5360)  time: 0.2299  data: 0.1201  max mem: 18117
Test: Total time: 0:00:10 (0.4302 s / it)
* Acc@1 80.822 Acc@5 95.506 loss 0.876
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.88%
Epoch: [241]  [   0/1251]  eta: 1:03:45  lr: 0.000423  min_lr: 0.000423  loss: 3.4828 (3.4828)  weight_decay: 0.0500 (0.0500)  time: 3.0578  data: 2.6571  max mem: 18117
Epoch: [241]  [ 200/1251]  eta: 0:04:28  lr: 0.000421  min_lr: 0.000421  loss: 2.0705 (2.7370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8860 (0.9076)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [241]  [ 400/1251]  eta: 0:03:30  lr: 0.000419  min_lr: 0.000419  loss: 2.8476 (2.7566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9165 (0.9230)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [241]  [ 600/1251]  eta: 0:02:38  lr: 0.000417  min_lr: 0.000417  loss: 2.0689 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9491 (0.9285)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [241]  [ 800/1251]  eta: 0:01:49  lr: 0.000415  min_lr: 0.000415  loss: 3.1456 (2.7298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9456 (0.9409)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [241]  [1000/1251]  eta: 0:01:00  lr: 0.000412  min_lr: 0.000412  loss: 2.5798 (2.7252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9480 (0.9541)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [241]  [1200/1251]  eta: 0:00:12  lr: 0.000410  min_lr: 0.000410  loss: 2.1793 (2.7392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8698 (0.9589)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 3.1079 (2.7440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9751 (0.9610)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [241] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 3.1079 (2.7027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9751 (0.9610)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6269 (0.6269)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.5868  data: 5.4620  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8034 (0.7917)  acc1: 84.4000 (83.8545)  acc5: 97.6000 (97.3455)  time: 0.6992  data: 0.5873  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9942 (0.9417)  acc1: 78.4000 (80.4381)  acc5: 95.6000 (95.6000)  time: 0.1893  data: 0.0797  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0307 (0.9497)  acc1: 78.0000 (79.9520)  acc5: 94.8000 (95.5200)  time: 0.2049  data: 0.0964  max mem: 18117
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 80.642 Acc@5 95.526 loss 0.935
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.88%
Epoch: [242]  [   0/1251]  eta: 1:07:11  lr: 0.000410  min_lr: 0.000410  loss: 1.8522 (1.8522)  weight_decay: 0.0500 (0.0500)  time: 3.2227  data: 2.4822  max mem: 18117
Epoch: [242]  [ 200/1251]  eta: 0:04:27  lr: 0.000407  min_lr: 0.000407  loss: 2.5107 (2.6881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9833 (0.9818)  time: 0.2371  data: 0.0005  max mem: 18117
Epoch: [242]  [ 400/1251]  eta: 0:03:29  lr: 0.000405  min_lr: 0.000405  loss: 2.4595 (2.7239)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0126 (0.9673)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [242]  [ 600/1251]  eta: 0:02:38  lr: 0.000403  min_lr: 0.000403  loss: 2.8660 (2.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0231 (0.9946)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [242]  [ 800/1251]  eta: 0:01:49  lr: 0.000401  min_lr: 0.000401  loss: 2.4459 (2.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9841 (0.9957)  time: 0.2380  data: 0.0003  max mem: 18117
Epoch: [242]  [1000/1251]  eta: 0:01:00  lr: 0.000399  min_lr: 0.000399  loss: 3.1354 (2.7049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8801 (0.9895)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [242]  [1200/1251]  eta: 0:00:12  lr: 0.000397  min_lr: 0.000397  loss: 2.8697 (2.6963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8494 (0.9808)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.1294 (2.7001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8754 (0.9782)  time: 0.1962  data: 0.0010  max mem: 18117
Epoch: [242] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.1294 (2.7011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8754 (0.9782)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5780 (0.5780)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.6457  data: 5.4949  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7453 (0.7344)  acc1: 84.8000 (84.8364)  acc5: 97.6000 (97.4182)  time: 0.7628  data: 0.6470  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9144 (0.8964)  acc1: 79.2000 (80.9333)  acc5: 95.6000 (95.6381)  time: 0.2076  data: 0.0974  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9843 (0.9035)  acc1: 78.4000 (80.5440)  acc5: 94.8000 (95.6000)  time: 0.2066  data: 0.0973  max mem: 18117
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 80.806 Acc@5 95.622 loss 0.893
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.88%
Epoch: [243]  [   0/1251]  eta: 1:01:27  lr: 0.000396  min_lr: 0.000396  loss: 3.3800 (3.3800)  weight_decay: 0.0500 (0.0500)  time: 2.9480  data: 1.9452  max mem: 18117
Epoch: [243]  [ 200/1251]  eta: 0:04:26  lr: 0.000394  min_lr: 0.000394  loss: 2.1067 (2.6635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9473 (0.9770)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [243]  [ 400/1251]  eta: 0:03:29  lr: 0.000392  min_lr: 0.000392  loss: 2.1324 (2.6695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (1.0075)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [243]  [ 600/1251]  eta: 0:02:38  lr: 0.000390  min_lr: 0.000390  loss: 2.8843 (2.6621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0118 (1.0083)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [243]  [ 800/1251]  eta: 0:01:49  lr: 0.000388  min_lr: 0.000388  loss: 3.2355 (2.6806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0058 (1.0111)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [243]  [1000/1251]  eta: 0:01:00  lr: 0.000385  min_lr: 0.000385  loss: 2.5042 (2.6795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8934 (1.0054)  time: 0.2389  data: 0.0003  max mem: 18117
Epoch: [243]  [1200/1251]  eta: 0:00:12  lr: 0.000383  min_lr: 0.000383  loss: 2.6476 (2.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9548 (1.0032)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.0797 (2.6975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8892 (1.0033)  time: 0.1951  data: 0.0008  max mem: 18117
Epoch: [243] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.0797 (2.6960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8892 (1.0033)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5464 (0.5464)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.6717  data: 5.5239  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7239 (0.7023)  acc1: 84.8000 (84.7273)  acc5: 97.2000 (97.3455)  time: 0.7298  data: 0.6179  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8701 (0.8584)  acc1: 78.0000 (80.8191)  acc5: 96.4000 (95.7905)  time: 0.2007  data: 0.0925  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9256 (0.8682)  acc1: 78.0000 (80.4800)  acc5: 95.6000 (95.6960)  time: 0.2032  data: 0.0951  max mem: 18117
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 80.936 Acc@5 95.554 loss 0.855
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.94%
Epoch: [244]  [   0/1251]  eta: 1:12:40  lr: 0.000383  min_lr: 0.000383  loss: 2.1547 (2.1547)  weight_decay: 0.0500 (0.0500)  time: 3.4853  data: 3.2133  max mem: 18117
Epoch: [244]  [ 200/1251]  eta: 0:04:28  lr: 0.000381  min_lr: 0.000381  loss: 3.1374 (2.6826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9541 (0.9990)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [244]  [ 400/1251]  eta: 0:03:30  lr: 0.000379  min_lr: 0.000379  loss: 2.9146 (2.6659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8638 (0.9466)  time: 0.2438  data: 0.0004  max mem: 18117
Epoch: [244]  [ 600/1251]  eta: 0:02:39  lr: 0.000377  min_lr: 0.000377  loss: 3.0183 (2.6886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9507 (0.9542)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [244]  [ 800/1251]  eta: 0:01:49  lr: 0.000374  min_lr: 0.000374  loss: 2.6620 (2.6899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8675 (0.9444)  time: 0.2393  data: 0.0003  max mem: 18117
Epoch: [244]  [1000/1251]  eta: 0:01:00  lr: 0.000372  min_lr: 0.000372  loss: 2.8884 (2.6749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9953 (0.9661)  time: 0.2428  data: 0.0004  max mem: 18117
Epoch: [244]  [1200/1251]  eta: 0:00:12  lr: 0.000370  min_lr: 0.000370  loss: 2.8658 (2.6824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9410 (0.9748)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.6584 (2.6841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9099 (0.9724)  time: 0.2016  data: 0.0007  max mem: 18117
Epoch: [244] Total time: 0:05:03 (0.2425 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.6584 (2.6874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9099 (0.9724)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5830 (0.5830)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.4379  data: 5.2938  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7956 (0.7531)  acc1: 84.8000 (84.9455)  acc5: 97.6000 (97.5273)  time: 0.7050  data: 0.5903  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9425 (0.9166)  acc1: 77.6000 (80.6667)  acc5: 95.2000 (95.5238)  time: 0.2012  data: 0.0912  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0316 (0.9265)  acc1: 76.4000 (80.2720)  acc5: 94.8000 (95.4560)  time: 0.2004  data: 0.0911  max mem: 18117
Test: Total time: 0:00:10 (0.4021 s / it)
* Acc@1 80.948 Acc@5 95.516 loss 0.911
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.95%
Epoch: [245]  [   0/1251]  eta: 1:03:20  lr: 0.000370  min_lr: 0.000370  loss: 2.0261 (2.0261)  weight_decay: 0.0500 (0.0500)  time: 3.0383  data: 2.7345  max mem: 18117
Epoch: [245]  [ 200/1251]  eta: 0:04:27  lr: 0.000368  min_lr: 0.000368  loss: 2.8104 (2.7553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9727 (0.9864)  time: 0.2425  data: 0.0005  max mem: 18117
Epoch: [245]  [ 400/1251]  eta: 0:03:30  lr: 0.000366  min_lr: 0.000366  loss: 2.5112 (2.7634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9522 (0.9863)  time: 0.2399  data: 0.0005  max mem: 18117
Epoch: [245]  [ 600/1251]  eta: 0:02:39  lr: 0.000364  min_lr: 0.000364  loss: 2.8755 (2.7311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8840 (0.9946)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [245]  [ 800/1251]  eta: 0:01:49  lr: 0.000362  min_lr: 0.000362  loss: 2.0163 (2.7076)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0777 (nan)  time: 0.2409  data: 0.0005  max mem: 18117
Epoch: [245]  [1000/1251]  eta: 0:01:00  lr: 0.000359  min_lr: 0.000359  loss: 2.4903 (2.7069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9208 (nan)  time: 0.2448  data: 0.0004  max mem: 18117
Epoch: [245]  [1200/1251]  eta: 0:00:12  lr: 0.000357  min_lr: 0.000357  loss: 2.0398 (2.6965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9490 (nan)  time: 0.2388  data: 0.0005  max mem: 18117
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 3.4449 (2.7001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9789 (nan)  time: 0.1961  data: 0.0007  max mem: 18117
Epoch: [245] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 3.4449 (2.6822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9789 (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6656 (0.6656)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4279  data: 5.2770  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8695 (0.8337)  acc1: 84.4000 (84.4727)  acc5: 97.2000 (97.4545)  time: 0.7191  data: 0.6051  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 1.0199 (0.9939)  acc1: 78.4000 (80.5905)  acc5: 95.2000 (95.5238)  time: 0.2058  data: 0.0966  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0556 (1.0047)  acc1: 78.0000 (80.0640)  acc5: 94.4000 (95.3440)  time: 0.2136  data: 0.1031  max mem: 18117
Test: Total time: 0:00:10 (0.4088 s / it)
* Acc@1 80.768 Acc@5 95.500 loss 0.988
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.95%
Epoch: [246]  [   0/1251]  eta: 1:03:32  lr: 0.000357  min_lr: 0.000357  loss: 1.8627 (1.8627)  weight_decay: 0.0500 (0.0500)  time: 3.0474  data: 2.3895  max mem: 18117
Epoch: [246]  [ 200/1251]  eta: 0:04:26  lr: 0.000355  min_lr: 0.000355  loss: 3.2553 (2.7810)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0614 (1.0026)  time: 0.2387  data: 0.0005  max mem: 18117
Epoch: [246]  [ 400/1251]  eta: 0:03:29  lr: 0.000353  min_lr: 0.000353  loss: 2.3645 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (0.9847)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [246]  [ 600/1251]  eta: 0:02:38  lr: 0.000351  min_lr: 0.000351  loss: 2.9396 (2.7237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0220 (1.0019)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [246]  [ 800/1251]  eta: 0:01:49  lr: 0.000349  min_lr: 0.000349  loss: 2.4583 (2.7141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8964 (0.9990)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [246]  [1000/1251]  eta: 0:01:00  lr: 0.000347  min_lr: 0.000347  loss: 2.9653 (2.7214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9506 (0.9864)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [246]  [1200/1251]  eta: 0:00:12  lr: 0.000345  min_lr: 0.000345  loss: 2.4694 (2.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0315 (0.9971)  time: 0.2398  data: 0.0003  max mem: 18117
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 3.2070 (2.7269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9925 (1.0062)  time: 0.1961  data: 0.0007  max mem: 18117
Epoch: [246] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 3.2070 (2.6947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9925 (1.0062)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.6236 (0.6236)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.1436  data: 5.0186  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.8102 (0.7745)  acc1: 84.4000 (84.5818)  acc5: 97.6000 (97.3818)  time: 0.6800  data: 0.5679  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9529 (0.9310)  acc1: 79.6000 (80.4191)  acc5: 95.2000 (95.7714)  time: 0.2171  data: 0.1078  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0606 (0.9405)  acc1: 76.8000 (80.2080)  acc5: 94.8000 (95.6480)  time: 0.2065  data: 0.0986  max mem: 18117
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 80.962 Acc@5 95.534 loss 0.928
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 80.96%
Epoch: [247]  [   0/1251]  eta: 1:10:36  lr: 0.000344  min_lr: 0.000344  loss: 2.5307 (2.5307)  weight_decay: 0.0500 (0.0500)  time: 3.3864  data: 3.0682  max mem: 18117
Epoch: [247]  [ 200/1251]  eta: 0:04:26  lr: 0.000342  min_lr: 0.000342  loss: 2.7001 (2.6118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9043 (1.0148)  time: 0.2351  data: 0.0004  max mem: 18117
Epoch: [247]  [ 400/1251]  eta: 0:03:29  lr: 0.000340  min_lr: 0.000340  loss: 2.8760 (2.6313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9619 (1.0082)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [247]  [ 600/1251]  eta: 0:02:38  lr: 0.000338  min_lr: 0.000338  loss: 2.1477 (2.6190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9154 (1.0084)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [247]  [ 800/1251]  eta: 0:01:49  lr: 0.000336  min_lr: 0.000336  loss: 2.6661 (2.6210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9517 (0.9934)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [247]  [1000/1251]  eta: 0:01:00  lr: 0.000334  min_lr: 0.000334  loss: 3.0976 (2.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8994 (0.9867)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [247]  [1200/1251]  eta: 0:00:12  lr: 0.000332  min_lr: 0.000332  loss: 2.8988 (2.6448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9764 (0.9838)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.5148 (2.6531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9235 (0.9849)  time: 0.1960  data: 0.0008  max mem: 18117
Epoch: [247] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.5148 (2.6761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9235 (0.9849)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6055 (0.6055)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6003  data: 5.4695  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8024 (0.7650)  acc1: 84.8000 (84.3273)  acc5: 97.6000 (97.5273)  time: 0.7521  data: 0.6385  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9679 (0.9254)  acc1: 78.0000 (80.6286)  acc5: 96.0000 (95.8095)  time: 0.2009  data: 0.0909  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0213 (0.9334)  acc1: 77.6000 (80.3520)  acc5: 94.8000 (95.7760)  time: 0.1990  data: 0.0908  max mem: 18117
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 81.078 Acc@5 95.638 loss 0.917
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.08%
Epoch: [248]  [   0/1251]  eta: 1:05:49  lr: 0.000332  min_lr: 0.000332  loss: 1.9976 (1.9976)  weight_decay: 0.0500 (0.0500)  time: 3.1573  data: 2.8761  max mem: 18117
Epoch: [248]  [ 200/1251]  eta: 0:04:25  lr: 0.000330  min_lr: 0.000330  loss: 2.4356 (2.6486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0047 (0.9841)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [248]  [ 400/1251]  eta: 0:03:30  lr: 0.000328  min_lr: 0.000328  loss: 2.6986 (2.6131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (0.9839)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [248]  [ 600/1251]  eta: 0:02:39  lr: 0.000326  min_lr: 0.000326  loss: 2.4653 (2.6431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9656 (0.9807)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [248]  [ 800/1251]  eta: 0:01:49  lr: 0.000324  min_lr: 0.000324  loss: 2.6905 (2.6651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9247 (0.9767)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [248]  [1000/1251]  eta: 0:01:00  lr: 0.000322  min_lr: 0.000322  loss: 2.9779 (2.6625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9285 (0.9834)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [248]  [1200/1251]  eta: 0:00:12  lr: 0.000320  min_lr: 0.000320  loss: 2.2900 (2.6679)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (0.9873)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 3.0775 (2.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9869 (0.9851)  time: 0.1952  data: 0.0005  max mem: 18117
Epoch: [248] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 3.0775 (2.6659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9869 (0.9851)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6134 (0.6134)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5536  data: 5.4173  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7775 (0.7610)  acc1: 85.6000 (84.6182)  acc5: 97.6000 (97.4546)  time: 0.7314  data: 0.6180  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9719 (0.9111)  acc1: 78.4000 (81.0286)  acc5: 95.6000 (95.6762)  time: 0.2093  data: 0.0953  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9939 (0.9225)  acc1: 78.4000 (80.6080)  acc5: 95.2000 (95.5520)  time: 0.2079  data: 0.0952  max mem: 18117
Test: Total time: 0:00:10 (0.4102 s / it)
* Acc@1 81.044 Acc@5 95.592 loss 0.908
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.08%
Epoch: [249]  [   0/1251]  eta: 1:06:12  lr: 0.000320  min_lr: 0.000320  loss: 2.2368 (2.2368)  weight_decay: 0.0500 (0.0500)  time: 3.1751  data: 1.5643  max mem: 18117
Epoch: [249]  [ 200/1251]  eta: 0:04:28  lr: 0.000318  min_lr: 0.000318  loss: 2.3612 (2.6952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (0.9183)  time: 0.2429  data: 0.0004  max mem: 18117
Epoch: [249]  [ 400/1251]  eta: 0:03:30  lr: 0.000316  min_lr: 0.000316  loss: 2.5183 (2.7011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8726 (0.9436)  time: 0.2431  data: 0.0004  max mem: 18117
Epoch: [249]  [ 600/1251]  eta: 0:02:39  lr: 0.000314  min_lr: 0.000314  loss: 2.6832 (2.6718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8941 (0.9478)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [249]  [ 800/1251]  eta: 0:01:49  lr: 0.000312  min_lr: 0.000312  loss: 2.8447 (2.6541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.9671)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [249]  [1000/1251]  eta: 0:01:00  lr: 0.000310  min_lr: 0.000310  loss: 2.2624 (2.6636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9346 (0.9647)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [249]  [1200/1251]  eta: 0:00:12  lr: 0.000308  min_lr: 0.000308  loss: 2.3387 (2.6596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9644 (0.9735)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.8233 (2.6590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0559 (0.9749)  time: 0.1951  data: 0.0006  max mem: 18117
Epoch: [249] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.8233 (2.6607)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0559 (0.9749)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5834 (0.5834)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.7839  data: 5.6614  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7738 (0.7375)  acc1: 85.2000 (84.8000)  acc5: 97.6000 (97.6000)  time: 0.7494  data: 0.6378  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9112 (0.8895)  acc1: 80.4000 (81.2000)  acc5: 95.2000 (95.7143)  time: 0.1983  data: 0.0891  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9506 (0.9013)  acc1: 77.6000 (80.7520)  acc5: 94.8000 (95.6320)  time: 0.1971  data: 0.0890  max mem: 18117
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 81.142 Acc@5 95.640 loss 0.887
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.14%
Epoch: [250]  [   0/1251]  eta: 1:06:35  lr: 0.000307  min_lr: 0.000307  loss: 3.3144 (3.3144)  weight_decay: 0.0500 (0.0500)  time: 3.1935  data: 2.8768  max mem: 18117
Epoch: [250]  [ 200/1251]  eta: 0:04:26  lr: 0.000306  min_lr: 0.000306  loss: 2.2313 (2.6918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9329 (0.9786)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [250]  [ 400/1251]  eta: 0:03:29  lr: 0.000304  min_lr: 0.000304  loss: 3.0300 (2.6539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0547 (1.0010)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [250]  [ 600/1251]  eta: 0:02:39  lr: 0.000302  min_lr: 0.000302  loss: 1.9486 (2.6534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9232 (1.0048)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [250]  [ 800/1251]  eta: 0:01:49  lr: 0.000300  min_lr: 0.000300  loss: 2.4418 (2.6357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9116 (0.9947)  time: 0.2419  data: 0.0004  max mem: 18117
Epoch: [250]  [1000/1251]  eta: 0:01:00  lr: 0.000298  min_lr: 0.000298  loss: 2.7879 (2.6478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9641 (0.9904)  time: 0.2409  data: 0.0005  max mem: 18117
Epoch: [250]  [1200/1251]  eta: 0:00:12  lr: 0.000296  min_lr: 0.000296  loss: 2.9273 (2.6423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8702 (0.9910)  time: 0.2395  data: 0.0003  max mem: 18117
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 1.9708 (2.6457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (0.9945)  time: 0.1961  data: 0.0007  max mem: 18117
Epoch: [250] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 1.9708 (2.6679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (0.9945)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5951 (0.5951)  acc1: 89.2000 (89.2000)  acc5: 99.6000 (99.6000)  time: 5.6290  data: 5.5010  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7626 (0.7307)  acc1: 85.2000 (84.7636)  acc5: 97.6000 (97.7455)  time: 0.7047  data: 0.5928  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9057 (0.8833)  acc1: 78.4000 (81.0286)  acc5: 96.4000 (95.9619)  time: 0.1923  data: 0.0795  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9641 (0.8939)  acc1: 78.4000 (80.6400)  acc5: 94.8000 (95.8080)  time: 0.2116  data: 0.0999  max mem: 18117
Test: Total time: 0:00:10 (0.4161 s / it)
* Acc@1 81.162 Acc@5 95.628 loss 0.883
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.16%
Epoch: [251]  [   0/1251]  eta: 1:01:43  lr: 0.000296  min_lr: 0.000296  loss: 2.5501 (2.5501)  weight_decay: 0.0500 (0.0500)  time: 2.9601  data: 2.6222  max mem: 18117
Epoch: [251]  [ 200/1251]  eta: 0:04:28  lr: 0.000294  min_lr: 0.000294  loss: 2.8511 (2.7143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9797 (1.0354)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [251]  [ 400/1251]  eta: 0:03:30  lr: 0.000292  min_lr: 0.000292  loss: 2.0926 (2.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0016 (1.0379)  time: 0.2456  data: 0.0004  max mem: 18117
Epoch: [251]  [ 600/1251]  eta: 0:02:39  lr: 0.000290  min_lr: 0.000290  loss: 2.6366 (2.6705)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0224 (1.0232)  time: 0.2370  data: 0.0003  max mem: 18117
Epoch: [251]  [ 800/1251]  eta: 0:01:49  lr: 0.000288  min_lr: 0.000288  loss: 2.2174 (2.6625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0003 (1.0155)  time: 0.2358  data: 0.0003  max mem: 18117
Epoch: [251]  [1000/1251]  eta: 0:01:00  lr: 0.000286  min_lr: 0.000286  loss: 2.0704 (2.6608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (1.0063)  time: 0.2356  data: 0.0004  max mem: 18117
Epoch: [251]  [1200/1251]  eta: 0:00:12  lr: 0.000284  min_lr: 0.000284  loss: 2.7036 (2.6549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9931 (1.0035)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.1719 (2.6560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9756 (1.0043)  time: 0.1957  data: 0.0007  max mem: 18117
Epoch: [251] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.1719 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9756 (1.0043)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5522 (0.5522)  acc1: 88.4000 (88.4000)  acc5: 99.6000 (99.6000)  time: 5.6491  data: 5.5200  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7356 (0.7064)  acc1: 86.0000 (84.8727)  acc5: 97.6000 (97.6364)  time: 0.7165  data: 0.6051  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8854 (0.8649)  acc1: 80.0000 (81.3333)  acc5: 95.2000 (95.8095)  time: 0.2051  data: 0.0949  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9434 (0.8746)  acc1: 78.4000 (80.8640)  acc5: 94.8000 (95.7120)  time: 0.2044  data: 0.0948  max mem: 18117
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 81.238 Acc@5 95.620 loss 0.861
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.24%
Epoch: [252]  [   0/1251]  eta: 1:06:38  lr: 0.000284  min_lr: 0.000284  loss: 2.3195 (2.3195)  weight_decay: 0.0500 (0.0500)  time: 3.1960  data: 2.8994  max mem: 18117
Epoch: [252]  [ 200/1251]  eta: 0:04:28  lr: 0.000282  min_lr: 0.000282  loss: 2.8352 (2.5897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (nan)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [252]  [ 400/1251]  eta: 0:03:30  lr: 0.000280  min_lr: 0.000280  loss: 2.5510 (2.6026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9483 (nan)  time: 0.2444  data: 0.0004  max mem: 18117
Epoch: [252]  [ 600/1251]  eta: 0:02:39  lr: 0.000279  min_lr: 0.000279  loss: 1.9919 (2.6113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9013 (nan)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [252]  [ 800/1251]  eta: 0:01:49  lr: 0.000277  min_lr: 0.000277  loss: 3.1139 (2.6259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (nan)  time: 0.2415  data: 0.0005  max mem: 18117
Epoch: [252]  [1000/1251]  eta: 0:01:00  lr: 0.000275  min_lr: 0.000275  loss: 2.3212 (2.6297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9312 (nan)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [252]  [1200/1251]  eta: 0:00:12  lr: 0.000273  min_lr: 0.000273  loss: 2.0527 (2.6386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9582 (nan)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.0691 (2.6427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9790 (nan)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [252] Total time: 0:05:02 (0.2420 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.0691 (2.6579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9790 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5861 (0.5861)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.7013  data: 5.5440  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7601 (0.7328)  acc1: 84.4000 (84.5818)  acc5: 98.0000 (97.7091)  time: 0.7495  data: 0.6343  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9289 (0.8933)  acc1: 79.2000 (80.9524)  acc5: 95.6000 (95.9048)  time: 0.1998  data: 0.0891  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9848 (0.9036)  acc1: 78.8000 (80.5280)  acc5: 95.2000 (95.8240)  time: 0.1993  data: 0.0890  max mem: 18117
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 81.040 Acc@5 95.580 loss 0.892
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.24%
Epoch: [253]  [   0/1251]  eta: 1:09:50  lr: 0.000273  min_lr: 0.000273  loss: 1.9212 (1.9212)  weight_decay: 0.0500 (0.0500)  time: 3.3500  data: 2.6804  max mem: 18117
Epoch: [253]  [ 200/1251]  eta: 0:04:27  lr: 0.000271  min_lr: 0.000271  loss: 2.7354 (2.6771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9551 (1.0204)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [253]  [ 400/1251]  eta: 0:03:29  lr: 0.000269  min_lr: 0.000269  loss: 3.0361 (2.6494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9324 (0.9996)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [253]  [ 600/1251]  eta: 0:02:38  lr: 0.000267  min_lr: 0.000267  loss: 2.6732 (2.6642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9491 (1.0168)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [253]  [ 800/1251]  eta: 0:01:49  lr: 0.000265  min_lr: 0.000265  loss: 2.0112 (2.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9863 (1.0294)  time: 0.2381  data: 0.0005  max mem: 18117
Epoch: [253]  [1000/1251]  eta: 0:01:00  lr: 0.000264  min_lr: 0.000264  loss: 2.3738 (2.6509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0147 (1.0331)  time: 0.2396  data: 0.0005  max mem: 18117
Epoch: [253]  [1200/1251]  eta: 0:00:12  lr: 0.000262  min_lr: 0.000262  loss: 2.6229 (2.6680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9351 (1.0268)  time: 0.2378  data: 0.0005  max mem: 18117
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.9519 (2.6648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0245 (1.0261)  time: 0.1953  data: 0.0005  max mem: 18117
Epoch: [253] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.9519 (2.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0245 (1.0261)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5974 (0.5974)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5047  data: 5.3548  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7794 (0.7577)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 0.7443  data: 0.6305  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9328 (0.9067)  acc1: 78.0000 (80.8571)  acc5: 95.6000 (95.8667)  time: 0.2168  data: 0.1076  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9456 (0.9179)  acc1: 77.6000 (80.3520)  acc5: 95.2000 (95.7280)  time: 0.2165  data: 0.1076  max mem: 18117
Test: Total time: 0:00:10 (0.4138 s / it)
* Acc@1 81.156 Acc@5 95.692 loss 0.901
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.24%
Epoch: [254]  [   0/1251]  eta: 1:03:36  lr: 0.000261  min_lr: 0.000261  loss: 1.6908 (1.6908)  weight_decay: 0.0500 (0.0500)  time: 3.0509  data: 2.6344  max mem: 18117
Epoch: [254]  [ 200/1251]  eta: 0:04:28  lr: 0.000260  min_lr: 0.000260  loss: 2.0651 (2.6860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9712 (0.9945)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [254]  [ 400/1251]  eta: 0:03:30  lr: 0.000258  min_lr: 0.000258  loss: 2.5346 (2.6778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0084 (1.0115)  time: 0.2413  data: 0.0005  max mem: 18117
Epoch: [254]  [ 600/1251]  eta: 0:02:39  lr: 0.000256  min_lr: 0.000256  loss: 2.3026 (2.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9536 (1.0192)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [254]  [ 800/1251]  eta: 0:01:49  lr: 0.000254  min_lr: 0.000254  loss: 2.0388 (2.6870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9130 (1.0095)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [254]  [1000/1251]  eta: 0:01:00  lr: 0.000253  min_lr: 0.000253  loss: 2.1789 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (0.9964)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [254]  [1200/1251]  eta: 0:00:12  lr: 0.000251  min_lr: 0.000251  loss: 2.0832 (2.6953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9546 (0.9964)  time: 0.2462  data: 0.0004  max mem: 18117
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 3.0863 (2.6986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9610 (0.9958)  time: 0.1961  data: 0.0008  max mem: 18117
Epoch: [254] Total time: 0:05:03 (0.2423 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 3.0863 (2.6606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9610 (0.9958)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6393 (0.6393)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.6737  data: 5.5244  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7847 (0.7761)  acc1: 84.8000 (84.5818)  acc5: 97.6000 (97.4909)  time: 0.7450  data: 0.6317  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9548 (0.9292)  acc1: 78.8000 (80.9333)  acc5: 95.2000 (95.7714)  time: 0.2005  data: 0.0915  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0217 (0.9401)  acc1: 78.4000 (80.5280)  acc5: 94.8000 (95.6640)  time: 0.2068  data: 0.0961  max mem: 18117
Test: Total time: 0:00:10 (0.4141 s / it)
* Acc@1 81.020 Acc@5 95.656 loss 0.927
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.24%
Epoch: [255]  [   0/1251]  eta: 1:06:46  lr: 0.000250  min_lr: 0.000250  loss: 3.2134 (3.2134)  weight_decay: 0.0500 (0.0500)  time: 3.2023  data: 2.5099  max mem: 18117
Epoch: [255]  [ 200/1251]  eta: 0:04:27  lr: 0.000249  min_lr: 0.000249  loss: 1.9975 (2.6793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0190 (1.0342)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [255]  [ 400/1251]  eta: 0:03:29  lr: 0.000247  min_lr: 0.000247  loss: 2.3196 (2.6242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9275 (1.0027)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [255]  [ 600/1251]  eta: 0:02:38  lr: 0.000245  min_lr: 0.000245  loss: 2.1022 (2.6244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9482 (1.0142)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [255]  [ 800/1251]  eta: 0:01:49  lr: 0.000244  min_lr: 0.000244  loss: 3.0930 (2.6295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0393 (1.0364)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [255]  [1000/1251]  eta: 0:01:00  lr: 0.000242  min_lr: 0.000242  loss: 2.2186 (2.6320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9464 (1.0302)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [255]  [1200/1251]  eta: 0:00:12  lr: 0.000240  min_lr: 0.000240  loss: 2.7414 (2.6261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0382 (1.0397)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.2194 (2.6205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.0436)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [255] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.2194 (2.6427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.0436)
Test:  [ 0/25]  eta: 0:01:42  loss: 0.5453 (0.5453)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 4.0849  data: 3.9593  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7227 (0.6968)  acc1: 85.6000 (85.0182)  acc5: 98.0000 (97.6000)  time: 0.6839  data: 0.5742  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8714 (0.8498)  acc1: 79.2000 (81.2571)  acc5: 94.8000 (95.7333)  time: 0.2581  data: 0.1501  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9340 (0.8605)  acc1: 78.4000 (80.8320)  acc5: 94.8000 (95.6000)  time: 0.2049  data: 0.0970  max mem: 18117
Test: Total time: 0:00:10 (0.4023 s / it)
* Acc@1 81.200 Acc@5 95.564 loss 0.850
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.24%
Epoch: [256]  [   0/1251]  eta: 1:10:10  lr: 0.000240  min_lr: 0.000240  loss: 4.0719 (4.0719)  weight_decay: 0.0500 (0.0500)  time: 3.3654  data: 2.5620  max mem: 18117
Epoch: [256]  [ 200/1251]  eta: 0:04:28  lr: 0.000238  min_lr: 0.000238  loss: 2.8393 (2.6713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9200 (1.0382)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [256]  [ 400/1251]  eta: 0:03:29  lr: 0.000236  min_lr: 0.000236  loss: 2.6102 (2.6791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9513 (0.9995)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [256]  [ 600/1251]  eta: 0:02:38  lr: 0.000235  min_lr: 0.000235  loss: 3.0196 (2.6888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0932 (1.0527)  time: 0.2393  data: 0.0005  max mem: 18117
Epoch: [256]  [ 800/1251]  eta: 0:01:49  lr: 0.000233  min_lr: 0.000233  loss: 3.1421 (2.7047)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0189 (1.0551)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [256]  [1000/1251]  eta: 0:01:00  lr: 0.000231  min_lr: 0.000231  loss: 2.0900 (2.6980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9095 (1.0405)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [256]  [1200/1251]  eta: 0:00:12  lr: 0.000230  min_lr: 0.000230  loss: 2.1248 (2.6877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9471 (1.0354)  time: 0.2449  data: 0.0004  max mem: 18117
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.6108 (2.6920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.0345)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [256] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.6108 (2.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.0345)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6062 (0.6062)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.6510  data: 5.5240  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7845 (0.7451)  acc1: 85.6000 (84.7273)  acc5: 97.6000 (97.3455)  time: 0.7307  data: 0.6191  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9066 (0.8916)  acc1: 79.6000 (81.2762)  acc5: 95.6000 (95.7333)  time: 0.1983  data: 0.0893  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9798 (0.9017)  acc1: 78.0000 (80.8000)  acc5: 95.2000 (95.6320)  time: 0.2128  data: 0.1048  max mem: 18117
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 81.208 Acc@5 95.600 loss 0.891
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.24%
Epoch: [257]  [   0/1251]  eta: 1:10:28  lr: 0.000229  min_lr: 0.000229  loss: 2.9939 (2.9939)  weight_decay: 0.0500 (0.0500)  time: 3.3802  data: 2.2982  max mem: 18117
Epoch: [257]  [ 200/1251]  eta: 0:04:26  lr: 0.000228  min_lr: 0.000228  loss: 2.2256 (2.6467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9784 (0.9733)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [257]  [ 400/1251]  eta: 0:03:29  lr: 0.000226  min_lr: 0.000226  loss: 2.6367 (2.6605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9239 (0.9948)  time: 0.2389  data: 0.0005  max mem: 18117
Epoch: [257]  [ 600/1251]  eta: 0:02:38  lr: 0.000224  min_lr: 0.000224  loss: 3.0914 (2.6462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9218 (0.9953)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [257]  [ 800/1251]  eta: 0:01:49  lr: 0.000223  min_lr: 0.000223  loss: 3.0438 (2.6634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1487 (1.0165)  time: 0.2353  data: 0.0003  max mem: 18117
Epoch: [257]  [1000/1251]  eta: 0:01:00  lr: 0.000221  min_lr: 0.000221  loss: 3.1732 (2.6546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9116 (1.0106)  time: 0.2524  data: 0.0005  max mem: 18117
Epoch: [257]  [1200/1251]  eta: 0:00:12  lr: 0.000219  min_lr: 0.000219  loss: 2.1877 (2.6501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9685 (1.0209)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.2547 (2.6466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9457 (1.0191)  time: 0.1972  data: 0.0007  max mem: 18117
Epoch: [257] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.2547 (2.6378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9457 (1.0191)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5506 (0.5506)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5270  data: 5.4010  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7302 (0.7081)  acc1: 85.6000 (85.1273)  acc5: 98.0000 (97.7091)  time: 0.7627  data: 0.6477  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9002 (0.8610)  acc1: 80.0000 (81.4667)  acc5: 96.0000 (95.9810)  time: 0.2287  data: 0.1174  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9317 (0.8707)  acc1: 78.4000 (81.0560)  acc5: 95.2000 (95.7920)  time: 0.2276  data: 0.1173  max mem: 18117
Test: Total time: 0:00:10 (0.4251 s / it)
* Acc@1 81.442 Acc@5 95.686 loss 0.860
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.44%
Epoch: [258]  [   0/1251]  eta: 1:07:43  lr: 0.000219  min_lr: 0.000219  loss: 3.1153 (3.1153)  weight_decay: 0.0500 (0.0500)  time: 3.2482  data: 2.9265  max mem: 18117
Epoch: [258]  [ 200/1251]  eta: 0:04:27  lr: 0.000217  min_lr: 0.000217  loss: 2.0270 (2.6081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.0408)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [258]  [ 400/1251]  eta: 0:03:31  lr: 0.000216  min_lr: 0.000216  loss: 2.3734 (2.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9143 (1.0206)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [258]  [ 600/1251]  eta: 0:02:39  lr: 0.000214  min_lr: 0.000214  loss: 2.3291 (2.6094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9275 (1.0201)  time: 0.2355  data: 0.0004  max mem: 18117
Epoch: [258]  [ 800/1251]  eta: 0:01:49  lr: 0.000212  min_lr: 0.000212  loss: 2.1276 (2.6180)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [258]  [1000/1251]  eta: 0:01:00  lr: 0.000211  min_lr: 0.000211  loss: 2.6069 (2.6391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0172 (nan)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [258]  [1200/1251]  eta: 0:00:12  lr: 0.000209  min_lr: 0.000209  loss: 2.0582 (2.6413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9532 (nan)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.5305 (2.6420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9858 (nan)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [258] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.5305 (2.6338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9858 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5769 (0.5769)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.5863  data: 5.4600  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7869 (0.7366)  acc1: 85.2000 (84.7636)  acc5: 97.6000 (97.5273)  time: 0.7639  data: 0.6530  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9369 (0.8843)  acc1: 79.6000 (81.2381)  acc5: 95.6000 (95.7143)  time: 0.2221  data: 0.1133  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9456 (0.8942)  acc1: 78.8000 (80.9440)  acc5: 94.8000 (95.6480)  time: 0.2221  data: 0.1139  max mem: 18117
Test: Total time: 0:00:10 (0.4228 s / it)
* Acc@1 81.362 Acc@5 95.594 loss 0.885
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.44%
Epoch: [259]  [   0/1251]  eta: 1:06:46  lr: 0.000209  min_lr: 0.000209  loss: 3.3184 (3.3184)  weight_decay: 0.0500 (0.0500)  time: 3.2027  data: 1.7027  max mem: 18117
Epoch: [259]  [ 200/1251]  eta: 0:04:26  lr: 0.000207  min_lr: 0.000207  loss: 3.0838 (2.6799)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0079 (1.1165)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [259]  [ 400/1251]  eta: 0:03:28  lr: 0.000206  min_lr: 0.000206  loss: 2.5306 (2.6795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9679 (1.0848)  time: 0.2353  data: 0.0004  max mem: 18117
Epoch: [259]  [ 600/1251]  eta: 0:02:38  lr: 0.000204  min_lr: 0.000204  loss: 2.2430 (2.6819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9579 (1.0759)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [259]  [ 800/1251]  eta: 0:01:49  lr: 0.000203  min_lr: 0.000203  loss: 2.5276 (2.6632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9339 (1.0611)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [259]  [1000/1251]  eta: 0:01:00  lr: 0.000201  min_lr: 0.000201  loss: 2.4197 (2.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9687 (1.0593)  time: 0.2363  data: 0.0003  max mem: 18117
Epoch: [259]  [1200/1251]  eta: 0:00:12  lr: 0.000199  min_lr: 0.000199  loss: 2.9993 (2.6570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9870 (1.0646)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.1524 (2.6549)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0679 (1.0673)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [259] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.1524 (2.6383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0679 (1.0673)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5735 (0.5735)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.4427  data: 5.2812  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7454 (0.7121)  acc1: 85.6000 (84.6545)  acc5: 97.2000 (97.4909)  time: 0.7557  data: 0.6389  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9017 (0.8567)  acc1: 79.2000 (81.0667)  acc5: 95.6000 (95.7905)  time: 0.2194  data: 0.1076  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9365 (0.8680)  acc1: 78.8000 (80.7040)  acc5: 95.2000 (95.6800)  time: 0.2191  data: 0.1075  max mem: 18117
Test: Total time: 0:00:10 (0.4140 s / it)
* Acc@1 81.298 Acc@5 95.668 loss 0.855
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.44%
Epoch: [260]  [   0/1251]  eta: 1:06:18  lr: 0.000199  min_lr: 0.000199  loss: 1.8627 (1.8627)  weight_decay: 0.0500 (0.0500)  time: 3.1799  data: 2.3439  max mem: 18117
Epoch: [260]  [ 200/1251]  eta: 0:04:28  lr: 0.000197  min_lr: 0.000197  loss: 3.2625 (2.6556)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0504 (1.0315)  time: 0.2408  data: 0.0004  max mem: 18117
Epoch: [260]  [ 400/1251]  eta: 0:03:30  lr: 0.000196  min_lr: 0.000196  loss: 2.0838 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0772 (1.0480)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [260]  [ 600/1251]  eta: 0:02:38  lr: 0.000194  min_lr: 0.000194  loss: 2.8850 (2.6275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9621 (1.0371)  time: 0.2413  data: 0.0005  max mem: 18117
Epoch: [260]  [ 800/1251]  eta: 0:01:49  lr: 0.000193  min_lr: 0.000193  loss: 3.0256 (2.6497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9810 (1.0307)  time: 0.2395  data: 0.0004  max mem: 18117
Epoch: [260]  [1000/1251]  eta: 0:01:00  lr: 0.000191  min_lr: 0.000191  loss: 3.1165 (2.6493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (nan)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [260]  [1200/1251]  eta: 0:00:12  lr: 0.000190  min_lr: 0.000190  loss: 2.0381 (2.6373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0492 (nan)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.7431 (2.6372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (nan)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [260] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.7431 (2.6356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5931 (0.5931)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.7231  data: 5.5998  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7632 (0.7374)  acc1: 84.8000 (84.7273)  acc5: 98.0000 (97.6727)  time: 0.7708  data: 0.6578  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9191 (0.8917)  acc1: 79.2000 (81.1238)  acc5: 95.2000 (95.8667)  time: 0.2264  data: 0.1164  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9606 (0.8995)  acc1: 78.4000 (80.6880)  acc5: 94.8000 (95.7760)  time: 0.2248  data: 0.1163  max mem: 18117
Test: Total time: 0:00:10 (0.4308 s / it)
* Acc@1 81.280 Acc@5 95.694 loss 0.888
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.44%
Epoch: [261]  [   0/1251]  eta: 1:08:55  lr: 0.000189  min_lr: 0.000189  loss: 2.3350 (2.3350)  weight_decay: 0.0500 (0.0500)  time: 3.3056  data: 2.8543  max mem: 18117
Epoch: [261]  [ 200/1251]  eta: 0:04:27  lr: 0.000188  min_lr: 0.000188  loss: 2.6283 (2.6630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0438 (1.0691)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [261]  [ 400/1251]  eta: 0:03:30  lr: 0.000186  min_lr: 0.000186  loss: 2.5776 (2.6849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9415 (1.0226)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [261]  [ 600/1251]  eta: 0:02:38  lr: 0.000185  min_lr: 0.000185  loss: 2.8994 (2.6787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (1.0088)  time: 0.2428  data: 0.0004  max mem: 18117
Epoch: [261]  [ 800/1251]  eta: 0:01:49  lr: 0.000183  min_lr: 0.000183  loss: 2.0070 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (1.0189)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [261]  [1000/1251]  eta: 0:01:00  lr: 0.000182  min_lr: 0.000182  loss: 3.0755 (2.6736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0040 (1.0182)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [261]  [1200/1251]  eta: 0:00:12  lr: 0.000180  min_lr: 0.000180  loss: 2.1035 (2.6657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9930 (1.0147)  time: 0.2445  data: 0.0005  max mem: 18117
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 3.0270 (2.6720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9633 (1.0136)  time: 0.1984  data: 0.0010  max mem: 18117
Epoch: [261] Total time: 0:05:03 (0.2424 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 3.0270 (2.6318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9633 (1.0136)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6173 (0.6173)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.4399  data: 5.3152  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.8140 (0.7644)  acc1: 84.8000 (85.0909)  acc5: 97.6000 (97.6364)  time: 0.7437  data: 0.6310  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9300 (0.9175)  acc1: 79.6000 (81.4476)  acc5: 95.6000 (95.9429)  time: 0.2151  data: 0.1048  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 1.0114 (0.9276)  acc1: 78.4000 (80.8480)  acc5: 95.2000 (95.7920)  time: 0.2134  data: 0.1047  max mem: 18117
Test: Total time: 0:00:10 (0.4109 s / it)
* Acc@1 81.228 Acc@5 95.618 loss 0.919
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.44%
Epoch: [262]  [   0/1251]  eta: 1:05:12  lr: 0.000180  min_lr: 0.000180  loss: 3.3614 (3.3614)  weight_decay: 0.0500 (0.0500)  time: 3.1277  data: 1.5735  max mem: 18117
Epoch: [262]  [ 200/1251]  eta: 0:04:27  lr: 0.000179  min_lr: 0.000179  loss: 2.4484 (2.6768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9813 (1.0207)  time: 0.2428  data: 0.0004  max mem: 18117
Epoch: [262]  [ 400/1251]  eta: 0:03:30  lr: 0.000177  min_lr: 0.000177  loss: 3.2163 (2.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9908 (1.0169)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [262]  [ 600/1251]  eta: 0:02:38  lr: 0.000176  min_lr: 0.000176  loss: 3.1263 (2.6745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (1.0193)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [262]  [ 800/1251]  eta: 0:01:49  lr: 0.000174  min_lr: 0.000174  loss: 2.6462 (2.6692)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0632 (1.0247)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [262]  [1000/1251]  eta: 0:01:00  lr: 0.000173  min_lr: 0.000173  loss: 3.0491 (2.6717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1187 (1.0301)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [262]  [1200/1251]  eta: 0:00:12  lr: 0.000171  min_lr: 0.000171  loss: 2.8044 (2.6590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0746 (1.0366)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 1.9894 (2.6469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9966 (1.0356)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [262] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 1.9894 (2.6364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9966 (1.0356)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5551 (0.5551)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.7382  data: 5.6130  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7116 (0.6972)  acc1: 85.6000 (85.3455)  acc5: 97.6000 (97.5273)  time: 0.7274  data: 0.6152  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8784 (0.8435)  acc1: 79.6000 (81.6571)  acc5: 95.6000 (95.8286)  time: 0.1867  data: 0.0773  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9225 (0.8544)  acc1: 78.4000 (81.1200)  acc5: 94.8000 (95.7440)  time: 0.1906  data: 0.0825  max mem: 18117
Test: Total time: 0:00:10 (0.4038 s / it)
* Acc@1 81.418 Acc@5 95.766 loss 0.845
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.44%
Epoch: [263]  [   0/1251]  eta: 0:57:24  lr: 0.000171  min_lr: 0.000171  loss: 2.8586 (2.8586)  weight_decay: 0.0500 (0.0500)  time: 2.7530  data: 1.6446  max mem: 18117
Epoch: [263]  [ 200/1251]  eta: 0:04:26  lr: 0.000169  min_lr: 0.000169  loss: 2.7656 (2.6308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0894 (1.0671)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [263]  [ 400/1251]  eta: 0:03:29  lr: 0.000168  min_lr: 0.000168  loss: 2.2558 (2.6175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (1.0289)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [263]  [ 600/1251]  eta: 0:02:38  lr: 0.000167  min_lr: 0.000167  loss: 2.8281 (2.6091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (1.0204)  time: 0.2388  data: 0.0003  max mem: 18117
Epoch: [263]  [ 800/1251]  eta: 0:01:49  lr: 0.000165  min_lr: 0.000165  loss: 2.5920 (2.6001)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0162 (1.0337)  time: 0.2376  data: 0.0005  max mem: 18117
Epoch: [263]  [1000/1251]  eta: 0:01:00  lr: 0.000164  min_lr: 0.000164  loss: 2.6843 (2.6050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9392 (1.0178)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [263]  [1200/1251]  eta: 0:00:12  lr: 0.000162  min_lr: 0.000162  loss: 1.9679 (2.5973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9117 (1.0192)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.4699 (2.5953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9718 (1.0191)  time: 0.1955  data: 0.0007  max mem: 18117
Epoch: [263] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.4699 (2.6096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9718 (1.0191)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.5323 (0.5323)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.8829  data: 5.7412  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7257 (0.7019)  acc1: 85.2000 (84.8727)  acc5: 98.0000 (97.7818)  time: 0.7448  data: 0.6321  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9074 (0.8558)  acc1: 79.6000 (81.5048)  acc5: 96.0000 (96.0000)  time: 0.1966  data: 0.0838  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9406 (0.8673)  acc1: 79.2000 (81.0080)  acc5: 95.2000 (95.8400)  time: 0.1964  data: 0.0837  max mem: 18117
Test: Total time: 0:00:10 (0.4137 s / it)
* Acc@1 81.420 Acc@5 95.754 loss 0.855
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.44%
Epoch: [264]  [   0/1251]  eta: 1:08:31  lr: 0.000162  min_lr: 0.000162  loss: 3.6414 (3.6414)  weight_decay: 0.0500 (0.0500)  time: 3.2865  data: 1.5492  max mem: 18117
Epoch: [264]  [ 200/1251]  eta: 0:04:27  lr: 0.000160  min_lr: 0.000160  loss: 2.5811 (2.5965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9518 (1.0501)  time: 0.2417  data: 0.0005  max mem: 18117
Epoch: [264]  [ 400/1251]  eta: 0:03:29  lr: 0.000159  min_lr: 0.000159  loss: 2.9606 (2.6334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9743 (1.0442)  time: 0.2387  data: 0.0005  max mem: 18117
Epoch: [264]  [ 600/1251]  eta: 0:02:38  lr: 0.000158  min_lr: 0.000158  loss: 2.7448 (2.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.0359)  time: 0.2483  data: 0.0005  max mem: 18117
Epoch: [264]  [ 800/1251]  eta: 0:01:49  lr: 0.000156  min_lr: 0.000156  loss: 2.4940 (2.6511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9996 (1.0547)  time: 0.2432  data: 0.0005  max mem: 18117
Epoch: [264]  [1000/1251]  eta: 0:01:00  lr: 0.000155  min_lr: 0.000155  loss: 2.6396 (2.6506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0832 (1.0529)  time: 0.2381  data: 0.0005  max mem: 18117
Epoch: [264]  [1200/1251]  eta: 0:00:12  lr: 0.000154  min_lr: 0.000154  loss: 2.1565 (2.6536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9188 (1.0461)  time: 0.2366  data: 0.0005  max mem: 18117
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.4142 (2.6455)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0252 (1.0441)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [264] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.4142 (2.6243)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0252 (1.0441)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5553 (0.5553)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.7766  data: 5.6350  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7450 (0.7130)  acc1: 85.2000 (85.0182)  acc5: 97.6000 (97.6364)  time: 0.7491  data: 0.6352  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9130 (0.8634)  acc1: 79.6000 (81.5429)  acc5: 95.6000 (95.9810)  time: 0.2133  data: 0.1027  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9538 (0.8762)  acc1: 79.2000 (81.0240)  acc5: 95.2000 (95.7280)  time: 0.2132  data: 0.1026  max mem: 18117
Test: Total time: 0:00:10 (0.4225 s / it)
* Acc@1 81.448 Acc@5 95.696 loss 0.866
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.45%
Epoch: [265]  [   0/1251]  eta: 1:03:15  lr: 0.000153  min_lr: 0.000153  loss: 3.3094 (3.3094)  weight_decay: 0.0500 (0.0500)  time: 3.0337  data: 2.7079  max mem: 18117
Epoch: [265]  [ 200/1251]  eta: 0:04:28  lr: 0.000152  min_lr: 0.000152  loss: 2.0388 (2.5562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0189)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [265]  [ 400/1251]  eta: 0:03:30  lr: 0.000150  min_lr: 0.000150  loss: 1.8938 (2.5875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0045 (1.0226)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [265]  [ 600/1251]  eta: 0:02:38  lr: 0.000149  min_lr: 0.000149  loss: 2.4298 (2.5915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9252 (1.0215)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [265]  [ 800/1251]  eta: 0:01:49  lr: 0.000148  min_lr: 0.000148  loss: 2.8864 (2.6130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9148 (1.0132)  time: 0.2368  data: 0.0004  max mem: 18117
Epoch: [265]  [1000/1251]  eta: 0:01:00  lr: 0.000146  min_lr: 0.000146  loss: 2.7646 (2.6060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9142 (1.0100)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [265]  [1200/1251]  eta: 0:00:12  lr: 0.000145  min_lr: 0.000145  loss: 2.5669 (2.6111)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0245 (1.0129)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.6205 (2.6126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0096 (1.0123)  time: 0.1962  data: 0.0009  max mem: 18117
Epoch: [265] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.6205 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0096 (1.0123)
Test:  [ 0/25]  eta: 0:02:02  loss: 0.5874 (0.5874)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 4.8838  data: 4.7358  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7861 (0.7499)  acc1: 84.0000 (84.9091)  acc5: 97.6000 (97.7455)  time: 0.6631  data: 0.5475  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9257 (0.9068)  acc1: 79.2000 (81.2952)  acc5: 96.0000 (96.0571)  time: 0.2187  data: 0.1085  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9990 (0.9176)  acc1: 78.4000 (80.8000)  acc5: 95.2000 (95.8560)  time: 0.2003  data: 0.0917  max mem: 18117
Test: Total time: 0:00:09 (0.3907 s / it)
* Acc@1 81.354 Acc@5 95.714 loss 0.907
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.45%
Epoch: [266]  [   0/1251]  eta: 1:09:40  lr: 0.000145  min_lr: 0.000145  loss: 3.2563 (3.2563)  weight_decay: 0.0500 (0.0500)  time: 3.3418  data: 2.9843  max mem: 18117
Epoch: [266]  [ 200/1251]  eta: 0:04:27  lr: 0.000143  min_lr: 0.000143  loss: 2.0340 (2.6353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0451 (1.0500)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [266]  [ 400/1251]  eta: 0:03:29  lr: 0.000142  min_lr: 0.000142  loss: 2.5248 (2.5774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9415 (1.0418)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [266]  [ 600/1251]  eta: 0:02:38  lr: 0.000141  min_lr: 0.000141  loss: 1.9174 (2.5858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0487 (1.0573)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [266]  [ 800/1251]  eta: 0:01:49  lr: 0.000139  min_lr: 0.000139  loss: 2.7525 (2.5849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9190 (1.0410)  time: 0.2416  data: 0.0003  max mem: 18117
Epoch: [266]  [1000/1251]  eta: 0:01:00  lr: 0.000138  min_lr: 0.000138  loss: 2.8467 (2.5764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9553 (1.0351)  time: 0.2347  data: 0.0004  max mem: 18117
Epoch: [266]  [1200/1251]  eta: 0:00:12  lr: 0.000137  min_lr: 0.000137  loss: 2.5985 (2.5761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1111 (1.0478)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 1.9487 (2.5736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1351 (1.0521)  time: 0.1958  data: 0.0009  max mem: 18117
Epoch: [266] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 1.9487 (2.6125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1351 (1.0521)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5393 (0.5393)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.4011  data: 5.2732  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7391 (0.7107)  acc1: 84.8000 (85.1636)  acc5: 97.6000 (97.6000)  time: 0.7088  data: 0.5964  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9123 (0.8578)  acc1: 79.2000 (81.6571)  acc5: 96.0000 (96.1143)  time: 0.2012  data: 0.0917  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9192 (0.8689)  acc1: 78.8000 (81.3280)  acc5: 95.6000 (95.9040)  time: 0.2402  data: 0.1320  max mem: 18117
Test: Total time: 0:00:10 (0.4295 s / it)
* Acc@1 81.572 Acc@5 95.698 loss 0.859
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.57%
Epoch: [267]  [   0/1251]  eta: 1:06:23  lr: 0.000136  min_lr: 0.000136  loss: 2.0138 (2.0138)  weight_decay: 0.0500 (0.0500)  time: 3.1840  data: 2.9064  max mem: 18117
Epoch: [267]  [ 200/1251]  eta: 0:04:24  lr: 0.000135  min_lr: 0.000135  loss: 3.0778 (2.6753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9726 (1.0652)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [267]  [ 400/1251]  eta: 0:03:28  lr: 0.000134  min_lr: 0.000134  loss: 2.6332 (2.6006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9193 (1.0294)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [267]  [ 600/1251]  eta: 0:02:38  lr: 0.000133  min_lr: 0.000133  loss: 2.1141 (2.6138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1383 (1.0493)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [267]  [ 800/1251]  eta: 0:01:49  lr: 0.000131  min_lr: 0.000131  loss: 2.0397 (2.6327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0902 (1.0575)  time: 0.2415  data: 0.0004  max mem: 18117
Epoch: [267]  [1000/1251]  eta: 0:01:00  lr: 0.000130  min_lr: 0.000130  loss: 1.9425 (2.6215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0818 (1.0589)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [267]  [1200/1251]  eta: 0:00:12  lr: 0.000129  min_lr: 0.000129  loss: 1.9097 (2.6018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9879 (1.0575)  time: 0.2417  data: 0.0004  max mem: 18117
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.9639 (2.6006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9873 (1.0558)  time: 0.1956  data: 0.0006  max mem: 18117
Epoch: [267] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.9639 (2.6087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9873 (1.0558)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5819 (0.5819)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.2822  data: 5.1206  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7714 (0.7368)  acc1: 85.2000 (85.3818)  acc5: 98.0000 (97.6364)  time: 0.6515  data: 0.5356  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9266 (0.8917)  acc1: 79.2000 (81.6571)  acc5: 96.0000 (95.9619)  time: 0.1878  data: 0.0772  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9745 (0.9025)  acc1: 78.4000 (81.1360)  acc5: 95.6000 (95.8240)  time: 0.2055  data: 0.0954  max mem: 18117
Test: Total time: 0:00:09 (0.3993 s / it)
* Acc@1 81.492 Acc@5 95.664 loss 0.894
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.57%
Epoch: [268]  [   0/1251]  eta: 1:04:11  lr: 0.000128  min_lr: 0.000128  loss: 3.4547 (3.4547)  weight_decay: 0.0500 (0.0500)  time: 3.0788  data: 1.5558  max mem: 18117
Epoch: [268]  [ 200/1251]  eta: 0:04:28  lr: 0.000127  min_lr: 0.000127  loss: 2.1352 (2.6427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0439 (1.0600)  time: 0.2440  data: 0.0005  max mem: 18117
Epoch: [268]  [ 400/1251]  eta: 0:03:31  lr: 0.000126  min_lr: 0.000126  loss: 3.0542 (2.6341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0026 (1.0654)  time: 0.2386  data: 0.0003  max mem: 18117
Epoch: [268]  [ 600/1251]  eta: 0:02:40  lr: 0.000125  min_lr: 0.000125  loss: 2.8477 (2.6308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0123 (1.0629)  time: 0.2425  data: 0.0003  max mem: 18117
Epoch: [268]  [ 800/1251]  eta: 0:01:50  lr: 0.000123  min_lr: 0.000123  loss: 2.4160 (2.6117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9485 (1.0543)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [268]  [1000/1251]  eta: 0:01:01  lr: 0.000122  min_lr: 0.000122  loss: 2.0968 (2.6291)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0326 (1.0537)  time: 0.2424  data: 0.0005  max mem: 18117
Epoch: [268]  [1200/1251]  eta: 0:00:12  lr: 0.000121  min_lr: 0.000121  loss: 2.4189 (2.6192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9610 (1.0505)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.2782 (2.6244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.0482)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [268] Total time: 0:05:04 (0.2432 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.2782 (2.6045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.0482)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5692 (0.5692)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.7008  data: 5.5494  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7544 (0.7225)  acc1: 85.2000 (85.1636)  acc5: 97.6000 (97.7091)  time: 0.6997  data: 0.5850  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9096 (0.8738)  acc1: 79.6000 (81.6381)  acc5: 95.6000 (96.0000)  time: 0.1897  data: 0.0801  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9703 (0.8871)  acc1: 79.2000 (81.0560)  acc5: 95.6000 (95.8240)  time: 0.1891  data: 0.0800  max mem: 18117
Test: Total time: 0:00:10 (0.4002 s / it)
* Acc@1 81.510 Acc@5 95.756 loss 0.878
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.57%
Epoch: [269]  [   0/1251]  eta: 1:05:45  lr: 0.000121  min_lr: 0.000121  loss: 3.5110 (3.5110)  weight_decay: 0.0500 (0.0500)  time: 3.1540  data: 2.7467  max mem: 18117
Epoch: [269]  [ 200/1251]  eta: 0:04:27  lr: 0.000120  min_lr: 0.000120  loss: 2.8702 (2.6433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0696 (1.0472)  time: 0.2394  data: 0.0004  max mem: 18117
Epoch: [269]  [ 400/1251]  eta: 0:03:30  lr: 0.000118  min_lr: 0.000118  loss: 2.9739 (2.6603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0387 (1.0397)  time: 0.2402  data: 0.0003  max mem: 18117
Epoch: [269]  [ 600/1251]  eta: 0:02:39  lr: 0.000117  min_lr: 0.000117  loss: 2.8711 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0547 (1.0428)  time: 0.2401  data: 0.0005  max mem: 18117
Epoch: [269]  [ 800/1251]  eta: 0:01:49  lr: 0.000116  min_lr: 0.000116  loss: 3.1312 (2.6471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0228 (1.0660)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [269]  [1000/1251]  eta: 0:01:00  lr: 0.000115  min_lr: 0.000115  loss: 2.3438 (2.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0904 (1.0624)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [269]  [1200/1251]  eta: 0:00:12  lr: 0.000113  min_lr: 0.000113  loss: 2.7906 (2.6416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0393 (1.0638)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.8992 (2.6436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0158 (1.0637)  time: 0.1956  data: 0.0008  max mem: 18117
Epoch: [269] Total time: 0:05:03 (0.2422 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.8992 (2.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0158 (1.0637)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5461 (0.5461)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.2598  data: 5.1351  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7393 (0.7036)  acc1: 84.8000 (84.9091)  acc5: 97.2000 (97.5273)  time: 0.6335  data: 0.5200  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8925 (0.8575)  acc1: 78.8000 (81.2952)  acc5: 95.6000 (95.8095)  time: 0.1896  data: 0.0795  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9535 (0.8696)  acc1: 78.0000 (80.8800)  acc5: 95.2000 (95.7280)  time: 0.2102  data: 0.1013  max mem: 18117
Test: Total time: 0:00:10 (0.4010 s / it)
* Acc@1 81.488 Acc@5 95.740 loss 0.859
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.57%
Epoch: [270]  [   0/1251]  eta: 1:06:46  lr: 0.000113  min_lr: 0.000113  loss: 1.8925 (1.8925)  weight_decay: 0.0500 (0.0500)  time: 3.2028  data: 2.8544  max mem: 18117
Epoch: [270]  [ 200/1251]  eta: 0:04:29  lr: 0.000112  min_lr: 0.000112  loss: 1.9724 (2.5358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9843 (1.0426)  time: 0.2430  data: 0.0004  max mem: 18117
Epoch: [270]  [ 400/1251]  eta: 0:03:30  lr: 0.000111  min_lr: 0.000111  loss: 3.0222 (2.6041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (1.0368)  time: 0.2384  data: 0.0005  max mem: 18117
Epoch: [270]  [ 600/1251]  eta: 0:02:39  lr: 0.000110  min_lr: 0.000110  loss: 2.8444 (2.6343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0305 (1.0493)  time: 0.2355  data: 0.0004  max mem: 18117
Epoch: [270]  [ 800/1251]  eta: 0:01:49  lr: 0.000109  min_lr: 0.000109  loss: 2.3905 (2.6034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0268 (1.0465)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [270]  [1000/1251]  eta: 0:01:00  lr: 0.000107  min_lr: 0.000107  loss: 2.4083 (2.6127)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0776 (1.0517)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [270]  [1200/1251]  eta: 0:00:12  lr: 0.000106  min_lr: 0.000106  loss: 2.2768 (2.6051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0749 (1.0513)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.7529 (2.6022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0555 (1.0492)  time: 0.1959  data: 0.0008  max mem: 18117
Epoch: [270] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.7529 (2.6154)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0555 (1.0492)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.5881 (0.5881)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 4.3897  data: 4.2609  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7676 (0.7385)  acc1: 85.6000 (85.1636)  acc5: 97.6000 (97.4546)  time: 0.6299  data: 0.5196  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9170 (0.8896)  acc1: 79.2000 (81.5048)  acc5: 95.6000 (95.8476)  time: 0.2271  data: 0.1188  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9867 (0.9013)  acc1: 78.8000 (81.0880)  acc5: 95.2000 (95.7440)  time: 0.2076  data: 0.0994  max mem: 18117
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 81.526 Acc@5 95.730 loss 0.889
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.57%
Epoch: [271]  [   0/1251]  eta: 1:06:47  lr: 0.000106  min_lr: 0.000106  loss: 1.8139 (1.8139)  weight_decay: 0.0500 (0.0500)  time: 3.2031  data: 2.3229  max mem: 18117
Epoch: [271]  [ 200/1251]  eta: 0:04:26  lr: 0.000105  min_lr: 0.000105  loss: 2.8001 (2.6800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9662 (1.0194)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [271]  [ 400/1251]  eta: 0:03:29  lr: 0.000104  min_lr: 0.000104  loss: 2.2835 (2.6442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0195 (1.0400)  time: 0.2364  data: 0.0003  max mem: 18117
Epoch: [271]  [ 600/1251]  eta: 0:02:38  lr: 0.000102  min_lr: 0.000102  loss: 2.0145 (2.6280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0608 (1.0343)  time: 0.2343  data: 0.0003  max mem: 18117
Epoch: [271]  [ 800/1251]  eta: 0:01:49  lr: 0.000101  min_lr: 0.000101  loss: 2.5902 (2.6313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9662 (1.0275)  time: 0.2367  data: 0.0003  max mem: 18117
Epoch: [271]  [1000/1251]  eta: 0:01:00  lr: 0.000100  min_lr: 0.000100  loss: 2.8279 (2.6244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0014 (1.0315)  time: 0.2498  data: 0.0004  max mem: 18117
Epoch: [271]  [1200/1251]  eta: 0:00:12  lr: 0.000099  min_lr: 0.000099  loss: 2.9812 (2.6238)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0116 (1.0342)  time: 0.2384  data: 0.0005  max mem: 18117
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.8141 (2.6220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (1.0314)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [271] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.8141 (2.5949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (1.0314)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6008 (0.6008)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.6189  data: 5.4734  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7755 (0.7436)  acc1: 85.2000 (85.1636)  acc5: 97.6000 (97.7091)  time: 0.7327  data: 0.6184  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9233 (0.8978)  acc1: 79.2000 (81.4667)  acc5: 95.6000 (95.8667)  time: 0.1952  data: 0.0854  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9736 (0.9089)  acc1: 78.8000 (81.0400)  acc5: 95.2000 (95.7760)  time: 0.1963  data: 0.0855  max mem: 18117
Test: Total time: 0:00:10 (0.4050 s / it)
* Acc@1 81.474 Acc@5 95.684 loss 0.902
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.57%
Epoch: [272]  [   0/1251]  eta: 1:06:42  lr: 0.000099  min_lr: 0.000099  loss: 3.6518 (3.6518)  weight_decay: 0.0500 (0.0500)  time: 3.1995  data: 2.6785  max mem: 18117
Epoch: [272]  [ 200/1251]  eta: 0:04:26  lr: 0.000098  min_lr: 0.000098  loss: 2.0820 (2.5958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0035 (1.0676)  time: 0.2364  data: 0.0003  max mem: 18117
Epoch: [272]  [ 400/1251]  eta: 0:03:28  lr: 0.000097  min_lr: 0.000097  loss: 2.1775 (2.6200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0806 (nan)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [272]  [ 600/1251]  eta: 0:02:37  lr: 0.000096  min_lr: 0.000096  loss: 2.8228 (2.6351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0517 (nan)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [272]  [ 800/1251]  eta: 0:01:48  lr: 0.000094  min_lr: 0.000094  loss: 2.2574 (2.6173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (nan)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [272]  [1000/1251]  eta: 0:01:00  lr: 0.000093  min_lr: 0.000093  loss: 2.2007 (2.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9904 (nan)  time: 0.2400  data: 0.0004  max mem: 18117
Epoch: [272]  [1200/1251]  eta: 0:00:12  lr: 0.000092  min_lr: 0.000092  loss: 2.0778 (2.6102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0102 (nan)  time: 0.2368  data: 0.0005  max mem: 18117
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.7042 (2.6150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9572 (nan)  time: 0.1966  data: 0.0008  max mem: 18117
Epoch: [272] Total time: 0:05:00 (0.2403 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.7042 (2.5964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9572 (nan)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5637 (0.5637)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.3018  data: 5.1745  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7473 (0.7201)  acc1: 85.6000 (85.1273)  acc5: 97.2000 (97.7091)  time: 0.7517  data: 0.6382  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9034 (0.8756)  acc1: 79.2000 (81.5619)  acc5: 95.6000 (96.0000)  time: 0.2114  data: 0.0994  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9641 (0.8868)  acc1: 78.8000 (81.0400)  acc5: 95.2000 (95.9200)  time: 0.2090  data: 0.0981  max mem: 18117
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 81.552 Acc@5 95.756 loss 0.878
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.57%
Epoch: [273]  [   0/1251]  eta: 1:10:23  lr: 0.000092  min_lr: 0.000092  loss: 2.9067 (2.9067)  weight_decay: 0.0500 (0.0500)  time: 3.3760  data: 2.3716  max mem: 18117
Epoch: [273]  [ 200/1251]  eta: 0:04:29  lr: 0.000091  min_lr: 0.000091  loss: 2.2099 (2.5436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0407 (1.0380)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [273]  [ 400/1251]  eta: 0:03:31  lr: 0.000090  min_lr: 0.000090  loss: 1.9464 (2.5279)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0136 (1.0508)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [273]  [ 600/1251]  eta: 0:02:39  lr: 0.000089  min_lr: 0.000089  loss: 1.8777 (2.5355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9694 (1.0395)  time: 0.2416  data: 0.0004  max mem: 18117
Epoch: [273]  [ 800/1251]  eta: 0:01:49  lr: 0.000088  min_lr: 0.000088  loss: 2.6745 (2.5663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0915 (1.0516)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [273]  [1000/1251]  eta: 0:01:00  lr: 0.000087  min_lr: 0.000087  loss: 3.1076 (2.5715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (1.0620)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [273]  [1200/1251]  eta: 0:00:12  lr: 0.000086  min_lr: 0.000086  loss: 2.9628 (2.5813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9357 (1.0564)  time: 0.2423  data: 0.0004  max mem: 18117
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.8772 (2.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9695 (1.0556)  time: 0.1954  data: 0.0008  max mem: 18117
Epoch: [273] Total time: 0:05:02 (0.2419 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.8772 (2.6096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9695 (1.0556)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6003 (0.6003)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.8006  data: 5.6747  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7936 (0.7593)  acc1: 85.2000 (85.3091)  acc5: 97.2000 (97.4545)  time: 0.7729  data: 0.6591  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9251 (0.9124)  acc1: 78.8000 (81.6571)  acc5: 96.0000 (95.7905)  time: 0.2083  data: 0.0959  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9906 (0.9227)  acc1: 78.8000 (81.1840)  acc5: 95.2000 (95.7280)  time: 0.2072  data: 0.0958  max mem: 18117
Test: Total time: 0:00:10 (0.4192 s / it)
* Acc@1 81.566 Acc@5 95.726 loss 0.912
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.57%
Epoch: [274]  [   0/1251]  eta: 1:08:01  lr: 0.000085  min_lr: 0.000085  loss: 2.3027 (2.3027)  weight_decay: 0.0500 (0.0500)  time: 3.2628  data: 2.5814  max mem: 18117
Epoch: [274]  [ 200/1251]  eta: 0:04:25  lr: 0.000084  min_lr: 0.000084  loss: 2.3937 (2.6136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9949 (1.0688)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [274]  [ 400/1251]  eta: 0:03:28  lr: 0.000083  min_lr: 0.000083  loss: 1.9445 (2.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (1.0525)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [274]  [ 600/1251]  eta: 0:02:38  lr: 0.000082  min_lr: 0.000082  loss: 2.4892 (2.6239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9662 (1.0539)  time: 0.2428  data: 0.0005  max mem: 18117
Epoch: [274]  [ 800/1251]  eta: 0:01:49  lr: 0.000081  min_lr: 0.000081  loss: 3.0486 (2.6055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0186 (1.0516)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [274]  [1000/1251]  eta: 0:01:00  lr: 0.000080  min_lr: 0.000080  loss: 2.4788 (2.5998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0181 (1.0496)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [274]  [1200/1251]  eta: 0:00:12  lr: 0.000079  min_lr: 0.000079  loss: 2.8347 (2.6026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0125 (1.0501)  time: 0.2382  data: 0.0004  max mem: 18117
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.7969 (2.6010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9686 (1.0490)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [274] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.7969 (2.6046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9686 (1.0490)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6002 (0.6002)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.7362  data: 5.6093  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7741 (0.7518)  acc1: 86.0000 (84.9091)  acc5: 97.6000 (97.6364)  time: 0.7597  data: 0.6468  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9414 (0.9032)  acc1: 78.8000 (81.5429)  acc5: 96.0000 (95.9810)  time: 0.2171  data: 0.1073  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9930 (0.9143)  acc1: 78.8000 (81.1200)  acc5: 95.2000 (95.8560)  time: 0.2154  data: 0.1072  max mem: 18117
Test: Total time: 0:00:10 (0.4238 s / it)
* Acc@1 81.558 Acc@5 95.760 loss 0.906
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.57%
Epoch: [275]  [   0/1251]  eta: 1:09:00  lr: 0.000079  min_lr: 0.000079  loss: 3.3935 (3.3935)  weight_decay: 0.0500 (0.0500)  time: 3.3094  data: 2.5860  max mem: 18117
Epoch: [275]  [ 200/1251]  eta: 0:04:31  lr: 0.000078  min_lr: 0.000078  loss: 2.4797 (2.6003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9820 (1.0173)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [275]  [ 400/1251]  eta: 0:03:31  lr: 0.000077  min_lr: 0.000077  loss: 2.6026 (2.6123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0018 (1.0341)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [275]  [ 600/1251]  eta: 0:02:40  lr: 0.000076  min_lr: 0.000076  loss: 1.9723 (2.5949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9998 (1.0398)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [275]  [ 800/1251]  eta: 0:01:50  lr: 0.000075  min_lr: 0.000075  loss: 2.9045 (2.5966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0370 (1.0462)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [275]  [1000/1251]  eta: 0:01:01  lr: 0.000074  min_lr: 0.000074  loss: 2.2881 (2.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9681 (1.0420)  time: 0.2404  data: 0.0005  max mem: 18117
Epoch: [275]  [1200/1251]  eta: 0:00:12  lr: 0.000073  min_lr: 0.000073  loss: 2.0272 (2.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0081 (1.0412)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.5581 (2.5840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9876 (1.0378)  time: 0.1953  data: 0.0008  max mem: 18117
Epoch: [275] Total time: 0:05:03 (0.2426 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.5581 (2.5956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9876 (1.0378)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5479 (0.5479)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.3759  data: 5.2356  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7077 (0.6898)  acc1: 86.4000 (85.2727)  acc5: 97.6000 (97.4909)  time: 0.7626  data: 0.6482  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8808 (0.8360)  acc1: 80.4000 (81.7714)  acc5: 95.6000 (95.9238)  time: 0.2267  data: 0.1168  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9196 (0.8489)  acc1: 78.8000 (81.2000)  acc5: 94.8000 (95.7760)  time: 0.2196  data: 0.1114  max mem: 18117
Test: Total time: 0:00:10 (0.4184 s / it)
* Acc@1 81.686 Acc@5 95.776 loss 0.842
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.69%
Epoch: [276]  [   0/1251]  eta: 1:01:23  lr: 0.000073  min_lr: 0.000073  loss: 1.8526 (1.8526)  weight_decay: 0.0500 (0.0500)  time: 2.9442  data: 2.6130  max mem: 18117
Epoch: [276]  [ 200/1251]  eta: 0:04:25  lr: 0.000072  min_lr: 0.000072  loss: 2.8684 (2.5487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9474 (1.0133)  time: 0.2387  data: 0.0005  max mem: 18117
Epoch: [276]  [ 400/1251]  eta: 0:03:29  lr: 0.000071  min_lr: 0.000071  loss: 2.4844 (2.5701)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0135 (1.0026)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [276]  [ 600/1251]  eta: 0:02:38  lr: 0.000070  min_lr: 0.000070  loss: 2.6415 (2.5869)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0612 (1.0179)  time: 0.2369  data: 0.0004  max mem: 18117
Epoch: [276]  [ 800/1251]  eta: 0:01:49  lr: 0.000069  min_lr: 0.000069  loss: 1.9083 (2.5823)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0793 (1.0238)  time: 0.2386  data: 0.0006  max mem: 18117
Epoch: [276]  [1000/1251]  eta: 0:01:00  lr: 0.000068  min_lr: 0.000068  loss: 2.3600 (2.5544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9649 (1.0244)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [276]  [1200/1251]  eta: 0:00:12  lr: 0.000067  min_lr: 0.000067  loss: 2.3682 (2.5719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (1.0193)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.5983 (2.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (1.0168)  time: 0.1965  data: 0.0009  max mem: 18117
Epoch: [276] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.5983 (2.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (1.0168)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5661 (0.5661)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.8412  data: 5.6954  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7361 (0.7159)  acc1: 86.0000 (85.7455)  acc5: 97.6000 (97.4909)  time: 0.7570  data: 0.6431  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9195 (0.8659)  acc1: 79.6000 (81.9429)  acc5: 95.6000 (95.9429)  time: 0.2054  data: 0.0960  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9486 (0.8783)  acc1: 79.2000 (81.3440)  acc5: 95.2000 (95.8560)  time: 0.2049  data: 0.0959  max mem: 18117
Test: Total time: 0:00:10 (0.4187 s / it)
* Acc@1 81.602 Acc@5 95.728 loss 0.870
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.69%
Epoch: [277]  [   0/1251]  eta: 1:10:50  lr: 0.000067  min_lr: 0.000067  loss: 1.7795 (1.7795)  weight_decay: 0.0500 (0.0500)  time: 3.3975  data: 2.4197  max mem: 18117
Epoch: [277]  [ 200/1251]  eta: 0:04:26  lr: 0.000066  min_lr: 0.000066  loss: 2.1656 (2.6010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9611 (1.0177)  time: 0.2376  data: 0.0004  max mem: 18117
Epoch: [277]  [ 400/1251]  eta: 0:03:30  lr: 0.000065  min_lr: 0.000065  loss: 1.9261 (2.5935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9519 (1.0254)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [277]  [ 600/1251]  eta: 0:02:39  lr: 0.000064  min_lr: 0.000064  loss: 2.8045 (2.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0755 (1.0317)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [277]  [ 800/1251]  eta: 0:01:49  lr: 0.000064  min_lr: 0.000064  loss: 2.7067 (2.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.0319)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [277]  [1000/1251]  eta: 0:01:00  lr: 0.000063  min_lr: 0.000063  loss: 2.8360 (2.5820)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0370 (1.0354)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [277]  [1200/1251]  eta: 0:00:12  lr: 0.000062  min_lr: 0.000062  loss: 2.2874 (2.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0691 (1.0400)  time: 0.2454  data: 0.0004  max mem: 18117
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.4816 (2.5862)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0803 (1.0404)  time: 0.1951  data: 0.0005  max mem: 18117
Epoch: [277] Total time: 0:05:02 (0.2418 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.4816 (2.6027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0803 (1.0404)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5983 (0.5983)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.8223  data: 5.6955  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7606 (0.7425)  acc1: 85.6000 (85.3091)  acc5: 98.0000 (97.6364)  time: 0.7582  data: 0.6456  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9393 (0.8918)  acc1: 80.0000 (81.7905)  acc5: 96.0000 (96.0191)  time: 0.2024  data: 0.0928  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9591 (0.9031)  acc1: 78.8000 (81.2000)  acc5: 95.6000 (95.9040)  time: 0.2015  data: 0.0927  max mem: 18117
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 81.594 Acc@5 95.730 loss 0.895
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.69%
Epoch: [278]  [   0/1251]  eta: 1:09:54  lr: 0.000062  min_lr: 0.000062  loss: 1.7640 (1.7640)  weight_decay: 0.0500 (0.0500)  time: 3.3532  data: 1.6656  max mem: 18117
Epoch: [278]  [ 200/1251]  eta: 0:04:28  lr: 0.000061  min_lr: 0.000061  loss: 2.8478 (2.6078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9777 (1.0326)  time: 0.2412  data: 0.0004  max mem: 18117
Epoch: [278]  [ 400/1251]  eta: 0:03:29  lr: 0.000060  min_lr: 0.000060  loss: 2.4013 (2.5720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0350 (1.0406)  time: 0.2365  data: 0.0004  max mem: 18117
Epoch: [278]  [ 600/1251]  eta: 0:02:39  lr: 0.000059  min_lr: 0.000059  loss: 2.4381 (2.5995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9379 (1.0266)  time: 0.2394  data: 0.0005  max mem: 18117
Epoch: [278]  [ 800/1251]  eta: 0:01:49  lr: 0.000058  min_lr: 0.000058  loss: 3.0146 (2.6129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0432 (1.0271)  time: 0.2362  data: 0.0004  max mem: 18117
Epoch: [278]  [1000/1251]  eta: 0:01:00  lr: 0.000057  min_lr: 0.000057  loss: 2.6531 (2.6066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9734 (1.0288)  time: 0.2389  data: 0.0005  max mem: 18117
Epoch: [278]  [1200/1251]  eta: 0:00:12  lr: 0.000056  min_lr: 0.000056  loss: 2.6846 (2.6101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1043 (1.0372)  time: 0.2382  data: 0.0005  max mem: 18117
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.0619 (2.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0681 (1.0382)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [278] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.0619 (2.5859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0681 (1.0382)
Test:  [ 0/25]  eta: 0:01:56  loss: 0.5465 (0.5465)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 4.6628  data: 4.5114  max mem: 18117
Test:  [10/25]  eta: 0:00:09  loss: 0.7208 (0.6949)  acc1: 86.8000 (85.7818)  acc5: 97.6000 (97.5273)  time: 0.6618  data: 0.5459  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8956 (0.8461)  acc1: 79.6000 (82.0000)  acc5: 96.0000 (95.9429)  time: 0.2135  data: 0.1026  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9111 (0.8582)  acc1: 79.6000 (81.5200)  acc5: 95.2000 (95.8400)  time: 0.2307  data: 0.1207  max mem: 18117
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 81.696 Acc@5 95.714 loss 0.851
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.70%
Epoch: [279]  [   0/1251]  eta: 1:07:58  lr: 0.000056  min_lr: 0.000056  loss: 1.9485 (1.9485)  weight_decay: 0.0500 (0.0500)  time: 3.2601  data: 2.9815  max mem: 18117
Epoch: [279]  [ 200/1251]  eta: 0:04:25  lr: 0.000055  min_lr: 0.000055  loss: 1.8601 (2.5865)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0305 (1.0439)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [279]  [ 400/1251]  eta: 0:03:28  lr: 0.000055  min_lr: 0.000055  loss: 2.1273 (2.5848)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0173 (1.0432)  time: 0.2383  data: 0.0005  max mem: 18117
Epoch: [279]  [ 600/1251]  eta: 0:02:38  lr: 0.000054  min_lr: 0.000054  loss: 2.2116 (2.5717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0347 (1.0422)  time: 0.2397  data: 0.0004  max mem: 18117
Epoch: [279]  [ 800/1251]  eta: 0:01:49  lr: 0.000053  min_lr: 0.000053  loss: 2.0863 (2.5767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9916 (1.0416)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [279]  [1000/1251]  eta: 0:01:00  lr: 0.000052  min_lr: 0.000052  loss: 2.0044 (2.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0035 (1.0406)  time: 0.2384  data: 0.0004  max mem: 18117
Epoch: [279]  [1200/1251]  eta: 0:00:12  lr: 0.000051  min_lr: 0.000051  loss: 2.9617 (2.5724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0195 (1.0380)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.4587 (2.5720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0571 (1.0388)  time: 0.1961  data: 0.0008  max mem: 18117
Epoch: [279] Total time: 0:05:01 (0.2411 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.4587 (2.5882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0571 (1.0388)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5703 (0.5703)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.5993  data: 5.4511  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7404 (0.7202)  acc1: 85.6000 (85.2727)  acc5: 97.6000 (97.6364)  time: 0.7649  data: 0.6493  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9118 (0.8718)  acc1: 79.2000 (81.6571)  acc5: 95.6000 (95.9810)  time: 0.2225  data: 0.1087  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9385 (0.8832)  acc1: 79.2000 (81.2000)  acc5: 95.2000 (95.8880)  time: 0.2224  data: 0.1085  max mem: 18117
Test: Total time: 0:00:10 (0.4229 s / it)
* Acc@1 81.546 Acc@5 95.788 loss 0.876
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.70%
Epoch: [280]  [   0/1251]  eta: 1:05:36  lr: 0.000051  min_lr: 0.000051  loss: 2.1853 (2.1853)  weight_decay: 0.0500 (0.0500)  time: 3.1465  data: 2.5314  max mem: 18117
Epoch: [280]  [ 200/1251]  eta: 0:04:28  lr: 0.000050  min_lr: 0.000050  loss: 2.8197 (2.5871)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0225 (1.0240)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [280]  [ 400/1251]  eta: 0:03:31  lr: 0.000050  min_lr: 0.000050  loss: 2.2157 (2.6443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0046 (1.0176)  time: 0.2423  data: 0.0005  max mem: 18117
Epoch: [280]  [ 600/1251]  eta: 0:02:39  lr: 0.000049  min_lr: 0.000049  loss: 2.5609 (2.6174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9923 (1.0194)  time: 0.2410  data: 0.0004  max mem: 18117
Epoch: [280]  [ 800/1251]  eta: 0:01:49  lr: 0.000048  min_lr: 0.000048  loss: 2.3938 (2.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9609 (1.0160)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [280]  [1000/1251]  eta: 0:01:00  lr: 0.000047  min_lr: 0.000047  loss: 2.1190 (2.5735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9867 (1.0170)  time: 0.2355  data: 0.0005  max mem: 18117
Epoch: [280]  [1200/1251]  eta: 0:00:12  lr: 0.000046  min_lr: 0.000046  loss: 3.1151 (2.5774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9955 (1.0188)  time: 0.2358  data: 0.0005  max mem: 18117
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.0323 (2.5739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (1.0206)  time: 0.1952  data: 0.0007  max mem: 18117
Epoch: [280] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.0323 (2.5866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (1.0206)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5562 (0.5562)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7783  data: 5.6258  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7291 (0.7088)  acc1: 86.0000 (85.3091)  acc5: 97.6000 (97.7091)  time: 0.7257  data: 0.6101  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9175 (0.8578)  acc1: 79.2000 (81.6000)  acc5: 96.0000 (96.0571)  time: 0.2205  data: 0.1104  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9194 (0.8699)  acc1: 78.4000 (80.9920)  acc5: 95.6000 (95.9360)  time: 0.2199  data: 0.1103  max mem: 18117
Test: Total time: 0:00:10 (0.4283 s / it)
* Acc@1 81.538 Acc@5 95.756 loss 0.862
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.70%
Epoch: [281]  [   0/1251]  eta: 1:10:11  lr: 0.000046  min_lr: 0.000046  loss: 2.5920 (2.5920)  weight_decay: 0.0500 (0.0500)  time: 3.3662  data: 2.9889  max mem: 18117
Epoch: [281]  [ 200/1251]  eta: 0:04:25  lr: 0.000046  min_lr: 0.000046  loss: 1.9722 (2.5777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9684 (1.0025)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [281]  [ 400/1251]  eta: 0:03:29  lr: 0.000045  min_lr: 0.000045  loss: 2.0757 (2.5499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0108 (1.0144)  time: 0.2465  data: 0.0004  max mem: 18117
Epoch: [281]  [ 600/1251]  eta: 0:02:39  lr: 0.000044  min_lr: 0.000044  loss: 2.2208 (2.5684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0385 (1.0183)  time: 0.2425  data: 0.0006  max mem: 18117
Epoch: [281]  [ 800/1251]  eta: 0:01:49  lr: 0.000043  min_lr: 0.000043  loss: 1.9572 (2.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9837 (1.0200)  time: 0.2379  data: 0.0003  max mem: 18117
Epoch: [281]  [1000/1251]  eta: 0:01:00  lr: 0.000043  min_lr: 0.000043  loss: 2.1643 (2.5840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9666 (1.0194)  time: 0.2382  data: 0.0003  max mem: 18117
Epoch: [281]  [1200/1251]  eta: 0:00:12  lr: 0.000042  min_lr: 0.000042  loss: 2.4806 (2.5810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9887 (1.0213)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.1035 (2.5814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9737 (1.0232)  time: 0.1956  data: 0.0007  max mem: 18117
Epoch: [281] Total time: 0:05:02 (0.2422 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.1035 (2.5936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9737 (1.0232)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5561 (0.5561)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.1658  data: 5.0365  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7322 (0.7108)  acc1: 85.2000 (85.3455)  acc5: 97.6000 (97.5273)  time: 0.7536  data: 0.6402  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9199 (0.8601)  acc1: 79.6000 (81.7714)  acc5: 96.0000 (95.9429)  time: 0.2293  data: 0.1193  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9199 (0.8718)  acc1: 79.2000 (81.2160)  acc5: 95.6000 (95.8560)  time: 0.2242  data: 0.1154  max mem: 18117
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 81.670 Acc@5 95.728 loss 0.864
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.70%
Epoch: [282]  [   0/1251]  eta: 1:09:22  lr: 0.000042  min_lr: 0.000042  loss: 1.7365 (1.7365)  weight_decay: 0.0500 (0.0500)  time: 3.3275  data: 2.1407  max mem: 18117
Epoch: [282]  [ 200/1251]  eta: 0:04:27  lr: 0.000041  min_lr: 0.000041  loss: 2.1668 (2.5334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9926 (1.0072)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [282]  [ 400/1251]  eta: 0:03:29  lr: 0.000040  min_lr: 0.000040  loss: 2.5056 (2.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9721 (1.0058)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [282]  [ 600/1251]  eta: 0:02:38  lr: 0.000040  min_lr: 0.000040  loss: 2.8908 (2.6186)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0453 (1.0176)  time: 0.2364  data: 0.0004  max mem: 18117
Epoch: [282]  [ 800/1251]  eta: 0:01:49  lr: 0.000039  min_lr: 0.000039  loss: 2.3465 (2.5866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9563 (1.0216)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [282]  [1000/1251]  eta: 0:01:00  lr: 0.000038  min_lr: 0.000038  loss: 2.3963 (2.6050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9698 (1.0213)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [282]  [1200/1251]  eta: 0:00:12  lr: 0.000037  min_lr: 0.000037  loss: 2.2282 (2.6015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9620 (1.0239)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.4175 (2.6044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0247 (1.0264)  time: 0.1959  data: 0.0006  max mem: 18117
Epoch: [282] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.4175 (2.5969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0247 (1.0264)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.5791 (0.5791)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.9599  data: 5.8332  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7476 (0.7283)  acc1: 86.0000 (85.6727)  acc5: 97.6000 (97.6000)  time: 0.7413  data: 0.6291  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9323 (0.8823)  acc1: 79.6000 (81.7905)  acc5: 96.0000 (95.9810)  time: 0.1993  data: 0.0898  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9461 (0.8938)  acc1: 79.2000 (81.2640)  acc5: 95.2000 (95.8560)  time: 0.2009  data: 0.0926  max mem: 18117
Test: Total time: 0:00:10 (0.4214 s / it)
* Acc@1 81.634 Acc@5 95.760 loss 0.885
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.70%
Epoch: [283]  [   0/1251]  eta: 1:10:26  lr: 0.000037  min_lr: 0.000037  loss: 1.7120 (1.7120)  weight_decay: 0.0500 (0.0500)  time: 3.3782  data: 2.2399  max mem: 18117
Epoch: [283]  [ 200/1251]  eta: 0:04:28  lr: 0.000037  min_lr: 0.000037  loss: 2.3871 (2.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0307 (1.0602)  time: 0.2425  data: 0.0004  max mem: 18117
Epoch: [283]  [ 400/1251]  eta: 0:03:29  lr: 0.000036  min_lr: 0.000036  loss: 2.9368 (2.6092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9999 (1.0486)  time: 0.2363  data: 0.0004  max mem: 18117
Epoch: [283]  [ 600/1251]  eta: 0:02:38  lr: 0.000035  min_lr: 0.000035  loss: 2.9429 (2.6161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9387 (1.0395)  time: 0.2335  data: 0.0004  max mem: 18117
Epoch: [283]  [ 800/1251]  eta: 0:01:49  lr: 0.000035  min_lr: 0.000035  loss: 2.1072 (2.5984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0099 (1.0363)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [283]  [1000/1251]  eta: 0:01:00  lr: 0.000034  min_lr: 0.000034  loss: 2.3505 (2.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9901 (1.0424)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [283]  [1200/1251]  eta: 0:00:12  lr: 0.000033  min_lr: 0.000033  loss: 2.5981 (2.6009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9885 (1.0375)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.2114 (2.5983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0115 (1.0370)  time: 0.1958  data: 0.0006  max mem: 18117
Epoch: [283] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.2114 (2.5798)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0115 (1.0370)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5420 (0.5420)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.2907  data: 5.1257  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7188 (0.6928)  acc1: 85.6000 (85.3455)  acc5: 97.6000 (97.6727)  time: 0.6697  data: 0.5552  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8949 (0.8423)  acc1: 79.6000 (81.8286)  acc5: 95.6000 (96.1333)  time: 0.1868  data: 0.0776  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9060 (0.8539)  acc1: 79.2000 (81.3280)  acc5: 95.6000 (96.0000)  time: 0.2175  data: 0.1084  max mem: 18117
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 81.706 Acc@5 95.806 loss 0.847
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.71%
Epoch: [284]  [   0/1251]  eta: 1:06:08  lr: 0.000033  min_lr: 0.000033  loss: 2.2607 (2.2607)  weight_decay: 0.0500 (0.0500)  time: 3.1722  data: 2.8773  max mem: 18117
Epoch: [284]  [ 200/1251]  eta: 0:04:26  lr: 0.000032  min_lr: 0.000032  loss: 1.9893 (2.5237)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [284]  [ 400/1251]  eta: 0:03:29  lr: 0.000032  min_lr: 0.000032  loss: 2.6937 (2.5465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9850 (nan)  time: 0.2367  data: 0.0005  max mem: 18117
Epoch: [284]  [ 600/1251]  eta: 0:02:38  lr: 0.000031  min_lr: 0.000031  loss: 2.4072 (2.5422)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0338 (nan)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [284]  [ 800/1251]  eta: 0:01:49  lr: 0.000031  min_lr: 0.000031  loss: 2.5405 (2.5512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (nan)  time: 0.2371  data: 0.0003  max mem: 18117
Epoch: [284]  [1000/1251]  eta: 0:01:00  lr: 0.000030  min_lr: 0.000030  loss: 2.9665 (2.5715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0049 (nan)  time: 0.2360  data: 0.0005  max mem: 18117
Epoch: [284]  [1200/1251]  eta: 0:00:12  lr: 0.000029  min_lr: 0.000029  loss: 2.1444 (2.5573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9942 (nan)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.2333 (2.5536)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (nan)  time: 0.2011  data: 0.0007  max mem: 18117
Epoch: [284] Total time: 0:05:01 (0.2409 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.2333 (2.5756)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5395 (0.5395)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6460  data: 5.5193  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7187 (0.6901)  acc1: 86.0000 (85.4909)  acc5: 97.6000 (97.6727)  time: 0.7336  data: 0.6209  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8936 (0.8420)  acc1: 80.4000 (81.8095)  acc5: 95.6000 (96.0191)  time: 0.2016  data: 0.0896  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9081 (0.8539)  acc1: 78.8000 (81.2640)  acc5: 95.2000 (95.8880)  time: 0.2002  data: 0.0895  max mem: 18117
Test: Total time: 0:00:10 (0.4113 s / it)
* Acc@1 81.750 Acc@5 95.780 loss 0.845
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.75%
Epoch: [285]  [   0/1251]  eta: 1:00:26  lr: 0.000029  min_lr: 0.000029  loss: 3.3757 (3.3757)  weight_decay: 0.0500 (0.0500)  time: 2.8991  data: 2.5131  max mem: 18117
Epoch: [285]  [ 200/1251]  eta: 0:04:26  lr: 0.000029  min_lr: 0.000029  loss: 2.1211 (2.5524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9914 (1.0292)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [285]  [ 400/1251]  eta: 0:03:30  lr: 0.000028  min_lr: 0.000028  loss: 2.6410 (2.5617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0242 (1.0336)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [285]  [ 600/1251]  eta: 0:02:38  lr: 0.000027  min_lr: 0.000027  loss: 2.4378 (2.5635)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0300 (1.0382)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [285]  [ 800/1251]  eta: 0:01:49  lr: 0.000027  min_lr: 0.000027  loss: 3.0195 (2.5704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0306 (1.0366)  time: 0.2389  data: 0.0004  max mem: 18117
Epoch: [285]  [1000/1251]  eta: 0:01:00  lr: 0.000026  min_lr: 0.000026  loss: 2.7761 (2.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9735 (1.0303)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [285]  [1200/1251]  eta: 0:00:12  lr: 0.000026  min_lr: 0.000026  loss: 2.8666 (2.5897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9816 (1.0312)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 1.8822 (2.5839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9739 (1.0308)  time: 0.1953  data: 0.0007  max mem: 18117
Epoch: [285] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 1.8822 (2.5899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9739 (1.0308)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5232 (0.5232)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.6751  data: 5.5487  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.6864 (0.6737)  acc1: 86.0000 (85.3455)  acc5: 97.6000 (97.7091)  time: 0.7404  data: 0.6283  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8809 (0.8249)  acc1: 80.0000 (81.9619)  acc5: 96.0000 (95.9619)  time: 0.2023  data: 0.0924  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.8952 (0.8367)  acc1: 78.8000 (81.3920)  acc5: 95.2000 (95.8880)  time: 0.2008  data: 0.0923  max mem: 18117
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 81.798 Acc@5 95.794 loss 0.829
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.80%
Epoch: [286]  [   0/1251]  eta: 1:05:36  lr: 0.000026  min_lr: 0.000026  loss: 1.9104 (1.9104)  weight_decay: 0.0500 (0.0500)  time: 3.1466  data: 2.8063  max mem: 18117
Epoch: [286]  [ 200/1251]  eta: 0:04:25  lr: 0.000025  min_lr: 0.000025  loss: 2.3169 (2.6108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.0481)  time: 0.2391  data: 0.0005  max mem: 18117
Epoch: [286]  [ 400/1251]  eta: 0:03:28  lr: 0.000025  min_lr: 0.000025  loss: 2.2315 (2.6239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9821 (1.0235)  time: 0.2378  data: 0.0005  max mem: 18117
Epoch: [286]  [ 600/1251]  eta: 0:02:38  lr: 0.000024  min_lr: 0.000024  loss: 2.1647 (2.6216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0538 (1.0330)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [286]  [ 800/1251]  eta: 0:01:48  lr: 0.000023  min_lr: 0.000023  loss: 2.7305 (2.6039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9776 (1.0349)  time: 0.2404  data: 0.0004  max mem: 18117
Epoch: [286]  [1000/1251]  eta: 0:01:00  lr: 0.000023  min_lr: 0.000023  loss: 2.5985 (2.5893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (1.0351)  time: 0.2398  data: 0.0004  max mem: 18117
Epoch: [286]  [1200/1251]  eta: 0:00:12  lr: 0.000022  min_lr: 0.000022  loss: 2.3029 (2.5875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9824 (1.0325)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.0700 (2.5882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9177 (1.0329)  time: 0.1954  data: 0.0007  max mem: 18117
Epoch: [286] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.0700 (2.5767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9177 (1.0329)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5765 (0.5765)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.5801  data: 5.4211  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7525 (0.7289)  acc1: 85.2000 (85.3091)  acc5: 97.6000 (97.5636)  time: 0.7492  data: 0.6328  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9243 (0.8791)  acc1: 78.8000 (81.7143)  acc5: 96.0000 (95.9238)  time: 0.2218  data: 0.1116  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9364 (0.8892)  acc1: 78.4000 (81.1840)  acc5: 95.6000 (95.8720)  time: 0.2198  data: 0.1115  max mem: 18117
Test: Total time: 0:00:10 (0.4214 s / it)
* Acc@1 81.698 Acc@5 95.774 loss 0.880
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [287]  [   0/1251]  eta: 1:10:16  lr: 0.000022  min_lr: 0.000022  loss: 2.1474 (2.1474)  weight_decay: 0.0500 (0.0500)  time: 3.3709  data: 2.3112  max mem: 18117
Epoch: [287]  [ 200/1251]  eta: 0:04:28  lr: 0.000022  min_lr: 0.000022  loss: 3.0225 (2.5556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9368 (1.0206)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [287]  [ 400/1251]  eta: 0:03:30  lr: 0.000021  min_lr: 0.000021  loss: 2.5975 (2.5297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9933 (1.0331)  time: 0.2382  data: 0.0003  max mem: 18117
Epoch: [287]  [ 600/1251]  eta: 0:02:38  lr: 0.000021  min_lr: 0.000021  loss: 2.0143 (2.5524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0201 (1.0335)  time: 0.2374  data: 0.0004  max mem: 18117
Epoch: [287]  [ 800/1251]  eta: 0:01:49  lr: 0.000020  min_lr: 0.000020  loss: 2.7319 (2.5607)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0001 (1.0295)  time: 0.2360  data: 0.0005  max mem: 18117
Epoch: [287]  [1000/1251]  eta: 0:01:00  lr: 0.000020  min_lr: 0.000020  loss: 2.4110 (2.5474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0010 (1.0328)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [287]  [1200/1251]  eta: 0:00:12  lr: 0.000019  min_lr: 0.000019  loss: 2.6375 (2.5468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9946 (1.0398)  time: 0.2422  data: 0.0003  max mem: 18117
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.4903 (2.5562)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0678 (1.0420)  time: 0.1955  data: 0.0006  max mem: 18117
Epoch: [287] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.4903 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0678 (1.0420)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5726 (0.5726)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.6350  data: 5.5096  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7430 (0.7179)  acc1: 86.0000 (85.4546)  acc5: 97.6000 (97.5273)  time: 0.7387  data: 0.6268  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9170 (0.8679)  acc1: 80.0000 (81.8286)  acc5: 96.0000 (95.9429)  time: 0.2021  data: 0.0928  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9246 (0.8789)  acc1: 78.8000 (81.3120)  acc5: 95.6000 (95.8240)  time: 0.2008  data: 0.0927  max mem: 18117
Test: Total time: 0:00:10 (0.4105 s / it)
* Acc@1 81.704 Acc@5 95.804 loss 0.871
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [288]  [   0/1251]  eta: 1:04:38  lr: 0.000019  min_lr: 0.000019  loss: 1.8904 (1.8904)  weight_decay: 0.0500 (0.0500)  time: 3.1003  data: 2.2525  max mem: 18117
Epoch: [288]  [ 200/1251]  eta: 0:04:30  lr: 0.000019  min_lr: 0.000019  loss: 2.2034 (2.5608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9989 (1.0671)  time: 0.2387  data: 0.0004  max mem: 18117
Epoch: [288]  [ 400/1251]  eta: 0:03:31  lr: 0.000018  min_lr: 0.000018  loss: 2.2038 (2.5854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0453 (1.0413)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [288]  [ 600/1251]  eta: 0:02:40  lr: 0.000018  min_lr: 0.000018  loss: 2.9388 (2.5975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9492 (1.0338)  time: 0.2459  data: 0.0004  max mem: 18117
Epoch: [288]  [ 800/1251]  eta: 0:01:50  lr: 0.000017  min_lr: 0.000017  loss: 2.4679 (2.5823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9836 (1.0382)  time: 0.2385  data: 0.0004  max mem: 18117
Epoch: [288]  [1000/1251]  eta: 0:01:01  lr: 0.000017  min_lr: 0.000017  loss: 2.9119 (2.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0058 (1.0393)  time: 0.2392  data: 0.0004  max mem: 18117
Epoch: [288]  [1200/1251]  eta: 0:00:12  lr: 0.000016  min_lr: 0.000016  loss: 2.1007 (2.5865)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0205 (1.0372)  time: 0.2371  data: 0.0004  max mem: 18117
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.6790 (2.5906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0496 (1.0372)  time: 0.1957  data: 0.0006  max mem: 18117
Epoch: [288] Total time: 0:05:03 (0.2428 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.6790 (2.5737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0496 (1.0372)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.5834 (0.5834)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.8828  data: 5.7582  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7515 (0.7306)  acc1: 85.6000 (85.2000)  acc5: 97.6000 (97.4909)  time: 0.7153  data: 0.6025  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9349 (0.8820)  acc1: 80.0000 (81.6191)  acc5: 96.0000 (96.0191)  time: 0.1801  data: 0.0702  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9548 (0.8934)  acc1: 78.8000 (81.1040)  acc5: 95.6000 (95.8880)  time: 0.2058  data: 0.0976  max mem: 18117
Test: Total time: 0:00:10 (0.4220 s / it)
* Acc@1 81.680 Acc@5 95.776 loss 0.885
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [289]  [   0/1251]  eta: 1:07:24  lr: 0.000016  min_lr: 0.000016  loss: 3.0617 (3.0617)  weight_decay: 0.0500 (0.0500)  time: 3.2329  data: 2.4476  max mem: 18117
Epoch: [289]  [ 200/1251]  eta: 0:04:27  lr: 0.000016  min_lr: 0.000016  loss: 2.8299 (2.6088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0360 (1.0604)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [289]  [ 400/1251]  eta: 0:03:29  lr: 0.000015  min_lr: 0.000015  loss: 3.0465 (2.6437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0069 (1.0373)  time: 0.2424  data: 0.0003  max mem: 18117
Epoch: [289]  [ 600/1251]  eta: 0:02:38  lr: 0.000015  min_lr: 0.000015  loss: 1.9962 (2.6173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0289 (1.0340)  time: 0.2407  data: 0.0004  max mem: 18117
Epoch: [289]  [ 800/1251]  eta: 0:01:49  lr: 0.000014  min_lr: 0.000014  loss: 1.8997 (2.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0858 (1.0353)  time: 0.2383  data: 0.0003  max mem: 18117
Epoch: [289]  [1000/1251]  eta: 0:01:00  lr: 0.000014  min_lr: 0.000014  loss: 2.2847 (2.5748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0241 (1.0339)  time: 0.2388  data: 0.0004  max mem: 18117
Epoch: [289]  [1200/1251]  eta: 0:00:12  lr: 0.000014  min_lr: 0.000014  loss: 2.5808 (2.5688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9645 (1.0300)  time: 0.2428  data: 0.0003  max mem: 18117
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.4501 (2.5720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9347 (1.0272)  time: 0.1962  data: 0.0010  max mem: 18117
Epoch: [289] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.4501 (2.5894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9347 (1.0272)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5720 (0.5720)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.7483  data: 5.6112  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7410 (0.7168)  acc1: 86.0000 (85.0909)  acc5: 97.6000 (97.5636)  time: 0.7333  data: 0.6204  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9152 (0.8693)  acc1: 79.6000 (81.6952)  acc5: 95.6000 (95.9619)  time: 0.1940  data: 0.0846  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9364 (0.8801)  acc1: 78.8000 (81.1680)  acc5: 95.6000 (95.8560)  time: 0.1938  data: 0.0846  max mem: 18117
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 81.714 Acc@5 95.770 loss 0.870
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [290]  [   0/1251]  eta: 1:07:46  lr: 0.000014  min_lr: 0.000014  loss: 2.3653 (2.3653)  weight_decay: 0.0500 (0.0500)  time: 3.2510  data: 2.7225  max mem: 18117
Epoch: [290]  [ 200/1251]  eta: 0:04:27  lr: 0.000013  min_lr: 0.000013  loss: 2.1202 (2.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0062 (0.9979)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [290]  [ 400/1251]  eta: 0:03:30  lr: 0.000013  min_lr: 0.000013  loss: 2.7987 (2.5657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9360 (1.0073)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [290]  [ 600/1251]  eta: 0:02:38  lr: 0.000012  min_lr: 0.000012  loss: 2.5477 (2.5716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9809 (1.0164)  time: 0.2402  data: 0.0005  max mem: 18117
Epoch: [290]  [ 800/1251]  eta: 0:01:49  lr: 0.000012  min_lr: 0.000012  loss: 2.3610 (2.5698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9929 (1.0110)  time: 0.2386  data: 0.0003  max mem: 18117
Epoch: [290]  [1000/1251]  eta: 0:01:00  lr: 0.000012  min_lr: 0.000012  loss: 2.3508 (2.5723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0635 (1.0135)  time: 0.2402  data: 0.0003  max mem: 18117
Epoch: [290]  [1200/1251]  eta: 0:00:12  lr: 0.000011  min_lr: 0.000011  loss: 2.5530 (2.5772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9812 (1.0170)  time: 0.2401  data: 0.0005  max mem: 18117
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.4407 (2.5750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9669 (1.0164)  time: 0.1954  data: 0.0008  max mem: 18117
Epoch: [290] Total time: 0:05:02 (0.2416 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.4407 (2.5682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9669 (1.0164)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5610 (0.5610)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.3900  data: 5.2612  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7396 (0.7128)  acc1: 86.0000 (85.0909)  acc5: 98.0000 (97.6000)  time: 0.7273  data: 0.6103  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9126 (0.8654)  acc1: 80.0000 (81.6571)  acc5: 96.0000 (96.0191)  time: 0.2125  data: 0.0985  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9257 (0.8762)  acc1: 78.4000 (81.1360)  acc5: 95.6000 (95.9200)  time: 0.2001  data: 0.0881  max mem: 18117
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 81.684 Acc@5 95.778 loss 0.867
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [291]  [   0/1251]  eta: 1:12:04  lr: 0.000011  min_lr: 0.000011  loss: 1.7247 (1.7247)  weight_decay: 0.0500 (0.0500)  time: 3.4567  data: 2.4498  max mem: 18117
Epoch: [291]  [ 200/1251]  eta: 0:04:27  lr: 0.000011  min_lr: 0.000011  loss: 2.4550 (2.5406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9095 (0.9855)  time: 0.2402  data: 0.0004  max mem: 18117
Epoch: [291]  [ 400/1251]  eta: 0:03:30  lr: 0.000010  min_lr: 0.000010  loss: 2.0739 (2.5461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9573 (0.9844)  time: 0.2370  data: 0.0004  max mem: 18117
Epoch: [291]  [ 600/1251]  eta: 0:02:39  lr: 0.000010  min_lr: 0.000010  loss: 2.4571 (2.5506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (0.9999)  time: 0.2403  data: 0.0004  max mem: 18117
Epoch: [291]  [ 800/1251]  eta: 0:01:49  lr: 0.000010  min_lr: 0.000010  loss: 2.6188 (2.5721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9504 (0.9983)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [291]  [1000/1251]  eta: 0:01:00  lr: 0.000009  min_lr: 0.000009  loss: 2.3914 (2.5869)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0202 (1.0076)  time: 0.2380  data: 0.0004  max mem: 18117
Epoch: [291]  [1200/1251]  eta: 0:00:12  lr: 0.000009  min_lr: 0.000009  loss: 2.0468 (2.5968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9955 (1.0152)  time: 0.2450  data: 0.0004  max mem: 18117
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 1.9850 (2.5985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9955 (1.0164)  time: 0.2022  data: 0.0007  max mem: 18117
Epoch: [291] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 1.9850 (2.5874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9955 (1.0164)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5518 (0.5518)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.5645  data: 5.4351  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7187 (0.6988)  acc1: 86.0000 (85.4546)  acc5: 97.6000 (97.7455)  time: 0.7043  data: 0.5886  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8995 (0.8494)  acc1: 80.0000 (81.8286)  acc5: 96.0000 (96.0000)  time: 0.1939  data: 0.0797  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9130 (0.8610)  acc1: 78.4000 (81.3120)  acc5: 95.2000 (95.8880)  time: 0.2031  data: 0.0914  max mem: 18117
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 81.788 Acc@5 95.754 loss 0.852
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.80%
Epoch: [292]  [   0/1251]  eta: 1:06:36  lr: 0.000009  min_lr: 0.000009  loss: 1.7494 (1.7494)  weight_decay: 0.0500 (0.0500)  time: 3.1946  data: 2.2585  max mem: 18117
Epoch: [292]  [ 200/1251]  eta: 0:04:28  lr: 0.000009  min_lr: 0.000009  loss: 2.3317 (2.5543)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0443 (1.0504)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [292]  [ 400/1251]  eta: 0:03:30  lr: 0.000008  min_lr: 0.000008  loss: 2.5304 (2.6021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9571 (1.0341)  time: 0.2421  data: 0.0004  max mem: 18117
Epoch: [292]  [ 600/1251]  eta: 0:02:39  lr: 0.000008  min_lr: 0.000008  loss: 2.0008 (2.5961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0287 (1.0317)  time: 0.2378  data: 0.0004  max mem: 18117
Epoch: [292]  [ 800/1251]  eta: 0:01:49  lr: 0.000008  min_lr: 0.000008  loss: 2.5082 (2.6114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9890 (1.0347)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [292]  [1000/1251]  eta: 0:01:00  lr: 0.000008  min_lr: 0.000008  loss: 2.3648 (2.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9829 (1.0305)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [292]  [1200/1251]  eta: 0:00:12  lr: 0.000007  min_lr: 0.000007  loss: 2.0672 (2.5873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9414 (1.0237)  time: 0.2384  data: 0.0005  max mem: 18117
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.8284 (2.5910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0360 (1.0246)  time: 0.1956  data: 0.0008  max mem: 18117
Epoch: [292] Total time: 0:05:02 (0.2415 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.8284 (2.5811)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0360 (1.0246)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6082 (0.6082)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5065  data: 5.3783  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7780 (0.7541)  acc1: 86.0000 (85.3091)  acc5: 97.6000 (97.6000)  time: 0.7404  data: 0.6279  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9469 (0.9067)  acc1: 79.6000 (81.7143)  acc5: 95.6000 (95.9619)  time: 0.2033  data: 0.0933  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9691 (0.9167)  acc1: 78.8000 (81.2320)  acc5: 95.2000 (95.8880)  time: 0.2032  data: 0.0945  max mem: 18117
Test: Total time: 0:00:10 (0.4050 s / it)
* Acc@1 81.724 Acc@5 95.764 loss 0.908
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [293]  [   0/1251]  eta: 1:09:52  lr: 0.000007  min_lr: 0.000007  loss: 1.6006 (1.6006)  weight_decay: 0.0500 (0.0500)  time: 3.3510  data: 2.7016  max mem: 18117
Epoch: [293]  [ 200/1251]  eta: 0:04:27  lr: 0.000007  min_lr: 0.000007  loss: 2.0883 (2.4782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0040 (1.0327)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [293]  [ 400/1251]  eta: 0:03:29  lr: 0.000007  min_lr: 0.000007  loss: 1.9374 (2.5015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9460 (1.0168)  time: 0.2377  data: 0.0003  max mem: 18117
Epoch: [293]  [ 600/1251]  eta: 0:02:38  lr: 0.000006  min_lr: 0.000006  loss: 3.1405 (2.5233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9955 (1.0149)  time: 0.2392  data: 0.0005  max mem: 18117
Epoch: [293]  [ 800/1251]  eta: 0:01:49  lr: 0.000006  min_lr: 0.000006  loss: 2.8766 (2.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9798 (1.0112)  time: 0.2391  data: 0.0004  max mem: 18117
Epoch: [293]  [1000/1251]  eta: 0:01:00  lr: 0.000006  min_lr: 0.000006  loss: 2.5635 (2.5546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0529 (1.0170)  time: 0.2366  data: 0.0004  max mem: 18117
Epoch: [293]  [1200/1251]  eta: 0:00:12  lr: 0.000006  min_lr: 0.000006  loss: 2.1455 (2.5495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (1.0167)  time: 0.2345  data: 0.0004  max mem: 18117
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.0511 (2.5431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0167)  time: 0.1960  data: 0.0007  max mem: 18117
Epoch: [293] Total time: 0:05:01 (0.2413 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.0511 (2.5661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0167)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5571 (0.5571)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7122  data: 5.5630  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7339 (0.7090)  acc1: 86.0000 (85.4182)  acc5: 97.6000 (97.4909)  time: 0.7819  data: 0.6669  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9192 (0.8615)  acc1: 80.0000 (81.7524)  acc5: 96.0000 (95.9048)  time: 0.2379  data: 0.1280  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9309 (0.8731)  acc1: 78.4000 (81.2160)  acc5: 95.2000 (95.7600)  time: 0.2373  data: 0.1280  max mem: 18117
Test: Total time: 0:00:11 (0.4404 s / it)
* Acc@1 81.748 Acc@5 95.718 loss 0.864
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [294]  [   0/1251]  eta: 1:11:54  lr: 0.000006  min_lr: 0.000006  loss: 3.2581 (3.2581)  weight_decay: 0.0500 (0.0500)  time: 3.4491  data: 2.6297  max mem: 18117
Epoch: [294]  [ 200/1251]  eta: 0:04:26  lr: 0.000005  min_lr: 0.000005  loss: 2.3746 (2.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9756 (1.0116)  time: 0.2380  data: 0.0005  max mem: 18117
Epoch: [294]  [ 400/1251]  eta: 0:03:29  lr: 0.000005  min_lr: 0.000005  loss: 2.7268 (2.5641)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0448 (1.0180)  time: 0.2417  data: 0.0005  max mem: 18117
Epoch: [294]  [ 600/1251]  eta: 0:02:38  lr: 0.000005  min_lr: 0.000005  loss: 2.2018 (2.5623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9918 (1.0192)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [294]  [ 800/1251]  eta: 0:01:49  lr: 0.000005  min_lr: 0.000005  loss: 2.9629 (2.5751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9960 (1.0257)  time: 0.2361  data: 0.0004  max mem: 18117
Epoch: [294]  [1000/1251]  eta: 0:01:00  lr: 0.000004  min_lr: 0.000004  loss: 2.6815 (2.5700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0333 (1.0250)  time: 0.2372  data: 0.0004  max mem: 18117
Epoch: [294]  [1200/1251]  eta: 0:00:12  lr: 0.000004  min_lr: 0.000004  loss: 2.1378 (2.5655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9515 (nan)  time: 0.2358  data: 0.0004  max mem: 18117
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 1.9625 (2.5662)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (nan)  time: 0.1954  data: 0.0008  max mem: 18117
Epoch: [294] Total time: 0:05:02 (0.2417 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 1.9625 (2.5700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5391 (0.5391)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6629  data: 5.5343  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7060 (0.6872)  acc1: 86.0000 (85.3818)  acc5: 97.2000 (97.5636)  time: 0.6834  data: 0.5720  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8931 (0.8361)  acc1: 80.0000 (81.8476)  acc5: 96.0000 (95.8857)  time: 0.1750  data: 0.0662  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.8989 (0.8482)  acc1: 79.2000 (81.2640)  acc5: 95.2000 (95.7760)  time: 0.1965  data: 0.0884  max mem: 18117
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 81.822 Acc@5 95.766 loss 0.840
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.82%
Epoch: [295]  [   0/1251]  eta: 1:03:35  lr: 0.000004  min_lr: 0.000004  loss: 1.9070 (1.9070)  weight_decay: 0.0500 (0.0500)  time: 3.0499  data: 2.7096  max mem: 18117
Epoch: [295]  [ 200/1251]  eta: 0:04:28  lr: 0.000004  min_lr: 0.000004  loss: 2.1445 (2.6030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9576 (1.0057)  time: 0.2418  data: 0.0004  max mem: 18117
Epoch: [295]  [ 400/1251]  eta: 0:03:31  lr: 0.000004  min_lr: 0.000004  loss: 2.3477 (2.5787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0256 (1.0131)  time: 0.2379  data: 0.0005  max mem: 18117
Epoch: [295]  [ 600/1251]  eta: 0:02:39  lr: 0.000004  min_lr: 0.000004  loss: 3.0339 (2.5784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9948 (1.0197)  time: 0.2401  data: 0.0004  max mem: 18117
Epoch: [295]  [ 800/1251]  eta: 0:01:50  lr: 0.000003  min_lr: 0.000003  loss: 2.1964 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0052 (1.0292)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [295]  [1000/1251]  eta: 0:01:00  lr: 0.000003  min_lr: 0.000003  loss: 2.6681 (2.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0259 (1.0279)  time: 0.2377  data: 0.0005  max mem: 18117
Epoch: [295]  [1200/1251]  eta: 0:00:12  lr: 0.000003  min_lr: 0.000003  loss: 1.9502 (2.5669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9692 (1.0274)  time: 0.2379  data: 0.0004  max mem: 18117
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 1.9026 (2.5633)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0600 (1.0298)  time: 0.1957  data: 0.0008  max mem: 18117
Epoch: [295] Total time: 0:05:02 (0.2421 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 1.9026 (2.5616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0600 (1.0298)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.5359 (0.5359)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.9868  data: 5.8581  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7087 (0.6888)  acc1: 86.0000 (85.3091)  acc5: 97.6000 (97.5273)  time: 0.7324  data: 0.6207  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8965 (0.8376)  acc1: 80.0000 (81.6571)  acc5: 95.6000 (95.9238)  time: 0.1927  data: 0.0838  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.8965 (0.8488)  acc1: 78.8000 (81.1520)  acc5: 95.6000 (95.8080)  time: 0.1916  data: 0.0837  max mem: 18117
Test: Total time: 0:00:10 (0.4146 s / it)
* Acc@1 81.776 Acc@5 95.762 loss 0.840
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.82%
Epoch: [296]  [   0/1251]  eta: 1:08:30  lr: 0.000003  min_lr: 0.000003  loss: 3.4482 (3.4482)  weight_decay: 0.0500 (0.0500)  time: 3.2860  data: 1.7592  max mem: 18117
Epoch: [296]  [ 200/1251]  eta: 0:04:28  lr: 0.000003  min_lr: 0.000003  loss: 2.9590 (2.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2367  data: 0.0004  max mem: 18117
Epoch: [296]  [ 400/1251]  eta: 0:03:30  lr: 0.000003  min_lr: 0.000003  loss: 3.2733 (2.6570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0379 (nan)  time: 0.2386  data: 0.0003  max mem: 18117
Epoch: [296]  [ 600/1251]  eta: 0:02:39  lr: 0.000003  min_lr: 0.000003  loss: 2.4614 (2.6414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (nan)  time: 0.2409  data: 0.0004  max mem: 18117
Epoch: [296]  [ 800/1251]  eta: 0:01:49  lr: 0.000002  min_lr: 0.000002  loss: 2.5987 (2.6488)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0207 (nan)  time: 0.2405  data: 0.0004  max mem: 18117
Epoch: [296]  [1000/1251]  eta: 0:01:00  lr: 0.000002  min_lr: 0.000002  loss: 2.0341 (2.6255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0585 (nan)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [296]  [1200/1251]  eta: 0:00:12  lr: 0.000002  min_lr: 0.000002  loss: 2.4928 (2.6165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9726 (nan)  time: 0.2351  data: 0.0004  max mem: 18117
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.4050 (2.6133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9731 (nan)  time: 0.1956  data: 0.0005  max mem: 18117
Epoch: [296] Total time: 0:05:02 (0.2414 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.4050 (2.5806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9731 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5379 (0.5379)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.7423  data: 5.6133  max mem: 18117
Test:  [10/25]  eta: 0:00:11  loss: 0.7131 (0.6939)  acc1: 85.6000 (85.2727)  acc5: 97.6000 (97.6000)  time: 0.7548  data: 0.6422  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.8998 (0.8462)  acc1: 80.0000 (81.6571)  acc5: 95.6000 (96.0000)  time: 0.2017  data: 0.0909  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9050 (0.8577)  acc1: 78.8000 (81.1200)  acc5: 95.6000 (95.9040)  time: 0.2007  data: 0.0908  max mem: 18117
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 81.728 Acc@5 95.760 loss 0.849
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.82%
Epoch: [297]  [   0/1251]  eta: 1:07:04  lr: 0.000002  min_lr: 0.000002  loss: 3.4633 (3.4633)  weight_decay: 0.0500 (0.0500)  time: 3.2174  data: 2.3139  max mem: 18117
Epoch: [297]  [ 200/1251]  eta: 0:04:25  lr: 0.000002  min_lr: 0.000002  loss: 2.9313 (2.6053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9791 (1.0203)  time: 0.2360  data: 0.0004  max mem: 18117
Epoch: [297]  [ 400/1251]  eta: 0:03:29  lr: 0.000002  min_lr: 0.000002  loss: 2.7681 (2.6246)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0414 (1.0146)  time: 0.2375  data: 0.0004  max mem: 18117
Epoch: [297]  [ 600/1251]  eta: 0:02:38  lr: 0.000002  min_lr: 0.000002  loss: 1.8480 (2.5968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0453 (1.0182)  time: 0.2383  data: 0.0004  max mem: 18117
Epoch: [297]  [ 800/1251]  eta: 0:01:49  lr: 0.000002  min_lr: 0.000002  loss: 2.1274 (2.6263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (1.0209)  time: 0.2396  data: 0.0004  max mem: 18117
Epoch: [297]  [1000/1251]  eta: 0:01:00  lr: 0.000002  min_lr: 0.000002  loss: 2.0533 (2.5986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9789 (1.0180)  time: 0.2368  data: 0.0007  max mem: 18117
Epoch: [297]  [1200/1251]  eta: 0:00:12  lr: 0.000002  min_lr: 0.000002  loss: 2.2910 (2.6066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9976 (1.0185)  time: 0.2373  data: 0.0004  max mem: 18117
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.6340 (2.6006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0012 (1.0177)  time: 0.1952  data: 0.0006  max mem: 18117
Epoch: [297] Total time: 0:05:01 (0.2406 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.6340 (2.5920)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0012 (1.0177)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5936 (0.5936)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7566  data: 5.6347  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7663 (0.7426)  acc1: 86.0000 (85.2000)  acc5: 97.6000 (97.5273)  time: 0.6847  data: 0.5736  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9359 (0.8942)  acc1: 79.2000 (81.6000)  acc5: 96.0000 (95.9429)  time: 0.1728  data: 0.0635  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9584 (0.9045)  acc1: 78.4000 (81.0720)  acc5: 95.2000 (95.8240)  time: 0.1923  data: 0.0830  max mem: 18117
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 81.740 Acc@5 95.754 loss 0.896
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.82%
Epoch: [298]  [   0/1251]  eta: 1:09:19  lr: 0.000002  min_lr: 0.000002  loss: 4.0023 (4.0023)  weight_decay: 0.0500 (0.0500)  time: 3.3253  data: 1.5641  max mem: 18117
Epoch: [298]  [ 200/1251]  eta: 0:04:29  lr: 0.000001  min_lr: 0.000001  loss: 2.1900 (2.5699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9631 (0.9849)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [298]  [ 400/1251]  eta: 0:03:30  lr: 0.000001  min_lr: 0.000001  loss: 1.9237 (2.6169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9495 (0.9930)  time: 0.2377  data: 0.0004  max mem: 18117
Epoch: [298]  [ 600/1251]  eta: 0:02:39  lr: 0.000001  min_lr: 0.000001  loss: 1.9778 (2.5911)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0092 (1.0022)  time: 0.2386  data: 0.0004  max mem: 18117
Epoch: [298]  [ 800/1251]  eta: 0:01:50  lr: 0.000001  min_lr: 0.000001  loss: 1.9075 (2.5897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9823 (1.0192)  time: 0.2485  data: 0.0004  max mem: 18117
Epoch: [298]  [1000/1251]  eta: 0:01:01  lr: 0.000001  min_lr: 0.000001  loss: 1.9201 (2.5839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0652 (1.0195)  time: 0.2399  data: 0.0004  max mem: 18117
Epoch: [298]  [1200/1251]  eta: 0:00:12  lr: 0.000001  min_lr: 0.000001  loss: 2.6095 (2.5886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9473 (1.0135)  time: 0.2393  data: 0.0004  max mem: 18117
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.3048 (2.5884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9618 (1.0125)  time: 0.1958  data: 0.0006  max mem: 18117
Epoch: [298] Total time: 0:05:04 (0.2431 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.3048 (2.5728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9618 (1.0125)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5854 (0.5854)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.4706  data: 5.3322  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7591 (0.7356)  acc1: 86.0000 (85.4545)  acc5: 97.6000 (97.4909)  time: 0.7021  data: 0.5875  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9316 (0.8844)  acc1: 79.6000 (81.7905)  acc5: 96.0000 (96.0000)  time: 0.1939  data: 0.0835  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9435 (0.8954)  acc1: 78.8000 (81.2480)  acc5: 95.6000 (95.9040)  time: 0.2044  data: 0.0942  max mem: 18117
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 81.676 Acc@5 95.794 loss 0.887
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.82%
Epoch: [299]  [   0/1251]  eta: 1:08:07  lr: 0.000001  min_lr: 0.000001  loss: 2.7552 (2.7552)  weight_decay: 0.0500 (0.0500)  time: 3.2675  data: 1.7745  max mem: 18117
Epoch: [299]  [ 200/1251]  eta: 0:04:28  lr: 0.000001  min_lr: 0.000001  loss: 3.0326 (2.5650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0515 (1.0034)  time: 0.2440  data: 0.0004  max mem: 18117
Epoch: [299]  [ 400/1251]  eta: 0:03:30  lr: 0.000001  min_lr: 0.000001  loss: 2.2894 (2.5400)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (1.0023)  time: 0.2359  data: 0.0004  max mem: 18117
Epoch: [299]  [ 600/1251]  eta: 0:02:39  lr: 0.000001  min_lr: 0.000001  loss: 3.1315 (2.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9692 (1.0016)  time: 0.2414  data: 0.0004  max mem: 18117
Epoch: [299]  [ 800/1251]  eta: 0:01:49  lr: 0.000001  min_lr: 0.000001  loss: 2.8300 (2.5614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9521 (1.0069)  time: 0.2381  data: 0.0004  max mem: 18117
Epoch: [299]  [1000/1251]  eta: 0:01:00  lr: 0.000001  min_lr: 0.000001  loss: 2.8981 (2.5764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0127 (1.0120)  time: 0.2357  data: 0.0004  max mem: 18117
Epoch: [299]  [1200/1251]  eta: 0:00:12  lr: 0.000001  min_lr: 0.000001  loss: 1.9463 (2.5710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9848 (1.0151)  time: 0.2390  data: 0.0004  max mem: 18117
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.0931 (2.5706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9777 (1.0146)  time: 0.1954  data: 0.0006  max mem: 18117
Epoch: [299] Total time: 0:05:01 (0.2412 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.0931 (2.5751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9777 (1.0146)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5559 (0.5559)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.8107  data: 5.6838  max mem: 18117
Test:  [10/25]  eta: 0:00:10  loss: 0.7277 (0.7065)  acc1: 86.0000 (85.3455)  acc5: 97.6000 (97.5636)  time: 0.7223  data: 0.6063  max mem: 18117
Test:  [20/25]  eta: 0:00:02  loss: 0.9082 (0.8580)  acc1: 79.6000 (81.8095)  acc5: 95.6000 (95.8857)  time: 0.1870  data: 0.0755  max mem: 18117
Test:  [24/25]  eta: 0:00:00  loss: 0.9239 (0.8694)  acc1: 79.2000 (81.3280)  acc5: 95.2000 (95.8400)  time: 0.1982  data: 0.0880  max mem: 18117
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 81.754 Acc@5 95.790 loss 0.861
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.82%
Training time 1 day, 2:06:00
