| distributed init (rank 0): env://, gpu 0
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 2): env://, gpu 2
Namespace(batch_size=128, epochs=300, update_freq=4, model='pico', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.1, cutmix=0.2, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_pico_0.8G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f9a85d730d0>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 12, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(12, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(12, 12, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=12, bias=False)
          (norm): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(12, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(24, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=24, bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(24, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(24, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(48, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(48, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): Identity()
    )
  )
  (head): ConvX(
    (conv): Conv2d(192, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 9729890
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 4:03:56  lr: 0.000000  min_lr: 0.000000  loss: 6.9789 (6.9789)  weight_decay: 0.0500 (0.0500)  time: 11.6998  data: 2.9364  max mem: 12911
Epoch: [0]  [ 200/1251]  eta: 0:04:16  lr: 0.000032  min_lr: 0.000032  loss: 6.9186 (6.9446)  weight_decay: 0.0500 (0.0500)  grad_norm: 16.9981 (nan)  time: 0.1839  data: 0.0005  max mem: 12911
Epoch: [0]  [ 400/1251]  eta: 0:03:02  lr: 0.000064  min_lr: 0.000064  loss: 6.7409 (6.8889)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.0355 (nan)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [0]  [ 600/1251]  eta: 0:02:13  lr: 0.000096  min_lr: 0.000096  loss: 6.5364 (6.8147)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.8226 (nan)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [0]  [ 800/1251]  eta: 0:01:30  lr: 0.000128  min_lr: 0.000128  loss: 6.4512 (6.7457)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.2375 (nan)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [0]  [1000/1251]  eta: 0:00:49  lr: 0.000160  min_lr: 0.000160  loss: 6.2901 (6.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.0673 (nan)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [0]  [1200/1251]  eta: 0:00:10  lr: 0.000192  min_lr: 0.000192  loss: 6.0886 (6.6098)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.5240 (nan)  time: 0.1868  data: 0.0006  max mem: 12911
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.2105 (6.5960)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.2622 (nan)  time: 0.1477  data: 0.0007  max mem: 12911
Epoch: [0] Total time: 0:04:05 (0.1962 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.2105 (6.5953)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.2622 (nan)
Test:  [ 0/25]  eta: 0:04:32  loss: 5.1601 (5.1601)  acc1: 6.8000 (6.8000)  acc5: 22.8000 (22.8000)  time: 10.9082  data: 7.5193  max mem: 12911
Test:  [10/25]  eta: 0:00:15  loss: 5.2932 (5.2725)  acc1: 6.8000 (6.4364)  acc5: 20.0000 (18.9091)  time: 1.0641  data: 0.6838  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 5.3320 (5.3286)  acc1: 6.0000 (6.3810)  acc5: 19.2000 (18.8190)  time: 0.0797  data: 0.0002  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 5.3320 (5.2876)  acc1: 6.8000 (7.0080)  acc5: 19.2000 (19.9520)  time: 0.0991  data: 0.0197  max mem: 12911
Test: Total time: 0:00:13 (0.5318 s / it)
* Acc@1 6.936 Acc@5 19.866 loss 5.297
Accuracy of the model on the 50000 test images: 6.9%
Max accuracy: 6.94%
Epoch: [1]  [   0/1251]  eta: 0:58:45  lr: 0.000200  min_lr: 0.000200  loss: 5.8744 (5.8744)  weight_decay: 0.0500 (0.0500)  time: 2.8184  data: 2.4013  max mem: 12911
Epoch: [1]  [ 200/1251]  eta: 0:03:30  lr: 0.000232  min_lr: 0.000232  loss: 6.3764 (6.1577)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.8110 (7.7102)  time: 0.1914  data: 0.0005  max mem: 12911
Epoch: [1]  [ 400/1251]  eta: 0:02:44  lr: 0.000264  min_lr: 0.000264  loss: 6.0515 (6.0898)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.0877 (7.2770)  time: 0.1845  data: 0.0003  max mem: 12911
Epoch: [1]  [ 600/1251]  eta: 0:02:04  lr: 0.000296  min_lr: 0.000296  loss: 5.6340 (6.0324)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.3689 (7.1791)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [1]  [ 800/1251]  eta: 0:01:25  lr: 0.000328  min_lr: 0.000328  loss: 6.0843 (6.0073)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.6497 (7.4151)  time: 0.1842  data: 0.0004  max mem: 12911
Epoch: [1]  [1000/1251]  eta: 0:00:47  lr: 0.000360  min_lr: 0.000360  loss: 5.5889 (5.9528)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.5569 (7.5488)  time: 0.1915  data: 0.0005  max mem: 12911
Epoch: [1]  [1200/1251]  eta: 0:00:09  lr: 0.000392  min_lr: 0.000392  loss: 5.5295 (5.8966)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.1587 (7.7815)  time: 0.1873  data: 0.0006  max mem: 12911
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 5.8562 (5.8908)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.7946 (7.8515)  time: 0.1473  data: 0.0010  max mem: 12911
Epoch: [1] Total time: 0:03:56 (0.1890 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 5.8562 (5.8702)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.7946 (7.8515)
Test:  [ 0/25]  eta: 0:02:20  loss: 3.9484 (3.9484)  acc1: 20.4000 (20.4000)  acc5: 48.4000 (48.4000)  time: 5.6301  data: 5.4911  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 3.7269 (3.8942)  acc1: 20.4000 (20.8364)  acc5: 48.4000 (45.5636)  time: 0.7221  data: 0.6227  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 4.3387 (4.1359)  acc1: 18.4000 (19.2571)  acc5: 38.0000 (41.6000)  time: 0.2046  data: 0.1171  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 4.3387 (4.0987)  acc1: 18.8000 (19.8880)  acc5: 38.0000 (42.4480)  time: 0.2005  data: 0.1170  max mem: 12911
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 19.832 Acc@5 42.194 loss 4.093
Accuracy of the model on the 50000 test images: 19.8%
Max accuracy: 19.83%
Epoch: [2]  [   0/1251]  eta: 1:02:36  lr: 0.000400  min_lr: 0.000400  loss: 5.0755 (5.0755)  weight_decay: 0.0500 (0.0500)  time: 3.0025  data: 2.7454  max mem: 12911
Epoch: [2]  [ 200/1251]  eta: 0:03:31  lr: 0.000432  min_lr: 0.000432  loss: 5.5843 (5.5375)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.9549 (10.6981)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [2]  [ 400/1251]  eta: 0:02:45  lr: 0.000464  min_lr: 0.000464  loss: 5.2832 (5.4924)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.1015 (10.4492)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [2]  [ 600/1251]  eta: 0:02:04  lr: 0.000496  min_lr: 0.000496  loss: 5.3836 (5.4812)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.1122 (10.4030)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [2]  [ 800/1251]  eta: 0:01:25  lr: 0.000528  min_lr: 0.000528  loss: 5.4346 (5.4377)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.1644 (10.3630)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [2]  [1000/1251]  eta: 0:00:47  lr: 0.000560  min_lr: 0.000560  loss: 5.0816 (5.4166)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.3154 (10.2478)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [2]  [1200/1251]  eta: 0:00:09  lr: 0.000592  min_lr: 0.000592  loss: 4.9245 (5.3806)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.2417 (10.2981)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 4.8363 (5.3728)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.6206 (10.2573)  time: 0.1455  data: 0.0011  max mem: 12911
Epoch: [2] Total time: 0:03:57 (0.1895 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 4.8363 (5.3775)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.6206 (10.2573)
Test:  [ 0/25]  eta: 0:02:17  loss: 3.0419 (3.0419)  acc1: 37.2000 (37.2000)  acc5: 61.6000 (61.6000)  time: 5.4992  data: 5.4077  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 3.2004 (3.2332)  acc1: 31.6000 (32.2182)  acc5: 59.2000 (59.3091)  time: 0.7117  data: 0.6146  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 3.6746 (3.4975)  acc1: 26.4000 (29.1810)  acc5: 50.0000 (53.5429)  time: 0.2100  data: 0.1208  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 3.6957 (3.4734)  acc1: 26.8000 (29.6800)  acc5: 49.6000 (53.9680)  time: 0.2062  data: 0.1207  max mem: 12911
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 29.564 Acc@5 55.198 loss 3.457
Accuracy of the model on the 50000 test images: 29.6%
Max accuracy: 29.56%
Epoch: [3]  [   0/1251]  eta: 1:10:04  lr: 0.000600  min_lr: 0.000600  loss: 5.3087 (5.3087)  weight_decay: 0.0500 (0.0500)  time: 3.3613  data: 3.1282  max mem: 12911
Epoch: [3]  [ 200/1251]  eta: 0:03:32  lr: 0.000632  min_lr: 0.000632  loss: 5.0154 (5.1345)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.9346 (9.8935)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [3]  [ 400/1251]  eta: 0:02:46  lr: 0.000664  min_lr: 0.000664  loss: 4.6926 (5.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.0129 (10.1925)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [3]  [ 600/1251]  eta: 0:02:05  lr: 0.000696  min_lr: 0.000696  loss: 4.7209 (5.0680)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.5059 (10.5987)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [3]  [ 800/1251]  eta: 0:01:26  lr: 0.000728  min_lr: 0.000728  loss: 4.6829 (5.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.3185 (10.4074)  time: 0.1869  data: 0.0006  max mem: 12911
Epoch: [3]  [1000/1251]  eta: 0:00:47  lr: 0.000760  min_lr: 0.000760  loss: 4.5438 (5.0473)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.1416 (10.2057)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [3]  [1200/1251]  eta: 0:00:09  lr: 0.000792  min_lr: 0.000792  loss: 4.5302 (5.0320)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.1003 (10.2176)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.4525 (5.0341)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.1003 (10.1715)  time: 0.1464  data: 0.0011  max mem: 12911
Epoch: [3] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.4525 (5.0661)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.1003 (10.1715)
Test:  [ 0/25]  eta: 0:02:17  loss: 2.6098 (2.6098)  acc1: 48.4000 (48.4000)  acc5: 71.6000 (71.6000)  time: 5.5178  data: 5.4193  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 2.6098 (2.7027)  acc1: 44.0000 (41.7818)  acc5: 71.2000 (69.3455)  time: 0.7214  data: 0.6267  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 3.1108 (3.0178)  acc1: 32.0000 (37.4286)  acc5: 59.6000 (63.4095)  time: 0.2025  data: 0.1142  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 3.2172 (3.0067)  acc1: 32.0000 (37.6480)  acc5: 58.0000 (63.3600)  time: 0.2001  data: 0.1141  max mem: 12911
Test: Total time: 0:00:09 (0.3993 s / it)
* Acc@1 37.070 Acc@5 63.276 loss 3.004
Accuracy of the model on the 50000 test images: 37.1%
Max accuracy: 37.07%
Epoch: [4]  [   0/1251]  eta: 0:58:34  lr: 0.000800  min_lr: 0.000800  loss: 4.1216 (4.1216)  weight_decay: 0.0500 (0.0500)  time: 2.8097  data: 1.7120  max mem: 12911
Epoch: [4]  [ 200/1251]  eta: 0:03:35  lr: 0.000832  min_lr: 0.000832  loss: 4.3093 (4.9445)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.9595 (10.5661)  time: 0.1929  data: 0.0005  max mem: 12911
Epoch: [4]  [ 400/1251]  eta: 0:02:46  lr: 0.000864  min_lr: 0.000864  loss: 4.4092 (4.8841)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.3828 (9.9783)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [4]  [ 600/1251]  eta: 0:02:05  lr: 0.000896  min_lr: 0.000896  loss: 4.4528 (4.8580)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.6678 (9.6666)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [4]  [ 800/1251]  eta: 0:01:26  lr: 0.000928  min_lr: 0.000928  loss: 4.4016 (4.8463)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.8444 (9.6701)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [4]  [1000/1251]  eta: 0:00:47  lr: 0.000960  min_lr: 0.000960  loss: 5.1534 (4.8466)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.2463 (9.7771)  time: 0.1902  data: 0.0004  max mem: 12911
Epoch: [4]  [1200/1251]  eta: 0:00:09  lr: 0.000992  min_lr: 0.000992  loss: 4.2966 (4.8284)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.8723 (9.5851)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 4.2900 (4.8203)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.8919 (9.4647)  time: 0.1477  data: 0.0014  max mem: 12911
Epoch: [4] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 4.2900 (4.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.8919 (9.4647)
Test:  [ 0/25]  eta: 0:02:17  loss: 2.2750 (2.2750)  acc1: 51.6000 (51.6000)  acc5: 78.8000 (78.8000)  time: 5.4868  data: 5.3489  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 2.3516 (2.4685)  acc1: 46.0000 (45.4909)  acc5: 75.6000 (73.3091)  time: 0.6861  data: 0.5867  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 2.8608 (2.7742)  acc1: 38.0000 (41.3905)  acc5: 65.2000 (67.4857)  time: 0.1875  data: 0.1000  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 3.0339 (2.7617)  acc1: 38.0000 (41.5680)  acc5: 64.0000 (67.6320)  time: 0.2095  data: 0.1259  max mem: 12911
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 41.464 Acc@5 68.018 loss 2.755
Accuracy of the model on the 50000 test images: 41.5%
Max accuracy: 41.46%
Epoch: [5]  [   0/1251]  eta: 0:53:55  lr: 0.001000  min_lr: 0.001000  loss: 5.3532 (5.3532)  weight_decay: 0.0500 (0.0500)  time: 2.5865  data: 2.1937  max mem: 12911
Epoch: [5]  [ 200/1251]  eta: 0:03:32  lr: 0.001032  min_lr: 0.001032  loss: 4.1703 (4.7343)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4837 (8.6731)  time: 0.1902  data: 0.0004  max mem: 12911
Epoch: [5]  [ 400/1251]  eta: 0:02:46  lr: 0.001064  min_lr: 0.001064  loss: 4.2053 (4.6704)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.9027 (8.5138)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [5]  [ 600/1251]  eta: 0:02:05  lr: 0.001096  min_lr: 0.001096  loss: 4.7422 (4.6890)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.2929 (7.8561)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [5]  [ 800/1251]  eta: 0:01:26  lr: 0.001128  min_lr: 0.001128  loss: 4.3040 (4.6542)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.9506 (7.5556)  time: 0.1900  data: 0.0006  max mem: 12911
Epoch: [5]  [1000/1251]  eta: 0:00:47  lr: 0.001160  min_lr: 0.001160  loss: 4.0519 (4.6322)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4896 (7.5272)  time: 0.1857  data: 0.0006  max mem: 12911
Epoch: [5]  [1200/1251]  eta: 0:00:09  lr: 0.001192  min_lr: 0.001192  loss: 4.4722 (4.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.9449 (7.1866)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 4.0987 (4.6136)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4232 (7.2143)  time: 0.1465  data: 0.0008  max mem: 12911
Epoch: [5] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 4.0987 (4.6253)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4232 (7.2143)
Test:  [ 0/25]  eta: 0:02:17  loss: 2.0247 (2.0247)  acc1: 59.2000 (59.2000)  acc5: 80.8000 (80.8000)  time: 5.5047  data: 5.4130  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 2.0728 (2.1278)  acc1: 53.6000 (52.9818)  acc5: 80.4000 (78.5455)  time: 0.7421  data: 0.6490  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 2.6558 (2.4825)  acc1: 44.0000 (47.4095)  acc5: 70.0000 (72.5143)  time: 0.2080  data: 0.1193  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.7186 (2.4836)  acc1: 44.0000 (47.4080)  acc5: 67.2000 (72.3200)  time: 0.2066  data: 0.1192  max mem: 12911
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 46.986 Acc@5 72.934 loss 2.472
Accuracy of the model on the 50000 test images: 47.0%
Max accuracy: 46.99%
Epoch: [6]  [   0/1251]  eta: 1:06:40  lr: 0.001200  min_lr: 0.001200  loss: 5.6548 (5.6548)  weight_decay: 0.0500 (0.0500)  time: 3.1975  data: 3.0111  max mem: 12911
Epoch: [6]  [ 200/1251]  eta: 0:03:32  lr: 0.001232  min_lr: 0.001232  loss: 4.1788 (4.5250)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6816 (6.8782)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [6]  [ 400/1251]  eta: 0:02:46  lr: 0.001264  min_lr: 0.001264  loss: 3.7966 (4.5170)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3871 (6.5096)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [6]  [ 600/1251]  eta: 0:02:05  lr: 0.001296  min_lr: 0.001296  loss: 4.6450 (4.4935)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.6379 (6.1469)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [6]  [ 800/1251]  eta: 0:01:26  lr: 0.001328  min_lr: 0.001328  loss: 3.9561 (4.4944)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7762 (5.6591)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [6]  [1000/1251]  eta: 0:00:47  lr: 0.001360  min_lr: 0.001360  loss: 4.2313 (4.4749)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3948 (5.5967)  time: 0.1836  data: 0.0004  max mem: 12911
Epoch: [6]  [1200/1251]  eta: 0:00:09  lr: 0.001393  min_lr: 0.001393  loss: 3.8948 (4.4582)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5363 (5.4862)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 3.6759 (4.4547)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2340 (5.4154)  time: 0.1462  data: 0.0014  max mem: 12911
Epoch: [6] Total time: 0:03:57 (0.1901 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 3.6759 (4.4364)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2340 (5.4154)
Test:  [ 0/25]  eta: 0:01:23  loss: 1.7489 (1.7489)  acc1: 61.6000 (61.6000)  acc5: 83.2000 (83.2000)  time: 3.3439  data: 3.2129  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.7489 (1.8791)  acc1: 60.4000 (57.3455)  acc5: 84.0000 (82.6182)  time: 0.6077  data: 0.5172  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 2.3222 (2.2300)  acc1: 46.8000 (51.6381)  acc5: 71.2000 (76.2095)  time: 0.2597  data: 0.1766  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.5417 (2.2470)  acc1: 46.4000 (51.4720)  acc5: 70.8000 (76.0160)  time: 0.1920  data: 0.1103  max mem: 12911
Test: Total time: 0:00:09 (0.3786 s / it)
* Acc@1 51.206 Acc@5 76.400 loss 2.235
Accuracy of the model on the 50000 test images: 51.2%
Max accuracy: 51.21%
Epoch: [7]  [   0/1251]  eta: 0:58:23  lr: 0.001400  min_lr: 0.001400  loss: 5.4969 (5.4969)  weight_decay: 0.0500 (0.0500)  time: 2.8006  data: 2.0153  max mem: 12911
Epoch: [7]  [ 200/1251]  eta: 0:03:32  lr: 0.001432  min_lr: 0.001432  loss: 3.8774 (4.3389)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8268 (4.1328)  time: 0.1842  data: 0.0005  max mem: 12911
Epoch: [7]  [ 400/1251]  eta: 0:02:45  lr: 0.001464  min_lr: 0.001464  loss: 3.7121 (4.3540)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8257 (4.3116)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [7]  [ 600/1251]  eta: 0:02:04  lr: 0.001496  min_lr: 0.001496  loss: 3.6625 (4.3319)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0672 (4.2804)  time: 0.1918  data: 0.0006  max mem: 12911
Epoch: [7]  [ 800/1251]  eta: 0:01:26  lr: 0.001528  min_lr: 0.001528  loss: 4.0167 (4.3229)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7775 (4.1725)  time: 0.1910  data: 0.0006  max mem: 12911
Epoch: [7]  [1000/1251]  eta: 0:00:47  lr: 0.001561  min_lr: 0.001561  loss: 3.8587 (4.3010)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1272 (4.1703)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [7]  [1200/1251]  eta: 0:00:09  lr: 0.001593  min_lr: 0.001593  loss: 5.0577 (4.2875)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1074 (4.1195)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 3.8249 (4.2844)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8268 (4.1073)  time: 0.1464  data: 0.0010  max mem: 12911
Epoch: [7] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 3.8249 (4.2853)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8268 (4.1073)
Test:  [ 0/25]  eta: 0:02:23  loss: 1.4628 (1.4628)  acc1: 68.8000 (68.8000)  acc5: 86.8000 (86.8000)  time: 5.7217  data: 5.6075  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.7160 (1.7403)  acc1: 60.4000 (60.4364)  acc5: 86.8000 (85.0909)  time: 0.7462  data: 0.6544  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 2.1794 (2.0837)  acc1: 50.0000 (55.0476)  acc5: 76.0000 (79.6000)  time: 0.2013  data: 0.1167  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.3245 (2.0970)  acc1: 50.0000 (54.9120)  acc5: 75.2000 (79.2000)  time: 0.2010  data: 0.1167  max mem: 12911
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 54.708 Acc@5 79.466 loss 2.081
Accuracy of the model on the 50000 test images: 54.7%
Max accuracy: 54.71%
Epoch: [8]  [   0/1251]  eta: 0:56:08  lr: 0.001600  min_lr: 0.001600  loss: 3.7558 (3.7558)  weight_decay: 0.0500 (0.0500)  time: 2.6929  data: 2.3978  max mem: 12911
Epoch: [8]  [ 200/1251]  eta: 0:03:36  lr: 0.001632  min_lr: 0.001632  loss: 4.3992 (4.2820)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8791 (3.4129)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [8]  [ 400/1251]  eta: 0:02:47  lr: 0.001664  min_lr: 0.001664  loss: 3.9319 (4.2411)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4844 (3.3250)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [8]  [ 600/1251]  eta: 0:02:05  lr: 0.001696  min_lr: 0.001696  loss: 3.6854 (4.2406)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9018 (3.4491)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [8]  [ 800/1251]  eta: 0:01:26  lr: 0.001728  min_lr: 0.001728  loss: 3.5726 (4.2257)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2044 (3.4100)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [8]  [1000/1251]  eta: 0:00:48  lr: 0.001761  min_lr: 0.001761  loss: 3.6682 (4.2120)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3984 (3.3104)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [8]  [1200/1251]  eta: 0:00:09  lr: 0.001793  min_lr: 0.001793  loss: 3.5911 (4.2047)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6227 (3.2647)  time: 0.1910  data: 0.0004  max mem: 12911
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 3.5371 (4.1976)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8369 (3.2121)  time: 0.1471  data: 0.0007  max mem: 12911
Epoch: [8] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 3.5371 (4.1672)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8369 (3.2121)
Test:  [ 0/25]  eta: 0:02:18  loss: 1.5227 (1.5227)  acc1: 68.4000 (68.4000)  acc5: 86.4000 (86.4000)  time: 5.5520  data: 5.4593  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.5886 (1.6566)  acc1: 60.8000 (62.3273)  acc5: 87.6000 (86.1455)  time: 0.7441  data: 0.6495  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 2.0611 (1.9731)  acc1: 53.2000 (56.0571)  acc5: 79.6000 (80.9905)  time: 0.2117  data: 0.1246  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.1865 (1.9770)  acc1: 51.2000 (56.1600)  acc5: 76.8000 (80.9760)  time: 0.2280  data: 0.1435  max mem: 12911
Test: Total time: 0:00:10 (0.4228 s / it)
* Acc@1 56.748 Acc@5 80.934 loss 1.961
Accuracy of the model on the 50000 test images: 56.7%
Max accuracy: 56.75%
Epoch: [9]  [   0/1251]  eta: 1:05:03  lr: 0.001800  min_lr: 0.001800  loss: 4.0610 (4.0610)  weight_decay: 0.0500 (0.0500)  time: 3.1203  data: 2.9166  max mem: 12911
Epoch: [9]  [ 200/1251]  eta: 0:03:31  lr: 0.001832  min_lr: 0.001832  loss: 3.7360 (4.1642)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0252 (2.5316)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [9]  [ 400/1251]  eta: 0:02:45  lr: 0.001864  min_lr: 0.001864  loss: 3.5986 (4.1420)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4472 (2.6324)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [9]  [ 600/1251]  eta: 0:02:04  lr: 0.001896  min_lr: 0.001896  loss: 3.3136 (4.0958)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5079 (2.6740)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [9]  [ 800/1251]  eta: 0:01:25  lr: 0.001929  min_lr: 0.001929  loss: 3.6963 (4.0990)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3873 (2.6497)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [9]  [1000/1251]  eta: 0:00:47  lr: 0.001961  min_lr: 0.001961  loss: 4.0684 (4.0898)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6037 (2.6817)  time: 0.1894  data: 0.0006  max mem: 12911
Epoch: [9]  [1200/1251]  eta: 0:00:09  lr: 0.001993  min_lr: 0.001993  loss: 4.3415 (4.0834)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2552 (2.5899)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 3.5715 (4.0715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9228 (2.5640)  time: 0.1460  data: 0.0013  max mem: 12911
Epoch: [9] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 3.5715 (4.0841)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9228 (2.5640)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.4228 (1.4228)  acc1: 72.0000 (72.0000)  acc5: 86.8000 (86.8000)  time: 5.4870  data: 5.3887  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.5097 (1.5943)  acc1: 64.4000 (65.0182)  acc5: 88.8000 (86.8727)  time: 0.7304  data: 0.6332  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.9706 (1.9194)  acc1: 54.8000 (58.5143)  acc5: 78.0000 (81.6571)  time: 0.1974  data: 0.1091  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.1873 (1.9244)  acc1: 53.2000 (58.4000)  acc5: 77.2000 (81.8400)  time: 0.2075  data: 0.1232  max mem: 12911
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 58.504 Acc@5 82.056 loss 1.914
Accuracy of the model on the 50000 test images: 58.5%
Max accuracy: 58.50%
Epoch: [10]  [   0/1251]  eta: 1:04:01  lr: 0.002000  min_lr: 0.002000  loss: 5.2833 (5.2833)  weight_decay: 0.0500 (0.0500)  time: 3.0711  data: 2.8017  max mem: 12911
Epoch: [10]  [ 200/1251]  eta: 0:03:29  lr: 0.002032  min_lr: 0.002032  loss: 4.0056 (4.1325)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6095 (2.5139)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [10]  [ 400/1251]  eta: 0:02:43  lr: 0.002064  min_lr: 0.002064  loss: 3.9064 (4.1036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8838 (2.2565)  time: 0.1850  data: 0.0003  max mem: 12911
Epoch: [10]  [ 600/1251]  eta: 0:02:04  lr: 0.002096  min_lr: 0.002096  loss: 3.9998 (4.0723)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1403 (2.3417)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [10]  [ 800/1251]  eta: 0:01:25  lr: 0.002129  min_lr: 0.002129  loss: 3.5845 (4.0688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4891 (2.1941)  time: 0.1848  data: 0.0010  max mem: 12911
Epoch: [10]  [1000/1251]  eta: 0:00:47  lr: 0.002161  min_lr: 0.002161  loss: 3.5370 (4.0655)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6089 (2.2286)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [10]  [1200/1251]  eta: 0:00:09  lr: 0.002193  min_lr: 0.002193  loss: 3.3789 (4.0501)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8012 (2.1903)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 3.5923 (4.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8420 (2.1807)  time: 0.1461  data: 0.0006  max mem: 12911
Epoch: [10] Total time: 0:03:56 (0.1893 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 3.5923 (4.0024)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8420 (2.1807)
Test:  [ 0/25]  eta: 0:01:22  loss: 1.3530 (1.3530)  acc1: 72.4000 (72.4000)  acc5: 90.8000 (90.8000)  time: 3.2851  data: 3.1935  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.4346 (1.5122)  acc1: 66.4000 (65.9273)  acc5: 90.0000 (88.6909)  time: 0.6316  data: 0.5376  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.9066 (1.8209)  acc1: 56.0000 (59.8286)  acc5: 80.4000 (83.5619)  time: 0.2762  data: 0.1882  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.0520 (1.8315)  acc1: 55.6000 (59.9040)  acc5: 79.2000 (83.3120)  time: 0.2237  data: 0.1381  max mem: 12911
Test: Total time: 0:00:09 (0.3940 s / it)
* Acc@1 59.918 Acc@5 83.294 loss 1.824
Accuracy of the model on the 50000 test images: 59.9%
Max accuracy: 59.92%
Epoch: [11]  [   0/1251]  eta: 0:57:11  lr: 0.002200  min_lr: 0.002200  loss: 3.1399 (3.1399)  weight_decay: 0.0500 (0.0500)  time: 2.7432  data: 2.4839  max mem: 12911
Epoch: [11]  [ 200/1251]  eta: 0:03:33  lr: 0.002232  min_lr: 0.002232  loss: 3.3711 (3.9113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8090 (1.7430)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [11]  [ 400/1251]  eta: 0:02:45  lr: 0.002264  min_lr: 0.002264  loss: 3.3020 (3.9859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6113 (1.8273)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [11]  [ 600/1251]  eta: 0:02:05  lr: 0.002297  min_lr: 0.002297  loss: 3.5252 (3.9476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5723 (1.7804)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [11]  [ 800/1251]  eta: 0:01:26  lr: 0.002329  min_lr: 0.002329  loss: 3.3836 (3.9337)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8568 (1.8253)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [11]  [1000/1251]  eta: 0:00:47  lr: 0.002361  min_lr: 0.002361  loss: 3.2750 (3.9212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9148 (1.8782)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [11]  [1200/1251]  eta: 0:00:09  lr: 0.002393  min_lr: 0.002393  loss: 3.2434 (3.9238)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5562 (1.8059)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 4.8935 (3.9380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6982 (1.8100)  time: 0.1459  data: 0.0007  max mem: 12911
Epoch: [11] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 4.8935 (3.9376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6982 (1.8100)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.3616 (1.3616)  acc1: 72.0000 (72.0000)  acc5: 90.0000 (90.0000)  time: 5.5899  data: 5.4982  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.4377 (1.5311)  acc1: 69.2000 (66.6182)  acc5: 90.0000 (88.8364)  time: 0.7291  data: 0.6336  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.8902 (1.8293)  acc1: 56.0000 (60.1524)  acc5: 81.6000 (83.9619)  time: 0.2198  data: 0.1321  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 2.0856 (1.8412)  acc1: 53.2000 (60.0160)  acc5: 80.0000 (83.7920)  time: 0.2165  data: 0.1320  max mem: 12911
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 60.904 Acc@5 83.970 loss 1.823
Accuracy of the model on the 50000 test images: 60.9%
Max accuracy: 60.90%
Epoch: [12]  [   0/1251]  eta: 1:05:21  lr: 0.002400  min_lr: 0.002400  loss: 3.1691 (3.1691)  weight_decay: 0.0500 (0.0500)  time: 3.1345  data: 2.9250  max mem: 12911
Epoch: [12]  [ 200/1251]  eta: 0:03:31  lr: 0.002432  min_lr: 0.002432  loss: 4.0097 (3.9552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7179 (1.7185)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [12]  [ 400/1251]  eta: 0:02:46  lr: 0.002464  min_lr: 0.002464  loss: 3.5233 (3.8981)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7643 (1.7138)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [12]  [ 600/1251]  eta: 0:02:05  lr: 0.002497  min_lr: 0.002497  loss: 3.2396 (3.9175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4712 (1.6620)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [12]  [ 800/1251]  eta: 0:01:26  lr: 0.002529  min_lr: 0.002529  loss: 3.1402 (3.9426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3383 (1.6435)  time: 0.1856  data: 0.0006  max mem: 12911
Epoch: [12]  [1000/1251]  eta: 0:00:47  lr: 0.002561  min_lr: 0.002561  loss: 3.3063 (3.9271)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6299 (1.6339)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [12]  [1200/1251]  eta: 0:00:09  lr: 0.002593  min_lr: 0.002593  loss: 3.2722 (3.9092)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4213 (1.6040)  time: 0.1838  data: 0.0005  max mem: 12911
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 3.2375 (3.9084)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3961 (1.5983)  time: 0.1453  data: 0.0012  max mem: 12911
Epoch: [12] Total time: 0:03:56 (0.1894 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 3.2375 (3.8902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3961 (1.5983)
Test:  [ 0/25]  eta: 0:02:18  loss: 1.2110 (1.2110)  acc1: 75.6000 (75.6000)  acc5: 92.0000 (92.0000)  time: 5.5291  data: 5.4239  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.2506 (1.3804)  acc1: 69.2000 (68.4727)  acc5: 90.4000 (89.5636)  time: 0.7117  data: 0.6186  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.6894 (1.6928)  acc1: 59.2000 (62.8952)  acc5: 83.6000 (85.2571)  time: 0.1947  data: 0.1090  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.9835 (1.7179)  acc1: 58.0000 (62.1760)  acc5: 82.4000 (85.0240)  time: 0.1954  data: 0.1106  max mem: 12911
Test: Total time: 0:00:09 (0.3946 s / it)
* Acc@1 62.252 Acc@5 84.992 loss 1.703
Accuracy of the model on the 50000 test images: 62.3%
Max accuracy: 62.25%
Epoch: [13]  [   0/1251]  eta: 0:56:50  lr: 0.002600  min_lr: 0.002600  loss: 4.8554 (4.8554)  weight_decay: 0.0500 (0.0500)  time: 2.7261  data: 2.4428  max mem: 12911
Epoch: [13]  [ 200/1251]  eta: 0:03:32  lr: 0.002632  min_lr: 0.002632  loss: 3.3695 (3.8863)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4010 (1.5029)  time: 0.1819  data: 0.0004  max mem: 12911
Epoch: [13]  [ 400/1251]  eta: 0:02:44  lr: 0.002665  min_lr: 0.002665  loss: 4.4707 (3.8854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0914 (1.4347)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [13]  [ 600/1251]  eta: 0:02:04  lr: 0.002697  min_lr: 0.002697  loss: 3.2518 (3.8782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2950 (1.4417)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [13]  [ 800/1251]  eta: 0:01:25  lr: 0.002729  min_lr: 0.002729  loss: 3.3208 (3.8717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6074 (1.4577)  time: 0.1839  data: 0.0005  max mem: 12911
Epoch: [13]  [1000/1251]  eta: 0:00:47  lr: 0.002761  min_lr: 0.002761  loss: 4.1881 (3.8870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3770 (1.4387)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [13]  [1200/1251]  eta: 0:00:09  lr: 0.002793  min_lr: 0.002793  loss: 3.1557 (3.8609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3997 (1.4484)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 3.2633 (3.8594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3171 (1.4497)  time: 0.1474  data: 0.0010  max mem: 12911
Epoch: [13] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 3.2633 (3.8541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3171 (1.4497)
Test:  [ 0/25]  eta: 0:01:20  loss: 1.1850 (1.1850)  acc1: 75.2000 (75.2000)  acc5: 89.2000 (89.2000)  time: 3.2036  data: 3.0889  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 1.2025 (1.3534)  acc1: 70.8000 (68.3636)  acc5: 91.6000 (90.3273)  time: 0.5976  data: 0.5034  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.7411 (1.6241)  acc1: 58.8000 (63.5810)  acc5: 84.0000 (85.9810)  time: 0.2653  data: 0.1793  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.8576 (1.6377)  acc1: 58.4000 (63.3280)  acc5: 81.6000 (85.6960)  time: 0.2079  data: 0.1273  max mem: 12911
Test: Total time: 0:00:09 (0.3924 s / it)
* Acc@1 63.366 Acc@5 85.642 loss 1.632
Accuracy of the model on the 50000 test images: 63.4%
Max accuracy: 63.37%
Epoch: [14]  [   0/1251]  eta: 1:01:34  lr: 0.002800  min_lr: 0.002800  loss: 4.5961 (4.5961)  weight_decay: 0.0500 (0.0500)  time: 2.9529  data: 2.6843  max mem: 12911
Epoch: [14]  [ 200/1251]  eta: 0:03:34  lr: 0.002833  min_lr: 0.002833  loss: 3.4403 (3.7565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1456 (1.1580)  time: 0.1915  data: 0.0005  max mem: 12911
Epoch: [14]  [ 400/1251]  eta: 0:02:46  lr: 0.002865  min_lr: 0.002865  loss: 3.5394 (3.7728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1852 (1.2548)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [14]  [ 600/1251]  eta: 0:02:06  lr: 0.002897  min_lr: 0.002897  loss: 3.2707 (3.7776)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5892 (1.3550)  time: 0.1947  data: 0.0004  max mem: 12911
Epoch: [14]  [ 800/1251]  eta: 0:01:27  lr: 0.002929  min_lr: 0.002929  loss: 3.3877 (3.7495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1743 (1.3430)  time: 0.1910  data: 0.0005  max mem: 12911
Epoch: [14]  [1000/1251]  eta: 0:00:48  lr: 0.002961  min_lr: 0.002961  loss: 3.2757 (3.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1849 (1.3431)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [14]  [1200/1251]  eta: 0:00:09  lr: 0.002993  min_lr: 0.002993  loss: 3.2256 (3.7504)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.3290)  time: 0.1892  data: 0.0004  max mem: 12911
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 3.5818 (3.7506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4572 (1.3372)  time: 0.1470  data: 0.0007  max mem: 12911
Epoch: [14] Total time: 0:04:00 (0.1921 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 3.5818 (3.7930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4572 (1.3372)
Test:  [ 0/25]  eta: 0:02:12  loss: 1.2875 (1.2875)  acc1: 73.6000 (73.6000)  acc5: 90.4000 (90.4000)  time: 5.2926  data: 5.2010  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.3185 (1.4300)  acc1: 70.0000 (68.5818)  acc5: 92.4000 (90.4364)  time: 0.6890  data: 0.5923  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.8723 (1.6960)  acc1: 58.8000 (63.6952)  acc5: 82.4000 (86.0571)  time: 0.2017  data: 0.1112  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.9448 (1.7184)  acc1: 58.8000 (63.2160)  acc5: 82.0000 (85.8720)  time: 0.2255  data: 0.1388  max mem: 12911
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 63.356 Acc@5 85.658 loss 1.724
Accuracy of the model on the 50000 test images: 63.4%
Max accuracy: 63.37%
Epoch: [15]  [   0/1251]  eta: 1:05:24  lr: 0.003000  min_lr: 0.003000  loss: 3.0024 (3.0024)  weight_decay: 0.0500 (0.0500)  time: 3.1371  data: 2.2621  max mem: 12911
Epoch: [15]  [ 200/1251]  eta: 0:03:36  lr: 0.003033  min_lr: 0.003033  loss: 3.2103 (3.6602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2023 (1.3905)  time: 0.1894  data: 0.0003  max mem: 12911
Epoch: [15]  [ 400/1251]  eta: 0:02:48  lr: 0.003065  min_lr: 0.003065  loss: 3.4389 (3.7350)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1772 (1.2904)  time: 0.1933  data: 0.0004  max mem: 12911
Epoch: [15]  [ 600/1251]  eta: 0:02:07  lr: 0.003097  min_lr: 0.003097  loss: 3.3815 (3.7789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1451 (1.2819)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [15]  [ 800/1251]  eta: 0:01:27  lr: 0.003129  min_lr: 0.003129  loss: 3.3712 (3.7693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0279 (1.2381)  time: 0.1917  data: 0.0005  max mem: 12911
Epoch: [15]  [1000/1251]  eta: 0:00:48  lr: 0.003161  min_lr: 0.003161  loss: 3.3682 (3.7819)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1447 (1.2266)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [15]  [1200/1251]  eta: 0:00:09  lr: 0.003193  min_lr: 0.003193  loss: 4.4022 (3.7809)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1045 (1.2222)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 3.1223 (3.7802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9986 (1.2134)  time: 0.1454  data: 0.0008  max mem: 12911
Epoch: [15] Total time: 0:04:00 (0.1925 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 3.1223 (3.7610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9986 (1.2134)
Test:  [ 0/25]  eta: 0:02:16  loss: 1.1757 (1.1757)  acc1: 77.6000 (77.6000)  acc5: 92.0000 (92.0000)  time: 5.4786  data: 5.3444  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.2836 (1.3701)  acc1: 72.0000 (70.7636)  acc5: 92.0000 (90.6909)  time: 0.7127  data: 0.6136  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.7470 (1.6407)  acc1: 59.6000 (65.0667)  acc5: 83.6000 (86.2476)  time: 0.1913  data: 0.1038  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.8463 (1.6540)  acc1: 59.2000 (64.7040)  acc5: 82.4000 (85.9680)  time: 0.2000  data: 0.1164  max mem: 12911
Test: Total time: 0:00:09 (0.3980 s / it)
* Acc@1 64.356 Acc@5 86.206 loss 1.647
Accuracy of the model on the 50000 test images: 64.4%
Max accuracy: 64.36%
Epoch: [16]  [   0/1251]  eta: 0:57:10  lr: 0.003201  min_lr: 0.003201  loss: 2.8490 (2.8490)  weight_decay: 0.0500 (0.0500)  time: 2.7419  data: 2.2744  max mem: 12911
Epoch: [16]  [ 200/1251]  eta: 0:03:32  lr: 0.003233  min_lr: 0.003233  loss: 3.1817 (3.6865)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0361 (1.1096)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [16]  [ 400/1251]  eta: 0:02:45  lr: 0.003265  min_lr: 0.003265  loss: 3.1474 (3.7394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1519 (1.1564)  time: 0.1910  data: 0.0005  max mem: 12911
Epoch: [16]  [ 600/1251]  eta: 0:02:05  lr: 0.003297  min_lr: 0.003297  loss: 3.3438 (3.7509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1830 (1.2058)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [16]  [ 800/1251]  eta: 0:01:26  lr: 0.003329  min_lr: 0.003329  loss: 3.1701 (3.7286)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1260 (1.1976)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [16]  [1000/1251]  eta: 0:00:48  lr: 0.003361  min_lr: 0.003361  loss: 3.6875 (3.7360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0077 (1.1544)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [16]  [1200/1251]  eta: 0:00:09  lr: 0.003393  min_lr: 0.003393  loss: 3.3128 (3.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2195 (1.1530)  time: 0.1883  data: 0.0003  max mem: 12911
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 3.1277 (3.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0013 (1.1458)  time: 0.1475  data: 0.0010  max mem: 12911
Epoch: [16] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 3.1277 (3.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0013 (1.1458)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.2010 (1.2010)  acc1: 75.2000 (75.2000)  acc5: 91.2000 (91.2000)  time: 5.7640  data: 5.6722  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.3233 (1.3797)  acc1: 71.6000 (69.5636)  acc5: 92.0000 (90.9091)  time: 0.6538  data: 0.5586  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.7003 (1.6316)  acc1: 61.2000 (64.5143)  acc5: 86.4000 (86.6476)  time: 0.1635  data: 0.0760  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.7542 (1.6362)  acc1: 59.6000 (64.2400)  acc5: 85.2000 (86.5760)  time: 0.1922  data: 0.1064  max mem: 12911
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 64.456 Acc@5 86.354 loss 1.632
Accuracy of the model on the 50000 test images: 64.5%
Max accuracy: 64.46%
Epoch: [17]  [   0/1251]  eta: 1:01:23  lr: 0.003401  min_lr: 0.003401  loss: 4.5401 (4.5401)  weight_decay: 0.0500 (0.0500)  time: 2.9445  data: 2.6751  max mem: 12911
Epoch: [17]  [ 200/1251]  eta: 0:03:33  lr: 0.003433  min_lr: 0.003433  loss: 3.1386 (3.8048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9882 (1.0490)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [17]  [ 400/1251]  eta: 0:02:46  lr: 0.003465  min_lr: 0.003465  loss: 3.3546 (3.7966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0841 (1.0328)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [17]  [ 600/1251]  eta: 0:02:05  lr: 0.003497  min_lr: 0.003497  loss: 3.8543 (3.7495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0635 (1.0483)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [17]  [ 800/1251]  eta: 0:01:26  lr: 0.003529  min_lr: 0.003529  loss: 3.0192 (3.7305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8408 (1.0329)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [17]  [1000/1251]  eta: 0:00:47  lr: 0.003561  min_lr: 0.003561  loss: 3.8279 (3.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9965 (1.0303)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [17]  [1200/1251]  eta: 0:00:09  lr: 0.003593  min_lr: 0.003593  loss: 4.4513 (3.7467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8903 (1.0239)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 3.3354 (3.7516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9118 (1.0169)  time: 0.1466  data: 0.0012  max mem: 12911
Epoch: [17] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 3.3354 (3.7308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9118 (1.0169)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.2421 (1.2421)  acc1: 73.2000 (73.2000)  acc5: 91.6000 (91.6000)  time: 5.6006  data: 5.5088  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.3260 (1.3413)  acc1: 68.8000 (70.9455)  acc5: 92.4000 (91.0545)  time: 0.7306  data: 0.6339  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5919 (1.5974)  acc1: 62.8000 (65.7143)  acc5: 84.4000 (86.7238)  time: 0.2031  data: 0.1146  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.7915 (1.6077)  acc1: 61.2000 (65.4720)  acc5: 83.2000 (86.5280)  time: 0.2065  data: 0.1214  max mem: 12911
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 65.100 Acc@5 86.626 loss 1.604
Accuracy of the model on the 50000 test images: 65.1%
Max accuracy: 65.10%
Epoch: [18]  [   0/1251]  eta: 1:05:22  lr: 0.003601  min_lr: 0.003601  loss: 2.9347 (2.9347)  weight_decay: 0.0500 (0.0500)  time: 3.1353  data: 2.9503  max mem: 12911
Epoch: [18]  [ 200/1251]  eta: 0:03:31  lr: 0.003633  min_lr: 0.003633  loss: 3.0271 (3.6168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8067 (1.0488)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [18]  [ 400/1251]  eta: 0:02:45  lr: 0.003665  min_lr: 0.003665  loss: 4.0463 (3.6525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8554 (0.9693)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [18]  [ 600/1251]  eta: 0:02:04  lr: 0.003697  min_lr: 0.003697  loss: 3.5492 (3.6513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8861 (0.9712)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [18]  [ 800/1251]  eta: 0:01:26  lr: 0.003729  min_lr: 0.003729  loss: 3.3374 (3.6368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0990 (0.9970)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [18]  [1000/1251]  eta: 0:00:47  lr: 0.003761  min_lr: 0.003761  loss: 3.8709 (3.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9046 (0.9878)  time: 0.1915  data: 0.0004  max mem: 12911
Epoch: [18]  [1200/1251]  eta: 0:00:09  lr: 0.003793  min_lr: 0.003793  loss: 3.0251 (3.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8556 (0.9815)  time: 0.1846  data: 0.0005  max mem: 12911
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 3.9048 (3.6606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (0.9751)  time: 0.1464  data: 0.0010  max mem: 12911
Epoch: [18] Total time: 0:03:56 (0.1894 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 3.9048 (3.6893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (0.9751)
Test:  [ 0/25]  eta: 0:01:43  loss: 1.1892 (1.1892)  acc1: 76.0000 (76.0000)  acc5: 92.8000 (92.8000)  time: 4.1369  data: 4.0331  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.3107 (1.3799)  acc1: 73.2000 (71.3455)  acc5: 92.8000 (91.4545)  time: 0.7064  data: 0.6185  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.7135 (1.6487)  acc1: 61.6000 (65.8667)  acc5: 86.4000 (87.1429)  time: 0.2689  data: 0.1860  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.8495 (1.6603)  acc1: 61.6000 (65.4880)  acc5: 84.0000 (86.9280)  time: 0.2078  data: 0.1282  max mem: 12911
Test: Total time: 0:00:10 (0.4049 s / it)
* Acc@1 65.266 Acc@5 86.918 loss 1.656
Accuracy of the model on the 50000 test images: 65.3%
Max accuracy: 65.27%
Epoch: [19]  [   0/1251]  eta: 1:01:47  lr: 0.003801  min_lr: 0.003801  loss: 3.1275 (3.1275)  weight_decay: 0.0500 (0.0500)  time: 2.9634  data: 2.6737  max mem: 12911
Epoch: [19]  [ 200/1251]  eta: 0:03:31  lr: 0.003833  min_lr: 0.003833  loss: 3.0666 (3.7149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8315 (0.8567)  time: 0.1833  data: 0.0004  max mem: 12911
Epoch: [19]  [ 400/1251]  eta: 0:02:44  lr: 0.003865  min_lr: 0.003865  loss: 3.5589 (3.7040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.9139)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [19]  [ 600/1251]  eta: 0:02:04  lr: 0.003897  min_lr: 0.003897  loss: 3.0753 (3.6875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.9187)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [19]  [ 800/1251]  eta: 0:01:25  lr: 0.003929  min_lr: 0.003929  loss: 3.9645 (3.6756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8202 (0.9123)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [19]  [1000/1251]  eta: 0:00:47  lr: 0.003961  min_lr: 0.003961  loss: 3.7134 (3.7045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.9257)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [19]  [1200/1251]  eta: 0:00:09  lr: 0.003993  min_lr: 0.003993  loss: 3.1476 (3.6924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7741 (0.9136)  time: 0.1898  data: 0.0004  max mem: 12911
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 3.2557 (3.6991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.9095)  time: 0.1468  data: 0.0008  max mem: 12911
Epoch: [19] Total time: 0:03:57 (0.1895 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 3.2557 (3.6855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.9095)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.0496 (1.0496)  acc1: 78.4000 (78.4000)  acc5: 94.4000 (94.4000)  time: 5.6623  data: 5.5686  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1861 (1.2576)  acc1: 72.8000 (72.2545)  acc5: 93.2000 (92.0000)  time: 0.7377  data: 0.6402  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5656 (1.5532)  acc1: 63.2000 (66.3429)  acc5: 86.4000 (87.5429)  time: 0.1976  data: 0.1075  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.8114 (1.5737)  acc1: 61.6000 (65.9520)  acc5: 84.0000 (87.3280)  time: 0.1936  data: 0.1074  max mem: 12911
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 65.868 Acc@5 87.280 loss 1.573
Accuracy of the model on the 50000 test images: 65.9%
Max accuracy: 65.87%
Epoch: [20]  [   0/1251]  eta: 0:58:38  lr: 0.004000  min_lr: 0.004000  loss: 4.6968 (4.6968)  weight_decay: 0.0500 (0.0500)  time: 2.8122  data: 2.3635  max mem: 12911
Epoch: [20]  [ 200/1251]  eta: 0:03:35  lr: 0.004000  min_lr: 0.004000  loss: 3.7379 (3.5947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7831 (0.8139)  time: 0.1906  data: 0.0004  max mem: 12911
Epoch: [20]  [ 400/1251]  eta: 0:02:46  lr: 0.004000  min_lr: 0.004000  loss: 3.4136 (3.6336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8106 (0.8839)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [20]  [ 600/1251]  eta: 0:02:05  lr: 0.004000  min_lr: 0.004000  loss: 3.2470 (3.6213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6813 (0.8510)  time: 0.1903  data: 0.0005  max mem: 12911
Epoch: [20]  [ 800/1251]  eta: 0:01:26  lr: 0.004000  min_lr: 0.004000  loss: 3.0781 (3.6076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8008 (0.8494)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [20]  [1000/1251]  eta: 0:00:48  lr: 0.004000  min_lr: 0.004000  loss: 3.5335 (3.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7790 (0.8415)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [20]  [1200/1251]  eta: 0:00:09  lr: 0.004000  min_lr: 0.004000  loss: 2.9833 (3.6346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6976 (0.8339)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 3.1827 (3.6396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9153 (0.8433)  time: 0.1455  data: 0.0007  max mem: 12911
Epoch: [20] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 3.1827 (3.6603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9153 (0.8433)
Test:  [ 0/25]  eta: 0:02:27  loss: 1.0478 (1.0478)  acc1: 76.8000 (76.8000)  acc5: 94.0000 (94.0000)  time: 5.8884  data: 5.7627  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.1963 (1.2510)  acc1: 73.2000 (70.9455)  acc5: 93.2000 (92.0727)  time: 0.7043  data: 0.6128  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.6087 (1.5320)  acc1: 61.2000 (65.6000)  acc5: 85.2000 (87.6000)  time: 0.1834  data: 0.0996  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.7142 (1.5461)  acc1: 61.2000 (65.4560)  acc5: 84.4000 (87.4400)  time: 0.1969  data: 0.1141  max mem: 12911
Test: Total time: 0:00:10 (0.4107 s / it)
* Acc@1 65.766 Acc@5 87.354 loss 1.539
Accuracy of the model on the 50000 test images: 65.8%
Max accuracy: 65.87%
Epoch: [21]  [   0/1251]  eta: 1:05:28  lr: 0.004000  min_lr: 0.004000  loss: 2.9329 (2.9329)  weight_decay: 0.0500 (0.0500)  time: 3.1401  data: 2.5700  max mem: 12911
Epoch: [21]  [ 200/1251]  eta: 0:03:34  lr: 0.004000  min_lr: 0.004000  loss: 3.0191 (3.6915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8803 (0.8596)  time: 0.1838  data: 0.0005  max mem: 12911
Epoch: [21]  [ 400/1251]  eta: 0:02:47  lr: 0.004000  min_lr: 0.004000  loss: 3.8927 (3.7294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.8859)  time: 0.1908  data: 0.0004  max mem: 12911
Epoch: [21]  [ 600/1251]  eta: 0:02:05  lr: 0.004000  min_lr: 0.004000  loss: 3.2827 (3.6792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6657 (0.8497)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [21]  [ 800/1251]  eta: 0:01:26  lr: 0.004000  min_lr: 0.004000  loss: 3.1321 (3.6522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.8265)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [21]  [1000/1251]  eta: 0:00:48  lr: 0.004000  min_lr: 0.004000  loss: 2.9627 (3.6496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8162 (0.8275)  time: 0.1890  data: 0.0006  max mem: 12911
Epoch: [21]  [1200/1251]  eta: 0:00:09  lr: 0.004000  min_lr: 0.004000  loss: 3.2491 (3.6438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.8280)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 3.0217 (3.6326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6533 (0.8235)  time: 0.1455  data: 0.0007  max mem: 12911
Epoch: [21] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 3.0217 (3.6177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6533 (0.8235)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.0601 (1.0601)  acc1: 78.0000 (78.0000)  acc5: 93.6000 (93.6000)  time: 5.5164  data: 5.4247  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1243 (1.1991)  acc1: 72.4000 (72.9818)  acc5: 93.2000 (91.9636)  time: 0.7784  data: 0.6833  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5291 (1.4595)  acc1: 63.6000 (67.7333)  acc5: 86.0000 (87.8667)  time: 0.2290  data: 0.1410  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.6672 (1.4697)  acc1: 62.4000 (67.3600)  acc5: 85.2000 (87.9040)  time: 0.2262  data: 0.1409  max mem: 12911
Test: Total time: 0:00:10 (0.4197 s / it)
* Acc@1 67.084 Acc@5 88.028 loss 1.472
Accuracy of the model on the 50000 test images: 67.1%
Max accuracy: 67.08%
Epoch: [22]  [   0/1251]  eta: 0:55:12  lr: 0.003999  min_lr: 0.003999  loss: 4.7608 (4.7608)  weight_decay: 0.0500 (0.0500)  time: 2.6480  data: 1.8588  max mem: 12911
Epoch: [22]  [ 200/1251]  eta: 0:03:30  lr: 0.003999  min_lr: 0.003999  loss: 3.0810 (3.6159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (0.8164)  time: 0.1835  data: 0.0004  max mem: 12911
Epoch: [22]  [ 400/1251]  eta: 0:02:45  lr: 0.003999  min_lr: 0.003999  loss: 2.9163 (3.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6580 (0.7906)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [22]  [ 600/1251]  eta: 0:02:05  lr: 0.003999  min_lr: 0.003999  loss: 3.3119 (3.5940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7355 (0.7740)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [22]  [ 800/1251]  eta: 0:01:26  lr: 0.003999  min_lr: 0.003999  loss: 4.2014 (3.6218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6838 (0.7638)  time: 0.1923  data: 0.0006  max mem: 12911
Epoch: [22]  [1000/1251]  eta: 0:00:47  lr: 0.003999  min_lr: 0.003999  loss: 2.9893 (3.6233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8520 (0.7744)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [22]  [1200/1251]  eta: 0:00:09  lr: 0.003999  min_lr: 0.003999  loss: 2.8696 (3.6194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7122 (0.7819)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 2.8821 (3.6151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.7803)  time: 0.1455  data: 0.0009  max mem: 12911
Epoch: [22] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 2.8821 (3.5950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.7803)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.0349 (1.0349)  acc1: 78.4000 (78.4000)  acc5: 93.2000 (93.2000)  time: 5.6386  data: 5.5470  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1066 (1.1920)  acc1: 72.4000 (73.1273)  acc5: 93.2000 (92.6182)  time: 0.7544  data: 0.6606  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4767 (1.4564)  acc1: 66.8000 (68.3048)  acc5: 87.2000 (88.7238)  time: 0.2097  data: 0.1229  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.6349 (1.4698)  acc1: 64.4000 (67.8720)  acc5: 86.0000 (88.6560)  time: 0.2074  data: 0.1228  max mem: 12911
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 67.624 Acc@5 88.382 loss 1.467
Accuracy of the model on the 50000 test images: 67.6%
Max accuracy: 67.62%
Epoch: [23]  [   0/1251]  eta: 1:03:25  lr: 0.003999  min_lr: 0.003999  loss: 2.9553 (2.9553)  weight_decay: 0.0500 (0.0500)  time: 3.0423  data: 2.8151  max mem: 12911
Epoch: [23]  [ 200/1251]  eta: 0:03:33  lr: 0.003999  min_lr: 0.003999  loss: 2.9247 (3.5300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7419 (0.7148)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [23]  [ 400/1251]  eta: 0:02:45  lr: 0.003999  min_lr: 0.003999  loss: 3.2662 (3.5119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.7320)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [23]  [ 600/1251]  eta: 0:02:05  lr: 0.003998  min_lr: 0.003998  loss: 2.9180 (3.4882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7393)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [23]  [ 800/1251]  eta: 0:01:26  lr: 0.003998  min_lr: 0.003998  loss: 3.5993 (3.5112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5976 (0.7217)  time: 0.1842  data: 0.0004  max mem: 12911
Epoch: [23]  [1000/1251]  eta: 0:00:47  lr: 0.003998  min_lr: 0.003998  loss: 3.5390 (3.5182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7467 (0.7348)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [23]  [1200/1251]  eta: 0:00:09  lr: 0.003998  min_lr: 0.003998  loss: 4.5674 (3.5451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5737 (0.7387)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 3.0473 (3.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6460 (0.7373)  time: 0.1463  data: 0.0009  max mem: 12911
Epoch: [23] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 3.0473 (3.5576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6460 (0.7373)
Test:  [ 0/25]  eta: 0:01:41  loss: 1.1005 (1.1005)  acc1: 76.4000 (76.4000)  acc5: 96.4000 (96.4000)  time: 4.0508  data: 3.9510  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.1864 (1.2627)  acc1: 76.4000 (73.1273)  acc5: 93.2000 (92.5455)  time: 0.6135  data: 0.5286  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5623 (1.5171)  acc1: 63.6000 (68.1143)  acc5: 87.6000 (88.2095)  time: 0.2385  data: 0.1562  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.6370 (1.5272)  acc1: 63.6000 (67.6640)  acc5: 85.6000 (88.0960)  time: 0.2144  data: 0.1335  max mem: 12911
Test: Total time: 0:00:10 (0.4011 s / it)
* Acc@1 67.650 Acc@5 88.398 loss 1.527
Accuracy of the model on the 50000 test images: 67.7%
Max accuracy: 67.65%
Epoch: [24]  [   0/1251]  eta: 1:05:42  lr: 0.003998  min_lr: 0.003998  loss: 2.8849 (2.8849)  weight_decay: 0.0500 (0.0500)  time: 3.1518  data: 2.9505  max mem: 12911
Epoch: [24]  [ 200/1251]  eta: 0:03:33  lr: 0.003998  min_lr: 0.003998  loss: 4.3881 (3.5110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5884 (0.7094)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [24]  [ 400/1251]  eta: 0:02:45  lr: 0.003998  min_lr: 0.003998  loss: 3.8660 (3.5620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6062 (0.7357)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [24]  [ 600/1251]  eta: 0:02:05  lr: 0.003997  min_lr: 0.003997  loss: 3.2454 (3.5507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6126 (0.7244)  time: 0.1840  data: 0.0004  max mem: 12911
Epoch: [24]  [ 800/1251]  eta: 0:01:25  lr: 0.003997  min_lr: 0.003997  loss: 3.1681 (3.5360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8972 (0.7483)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [24]  [1000/1251]  eta: 0:00:47  lr: 0.003997  min_lr: 0.003997  loss: 3.7809 (3.5553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.7432)  time: 0.1923  data: 0.0005  max mem: 12911
Epoch: [24]  [1200/1251]  eta: 0:00:09  lr: 0.003997  min_lr: 0.003997  loss: 3.3972 (3.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5510 (0.7329)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 3.3544 (3.5633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (0.7337)  time: 0.1465  data: 0.0009  max mem: 12911
Epoch: [24] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 3.3544 (3.5558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (0.7337)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.0391 (1.0391)  acc1: 79.2000 (79.2000)  acc5: 96.0000 (96.0000)  time: 5.5705  data: 5.4621  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1294 (1.2200)  acc1: 71.6000 (72.5091)  acc5: 94.0000 (92.5091)  time: 0.7650  data: 0.6712  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5062 (1.4716)  acc1: 64.4000 (67.7714)  acc5: 86.0000 (88.2667)  time: 0.2286  data: 0.1420  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.6819 (1.4823)  acc1: 64.4000 (67.6480)  acc5: 84.0000 (88.1120)  time: 0.2276  data: 0.1420  max mem: 12911
Test: Total time: 0:00:10 (0.4212 s / it)
* Acc@1 67.772 Acc@5 88.530 loss 1.481
Accuracy of the model on the 50000 test images: 67.8%
Max accuracy: 67.77%
Epoch: [25]  [   0/1251]  eta: 1:04:31  lr: 0.003997  min_lr: 0.003997  loss: 2.8002 (2.8002)  weight_decay: 0.0500 (0.0500)  time: 3.0949  data: 2.8432  max mem: 12911
Epoch: [25]  [ 200/1251]  eta: 0:03:32  lr: 0.003997  min_lr: 0.003997  loss: 3.6389 (3.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.7829)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [25]  [ 400/1251]  eta: 0:02:45  lr: 0.003996  min_lr: 0.003996  loss: 3.4261 (3.5654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6608 (0.7501)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [25]  [ 600/1251]  eta: 0:02:04  lr: 0.003996  min_lr: 0.003996  loss: 2.9884 (3.5196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.7528)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [25]  [ 800/1251]  eta: 0:01:26  lr: 0.003996  min_lr: 0.003996  loss: 3.6059 (3.5248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7390)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [25]  [1000/1251]  eta: 0:00:47  lr: 0.003996  min_lr: 0.003996  loss: 3.2616 (3.5254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7320 (0.7434)  time: 0.1912  data: 0.0005  max mem: 12911
Epoch: [25]  [1200/1251]  eta: 0:00:09  lr: 0.003996  min_lr: 0.003996  loss: 3.6355 (3.5378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.7312)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 3.0222 (3.5342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6362 (0.7270)  time: 0.1467  data: 0.0015  max mem: 12911
Epoch: [25] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 3.0222 (3.5304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6362 (0.7270)
Test:  [ 0/25]  eta: 0:01:50  loss: 1.0260 (1.0260)  acc1: 75.2000 (75.2000)  acc5: 94.0000 (94.0000)  time: 4.4363  data: 4.2885  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.0260 (1.1680)  acc1: 75.2000 (72.9091)  acc5: 94.4000 (92.9818)  time: 0.6329  data: 0.5433  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4965 (1.4289)  acc1: 64.4000 (68.1524)  acc5: 88.0000 (88.7238)  time: 0.2176  data: 0.1342  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5829 (1.4501)  acc1: 63.2000 (67.6320)  acc5: 84.8000 (88.3680)  time: 0.2260  data: 0.1446  max mem: 12911
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 68.034 Acc@5 88.700 loss 1.449
Accuracy of the model on the 50000 test images: 68.0%
Max accuracy: 68.03%
Epoch: [26]  [   0/1251]  eta: 1:04:44  lr: 0.003995  min_lr: 0.003995  loss: 4.3635 (4.3635)  weight_decay: 0.0500 (0.0500)  time: 3.1052  data: 2.8070  max mem: 12911
Epoch: [26]  [ 200/1251]  eta: 0:03:31  lr: 0.003995  min_lr: 0.003995  loss: 2.8285 (3.4723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8260 (0.7664)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [26]  [ 400/1251]  eta: 0:02:45  lr: 0.003995  min_lr: 0.003995  loss: 3.1110 (3.4777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6086 (0.7200)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [26]  [ 600/1251]  eta: 0:02:04  lr: 0.003995  min_lr: 0.003995  loss: 3.8739 (3.4955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6061 (0.6954)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [26]  [ 800/1251]  eta: 0:01:26  lr: 0.003994  min_lr: 0.003994  loss: 2.8065 (3.5047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7575 (0.7046)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [26]  [1000/1251]  eta: 0:00:47  lr: 0.003994  min_lr: 0.003994  loss: 4.4377 (3.5198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6227 (0.6962)  time: 0.1934  data: 0.0004  max mem: 12911
Epoch: [26]  [1200/1251]  eta: 0:00:09  lr: 0.003994  min_lr: 0.003994  loss: 2.8244 (3.5284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6804 (0.6984)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 2.7111 (3.5222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6707 (0.6991)  time: 0.1482  data: 0.0011  max mem: 12911
Epoch: [26] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 2.7111 (3.5134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6707 (0.6991)
Test:  [ 0/25]  eta: 0:01:43  loss: 1.0609 (1.0609)  acc1: 76.4000 (76.4000)  acc5: 92.8000 (92.8000)  time: 4.1524  data: 4.0039  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.0827 (1.1851)  acc1: 72.8000 (72.5091)  acc5: 93.6000 (92.4364)  time: 0.6475  data: 0.5492  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4767 (1.4295)  acc1: 65.2000 (67.9429)  acc5: 87.6000 (88.8191)  time: 0.2407  data: 0.1516  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4983 (1.4307)  acc1: 65.2000 (68.0160)  acc5: 87.2000 (88.8640)  time: 0.2056  data: 0.1204  max mem: 12911
Test: Total time: 0:00:09 (0.3866 s / it)
* Acc@1 68.180 Acc@5 88.926 loss 1.430
Accuracy of the model on the 50000 test images: 68.2%
Max accuracy: 68.18%
Epoch: [27]  [   0/1251]  eta: 0:56:23  lr: 0.003994  min_lr: 0.003994  loss: 3.5888 (3.5888)  weight_decay: 0.0500 (0.0500)  time: 2.7050  data: 2.4426  max mem: 12911
Epoch: [27]  [ 200/1251]  eta: 0:03:35  lr: 0.003994  min_lr: 0.003994  loss: 3.6736 (3.3976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (0.6806)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [27]  [ 400/1251]  eta: 0:02:47  lr: 0.003993  min_lr: 0.003993  loss: 3.0589 (3.4495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5494 (0.7089)  time: 0.1913  data: 0.0004  max mem: 12911
Epoch: [27]  [ 600/1251]  eta: 0:02:06  lr: 0.003993  min_lr: 0.003993  loss: 2.7295 (3.4348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6575 (0.7103)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [27]  [ 800/1251]  eta: 0:01:26  lr: 0.003993  min_lr: 0.003993  loss: 2.9955 (3.4422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (nan)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [27]  [1000/1251]  eta: 0:00:48  lr: 0.003992  min_lr: 0.003992  loss: 3.3663 (3.4499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6008 (nan)  time: 0.1930  data: 0.0004  max mem: 12911
Epoch: [27]  [1200/1251]  eta: 0:00:09  lr: 0.003992  min_lr: 0.003992  loss: 3.5139 (3.4784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7582 (nan)  time: 0.1886  data: 0.0006  max mem: 12911
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 2.8483 (3.4781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7071 (nan)  time: 0.1481  data: 0.0009  max mem: 12911
Epoch: [27] Total time: 0:03:59 (0.1918 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 2.8483 (3.4922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7071 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9415 (0.9415)  acc1: 82.8000 (82.8000)  acc5: 94.8000 (94.8000)  time: 5.4864  data: 5.3895  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0503 (1.1292)  acc1: 76.0000 (74.9455)  acc5: 94.4000 (93.0182)  time: 0.7478  data: 0.6533  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4230 (1.3775)  acc1: 66.4000 (69.4095)  acc5: 88.0000 (89.3333)  time: 0.2216  data: 0.1329  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4868 (1.3961)  acc1: 66.4000 (68.9920)  acc5: 86.4000 (89.1360)  time: 0.2196  data: 0.1328  max mem: 12911
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 68.610 Acc@5 89.228 loss 1.400
Accuracy of the model on the 50000 test images: 68.6%
Max accuracy: 68.61%
Epoch: [28]  [   0/1251]  eta: 0:55:12  lr: 0.003992  min_lr: 0.003992  loss: 2.7773 (2.7773)  weight_decay: 0.0500 (0.0500)  time: 2.6480  data: 2.3821  max mem: 12911
Epoch: [28]  [ 200/1251]  eta: 0:03:31  lr: 0.003992  min_lr: 0.003992  loss: 2.8014 (3.4204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6995 (0.7274)  time: 0.1846  data: 0.0004  max mem: 12911
Epoch: [28]  [ 400/1251]  eta: 0:02:44  lr: 0.003991  min_lr: 0.003991  loss: 2.9835 (3.4564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5926 (0.7141)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [28]  [ 600/1251]  eta: 0:02:04  lr: 0.003991  min_lr: 0.003991  loss: 2.9146 (3.4686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.7243)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [28]  [ 800/1251]  eta: 0:01:26  lr: 0.003991  min_lr: 0.003991  loss: 2.9678 (3.4743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6235 (0.7085)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [28]  [1000/1251]  eta: 0:00:47  lr: 0.003990  min_lr: 0.003990  loss: 2.9466 (3.4697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6432 (0.7119)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [28]  [1200/1251]  eta: 0:00:09  lr: 0.003990  min_lr: 0.003990  loss: 3.3162 (3.4899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8692 (0.7253)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 4.1470 (3.4960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6697 (0.7238)  time: 0.1480  data: 0.0008  max mem: 12911
Epoch: [28] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 4.1470 (3.4818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6697 (0.7238)
Test:  [ 0/25]  eta: 0:02:11  loss: 1.0956 (1.0956)  acc1: 78.4000 (78.4000)  acc5: 94.0000 (94.0000)  time: 5.2683  data: 5.1729  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.2265 (1.2570)  acc1: 72.8000 (74.4727)  acc5: 94.4000 (93.0182)  time: 0.7286  data: 0.6352  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5809 (1.4877)  acc1: 67.2000 (69.3143)  acc5: 87.2000 (89.3524)  time: 0.2162  data: 0.1290  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.6289 (1.5054)  acc1: 65.2000 (68.6240)  acc5: 86.8000 (89.1520)  time: 0.2144  data: 0.1289  max mem: 12911
Test: Total time: 0:00:10 (0.4020 s / it)
* Acc@1 68.660 Acc@5 89.100 loss 1.500
Accuracy of the model on the 50000 test images: 68.7%
Max accuracy: 68.66%
Epoch: [29]  [   0/1251]  eta: 1:05:17  lr: 0.003990  min_lr: 0.003990  loss: 4.6895 (4.6895)  weight_decay: 0.0500 (0.0500)  time: 3.1316  data: 2.9207  max mem: 12911
Epoch: [29]  [ 200/1251]  eta: 0:03:31  lr: 0.003989  min_lr: 0.003989  loss: 2.8806 (3.4436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7195 (0.7322)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [29]  [ 400/1251]  eta: 0:02:45  lr: 0.003989  min_lr: 0.003989  loss: 3.0672 (3.4612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5527 (0.6858)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [29]  [ 600/1251]  eta: 0:02:05  lr: 0.003989  min_lr: 0.003989  loss: 3.0448 (3.4783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5475 (0.6625)  time: 0.1860  data: 0.0006  max mem: 12911
Epoch: [29]  [ 800/1251]  eta: 0:01:26  lr: 0.003988  min_lr: 0.003988  loss: 2.8405 (3.4935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7862 (0.6827)  time: 0.1914  data: 0.0004  max mem: 12911
Epoch: [29]  [1000/1251]  eta: 0:00:47  lr: 0.003988  min_lr: 0.003988  loss: 3.0235 (3.4792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6572 (0.6761)  time: 0.1901  data: 0.0004  max mem: 12911
Epoch: [29]  [1200/1251]  eta: 0:00:09  lr: 0.003988  min_lr: 0.003988  loss: 3.0136 (3.4876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5398 (0.6777)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 2.7972 (3.4894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5334 (0.6735)  time: 0.1460  data: 0.0008  max mem: 12911
Epoch: [29] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 2.7972 (3.4667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5334 (0.6735)
Test:  [ 0/25]  eta: 0:02:15  loss: 1.1475 (1.1475)  acc1: 76.4000 (76.4000)  acc5: 93.6000 (93.6000)  time: 5.4159  data: 5.3244  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1475 (1.2049)  acc1: 72.4000 (73.3091)  acc5: 94.8000 (92.9455)  time: 0.7446  data: 0.6513  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5310 (1.4372)  acc1: 64.0000 (68.5143)  acc5: 87.6000 (89.2952)  time: 0.2115  data: 0.1246  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5608 (1.4481)  acc1: 64.4000 (68.3200)  acc5: 86.8000 (89.1520)  time: 0.2133  data: 0.1283  max mem: 12911
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 68.948 Acc@5 89.362 loss 1.430
Accuracy of the model on the 50000 test images: 68.9%
Max accuracy: 68.95%
Epoch: [30]  [   0/1251]  eta: 0:59:00  lr: 0.003987  min_lr: 0.003987  loss: 3.6677 (3.6677)  weight_decay: 0.0500 (0.0500)  time: 2.8301  data: 2.3364  max mem: 12911
Epoch: [30]  [ 200/1251]  eta: 0:03:32  lr: 0.003987  min_lr: 0.003987  loss: 2.8063 (3.4441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6869 (0.7096)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [30]  [ 400/1251]  eta: 0:02:46  lr: 0.003987  min_lr: 0.003987  loss: 2.7738 (3.4133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6655 (0.7155)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [30]  [ 600/1251]  eta: 0:02:05  lr: 0.003986  min_lr: 0.003986  loss: 2.9937 (3.4335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7601 (0.7202)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [30]  [ 800/1251]  eta: 0:01:26  lr: 0.003986  min_lr: 0.003986  loss: 3.2892 (3.4263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7150 (0.7335)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [30]  [1000/1251]  eta: 0:00:47  lr: 0.003985  min_lr: 0.003985  loss: 2.9380 (3.4373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6723 (0.7253)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [30]  [1200/1251]  eta: 0:00:09  lr: 0.003985  min_lr: 0.003985  loss: 2.8562 (3.4382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5888 (0.7092)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.0648 (3.4419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6702 (0.7095)  time: 0.1471  data: 0.0006  max mem: 12911
Epoch: [30] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.0648 (3.4425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6702 (0.7095)
Test:  [ 0/25]  eta: 0:02:23  loss: 1.0683 (1.0683)  acc1: 80.0000 (80.0000)  acc5: 94.4000 (94.4000)  time: 5.7319  data: 5.6400  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0991 (1.2079)  acc1: 76.8000 (74.1818)  acc5: 94.0000 (92.4000)  time: 0.7336  data: 0.6395  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5433 (1.4195)  acc1: 65.6000 (69.7905)  acc5: 87.2000 (89.0857)  time: 0.1917  data: 0.1048  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5983 (1.4381)  acc1: 65.6000 (69.0400)  acc5: 86.0000 (88.8320)  time: 0.1944  data: 0.1077  max mem: 12911
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 68.982 Acc@5 89.344 loss 1.435
Accuracy of the model on the 50000 test images: 69.0%
Max accuracy: 68.98%
Epoch: [31]  [   0/1251]  eta: 1:06:07  lr: 0.003985  min_lr: 0.003985  loss: 3.4458 (3.4458)  weight_decay: 0.0500 (0.0500)  time: 3.1711  data: 2.9258  max mem: 12911
Epoch: [31]  [ 200/1251]  eta: 0:03:31  lr: 0.003984  min_lr: 0.003984  loss: 3.0030 (3.3380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6557 (0.6735)  time: 0.1908  data: 0.0004  max mem: 12911
Epoch: [31]  [ 400/1251]  eta: 0:02:45  lr: 0.003984  min_lr: 0.003984  loss: 3.0464 (3.3705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4976 (0.6283)  time: 0.1857  data: 0.0004  max mem: 12911
Epoch: [31]  [ 600/1251]  eta: 0:02:05  lr: 0.003983  min_lr: 0.003983  loss: 2.8278 (3.3801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6430 (0.6726)  time: 0.1908  data: 0.0004  max mem: 12911
Epoch: [31]  [ 800/1251]  eta: 0:01:26  lr: 0.003983  min_lr: 0.003983  loss: 2.7441 (3.4186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5942 (0.6821)  time: 0.1893  data: 0.0004  max mem: 12911
Epoch: [31]  [1000/1251]  eta: 0:00:48  lr: 0.003982  min_lr: 0.003982  loss: 2.7693 (3.4145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6755 (0.6817)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [31]  [1200/1251]  eta: 0:00:09  lr: 0.003982  min_lr: 0.003982  loss: 3.3786 (3.4131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6876 (0.6926)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 3.1977 (3.4157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7533 (0.6996)  time: 0.1466  data: 0.0009  max mem: 12911
Epoch: [31] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 3.1977 (3.4240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7533 (0.6996)
Test:  [ 0/25]  eta: 0:02:12  loss: 1.1038 (1.1038)  acc1: 76.4000 (76.4000)  acc5: 94.0000 (94.0000)  time: 5.2863  data: 5.1947  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.1409 (1.2219)  acc1: 74.8000 (74.1455)  acc5: 94.0000 (93.0182)  time: 0.6458  data: 0.5521  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.5221 (1.4570)  acc1: 65.2000 (69.0095)  acc5: 88.0000 (89.3714)  time: 0.1745  data: 0.0878  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5605 (1.4657)  acc1: 66.4000 (68.9920)  acc5: 87.2000 (89.2800)  time: 0.2185  data: 0.1339  max mem: 12911
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 69.232 Acc@5 89.444 loss 1.469
Accuracy of the model on the 50000 test images: 69.2%
Max accuracy: 69.23%
Epoch: [32]  [   0/1251]  eta: 0:59:36  lr: 0.003982  min_lr: 0.003982  loss: 4.8453 (4.8453)  weight_decay: 0.0500 (0.0500)  time: 2.8591  data: 2.5815  max mem: 12911
Epoch: [32]  [ 200/1251]  eta: 0:03:35  lr: 0.003981  min_lr: 0.003981  loss: 2.6560 (3.3283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.7564)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [32]  [ 400/1251]  eta: 0:02:47  lr: 0.003981  min_lr: 0.003981  loss: 2.9384 (3.3500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6341 (0.7151)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [32]  [ 600/1251]  eta: 0:02:06  lr: 0.003980  min_lr: 0.003980  loss: 3.1351 (3.3914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6103 (0.7371)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [32]  [ 800/1251]  eta: 0:01:26  lr: 0.003980  min_lr: 0.003980  loss: 2.8027 (3.4097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (0.7338)  time: 0.1913  data: 0.0006  max mem: 12911
Epoch: [32]  [1000/1251]  eta: 0:00:48  lr: 0.003979  min_lr: 0.003979  loss: 3.5478 (3.4373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6436 (0.7371)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [32]  [1200/1251]  eta: 0:00:09  lr: 0.003979  min_lr: 0.003979  loss: 3.6325 (3.4534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6467 (0.7258)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 2.9067 (3.4496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7205 (0.7296)  time: 0.1468  data: 0.0007  max mem: 12911
Epoch: [32] Total time: 0:03:59 (0.1917 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 2.9067 (3.4409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7205 (0.7296)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.0067 (1.0067)  acc1: 80.4000 (80.4000)  acc5: 94.0000 (94.0000)  time: 5.5033  data: 5.4106  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1079 (1.1783)  acc1: 78.4000 (75.1636)  acc5: 94.4000 (93.2727)  time: 0.7398  data: 0.6461  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4884 (1.4022)  acc1: 66.0000 (70.4191)  acc5: 88.8000 (90.0762)  time: 0.2060  data: 0.1181  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5208 (1.4117)  acc1: 65.2000 (69.8560)  acc5: 88.0000 (89.9040)  time: 0.2052  data: 0.1181  max mem: 12911
Test: Total time: 0:00:10 (0.4014 s / it)
* Acc@1 69.842 Acc@5 89.844 loss 1.403
Accuracy of the model on the 50000 test images: 69.8%
Max accuracy: 69.84%
Epoch: [33]  [   0/1251]  eta: 0:59:40  lr: 0.003979  min_lr: 0.003979  loss: 4.5787 (4.5787)  weight_decay: 0.0500 (0.0500)  time: 2.8619  data: 2.5852  max mem: 12911
Epoch: [33]  [ 200/1251]  eta: 0:03:32  lr: 0.003978  min_lr: 0.003978  loss: 2.6956 (3.3413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5874 (0.6927)  time: 0.1845  data: 0.0004  max mem: 12911
Epoch: [33]  [ 400/1251]  eta: 0:02:45  lr: 0.003978  min_lr: 0.003978  loss: 3.0041 (3.4112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6093 (0.6820)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [33]  [ 600/1251]  eta: 0:02:05  lr: 0.003977  min_lr: 0.003977  loss: 2.8892 (3.4235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (0.6783)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [33]  [ 800/1251]  eta: 0:01:26  lr: 0.003977  min_lr: 0.003977  loss: 2.9665 (3.4037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (0.6991)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [33]  [1000/1251]  eta: 0:00:47  lr: 0.003976  min_lr: 0.003976  loss: 2.7161 (3.3928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6076 (0.6972)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [33]  [1200/1251]  eta: 0:00:09  lr: 0.003976  min_lr: 0.003976  loss: 2.8628 (3.3898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6083 (0.6984)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 3.5741 (3.4044)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [33] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 3.5741 (3.4118)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 1.0421 (1.0421)  acc1: 80.4000 (80.4000)  acc5: 96.0000 (96.0000)  time: 5.4374  data: 5.3457  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0797 (1.1710)  acc1: 76.4000 (75.2364)  acc5: 94.8000 (93.4546)  time: 0.6953  data: 0.5999  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4527 (1.4253)  acc1: 66.4000 (70.0952)  acc5: 87.2000 (89.9619)  time: 0.2019  data: 0.1134  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5665 (1.4356)  acc1: 66.4000 (69.7120)  acc5: 86.8000 (89.8560)  time: 0.1991  data: 0.1133  max mem: 12911
Test: Total time: 0:00:09 (0.3956 s / it)
* Acc@1 69.772 Acc@5 89.802 loss 1.426
Accuracy of the model on the 50000 test images: 69.8%
Max accuracy: 69.84%
Epoch: [34]  [   0/1251]  eta: 1:08:07  lr: 0.003975  min_lr: 0.003975  loss: 4.5417 (4.5417)  weight_decay: 0.0500 (0.0500)  time: 3.2672  data: 2.3179  max mem: 12911
Epoch: [34]  [ 200/1251]  eta: 0:03:35  lr: 0.003975  min_lr: 0.003975  loss: 2.6797 (3.3084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9625 (0.8189)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [34]  [ 400/1251]  eta: 0:02:47  lr: 0.003974  min_lr: 0.003974  loss: 3.5727 (3.3320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6131 (0.7583)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [34]  [ 600/1251]  eta: 0:02:06  lr: 0.003974  min_lr: 0.003974  loss: 3.3826 (3.3700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5900 (0.7437)  time: 0.1864  data: 0.0007  max mem: 12911
Epoch: [34]  [ 800/1251]  eta: 0:01:26  lr: 0.003973  min_lr: 0.003973  loss: 3.0864 (3.3610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6036 (nan)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [34]  [1000/1251]  eta: 0:00:47  lr: 0.003972  min_lr: 0.003972  loss: 2.7176 (3.3685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5941 (nan)  time: 0.1907  data: 0.0004  max mem: 12911
Epoch: [34]  [1200/1251]  eta: 0:00:09  lr: 0.003972  min_lr: 0.003972  loss: 2.8057 (3.3661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6710 (nan)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 2.8432 (3.3701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6231 (nan)  time: 0.1464  data: 0.0012  max mem: 12911
Epoch: [34] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 2.8432 (3.4118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6231 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9431 (0.9431)  acc1: 81.6000 (81.6000)  acc5: 94.8000 (94.8000)  time: 5.6535  data: 5.5285  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0050 (1.1086)  acc1: 76.8000 (76.3273)  acc5: 94.8000 (93.4545)  time: 0.7346  data: 0.6452  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4050 (1.3630)  acc1: 66.4000 (70.5333)  acc5: 88.8000 (90.1143)  time: 0.2136  data: 0.1309  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5706 (1.3756)  acc1: 65.6000 (70.0960)  acc5: 87.2000 (89.9840)  time: 0.2125  data: 0.1300  max mem: 12911
Test: Total time: 0:00:10 (0.4131 s / it)
* Acc@1 70.252 Acc@5 89.888 loss 1.378
Accuracy of the model on the 50000 test images: 70.3%
Max accuracy: 70.25%
Epoch: [35]  [   0/1251]  eta: 0:59:02  lr: 0.003972  min_lr: 0.003972  loss: 4.1344 (4.1344)  weight_decay: 0.0500 (0.0500)  time: 2.8321  data: 2.4753  max mem: 12911
Epoch: [35]  [ 200/1251]  eta: 0:03:35  lr: 0.003971  min_lr: 0.003971  loss: 3.0988 (3.4543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8127 (0.7523)  time: 0.1924  data: 0.0005  max mem: 12911
Epoch: [35]  [ 400/1251]  eta: 0:02:47  lr: 0.003971  min_lr: 0.003971  loss: 3.2476 (3.3801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7491 (0.7546)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [35]  [ 600/1251]  eta: 0:02:06  lr: 0.003970  min_lr: 0.003970  loss: 3.1369 (3.4134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6537 (0.7207)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [35]  [ 800/1251]  eta: 0:01:27  lr: 0.003969  min_lr: 0.003969  loss: 3.1228 (3.4030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7503)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [35]  [1000/1251]  eta: 0:00:48  lr: 0.003969  min_lr: 0.003969  loss: 3.0394 (3.4069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6927 (0.7523)  time: 0.1991  data: 0.0005  max mem: 12911
Epoch: [35]  [1200/1251]  eta: 0:00:09  lr: 0.003968  min_lr: 0.003968  loss: 2.9166 (3.3910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6630 (0.7398)  time: 0.1928  data: 0.0004  max mem: 12911
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 4.0098 (3.3959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.7432)  time: 0.1460  data: 0.0011  max mem: 12911
Epoch: [35] Total time: 0:04:00 (0.1920 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 4.0098 (3.3963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.7432)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.9628 (0.9628)  acc1: 80.4000 (80.4000)  acc5: 95.6000 (95.6000)  time: 5.5228  data: 5.4312  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.1367 (1.1400)  acc1: 74.8000 (75.5273)  acc5: 95.2000 (93.6000)  time: 0.6883  data: 0.5942  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3932 (1.3855)  acc1: 68.0000 (70.4381)  acc5: 88.4000 (90.1333)  time: 0.1844  data: 0.0956  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5663 (1.4042)  acc1: 65.6000 (70.1440)  acc5: 87.6000 (89.9360)  time: 0.2243  data: 0.1380  max mem: 12911
Test: Total time: 0:00:10 (0.4179 s / it)
* Acc@1 70.238 Acc@5 90.136 loss 1.405
Accuracy of the model on the 50000 test images: 70.2%
Max accuracy: 70.25%
Epoch: [36]  [   0/1251]  eta: 1:05:26  lr: 0.003968  min_lr: 0.003968  loss: 4.0149 (4.0149)  weight_decay: 0.0500 (0.0500)  time: 3.1386  data: 2.1921  max mem: 12911
Epoch: [36]  [ 200/1251]  eta: 0:03:35  lr: 0.003967  min_lr: 0.003967  loss: 2.8493 (3.4633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7421 (0.7220)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [36]  [ 400/1251]  eta: 0:02:47  lr: 0.003967  min_lr: 0.003967  loss: 3.0792 (3.4298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7601 (0.7554)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [36]  [ 600/1251]  eta: 0:02:06  lr: 0.003966  min_lr: 0.003966  loss: 2.7962 (3.4102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6956 (0.7469)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [36]  [ 800/1251]  eta: 0:01:26  lr: 0.003965  min_lr: 0.003965  loss: 2.7959 (3.3909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6780 (0.7410)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [36]  [1000/1251]  eta: 0:00:48  lr: 0.003965  min_lr: 0.003965  loss: 2.7037 (3.3967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7352)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [36]  [1200/1251]  eta: 0:00:09  lr: 0.003964  min_lr: 0.003964  loss: 2.8238 (3.4039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7373)  time: 0.1952  data: 0.0004  max mem: 12911
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 2.7314 (3.4063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6099 (0.7353)  time: 0.1518  data: 0.0009  max mem: 12911
Epoch: [36] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 2.7314 (3.3744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6099 (0.7353)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.9635 (0.9635)  acc1: 79.6000 (79.6000)  acc5: 96.4000 (96.4000)  time: 5.4066  data: 5.3147  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1259 (1.1345)  acc1: 76.4000 (75.7455)  acc5: 94.4000 (94.1091)  time: 0.7446  data: 0.6493  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4383 (1.3657)  acc1: 67.6000 (70.9524)  acc5: 89.2000 (90.2476)  time: 0.2182  data: 0.1294  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4383 (1.3717)  acc1: 67.6000 (70.5920)  acc5: 88.8000 (90.3200)  time: 0.2152  data: 0.1293  max mem: 12911
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 70.568 Acc@5 90.372 loss 1.370
Accuracy of the model on the 50000 test images: 70.6%
Max accuracy: 70.57%
Epoch: [37]  [   0/1251]  eta: 1:08:47  lr: 0.003964  min_lr: 0.003964  loss: 2.7024 (2.7024)  weight_decay: 0.0500 (0.0500)  time: 3.2993  data: 3.0570  max mem: 12911
Epoch: [37]  [ 200/1251]  eta: 0:03:32  lr: 0.003963  min_lr: 0.003963  loss: 3.1285 (3.3606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7291 (0.7956)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [37]  [ 400/1251]  eta: 0:02:45  lr: 0.003962  min_lr: 0.003962  loss: 3.5038 (3.4034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5705 (0.7630)  time: 0.1862  data: 0.0003  max mem: 12911
Epoch: [37]  [ 600/1251]  eta: 0:02:04  lr: 0.003962  min_lr: 0.003962  loss: 2.7119 (3.3731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.7894)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [37]  [ 800/1251]  eta: 0:01:25  lr: 0.003961  min_lr: 0.003961  loss: 3.1582 (3.3780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7607 (0.7725)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [37]  [1000/1251]  eta: 0:00:47  lr: 0.003960  min_lr: 0.003960  loss: 4.3040 (3.3969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6570 (0.7640)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [37]  [1200/1251]  eta: 0:00:09  lr: 0.003960  min_lr: 0.003960  loss: 2.7821 (3.3870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5877 (0.7552)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 3.7029 (3.3895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.7588)  time: 0.1465  data: 0.0013  max mem: 12911
Epoch: [37] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 3.7029 (3.3709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.7588)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9600 (0.9600)  acc1: 79.6000 (79.6000)  acc5: 94.0000 (94.0000)  time: 5.5643  data: 5.4727  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0723 (1.1432)  acc1: 75.2000 (75.5636)  acc5: 94.4000 (93.8545)  time: 0.7629  data: 0.6695  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4298 (1.3766)  acc1: 68.0000 (70.7048)  acc5: 88.8000 (90.2286)  time: 0.2173  data: 0.1304  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5194 (1.3917)  acc1: 67.6000 (70.0960)  acc5: 87.2000 (90.0320)  time: 0.2157  data: 0.1304  max mem: 12911
Test: Total time: 0:00:10 (0.4122 s / it)
* Acc@1 70.430 Acc@5 90.278 loss 1.390
Accuracy of the model on the 50000 test images: 70.4%
Max accuracy: 70.57%
Epoch: [38]  [   0/1251]  eta: 1:06:48  lr: 0.003959  min_lr: 0.003959  loss: 2.8477 (2.8477)  weight_decay: 0.0500 (0.0500)  time: 3.2039  data: 2.3041  max mem: 12911
Epoch: [38]  [ 200/1251]  eta: 0:03:33  lr: 0.003959  min_lr: 0.003959  loss: 3.0905 (3.3763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6329 (0.7592)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [38]  [ 400/1251]  eta: 0:02:46  lr: 0.003958  min_lr: 0.003958  loss: 2.7762 (3.3066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.8023)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [38]  [ 600/1251]  eta: 0:02:05  lr: 0.003957  min_lr: 0.003957  loss: 2.8412 (3.3320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.7807)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [38]  [ 800/1251]  eta: 0:01:26  lr: 0.003956  min_lr: 0.003956  loss: 2.8799 (3.3468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.7729)  time: 0.1868  data: 0.0006  max mem: 12911
Epoch: [38]  [1000/1251]  eta: 0:00:47  lr: 0.003956  min_lr: 0.003956  loss: 2.7104 (3.3472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8271 (0.7712)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [38]  [1200/1251]  eta: 0:00:09  lr: 0.003955  min_lr: 0.003955  loss: 2.7589 (3.3393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.7745)  time: 0.1910  data: 0.0004  max mem: 12911
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 2.9025 (3.3406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (0.7714)  time: 0.1481  data: 0.0008  max mem: 12911
Epoch: [38] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 2.9025 (3.3683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (0.7714)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9234 (0.9234)  acc1: 80.0000 (80.0000)  acc5: 95.6000 (95.6000)  time: 5.6519  data: 5.5529  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0182 (1.0932)  acc1: 79.2000 (75.9636)  acc5: 95.6000 (94.7273)  time: 0.7544  data: 0.6581  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4180 (1.3483)  acc1: 68.0000 (70.6857)  acc5: 90.8000 (90.8952)  time: 0.2014  data: 0.1118  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5572 (1.3673)  acc1: 66.0000 (70.3360)  acc5: 87.2000 (90.5920)  time: 0.2168  data: 0.1291  max mem: 12911
Test: Total time: 0:00:10 (0.4175 s / it)
* Acc@1 70.250 Acc@5 90.322 loss 1.372
Accuracy of the model on the 50000 test images: 70.3%
Max accuracy: 70.57%
Epoch: [39]  [   0/1251]  eta: 1:01:44  lr: 0.003955  min_lr: 0.003955  loss: 3.2419 (3.2419)  weight_decay: 0.0500 (0.0500)  time: 2.9611  data: 1.5524  max mem: 12911
Epoch: [39]  [ 200/1251]  eta: 0:03:33  lr: 0.003954  min_lr: 0.003954  loss: 2.7881 (3.3376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (0.7650)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [39]  [ 400/1251]  eta: 0:02:45  lr: 0.003953  min_lr: 0.003953  loss: 3.7387 (3.3201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8238 (0.7774)  time: 0.1833  data: 0.0004  max mem: 12911
Epoch: [39]  [ 600/1251]  eta: 0:02:05  lr: 0.003952  min_lr: 0.003952  loss: 3.1501 (3.3435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6258 (0.7575)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [39]  [ 800/1251]  eta: 0:01:26  lr: 0.003952  min_lr: 0.003952  loss: 3.9345 (3.3583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8214 (0.7802)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [39]  [1000/1251]  eta: 0:00:47  lr: 0.003951  min_lr: 0.003951  loss: 2.8218 (3.3751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8211 (0.7744)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [39]  [1200/1251]  eta: 0:00:09  lr: 0.003950  min_lr: 0.003950  loss: 3.1596 (3.3756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.7896)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 2.7076 (3.3691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.7909)  time: 0.1472  data: 0.0012  max mem: 12911
Epoch: [39] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 2.7076 (3.3514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.7909)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9339 (0.9339)  acc1: 79.2000 (79.2000)  acc5: 94.0000 (94.0000)  time: 5.5034  data: 5.4113  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0169 (1.0654)  acc1: 75.2000 (76.0000)  acc5: 94.4000 (93.8545)  time: 0.7437  data: 0.6489  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3561 (1.3110)  acc1: 68.0000 (70.8762)  acc5: 88.8000 (90.1714)  time: 0.2234  data: 0.1344  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4782 (1.3239)  acc1: 67.6000 (70.5120)  acc5: 86.8000 (90.0000)  time: 0.2211  data: 0.1344  max mem: 12911
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 70.406 Acc@5 90.142 loss 1.328
Accuracy of the model on the 50000 test images: 70.4%
Max accuracy: 70.57%
Epoch: [40]  [   0/1251]  eta: 0:58:28  lr: 0.003950  min_lr: 0.003950  loss: 2.8206 (2.8206)  weight_decay: 0.0500 (0.0500)  time: 2.8046  data: 2.2555  max mem: 12911
Epoch: [40]  [ 200/1251]  eta: 0:03:36  lr: 0.003949  min_lr: 0.003949  loss: 2.8470 (3.2819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7421 (0.7983)  time: 0.1889  data: 0.0006  max mem: 12911
Epoch: [40]  [ 400/1251]  eta: 0:02:48  lr: 0.003948  min_lr: 0.003948  loss: 3.0130 (3.3478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5734 (0.7598)  time: 0.1887  data: 0.0007  max mem: 12911
Epoch: [40]  [ 600/1251]  eta: 0:02:07  lr: 0.003947  min_lr: 0.003947  loss: 2.7927 (3.3028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7411 (0.7910)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [40]  [ 800/1251]  eta: 0:01:27  lr: 0.003947  min_lr: 0.003947  loss: 2.8439 (3.2976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7905 (0.8087)  time: 0.1918  data: 0.0004  max mem: 12911
Epoch: [40]  [1000/1251]  eta: 0:00:48  lr: 0.003946  min_lr: 0.003946  loss: 4.0867 (3.3079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6243 (0.7860)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [40]  [1200/1251]  eta: 0:00:09  lr: 0.003945  min_lr: 0.003945  loss: 2.8342 (3.3239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.7898)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 2.7041 (3.3225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6513 (0.7872)  time: 0.1468  data: 0.0011  max mem: 12911
Epoch: [40] Total time: 0:04:00 (0.1922 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 2.7041 (3.3380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6513 (0.7872)
Test:  [ 0/25]  eta: 0:01:20  loss: 0.8256 (0.8256)  acc1: 82.0000 (82.0000)  acc5: 96.0000 (96.0000)  time: 3.2157  data: 3.1242  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.0185 (1.0329)  acc1: 76.8000 (75.8909)  acc5: 95.2000 (94.4364)  time: 0.6083  data: 0.5196  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3042 (1.2592)  acc1: 68.4000 (71.2952)  acc5: 90.4000 (91.2571)  time: 0.2818  data: 0.1977  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4614 (1.2725)  acc1: 68.4000 (70.8480)  acc5: 88.8000 (91.1360)  time: 0.2174  data: 0.1348  max mem: 12911
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 70.898 Acc@5 90.644 loss 1.281
Accuracy of the model on the 50000 test images: 70.9%
Max accuracy: 70.90%
Epoch: [41]  [   0/1251]  eta: 1:00:23  lr: 0.003945  min_lr: 0.003945  loss: 3.0830 (3.0830)  weight_decay: 0.0500 (0.0500)  time: 2.8964  data: 2.6252  max mem: 12911
Epoch: [41]  [ 200/1251]  eta: 0:03:32  lr: 0.003944  min_lr: 0.003944  loss: 2.8127 (3.3516)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [41]  [ 400/1251]  eta: 0:02:46  lr: 0.003943  min_lr: 0.003943  loss: 2.8547 (3.3299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6915 (nan)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [41]  [ 600/1251]  eta: 0:02:05  lr: 0.003942  min_lr: 0.003942  loss: 2.8096 (3.3571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (nan)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [41]  [ 800/1251]  eta: 0:01:26  lr: 0.003941  min_lr: 0.003941  loss: 3.3594 (3.3676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (nan)  time: 0.1870  data: 0.0003  max mem: 12911
Epoch: [41]  [1000/1251]  eta: 0:00:47  lr: 0.003940  min_lr: 0.003940  loss: 2.7316 (3.3598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (nan)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [41]  [1200/1251]  eta: 0:00:09  lr: 0.003940  min_lr: 0.003940  loss: 3.5536 (3.3433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6844 (nan)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 2.6900 (3.3393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8009 (nan)  time: 0.1474  data: 0.0007  max mem: 12911
Epoch: [41] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 2.6900 (3.3492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8009 (nan)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8749 (0.8749)  acc1: 82.4000 (82.4000)  acc5: 95.2000 (95.2000)  time: 5.3446  data: 5.2131  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9895 (1.0898)  acc1: 78.8000 (76.3636)  acc5: 95.2000 (93.9636)  time: 0.7223  data: 0.6255  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3837 (1.3194)  acc1: 68.8000 (71.5238)  acc5: 88.8000 (90.7048)  time: 0.2082  data: 0.1193  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4925 (1.3294)  acc1: 66.8000 (71.2480)  acc5: 87.2000 (90.5760)  time: 0.2104  data: 0.1238  max mem: 12911
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 70.964 Acc@5 90.700 loss 1.335
Accuracy of the model on the 50000 test images: 71.0%
Max accuracy: 70.96%
Epoch: [42]  [   0/1251]  eta: 1:02:49  lr: 0.003939  min_lr: 0.003939  loss: 2.5714 (2.5714)  weight_decay: 0.0500 (0.0500)  time: 3.0133  data: 2.7517  max mem: 12911
Epoch: [42]  [ 200/1251]  eta: 0:03:31  lr: 0.003939  min_lr: 0.003939  loss: 2.8555 (3.2950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (0.8030)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [42]  [ 400/1251]  eta: 0:02:45  lr: 0.003938  min_lr: 0.003938  loss: 2.7180 (3.3496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6207 (0.8262)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [42]  [ 600/1251]  eta: 0:02:05  lr: 0.003937  min_lr: 0.003937  loss: 2.9210 (3.3365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.8222)  time: 0.1915  data: 0.0006  max mem: 12911
Epoch: [42]  [ 800/1251]  eta: 0:01:26  lr: 0.003936  min_lr: 0.003936  loss: 2.8618 (3.3353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8559 (0.8469)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [42]  [1000/1251]  eta: 0:00:47  lr: 0.003935  min_lr: 0.003935  loss: 2.6762 (3.3257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9028 (0.8455)  time: 0.1910  data: 0.0004  max mem: 12911
Epoch: [42]  [1200/1251]  eta: 0:00:09  lr: 0.003934  min_lr: 0.003934  loss: 2.9701 (3.3168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8896 (0.8408)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 2.8601 (3.3103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (0.8416)  time: 0.1454  data: 0.0009  max mem: 12911
Epoch: [42] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 2.8601 (3.3362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (0.8416)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.9891 (0.9891)  acc1: 81.6000 (81.6000)  acc5: 95.6000 (95.6000)  time: 5.5572  data: 5.4650  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0965 (1.1155)  acc1: 77.2000 (76.5455)  acc5: 94.4000 (94.0727)  time: 0.6814  data: 0.5866  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.4093 (1.3440)  acc1: 68.4000 (72.0762)  acc5: 90.0000 (90.2857)  time: 0.1927  data: 0.1040  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5246 (1.3578)  acc1: 68.4000 (71.5680)  acc5: 87.2000 (90.0800)  time: 0.2178  data: 0.1319  max mem: 12911
Test: Total time: 0:00:10 (0.4158 s / it)
* Acc@1 71.258 Acc@5 90.430 loss 1.358
Accuracy of the model on the 50000 test images: 71.3%
Max accuracy: 71.26%
Epoch: [43]  [   0/1251]  eta: 0:57:29  lr: 0.003934  min_lr: 0.003934  loss: 2.6405 (2.6405)  weight_decay: 0.0500 (0.0500)  time: 2.7571  data: 2.4881  max mem: 12911
Epoch: [43]  [ 200/1251]  eta: 0:03:33  lr: 0.003933  min_lr: 0.003933  loss: 2.8354 (3.2868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.7664)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [43]  [ 400/1251]  eta: 0:02:46  lr: 0.003932  min_lr: 0.003932  loss: 2.9599 (3.3066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.7735)  time: 0.1923  data: 0.0005  max mem: 12911
Epoch: [43]  [ 600/1251]  eta: 0:02:05  lr: 0.003931  min_lr: 0.003931  loss: 3.3725 (3.2925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8298 (0.8178)  time: 0.1915  data: 0.0005  max mem: 12911
Epoch: [43]  [ 800/1251]  eta: 0:01:26  lr: 0.003930  min_lr: 0.003930  loss: 2.9486 (3.3203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7358 (0.8215)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [43]  [1000/1251]  eta: 0:00:48  lr: 0.003929  min_lr: 0.003929  loss: 2.8174 (3.3385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6181 (0.8355)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [43]  [1200/1251]  eta: 0:00:09  lr: 0.003928  min_lr: 0.003928  loss: 2.7504 (3.3302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.8393)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 2.8917 (3.3289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7365 (0.8354)  time: 0.1460  data: 0.0007  max mem: 12911
Epoch: [43] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 2.8917 (3.3291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7365 (0.8354)
Test:  [ 0/25]  eta: 0:01:41  loss: 0.9496 (0.9496)  acc1: 79.6000 (79.6000)  acc5: 95.2000 (95.2000)  time: 4.0717  data: 3.9748  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0142 (1.0665)  acc1: 78.0000 (76.6909)  acc5: 95.2000 (94.4000)  time: 0.6731  data: 0.5894  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3279 (1.3098)  acc1: 70.8000 (71.6762)  acc5: 88.8000 (90.8762)  time: 0.2463  data: 0.1642  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4641 (1.3234)  acc1: 67.2000 (71.2480)  acc5: 88.4000 (90.7200)  time: 0.2354  data: 0.1542  max mem: 12911
Test: Total time: 0:00:10 (0.4150 s / it)
* Acc@1 71.376 Acc@5 90.684 loss 1.325
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.38%
Epoch: [44]  [   0/1251]  eta: 1:06:07  lr: 0.003928  min_lr: 0.003928  loss: 2.9881 (2.9881)  weight_decay: 0.0500 (0.0500)  time: 3.1714  data: 2.8937  max mem: 12911
Epoch: [44]  [ 200/1251]  eta: 0:03:31  lr: 0.003927  min_lr: 0.003927  loss: 2.6791 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8350 (0.8795)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [44]  [ 400/1251]  eta: 0:02:44  lr: 0.003926  min_lr: 0.003926  loss: 2.8101 (3.3088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7658 (0.8719)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [44]  [ 600/1251]  eta: 0:02:04  lr: 0.003925  min_lr: 0.003925  loss: 2.7840 (3.3288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8802 (0.8726)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [44]  [ 800/1251]  eta: 0:01:26  lr: 0.003924  min_lr: 0.003924  loss: 2.6762 (3.3383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6637 (0.8356)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [44]  [1000/1251]  eta: 0:00:47  lr: 0.003923  min_lr: 0.003923  loss: 2.9668 (3.3291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7714 (0.8448)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [44]  [1200/1251]  eta: 0:00:09  lr: 0.003922  min_lr: 0.003922  loss: 2.8133 (3.3328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6701 (0.8376)  time: 0.1855  data: 0.0005  max mem: 12911
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 2.8458 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.8344)  time: 0.1474  data: 0.0010  max mem: 12911
Epoch: [44] Total time: 0:03:57 (0.1902 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 2.8458 (3.3087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.8344)
Test:  [ 0/25]  eta: 0:01:42  loss: 0.9209 (0.9209)  acc1: 80.8000 (80.8000)  acc5: 95.6000 (95.6000)  time: 4.0955  data: 3.9629  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.1093 (1.0713)  acc1: 76.8000 (77.5273)  acc5: 94.8000 (94.3273)  time: 0.6379  data: 0.5519  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3146 (1.3005)  acc1: 71.6000 (72.4191)  acc5: 89.6000 (90.9714)  time: 0.2422  data: 0.1617  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5036 (1.3140)  acc1: 68.8000 (71.9680)  acc5: 88.0000 (90.8160)  time: 0.2039  data: 0.1244  max mem: 12911
Test: Total time: 0:00:10 (0.4061 s / it)
* Acc@1 71.426 Acc@5 90.860 loss 1.319
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.43%
Epoch: [45]  [   0/1251]  eta: 0:56:53  lr: 0.003922  min_lr: 0.003922  loss: 2.7517 (2.7517)  weight_decay: 0.0500 (0.0500)  time: 2.7282  data: 2.4423  max mem: 12911
Epoch: [45]  [ 200/1251]  eta: 0:03:31  lr: 0.003921  min_lr: 0.003921  loss: 3.0054 (3.2771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8470 (0.9962)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [45]  [ 400/1251]  eta: 0:02:46  lr: 0.003920  min_lr: 0.003920  loss: 2.9033 (3.3073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6606 (0.9415)  time: 0.1905  data: 0.0006  max mem: 12911
Epoch: [45]  [ 600/1251]  eta: 0:02:05  lr: 0.003919  min_lr: 0.003919  loss: 2.6812 (3.2871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.9198)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [45]  [ 800/1251]  eta: 0:01:26  lr: 0.003918  min_lr: 0.003918  loss: 3.0925 (3.3104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (0.9004)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [45]  [1000/1251]  eta: 0:00:48  lr: 0.003917  min_lr: 0.003917  loss: 2.8161 (3.3182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6715 (0.8806)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [45]  [1200/1251]  eta: 0:00:09  lr: 0.003916  min_lr: 0.003916  loss: 2.6628 (3.3231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.8852)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.1135 (3.3302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7846 (0.8807)  time: 0.1458  data: 0.0010  max mem: 12911
Epoch: [45] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.1135 (3.3178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7846 (0.8807)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9284 (0.9284)  acc1: 82.8000 (82.8000)  acc5: 94.4000 (94.4000)  time: 5.7648  data: 5.6731  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0123 (1.0584)  acc1: 78.0000 (77.3455)  acc5: 94.8000 (94.4727)  time: 0.7654  data: 0.6701  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3187 (1.2913)  acc1: 70.0000 (72.1333)  acc5: 90.0000 (91.0667)  time: 0.2051  data: 0.1175  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4073 (1.2961)  acc1: 69.2000 (71.9360)  acc5: 89.2000 (90.8960)  time: 0.2020  data: 0.1174  max mem: 12911
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 71.430 Acc@5 90.764 loss 1.301
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.43%
Epoch: [46]  [   0/1251]  eta: 1:02:08  lr: 0.003916  min_lr: 0.003916  loss: 2.6128 (2.6128)  weight_decay: 0.0500 (0.0500)  time: 2.9803  data: 2.7606  max mem: 12911
Epoch: [46]  [ 200/1251]  eta: 0:03:31  lr: 0.003914  min_lr: 0.003914  loss: 3.9706 (3.3583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.9042)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [46]  [ 400/1251]  eta: 0:02:45  lr: 0.003913  min_lr: 0.003913  loss: 2.6812 (3.3654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.8814)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [46]  [ 600/1251]  eta: 0:02:04  lr: 0.003912  min_lr: 0.003912  loss: 3.1348 (3.3487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.8825)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [46]  [ 800/1251]  eta: 0:01:26  lr: 0.003911  min_lr: 0.003911  loss: 2.6501 (3.3366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8360 (0.8931)  time: 0.1919  data: 0.0003  max mem: 12911
Epoch: [46]  [1000/1251]  eta: 0:00:47  lr: 0.003910  min_lr: 0.003910  loss: 2.8396 (3.3219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7238 (0.8898)  time: 0.1896  data: 0.0007  max mem: 12911
Epoch: [46]  [1200/1251]  eta: 0:00:09  lr: 0.003909  min_lr: 0.003909  loss: 2.7215 (3.3203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (0.8701)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 3.2231 (3.3223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8256 (0.8814)  time: 0.1461  data: 0.0013  max mem: 12911
Epoch: [46] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 3.2231 (3.3080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8256 (0.8814)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.9346 (0.9346)  acc1: 84.0000 (84.0000)  acc5: 95.2000 (95.2000)  time: 5.6814  data: 5.5899  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0591 (1.0875)  acc1: 80.8000 (77.6364)  acc5: 95.2000 (94.4364)  time: 0.6991  data: 0.6024  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3937 (1.3015)  acc1: 68.8000 (72.5143)  acc5: 90.8000 (91.2191)  time: 0.1776  data: 0.0887  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4749 (1.3164)  acc1: 68.4000 (71.7920)  acc5: 88.0000 (91.1200)  time: 0.1969  data: 0.1116  max mem: 12911
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 71.578 Acc@5 90.872 loss 1.320
Accuracy of the model on the 50000 test images: 71.6%
Max accuracy: 71.58%
Epoch: [47]  [   0/1251]  eta: 0:59:11  lr: 0.003909  min_lr: 0.003909  loss: 4.1125 (4.1125)  weight_decay: 0.0500 (0.0500)  time: 2.8387  data: 2.5485  max mem: 12911
Epoch: [47]  [ 200/1251]  eta: 0:03:35  lr: 0.003908  min_lr: 0.003908  loss: 2.6356 (3.3042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9746 (0.8941)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [47]  [ 400/1251]  eta: 0:02:46  lr: 0.003907  min_lr: 0.003907  loss: 2.6608 (3.2961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6526 (0.8274)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [47]  [ 600/1251]  eta: 0:02:05  lr: 0.003906  min_lr: 0.003906  loss: 3.1117 (3.3012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7648 (0.8487)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [47]  [ 800/1251]  eta: 0:01:26  lr: 0.003905  min_lr: 0.003905  loss: 2.6224 (3.3115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (nan)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [47]  [1000/1251]  eta: 0:00:47  lr: 0.003904  min_lr: 0.003904  loss: 2.8184 (3.3070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (nan)  time: 0.1898  data: 0.0006  max mem: 12911
Epoch: [47]  [1200/1251]  eta: 0:00:09  lr: 0.003902  min_lr: 0.003902  loss: 3.9047 (3.3084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8087 (nan)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 2.9002 (3.3107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (nan)  time: 0.1458  data: 0.0008  max mem: 12911
Epoch: [47] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 2.9002 (3.3223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.9891 (0.9891)  acc1: 81.6000 (81.6000)  acc5: 94.0000 (94.0000)  time: 5.4547  data: 5.3548  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9959 (1.1136)  acc1: 78.4000 (76.7273)  acc5: 94.8000 (94.4000)  time: 0.7247  data: 0.6393  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3966 (1.3362)  acc1: 67.6000 (71.8286)  acc5: 89.2000 (90.9143)  time: 0.2170  data: 0.1352  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4721 (1.3457)  acc1: 67.6000 (71.5040)  acc5: 88.8000 (90.8800)  time: 0.2157  data: 0.1351  max mem: 12911
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 71.310 Acc@5 90.804 loss 1.340
Accuracy of the model on the 50000 test images: 71.3%
Max accuracy: 71.58%
Epoch: [48]  [   0/1251]  eta: 0:54:36  lr: 0.003902  min_lr: 0.003902  loss: 4.1267 (4.1267)  weight_decay: 0.0500 (0.0500)  time: 2.6191  data: 2.3207  max mem: 12911
Epoch: [48]  [ 200/1251]  eta: 0:03:37  lr: 0.003901  min_lr: 0.003901  loss: 2.5816 (3.2256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8468 (0.9174)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [48]  [ 400/1251]  eta: 0:02:48  lr: 0.003900  min_lr: 0.003900  loss: 2.6150 (3.2692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.9540)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [48]  [ 600/1251]  eta: 0:02:06  lr: 0.003899  min_lr: 0.003899  loss: 2.8358 (3.2826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.9139)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [48]  [ 800/1251]  eta: 0:01:27  lr: 0.003898  min_lr: 0.003898  loss: 2.7100 (3.3164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8057 (0.8962)  time: 0.1895  data: 0.0006  max mem: 12911
Epoch: [48]  [1000/1251]  eta: 0:00:48  lr: 0.003897  min_lr: 0.003897  loss: 3.0574 (3.3087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8764 (nan)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [48]  [1200/1251]  eta: 0:00:09  lr: 0.003895  min_lr: 0.003895  loss: 2.6582 (3.3046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9854 (nan)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 3.9916 (3.3052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0752 (nan)  time: 0.1469  data: 0.0009  max mem: 12911
Epoch: [48] Total time: 0:04:00 (0.1923 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 3.9916 (3.2928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0752 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9093 (0.9093)  acc1: 82.8000 (82.8000)  acc5: 95.6000 (95.6000)  time: 5.5718  data: 5.4793  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0151 (1.1127)  acc1: 77.2000 (75.9636)  acc5: 95.2000 (94.4364)  time: 0.7497  data: 0.6570  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3917 (1.3422)  acc1: 67.2000 (71.5810)  acc5: 89.2000 (90.9905)  time: 0.2112  data: 0.1251  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4918 (1.3573)  acc1: 67.2000 (71.0560)  acc5: 87.6000 (90.7840)  time: 0.2096  data: 0.1250  max mem: 12911
Test: Total time: 0:00:10 (0.4076 s / it)
* Acc@1 71.328 Acc@5 90.742 loss 1.368
Accuracy of the model on the 50000 test images: 71.3%
Max accuracy: 71.58%
Epoch: [49]  [   0/1251]  eta: 1:03:09  lr: 0.003895  min_lr: 0.003895  loss: 2.3144 (2.3144)  weight_decay: 0.0500 (0.0500)  time: 3.0290  data: 2.1325  max mem: 12911
Epoch: [49]  [ 200/1251]  eta: 0:03:33  lr: 0.003894  min_lr: 0.003894  loss: 4.1278 (3.3364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.9381)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [49]  [ 400/1251]  eta: 0:02:46  lr: 0.003893  min_lr: 0.003893  loss: 3.4342 (3.3426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9943 (0.9614)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [49]  [ 600/1251]  eta: 0:02:05  lr: 0.003892  min_lr: 0.003892  loss: 2.8436 (3.3310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7728 (0.9305)  time: 0.1858  data: 0.0005  max mem: 12911
Epoch: [49]  [ 800/1251]  eta: 0:01:26  lr: 0.003890  min_lr: 0.003890  loss: 2.6599 (3.3257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6920 (0.9053)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [49]  [1000/1251]  eta: 0:00:48  lr: 0.003889  min_lr: 0.003889  loss: 2.8832 (3.3245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.8785)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [49]  [1200/1251]  eta: 0:00:09  lr: 0.003888  min_lr: 0.003888  loss: 2.6149 (3.3228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8821 (0.8959)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 2.8161 (3.3200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7755 (0.8922)  time: 0.1463  data: 0.0008  max mem: 12911
Epoch: [49] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 2.8161 (3.2976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7755 (0.8922)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.9165 (0.9165)  acc1: 79.6000 (79.6000)  acc5: 96.0000 (96.0000)  time: 5.2170  data: 5.1178  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0450 (1.0580)  acc1: 78.4000 (77.2364)  acc5: 95.2000 (94.4727)  time: 0.7052  data: 0.6130  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3207 (1.2793)  acc1: 68.4000 (71.9048)  acc5: 90.8000 (91.2571)  time: 0.2081  data: 0.1225  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4590 (1.2865)  acc1: 68.0000 (71.7920)  acc5: 88.4000 (91.1360)  time: 0.2113  data: 0.1267  max mem: 12911
Test: Total time: 0:00:09 (0.3952 s / it)
* Acc@1 71.812 Acc@5 90.942 loss 1.291
Accuracy of the model on the 50000 test images: 71.8%
Max accuracy: 71.81%
Epoch: [50]  [   0/1251]  eta: 1:06:31  lr: 0.003888  min_lr: 0.003888  loss: 2.7255 (2.7255)  weight_decay: 0.0500 (0.0500)  time: 3.1907  data: 2.9651  max mem: 12911
Epoch: [50]  [ 200/1251]  eta: 0:03:33  lr: 0.003887  min_lr: 0.003887  loss: 2.6739 (3.2131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.9366)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [50]  [ 400/1251]  eta: 0:02:46  lr: 0.003885  min_lr: 0.003885  loss: 2.6866 (3.1953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.9739)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [50]  [ 600/1251]  eta: 0:02:05  lr: 0.003884  min_lr: 0.003884  loss: 3.0000 (3.2398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9697 (0.9711)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [50]  [ 800/1251]  eta: 0:01:26  lr: 0.003883  min_lr: 0.003883  loss: 2.8339 (3.2404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.9326)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [50]  [1000/1251]  eta: 0:00:48  lr: 0.003882  min_lr: 0.003882  loss: 2.8480 (3.2678)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1803 (0.9657)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [50]  [1200/1251]  eta: 0:00:09  lr: 0.003881  min_lr: 0.003881  loss: 2.8292 (3.2805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7667 (0.9535)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 2.5190 (3.2764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7762 (0.9473)  time: 0.1461  data: 0.0008  max mem: 12911
Epoch: [50] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 2.5190 (3.2862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7762 (0.9473)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8822 (0.8822)  acc1: 82.0000 (82.0000)  acc5: 96.0000 (96.0000)  time: 5.6580  data: 5.5664  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0128 (1.0339)  acc1: 77.2000 (76.5818)  acc5: 95.6000 (94.8727)  time: 0.7375  data: 0.6508  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3235 (1.2514)  acc1: 68.4000 (72.2667)  acc5: 90.0000 (91.6762)  time: 0.1970  data: 0.1129  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4356 (1.2627)  acc1: 68.0000 (71.8880)  acc5: 88.8000 (91.4720)  time: 0.1977  data: 0.1128  max mem: 12911
Test: Total time: 0:00:10 (0.4009 s / it)
* Acc@1 71.992 Acc@5 91.126 loss 1.269
Accuracy of the model on the 50000 test images: 72.0%
Max accuracy: 71.99%
Epoch: [51]  [   0/1251]  eta: 1:07:07  lr: 0.003880  min_lr: 0.003880  loss: 4.2635 (4.2635)  weight_decay: 0.0500 (0.0500)  time: 3.2198  data: 2.9548  max mem: 12911
Epoch: [51]  [ 200/1251]  eta: 0:03:31  lr: 0.003879  min_lr: 0.003879  loss: 2.8230 (3.3718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6866 (0.8217)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [51]  [ 400/1251]  eta: 0:02:46  lr: 0.003878  min_lr: 0.003878  loss: 3.1970 (3.3849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (0.8912)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [51]  [ 600/1251]  eta: 0:02:05  lr: 0.003877  min_lr: 0.003877  loss: 2.8161 (3.3707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8975)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [51]  [ 800/1251]  eta: 0:01:26  lr: 0.003875  min_lr: 0.003875  loss: 2.5786 (3.3306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (0.9357)  time: 0.1947  data: 0.0005  max mem: 12911
Epoch: [51]  [1000/1251]  eta: 0:00:48  lr: 0.003874  min_lr: 0.003874  loss: 3.6495 (3.3294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7300 (0.9212)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [51]  [1200/1251]  eta: 0:00:09  lr: 0.003873  min_lr: 0.003873  loss: 2.8940 (3.3106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0669 (0.9439)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 2.8162 (3.3056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8078 (0.9403)  time: 0.1464  data: 0.0010  max mem: 12911
Epoch: [51] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 2.8162 (3.2835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8078 (0.9403)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9085 (0.9085)  acc1: 83.2000 (83.2000)  acc5: 96.0000 (96.0000)  time: 5.6389  data: 5.5471  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9815 (1.0502)  acc1: 79.6000 (76.9818)  acc5: 95.2000 (94.1455)  time: 0.6701  data: 0.5757  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3473 (1.2732)  acc1: 68.4000 (72.2857)  acc5: 90.4000 (91.3905)  time: 0.1721  data: 0.0827  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4935 (1.2914)  acc1: 68.4000 (71.9200)  acc5: 88.4000 (91.0720)  time: 0.2008  data: 0.1140  max mem: 12911
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 71.862 Acc@5 91.094 loss 1.297
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.99%
Epoch: [52]  [   0/1251]  eta: 1:00:38  lr: 0.003873  min_lr: 0.003873  loss: 2.4660 (2.4660)  weight_decay: 0.0500 (0.0500)  time: 2.9086  data: 1.7587  max mem: 12911
Epoch: [52]  [ 200/1251]  eta: 0:03:35  lr: 0.003871  min_lr: 0.003871  loss: 3.0900 (3.2109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7959 (0.9521)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [52]  [ 400/1251]  eta: 0:02:46  lr: 0.003870  min_lr: 0.003870  loss: 2.8193 (3.2684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6959 (0.9374)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [52]  [ 600/1251]  eta: 0:02:05  lr: 0.003869  min_lr: 0.003869  loss: 2.7108 (3.2965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.9508)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [52]  [ 800/1251]  eta: 0:01:26  lr: 0.003867  min_lr: 0.003867  loss: 2.5930 (3.2830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.9573)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [52]  [1000/1251]  eta: 0:00:47  lr: 0.003866  min_lr: 0.003866  loss: 2.6790 (3.2589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1804 (0.9625)  time: 0.1843  data: 0.0004  max mem: 12911
Epoch: [52]  [1200/1251]  eta: 0:00:09  lr: 0.003865  min_lr: 0.003865  loss: 3.0752 (3.2858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.9311)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 2.7113 (3.2776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5867 (0.9207)  time: 0.1474  data: 0.0007  max mem: 12911
Epoch: [52] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 2.7113 (3.2818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5867 (0.9207)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8838 (0.8838)  acc1: 84.0000 (84.0000)  acc5: 96.4000 (96.4000)  time: 5.4752  data: 5.3309  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9903 (1.0575)  acc1: 78.4000 (77.2364)  acc5: 95.2000 (94.6546)  time: 0.7418  data: 0.6419  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3521 (1.3066)  acc1: 68.8000 (72.2857)  acc5: 90.0000 (91.2571)  time: 0.2065  data: 0.1190  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.5393 (1.3186)  acc1: 68.4000 (71.7280)  acc5: 88.0000 (91.0560)  time: 0.2025  data: 0.1189  max mem: 12911
Test: Total time: 0:00:10 (0.4009 s / it)
* Acc@1 71.922 Acc@5 90.960 loss 1.322
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.99%
Epoch: [53]  [   0/1251]  eta: 1:07:14  lr: 0.003865  min_lr: 0.003865  loss: 2.2680 (2.2680)  weight_decay: 0.0500 (0.0500)  time: 3.2248  data: 2.6040  max mem: 12911
Epoch: [53]  [ 200/1251]  eta: 0:03:35  lr: 0.003863  min_lr: 0.003863  loss: 3.4384 (3.3225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (1.0404)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [53]  [ 400/1251]  eta: 0:02:47  lr: 0.003862  min_lr: 0.003862  loss: 3.0515 (3.3228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9663 (0.9564)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [53]  [ 600/1251]  eta: 0:02:06  lr: 0.003861  min_lr: 0.003861  loss: 2.8348 (3.2806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8233 (0.9849)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [53]  [ 800/1251]  eta: 0:01:26  lr: 0.003859  min_lr: 0.003859  loss: 2.7332 (3.2484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.9232)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [53]  [1000/1251]  eta: 0:00:48  lr: 0.003858  min_lr: 0.003858  loss: 3.7665 (3.2607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6970 (0.9177)  time: 0.2017  data: 0.0005  max mem: 12911
Epoch: [53]  [1200/1251]  eta: 0:00:09  lr: 0.003857  min_lr: 0.003857  loss: 2.8673 (3.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.9314)  time: 0.1899  data: 0.0005  max mem: 12911
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 2.6447 (3.2726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.9340)  time: 0.1472  data: 0.0008  max mem: 12911
Epoch: [53] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 2.6447 (3.2691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.9340)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8015 (0.8015)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.4213  data: 5.3217  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.9779 (1.0070)  acc1: 77.6000 (76.9091)  acc5: 95.6000 (94.5818)  time: 0.6658  data: 0.5682  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2974 (1.2241)  acc1: 70.0000 (72.5905)  acc5: 90.8000 (91.6191)  time: 0.1800  data: 0.0916  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3917 (1.2384)  acc1: 68.8000 (72.2560)  acc5: 89.2000 (91.3440)  time: 0.2023  data: 0.1178  max mem: 12911
Test: Total time: 0:00:09 (0.3999 s / it)
* Acc@1 72.264 Acc@5 91.076 loss 1.238
Accuracy of the model on the 50000 test images: 72.3%
Max accuracy: 72.26%
Epoch: [54]  [   0/1251]  eta: 0:58:38  lr: 0.003856  min_lr: 0.003856  loss: 3.9967 (3.9967)  weight_decay: 0.0500 (0.0500)  time: 2.8125  data: 1.9758  max mem: 12911
Epoch: [54]  [ 200/1251]  eta: 0:03:33  lr: 0.003855  min_lr: 0.003855  loss: 2.7727 (3.3123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1008 (0.9647)  time: 0.1920  data: 0.0005  max mem: 12911
Epoch: [54]  [ 400/1251]  eta: 0:02:47  lr: 0.003854  min_lr: 0.003854  loss: 2.8002 (3.2789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (0.9664)  time: 0.1900  data: 0.0006  max mem: 12911
Epoch: [54]  [ 600/1251]  eta: 0:02:06  lr: 0.003852  min_lr: 0.003852  loss: 2.8662 (3.2809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6629 (0.9565)  time: 0.1903  data: 0.0006  max mem: 12911
Epoch: [54]  [ 800/1251]  eta: 0:01:27  lr: 0.003851  min_lr: 0.003851  loss: 2.8747 (3.2839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9567 (0.9643)  time: 0.1910  data: 0.0004  max mem: 12911
Epoch: [54]  [1000/1251]  eta: 0:00:48  lr: 0.003849  min_lr: 0.003849  loss: 3.1750 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.9480)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [54]  [1200/1251]  eta: 0:00:09  lr: 0.003848  min_lr: 0.003848  loss: 2.9961 (3.2679)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1030 (0.9480)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 2.7525 (3.2740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9779 (0.9508)  time: 0.1476  data: 0.0007  max mem: 12911
Epoch: [54] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 2.7525 (3.2619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9779 (0.9508)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.9470 (0.9470)  acc1: 82.4000 (82.4000)  acc5: 95.2000 (95.2000)  time: 5.3968  data: 5.3050  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9777 (1.0594)  acc1: 79.6000 (76.9091)  acc5: 95.2000 (94.4727)  time: 0.7108  data: 0.6147  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3425 (1.3030)  acc1: 67.6000 (71.6191)  acc5: 89.2000 (90.9143)  time: 0.2043  data: 0.1154  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4581 (1.3123)  acc1: 66.8000 (71.5680)  acc5: 88.0000 (90.7840)  time: 0.2008  data: 0.1154  max mem: 12911
Test: Total time: 0:00:09 (0.3959 s / it)
* Acc@1 71.652 Acc@5 90.944 loss 1.313
Accuracy of the model on the 50000 test images: 71.7%
Max accuracy: 72.26%
Epoch: [55]  [   0/1251]  eta: 1:01:56  lr: 0.003848  min_lr: 0.003848  loss: 2.5634 (2.5634)  weight_decay: 0.0500 (0.0500)  time: 2.9711  data: 2.6320  max mem: 12911
Epoch: [55]  [ 200/1251]  eta: 0:03:34  lr: 0.003846  min_lr: 0.003846  loss: 3.1511 (3.1633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9135 (1.0298)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [55]  [ 400/1251]  eta: 0:02:46  lr: 0.003845  min_lr: 0.003845  loss: 2.7930 (3.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8514 (0.9550)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [55]  [ 600/1251]  eta: 0:02:05  lr: 0.003844  min_lr: 0.003844  loss: 2.7419 (3.2117)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0107 (0.9663)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [55]  [ 800/1251]  eta: 0:01:26  lr: 0.003842  min_lr: 0.003842  loss: 2.7110 (3.2427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8889 (0.9626)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [55]  [1000/1251]  eta: 0:00:47  lr: 0.003841  min_lr: 0.003841  loss: 2.6919 (3.2427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (0.9453)  time: 0.1854  data: 0.0009  max mem: 12911
Epoch: [55]  [1200/1251]  eta: 0:00:09  lr: 0.003839  min_lr: 0.003839  loss: 3.2587 (3.2574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (0.9446)  time: 0.1854  data: 0.0003  max mem: 12911
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 3.1232 (3.2575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (0.9432)  time: 0.1466  data: 0.0006  max mem: 12911
Epoch: [55] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 3.1232 (3.2630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (0.9432)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8743 (0.8743)  acc1: 82.4000 (82.4000)  acc5: 96.8000 (96.8000)  time: 5.5365  data: 5.4332  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0334 (1.0569)  acc1: 76.4000 (76.9091)  acc5: 96.0000 (95.1636)  time: 0.7591  data: 0.6655  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3241 (1.2996)  acc1: 68.0000 (72.0952)  acc5: 92.4000 (91.5429)  time: 0.2108  data: 0.1243  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4549 (1.3174)  acc1: 68.0000 (71.7600)  acc5: 88.8000 (91.2320)  time: 0.2097  data: 0.1243  max mem: 12911
Test: Total time: 0:00:10 (0.4065 s / it)
* Acc@1 72.032 Acc@5 91.168 loss 1.315
Accuracy of the model on the 50000 test images: 72.0%
Max accuracy: 72.26%
Epoch: [56]  [   0/1251]  eta: 1:02:18  lr: 0.003839  min_lr: 0.003839  loss: 2.2316 (2.2316)  weight_decay: 0.0500 (0.0500)  time: 2.9886  data: 2.1942  max mem: 12911
Epoch: [56]  [ 200/1251]  eta: 0:03:33  lr: 0.003838  min_lr: 0.003838  loss: 2.7351 (3.2563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7738 (0.9738)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [56]  [ 400/1251]  eta: 0:02:45  lr: 0.003836  min_lr: 0.003836  loss: 2.5069 (3.2160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (1.0083)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [56]  [ 600/1251]  eta: 0:02:05  lr: 0.003835  min_lr: 0.003835  loss: 3.8991 (3.2521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8164 (0.9636)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [56]  [ 800/1251]  eta: 0:01:26  lr: 0.003833  min_lr: 0.003833  loss: 3.2785 (3.2527)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0366 (0.9772)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [56]  [1000/1251]  eta: 0:00:47  lr: 0.003832  min_lr: 0.003832  loss: 2.8632 (3.2525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7940 (0.9599)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [56]  [1200/1251]  eta: 0:00:09  lr: 0.003831  min_lr: 0.003831  loss: 2.7356 (3.2458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.9610)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 2.6828 (3.2495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.9585)  time: 0.1464  data: 0.0008  max mem: 12911
Epoch: [56] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 2.6828 (3.2457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.9585)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8638 (0.8638)  acc1: 81.2000 (81.2000)  acc5: 96.4000 (96.4000)  time: 5.6133  data: 5.4758  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9427 (1.0062)  acc1: 76.4000 (76.2909)  acc5: 95.2000 (94.7636)  time: 0.6871  data: 0.5860  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3044 (1.2276)  acc1: 70.0000 (72.4191)  acc5: 89.6000 (91.5238)  time: 0.1910  data: 0.1024  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3732 (1.2401)  acc1: 69.6000 (72.1120)  acc5: 89.2000 (91.3920)  time: 0.1957  data: 0.1111  max mem: 12911
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 72.442 Acc@5 91.304 loss 1.234
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.44%
Epoch: [57]  [   0/1251]  eta: 1:06:36  lr: 0.003830  min_lr: 0.003830  loss: 2.6383 (2.6383)  weight_decay: 0.0500 (0.0500)  time: 3.1950  data: 2.9682  max mem: 12911
Epoch: [57]  [ 200/1251]  eta: 0:03:30  lr: 0.003829  min_lr: 0.003829  loss: 2.6603 (3.2452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7457 (0.8057)  time: 0.1844  data: 0.0004  max mem: 12911
Epoch: [57]  [ 400/1251]  eta: 0:02:44  lr: 0.003827  min_lr: 0.003827  loss: 3.6687 (3.2796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8658 (0.8811)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [57]  [ 600/1251]  eta: 0:02:03  lr: 0.003826  min_lr: 0.003826  loss: 2.8407 (3.2506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6577 (0.8569)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [57]  [ 800/1251]  eta: 0:01:25  lr: 0.003824  min_lr: 0.003824  loss: 3.5327 (3.2457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.8953)  time: 0.1829  data: 0.0005  max mem: 12911
Epoch: [57]  [1000/1251]  eta: 0:00:47  lr: 0.003823  min_lr: 0.003823  loss: 3.1207 (3.2598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9283 (0.9314)  time: 0.1820  data: 0.0004  max mem: 12911
Epoch: [57]  [1200/1251]  eta: 0:00:09  lr: 0.003821  min_lr: 0.003821  loss: 3.5992 (3.2729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1792 (0.9458)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 2.6930 (3.2738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (0.9515)  time: 0.1483  data: 0.0011  max mem: 12911
Epoch: [57] Total time: 0:03:55 (0.1882 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 2.6930 (3.2585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (0.9515)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7505 (0.7505)  acc1: 82.4000 (82.4000)  acc5: 96.4000 (96.4000)  time: 5.5603  data: 5.4687  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9357 (0.9969)  acc1: 77.2000 (77.0545)  acc5: 95.6000 (94.9091)  time: 0.7685  data: 0.6707  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2824 (1.2163)  acc1: 69.6000 (72.4000)  acc5: 91.6000 (91.6000)  time: 0.2139  data: 0.1247  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3159 (1.2337)  acc1: 67.6000 (71.9040)  acc5: 89.2000 (91.5200)  time: 0.2099  data: 0.1246  max mem: 12911
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 72.202 Acc@5 91.212 loss 1.230
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.44%
Epoch: [58]  [   0/1251]  eta: 0:59:23  lr: 0.003821  min_lr: 0.003821  loss: 2.9741 (2.9741)  weight_decay: 0.0500 (0.0500)  time: 2.8489  data: 2.5340  max mem: 12911
Epoch: [58]  [ 200/1251]  eta: 0:03:35  lr: 0.003820  min_lr: 0.003820  loss: 3.5981 (3.1248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (nan)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [58]  [ 400/1251]  eta: 0:02:46  lr: 0.003818  min_lr: 0.003818  loss: 2.9615 (3.1955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (nan)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [58]  [ 600/1251]  eta: 0:02:05  lr: 0.003817  min_lr: 0.003817  loss: 2.6578 (3.1903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7998 (nan)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [58]  [ 800/1251]  eta: 0:01:26  lr: 0.003815  min_lr: 0.003815  loss: 2.6760 (3.2256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (nan)  time: 0.1867  data: 0.0006  max mem: 12911
Epoch: [58]  [1000/1251]  eta: 0:00:47  lr: 0.003813  min_lr: 0.003813  loss: 2.8415 (3.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7887 (nan)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [58]  [1200/1251]  eta: 0:00:09  lr: 0.003812  min_lr: 0.003812  loss: 3.4284 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0463 (nan)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 3.1023 (3.2456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (nan)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [58] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 3.1023 (3.2462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8865 (0.8865)  acc1: 80.8000 (80.8000)  acc5: 96.8000 (96.8000)  time: 5.7212  data: 5.6280  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0535 (1.0741)  acc1: 79.2000 (77.5273)  acc5: 95.6000 (94.7273)  time: 0.7710  data: 0.6774  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2845 (1.2752)  acc1: 69.2000 (73.1429)  acc5: 90.0000 (91.5238)  time: 0.2046  data: 0.1179  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4593 (1.2875)  acc1: 69.2000 (72.8960)  acc5: 89.2000 (91.3920)  time: 0.2094  data: 0.1247  max mem: 12911
Test: Total time: 0:00:10 (0.4143 s / it)
* Acc@1 72.322 Acc@5 91.340 loss 1.288
Accuracy of the model on the 50000 test images: 72.3%
Max accuracy: 72.44%
Epoch: [59]  [   0/1251]  eta: 0:59:01  lr: 0.003812  min_lr: 0.003812  loss: 2.5232 (2.5232)  weight_decay: 0.0500 (0.0500)  time: 2.8310  data: 1.6213  max mem: 12911
Epoch: [59]  [ 200/1251]  eta: 0:03:33  lr: 0.003810  min_lr: 0.003810  loss: 3.5996 (3.2571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8848 (0.8791)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [59]  [ 400/1251]  eta: 0:02:46  lr: 0.003809  min_lr: 0.003809  loss: 2.7577 (3.1644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7593 (0.9250)  time: 0.1910  data: 0.0004  max mem: 12911
Epoch: [59]  [ 600/1251]  eta: 0:02:05  lr: 0.003807  min_lr: 0.003807  loss: 2.7483 (3.2020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6900 (0.8930)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [59]  [ 800/1251]  eta: 0:01:26  lr: 0.003805  min_lr: 0.003805  loss: 2.6662 (3.2239)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0162 (0.9402)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [59]  [1000/1251]  eta: 0:00:47  lr: 0.003804  min_lr: 0.003804  loss: 2.6744 (3.2454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.9219)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [59]  [1200/1251]  eta: 0:00:09  lr: 0.003802  min_lr: 0.003802  loss: 3.1364 (3.2633)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (0.9495)  time: 0.1924  data: 0.0005  max mem: 12911
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 2.6657 (3.2560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (0.9495)  time: 0.1473  data: 0.0010  max mem: 12911
Epoch: [59] Total time: 0:03:57 (0.1902 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 2.6657 (3.2603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (0.9495)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8268 (0.8268)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.3308  data: 5.2365  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9295 (1.0037)  acc1: 77.6000 (77.6364)  acc5: 94.8000 (94.4727)  time: 0.7309  data: 0.6364  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2822 (1.2285)  acc1: 70.0000 (73.2000)  acc5: 90.4000 (91.3333)  time: 0.2350  data: 0.1481  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3963 (1.2399)  acc1: 69.2000 (72.8960)  acc5: 89.6000 (91.2800)  time: 0.2324  data: 0.1480  max mem: 12911
Test: Total time: 0:00:10 (0.4170 s / it)
* Acc@1 72.610 Acc@5 91.420 loss 1.232
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.61%
Epoch: [60]  [   0/1251]  eta: 0:58:20  lr: 0.003802  min_lr: 0.003802  loss: 2.4316 (2.4316)  weight_decay: 0.0500 (0.0500)  time: 2.7985  data: 2.4702  max mem: 12911
Epoch: [60]  [ 200/1251]  eta: 0:03:33  lr: 0.003800  min_lr: 0.003800  loss: 2.6935 (3.1922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9311 (0.9636)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [60]  [ 400/1251]  eta: 0:02:46  lr: 0.003799  min_lr: 0.003799  loss: 2.7550 (3.2804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8906 (0.9252)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [60]  [ 600/1251]  eta: 0:02:05  lr: 0.003797  min_lr: 0.003797  loss: 2.6231 (3.2893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8620 (0.9139)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [60]  [ 800/1251]  eta: 0:01:26  lr: 0.003796  min_lr: 0.003796  loss: 2.6790 (3.3034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7558 (0.9080)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [60]  [1000/1251]  eta: 0:00:48  lr: 0.003794  min_lr: 0.003794  loss: 2.7020 (3.2770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7293 (0.9189)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [60]  [1200/1251]  eta: 0:00:09  lr: 0.003793  min_lr: 0.003793  loss: 2.7945 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6270 (0.8961)  time: 0.1836  data: 0.0004  max mem: 12911
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 2.6971 (3.2507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.8960)  time: 0.1462  data: 0.0007  max mem: 12911
Epoch: [60] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 2.6971 (3.2514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.8960)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8687 (0.8687)  acc1: 82.4000 (82.4000)  acc5: 95.6000 (95.6000)  time: 5.5362  data: 5.3901  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9728 (1.0195)  acc1: 76.4000 (77.4545)  acc5: 95.2000 (94.6182)  time: 0.7568  data: 0.6560  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2624 (1.2386)  acc1: 69.2000 (72.8952)  acc5: 90.8000 (91.5429)  time: 0.2068  data: 0.1181  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3945 (1.2534)  acc1: 69.2000 (72.4480)  acc5: 90.0000 (91.3440)  time: 0.2028  data: 0.1181  max mem: 12911
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 72.428 Acc@5 91.410 loss 1.246
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.61%
Epoch: [61]  [   0/1251]  eta: 1:08:52  lr: 0.003792  min_lr: 0.003792  loss: 2.7058 (2.7058)  weight_decay: 0.0500 (0.0500)  time: 3.3031  data: 2.1421  max mem: 12911
Epoch: [61]  [ 200/1251]  eta: 0:03:34  lr: 0.003791  min_lr: 0.003791  loss: 2.8423 (3.1822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8522 (0.9744)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [61]  [ 400/1251]  eta: 0:02:46  lr: 0.003789  min_lr: 0.003789  loss: 2.5982 (3.1974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (0.9587)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [61]  [ 600/1251]  eta: 0:02:05  lr: 0.003787  min_lr: 0.003787  loss: 3.2832 (3.2386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (0.9688)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [61]  [ 800/1251]  eta: 0:01:26  lr: 0.003786  min_lr: 0.003786  loss: 3.0452 (3.2336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.9205)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [61]  [1000/1251]  eta: 0:00:47  lr: 0.003784  min_lr: 0.003784  loss: 3.4387 (3.2719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8694 (0.9133)  time: 0.1836  data: 0.0005  max mem: 12911
Epoch: [61]  [1200/1251]  eta: 0:00:09  lr: 0.003782  min_lr: 0.003782  loss: 2.7184 (3.2622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7932 (0.9168)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 2.6871 (3.2588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.9070)  time: 0.1462  data: 0.0010  max mem: 12911
Epoch: [61] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 2.6871 (3.2357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.9070)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8639 (0.8639)  acc1: 82.0000 (82.0000)  acc5: 95.6000 (95.6000)  time: 5.5282  data: 5.3999  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0420 (1.0152)  acc1: 80.0000 (77.4182)  acc5: 95.6000 (94.8364)  time: 0.7215  data: 0.6230  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2309 (1.2487)  acc1: 70.4000 (72.5333)  acc5: 89.6000 (91.4286)  time: 0.2026  data: 0.1145  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4216 (1.2584)  acc1: 69.6000 (72.1920)  acc5: 88.4000 (91.2480)  time: 0.2011  data: 0.1144  max mem: 12911
Test: Total time: 0:00:10 (0.4015 s / it)
* Acc@1 72.718 Acc@5 91.576 loss 1.246
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.72%
Epoch: [62]  [   0/1251]  eta: 1:06:16  lr: 0.003782  min_lr: 0.003782  loss: 2.7935 (2.7935)  weight_decay: 0.0500 (0.0500)  time: 3.1784  data: 2.9245  max mem: 12911
Epoch: [62]  [ 200/1251]  eta: 0:03:35  lr: 0.003780  min_lr: 0.003780  loss: 2.7062 (3.2047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (1.1156)  time: 0.1917  data: 0.0005  max mem: 12911
Epoch: [62]  [ 400/1251]  eta: 0:02:47  lr: 0.003779  min_lr: 0.003779  loss: 2.7053 (3.1887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (1.0140)  time: 0.1873  data: 0.0006  max mem: 12911
Epoch: [62]  [ 600/1251]  eta: 0:02:06  lr: 0.003777  min_lr: 0.003777  loss: 4.0797 (3.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8223 (0.9893)  time: 0.1913  data: 0.0004  max mem: 12911
Epoch: [62]  [ 800/1251]  eta: 0:01:27  lr: 0.003775  min_lr: 0.003775  loss: 3.6296 (3.2523)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0163 (0.9926)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [62]  [1000/1251]  eta: 0:00:48  lr: 0.003774  min_lr: 0.003774  loss: 2.9431 (3.2605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (0.9807)  time: 0.1905  data: 0.0004  max mem: 12911
Epoch: [62]  [1200/1251]  eta: 0:00:09  lr: 0.003772  min_lr: 0.003772  loss: 3.4963 (3.2748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.9728)  time: 0.1828  data: 0.0005  max mem: 12911
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 2.6325 (3.2750)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0935 (0.9760)  time: 0.1466  data: 0.0015  max mem: 12911
Epoch: [62] Total time: 0:04:00 (0.1918 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 2.6325 (3.2433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0935 (0.9760)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8038 (0.8038)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.6253  data: 5.5337  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9804 (1.0216)  acc1: 78.0000 (77.8545)  acc5: 95.2000 (95.0545)  time: 0.6720  data: 0.5767  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3075 (1.2560)  acc1: 69.6000 (72.5143)  acc5: 90.4000 (91.6000)  time: 0.1754  data: 0.0878  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4140 (1.2697)  acc1: 68.0000 (72.2880)  acc5: 88.4000 (91.4880)  time: 0.2011  data: 0.1165  max mem: 12911
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 72.258 Acc@5 91.232 loss 1.260
Accuracy of the model on the 50000 test images: 72.3%
Max accuracy: 72.72%
Epoch: [63]  [   0/1251]  eta: 1:03:23  lr: 0.003772  min_lr: 0.003772  loss: 4.2189 (4.2189)  weight_decay: 0.0500 (0.0500)  time: 3.0401  data: 2.3264  max mem: 12911
Epoch: [63]  [ 200/1251]  eta: 0:03:34  lr: 0.003770  min_lr: 0.003770  loss: 2.6691 (3.1973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6448 (0.9076)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [63]  [ 400/1251]  eta: 0:02:46  lr: 0.003768  min_lr: 0.003768  loss: 3.2889 (3.2020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6207 (0.9127)  time: 0.1889  data: 0.0003  max mem: 12911
Epoch: [63]  [ 600/1251]  eta: 0:02:05  lr: 0.003767  min_lr: 0.003767  loss: 2.7652 (3.2078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (0.9376)  time: 0.1843  data: 0.0004  max mem: 12911
Epoch: [63]  [ 800/1251]  eta: 0:01:26  lr: 0.003765  min_lr: 0.003765  loss: 2.7456 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6755 (0.9189)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [63]  [1000/1251]  eta: 0:00:47  lr: 0.003763  min_lr: 0.003763  loss: 3.7066 (3.2322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9186 (0.9470)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [63]  [1200/1251]  eta: 0:00:09  lr: 0.003762  min_lr: 0.003762  loss: 3.6373 (3.2259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8241 (0.9523)  time: 0.1979  data: 0.0011  max mem: 12911
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 2.7786 (3.2275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7556 (0.9458)  time: 0.1550  data: 0.0010  max mem: 12911
Epoch: [63] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 2.7786 (3.2372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7556 (0.9458)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8689 (0.8689)  acc1: 81.6000 (81.6000)  acc5: 95.6000 (95.6000)  time: 5.4080  data: 5.3163  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0098 (1.0543)  acc1: 77.6000 (77.7818)  acc5: 94.8000 (94.5091)  time: 0.7279  data: 0.6340  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3297 (1.2681)  acc1: 70.0000 (72.9524)  acc5: 91.2000 (91.6000)  time: 0.2129  data: 0.1261  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4196 (1.2768)  acc1: 68.8000 (72.5120)  acc5: 89.6000 (91.5200)  time: 0.2105  data: 0.1260  max mem: 12911
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 72.446 Acc@5 91.366 loss 1.275
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.72%
Epoch: [64]  [   0/1251]  eta: 1:05:58  lr: 0.003761  min_lr: 0.003761  loss: 2.3724 (2.3724)  weight_decay: 0.0500 (0.0500)  time: 3.1640  data: 2.8017  max mem: 12911
Epoch: [64]  [ 200/1251]  eta: 0:03:34  lr: 0.003760  min_lr: 0.003760  loss: 2.6134 (3.2314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7965 (0.9174)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [64]  [ 400/1251]  eta: 0:02:46  lr: 0.003758  min_lr: 0.003758  loss: 2.9231 (3.2081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7344 (0.9280)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [64]  [ 600/1251]  eta: 0:02:06  lr: 0.003756  min_lr: 0.003756  loss: 3.7096 (3.2091)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0025 (0.9458)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [64]  [ 800/1251]  eta: 0:01:26  lr: 0.003754  min_lr: 0.003754  loss: 2.5799 (3.2117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6670 (0.9644)  time: 0.1851  data: 0.0005  max mem: 12911
Epoch: [64]  [1000/1251]  eta: 0:00:47  lr: 0.003753  min_lr: 0.003753  loss: 3.1985 (3.2034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9050 (0.9464)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [64]  [1200/1251]  eta: 0:00:09  lr: 0.003751  min_lr: 0.003751  loss: 2.6276 (3.2063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6610 (0.9182)  time: 0.1862  data: 0.0006  max mem: 12911
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 2.5835 (3.2036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8521 (0.9173)  time: 0.1463  data: 0.0011  max mem: 12911
Epoch: [64] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 2.5835 (3.2254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8521 (0.9173)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8447 (0.8447)  acc1: 81.2000 (81.2000)  acc5: 95.6000 (95.6000)  time: 5.4485  data: 5.3551  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9567 (1.0100)  acc1: 79.6000 (77.8909)  acc5: 95.6000 (94.6909)  time: 0.7401  data: 0.6447  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2752 (1.2292)  acc1: 70.8000 (72.8762)  acc5: 90.4000 (91.4476)  time: 0.2156  data: 0.1248  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3254 (1.2418)  acc1: 70.0000 (72.5440)  acc5: 89.6000 (91.3760)  time: 0.2142  data: 0.1247  max mem: 12911
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 72.900 Acc@5 91.478 loss 1.229
Accuracy of the model on the 50000 test images: 72.9%
Max accuracy: 72.90%
Epoch: [65]  [   0/1251]  eta: 0:56:28  lr: 0.003751  min_lr: 0.003751  loss: 2.3530 (2.3530)  weight_decay: 0.0500 (0.0500)  time: 2.7088  data: 2.4227  max mem: 12911
Epoch: [65]  [ 200/1251]  eta: 0:03:32  lr: 0.003749  min_lr: 0.003749  loss: 3.0097 (3.2650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7588 (0.9150)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [65]  [ 400/1251]  eta: 0:02:46  lr: 0.003747  min_lr: 0.003747  loss: 3.5304 (3.2691)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0256 (0.9438)  time: 0.1923  data: 0.0006  max mem: 12911
Epoch: [65]  [ 600/1251]  eta: 0:02:05  lr: 0.003745  min_lr: 0.003745  loss: 2.6579 (3.2229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7343 (0.9096)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [65]  [ 800/1251]  eta: 0:01:26  lr: 0.003744  min_lr: 0.003744  loss: 2.7693 (3.2334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6361 (0.8717)  time: 0.1912  data: 0.0005  max mem: 12911
Epoch: [65]  [1000/1251]  eta: 0:00:48  lr: 0.003742  min_lr: 0.003742  loss: 2.5992 (3.2316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2267 (nan)  time: 0.1911  data: 0.0006  max mem: 12911
Epoch: [65]  [1200/1251]  eta: 0:00:09  lr: 0.003740  min_lr: 0.003740  loss: 3.5406 (3.2394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6479 (nan)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 3.4724 (3.2434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8331 (nan)  time: 0.1471  data: 0.0011  max mem: 12911
Epoch: [65] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 3.4724 (3.2277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8331 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.9737 (0.9737)  acc1: 80.0000 (80.0000)  acc5: 94.4000 (94.4000)  time: 5.4439  data: 5.3053  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.1018 (1.1044)  acc1: 77.2000 (76.1818)  acc5: 94.8000 (94.3636)  time: 0.7388  data: 0.6391  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3514 (1.2979)  acc1: 69.2000 (71.9619)  acc5: 90.0000 (91.5810)  time: 0.2009  data: 0.1123  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3802 (1.3124)  acc1: 68.8000 (71.6960)  acc5: 89.2000 (91.4560)  time: 0.1969  data: 0.1122  max mem: 12911
Test: Total time: 0:00:09 (0.3961 s / it)
* Acc@1 72.472 Acc@5 91.438 loss 1.299
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.90%
Epoch: [66]  [   0/1251]  eta: 1:00:00  lr: 0.003740  min_lr: 0.003740  loss: 2.4292 (2.4292)  weight_decay: 0.0500 (0.0500)  time: 2.8779  data: 2.1149  max mem: 12911
Epoch: [66]  [ 200/1251]  eta: 0:03:35  lr: 0.003738  min_lr: 0.003738  loss: 2.6498 (3.1627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6827 (nan)  time: 0.1856  data: 0.0003  max mem: 12911
Epoch: [66]  [ 400/1251]  eta: 0:02:46  lr: 0.003736  min_lr: 0.003736  loss: 2.5768 (3.1989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6033 (nan)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [66]  [ 600/1251]  eta: 0:02:05  lr: 0.003734  min_lr: 0.003734  loss: 2.7064 (3.2135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8675 (nan)  time: 0.1867  data: 0.0003  max mem: 12911
Epoch: [66]  [ 800/1251]  eta: 0:01:26  lr: 0.003732  min_lr: 0.003732  loss: 2.9744 (3.2219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7860 (nan)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [66]  [1000/1251]  eta: 0:00:47  lr: 0.003731  min_lr: 0.003731  loss: 2.9397 (3.2239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (nan)  time: 0.1887  data: 0.0006  max mem: 12911
Epoch: [66]  [1200/1251]  eta: 0:00:09  lr: 0.003729  min_lr: 0.003729  loss: 2.5310 (3.2130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (nan)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 2.7947 (3.2157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (nan)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [66] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 2.7947 (3.2440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (nan)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8352 (0.8352)  acc1: 80.4000 (80.4000)  acc5: 95.2000 (95.2000)  time: 5.3997  data: 5.3081  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9175 (0.9883)  acc1: 79.6000 (77.2727)  acc5: 95.6000 (94.8364)  time: 0.7064  data: 0.6112  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2519 (1.2197)  acc1: 70.4000 (72.7048)  acc5: 89.6000 (91.5429)  time: 0.1996  data: 0.1121  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3800 (1.2363)  acc1: 68.0000 (72.2880)  acc5: 89.6000 (91.5040)  time: 0.2058  data: 0.1213  max mem: 12911
Test: Total time: 0:00:10 (0.4003 s / it)
* Acc@1 72.592 Acc@5 91.438 loss 1.227
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.90%
Epoch: [67]  [   0/1251]  eta: 1:01:44  lr: 0.003728  min_lr: 0.003728  loss: 4.4857 (4.4857)  weight_decay: 0.0500 (0.0500)  time: 2.9616  data: 2.2053  max mem: 12911
Epoch: [67]  [ 200/1251]  eta: 0:03:35  lr: 0.003727  min_lr: 0.003727  loss: 2.7385 (3.1663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (0.8596)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [67]  [ 400/1251]  eta: 0:02:46  lr: 0.003725  min_lr: 0.003725  loss: 3.8034 (3.2046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8385 (0.8429)  time: 0.1849  data: 0.0003  max mem: 12911
Epoch: [67]  [ 600/1251]  eta: 0:02:06  lr: 0.003723  min_lr: 0.003723  loss: 3.0293 (3.2148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8375 (0.8504)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [67]  [ 800/1251]  eta: 0:01:26  lr: 0.003721  min_lr: 0.003721  loss: 2.6174 (3.2154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7457 (0.8475)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [67]  [1000/1251]  eta: 0:00:48  lr: 0.003719  min_lr: 0.003719  loss: 2.5620 (3.1961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8298 (0.8867)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [67]  [1200/1251]  eta: 0:00:09  lr: 0.003717  min_lr: 0.003717  loss: 2.6608 (3.2054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8310 (0.8768)  time: 0.1882  data: 0.0003  max mem: 12911
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 2.6319 (3.2056)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0169 (0.8902)  time: 0.1467  data: 0.0007  max mem: 12911
Epoch: [67] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 2.6319 (3.2082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0169 (0.8902)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8269 (0.8269)  acc1: 80.8000 (80.8000)  acc5: 95.2000 (95.2000)  time: 5.4592  data: 5.3631  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9351 (1.0064)  acc1: 78.8000 (77.7091)  acc5: 94.8000 (94.2545)  time: 0.7334  data: 0.6387  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3330 (1.2330)  acc1: 70.0000 (73.0286)  acc5: 90.4000 (91.2762)  time: 0.2125  data: 0.1254  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4019 (1.2459)  acc1: 68.8000 (72.6080)  acc5: 88.8000 (91.1680)  time: 0.2151  data: 0.1305  max mem: 12911
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 72.746 Acc@5 91.466 loss 1.230
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.90%
Epoch: [68]  [   0/1251]  eta: 1:01:51  lr: 0.003717  min_lr: 0.003717  loss: 2.3980 (2.3980)  weight_decay: 0.0500 (0.0500)  time: 2.9666  data: 2.7361  max mem: 12911
Epoch: [68]  [ 200/1251]  eta: 0:03:32  lr: 0.003715  min_lr: 0.003715  loss: 3.2629 (3.0724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.9141)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [68]  [ 400/1251]  eta: 0:02:46  lr: 0.003713  min_lr: 0.003713  loss: 2.8392 (3.1047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8423 (0.8333)  time: 0.1907  data: 0.0005  max mem: 12911
Epoch: [68]  [ 600/1251]  eta: 0:02:05  lr: 0.003711  min_lr: 0.003711  loss: 3.3846 (3.1608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (0.8500)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [68]  [ 800/1251]  eta: 0:01:26  lr: 0.003710  min_lr: 0.003710  loss: 3.5645 (3.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (0.8480)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [68]  [1000/1251]  eta: 0:00:47  lr: 0.003708  min_lr: 0.003708  loss: 2.6101 (3.2036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9687 (0.8704)  time: 0.1949  data: 0.0005  max mem: 12911
Epoch: [68]  [1200/1251]  eta: 0:00:09  lr: 0.003706  min_lr: 0.003706  loss: 2.9834 (3.2154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9550 (0.8652)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.1861 (3.2154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9514 (0.8668)  time: 0.1479  data: 0.0006  max mem: 12911
Epoch: [68] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.1861 (3.2124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9514 (0.8668)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8548 (0.8548)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 5.6347  data: 5.5431  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0722 (1.0714)  acc1: 78.4000 (77.2364)  acc5: 95.2000 (94.6545)  time: 0.7336  data: 0.6368  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3396 (1.2814)  acc1: 70.4000 (73.0286)  acc5: 90.4000 (91.6952)  time: 0.2093  data: 0.1209  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4692 (1.2947)  acc1: 69.6000 (72.4960)  acc5: 89.6000 (91.6480)  time: 0.2054  data: 0.1209  max mem: 12911
Test: Total time: 0:00:10 (0.4087 s / it)
* Acc@1 72.594 Acc@5 91.644 loss 1.288
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.90%
Epoch: [69]  [   0/1251]  eta: 1:04:51  lr: 0.003705  min_lr: 0.003705  loss: 2.3972 (2.3972)  weight_decay: 0.0500 (0.0500)  time: 3.1110  data: 2.2514  max mem: 12911
Epoch: [69]  [ 200/1251]  eta: 0:03:34  lr: 0.003703  min_lr: 0.003703  loss: 2.7238 (3.1817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.9566)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [69]  [ 400/1251]  eta: 0:02:47  lr: 0.003702  min_lr: 0.003702  loss: 2.6490 (3.2493)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0250 (0.9860)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [69]  [ 600/1251]  eta: 0:02:06  lr: 0.003700  min_lr: 0.003700  loss: 2.7387 (3.2169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7706 (0.9387)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [69]  [ 800/1251]  eta: 0:01:26  lr: 0.003698  min_lr: 0.003698  loss: 2.6817 (3.1877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5505 (0.9112)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [69]  [1000/1251]  eta: 0:00:47  lr: 0.003696  min_lr: 0.003696  loss: 2.6928 (3.1981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8542 (0.8998)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [69]  [1200/1251]  eta: 0:00:09  lr: 0.003694  min_lr: 0.003694  loss: 2.8114 (3.2116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9602 (0.9192)  time: 0.1857  data: 0.0004  max mem: 12911
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 2.6565 (3.2168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (0.9153)  time: 0.1474  data: 0.0006  max mem: 12911
Epoch: [69] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 2.6565 (3.2089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (0.9153)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9125 (0.9125)  acc1: 80.4000 (80.4000)  acc5: 95.6000 (95.6000)  time: 5.6058  data: 5.5141  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0501 (1.0665)  acc1: 78.8000 (77.9636)  acc5: 95.6000 (94.9091)  time: 0.7431  data: 0.6472  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3204 (1.2697)  acc1: 70.8000 (73.5238)  acc5: 90.4000 (91.9619)  time: 0.2181  data: 0.1301  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4625 (1.2884)  acc1: 69.2000 (72.9920)  acc5: 89.2000 (91.9520)  time: 0.2146  data: 0.1300  max mem: 12911
Test: Total time: 0:00:10 (0.4147 s / it)
* Acc@1 72.896 Acc@5 91.660 loss 1.287
Accuracy of the model on the 50000 test images: 72.9%
Max accuracy: 72.90%
Epoch: [70]  [   0/1251]  eta: 1:01:18  lr: 0.003694  min_lr: 0.003694  loss: 3.1290 (3.1290)  weight_decay: 0.0500 (0.0500)  time: 2.9403  data: 2.6411  max mem: 12911
Epoch: [70]  [ 200/1251]  eta: 0:03:34  lr: 0.003692  min_lr: 0.003692  loss: 3.1034 (3.2795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (0.8927)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [70]  [ 400/1251]  eta: 0:02:46  lr: 0.003690  min_lr: 0.003690  loss: 2.8430 (3.2448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6852 (0.8467)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [70]  [ 600/1251]  eta: 0:02:05  lr: 0.003688  min_lr: 0.003688  loss: 2.6400 (3.2533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.8640)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [70]  [ 800/1251]  eta: 0:01:26  lr: 0.003686  min_lr: 0.003686  loss: 2.6577 (3.2373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8473 (0.8928)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [70]  [1000/1251]  eta: 0:00:47  lr: 0.003684  min_lr: 0.003684  loss: 3.0088 (3.2275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7771 (0.8876)  time: 0.1832  data: 0.0004  max mem: 12911
Epoch: [70]  [1200/1251]  eta: 0:00:09  lr: 0.003682  min_lr: 0.003682  loss: 2.5970 (3.2106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8485 (0.8999)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 2.7449 (3.2113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8029 (0.8953)  time: 0.1484  data: 0.0009  max mem: 12911
Epoch: [70] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 2.7449 (3.2207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8029 (0.8953)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8660 (0.8660)  acc1: 81.2000 (81.2000)  acc5: 95.6000 (95.6000)  time: 5.5158  data: 5.4229  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9601 (1.0401)  acc1: 80.0000 (78.0364)  acc5: 95.2000 (94.6182)  time: 0.6963  data: 0.6009  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2911 (1.2603)  acc1: 69.6000 (73.3143)  acc5: 90.8000 (91.4476)  time: 0.1856  data: 0.0966  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3844 (1.2706)  acc1: 69.6000 (73.0720)  acc5: 89.2000 (91.5200)  time: 0.1981  data: 0.1121  max mem: 12911
Test: Total time: 0:00:09 (0.3973 s / it)
* Acc@1 72.748 Acc@5 91.448 loss 1.267
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.90%
Epoch: [71]  [   0/1251]  eta: 0:57:35  lr: 0.003681  min_lr: 0.003681  loss: 2.4518 (2.4518)  weight_decay: 0.0500 (0.0500)  time: 2.7619  data: 2.4188  max mem: 12911
Epoch: [71]  [ 200/1251]  eta: 0:03:34  lr: 0.003680  min_lr: 0.003680  loss: 2.4839 (3.1737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (0.7769)  time: 0.1844  data: 0.0005  max mem: 12911
Epoch: [71]  [ 400/1251]  eta: 0:02:47  lr: 0.003678  min_lr: 0.003678  loss: 2.7014 (3.2057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7206 (0.9230)  time: 0.1938  data: 0.0005  max mem: 12911
Epoch: [71]  [ 600/1251]  eta: 0:02:06  lr: 0.003676  min_lr: 0.003676  loss: 2.5978 (3.2143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.8731)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [71]  [ 800/1251]  eta: 0:01:26  lr: 0.003674  min_lr: 0.003674  loss: 2.8717 (3.1916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7186 (0.8441)  time: 0.1924  data: 0.0005  max mem: 12911
Epoch: [71]  [1000/1251]  eta: 0:00:48  lr: 0.003672  min_lr: 0.003672  loss: 2.4860 (3.2135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6492 (0.8413)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [71]  [1200/1251]  eta: 0:00:09  lr: 0.003670  min_lr: 0.003670  loss: 2.8090 (3.2033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8398 (0.8464)  time: 0.1913  data: 0.0005  max mem: 12911
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.0539 (3.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8093 (0.8418)  time: 0.1465  data: 0.0010  max mem: 12911
Epoch: [71] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.0539 (3.2033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8093 (0.8418)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9161 (0.9161)  acc1: 82.4000 (82.4000)  acc5: 95.2000 (95.2000)  time: 5.6000  data: 5.5070  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0244 (1.0685)  acc1: 78.0000 (77.6364)  acc5: 95.2000 (94.3273)  time: 0.7020  data: 0.6050  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3296 (1.2729)  acc1: 70.0000 (72.7238)  acc5: 91.2000 (91.9619)  time: 0.1828  data: 0.0944  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4384 (1.2856)  acc1: 69.6000 (72.4640)  acc5: 89.6000 (91.8720)  time: 0.1842  data: 0.0987  max mem: 12911
Test: Total time: 0:00:09 (0.3939 s / it)
* Acc@1 72.746 Acc@5 91.528 loss 1.278
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.90%
Epoch: [72]  [   0/1251]  eta: 1:10:07  lr: 0.003669  min_lr: 0.003669  loss: 2.4960 (2.4960)  weight_decay: 0.0500 (0.0500)  time: 3.3633  data: 3.1101  max mem: 12911
Epoch: [72]  [ 200/1251]  eta: 0:03:33  lr: 0.003667  min_lr: 0.003667  loss: 2.9052 (3.0912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9227 (0.9011)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [72]  [ 400/1251]  eta: 0:02:46  lr: 0.003665  min_lr: 0.003665  loss: 2.6463 (3.1262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9432 (0.9173)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [72]  [ 600/1251]  eta: 0:02:05  lr: 0.003663  min_lr: 0.003663  loss: 3.6354 (3.1614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (0.9053)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [72]  [ 800/1251]  eta: 0:01:26  lr: 0.003661  min_lr: 0.003661  loss: 3.7095 (3.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7540 (0.9003)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [72]  [1000/1251]  eta: 0:00:47  lr: 0.003659  min_lr: 0.003659  loss: 2.6926 (3.1771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1009 (0.9026)  time: 0.1913  data: 0.0005  max mem: 12911
Epoch: [72]  [1200/1251]  eta: 0:00:09  lr: 0.003657  min_lr: 0.003657  loss: 3.9632 (3.1777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.9226)  time: 0.1923  data: 0.0005  max mem: 12911
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 2.8484 (3.1738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.9188)  time: 0.1458  data: 0.0012  max mem: 12911
Epoch: [72] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 2.8484 (3.2018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.9188)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8194 (0.8194)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 5.4007  data: 5.3090  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9971 (1.0282)  acc1: 78.0000 (78.0364)  acc5: 96.0000 (95.0909)  time: 0.7519  data: 0.6548  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2746 (1.2360)  acc1: 71.6000 (73.0286)  acc5: 91.2000 (91.9048)  time: 0.2182  data: 0.1287  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4091 (1.2415)  acc1: 69.2000 (72.8160)  acc5: 88.8000 (91.7760)  time: 0.2146  data: 0.1287  max mem: 12911
Test: Total time: 0:00:10 (0.4064 s / it)
* Acc@1 72.914 Acc@5 91.594 loss 1.236
Accuracy of the model on the 50000 test images: 72.9%
Max accuracy: 72.91%
Epoch: [73]  [   0/1251]  eta: 0:54:30  lr: 0.003657  min_lr: 0.003657  loss: 3.1212 (3.1212)  weight_decay: 0.0500 (0.0500)  time: 2.6143  data: 1.8741  max mem: 12911
Epoch: [73]  [ 200/1251]  eta: 0:03:33  lr: 0.003655  min_lr: 0.003655  loss: 2.6072 (3.1737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.8504)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [73]  [ 400/1251]  eta: 0:02:46  lr: 0.003653  min_lr: 0.003653  loss: 2.5093 (3.1934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.8691)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [73]  [ 600/1251]  eta: 0:02:05  lr: 0.003651  min_lr: 0.003651  loss: 2.8266 (3.2064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.8309)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [73]  [ 800/1251]  eta: 0:01:26  lr: 0.003649  min_lr: 0.003649  loss: 2.5485 (3.1911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7810 (0.8593)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [73]  [1000/1251]  eta: 0:00:47  lr: 0.003647  min_lr: 0.003647  loss: 2.6565 (3.1737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5983 (0.8379)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [73]  [1200/1251]  eta: 0:00:09  lr: 0.003645  min_lr: 0.003645  loss: 2.4641 (3.1788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6110 (nan)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 3.5442 (3.1875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6115 (nan)  time: 0.1461  data: 0.0008  max mem: 12911
Epoch: [73] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 3.5442 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6115 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9023 (0.9023)  acc1: 79.2000 (79.2000)  acc5: 96.0000 (96.0000)  time: 5.6438  data: 5.5521  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9023 (1.0295)  acc1: 79.2000 (76.5091)  acc5: 95.2000 (94.6909)  time: 0.7318  data: 0.6378  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3307 (1.2408)  acc1: 70.4000 (72.7619)  acc5: 89.6000 (91.9238)  time: 0.2048  data: 0.1179  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3881 (1.2591)  acc1: 70.4000 (72.2240)  acc5: 89.2000 (91.6960)  time: 0.2014  data: 0.1178  max mem: 12911
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 72.624 Acc@5 91.548 loss 1.249
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.91%
Epoch: [74]  [   0/1251]  eta: 1:03:52  lr: 0.003644  min_lr: 0.003644  loss: 5.0482 (5.0482)  weight_decay: 0.0500 (0.0500)  time: 3.0633  data: 1.5645  max mem: 12911
Epoch: [74]  [ 200/1251]  eta: 0:03:38  lr: 0.003642  min_lr: 0.003642  loss: 2.6066 (3.1917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9723 (0.8450)  time: 0.1985  data: 0.0004  max mem: 12911
Epoch: [74]  [ 400/1251]  eta: 0:02:49  lr: 0.003640  min_lr: 0.003640  loss: 2.9183 (3.1679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (0.8767)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [74]  [ 600/1251]  eta: 0:02:07  lr: 0.003638  min_lr: 0.003638  loss: 2.6323 (3.1392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8211 (0.8816)  time: 0.1899  data: 0.0011  max mem: 12911
Epoch: [74]  [ 800/1251]  eta: 0:01:27  lr: 0.003636  min_lr: 0.003636  loss: 2.9583 (3.1571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7500 (0.8813)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [74]  [1000/1251]  eta: 0:00:48  lr: 0.003634  min_lr: 0.003634  loss: 3.1148 (3.1543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (0.9120)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [74]  [1200/1251]  eta: 0:00:09  lr: 0.003632  min_lr: 0.003632  loss: 2.8166 (3.1677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7746 (0.8962)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 2.6604 (3.1668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (0.8898)  time: 0.1460  data: 0.0011  max mem: 12911
Epoch: [74] Total time: 0:04:00 (0.1921 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 2.6604 (3.1868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (0.8898)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8399 (0.8399)  acc1: 82.4000 (82.4000)  acc5: 96.0000 (96.0000)  time: 5.5863  data: 5.4945  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9665 (0.9809)  acc1: 78.8000 (77.5273)  acc5: 95.6000 (94.7636)  time: 0.6807  data: 0.5848  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2352 (1.1969)  acc1: 70.0000 (73.1429)  acc5: 91.6000 (91.8095)  time: 0.1980  data: 0.1095  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3634 (1.2060)  acc1: 70.0000 (72.9600)  acc5: 89.2000 (91.5200)  time: 0.2080  data: 0.1228  max mem: 12911
Test: Total time: 0:00:10 (0.4099 s / it)
* Acc@1 73.230 Acc@5 91.872 loss 1.202
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.23%
Epoch: [75]  [   0/1251]  eta: 1:05:30  lr: 0.003631  min_lr: 0.003631  loss: 3.0103 (3.0103)  weight_decay: 0.0500 (0.0500)  time: 3.1420  data: 2.8831  max mem: 12911
Epoch: [75]  [ 200/1251]  eta: 0:03:30  lr: 0.003629  min_lr: 0.003629  loss: 2.5507 (3.1555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0021 (0.8990)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [75]  [ 400/1251]  eta: 0:02:45  lr: 0.003627  min_lr: 0.003627  loss: 2.8919 (3.1660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7489 (0.8242)  time: 0.1898  data: 0.0006  max mem: 12911
Epoch: [75]  [ 600/1251]  eta: 0:02:05  lr: 0.003625  min_lr: 0.003625  loss: 2.9048 (3.1743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7570 (0.8453)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [75]  [ 800/1251]  eta: 0:01:26  lr: 0.003623  min_lr: 0.003623  loss: 2.5630 (3.1756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6528 (0.8465)  time: 0.1904  data: 0.0006  max mem: 12911
Epoch: [75]  [1000/1251]  eta: 0:00:47  lr: 0.003621  min_lr: 0.003621  loss: 3.4478 (3.1783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8077 (0.8630)  time: 0.1893  data: 0.0004  max mem: 12911
Epoch: [75]  [1200/1251]  eta: 0:00:09  lr: 0.003619  min_lr: 0.003619  loss: 2.6818 (3.1865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.8681)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 2.9850 (3.1861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.8629)  time: 0.1473  data: 0.0008  max mem: 12911
Epoch: [75] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 2.9850 (3.1948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.8629)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8921 (0.8921)  acc1: 79.6000 (79.6000)  acc5: 97.6000 (97.6000)  time: 5.3598  data: 5.2681  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.9677 (1.0230)  acc1: 77.6000 (77.4545)  acc5: 94.8000 (94.4727)  time: 0.6446  data: 0.5508  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2604 (1.2224)  acc1: 71.6000 (73.3905)  acc5: 91.6000 (91.5048)  time: 0.1707  data: 0.0839  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3844 (1.2384)  acc1: 70.8000 (72.8320)  acc5: 89.2000 (91.4080)  time: 0.2023  data: 0.1176  max mem: 12911
Test: Total time: 0:00:09 (0.3950 s / it)
* Acc@1 73.178 Acc@5 91.638 loss 1.239
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.23%
Epoch: [76]  [   0/1251]  eta: 1:04:20  lr: 0.003618  min_lr: 0.003618  loss: 3.7694 (3.7694)  weight_decay: 0.0500 (0.0500)  time: 3.0863  data: 2.3484  max mem: 12911
Epoch: [76]  [ 200/1251]  eta: 0:03:36  lr: 0.003616  min_lr: 0.003616  loss: 3.5497 (3.1916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.7668)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [76]  [ 400/1251]  eta: 0:02:47  lr: 0.003614  min_lr: 0.003614  loss: 2.6333 (3.1770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8164 (0.8288)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [76]  [ 600/1251]  eta: 0:02:06  lr: 0.003612  min_lr: 0.003612  loss: 2.8055 (3.1628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.8329)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [76]  [ 800/1251]  eta: 0:01:26  lr: 0.003610  min_lr: 0.003610  loss: 2.7684 (3.2029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8526 (0.8375)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [76]  [1000/1251]  eta: 0:00:48  lr: 0.003607  min_lr: 0.003607  loss: 2.6017 (3.2059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7813 (0.8623)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [76]  [1200/1251]  eta: 0:00:09  lr: 0.003605  min_lr: 0.003605  loss: 2.8672 (3.2038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7802 (0.8514)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 2.6310 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (0.8462)  time: 0.1476  data: 0.0008  max mem: 12911
Epoch: [76] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 2.6310 (3.2118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (0.8462)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8689 (0.8689)  acc1: 78.8000 (78.8000)  acc5: 95.2000 (95.2000)  time: 5.3793  data: 5.2648  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9990 (1.0128)  acc1: 77.6000 (76.9091)  acc5: 94.8000 (94.3273)  time: 0.7045  data: 0.6113  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2786 (1.2311)  acc1: 70.4000 (72.6857)  acc5: 91.2000 (91.6000)  time: 0.2019  data: 0.1164  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3821 (1.2422)  acc1: 69.2000 (72.5760)  acc5: 89.6000 (91.2800)  time: 0.2087  data: 0.1239  max mem: 12911
Test: Total time: 0:00:09 (0.3985 s / it)
* Acc@1 72.834 Acc@5 91.634 loss 1.235
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 73.23%
Epoch: [77]  [   0/1251]  eta: 1:04:58  lr: 0.003605  min_lr: 0.003605  loss: 3.0166 (3.0166)  weight_decay: 0.0500 (0.0500)  time: 3.1162  data: 2.6211  max mem: 12911
Epoch: [77]  [ 200/1251]  eta: 0:03:35  lr: 0.003603  min_lr: 0.003603  loss: 3.2494 (3.2250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (0.9547)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [77]  [ 400/1251]  eta: 0:02:46  lr: 0.003601  min_lr: 0.003601  loss: 2.7608 (3.1731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6722 (0.8458)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [77]  [ 600/1251]  eta: 0:02:05  lr: 0.003598  min_lr: 0.003598  loss: 2.6461 (3.1871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9354 (0.8910)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [77]  [ 800/1251]  eta: 0:01:26  lr: 0.003596  min_lr: 0.003596  loss: 2.7012 (3.1842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6549 (0.8473)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [77]  [1000/1251]  eta: 0:00:47  lr: 0.003594  min_lr: 0.003594  loss: 2.6516 (3.1692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6828 (0.8531)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [77]  [1200/1251]  eta: 0:00:09  lr: 0.003592  min_lr: 0.003592  loss: 2.9703 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6939 (0.8482)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 2.8569 (3.1872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6167 (0.8413)  time: 0.1463  data: 0.0009  max mem: 12911
Epoch: [77] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 2.8569 (3.2017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6167 (0.8413)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8723 (0.8723)  acc1: 80.8000 (80.8000)  acc5: 96.4000 (96.4000)  time: 5.4791  data: 5.3864  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8843 (1.0092)  acc1: 79.2000 (76.9455)  acc5: 95.2000 (94.5455)  time: 0.7381  data: 0.6458  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2331 (1.2097)  acc1: 68.8000 (72.8381)  acc5: 92.0000 (91.7143)  time: 0.2090  data: 0.1197  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3467 (1.2237)  acc1: 68.8000 (72.3360)  acc5: 89.6000 (91.5680)  time: 0.2077  data: 0.1196  max mem: 12911
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 73.162 Acc@5 91.776 loss 1.219
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.23%
Epoch: [78]  [   0/1251]  eta: 1:01:18  lr: 0.003591  min_lr: 0.003591  loss: 2.4100 (2.4100)  weight_decay: 0.0500 (0.0500)  time: 2.9406  data: 2.2505  max mem: 12911
Epoch: [78]  [ 200/1251]  eta: 0:03:33  lr: 0.003589  min_lr: 0.003589  loss: 2.6129 (3.2543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7605 (0.7816)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [78]  [ 400/1251]  eta: 0:02:46  lr: 0.003587  min_lr: 0.003587  loss: 2.8143 (3.2506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7241 (0.7899)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [78]  [ 600/1251]  eta: 0:02:05  lr: 0.003585  min_lr: 0.003585  loss: 3.7195 (3.2398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.8295)  time: 0.1868  data: 0.0006  max mem: 12911
Epoch: [78]  [ 800/1251]  eta: 0:01:26  lr: 0.003583  min_lr: 0.003583  loss: 3.2592 (3.2161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6655 (0.8116)  time: 0.1858  data: 0.0005  max mem: 12911
Epoch: [78]  [1000/1251]  eta: 0:00:47  lr: 0.003580  min_lr: 0.003580  loss: 3.8750 (3.2049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7885 (0.8292)  time: 0.1843  data: 0.0005  max mem: 12911
Epoch: [78]  [1200/1251]  eta: 0:00:09  lr: 0.003578  min_lr: 0.003578  loss: 3.2553 (3.2049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.8473)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 2.6067 (3.2054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (0.8489)  time: 0.1466  data: 0.0007  max mem: 12911
Epoch: [78] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 2.6067 (3.1914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (0.8489)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8582 (0.8582)  acc1: 81.2000 (81.2000)  acc5: 95.6000 (95.6000)  time: 5.6748  data: 5.5804  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9365 (1.0121)  acc1: 78.8000 (77.8909)  acc5: 94.8000 (94.1818)  time: 0.7551  data: 0.6579  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2631 (1.2155)  acc1: 70.8000 (73.4286)  acc5: 91.6000 (91.5810)  time: 0.2109  data: 0.1224  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3706 (1.2287)  acc1: 69.2000 (72.9280)  acc5: 89.6000 (91.4720)  time: 0.2068  data: 0.1223  max mem: 12911
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 73.100 Acc@5 91.766 loss 1.227
Accuracy of the model on the 50000 test images: 73.1%
Max accuracy: 73.23%
Epoch: [79]  [   0/1251]  eta: 1:08:35  lr: 0.003578  min_lr: 0.003578  loss: 4.1261 (4.1261)  weight_decay: 0.0500 (0.0500)  time: 3.2899  data: 2.1820  max mem: 12911
Epoch: [79]  [ 200/1251]  eta: 0:03:34  lr: 0.003575  min_lr: 0.003575  loss: 2.6476 (3.1329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7496 (0.7246)  time: 0.1897  data: 0.0004  max mem: 12911
Epoch: [79]  [ 400/1251]  eta: 0:02:47  lr: 0.003573  min_lr: 0.003573  loss: 2.7958 (3.1762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0479 (0.8363)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [79]  [ 600/1251]  eta: 0:02:06  lr: 0.003571  min_lr: 0.003571  loss: 2.8357 (3.1473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7035 (0.8057)  time: 0.1899  data: 0.0005  max mem: 12911
Epoch: [79]  [ 800/1251]  eta: 0:01:27  lr: 0.003569  min_lr: 0.003569  loss: 3.0881 (3.1629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5561 (0.7943)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [79]  [1000/1251]  eta: 0:00:48  lr: 0.003567  min_lr: 0.003567  loss: 2.9684 (3.1893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8975 (0.8036)  time: 0.1910  data: 0.0005  max mem: 12911
Epoch: [79]  [1200/1251]  eta: 0:00:09  lr: 0.003564  min_lr: 0.003564  loss: 2.6328 (3.2010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7801 (0.7990)  time: 0.1869  data: 0.0006  max mem: 12911
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 2.6445 (3.1969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.7960)  time: 0.1472  data: 0.0007  max mem: 12911
Epoch: [79] Total time: 0:04:00 (0.1923 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 2.6445 (3.1838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.7960)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.8369 (0.8369)  acc1: 84.4000 (84.4000)  acc5: 95.2000 (95.2000)  time: 5.9235  data: 5.8317  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9619 (1.0251)  acc1: 80.4000 (77.6000)  acc5: 95.2000 (94.6182)  time: 0.7580  data: 0.6732  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3297 (1.2366)  acc1: 70.4000 (73.3524)  acc5: 91.2000 (91.8667)  time: 0.1914  data: 0.1087  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4025 (1.2511)  acc1: 70.0000 (72.9120)  acc5: 90.0000 (91.6000)  time: 0.1903  data: 0.1086  max mem: 12911
Test: Total time: 0:00:10 (0.4079 s / it)
* Acc@1 73.320 Acc@5 91.666 loss 1.236
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.32%
Epoch: [80]  [   0/1251]  eta: 0:57:58  lr: 0.003564  min_lr: 0.003564  loss: 2.2291 (2.2291)  weight_decay: 0.0500 (0.0500)  time: 2.7808  data: 2.5271  max mem: 12911
Epoch: [80]  [ 200/1251]  eta: 0:03:33  lr: 0.003562  min_lr: 0.003562  loss: 2.6468 (3.2077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8285 (0.8349)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [80]  [ 400/1251]  eta: 0:02:45  lr: 0.003559  min_lr: 0.003559  loss: 2.8989 (3.1807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7088 (0.8447)  time: 0.1863  data: 0.0010  max mem: 12911
Epoch: [80]  [ 600/1251]  eta: 0:02:05  lr: 0.003557  min_lr: 0.003557  loss: 2.8892 (3.1686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8713 (0.8313)  time: 0.1904  data: 0.0004  max mem: 12911
Epoch: [80]  [ 800/1251]  eta: 0:01:26  lr: 0.003555  min_lr: 0.003555  loss: 3.6324 (3.1816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6592 (0.8165)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [80]  [1000/1251]  eta: 0:00:47  lr: 0.003553  min_lr: 0.003553  loss: 2.6670 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8582 (0.8195)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [80]  [1200/1251]  eta: 0:00:09  lr: 0.003550  min_lr: 0.003550  loss: 3.0252 (3.1933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6504 (0.8135)  time: 0.1923  data: 0.0005  max mem: 12911
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 2.8041 (3.1899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8128)  time: 0.1478  data: 0.0012  max mem: 12911
Epoch: [80] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 2.8041 (3.1828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8128)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7801 (0.7801)  acc1: 84.4000 (84.4000)  acc5: 95.2000 (95.2000)  time: 5.3517  data: 5.2465  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8841 (0.9738)  acc1: 78.8000 (77.7455)  acc5: 95.2000 (94.3273)  time: 0.7276  data: 0.6319  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2596 (1.1716)  acc1: 70.4000 (73.9429)  acc5: 90.4000 (91.8476)  time: 0.2091  data: 0.1219  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3220 (1.1909)  acc1: 69.6000 (73.4720)  acc5: 89.6000 (91.7280)  time: 0.2216  data: 0.1380  max mem: 12911
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 73.524 Acc@5 92.028 loss 1.187
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.52%
Epoch: [81]  [   0/1251]  eta: 1:00:49  lr: 0.003550  min_lr: 0.003550  loss: 2.5391 (2.5391)  weight_decay: 0.0500 (0.0500)  time: 2.9177  data: 2.6706  max mem: 12911
Epoch: [81]  [ 200/1251]  eta: 0:03:30  lr: 0.003547  min_lr: 0.003547  loss: 2.6435 (3.2479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.7445)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [81]  [ 400/1251]  eta: 0:02:45  lr: 0.003545  min_lr: 0.003545  loss: 2.9661 (3.1921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7674 (0.8063)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [81]  [ 600/1251]  eta: 0:02:05  lr: 0.003543  min_lr: 0.003543  loss: 3.0031 (3.1790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6779 (0.8090)  time: 0.1872  data: 0.0006  max mem: 12911
Epoch: [81]  [ 800/1251]  eta: 0:01:26  lr: 0.003541  min_lr: 0.003541  loss: 2.7218 (3.1728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7024 (0.8078)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [81]  [1000/1251]  eta: 0:00:47  lr: 0.003538  min_lr: 0.003538  loss: 2.8041 (3.1733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.8182)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [81]  [1200/1251]  eta: 0:00:09  lr: 0.003536  min_lr: 0.003536  loss: 2.6366 (3.1761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6248 (0.8069)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 2.6415 (3.1766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6639 (0.8079)  time: 0.1459  data: 0.0012  max mem: 12911
Epoch: [81] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 2.6415 (3.1704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6639 (0.8079)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.8981 (0.8981)  acc1: 82.8000 (82.8000)  acc5: 95.6000 (95.6000)  time: 5.3182  data: 5.2263  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 1.0373 (1.0594)  acc1: 79.2000 (78.5091)  acc5: 95.6000 (94.3636)  time: 0.6460  data: 0.5527  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2600 (1.2659)  acc1: 72.0000 (73.6000)  acc5: 90.4000 (91.9048)  time: 0.1731  data: 0.0838  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4173 (1.2763)  acc1: 72.0000 (73.3920)  acc5: 90.0000 (91.8400)  time: 0.2087  data: 0.1213  max mem: 12911
Test: Total time: 0:00:09 (0.3986 s / it)
* Acc@1 72.858 Acc@5 91.724 loss 1.270
Accuracy of the model on the 50000 test images: 72.9%
Max accuracy: 73.52%
Epoch: [82]  [   0/1251]  eta: 1:03:31  lr: 0.003535  min_lr: 0.003535  loss: 2.2540 (2.2540)  weight_decay: 0.0500 (0.0500)  time: 3.0472  data: 2.5102  max mem: 12911
Epoch: [82]  [ 200/1251]  eta: 0:03:32  lr: 0.003533  min_lr: 0.003533  loss: 3.3234 (3.1048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6686 (0.8872)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [82]  [ 400/1251]  eta: 0:02:45  lr: 0.003531  min_lr: 0.003531  loss: 2.6927 (3.1942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6165 (0.8235)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [82]  [ 600/1251]  eta: 0:02:04  lr: 0.003528  min_lr: 0.003528  loss: 3.3166 (3.2021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8467 (0.8204)  time: 0.1836  data: 0.0005  max mem: 12911
Epoch: [82]  [ 800/1251]  eta: 0:01:25  lr: 0.003526  min_lr: 0.003526  loss: 2.4593 (3.2087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.8124)  time: 0.1910  data: 0.0005  max mem: 12911
Epoch: [82]  [1000/1251]  eta: 0:00:47  lr: 0.003524  min_lr: 0.003524  loss: 2.9174 (3.1881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6553 (0.8213)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [82]  [1200/1251]  eta: 0:00:09  lr: 0.003521  min_lr: 0.003521  loss: 2.5997 (3.1757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5752 (0.7998)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 3.5435 (3.1830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7593 (0.7998)  time: 0.1466  data: 0.0009  max mem: 12911
Epoch: [82] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 3.5435 (3.1885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7593 (0.7998)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.8840 (0.8840)  acc1: 82.0000 (82.0000)  acc5: 95.2000 (95.2000)  time: 5.7684  data: 5.6767  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9364 (1.0083)  acc1: 79.2000 (78.2182)  acc5: 95.2000 (94.7273)  time: 0.7549  data: 0.6612  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2742 (1.2283)  acc1: 71.2000 (73.6000)  acc5: 91.2000 (91.7905)  time: 0.2244  data: 0.1377  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.4111 (1.2500)  acc1: 70.0000 (73.1520)  acc5: 89.2000 (91.6800)  time: 0.2221  data: 0.1376  max mem: 12911
Test: Total time: 0:00:10 (0.4265 s / it)
* Acc@1 73.480 Acc@5 91.904 loss 1.237
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.52%
Epoch: [83]  [   0/1251]  eta: 1:03:00  lr: 0.003521  min_lr: 0.003521  loss: 3.6225 (3.6225)  weight_decay: 0.0500 (0.0500)  time: 3.0219  data: 1.9372  max mem: 12911
Epoch: [83]  [ 200/1251]  eta: 0:03:34  lr: 0.003519  min_lr: 0.003519  loss: 2.6064 (3.1229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8386 (0.8076)  time: 0.1887  data: 0.0006  max mem: 12911
Epoch: [83]  [ 400/1251]  eta: 0:02:47  lr: 0.003516  min_lr: 0.003516  loss: 2.5542 (3.1236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.7831)  time: 0.1921  data: 0.0005  max mem: 12911
Epoch: [83]  [ 600/1251]  eta: 0:02:06  lr: 0.003514  min_lr: 0.003514  loss: 3.0695 (3.1210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6401 (0.7832)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [83]  [ 800/1251]  eta: 0:01:26  lr: 0.003512  min_lr: 0.003512  loss: 2.5937 (3.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6794 (0.7691)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [83]  [1000/1251]  eta: 0:00:48  lr: 0.003509  min_lr: 0.003509  loss: 2.9174 (3.1239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6355 (0.7676)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [83]  [1200/1251]  eta: 0:00:09  lr: 0.003507  min_lr: 0.003507  loss: 2.6678 (3.1074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6668 (0.7580)  time: 0.1851  data: 0.0006  max mem: 12911
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 3.0612 (3.1095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6299 (0.7534)  time: 0.1457  data: 0.0008  max mem: 12911
Epoch: [83] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 3.0612 (3.1681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6299 (0.7534)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8324 (0.8324)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.6220  data: 5.5304  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9182 (0.9955)  acc1: 79.2000 (77.8909)  acc5: 95.6000 (95.0545)  time: 0.7377  data: 0.6413  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2767 (1.1805)  acc1: 70.8000 (73.6571)  acc5: 91.2000 (92.5333)  time: 0.2065  data: 0.1183  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3266 (1.1991)  acc1: 69.2000 (73.2480)  acc5: 90.8000 (92.2720)  time: 0.2028  data: 0.1182  max mem: 12911
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 73.486 Acc@5 92.034 loss 1.198
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.52%
Epoch: [84]  [   0/1251]  eta: 0:55:43  lr: 0.003506  min_lr: 0.003506  loss: 4.3682 (4.3682)  weight_decay: 0.0500 (0.0500)  time: 2.6727  data: 2.1206  max mem: 12911
Epoch: [84]  [ 200/1251]  eta: 0:03:37  lr: 0.003504  min_lr: 0.003504  loss: 2.7105 (3.1424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6506 (0.7516)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [84]  [ 400/1251]  eta: 0:02:49  lr: 0.003502  min_lr: 0.003502  loss: 2.8895 (3.1367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (0.8617)  time: 0.1918  data: 0.0004  max mem: 12911
Epoch: [84]  [ 600/1251]  eta: 0:02:08  lr: 0.003499  min_lr: 0.003499  loss: 2.7927 (3.1524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6979 (0.8337)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [84]  [ 800/1251]  eta: 0:01:28  lr: 0.003497  min_lr: 0.003497  loss: 3.0693 (3.1546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.8159)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [84]  [1000/1251]  eta: 0:00:48  lr: 0.003494  min_lr: 0.003494  loss: 3.1073 (3.1693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7717 (0.8076)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [84]  [1200/1251]  eta: 0:00:09  lr: 0.003492  min_lr: 0.003492  loss: 3.4135 (3.1763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7140 (0.8246)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 2.5516 (3.1709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (0.8248)  time: 0.1463  data: 0.0011  max mem: 12911
Epoch: [84] Total time: 0:04:01 (0.1931 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 2.5516 (3.1765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (0.8248)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7350 (0.7350)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 5.6527  data: 5.5611  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8491 (0.9406)  acc1: 81.2000 (78.2182)  acc5: 96.0000 (95.2364)  time: 0.7410  data: 0.6448  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2345 (1.1625)  acc1: 70.4000 (73.6381)  acc5: 91.6000 (92.3619)  time: 0.1928  data: 0.1047  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3644 (1.1797)  acc1: 70.0000 (73.2800)  acc5: 91.2000 (92.1760)  time: 0.1939  data: 0.1093  max mem: 12911
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 73.606 Acc@5 92.026 loss 1.175
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.61%
Epoch: [85]  [   0/1251]  eta: 0:59:24  lr: 0.003491  min_lr: 0.003491  loss: 2.2272 (2.2272)  weight_decay: 0.0500 (0.0500)  time: 2.8493  data: 2.5868  max mem: 12911
Epoch: [85]  [ 200/1251]  eta: 0:03:32  lr: 0.003489  min_lr: 0.003489  loss: 2.6366 (3.1270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7206 (0.7320)  time: 0.1838  data: 0.0005  max mem: 12911
Epoch: [85]  [ 400/1251]  eta: 0:02:45  lr: 0.003487  min_lr: 0.003487  loss: 2.4460 (3.1194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7912 (0.7648)  time: 0.1831  data: 0.0004  max mem: 12911
Epoch: [85]  [ 600/1251]  eta: 0:02:04  lr: 0.003484  min_lr: 0.003484  loss: 2.8096 (3.1526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1463 (0.8276)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [85]  [ 800/1251]  eta: 0:01:25  lr: 0.003482  min_lr: 0.003482  loss: 2.5906 (3.1586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7581 (0.8424)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [85]  [1000/1251]  eta: 0:00:47  lr: 0.003479  min_lr: 0.003479  loss: 2.5587 (3.1640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5820 (0.8252)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [85]  [1200/1251]  eta: 0:00:09  lr: 0.003477  min_lr: 0.003477  loss: 2.6552 (3.1740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8109 (0.8157)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 2.6810 (3.1729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.8146)  time: 0.1459  data: 0.0009  max mem: 12911
Epoch: [85] Total time: 0:03:56 (0.1890 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 2.6810 (3.1832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.8146)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7343 (0.7343)  acc1: 84.0000 (84.0000)  acc5: 96.0000 (96.0000)  time: 5.6934  data: 5.6018  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8751 (0.9323)  acc1: 77.6000 (77.9273)  acc5: 96.0000 (95.1636)  time: 0.7493  data: 0.6648  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2059 (1.1460)  acc1: 71.2000 (73.9429)  acc5: 90.8000 (92.3429)  time: 0.2033  data: 0.1207  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2871 (1.1602)  acc1: 70.8000 (73.5040)  acc5: 90.0000 (92.1440)  time: 0.2014  data: 0.1206  max mem: 12911
Test: Total time: 0:00:10 (0.4057 s / it)
* Acc@1 73.548 Acc@5 92.018 loss 1.166
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.61%
Epoch: [86]  [   0/1251]  eta: 1:00:56  lr: 0.003476  min_lr: 0.003476  loss: 2.1518 (2.1518)  weight_decay: 0.0500 (0.0500)  time: 2.9228  data: 1.6786  max mem: 12911
Epoch: [86]  [ 200/1251]  eta: 0:03:35  lr: 0.003474  min_lr: 0.003474  loss: 2.5345 (3.1593)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [86]  [ 400/1251]  eta: 0:02:46  lr: 0.003472  min_lr: 0.003472  loss: 2.5782 (3.1518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6300 (nan)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [86]  [ 600/1251]  eta: 0:02:06  lr: 0.003469  min_lr: 0.003469  loss: 2.7248 (3.1549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6515 (nan)  time: 0.1913  data: 0.0006  max mem: 12911
Epoch: [86]  [ 800/1251]  eta: 0:01:26  lr: 0.003467  min_lr: 0.003467  loss: 2.9022 (3.1534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (nan)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [86]  [1000/1251]  eta: 0:00:48  lr: 0.003464  min_lr: 0.003464  loss: 2.5413 (3.1521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (nan)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [86]  [1200/1251]  eta: 0:00:09  lr: 0.003462  min_lr: 0.003462  loss: 2.6322 (3.1681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (nan)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 2.9704 (3.1659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (nan)  time: 0.1457  data: 0.0011  max mem: 12911
Epoch: [86] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 2.9704 (3.1693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (nan)
Test:  [ 0/25]  eta: 0:01:20  loss: 0.7716 (0.7716)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 3.2350  data: 3.1434  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.9205 (0.9892)  acc1: 80.8000 (78.7636)  acc5: 95.2000 (95.0182)  time: 0.6154  data: 0.5218  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2426 (1.1992)  acc1: 72.0000 (74.1143)  acc5: 90.4000 (92.3048)  time: 0.2700  data: 0.1827  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3592 (1.2202)  acc1: 70.0000 (73.5040)  acc5: 90.4000 (92.0320)  time: 0.1949  data: 0.1097  max mem: 12911
Test: Total time: 0:00:09 (0.3802 s / it)
* Acc@1 73.454 Acc@5 91.954 loss 1.216
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.61%
Epoch: [87]  [   0/1251]  eta: 1:03:50  lr: 0.003461  min_lr: 0.003461  loss: 2.1978 (2.1978)  weight_decay: 0.0500 (0.0500)  time: 3.0621  data: 1.5989  max mem: 12911
Epoch: [87]  [ 200/1251]  eta: 0:03:33  lr: 0.003459  min_lr: 0.003459  loss: 2.9622 (3.0731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6367 (0.6970)  time: 0.1836  data: 0.0005  max mem: 12911
Epoch: [87]  [ 400/1251]  eta: 0:02:46  lr: 0.003456  min_lr: 0.003456  loss: 2.4806 (3.0891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.7684)  time: 0.1928  data: 0.0005  max mem: 12911
Epoch: [87]  [ 600/1251]  eta: 0:02:05  lr: 0.003454  min_lr: 0.003454  loss: 2.7455 (3.1042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6444 (0.7596)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [87]  [ 800/1251]  eta: 0:01:26  lr: 0.003451  min_lr: 0.003451  loss: 3.1396 (3.1426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7328 (0.7594)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [87]  [1000/1251]  eta: 0:00:48  lr: 0.003449  min_lr: 0.003449  loss: 2.8479 (3.1162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5877 (0.7559)  time: 0.1918  data: 0.0004  max mem: 12911
Epoch: [87]  [1200/1251]  eta: 0:00:09  lr: 0.003446  min_lr: 0.003446  loss: 2.7119 (3.1250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7572)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 2.6850 (3.1292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (0.7573)  time: 0.1459  data: 0.0008  max mem: 12911
Epoch: [87] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 2.6850 (3.1498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (0.7573)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8740 (0.8740)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.4239  data: 5.3324  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9808 (1.0503)  acc1: 79.6000 (77.7455)  acc5: 94.8000 (94.5091)  time: 0.7115  data: 0.6170  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.3168 (1.2335)  acc1: 70.8000 (73.2191)  acc5: 90.4000 (92.0191)  time: 0.2020  data: 0.1142  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3704 (1.2464)  acc1: 70.4000 (72.8000)  acc5: 90.0000 (91.9360)  time: 0.2143  data: 0.1291  max mem: 12911
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 73.202 Acc@5 91.930 loss 1.246
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.61%
Epoch: [88]  [   0/1251]  eta: 1:03:20  lr: 0.003446  min_lr: 0.003446  loss: 2.3138 (2.3138)  weight_decay: 0.0500 (0.0500)  time: 3.0382  data: 2.3536  max mem: 12911
Epoch: [88]  [ 200/1251]  eta: 0:03:35  lr: 0.003443  min_lr: 0.003443  loss: 2.6598 (3.1932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.8008)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [88]  [ 400/1251]  eta: 0:02:46  lr: 0.003441  min_lr: 0.003441  loss: 2.7121 (3.1746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6213 (0.7555)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [88]  [ 600/1251]  eta: 0:02:05  lr: 0.003438  min_lr: 0.003438  loss: 2.6189 (3.1651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6305 (0.7582)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [88]  [ 800/1251]  eta: 0:01:26  lr: 0.003436  min_lr: 0.003436  loss: 2.6447 (3.1453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.7645)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [88]  [1000/1251]  eta: 0:00:47  lr: 0.003433  min_lr: 0.003433  loss: 3.5497 (3.1717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8960 (0.8096)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [88]  [1200/1251]  eta: 0:00:09  lr: 0.003431  min_lr: 0.003431  loss: 2.5026 (3.1839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7685 (0.7958)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 2.8150 (3.1807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.7976)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [88] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 2.8150 (3.1599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.7976)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8995 (0.8995)  acc1: 82.8000 (82.8000)  acc5: 96.8000 (96.8000)  time: 5.8143  data: 5.7189  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0063 (1.0392)  acc1: 79.6000 (77.8909)  acc5: 95.6000 (95.0909)  time: 0.7331  data: 0.6357  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2309 (1.2502)  acc1: 70.0000 (73.3905)  acc5: 91.2000 (92.1333)  time: 0.1823  data: 0.0938  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3671 (1.2670)  acc1: 69.6000 (73.0400)  acc5: 90.0000 (92.0480)  time: 0.1898  data: 0.1052  max mem: 12911
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 73.312 Acc@5 91.924 loss 1.264
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.61%
Epoch: [89]  [   0/1251]  eta: 1:01:38  lr: 0.003430  min_lr: 0.003430  loss: 3.9406 (3.9406)  weight_decay: 0.0500 (0.0500)  time: 2.9562  data: 2.1926  max mem: 12911
Epoch: [89]  [ 200/1251]  eta: 0:03:33  lr: 0.003428  min_lr: 0.003428  loss: 2.6181 (3.0544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6587 (0.7709)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [89]  [ 400/1251]  eta: 0:02:45  lr: 0.003425  min_lr: 0.003425  loss: 2.5412 (3.1183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5949 (0.7223)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [89]  [ 600/1251]  eta: 0:02:05  lr: 0.003423  min_lr: 0.003423  loss: 3.3045 (3.1171)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1009 (0.7867)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [89]  [ 800/1251]  eta: 0:01:26  lr: 0.003420  min_lr: 0.003420  loss: 2.6550 (3.1365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6292 (0.8072)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [89]  [1000/1251]  eta: 0:00:47  lr: 0.003418  min_lr: 0.003418  loss: 3.1789 (3.1415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5817 (0.7968)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [89]  [1200/1251]  eta: 0:00:09  lr: 0.003415  min_lr: 0.003415  loss: 3.0070 (3.1485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7172 (0.7857)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 2.9643 (3.1513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.7880)  time: 0.1476  data: 0.0014  max mem: 12911
Epoch: [89] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 2.9643 (3.1716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.7880)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8090 (0.8090)  acc1: 84.0000 (84.0000)  acc5: 96.0000 (96.0000)  time: 5.4439  data: 5.3495  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.9492 (0.9842)  acc1: 78.8000 (78.0000)  acc5: 96.0000 (94.9455)  time: 0.6466  data: 0.5494  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2419 (1.1923)  acc1: 72.0000 (73.4667)  acc5: 90.8000 (92.0571)  time: 0.1845  data: 0.0936  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3255 (1.2051)  acc1: 72.0000 (73.1360)  acc5: 90.4000 (92.0000)  time: 0.2048  data: 0.1179  max mem: 12911
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 73.624 Acc@5 92.096 loss 1.195
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.62%
Epoch: [90]  [   0/1251]  eta: 0:53:46  lr: 0.003414  min_lr: 0.003414  loss: 4.3539 (4.3539)  weight_decay: 0.0500 (0.0500)  time: 2.5791  data: 1.9296  max mem: 12911
Epoch: [90]  [ 200/1251]  eta: 0:03:33  lr: 0.003412  min_lr: 0.003412  loss: 2.5680 (3.1347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6209 (0.7613)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [90]  [ 400/1251]  eta: 0:02:46  lr: 0.003409  min_lr: 0.003409  loss: 2.7692 (3.1429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5631 (0.7563)  time: 0.1868  data: 0.0008  max mem: 12911
Epoch: [90]  [ 600/1251]  eta: 0:02:05  lr: 0.003407  min_lr: 0.003407  loss: 2.5627 (3.1217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.7513)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [90]  [ 800/1251]  eta: 0:01:26  lr: 0.003404  min_lr: 0.003404  loss: 3.3282 (3.1129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6595 (0.7385)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [90]  [1000/1251]  eta: 0:00:47  lr: 0.003402  min_lr: 0.003402  loss: 2.6438 (3.1015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6196 (0.7656)  time: 0.1851  data: 0.0005  max mem: 12911
Epoch: [90]  [1200/1251]  eta: 0:00:09  lr: 0.003399  min_lr: 0.003399  loss: 3.0328 (3.1270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.7526)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 2.6192 (3.1214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6327 (0.7522)  time: 0.1460  data: 0.0012  max mem: 12911
Epoch: [90] Total time: 0:03:56 (0.1893 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 2.6192 (3.1360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6327 (0.7522)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7603 (0.7603)  acc1: 82.4000 (82.4000)  acc5: 96.8000 (96.8000)  time: 5.6804  data: 5.5735  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8428 (0.9314)  acc1: 79.2000 (78.8364)  acc5: 95.6000 (95.2727)  time: 0.7564  data: 0.6598  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1340 (1.1427)  acc1: 72.0000 (74.0000)  acc5: 92.4000 (92.4571)  time: 0.2130  data: 0.1255  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2212 (1.1519)  acc1: 72.0000 (73.6320)  acc5: 91.6000 (92.4960)  time: 0.2076  data: 0.1241  max mem: 12911
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 74.062 Acc@5 92.262 loss 1.144
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.06%
Epoch: [91]  [   0/1251]  eta: 0:58:33  lr: 0.003398  min_lr: 0.003398  loss: 2.4313 (2.4313)  weight_decay: 0.0500 (0.0500)  time: 2.8086  data: 2.5543  max mem: 12911
Epoch: [91]  [ 200/1251]  eta: 0:03:33  lr: 0.003396  min_lr: 0.003396  loss: 2.5318 (3.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.8274)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [91]  [ 400/1251]  eta: 0:02:45  lr: 0.003393  min_lr: 0.003393  loss: 2.7729 (3.1412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5546 (0.7396)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [91]  [ 600/1251]  eta: 0:02:05  lr: 0.003391  min_lr: 0.003391  loss: 2.6220 (3.1593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6678 (0.7520)  time: 0.1857  data: 0.0004  max mem: 12911
Epoch: [91]  [ 800/1251]  eta: 0:01:26  lr: 0.003388  min_lr: 0.003388  loss: 2.6720 (3.1530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8326 (0.7755)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [91]  [1000/1251]  eta: 0:00:47  lr: 0.003385  min_lr: 0.003385  loss: 2.6482 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6140 (0.7757)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [91]  [1200/1251]  eta: 0:00:09  lr: 0.003383  min_lr: 0.003383  loss: 2.5107 (3.1427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6880 (0.7713)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 2.6054 (3.1405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6739 (0.7716)  time: 0.1473  data: 0.0008  max mem: 12911
Epoch: [91] Total time: 0:03:57 (0.1902 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 2.6054 (3.1500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6739 (0.7716)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7533 (0.7533)  acc1: 83.2000 (83.2000)  acc5: 98.0000 (98.0000)  time: 5.3524  data: 5.2609  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9555 (0.9651)  acc1: 79.6000 (78.6545)  acc5: 95.6000 (94.6909)  time: 0.7501  data: 0.6555  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2265 (1.1753)  acc1: 70.0000 (73.9429)  acc5: 89.6000 (91.8857)  time: 0.2277  data: 0.1382  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3440 (1.1865)  acc1: 69.6000 (73.7440)  acc5: 89.6000 (91.7920)  time: 0.2255  data: 0.1381  max mem: 12911
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 73.728 Acc@5 91.956 loss 1.178
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 74.06%
Epoch: [92]  [   0/1251]  eta: 1:03:38  lr: 0.003382  min_lr: 0.003382  loss: 2.4399 (2.4399)  weight_decay: 0.0500 (0.0500)  time: 3.0520  data: 2.5219  max mem: 12911
Epoch: [92]  [ 200/1251]  eta: 0:03:35  lr: 0.003380  min_lr: 0.003380  loss: 2.6506 (3.2121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.7606)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [92]  [ 400/1251]  eta: 0:02:47  lr: 0.003377  min_lr: 0.003377  loss: 2.6227 (3.1876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6941 (0.8094)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [92]  [ 600/1251]  eta: 0:02:06  lr: 0.003374  min_lr: 0.003374  loss: 2.6059 (3.1754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.7933)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [92]  [ 800/1251]  eta: 0:01:26  lr: 0.003372  min_lr: 0.003372  loss: 2.6115 (3.1758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6385 (0.8054)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [92]  [1000/1251]  eta: 0:00:48  lr: 0.003369  min_lr: 0.003369  loss: 2.5835 (3.1764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6717 (0.8047)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [92]  [1200/1251]  eta: 0:00:09  lr: 0.003367  min_lr: 0.003367  loss: 2.7490 (3.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6377 (0.7831)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 2.7542 (3.1710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6292 (0.7787)  time: 0.1484  data: 0.0007  max mem: 12911
Epoch: [92] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 2.7542 (3.1370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6292 (0.7787)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7973 (0.7973)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.5827  data: 5.4908  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9374 (0.9907)  acc1: 79.2000 (78.7636)  acc5: 95.6000 (94.9818)  time: 0.6904  data: 0.6076  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2315 (1.2025)  acc1: 71.2000 (74.1333)  acc5: 92.0000 (92.0000)  time: 0.1932  data: 0.1125  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3305 (1.2136)  acc1: 70.8000 (73.9680)  acc5: 89.6000 (91.8400)  time: 0.1978  data: 0.1180  max mem: 12911
Test: Total time: 0:00:09 (0.3989 s / it)
* Acc@1 73.866 Acc@5 92.126 loss 1.212
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 74.06%
Epoch: [93]  [   0/1251]  eta: 1:05:14  lr: 0.003366  min_lr: 0.003366  loss: 2.4855 (2.4855)  weight_decay: 0.0500 (0.0500)  time: 3.1288  data: 2.1273  max mem: 12911
Epoch: [93]  [ 200/1251]  eta: 0:03:34  lr: 0.003363  min_lr: 0.003363  loss: 2.5769 (3.0738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6627 (0.7071)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [93]  [ 400/1251]  eta: 0:02:47  lr: 0.003361  min_lr: 0.003361  loss: 2.5065 (3.1137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6764 (0.7847)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [93]  [ 600/1251]  eta: 0:02:06  lr: 0.003358  min_lr: 0.003358  loss: 2.6387 (3.1078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8848 (0.7920)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [93]  [ 800/1251]  eta: 0:01:27  lr: 0.003355  min_lr: 0.003355  loss: 2.4607 (3.1161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5851 (0.7764)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [93]  [1000/1251]  eta: 0:00:48  lr: 0.003353  min_lr: 0.003353  loss: 2.6550 (3.1164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6330 (0.7560)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [93]  [1200/1251]  eta: 0:00:09  lr: 0.003350  min_lr: 0.003350  loss: 2.4812 (3.1279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7013 (0.7595)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 2.5207 (3.1268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6155 (0.7558)  time: 0.1471  data: 0.0008  max mem: 12911
Epoch: [93] Total time: 0:03:59 (0.1917 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 2.5207 (3.1355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6155 (0.7558)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7490 (0.7490)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.7833  data: 5.6917  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8794 (0.9564)  acc1: 80.8000 (78.2909)  acc5: 95.6000 (94.6546)  time: 0.7787  data: 0.6825  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2044 (1.1593)  acc1: 70.4000 (73.8476)  acc5: 92.0000 (92.0191)  time: 0.2100  data: 0.1210  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2519 (1.1703)  acc1: 70.0000 (73.7760)  acc5: 90.4000 (91.8720)  time: 0.2067  data: 0.1209  max mem: 12911
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 73.820 Acc@5 92.198 loss 1.167
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 74.06%
Epoch: [94]  [   0/1251]  eta: 1:06:25  lr: 0.003350  min_lr: 0.003350  loss: 3.9075 (3.9075)  weight_decay: 0.0500 (0.0500)  time: 3.1858  data: 1.5234  max mem: 12911
Epoch: [94]  [ 200/1251]  eta: 0:03:35  lr: 0.003347  min_lr: 0.003347  loss: 2.5222 (3.1692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6441 (0.7767)  time: 0.1882  data: 0.0006  max mem: 12911
Epoch: [94]  [ 400/1251]  eta: 0:02:45  lr: 0.003344  min_lr: 0.003344  loss: 2.5037 (3.1707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7686 (0.7772)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [94]  [ 600/1251]  eta: 0:02:06  lr: 0.003342  min_lr: 0.003342  loss: 2.6069 (3.1578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7083 (0.7788)  time: 0.1846  data: 0.0004  max mem: 12911
Epoch: [94]  [ 800/1251]  eta: 0:01:26  lr: 0.003339  min_lr: 0.003339  loss: 2.7384 (3.1339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6544 (0.7737)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [94]  [1000/1251]  eta: 0:00:47  lr: 0.003336  min_lr: 0.003336  loss: 2.8615 (3.1266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9249 (0.7846)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [94]  [1200/1251]  eta: 0:00:09  lr: 0.003334  min_lr: 0.003334  loss: 3.0488 (3.1386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7170 (0.7814)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 2.5753 (3.1436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7035 (0.7849)  time: 0.1477  data: 0.0007  max mem: 12911
Epoch: [94] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 2.5753 (3.1241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7035 (0.7849)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7772 (0.7772)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 5.5625  data: 5.4354  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9313 (0.9777)  acc1: 80.8000 (78.7636)  acc5: 95.6000 (95.0182)  time: 0.7197  data: 0.6194  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2509 (1.1852)  acc1: 72.8000 (74.2095)  acc5: 92.0000 (92.1143)  time: 0.1909  data: 0.1023  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3214 (1.2058)  acc1: 71.6000 (73.8560)  acc5: 91.2000 (91.9200)  time: 0.1925  data: 0.1080  max mem: 12911
Test: Total time: 0:00:09 (0.3965 s / it)
* Acc@1 73.658 Acc@5 92.144 loss 1.204
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 74.06%
Epoch: [95]  [   0/1251]  eta: 0:59:44  lr: 0.003333  min_lr: 0.003333  loss: 3.5118 (3.5118)  weight_decay: 0.0500 (0.0500)  time: 2.8656  data: 1.6518  max mem: 12911
Epoch: [95]  [ 200/1251]  eta: 0:03:34  lr: 0.003330  min_lr: 0.003330  loss: 2.6192 (3.2095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6657 (nan)  time: 0.1892  data: 0.0004  max mem: 12911
Epoch: [95]  [ 400/1251]  eta: 0:02:45  lr: 0.003327  min_lr: 0.003327  loss: 2.6956 (3.1505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (nan)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [95]  [ 600/1251]  eta: 0:02:05  lr: 0.003325  min_lr: 0.003325  loss: 2.6435 (3.1582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7595 (nan)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [95]  [ 800/1251]  eta: 0:01:26  lr: 0.003322  min_lr: 0.003322  loss: 3.2425 (3.1441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6036 (nan)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [95]  [1000/1251]  eta: 0:00:47  lr: 0.003319  min_lr: 0.003319  loss: 2.6818 (3.1469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (nan)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [95]  [1200/1251]  eta: 0:00:09  lr: 0.003317  min_lr: 0.003317  loss: 2.6929 (3.1409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6643 (nan)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 2.7135 (3.1321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (nan)  time: 0.1469  data: 0.0005  max mem: 12911
Epoch: [95] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 2.7135 (3.1344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (nan)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.7396 (0.7396)  acc1: 80.8000 (80.8000)  acc5: 96.8000 (96.8000)  time: 5.2685  data: 5.1745  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9532 (0.9630)  acc1: 80.8000 (78.5818)  acc5: 95.2000 (95.2000)  time: 0.7307  data: 0.6364  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1829 (1.1661)  acc1: 72.4000 (74.5905)  acc5: 92.8000 (92.3238)  time: 0.2151  data: 0.1273  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3335 (1.1828)  acc1: 72.0000 (74.2720)  acc5: 90.0000 (92.1920)  time: 0.2132  data: 0.1272  max mem: 12911
Test: Total time: 0:00:09 (0.3984 s / it)
* Acc@1 73.712 Acc@5 91.978 loss 1.180
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 74.06%
Epoch: [96]  [   0/1251]  eta: 1:09:29  lr: 0.003316  min_lr: 0.003316  loss: 2.4702 (2.4702)  weight_decay: 0.0500 (0.0500)  time: 3.3332  data: 2.3168  max mem: 12911
Epoch: [96]  [ 200/1251]  eta: 0:03:33  lr: 0.003313  min_lr: 0.003313  loss: 3.0730 (3.1099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.7910)  time: 0.1867  data: 0.0006  max mem: 12911
Epoch: [96]  [ 400/1251]  eta: 0:02:46  lr: 0.003311  min_lr: 0.003311  loss: 3.2486 (3.1438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6349 (0.7228)  time: 0.1864  data: 0.0003  max mem: 12911
Epoch: [96]  [ 600/1251]  eta: 0:02:05  lr: 0.003308  min_lr: 0.003308  loss: 2.8722 (3.1398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (nan)  time: 0.1867  data: 0.0006  max mem: 12911
Epoch: [96]  [ 800/1251]  eta: 0:01:26  lr: 0.003305  min_lr: 0.003305  loss: 2.4745 (3.1500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (nan)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [96]  [1000/1251]  eta: 0:00:47  lr: 0.003302  min_lr: 0.003302  loss: 2.5520 (3.1363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7632 (nan)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [96]  [1200/1251]  eta: 0:00:09  lr: 0.003300  min_lr: 0.003300  loss: 2.6541 (3.1438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (nan)  time: 0.1962  data: 0.0004  max mem: 12911
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 2.6130 (3.1431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6787 (nan)  time: 0.1463  data: 0.0009  max mem: 12911
Epoch: [96] Total time: 0:03:57 (0.1901 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 2.6130 (3.1389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6787 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7914 (0.7914)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.4699  data: 5.3333  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0135 (0.9871)  acc1: 79.6000 (78.8727)  acc5: 95.6000 (95.1636)  time: 0.7555  data: 0.6553  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2111 (1.1817)  acc1: 72.4000 (74.8381)  acc5: 91.6000 (92.3619)  time: 0.2282  data: 0.1389  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3001 (1.1981)  acc1: 72.4000 (74.4960)  acc5: 91.2000 (92.1440)  time: 0.2242  data: 0.1388  max mem: 12911
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 73.780 Acc@5 92.148 loss 1.205
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 74.06%
Epoch: [97]  [   0/1251]  eta: 1:05:24  lr: 0.003299  min_lr: 0.003299  loss: 2.3274 (2.3274)  weight_decay: 0.0500 (0.0500)  time: 3.1371  data: 2.5773  max mem: 12911
Epoch: [97]  [ 200/1251]  eta: 0:03:34  lr: 0.003296  min_lr: 0.003296  loss: 3.1808 (3.0681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7944)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [97]  [ 400/1251]  eta: 0:02:46  lr: 0.003294  min_lr: 0.003294  loss: 2.8889 (3.0541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.7867)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [97]  [ 600/1251]  eta: 0:02:05  lr: 0.003291  min_lr: 0.003291  loss: 3.1593 (3.0711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6777 (0.7950)  time: 0.1846  data: 0.0006  max mem: 12911
Epoch: [97]  [ 800/1251]  eta: 0:01:26  lr: 0.003288  min_lr: 0.003288  loss: 2.5136 (3.0878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7190 (0.7797)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [97]  [1000/1251]  eta: 0:00:47  lr: 0.003285  min_lr: 0.003285  loss: 2.7035 (3.1162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.7668)  time: 0.2038  data: 0.0004  max mem: 12911
Epoch: [97]  [1200/1251]  eta: 0:00:09  lr: 0.003283  min_lr: 0.003283  loss: 3.6583 (3.1271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (0.7608)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 2.7234 (3.1268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8544 (0.7648)  time: 0.1470  data: 0.0007  max mem: 12911
Epoch: [97] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 2.7234 (3.1311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8544 (0.7648)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8583 (0.8583)  acc1: 82.0000 (82.0000)  acc5: 95.2000 (95.2000)  time: 5.4495  data: 5.3529  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9023 (0.9967)  acc1: 79.2000 (78.7636)  acc5: 95.2000 (95.0182)  time: 0.7548  data: 0.6622  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2472 (1.2049)  acc1: 70.4000 (74.5524)  acc5: 91.6000 (92.3810)  time: 0.2151  data: 0.1281  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3434 (1.2198)  acc1: 70.4000 (74.0800)  acc5: 90.8000 (92.1920)  time: 0.2148  data: 0.1280  max mem: 12911
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 74.002 Acc@5 92.204 loss 1.222
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.06%
Epoch: [98]  [   0/1251]  eta: 1:02:37  lr: 0.003282  min_lr: 0.003282  loss: 2.2107 (2.2107)  weight_decay: 0.0500 (0.0500)  time: 3.0033  data: 2.3040  max mem: 12911
Epoch: [98]  [ 200/1251]  eta: 0:03:36  lr: 0.003279  min_lr: 0.003279  loss: 2.8264 (3.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7102 (0.7046)  time: 0.1845  data: 0.0004  max mem: 12911
Epoch: [98]  [ 400/1251]  eta: 0:02:47  lr: 0.003276  min_lr: 0.003276  loss: 2.6913 (3.0391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7948 (0.7314)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [98]  [ 600/1251]  eta: 0:02:06  lr: 0.003274  min_lr: 0.003274  loss: 2.5662 (3.0922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6290 (0.7286)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [98]  [ 800/1251]  eta: 0:01:26  lr: 0.003271  min_lr: 0.003271  loss: 2.5809 (3.0909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7778 (0.7388)  time: 0.1889  data: 0.0006  max mem: 12911
Epoch: [98]  [1000/1251]  eta: 0:00:48  lr: 0.003268  min_lr: 0.003268  loss: 2.6111 (3.0905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.7565)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [98]  [1200/1251]  eta: 0:00:09  lr: 0.003265  min_lr: 0.003265  loss: 2.6374 (3.1123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7278 (0.7733)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 2.5726 (3.1121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.7688)  time: 0.1462  data: 0.0006  max mem: 12911
Epoch: [98] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 2.5726 (3.1207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.7688)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7271 (0.7271)  acc1: 83.2000 (83.2000)  acc5: 96.4000 (96.4000)  time: 5.6906  data: 5.5758  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8615 (0.9282)  acc1: 80.4000 (78.9091)  acc5: 96.0000 (95.3818)  time: 0.6433  data: 0.5458  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1916 (1.1397)  acc1: 72.0000 (74.3238)  acc5: 90.8000 (92.4000)  time: 0.1522  data: 0.0645  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2871 (1.1475)  acc1: 72.0000 (74.0640)  acc5: 90.8000 (92.3680)  time: 0.1773  data: 0.0930  max mem: 12911
Test: Total time: 0:00:09 (0.3891 s / it)
* Acc@1 74.014 Acc@5 92.244 loss 1.151
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.06%
Epoch: [99]  [   0/1251]  eta: 1:05:14  lr: 0.003265  min_lr: 0.003265  loss: 2.0935 (2.0935)  weight_decay: 0.0500 (0.0500)  time: 3.1289  data: 2.2782  max mem: 12911
Epoch: [99]  [ 200/1251]  eta: 0:03:35  lr: 0.003262  min_lr: 0.003262  loss: 2.6410 (3.1798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7499 (0.8610)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [99]  [ 400/1251]  eta: 0:02:46  lr: 0.003259  min_lr: 0.003259  loss: 3.0414 (3.1940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6550 (0.8455)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [99]  [ 600/1251]  eta: 0:02:05  lr: 0.003256  min_lr: 0.003256  loss: 2.5531 (3.1672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5751 (0.7931)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [99]  [ 800/1251]  eta: 0:01:26  lr: 0.003253  min_lr: 0.003253  loss: 2.4059 (3.1297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5937 (0.7862)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [99]  [1000/1251]  eta: 0:00:47  lr: 0.003251  min_lr: 0.003251  loss: 2.8470 (3.1291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6606 (0.7788)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [99]  [1200/1251]  eta: 0:00:09  lr: 0.003248  min_lr: 0.003248  loss: 2.5415 (3.1252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8566 (0.7796)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 2.6412 (3.1240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.7785)  time: 0.1461  data: 0.0009  max mem: 12911
Epoch: [99] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 2.6412 (3.1232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.7785)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8103 (0.8103)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.4069  data: 5.3154  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9347 (0.9879)  acc1: 81.2000 (78.6909)  acc5: 96.0000 (95.3091)  time: 0.7628  data: 0.6653  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2049 (1.2058)  acc1: 70.8000 (74.2476)  acc5: 91.6000 (92.5333)  time: 0.2278  data: 0.1388  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3790 (1.2222)  acc1: 70.8000 (73.8720)  acc5: 90.4000 (92.2720)  time: 0.2246  data: 0.1388  max mem: 12911
Test: Total time: 0:00:10 (0.4146 s / it)
* Acc@1 73.986 Acc@5 92.206 loss 1.218
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.06%
Epoch: [100]  [   0/1251]  eta: 0:56:51  lr: 0.003247  min_lr: 0.003247  loss: 3.9884 (3.9884)  weight_decay: 0.0500 (0.0500)  time: 2.7272  data: 1.6621  max mem: 12911
Epoch: [100]  [ 200/1251]  eta: 0:03:36  lr: 0.003244  min_lr: 0.003244  loss: 2.5023 (3.0576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (0.8081)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [100]  [ 400/1251]  eta: 0:02:47  lr: 0.003242  min_lr: 0.003242  loss: 2.5311 (3.0708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6491 (0.7990)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [100]  [ 600/1251]  eta: 0:02:06  lr: 0.003239  min_lr: 0.003239  loss: 2.5990 (3.0703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.7889)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [100]  [ 800/1251]  eta: 0:01:26  lr: 0.003236  min_lr: 0.003236  loss: 3.1870 (3.0653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6391 (0.7603)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [100]  [1000/1251]  eta: 0:00:48  lr: 0.003233  min_lr: 0.003233  loss: 2.5507 (3.0798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7874 (0.7653)  time: 0.1904  data: 0.0005  max mem: 12911
Epoch: [100]  [1200/1251]  eta: 0:00:09  lr: 0.003230  min_lr: 0.003230  loss: 2.6031 (3.0735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6462 (0.7603)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 2.7545 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6523 (0.7571)  time: 0.1465  data: 0.0009  max mem: 12911
Epoch: [100] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 2.7545 (3.1080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6523 (0.7571)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7676 (0.7676)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.4173  data: 5.3174  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8800 (0.9481)  acc1: 78.4000 (79.2364)  acc5: 96.8000 (95.5273)  time: 0.7671  data: 0.6699  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1969 (1.1699)  acc1: 72.0000 (74.2286)  acc5: 91.6000 (92.7048)  time: 0.2347  data: 0.1457  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3474 (1.1813)  acc1: 69.6000 (73.7120)  acc5: 90.8000 (92.6240)  time: 0.2316  data: 0.1457  max mem: 12911
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 74.038 Acc@5 92.326 loss 1.180
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.06%
Epoch: [101]  [   0/1251]  eta: 1:02:20  lr: 0.003230  min_lr: 0.003230  loss: 2.3430 (2.3430)  weight_decay: 0.0500 (0.0500)  time: 2.9898  data: 1.5579  max mem: 12911
Epoch: [101]  [ 200/1251]  eta: 0:03:37  lr: 0.003227  min_lr: 0.003227  loss: 2.5369 (3.0359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (0.7285)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [101]  [ 400/1251]  eta: 0:02:47  lr: 0.003224  min_lr: 0.003224  loss: 3.1913 (3.0710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7629 (0.7743)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [101]  [ 600/1251]  eta: 0:02:06  lr: 0.003221  min_lr: 0.003221  loss: 2.4780 (3.0499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6397 (0.7583)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [101]  [ 800/1251]  eta: 0:01:27  lr: 0.003218  min_lr: 0.003218  loss: 2.9293 (3.0766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.7760)  time: 0.1898  data: 0.0004  max mem: 12911
Epoch: [101]  [1000/1251]  eta: 0:00:48  lr: 0.003215  min_lr: 0.003215  loss: 2.5589 (3.0904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7355 (0.7558)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [101]  [1200/1251]  eta: 0:00:09  lr: 0.003212  min_lr: 0.003212  loss: 2.4639 (3.0973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7297 (0.7524)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 2.7078 (3.1118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5643 (0.7453)  time: 0.1457  data: 0.0008  max mem: 12911
Epoch: [101] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 2.7078 (3.1205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5643 (0.7453)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8304 (0.8304)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.6995  data: 5.6079  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9380 (1.0248)  acc1: 78.4000 (78.1455)  acc5: 96.0000 (94.9455)  time: 0.7536  data: 0.6582  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2880 (1.2260)  acc1: 71.2000 (73.8667)  acc5: 91.6000 (92.1714)  time: 0.2085  data: 0.1209  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3774 (1.2382)  acc1: 71.2000 (73.5840)  acc5: 90.8000 (92.1280)  time: 0.2148  data: 0.1303  max mem: 12911
Test: Total time: 0:00:10 (0.4184 s / it)
* Acc@1 73.888 Acc@5 92.220 loss 1.233
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 74.06%
Epoch: [102]  [   0/1251]  eta: 1:05:30  lr: 0.003212  min_lr: 0.003212  loss: 2.6336 (2.6336)  weight_decay: 0.0500 (0.0500)  time: 3.1420  data: 2.9172  max mem: 12911
Epoch: [102]  [ 200/1251]  eta: 0:03:35  lr: 0.003209  min_lr: 0.003209  loss: 2.5981 (3.1073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6274 (0.7809)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [102]  [ 400/1251]  eta: 0:02:48  lr: 0.003206  min_lr: 0.003206  loss: 3.4665 (3.1350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7493)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [102]  [ 600/1251]  eta: 0:02:06  lr: 0.003203  min_lr: 0.003203  loss: 3.3262 (3.1170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8099 (0.7477)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [102]  [ 800/1251]  eta: 0:01:27  lr: 0.003200  min_lr: 0.003200  loss: 3.2715 (3.1248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (0.7747)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [102]  [1000/1251]  eta: 0:00:48  lr: 0.003197  min_lr: 0.003197  loss: 2.7805 (3.1339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8219 (0.7694)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [102]  [1200/1251]  eta: 0:00:09  lr: 0.003195  min_lr: 0.003195  loss: 2.5387 (3.1382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7923)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 2.5944 (3.1425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (0.7901)  time: 0.1468  data: 0.0007  max mem: 12911
Epoch: [102] Total time: 0:04:00 (0.1920 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 2.5944 (3.1115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (0.7901)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8134 (0.8134)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.8094  data: 5.6800  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9077 (0.9725)  acc1: 81.2000 (79.6000)  acc5: 96.0000 (95.2727)  time: 0.7663  data: 0.6757  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2653 (1.1851)  acc1: 72.4000 (75.2000)  acc5: 90.8000 (92.1143)  time: 0.2046  data: 0.1209  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3432 (1.1964)  acc1: 72.0000 (74.8800)  acc5: 89.6000 (92.0160)  time: 0.2045  data: 0.1224  max mem: 12911
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 74.306 Acc@5 92.266 loss 1.197
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.31%
Epoch: [103]  [   0/1251]  eta: 0:55:01  lr: 0.003194  min_lr: 0.003194  loss: 3.9925 (3.9925)  weight_decay: 0.0500 (0.0500)  time: 2.6390  data: 2.3770  max mem: 12911
Epoch: [103]  [ 200/1251]  eta: 0:03:31  lr: 0.003191  min_lr: 0.003191  loss: 2.6779 (3.0730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7103)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [103]  [ 400/1251]  eta: 0:02:45  lr: 0.003188  min_lr: 0.003188  loss: 2.6858 (3.0763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5900 (0.6649)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [103]  [ 600/1251]  eta: 0:02:04  lr: 0.003185  min_lr: 0.003185  loss: 2.6590 (3.1168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.6897)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [103]  [ 800/1251]  eta: 0:01:25  lr: 0.003182  min_lr: 0.003182  loss: 2.6735 (3.1015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8587 (0.7496)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [103]  [1000/1251]  eta: 0:00:47  lr: 0.003179  min_lr: 0.003179  loss: 2.9256 (3.0915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6353 (0.7364)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [103]  [1200/1251]  eta: 0:00:09  lr: 0.003176  min_lr: 0.003176  loss: 2.6387 (3.0924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.7374)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 2.5188 (3.0930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6861 (0.7343)  time: 0.1464  data: 0.0006  max mem: 12911
Epoch: [103] Total time: 0:03:56 (0.1894 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 2.5188 (3.1181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6861 (0.7343)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7517 (0.7517)  acc1: 83.2000 (83.2000)  acc5: 96.4000 (96.4000)  time: 5.3499  data: 5.2510  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9512 (0.9832)  acc1: 80.0000 (79.3091)  acc5: 95.6000 (95.0909)  time: 0.7276  data: 0.6347  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2266 (1.1933)  acc1: 73.2000 (74.5333)  acc5: 91.6000 (92.2095)  time: 0.2070  data: 0.1201  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2666 (1.2036)  acc1: 70.8000 (74.3040)  acc5: 90.0000 (92.0960)  time: 0.2183  data: 0.1319  max mem: 12911
Test: Total time: 0:00:10 (0.4050 s / it)
* Acc@1 74.172 Acc@5 92.342 loss 1.201
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.31%
Epoch: [104]  [   0/1251]  eta: 1:02:18  lr: 0.003176  min_lr: 0.003176  loss: 2.5138 (2.5138)  weight_decay: 0.0500 (0.0500)  time: 2.9885  data: 2.3407  max mem: 12911
Epoch: [104]  [ 200/1251]  eta: 0:03:36  lr: 0.003173  min_lr: 0.003173  loss: 2.8969 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6321 (0.7186)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [104]  [ 400/1251]  eta: 0:02:47  lr: 0.003170  min_lr: 0.003170  loss: 2.7014 (3.1013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6078 (0.6779)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [104]  [ 600/1251]  eta: 0:02:06  lr: 0.003167  min_lr: 0.003167  loss: 2.4461 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6254 (0.7123)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [104]  [ 800/1251]  eta: 0:01:27  lr: 0.003164  min_lr: 0.003164  loss: 2.4644 (3.1058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.7314)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [104]  [1000/1251]  eta: 0:00:48  lr: 0.003161  min_lr: 0.003161  loss: 2.7826 (3.1208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6865 (0.7260)  time: 0.1917  data: 0.0004  max mem: 12911
Epoch: [104]  [1200/1251]  eta: 0:00:09  lr: 0.003158  min_lr: 0.003158  loss: 2.5373 (3.1063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6524 (0.7259)  time: 0.1907  data: 0.0005  max mem: 12911
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 2.5359 (3.0999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.7308)  time: 0.1469  data: 0.0008  max mem: 12911
Epoch: [104] Total time: 0:04:00 (0.1920 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 2.5359 (3.0985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.7308)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6819 (0.6819)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 5.2114  data: 5.1164  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9277 (0.9213)  acc1: 78.8000 (79.3455)  acc5: 96.4000 (95.3091)  time: 0.6979  data: 0.6037  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1756 (1.1359)  acc1: 71.6000 (74.9714)  acc5: 92.4000 (92.8000)  time: 0.2042  data: 0.1173  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3204 (1.1520)  acc1: 70.8000 (74.4160)  acc5: 91.2000 (92.5600)  time: 0.2018  data: 0.1172  max mem: 12911
Test: Total time: 0:00:09 (0.3893 s / it)
* Acc@1 74.428 Acc@5 92.424 loss 1.152
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.43%
Epoch: [105]  [   0/1251]  eta: 1:03:06  lr: 0.003158  min_lr: 0.003158  loss: 3.0783 (3.0783)  weight_decay: 0.0500 (0.0500)  time: 3.0271  data: 2.8168  max mem: 12911
Epoch: [105]  [ 200/1251]  eta: 0:03:33  lr: 0.003155  min_lr: 0.003155  loss: 2.6912 (3.1138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7534)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [105]  [ 400/1251]  eta: 0:02:45  lr: 0.003152  min_lr: 0.003152  loss: 2.5455 (3.0876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6551 (0.7337)  time: 0.1896  data: 0.0004  max mem: 12911
Epoch: [105]  [ 600/1251]  eta: 0:02:05  lr: 0.003149  min_lr: 0.003149  loss: 2.4923 (3.0977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5972 (0.7236)  time: 0.1884  data: 0.0006  max mem: 12911
Epoch: [105]  [ 800/1251]  eta: 0:01:26  lr: 0.003146  min_lr: 0.003146  loss: 2.6055 (3.1308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6384 (0.7313)  time: 0.1919  data: 0.0004  max mem: 12911
Epoch: [105]  [1000/1251]  eta: 0:00:48  lr: 0.003143  min_lr: 0.003143  loss: 3.2438 (3.1222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6896 (0.7281)  time: 0.1928  data: 0.0004  max mem: 12911
Epoch: [105]  [1200/1251]  eta: 0:00:09  lr: 0.003140  min_lr: 0.003140  loss: 3.6671 (3.1207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6699 (0.7293)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 3.9078 (3.1241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7265 (0.7327)  time: 0.1462  data: 0.0009  max mem: 12911
Epoch: [105] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 3.9078 (3.1059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7265 (0.7327)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8914 (0.8914)  acc1: 81.6000 (81.6000)  acc5: 96.8000 (96.8000)  time: 5.3901  data: 5.2886  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9306 (1.0067)  acc1: 81.6000 (78.8364)  acc5: 96.0000 (95.3818)  time: 0.7278  data: 0.6310  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2368 (1.2125)  acc1: 71.6000 (74.0762)  acc5: 91.6000 (92.5714)  time: 0.2109  data: 0.1224  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3704 (1.2222)  acc1: 71.2000 (73.8080)  acc5: 90.0000 (92.4320)  time: 0.2075  data: 0.1223  max mem: 12911
Test: Total time: 0:00:10 (0.4020 s / it)
* Acc@1 73.670 Acc@5 92.178 loss 1.214
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 74.43%
Epoch: [106]  [   0/1251]  eta: 1:01:02  lr: 0.003139  min_lr: 0.003139  loss: 2.3573 (2.3573)  weight_decay: 0.0500 (0.0500)  time: 2.9273  data: 2.0873  max mem: 12911
Epoch: [106]  [ 200/1251]  eta: 0:03:35  lr: 0.003136  min_lr: 0.003136  loss: 2.6454 (2.9997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.8281)  time: 0.1886  data: 0.0006  max mem: 12911
Epoch: [106]  [ 400/1251]  eta: 0:02:46  lr: 0.003133  min_lr: 0.003133  loss: 3.1247 (3.0524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6241 (0.7651)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [106]  [ 600/1251]  eta: 0:02:05  lr: 0.003130  min_lr: 0.003130  loss: 2.8739 (3.0447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (0.7468)  time: 0.1859  data: 0.0006  max mem: 12911
Epoch: [106]  [ 800/1251]  eta: 0:01:26  lr: 0.003127  min_lr: 0.003127  loss: 3.0600 (3.0745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8155 (0.7456)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [106]  [1000/1251]  eta: 0:00:47  lr: 0.003124  min_lr: 0.003124  loss: 2.5866 (3.0899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7712 (0.7492)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [106]  [1200/1251]  eta: 0:00:09  lr: 0.003121  min_lr: 0.003121  loss: 3.3543 (3.1087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.7401)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 2.4276 (3.0999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7378)  time: 0.1476  data: 0.0011  max mem: 12911
Epoch: [106] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 2.4276 (3.1019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7378)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8058 (0.8058)  acc1: 82.0000 (82.0000)  acc5: 98.4000 (98.4000)  time: 5.6620  data: 5.5698  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9366 (0.9655)  acc1: 79.2000 (79.4909)  acc5: 96.0000 (95.3091)  time: 0.7770  data: 0.6830  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1933 (1.1599)  acc1: 75.2000 (75.1238)  acc5: 91.6000 (92.7048)  time: 0.2226  data: 0.1349  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2432 (1.1708)  acc1: 72.0000 (74.8000)  acc5: 90.8000 (92.5600)  time: 0.2207  data: 0.1348  max mem: 12911
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 74.536 Acc@5 92.598 loss 1.168
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.54%
Epoch: [107]  [   0/1251]  eta: 0:58:30  lr: 0.003121  min_lr: 0.003121  loss: 2.1758 (2.1758)  weight_decay: 0.0500 (0.0500)  time: 2.8063  data: 2.5486  max mem: 12911
Epoch: [107]  [ 200/1251]  eta: 0:03:33  lr: 0.003118  min_lr: 0.003118  loss: 2.6336 (3.1901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7247 (0.7534)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [107]  [ 400/1251]  eta: 0:02:46  lr: 0.003115  min_lr: 0.003115  loss: 3.1037 (3.1486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5889 (0.7432)  time: 0.1909  data: 0.0005  max mem: 12911
Epoch: [107]  [ 600/1251]  eta: 0:02:06  lr: 0.003112  min_lr: 0.003112  loss: 3.0806 (3.1254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (0.7747)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [107]  [ 800/1251]  eta: 0:01:26  lr: 0.003109  min_lr: 0.003109  loss: 3.0157 (3.1150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7259 (0.7666)  time: 0.1903  data: 0.0005  max mem: 12911
Epoch: [107]  [1000/1251]  eta: 0:00:48  lr: 0.003106  min_lr: 0.003106  loss: 2.4769 (3.1024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5580 (0.7457)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [107]  [1200/1251]  eta: 0:00:09  lr: 0.003103  min_lr: 0.003103  loss: 2.3847 (3.0894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7465 (0.7563)  time: 0.1962  data: 0.0004  max mem: 12911
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 2.6016 (3.0920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.7544)  time: 0.1464  data: 0.0011  max mem: 12911
Epoch: [107] Total time: 0:03:59 (0.1910 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 2.6016 (3.1043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.7544)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.8148 (0.8148)  acc1: 82.4000 (82.4000)  acc5: 96.4000 (96.4000)  time: 3.2599  data: 3.1261  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 0.9375 (0.9501)  acc1: 78.8000 (78.4000)  acc5: 95.2000 (95.2364)  time: 0.5489  data: 0.4486  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1643 (1.1448)  acc1: 72.0000 (74.2857)  acc5: 91.6000 (92.5524)  time: 0.2607  data: 0.1702  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3007 (1.1608)  acc1: 72.0000 (73.9840)  acc5: 90.8000 (92.3680)  time: 0.2000  data: 0.1132  max mem: 12911
Test: Total time: 0:00:09 (0.3819 s / it)
* Acc@1 74.198 Acc@5 92.524 loss 1.154
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.54%
Epoch: [108]  [   0/1251]  eta: 1:09:00  lr: 0.003102  min_lr: 0.003102  loss: 3.9435 (3.9435)  weight_decay: 0.0500 (0.0500)  time: 3.3101  data: 1.6678  max mem: 12911
Epoch: [108]  [ 200/1251]  eta: 0:03:32  lr: 0.003099  min_lr: 0.003099  loss: 2.4617 (3.0455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7406 (0.7360)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [108]  [ 400/1251]  eta: 0:02:45  lr: 0.003096  min_lr: 0.003096  loss: 2.4205 (3.0607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6361 (0.7334)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [108]  [ 600/1251]  eta: 0:02:04  lr: 0.003093  min_lr: 0.003093  loss: 2.4303 (3.0644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8257 (0.7635)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [108]  [ 800/1251]  eta: 0:01:25  lr: 0.003090  min_lr: 0.003090  loss: 2.5364 (3.0750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6817 (0.7720)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [108]  [1000/1251]  eta: 0:00:47  lr: 0.003087  min_lr: 0.003087  loss: 2.5019 (3.0738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5781 (0.7545)  time: 0.1840  data: 0.0004  max mem: 12911
Epoch: [108]  [1200/1251]  eta: 0:00:09  lr: 0.003084  min_lr: 0.003084  loss: 2.9175 (3.0880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (0.7544)  time: 0.1853  data: 0.0003  max mem: 12911
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 2.6333 (3.0898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6782 (0.7523)  time: 0.1458  data: 0.0008  max mem: 12911
Epoch: [108] Total time: 0:03:55 (0.1886 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 2.6333 (3.1153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6782 (0.7523)
Test:  [ 0/25]  eta: 0:01:48  loss: 0.7870 (0.7870)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 4.3345  data: 4.2329  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9505 (1.0195)  acc1: 80.0000 (78.9818)  acc5: 96.4000 (95.4909)  time: 0.6938  data: 0.6092  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2724 (1.2189)  acc1: 72.0000 (74.4191)  acc5: 92.0000 (92.5524)  time: 0.2553  data: 0.1740  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3414 (1.2255)  acc1: 71.6000 (74.1120)  acc5: 90.8000 (92.4800)  time: 0.2066  data: 0.1265  max mem: 12911
Test: Total time: 0:00:09 (0.3970 s / it)
* Acc@1 74.554 Acc@5 92.476 loss 1.214
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.55%
Epoch: [109]  [   0/1251]  eta: 1:03:33  lr: 0.003083  min_lr: 0.003083  loss: 4.0184 (4.0184)  weight_decay: 0.0500 (0.0500)  time: 3.0486  data: 2.7907  max mem: 12911
Epoch: [109]  [ 200/1251]  eta: 0:03:33  lr: 0.003080  min_lr: 0.003080  loss: 2.3542 (3.1547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5822 (0.6985)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [109]  [ 400/1251]  eta: 0:02:45  lr: 0.003077  min_lr: 0.003077  loss: 2.4890 (3.1014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6647 (0.7505)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [109]  [ 600/1251]  eta: 0:02:05  lr: 0.003074  min_lr: 0.003074  loss: 2.3347 (3.0884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6718 (0.7928)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [109]  [ 800/1251]  eta: 0:01:26  lr: 0.003071  min_lr: 0.003071  loss: 3.3354 (3.1232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6592 (0.7682)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [109]  [1000/1251]  eta: 0:00:47  lr: 0.003068  min_lr: 0.003068  loss: 2.4359 (3.1177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6945 (0.7590)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [109]  [1200/1251]  eta: 0:00:09  lr: 0.003065  min_lr: 0.003065  loss: 2.5340 (3.1051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7493)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 2.8233 (3.1035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7079 (0.7462)  time: 0.1459  data: 0.0013  max mem: 12911
Epoch: [109] Total time: 0:03:57 (0.1901 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 2.8233 (3.1001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7079 (0.7462)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7949 (0.7949)  acc1: 81.6000 (81.6000)  acc5: 96.8000 (96.8000)  time: 5.6246  data: 5.5290  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9233 (0.9625)  acc1: 79.6000 (78.9091)  acc5: 95.6000 (95.1636)  time: 0.7493  data: 0.6625  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1833 (1.1681)  acc1: 72.0000 (75.0667)  acc5: 92.0000 (92.3810)  time: 0.2098  data: 0.1262  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2606 (1.1814)  acc1: 72.4000 (74.6400)  acc5: 90.4000 (92.2400)  time: 0.2088  data: 0.1261  max mem: 12911
Test: Total time: 0:00:10 (0.4085 s / it)
* Acc@1 74.258 Acc@5 92.402 loss 1.180
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.55%
Epoch: [110]  [   0/1251]  eta: 1:05:36  lr: 0.003064  min_lr: 0.003064  loss: 2.6506 (2.6506)  weight_decay: 0.0500 (0.0500)  time: 3.1464  data: 2.8648  max mem: 12911
Epoch: [110]  [ 200/1251]  eta: 0:03:37  lr: 0.003061  min_lr: 0.003061  loss: 2.5503 (3.0605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.7906)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [110]  [ 400/1251]  eta: 0:02:47  lr: 0.003058  min_lr: 0.003058  loss: 2.5817 (3.0530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6887 (0.7728)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [110]  [ 600/1251]  eta: 0:02:06  lr: 0.003055  min_lr: 0.003055  loss: 3.0567 (3.1038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5937 (0.7254)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [110]  [ 800/1251]  eta: 0:01:26  lr: 0.003052  min_lr: 0.003052  loss: 2.5022 (3.0948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6357 (0.7461)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [110]  [1000/1251]  eta: 0:00:47  lr: 0.003049  min_lr: 0.003049  loss: 3.0013 (3.0968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6624 (0.7350)  time: 0.1873  data: 0.0003  max mem: 12911
Epoch: [110]  [1200/1251]  eta: 0:00:09  lr: 0.003046  min_lr: 0.003046  loss: 2.7088 (3.0999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6880 (0.7369)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 2.9034 (3.1035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6271 (0.7331)  time: 0.1475  data: 0.0006  max mem: 12911
Epoch: [110] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 2.9034 (3.0787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6271 (0.7331)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7806 (0.7806)  acc1: 82.4000 (82.4000)  acc5: 96.4000 (96.4000)  time: 5.4255  data: 5.2837  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8894 (0.9586)  acc1: 79.6000 (78.9091)  acc5: 96.0000 (95.2000)  time: 0.7477  data: 0.6477  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1943 (1.1673)  acc1: 72.8000 (74.1143)  acc5: 92.0000 (92.4762)  time: 0.2075  data: 0.1186  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3306 (1.1865)  acc1: 72.0000 (73.5840)  acc5: 91.2000 (92.4320)  time: 0.2035  data: 0.1185  max mem: 12911
Test: Total time: 0:00:09 (0.3986 s / it)
* Acc@1 74.150 Acc@5 92.422 loss 1.176
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.55%
Epoch: [111]  [   0/1251]  eta: 0:55:23  lr: 0.003045  min_lr: 0.003045  loss: 2.2578 (2.2578)  weight_decay: 0.0500 (0.0500)  time: 2.6569  data: 1.6734  max mem: 12911
Epoch: [111]  [ 200/1251]  eta: 0:03:35  lr: 0.003042  min_lr: 0.003042  loss: 3.9703 (3.0958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6455 (0.7316)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [111]  [ 400/1251]  eta: 0:02:48  lr: 0.003039  min_lr: 0.003039  loss: 3.2789 (3.0977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7179)  time: 0.1937  data: 0.0005  max mem: 12911
Epoch: [111]  [ 600/1251]  eta: 0:02:06  lr: 0.003036  min_lr: 0.003036  loss: 2.4390 (3.0945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6460 (0.7111)  time: 0.1875  data: 0.0006  max mem: 12911
Epoch: [111]  [ 800/1251]  eta: 0:01:26  lr: 0.003033  min_lr: 0.003033  loss: 2.7412 (3.1010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6320 (0.7033)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [111]  [1000/1251]  eta: 0:00:48  lr: 0.003030  min_lr: 0.003030  loss: 2.6519 (3.0934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.7023)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [111]  [1200/1251]  eta: 0:00:09  lr: 0.003027  min_lr: 0.003027  loss: 2.5429 (3.0951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.7177)  time: 0.1902  data: 0.0004  max mem: 12911
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 2.7811 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7093 (0.7193)  time: 0.1470  data: 0.0006  max mem: 12911
Epoch: [111] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 2.7811 (3.0909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7093 (0.7193)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8620 (0.8620)  acc1: 80.8000 (80.8000)  acc5: 97.6000 (97.6000)  time: 5.5459  data: 5.4529  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 1.0939 (1.0326)  acc1: 78.0000 (78.5818)  acc5: 96.0000 (95.4909)  time: 0.7259  data: 0.6426  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2488 (1.2158)  acc1: 74.0000 (74.9524)  acc5: 92.0000 (92.5524)  time: 0.2083  data: 0.1274  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3671 (1.2300)  acc1: 72.0000 (74.5120)  acc5: 90.8000 (92.4480)  time: 0.2069  data: 0.1274  max mem: 12911
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 74.364 Acc@5 92.520 loss 1.224
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.55%
Epoch: [112]  [   0/1251]  eta: 1:07:48  lr: 0.003026  min_lr: 0.003026  loss: 2.4814 (2.4814)  weight_decay: 0.0500 (0.0500)  time: 3.2520  data: 2.4031  max mem: 12911
Epoch: [112]  [ 200/1251]  eta: 0:03:34  lr: 0.003023  min_lr: 0.003023  loss: 3.3587 (2.9692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8868 (0.8754)  time: 0.1837  data: 0.0005  max mem: 12911
Epoch: [112]  [ 400/1251]  eta: 0:02:46  lr: 0.003020  min_lr: 0.003020  loss: 2.5792 (3.0224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (0.7925)  time: 0.1857  data: 0.0004  max mem: 12911
Epoch: [112]  [ 600/1251]  eta: 0:02:05  lr: 0.003017  min_lr: 0.003017  loss: 2.5829 (3.0250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6698 (0.7783)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [112]  [ 800/1251]  eta: 0:01:26  lr: 0.003014  min_lr: 0.003014  loss: 2.5965 (3.0373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.7764)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [112]  [1000/1251]  eta: 0:00:47  lr: 0.003011  min_lr: 0.003011  loss: 2.3788 (3.0383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (0.7813)  time: 0.1895  data: 0.0006  max mem: 12911
Epoch: [112]  [1200/1251]  eta: 0:00:09  lr: 0.003007  min_lr: 0.003007  loss: 2.5790 (3.0582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6325 (0.7643)  time: 0.1909  data: 0.0004  max mem: 12911
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 2.4877 (3.0584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7079 (0.7632)  time: 0.1468  data: 0.0010  max mem: 12911
Epoch: [112] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 2.4877 (3.0835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7079 (0.7632)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8060 (0.8060)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 5.4914  data: 5.3968  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9270 (0.9791)  acc1: 79.6000 (79.4545)  acc5: 95.6000 (95.0545)  time: 0.7255  data: 0.6319  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2170 (1.1823)  acc1: 73.6000 (74.7619)  acc5: 91.2000 (92.1333)  time: 0.2149  data: 0.1276  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3519 (1.1941)  acc1: 71.6000 (74.5440)  acc5: 90.4000 (92.0160)  time: 0.2189  data: 0.1335  max mem: 12911
Test: Total time: 0:00:10 (0.4126 s / it)
* Acc@1 74.502 Acc@5 92.318 loss 1.197
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.55%
Epoch: [113]  [   0/1251]  eta: 1:04:23  lr: 0.003007  min_lr: 0.003007  loss: 4.5648 (4.5648)  weight_decay: 0.0500 (0.0500)  time: 3.0880  data: 2.3794  max mem: 12911
Epoch: [113]  [ 200/1251]  eta: 0:03:34  lr: 0.003004  min_lr: 0.003004  loss: 2.7697 (3.0703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.7654)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [113]  [ 400/1251]  eta: 0:02:46  lr: 0.003000  min_lr: 0.003000  loss: 2.6307 (3.0799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6103 (0.7511)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [113]  [ 600/1251]  eta: 0:02:05  lr: 0.002997  min_lr: 0.002997  loss: 3.4741 (3.1096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7350)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [113]  [ 800/1251]  eta: 0:01:26  lr: 0.002994  min_lr: 0.002994  loss: 2.5111 (3.0853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6078 (0.7164)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [113]  [1000/1251]  eta: 0:00:47  lr: 0.002991  min_lr: 0.002991  loss: 2.6114 (3.0966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5580 (0.7269)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [113]  [1200/1251]  eta: 0:00:09  lr: 0.002988  min_lr: 0.002988  loss: 2.9993 (3.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.7190)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 2.6567 (3.0900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7052 (0.7283)  time: 0.1466  data: 0.0011  max mem: 12911
Epoch: [113] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 2.6567 (3.0900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7052 (0.7283)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7852 (0.7852)  acc1: 82.8000 (82.8000)  acc5: 97.6000 (97.6000)  time: 5.6713  data: 5.5793  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9008 (0.9138)  acc1: 80.4000 (79.7091)  acc5: 96.4000 (95.5273)  time: 0.7092  data: 0.6155  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1606 (1.1307)  acc1: 73.2000 (75.0857)  acc5: 92.4000 (92.9333)  time: 0.1962  data: 0.1086  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2181 (1.1381)  acc1: 72.4000 (74.7520)  acc5: 92.4000 (93.0400)  time: 0.2135  data: 0.1280  max mem: 12911
Test: Total time: 0:00:10 (0.4154 s / it)
* Acc@1 74.674 Acc@5 92.602 loss 1.142
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.67%
Epoch: [114]  [   0/1251]  eta: 1:06:04  lr: 0.002987  min_lr: 0.002987  loss: 2.8118 (2.8118)  weight_decay: 0.0500 (0.0500)  time: 3.1691  data: 2.9110  max mem: 12911
Epoch: [114]  [ 200/1251]  eta: 0:03:33  lr: 0.002984  min_lr: 0.002984  loss: 2.4366 (3.0049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7553 (0.7884)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [114]  [ 400/1251]  eta: 0:02:45  lr: 0.002981  min_lr: 0.002981  loss: 2.5923 (3.0135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6517 (0.7210)  time: 0.1860  data: 0.0006  max mem: 12911
Epoch: [114]  [ 600/1251]  eta: 0:02:05  lr: 0.002978  min_lr: 0.002978  loss: 3.0272 (3.0559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7003 (0.7292)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [114]  [ 800/1251]  eta: 0:01:26  lr: 0.002975  min_lr: 0.002975  loss: 2.5787 (3.0718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6436 (0.7616)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [114]  [1000/1251]  eta: 0:00:48  lr: 0.002972  min_lr: 0.002972  loss: 2.5661 (3.0717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7824)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [114]  [1200/1251]  eta: 0:00:09  lr: 0.002968  min_lr: 0.002968  loss: 3.2471 (3.0719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.7573)  time: 0.1903  data: 0.0005  max mem: 12911
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 2.6040 (3.0745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (0.7552)  time: 0.1461  data: 0.0007  max mem: 12911
Epoch: [114] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 2.6040 (3.0992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (0.7552)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7665 (0.7665)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.4877  data: 5.3962  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9290 (0.9658)  acc1: 80.0000 (79.9273)  acc5: 96.0000 (95.6727)  time: 0.7292  data: 0.6344  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1713 (1.1766)  acc1: 72.8000 (75.0095)  acc5: 92.0000 (92.8571)  time: 0.2165  data: 0.1288  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3432 (1.1937)  acc1: 71.6000 (74.4800)  acc5: 90.8000 (92.6720)  time: 0.2146  data: 0.1287  max mem: 12911
Test: Total time: 0:00:10 (0.4095 s / it)
* Acc@1 74.060 Acc@5 92.414 loss 1.206
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.67%
Epoch: [115]  [   0/1251]  eta: 1:08:11  lr: 0.002968  min_lr: 0.002968  loss: 4.1929 (4.1929)  weight_decay: 0.0500 (0.0500)  time: 3.2705  data: 2.5567  max mem: 12911
Epoch: [115]  [ 200/1251]  eta: 0:03:35  lr: 0.002965  min_lr: 0.002965  loss: 3.3639 (3.1399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6603 (0.7307)  time: 0.1870  data: 0.0006  max mem: 12911
Epoch: [115]  [ 400/1251]  eta: 0:02:46  lr: 0.002961  min_lr: 0.002961  loss: 2.6994 (3.1232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6617 (0.7622)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [115]  [ 600/1251]  eta: 0:02:06  lr: 0.002958  min_lr: 0.002958  loss: 2.5071 (3.1315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6245 (0.7297)  time: 0.1930  data: 0.0004  max mem: 12911
Epoch: [115]  [ 800/1251]  eta: 0:01:26  lr: 0.002955  min_lr: 0.002955  loss: 3.9361 (3.1418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7344 (0.7336)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [115]  [1000/1251]  eta: 0:00:48  lr: 0.002952  min_lr: 0.002952  loss: 2.6663 (3.1490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (0.7541)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [115]  [1200/1251]  eta: 0:00:09  lr: 0.002949  min_lr: 0.002949  loss: 2.4417 (3.1312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6432 (0.7502)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 2.9794 (3.1316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7530)  time: 0.1466  data: 0.0007  max mem: 12911
Epoch: [115] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 2.9794 (3.0796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7530)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8236 (0.8236)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.4022  data: 5.3107  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9365 (0.9632)  acc1: 79.6000 (79.4182)  acc5: 96.0000 (95.4909)  time: 0.7246  data: 0.6342  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1721 (1.1660)  acc1: 73.6000 (75.0095)  acc5: 90.8000 (92.6286)  time: 0.2113  data: 0.1263  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2881 (1.1804)  acc1: 73.6000 (74.6080)  acc5: 90.4000 (92.4160)  time: 0.2199  data: 0.1359  max mem: 12911
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 74.294 Acc@5 92.462 loss 1.186
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.67%
Epoch: [116]  [   0/1251]  eta: 1:01:23  lr: 0.002948  min_lr: 0.002948  loss: 2.4714 (2.4714)  weight_decay: 0.0500 (0.0500)  time: 2.9448  data: 2.5465  max mem: 12911
Epoch: [116]  [ 200/1251]  eta: 0:03:32  lr: 0.002945  min_lr: 0.002945  loss: 2.7213 (3.1859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5936 (0.7359)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [116]  [ 400/1251]  eta: 0:02:45  lr: 0.002942  min_lr: 0.002942  loss: 2.6081 (3.0917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6812 (0.6909)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [116]  [ 600/1251]  eta: 0:02:05  lr: 0.002938  min_lr: 0.002938  loss: 3.3198 (3.0624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1898 (0.8003)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [116]  [ 800/1251]  eta: 0:01:26  lr: 0.002935  min_lr: 0.002935  loss: 2.7987 (3.0846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5605 (0.7577)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [116]  [1000/1251]  eta: 0:00:47  lr: 0.002932  min_lr: 0.002932  loss: 2.4456 (3.0645)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1858  data: 0.0005  max mem: 12911
Epoch: [116]  [1200/1251]  eta: 0:00:09  lr: 0.002929  min_lr: 0.002929  loss: 2.6020 (3.0638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6802 (nan)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 2.7750 (3.0633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6908 (nan)  time: 0.1458  data: 0.0010  max mem: 12911
Epoch: [116] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 2.7750 (3.0812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6908 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7620 (0.7620)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.6606  data: 5.5566  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9054 (0.9580)  acc1: 78.8000 (79.1273)  acc5: 96.0000 (95.4546)  time: 0.6920  data: 0.5996  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1887 (1.1445)  acc1: 72.0000 (74.8952)  acc5: 92.4000 (92.8000)  time: 0.1822  data: 0.0968  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2561 (1.1535)  acc1: 72.0000 (74.6240)  acc5: 91.6000 (92.7040)  time: 0.1847  data: 0.1003  max mem: 12911
Test: Total time: 0:00:09 (0.3933 s / it)
* Acc@1 74.716 Acc@5 92.586 loss 1.157
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.72%
Epoch: [117]  [   0/1251]  eta: 1:10:14  lr: 0.002928  min_lr: 0.002928  loss: 4.0867 (4.0867)  weight_decay: 0.0500 (0.0500)  time: 3.3688  data: 3.1379  max mem: 12911
Epoch: [117]  [ 200/1251]  eta: 0:03:32  lr: 0.002925  min_lr: 0.002925  loss: 3.1661 (3.1061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7202)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [117]  [ 400/1251]  eta: 0:02:46  lr: 0.002922  min_lr: 0.002922  loss: 2.7086 (3.1087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6215 (0.6884)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [117]  [ 600/1251]  eta: 0:02:05  lr: 0.002919  min_lr: 0.002919  loss: 3.1428 (3.1054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6169 (0.6807)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [117]  [ 800/1251]  eta: 0:01:26  lr: 0.002915  min_lr: 0.002915  loss: 2.4222 (3.1082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6317 (0.6827)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [117]  [1000/1251]  eta: 0:00:48  lr: 0.002912  min_lr: 0.002912  loss: 2.4315 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.6988)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [117]  [1200/1251]  eta: 0:00:09  lr: 0.002909  min_lr: 0.002909  loss: 3.2817 (3.0847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6078 (0.7009)  time: 0.1915  data: 0.0006  max mem: 12911
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 3.0848 (3.0833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (0.7005)  time: 0.1462  data: 0.0009  max mem: 12911
Epoch: [117] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 3.0848 (3.0670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (0.7005)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.8167 (0.8167)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.3020  data: 5.2058  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9088 (0.9814)  acc1: 81.2000 (78.9818)  acc5: 96.0000 (95.6727)  time: 0.7206  data: 0.6233  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2182 (1.1717)  acc1: 72.8000 (75.1048)  acc5: 91.6000 (92.8762)  time: 0.2014  data: 0.1129  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3358 (1.1884)  acc1: 72.0000 (74.7040)  acc5: 90.8000 (92.7360)  time: 0.2008  data: 0.1163  max mem: 12911
Test: Total time: 0:00:09 (0.3924 s / it)
* Acc@1 74.532 Acc@5 92.640 loss 1.199
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.72%
Epoch: [118]  [   0/1251]  eta: 1:07:34  lr: 0.002908  min_lr: 0.002908  loss: 4.3738 (4.3738)  weight_decay: 0.0500 (0.0500)  time: 3.2412  data: 2.7332  max mem: 12911
Epoch: [118]  [ 200/1251]  eta: 0:03:36  lr: 0.002905  min_lr: 0.002905  loss: 2.7477 (3.0540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7427 (0.8193)  time: 0.1879  data: 0.0006  max mem: 12911
Epoch: [118]  [ 400/1251]  eta: 0:02:47  lr: 0.002902  min_lr: 0.002902  loss: 2.5358 (3.0574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.7754)  time: 0.1915  data: 0.0005  max mem: 12911
Epoch: [118]  [ 600/1251]  eta: 0:02:06  lr: 0.002899  min_lr: 0.002899  loss: 2.5813 (3.0824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8023 (0.7830)  time: 0.1882  data: 0.0003  max mem: 12911
Epoch: [118]  [ 800/1251]  eta: 0:01:26  lr: 0.002895  min_lr: 0.002895  loss: 2.5936 (3.0663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6096 (0.7670)  time: 0.1925  data: 0.0003  max mem: 12911
Epoch: [118]  [1000/1251]  eta: 0:00:48  lr: 0.002892  min_lr: 0.002892  loss: 3.3903 (3.0625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6293 (0.7497)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [118]  [1200/1251]  eta: 0:00:09  lr: 0.002889  min_lr: 0.002889  loss: 2.6219 (3.0722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7414 (0.7568)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 3.4029 (3.0762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7525)  time: 0.1462  data: 0.0007  max mem: 12911
Epoch: [118] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 3.4029 (3.0812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7525)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7558 (0.7558)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.6520  data: 5.5277  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9243 (0.9508)  acc1: 79.2000 (79.4545)  acc5: 96.0000 (95.6000)  time: 0.7348  data: 0.6476  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1768 (1.1757)  acc1: 72.8000 (74.6286)  acc5: 92.8000 (92.7429)  time: 0.2024  data: 0.1210  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3641 (1.1882)  acc1: 70.4000 (74.3360)  acc5: 91.6000 (92.7200)  time: 0.2005  data: 0.1209  max mem: 12911
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 74.526 Acc@5 92.462 loss 1.192
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.72%
Epoch: [119]  [   0/1251]  eta: 1:00:29  lr: 0.002888  min_lr: 0.002888  loss: 2.3525 (2.3525)  weight_decay: 0.0500 (0.0500)  time: 2.9015  data: 2.5420  max mem: 12911
Epoch: [119]  [ 200/1251]  eta: 0:03:38  lr: 0.002885  min_lr: 0.002885  loss: 3.2079 (3.0091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6853 (0.7168)  time: 0.1871  data: 0.0008  max mem: 12911
Epoch: [119]  [ 400/1251]  eta: 0:02:48  lr: 0.002882  min_lr: 0.002882  loss: 2.6266 (3.0127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.6879)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [119]  [ 600/1251]  eta: 0:02:06  lr: 0.002879  min_lr: 0.002879  loss: 2.5454 (3.0405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.6936)  time: 0.1907  data: 0.0004  max mem: 12911
Epoch: [119]  [ 800/1251]  eta: 0:01:27  lr: 0.002875  min_lr: 0.002875  loss: 2.5678 (3.0388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8434 (0.7265)  time: 0.1894  data: 0.0006  max mem: 12911
Epoch: [119]  [1000/1251]  eta: 0:00:48  lr: 0.002872  min_lr: 0.002872  loss: 2.8983 (3.0424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5888 (0.7275)  time: 0.1827  data: 0.0004  max mem: 12911
Epoch: [119]  [1200/1251]  eta: 0:00:09  lr: 0.002869  min_lr: 0.002869  loss: 2.7464 (3.0533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7151 (0.7331)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 2.6690 (3.0547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.7341)  time: 0.1461  data: 0.0009  max mem: 12911
Epoch: [119] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 2.6690 (3.0579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.7341)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7284 (0.7284)  acc1: 86.4000 (86.4000)  acc5: 96.8000 (96.8000)  time: 5.7593  data: 5.6676  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8678 (0.9253)  acc1: 82.8000 (80.6545)  acc5: 96.0000 (95.1273)  time: 0.7634  data: 0.6785  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1297 (1.1235)  acc1: 72.8000 (75.7714)  acc5: 91.6000 (92.6857)  time: 0.1958  data: 0.1128  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2361 (1.1339)  acc1: 72.0000 (75.3280)  acc5: 91.6000 (92.6560)  time: 0.1937  data: 0.1128  max mem: 12911
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 74.630 Acc@5 92.610 loss 1.141
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.72%
Epoch: [120]  [   0/1251]  eta: 1:03:37  lr: 0.002868  min_lr: 0.002868  loss: 2.4988 (2.4988)  weight_decay: 0.0500 (0.0500)  time: 3.0515  data: 1.6889  max mem: 12911
Epoch: [120]  [ 200/1251]  eta: 0:03:34  lr: 0.002865  min_lr: 0.002865  loss: 2.4254 (2.9640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7623 (0.7617)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [120]  [ 400/1251]  eta: 0:02:46  lr: 0.002862  min_lr: 0.002862  loss: 2.4855 (3.0248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7309 (0.7537)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [120]  [ 600/1251]  eta: 0:02:05  lr: 0.002858  min_lr: 0.002858  loss: 2.6324 (3.0425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6410 (0.7400)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [120]  [ 800/1251]  eta: 0:01:26  lr: 0.002855  min_lr: 0.002855  loss: 2.6192 (3.0358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6097 (0.7250)  time: 0.1914  data: 0.0005  max mem: 12911
Epoch: [120]  [1000/1251]  eta: 0:00:48  lr: 0.002852  min_lr: 0.002852  loss: 3.6312 (3.0644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6457 (0.7252)  time: 0.1833  data: 0.0004  max mem: 12911
Epoch: [120]  [1200/1251]  eta: 0:00:09  lr: 0.002849  min_lr: 0.002849  loss: 2.5929 (3.0653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6220 (0.7287)  time: 0.1929  data: 0.0005  max mem: 12911
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 2.8115 (3.0650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8640 (0.7382)  time: 0.1466  data: 0.0008  max mem: 12911
Epoch: [120] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 2.8115 (3.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8640 (0.7382)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7937 (0.7937)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.4536  data: 5.3517  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9032 (0.9442)  acc1: 80.0000 (79.3818)  acc5: 96.4000 (95.3091)  time: 0.7258  data: 0.6334  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1753 (1.1387)  acc1: 72.8000 (75.1238)  acc5: 91.6000 (92.8000)  time: 0.2172  data: 0.1312  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3275 (1.1530)  acc1: 70.8000 (74.7840)  acc5: 90.4000 (92.6560)  time: 0.2161  data: 0.1311  max mem: 12911
Test: Total time: 0:00:10 (0.4078 s / it)
* Acc@1 74.670 Acc@5 92.670 loss 1.146
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.72%
Epoch: [121]  [   0/1251]  eta: 1:06:50  lr: 0.002848  min_lr: 0.002848  loss: 2.3113 (2.3113)  weight_decay: 0.0500 (0.0500)  time: 3.2057  data: 2.7967  max mem: 12911
Epoch: [121]  [ 200/1251]  eta: 0:03:35  lr: 0.002845  min_lr: 0.002845  loss: 2.5812 (3.0155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6313 (0.7121)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [121]  [ 400/1251]  eta: 0:02:47  lr: 0.002841  min_lr: 0.002841  loss: 2.8884 (3.0713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6619 (0.6916)  time: 0.1902  data: 0.0003  max mem: 12911
Epoch: [121]  [ 600/1251]  eta: 0:02:06  lr: 0.002838  min_lr: 0.002838  loss: 3.0400 (3.0704)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1985  data: 0.0004  max mem: 12911
Epoch: [121]  [ 800/1251]  eta: 0:01:27  lr: 0.002835  min_lr: 0.002835  loss: 2.4946 (3.0603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6766 (nan)  time: 0.2024  data: 0.0006  max mem: 12911
Epoch: [121]  [1000/1251]  eta: 0:00:48  lr: 0.002831  min_lr: 0.002831  loss: 2.5221 (3.0416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (nan)  time: 0.1850  data: 0.0003  max mem: 12911
Epoch: [121]  [1200/1251]  eta: 0:00:09  lr: 0.002828  min_lr: 0.002828  loss: 2.5516 (3.0529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (nan)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 3.8557 (3.0567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (nan)  time: 0.1473  data: 0.0008  max mem: 12911
Epoch: [121] Total time: 0:03:59 (0.1917 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 3.8557 (3.0710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7233 (0.7233)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.5516  data: 5.4600  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9418 (0.9749)  acc1: 79.6000 (79.2727)  acc5: 96.0000 (95.9636)  time: 0.7760  data: 0.6825  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2049 (1.1982)  acc1: 74.0000 (75.1048)  acc5: 92.8000 (92.9333)  time: 0.2249  data: 0.1381  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3471 (1.2122)  acc1: 72.8000 (74.7360)  acc5: 90.4000 (92.8000)  time: 0.2233  data: 0.1380  max mem: 12911
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 74.708 Acc@5 92.716 loss 1.210
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.72%
Epoch: [122]  [   0/1251]  eta: 1:07:23  lr: 0.002827  min_lr: 0.002827  loss: 2.3948 (2.3948)  weight_decay: 0.0500 (0.0500)  time: 3.2321  data: 1.6598  max mem: 12911
Epoch: [122]  [ 200/1251]  eta: 0:03:35  lr: 0.002824  min_lr: 0.002824  loss: 2.3813 (3.0458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7971 (0.8103)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [122]  [ 400/1251]  eta: 0:02:47  lr: 0.002821  min_lr: 0.002821  loss: 2.6044 (3.0487)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [122]  [ 600/1251]  eta: 0:02:05  lr: 0.002818  min_lr: 0.002818  loss: 2.4552 (3.0649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (nan)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [122]  [ 800/1251]  eta: 0:01:26  lr: 0.002814  min_lr: 0.002814  loss: 2.5446 (3.0380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (nan)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [122]  [1000/1251]  eta: 0:00:48  lr: 0.002811  min_lr: 0.002811  loss: 3.2083 (3.0381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7930 (nan)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [122]  [1200/1251]  eta: 0:00:09  lr: 0.002808  min_lr: 0.002808  loss: 2.5089 (3.0317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8115 (nan)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 2.7660 (3.0423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6871 (nan)  time: 0.1469  data: 0.0006  max mem: 12911
Epoch: [122] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 2.7660 (3.0628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6871 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8000 (0.8000)  acc1: 84.4000 (84.4000)  acc5: 98.0000 (98.0000)  time: 5.4730  data: 5.3791  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0248 (1.0013)  acc1: 79.2000 (79.4182)  acc5: 94.8000 (95.0182)  time: 0.7549  data: 0.6613  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2556 (1.2086)  acc1: 73.6000 (74.9905)  acc5: 91.6000 (92.4762)  time: 0.2157  data: 0.1283  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3450 (1.2291)  acc1: 70.0000 (74.3520)  acc5: 90.4000 (92.2880)  time: 0.2143  data: 0.1283  max mem: 12911
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 74.394 Acc@5 92.406 loss 1.220
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.72%
Epoch: [123]  [   0/1251]  eta: 1:05:58  lr: 0.002807  min_lr: 0.002807  loss: 3.7589 (3.7589)  weight_decay: 0.0500 (0.0500)  time: 3.1642  data: 1.5867  max mem: 12911
Epoch: [123]  [ 200/1251]  eta: 0:03:36  lr: 0.002804  min_lr: 0.002804  loss: 2.4471 (3.0268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8093 (0.7463)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [123]  [ 400/1251]  eta: 0:02:46  lr: 0.002800  min_lr: 0.002800  loss: 2.6305 (3.0578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5635 (0.7124)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [123]  [ 600/1251]  eta: 0:02:05  lr: 0.002797  min_lr: 0.002797  loss: 2.7486 (3.0315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (0.7228)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [123]  [ 800/1251]  eta: 0:01:26  lr: 0.002794  min_lr: 0.002794  loss: 2.6068 (3.0325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6421 (0.7072)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [123]  [1000/1251]  eta: 0:00:47  lr: 0.002790  min_lr: 0.002790  loss: 2.8696 (3.0474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7050)  time: 0.1913  data: 0.0005  max mem: 12911
Epoch: [123]  [1200/1251]  eta: 0:00:09  lr: 0.002787  min_lr: 0.002787  loss: 2.4424 (3.0462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6295 (0.7060)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 2.9575 (3.0494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.7058)  time: 0.1462  data: 0.0007  max mem: 12911
Epoch: [123] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 2.9575 (3.0687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.7058)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7717 (0.7717)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.4961  data: 5.3679  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8850 (0.9624)  acc1: 80.8000 (79.6000)  acc5: 96.0000 (95.3818)  time: 0.7632  data: 0.6639  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2135 (1.1850)  acc1: 72.0000 (74.9524)  acc5: 91.6000 (92.3238)  time: 0.2141  data: 0.1256  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3514 (1.1851)  acc1: 72.0000 (74.8320)  acc5: 91.2000 (92.3840)  time: 0.2101  data: 0.1255  max mem: 12911
Test: Total time: 0:00:10 (0.4067 s / it)
* Acc@1 74.756 Acc@5 92.592 loss 1.177
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.76%
Epoch: [124]  [   0/1251]  eta: 0:57:27  lr: 0.002786  min_lr: 0.002786  loss: 2.5199 (2.5199)  weight_decay: 0.0500 (0.0500)  time: 2.7560  data: 2.4884  max mem: 12911
Epoch: [124]  [ 200/1251]  eta: 0:03:30  lr: 0.002783  min_lr: 0.002783  loss: 3.6795 (3.0790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7655 (0.8100)  time: 0.1866  data: 0.0006  max mem: 12911
Epoch: [124]  [ 400/1251]  eta: 0:02:44  lr: 0.002780  min_lr: 0.002780  loss: 2.6688 (3.0721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5652 (0.7618)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [124]  [ 600/1251]  eta: 0:02:04  lr: 0.002776  min_lr: 0.002776  loss: 3.2692 (3.0378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5986 (0.7456)  time: 0.1882  data: 0.0004  max mem: 12911
Epoch: [124]  [ 800/1251]  eta: 0:01:25  lr: 0.002773  min_lr: 0.002773  loss: 2.4777 (3.0409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5655 (0.7306)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [124]  [1000/1251]  eta: 0:00:47  lr: 0.002770  min_lr: 0.002770  loss: 2.4964 (3.0555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.7402)  time: 0.1970  data: 0.0004  max mem: 12911
Epoch: [124]  [1200/1251]  eta: 0:00:09  lr: 0.002766  min_lr: 0.002766  loss: 2.7373 (3.0611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.7383)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 2.4536 (3.0554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.7420)  time: 0.1466  data: 0.0014  max mem: 12911
Epoch: [124] Total time: 0:03:56 (0.1893 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 2.4536 (3.0603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.7420)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7566 (0.7566)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.4731  data: 5.3805  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8261 (0.9300)  acc1: 80.0000 (79.6364)  acc5: 95.6000 (95.5273)  time: 0.7351  data: 0.6394  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1872 (1.1628)  acc1: 72.0000 (74.4000)  acc5: 91.6000 (92.5905)  time: 0.2097  data: 0.1221  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3072 (1.1703)  acc1: 70.0000 (74.0800)  acc5: 90.8000 (92.5280)  time: 0.2065  data: 0.1220  max mem: 12911
Test: Total time: 0:00:10 (0.4026 s / it)
* Acc@1 74.316 Acc@5 92.456 loss 1.167
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.76%
Epoch: [125]  [   0/1251]  eta: 0:57:09  lr: 0.002766  min_lr: 0.002766  loss: 3.1113 (3.1113)  weight_decay: 0.0500 (0.0500)  time: 2.7413  data: 2.0313  max mem: 12911
Epoch: [125]  [ 200/1251]  eta: 0:03:35  lr: 0.002762  min_lr: 0.002762  loss: 2.4280 (3.0517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6211 (0.6648)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [125]  [ 400/1251]  eta: 0:02:46  lr: 0.002759  min_lr: 0.002759  loss: 3.5967 (3.1064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7056 (0.7205)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [125]  [ 600/1251]  eta: 0:02:05  lr: 0.002756  min_lr: 0.002756  loss: 3.0609 (3.1032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5871 (0.7066)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [125]  [ 800/1251]  eta: 0:01:26  lr: 0.002752  min_lr: 0.002752  loss: 2.6386 (3.0927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7096 (0.7111)  time: 0.1924  data: 0.0004  max mem: 12911
Epoch: [125]  [1000/1251]  eta: 0:00:47  lr: 0.002749  min_lr: 0.002749  loss: 2.9178 (3.0769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6613 (0.7177)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [125]  [1200/1251]  eta: 0:00:09  lr: 0.002746  min_lr: 0.002746  loss: 2.4584 (3.0768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6616 (0.7151)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 2.9884 (3.0804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6463 (0.7122)  time: 0.1468  data: 0.0007  max mem: 12911
Epoch: [125] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 2.9884 (3.0686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6463 (0.7122)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7669 (0.7669)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.7364  data: 5.6438  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9253 (0.9593)  acc1: 82.4000 (80.4000)  acc5: 96.4000 (95.3818)  time: 0.7643  data: 0.6691  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1876 (1.1702)  acc1: 72.8000 (75.3524)  acc5: 91.6000 (92.8571)  time: 0.2246  data: 0.1365  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2976 (1.1802)  acc1: 70.8000 (74.8800)  acc5: 90.8000 (92.8000)  time: 0.2220  data: 0.1364  max mem: 12911
Test: Total time: 0:00:10 (0.4246 s / it)
* Acc@1 74.852 Acc@5 92.758 loss 1.178
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.85%
Epoch: [126]  [   0/1251]  eta: 0:58:18  lr: 0.002745  min_lr: 0.002745  loss: 2.4674 (2.4674)  weight_decay: 0.0500 (0.0500)  time: 2.7964  data: 2.5311  max mem: 12911
Epoch: [126]  [ 200/1251]  eta: 0:03:33  lr: 0.002742  min_lr: 0.002742  loss: 2.4634 (2.9933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7223)  time: 0.1899  data: 0.0005  max mem: 12911
Epoch: [126]  [ 400/1251]  eta: 0:02:46  lr: 0.002738  min_lr: 0.002738  loss: 2.4989 (3.0153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7109 (0.7655)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [126]  [ 600/1251]  eta: 0:02:06  lr: 0.002735  min_lr: 0.002735  loss: 3.2334 (3.0128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7500 (0.7513)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [126]  [ 800/1251]  eta: 0:01:26  lr: 0.002732  min_lr: 0.002732  loss: 2.4948 (3.0493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7321 (0.7495)  time: 0.1919  data: 0.0005  max mem: 12911
Epoch: [126]  [1000/1251]  eta: 0:00:48  lr: 0.002728  min_lr: 0.002728  loss: 2.4390 (3.0489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6464 (0.7744)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [126]  [1200/1251]  eta: 0:00:09  lr: 0.002725  min_lr: 0.002725  loss: 2.4273 (3.0438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7557)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 2.4761 (3.0457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6751 (0.7563)  time: 0.1456  data: 0.0009  max mem: 12911
Epoch: [126] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 2.4761 (3.0413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6751 (0.7563)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8000 (0.8000)  acc1: 83.6000 (83.6000)  acc5: 96.0000 (96.0000)  time: 5.6575  data: 5.5132  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9495 (0.9738)  acc1: 80.8000 (80.2182)  acc5: 96.0000 (95.2727)  time: 0.7499  data: 0.6492  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1915 (1.1829)  acc1: 73.6000 (75.1048)  acc5: 91.2000 (92.4952)  time: 0.1979  data: 0.1100  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3558 (1.1991)  acc1: 71.2000 (74.4960)  acc5: 91.2000 (92.6240)  time: 0.2102  data: 0.1256  max mem: 12911
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 74.770 Acc@5 92.752 loss 1.188
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.85%
Epoch: [127]  [   0/1251]  eta: 1:02:04  lr: 0.002724  min_lr: 0.002724  loss: 2.1703 (2.1703)  weight_decay: 0.0500 (0.0500)  time: 2.9770  data: 2.7182  max mem: 12911
Epoch: [127]  [ 200/1251]  eta: 0:03:34  lr: 0.002721  min_lr: 0.002721  loss: 2.3235 (3.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6445 (0.6970)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [127]  [ 400/1251]  eta: 0:02:47  lr: 0.002717  min_lr: 0.002717  loss: 2.8032 (2.9681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.7082)  time: 0.1923  data: 0.0004  max mem: 12911
Epoch: [127]  [ 600/1251]  eta: 0:02:06  lr: 0.002714  min_lr: 0.002714  loss: 2.7810 (2.9849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (0.7813)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [127]  [ 800/1251]  eta: 0:01:26  lr: 0.002711  min_lr: 0.002711  loss: 2.5355 (2.9969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6351 (0.7882)  time: 0.1902  data: 0.0004  max mem: 12911
Epoch: [127]  [1000/1251]  eta: 0:00:48  lr: 0.002707  min_lr: 0.002707  loss: 2.6871 (2.9998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6661 (0.7680)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [127]  [1200/1251]  eta: 0:00:09  lr: 0.002704  min_lr: 0.002704  loss: 2.3481 (3.0066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6163 (0.7482)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 2.6988 (3.0094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6797 (0.7550)  time: 0.1463  data: 0.0008  max mem: 12911
Epoch: [127] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 2.6988 (3.0526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6797 (0.7550)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6932 (0.6932)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 5.2944  data: 5.2028  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8808 (0.8930)  acc1: 80.8000 (79.6727)  acc5: 96.0000 (95.6727)  time: 0.7417  data: 0.6451  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1543 (1.1087)  acc1: 72.0000 (75.2000)  acc5: 91.6000 (92.8762)  time: 0.2227  data: 0.1329  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2365 (1.1121)  acc1: 72.0000 (74.9120)  acc5: 91.2000 (92.8160)  time: 0.2199  data: 0.1329  max mem: 12911
Test: Total time: 0:00:10 (0.4065 s / it)
* Acc@1 75.172 Acc@5 92.822 loss 1.106
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.17%
Epoch: [128]  [   0/1251]  eta: 1:07:02  lr: 0.002703  min_lr: 0.002703  loss: 2.1211 (2.1211)  weight_decay: 0.0500 (0.0500)  time: 3.2155  data: 2.9684  max mem: 12911
Epoch: [128]  [ 200/1251]  eta: 0:03:35  lr: 0.002700  min_lr: 0.002700  loss: 2.8427 (2.9886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7936)  time: 0.1928  data: 0.0004  max mem: 12911
Epoch: [128]  [ 400/1251]  eta: 0:02:48  lr: 0.002696  min_lr: 0.002696  loss: 3.2244 (3.0302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7814)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [128]  [ 600/1251]  eta: 0:02:06  lr: 0.002693  min_lr: 0.002693  loss: 2.4120 (3.0028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6794 (0.7749)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [128]  [ 800/1251]  eta: 0:01:26  lr: 0.002690  min_lr: 0.002690  loss: 3.9016 (3.0232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8174 (0.7658)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [128]  [1000/1251]  eta: 0:00:48  lr: 0.002686  min_lr: 0.002686  loss: 2.9976 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6286 (nan)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [128]  [1200/1251]  eta: 0:00:09  lr: 0.002683  min_lr: 0.002683  loss: 3.1577 (3.0303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6497 (nan)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 2.7922 (3.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (nan)  time: 0.1457  data: 0.0009  max mem: 12911
Epoch: [128] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 2.7922 (3.0433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7313 (0.7313)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.5061  data: 5.3658  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8611 (0.9269)  acc1: 82.0000 (80.4727)  acc5: 96.4000 (95.6000)  time: 0.7448  data: 0.6435  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1476 (1.1329)  acc1: 73.6000 (75.5048)  acc5: 92.0000 (92.9524)  time: 0.2138  data: 0.1254  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2910 (1.1528)  acc1: 72.8000 (74.9600)  acc5: 92.0000 (92.7200)  time: 0.2103  data: 0.1257  max mem: 12911
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 74.890 Acc@5 92.774 loss 1.148
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 75.17%
Epoch: [129]  [   0/1251]  eta: 1:06:10  lr: 0.002682  min_lr: 0.002682  loss: 3.9581 (3.9581)  weight_decay: 0.0500 (0.0500)  time: 3.1742  data: 2.4499  max mem: 12911
Epoch: [129]  [ 200/1251]  eta: 0:03:36  lr: 0.002679  min_lr: 0.002679  loss: 2.3318 (2.9954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.7234)  time: 0.1946  data: 0.0005  max mem: 12911
Epoch: [129]  [ 400/1251]  eta: 0:02:47  lr: 0.002675  min_lr: 0.002675  loss: 2.5608 (3.0321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7267 (0.7151)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [129]  [ 600/1251]  eta: 0:02:06  lr: 0.002672  min_lr: 0.002672  loss: 2.5086 (3.0433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.7192)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [129]  [ 800/1251]  eta: 0:01:26  lr: 0.002668  min_lr: 0.002668  loss: 3.6443 (3.0477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8060 (0.7429)  time: 0.1842  data: 0.0004  max mem: 12911
Epoch: [129]  [1000/1251]  eta: 0:00:48  lr: 0.002665  min_lr: 0.002665  loss: 3.2546 (3.0456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7898 (0.7560)  time: 0.1882  data: 0.0004  max mem: 12911
Epoch: [129]  [1200/1251]  eta: 0:00:09  lr: 0.002662  min_lr: 0.002662  loss: 2.8881 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.7493)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 4.0237 (3.0589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.7523)  time: 0.1458  data: 0.0008  max mem: 12911
Epoch: [129] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 4.0237 (3.0426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.7523)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7644 (0.7644)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.8720  data: 5.7802  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9868 (1.0142)  acc1: 81.2000 (79.2000)  acc5: 96.0000 (95.2000)  time: 0.7560  data: 0.6594  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2880 (1.2457)  acc1: 71.6000 (74.7810)  acc5: 90.8000 (92.4000)  time: 0.2078  data: 0.1195  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3304 (1.2560)  acc1: 71.6000 (74.5280)  acc5: 90.8000 (92.3520)  time: 0.2039  data: 0.1194  max mem: 12911
Test: Total time: 0:00:10 (0.4170 s / it)
* Acc@1 74.632 Acc@5 92.610 loss 1.251
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 75.17%
Epoch: [130]  [   0/1251]  eta: 1:06:43  lr: 0.002661  min_lr: 0.002661  loss: 2.2228 (2.2228)  weight_decay: 0.0500 (0.0500)  time: 3.2000  data: 2.4037  max mem: 12911
Epoch: [130]  [ 200/1251]  eta: 0:03:33  lr: 0.002657  min_lr: 0.002657  loss: 2.7468 (2.9482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.6994)  time: 0.1844  data: 0.0004  max mem: 12911
Epoch: [130]  [ 400/1251]  eta: 0:02:46  lr: 0.002654  min_lr: 0.002654  loss: 2.9928 (2.9856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6494 (0.7064)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [130]  [ 600/1251]  eta: 0:02:05  lr: 0.002651  min_lr: 0.002651  loss: 3.9710 (3.0499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7257 (0.7399)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [130]  [ 800/1251]  eta: 0:01:26  lr: 0.002647  min_lr: 0.002647  loss: 2.4440 (3.0428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7192)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [130]  [1000/1251]  eta: 0:00:47  lr: 0.002644  min_lr: 0.002644  loss: 2.7057 (3.0439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7200 (0.7305)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [130]  [1200/1251]  eta: 0:00:09  lr: 0.002640  min_lr: 0.002640  loss: 3.6939 (3.0534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6861 (0.7256)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 3.2683 (3.0539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8060 (0.7283)  time: 0.1472  data: 0.0008  max mem: 12911
Epoch: [130] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 3.2683 (3.0426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8060 (0.7283)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7030 (0.7030)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.5243  data: 5.4326  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8216 (0.9338)  acc1: 82.8000 (80.1818)  acc5: 96.0000 (95.7091)  time: 0.7125  data: 0.6194  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1985 (1.1440)  acc1: 73.2000 (75.7905)  acc5: 91.2000 (92.8191)  time: 0.1911  data: 0.1047  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2828 (1.1573)  acc1: 72.0000 (75.3920)  acc5: 90.4000 (92.6880)  time: 0.2077  data: 0.1231  max mem: 12911
Test: Total time: 0:00:10 (0.4041 s / it)
* Acc@1 74.958 Acc@5 92.670 loss 1.162
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.17%
Epoch: [131]  [   0/1251]  eta: 1:08:16  lr: 0.002640  min_lr: 0.002640  loss: 2.3374 (2.3374)  weight_decay: 0.0500 (0.0500)  time: 3.2748  data: 2.0034  max mem: 12911
Epoch: [131]  [ 200/1251]  eta: 0:03:33  lr: 0.002636  min_lr: 0.002636  loss: 2.8942 (3.0017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.8296)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [131]  [ 400/1251]  eta: 0:02:46  lr: 0.002633  min_lr: 0.002633  loss: 2.6197 (3.0239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6989 (0.8887)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [131]  [ 600/1251]  eta: 0:02:05  lr: 0.002629  min_lr: 0.002629  loss: 2.5903 (3.0263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5902 (0.8344)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [131]  [ 800/1251]  eta: 0:01:26  lr: 0.002626  min_lr: 0.002626  loss: 2.4842 (3.0539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6362 (0.8084)  time: 0.1841  data: 0.0004  max mem: 12911
Epoch: [131]  [1000/1251]  eta: 0:00:47  lr: 0.002623  min_lr: 0.002623  loss: 3.0231 (3.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8339 (0.7985)  time: 0.1842  data: 0.0005  max mem: 12911
Epoch: [131]  [1200/1251]  eta: 0:00:09  lr: 0.002619  min_lr: 0.002619  loss: 2.3694 (3.0638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5626 (0.7803)  time: 0.1858  data: 0.0003  max mem: 12911
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 2.7616 (3.0617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.7826)  time: 0.1469  data: 0.0007  max mem: 12911
Epoch: [131] Total time: 0:03:58 (0.1902 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 2.7616 (3.0502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.7826)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6762 (0.6762)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.6808  data: 5.5891  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9062 (0.9101)  acc1: 81.2000 (79.6364)  acc5: 96.0000 (95.4909)  time: 0.7352  data: 0.6408  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1514 (1.1167)  acc1: 73.2000 (75.5048)  acc5: 92.4000 (92.8571)  time: 0.2015  data: 0.1130  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2978 (1.1273)  acc1: 72.0000 (75.1680)  acc5: 91.2000 (92.7360)  time: 0.1995  data: 0.1129  max mem: 12911
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 75.216 Acc@5 92.740 loss 1.125
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.22%
Epoch: [132]  [   0/1251]  eta: 1:08:06  lr: 0.002618  min_lr: 0.002618  loss: 3.9398 (3.9398)  weight_decay: 0.0500 (0.0500)  time: 3.2664  data: 3.0675  max mem: 12911
Epoch: [132]  [ 200/1251]  eta: 0:03:31  lr: 0.002615  min_lr: 0.002615  loss: 2.7575 (3.0820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6922 (0.6865)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [132]  [ 400/1251]  eta: 0:02:45  lr: 0.002612  min_lr: 0.002612  loss: 2.6330 (2.9865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5862 (0.6581)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [132]  [ 600/1251]  eta: 0:02:04  lr: 0.002608  min_lr: 0.002608  loss: 3.3792 (3.0138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5988 (0.6833)  time: 0.1866  data: 0.0006  max mem: 12911
Epoch: [132]  [ 800/1251]  eta: 0:01:26  lr: 0.002605  min_lr: 0.002605  loss: 2.4919 (3.0219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7067)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [132]  [1000/1251]  eta: 0:00:47  lr: 0.002601  min_lr: 0.002601  loss: 2.4005 (3.0203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6451 (0.7068)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [132]  [1200/1251]  eta: 0:00:09  lr: 0.002598  min_lr: 0.002598  loss: 2.6492 (3.0336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7719 (0.7262)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 2.5703 (3.0362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7539 (0.7253)  time: 0.1461  data: 0.0010  max mem: 12911
Epoch: [132] Total time: 0:03:57 (0.1901 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 2.5703 (3.0382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7539 (0.7253)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7781 (0.7781)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.4694  data: 5.3776  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8844 (0.9531)  acc1: 81.6000 (79.8545)  acc5: 96.8000 (95.7455)  time: 0.7401  data: 0.6462  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1973 (1.1631)  acc1: 74.0000 (75.6381)  acc5: 92.8000 (92.8952)  time: 0.2062  data: 0.1185  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2632 (1.1728)  acc1: 72.8000 (75.3600)  acc5: 90.4000 (92.6560)  time: 0.2043  data: 0.1184  max mem: 12911
Test: Total time: 0:00:10 (0.4012 s / it)
* Acc@1 75.124 Acc@5 92.706 loss 1.170
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.22%
Epoch: [133]  [   0/1251]  eta: 1:04:20  lr: 0.002597  min_lr: 0.002597  loss: 2.2986 (2.2986)  weight_decay: 0.0500 (0.0500)  time: 3.0863  data: 1.7421  max mem: 12911
Epoch: [133]  [ 200/1251]  eta: 0:03:31  lr: 0.002594  min_lr: 0.002594  loss: 2.3961 (3.0396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7261 (0.7827)  time: 0.1835  data: 0.0005  max mem: 12911
Epoch: [133]  [ 400/1251]  eta: 0:02:44  lr: 0.002590  min_lr: 0.002590  loss: 2.6192 (3.0372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6598 (0.7407)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [133]  [ 600/1251]  eta: 0:02:04  lr: 0.002587  min_lr: 0.002587  loss: 2.9414 (3.0844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7631 (0.8525)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [133]  [ 800/1251]  eta: 0:01:25  lr: 0.002583  min_lr: 0.002583  loss: 2.5287 (3.0540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6617 (0.8027)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [133]  [1000/1251]  eta: 0:00:47  lr: 0.002580  min_lr: 0.002580  loss: 2.7613 (3.0591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6409 (0.7758)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [133]  [1200/1251]  eta: 0:00:09  lr: 0.002576  min_lr: 0.002576  loss: 2.9666 (3.0559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7018 (0.7599)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 2.4566 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6137 (0.7559)  time: 0.1468  data: 0.0010  max mem: 12911
Epoch: [133] Total time: 0:03:56 (0.1891 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 2.4566 (3.0558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6137 (0.7559)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7696 (0.7696)  acc1: 82.4000 (82.4000)  acc5: 97.6000 (97.6000)  time: 5.3370  data: 5.1948  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9073 (0.9467)  acc1: 80.0000 (80.2545)  acc5: 96.0000 (95.5273)  time: 0.6754  data: 0.5809  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1973 (1.1338)  acc1: 74.0000 (75.5619)  acc5: 92.8000 (93.1429)  time: 0.1904  data: 0.1049  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1973 (1.1390)  acc1: 74.0000 (75.4560)  acc5: 91.6000 (93.0240)  time: 0.2081  data: 0.1247  max mem: 12911
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 75.186 Acc@5 92.934 loss 1.132
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.22%
Epoch: [134]  [   0/1251]  eta: 1:06:27  lr: 0.002576  min_lr: 0.002576  loss: 2.5366 (2.5366)  weight_decay: 0.0500 (0.0500)  time: 3.1875  data: 2.5038  max mem: 12911
Epoch: [134]  [ 200/1251]  eta: 0:03:37  lr: 0.002572  min_lr: 0.002572  loss: 2.8273 (2.9870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6730 (0.6870)  time: 0.1847  data: 0.0006  max mem: 12911
Epoch: [134]  [ 400/1251]  eta: 0:02:47  lr: 0.002569  min_lr: 0.002569  loss: 2.5947 (2.9989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7091 (0.6890)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [134]  [ 600/1251]  eta: 0:02:06  lr: 0.002565  min_lr: 0.002565  loss: 3.0315 (3.0446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.7211)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [134]  [ 800/1251]  eta: 0:01:26  lr: 0.002562  min_lr: 0.002562  loss: 2.5798 (3.0438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.7208)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [134]  [1000/1251]  eta: 0:00:48  lr: 0.002558  min_lr: 0.002558  loss: 3.0686 (3.0395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7224)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [134]  [1200/1251]  eta: 0:00:09  lr: 0.002555  min_lr: 0.002555  loss: 2.6471 (3.0249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7611 (0.7367)  time: 0.1919  data: 0.0005  max mem: 12911
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 3.6912 (3.0365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.7341)  time: 0.1491  data: 0.0007  max mem: 12911
Epoch: [134] Total time: 0:04:00 (0.1920 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 3.6912 (3.0394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.7341)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8014 (0.8014)  acc1: 81.6000 (81.6000)  acc5: 98.4000 (98.4000)  time: 5.6359  data: 5.5375  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 1.0216 (0.9891)  acc1: 79.2000 (78.9091)  acc5: 95.6000 (95.3818)  time: 0.7734  data: 0.6801  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2076 (1.1839)  acc1: 72.0000 (74.7619)  acc5: 91.6000 (92.7429)  time: 0.2293  data: 0.1426  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2519 (1.1888)  acc1: 72.0000 (74.5120)  acc5: 91.2000 (92.6720)  time: 0.2281  data: 0.1425  max mem: 12911
Test: Total time: 0:00:10 (0.4249 s / it)
* Acc@1 74.864 Acc@5 92.780 loss 1.184
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 75.22%
Epoch: [135]  [   0/1251]  eta: 1:03:46  lr: 0.002554  min_lr: 0.002554  loss: 3.5545 (3.5545)  weight_decay: 0.0500 (0.0500)  time: 3.0588  data: 2.0172  max mem: 12911
Epoch: [135]  [ 200/1251]  eta: 0:03:34  lr: 0.002551  min_lr: 0.002551  loss: 2.2329 (3.0343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7512)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [135]  [ 400/1251]  eta: 0:02:47  lr: 0.002547  min_lr: 0.002547  loss: 2.7836 (3.0464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.7426)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [135]  [ 600/1251]  eta: 0:02:06  lr: 0.002544  min_lr: 0.002544  loss: 2.8425 (3.0590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6113 (0.7470)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [135]  [ 800/1251]  eta: 0:01:26  lr: 0.002540  min_lr: 0.002540  loss: 2.3837 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5994 (0.7453)  time: 0.1898  data: 0.0004  max mem: 12911
Epoch: [135]  [1000/1251]  eta: 0:00:48  lr: 0.002537  min_lr: 0.002537  loss: 2.7312 (3.0511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7317 (0.7363)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [135]  [1200/1251]  eta: 0:00:09  lr: 0.002533  min_lr: 0.002533  loss: 2.7029 (3.0448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5918 (0.7447)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 2.8532 (3.0477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7126 (0.7472)  time: 0.1462  data: 0.0010  max mem: 12911
Epoch: [135] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 2.8532 (3.0305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7126 (0.7472)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6966 (0.6966)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.3765  data: 5.2762  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8922 (0.8990)  acc1: 80.4000 (80.0727)  acc5: 96.8000 (95.3818)  time: 0.7357  data: 0.6379  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1558 (1.1083)  acc1: 73.2000 (75.8286)  acc5: 90.8000 (92.6286)  time: 0.2105  data: 0.1211  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1917 (1.1132)  acc1: 73.2000 (75.6320)  acc5: 90.4000 (92.5280)  time: 0.2175  data: 0.1321  max mem: 12911
Test: Total time: 0:00:10 (0.4087 s / it)
* Acc@1 75.440 Acc@5 92.880 loss 1.105
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.44%
Epoch: [136]  [   0/1251]  eta: 1:02:28  lr: 0.002532  min_lr: 0.002532  loss: 2.5730 (2.5730)  weight_decay: 0.0500 (0.0500)  time: 2.9967  data: 2.7449  max mem: 12911
Epoch: [136]  [ 200/1251]  eta: 0:03:30  lr: 0.002529  min_lr: 0.002529  loss: 2.4647 (2.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7853)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [136]  [ 400/1251]  eta: 0:02:44  lr: 0.002526  min_lr: 0.002526  loss: 2.6616 (3.0186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6653 (0.7398)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [136]  [ 600/1251]  eta: 0:02:04  lr: 0.002522  min_lr: 0.002522  loss: 3.7399 (3.0230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9067 (0.7587)  time: 0.1910  data: 0.0005  max mem: 12911
Epoch: [136]  [ 800/1251]  eta: 0:01:25  lr: 0.002519  min_lr: 0.002519  loss: 3.0251 (3.0260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.7565)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [136]  [1000/1251]  eta: 0:00:47  lr: 0.002515  min_lr: 0.002515  loss: 2.4361 (3.0259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6419 (0.7406)  time: 0.1897  data: 0.0004  max mem: 12911
Epoch: [136]  [1200/1251]  eta: 0:00:09  lr: 0.002512  min_lr: 0.002512  loss: 2.3372 (3.0247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (0.7713)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 2.5940 (3.0255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6825 (0.7687)  time: 0.1458  data: 0.0009  max mem: 12911
Epoch: [136] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 2.5940 (3.0271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6825 (0.7687)
Test:  [ 0/25]  eta: 0:01:24  loss: 0.7416 (0.7416)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 3.3811  data: 3.2874  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.9023 (0.9051)  acc1: 80.4000 (80.0000)  acc5: 96.0000 (95.6727)  time: 0.6442  data: 0.5504  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1414 (1.1093)  acc1: 72.8000 (75.3714)  acc5: 90.8000 (92.8191)  time: 0.2775  data: 0.1907  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2606 (1.1220)  acc1: 70.0000 (74.9280)  acc5: 90.8000 (92.7840)  time: 0.2335  data: 0.1488  max mem: 12911
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 75.208 Acc@5 92.970 loss 1.119
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.44%
Epoch: [137]  [   0/1251]  eta: 1:07:23  lr: 0.002511  min_lr: 0.002511  loss: 2.1975 (2.1975)  weight_decay: 0.0500 (0.0500)  time: 3.2324  data: 2.1691  max mem: 12911
Epoch: [137]  [ 200/1251]  eta: 0:03:35  lr: 0.002507  min_lr: 0.002507  loss: 2.4462 (3.1018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (0.7741)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [137]  [ 400/1251]  eta: 0:02:46  lr: 0.002504  min_lr: 0.002504  loss: 3.6948 (3.0870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.7372)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [137]  [ 600/1251]  eta: 0:02:05  lr: 0.002500  min_lr: 0.002500  loss: 2.6464 (3.0855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6491 (0.7405)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [137]  [ 800/1251]  eta: 0:01:26  lr: 0.002497  min_lr: 0.002497  loss: 2.6132 (3.0658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7380 (0.7352)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [137]  [1000/1251]  eta: 0:00:47  lr: 0.002493  min_lr: 0.002493  loss: 2.4214 (3.0611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7155 (0.7398)  time: 0.1879  data: 0.0003  max mem: 12911
Epoch: [137]  [1200/1251]  eta: 0:00:09  lr: 0.002490  min_lr: 0.002490  loss: 2.4085 (3.0483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.7344)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 2.5997 (3.0465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.7330)  time: 0.1474  data: 0.0011  max mem: 12911
Epoch: [137] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 2.5997 (3.0218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.7330)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7905 (0.7905)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.5357  data: 5.4090  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8847 (0.9508)  acc1: 81.6000 (79.6727)  acc5: 96.8000 (96.1455)  time: 0.7573  data: 0.6583  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2048 (1.1675)  acc1: 71.6000 (75.0286)  acc5: 92.0000 (93.3905)  time: 0.2194  data: 0.1308  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3355 (1.1769)  acc1: 71.2000 (74.8480)  acc5: 91.6000 (93.3440)  time: 0.2154  data: 0.1307  max mem: 12911
Test: Total time: 0:00:10 (0.4131 s / it)
* Acc@1 75.336 Acc@5 92.970 loss 1.183
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.44%
Epoch: [138]  [   0/1251]  eta: 0:58:38  lr: 0.002489  min_lr: 0.002489  loss: 3.9630 (3.9630)  weight_decay: 0.0500 (0.0500)  time: 2.8123  data: 2.4059  max mem: 12911
Epoch: [138]  [ 200/1251]  eta: 0:03:35  lr: 0.002486  min_lr: 0.002486  loss: 2.4640 (2.9543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (0.8171)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [138]  [ 400/1251]  eta: 0:02:47  lr: 0.002482  min_lr: 0.002482  loss: 2.5999 (3.0201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5965 (0.7611)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [138]  [ 600/1251]  eta: 0:02:06  lr: 0.002479  min_lr: 0.002479  loss: 2.5000 (3.0157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6706 (0.7548)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [138]  [ 800/1251]  eta: 0:01:27  lr: 0.002475  min_lr: 0.002475  loss: 2.4780 (3.0064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6745 (0.7512)  time: 0.1909  data: 0.0005  max mem: 12911
Epoch: [138]  [1000/1251]  eta: 0:00:48  lr: 0.002472  min_lr: 0.002472  loss: 3.1914 (3.0194)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1896  data: 0.0007  max mem: 12911
Epoch: [138]  [1200/1251]  eta: 0:00:09  lr: 0.002468  min_lr: 0.002468  loss: 2.8431 (3.0163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (nan)  time: 0.1906  data: 0.0006  max mem: 12911
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 2.8006 (3.0188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (nan)  time: 0.1474  data: 0.0012  max mem: 12911
Epoch: [138] Total time: 0:04:00 (0.1923 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 2.8006 (3.0162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7400 (0.7400)  acc1: 83.2000 (83.2000)  acc5: 98.4000 (98.4000)  time: 5.8586  data: 5.7670  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8712 (0.8902)  acc1: 81.2000 (80.0000)  acc5: 96.0000 (95.7091)  time: 0.7588  data: 0.6753  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1550 (1.1034)  acc1: 71.6000 (75.1048)  acc5: 91.6000 (92.8762)  time: 0.2145  data: 0.1333  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2087 (1.1131)  acc1: 70.4000 (74.7840)  acc5: 90.8000 (92.8320)  time: 0.2129  data: 0.1332  max mem: 12911
Test: Total time: 0:00:10 (0.4218 s / it)
* Acc@1 75.108 Acc@5 92.892 loss 1.103
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.44%
Epoch: [139]  [   0/1251]  eta: 1:08:31  lr: 0.002467  min_lr: 0.002467  loss: 2.3970 (2.3970)  weight_decay: 0.0500 (0.0500)  time: 3.2863  data: 1.6167  max mem: 12911
Epoch: [139]  [ 200/1251]  eta: 0:03:34  lr: 0.002464  min_lr: 0.002464  loss: 2.5670 (3.0686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7387 (0.7248)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [139]  [ 400/1251]  eta: 0:02:46  lr: 0.002460  min_lr: 0.002460  loss: 2.5129 (2.9991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.7361)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [139]  [ 600/1251]  eta: 0:02:06  lr: 0.002457  min_lr: 0.002457  loss: 2.5206 (2.9788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8020 (0.7551)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [139]  [ 800/1251]  eta: 0:01:26  lr: 0.002453  min_lr: 0.002453  loss: 2.4958 (3.0006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.7633)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [139]  [1000/1251]  eta: 0:00:48  lr: 0.002450  min_lr: 0.002450  loss: 2.7455 (3.0028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9238 (0.7793)  time: 0.1924  data: 0.0005  max mem: 12911
Epoch: [139]  [1200/1251]  eta: 0:00:09  lr: 0.002446  min_lr: 0.002446  loss: 2.5163 (3.0176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (0.7780)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 2.3827 (3.0141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.7768)  time: 0.1479  data: 0.0008  max mem: 12911
Epoch: [139] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 2.3827 (3.0287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.7768)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7126 (0.7126)  acc1: 82.8000 (82.8000)  acc5: 97.2000 (97.2000)  time: 5.4756  data: 5.3838  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9032 (0.8837)  acc1: 80.8000 (80.1091)  acc5: 95.6000 (95.6727)  time: 0.7389  data: 0.6441  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1161 (1.0903)  acc1: 75.2000 (75.4476)  acc5: 92.0000 (93.0476)  time: 0.1999  data: 0.1126  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1876 (1.0976)  acc1: 72.8000 (75.0880)  acc5: 92.0000 (93.0560)  time: 0.1971  data: 0.1126  max mem: 12911
Test: Total time: 0:00:09 (0.3952 s / it)
* Acc@1 75.404 Acc@5 93.012 loss 1.094
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.44%
Epoch: [140]  [   0/1251]  eta: 1:06:35  lr: 0.002445  min_lr: 0.002445  loss: 3.9917 (3.9917)  weight_decay: 0.0500 (0.0500)  time: 3.1936  data: 2.4070  max mem: 12911
Epoch: [140]  [ 200/1251]  eta: 0:03:38  lr: 0.002442  min_lr: 0.002442  loss: 2.4371 (3.0089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7230 (0.6903)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [140]  [ 400/1251]  eta: 0:02:49  lr: 0.002438  min_lr: 0.002438  loss: 2.7079 (2.9951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8153 (0.7500)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [140]  [ 600/1251]  eta: 0:02:07  lr: 0.002435  min_lr: 0.002435  loss: 2.4180 (3.0065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7980 (0.7521)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [140]  [ 800/1251]  eta: 0:01:27  lr: 0.002431  min_lr: 0.002431  loss: 2.5037 (3.0108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.7479)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [140]  [1000/1251]  eta: 0:00:48  lr: 0.002428  min_lr: 0.002428  loss: 2.4779 (3.0079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.7559)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [140]  [1200/1251]  eta: 0:00:09  lr: 0.002424  min_lr: 0.002424  loss: 2.3935 (2.9964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.7499)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 2.4504 (2.9938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6449 (0.7466)  time: 0.1477  data: 0.0009  max mem: 12911
Epoch: [140] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 2.4504 (3.0082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6449 (0.7466)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.7071 (0.7071)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.1921  data: 5.0979  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8998 (0.8902)  acc1: 81.2000 (80.6182)  acc5: 96.0000 (95.8909)  time: 0.7276  data: 0.6425  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1142 (1.0858)  acc1: 73.6000 (76.0381)  acc5: 93.2000 (93.3714)  time: 0.2264  data: 0.1439  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2429 (1.0930)  acc1: 72.4000 (75.4720)  acc5: 92.4000 (93.3280)  time: 0.2261  data: 0.1447  max mem: 12911
Test: Total time: 0:00:10 (0.4084 s / it)
* Acc@1 75.292 Acc@5 93.062 loss 1.099
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.44%
Epoch: [141]  [   0/1251]  eta: 1:05:00  lr: 0.002424  min_lr: 0.002424  loss: 3.4226 (3.4226)  weight_decay: 0.0500 (0.0500)  time: 3.1181  data: 1.7056  max mem: 12911
Epoch: [141]  [ 200/1251]  eta: 0:03:33  lr: 0.002420  min_lr: 0.002420  loss: 2.5205 (2.9179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7028 (0.7340)  time: 0.1907  data: 0.0004  max mem: 12911
Epoch: [141]  [ 400/1251]  eta: 0:02:45  lr: 0.002417  min_lr: 0.002417  loss: 3.1234 (2.9830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7777 (0.7782)  time: 0.1844  data: 0.0003  max mem: 12911
Epoch: [141]  [ 600/1251]  eta: 0:02:05  lr: 0.002413  min_lr: 0.002413  loss: 2.3111 (2.9581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8042 (0.7893)  time: 0.1918  data: 0.0004  max mem: 12911
Epoch: [141]  [ 800/1251]  eta: 0:01:26  lr: 0.002409  min_lr: 0.002409  loss: 2.2408 (2.9670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.7838)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [141]  [1000/1251]  eta: 0:00:48  lr: 0.002406  min_lr: 0.002406  loss: 2.7261 (2.9757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7269 (nan)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [141]  [1200/1251]  eta: 0:00:09  lr: 0.002402  min_lr: 0.002402  loss: 3.4911 (2.9844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7486 (nan)  time: 0.1920  data: 0.0004  max mem: 12911
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 2.3315 (2.9891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6367 (nan)  time: 0.1471  data: 0.0008  max mem: 12911
Epoch: [141] Total time: 0:04:00 (0.1924 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 2.3315 (3.0050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6367 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7427 (0.7427)  acc1: 83.6000 (83.6000)  acc5: 98.8000 (98.8000)  time: 5.5465  data: 5.4514  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8659 (0.9327)  acc1: 82.0000 (80.0000)  acc5: 96.8000 (96.0727)  time: 0.6968  data: 0.6144  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1494 (1.1444)  acc1: 72.0000 (75.5619)  acc5: 92.8000 (93.0095)  time: 0.1977  data: 0.1173  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3136 (1.1518)  acc1: 70.8000 (75.3120)  acc5: 91.2000 (93.0400)  time: 0.2053  data: 0.1242  max mem: 12911
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 75.024 Acc@5 93.004 loss 1.151
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.44%
Epoch: [142]  [   0/1251]  eta: 1:08:16  lr: 0.002402  min_lr: 0.002402  loss: 4.0024 (4.0024)  weight_decay: 0.0500 (0.0500)  time: 3.2749  data: 2.6203  max mem: 12911
Epoch: [142]  [ 200/1251]  eta: 0:03:34  lr: 0.002398  min_lr: 0.002398  loss: 2.4431 (2.9998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.8325)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [142]  [ 400/1251]  eta: 0:02:46  lr: 0.002395  min_lr: 0.002395  loss: 2.7007 (3.0260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6280 (0.7457)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [142]  [ 600/1251]  eta: 0:02:05  lr: 0.002391  min_lr: 0.002391  loss: 2.4378 (2.9970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (0.7573)  time: 0.1844  data: 0.0004  max mem: 12911
Epoch: [142]  [ 800/1251]  eta: 0:01:26  lr: 0.002387  min_lr: 0.002387  loss: 2.5940 (3.0257)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [142]  [1000/1251]  eta: 0:00:47  lr: 0.002384  min_lr: 0.002384  loss: 2.9503 (3.0208)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1916  data: 0.0004  max mem: 12911
Epoch: [142]  [1200/1251]  eta: 0:00:09  lr: 0.002380  min_lr: 0.002380  loss: 2.4591 (3.0228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (nan)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 2.4217 (3.0182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7496 (nan)  time: 0.1472  data: 0.0008  max mem: 12911
Epoch: [142] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 2.4217 (3.0124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7496 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6402 (0.6402)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.5532  data: 5.4516  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8634 (0.8616)  acc1: 80.0000 (80.6545)  acc5: 96.0000 (95.7818)  time: 0.7429  data: 0.6501  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0711 (1.0655)  acc1: 74.8000 (76.3238)  acc5: 93.6000 (93.3143)  time: 0.2029  data: 0.1168  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2192 (1.0771)  acc1: 71.6000 (75.8720)  acc5: 91.6000 (93.2320)  time: 0.2021  data: 0.1167  max mem: 12911
Test: Total time: 0:00:10 (0.4017 s / it)
* Acc@1 75.352 Acc@5 93.012 loss 1.082
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.44%
Epoch: [143]  [   0/1251]  eta: 1:06:20  lr: 0.002380  min_lr: 0.002380  loss: 4.0487 (4.0487)  weight_decay: 0.0500 (0.0500)  time: 3.1815  data: 1.6250  max mem: 12911
Epoch: [143]  [ 200/1251]  eta: 0:03:35  lr: 0.002376  min_lr: 0.002376  loss: 2.4919 (3.1152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.7441)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [143]  [ 400/1251]  eta: 0:02:46  lr: 0.002373  min_lr: 0.002373  loss: 2.6345 (3.0630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5975 (0.7397)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [143]  [ 600/1251]  eta: 0:02:06  lr: 0.002369  min_lr: 0.002369  loss: 2.4014 (3.0369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.7292)  time: 0.1909  data: 0.0005  max mem: 12911
Epoch: [143]  [ 800/1251]  eta: 0:01:26  lr: 0.002365  min_lr: 0.002365  loss: 2.7446 (3.0276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.7432)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [143]  [1000/1251]  eta: 0:00:48  lr: 0.002362  min_lr: 0.002362  loss: 2.4544 (3.0423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7438 (0.7480)  time: 0.1907  data: 0.0005  max mem: 12911
Epoch: [143]  [1200/1251]  eta: 0:00:09  lr: 0.002358  min_lr: 0.002358  loss: 3.5793 (3.0389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7263 (0.7458)  time: 0.1935  data: 0.0005  max mem: 12911
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 2.4977 (3.0368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7109 (0.7468)  time: 0.1462  data: 0.0006  max mem: 12911
Epoch: [143] Total time: 0:03:59 (0.1918 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 2.4977 (3.0066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7109 (0.7468)
Test:  [ 0/25]  eta: 0:01:48  loss: 0.6893 (0.6893)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 4.3279  data: 4.2264  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8977 (0.8941)  acc1: 81.6000 (80.4364)  acc5: 96.8000 (96.2182)  time: 0.6252  data: 0.5414  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1023 (1.0811)  acc1: 73.6000 (75.6952)  acc5: 93.2000 (93.4857)  time: 0.2138  data: 0.1327  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2338 (1.0899)  acc1: 72.4000 (75.4560)  acc5: 91.2000 (93.3440)  time: 0.1843  data: 0.1043  max mem: 12911
Test: Total time: 0:00:09 (0.3783 s / it)
* Acc@1 75.544 Acc@5 93.086 loss 1.090
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.54%
Epoch: [144]  [   0/1251]  eta: 1:01:55  lr: 0.002358  min_lr: 0.002358  loss: 2.9226 (2.9226)  weight_decay: 0.0500 (0.0500)  time: 2.9702  data: 2.7152  max mem: 12911
Epoch: [144]  [ 200/1251]  eta: 0:03:31  lr: 0.002354  min_lr: 0.002354  loss: 2.4100 (2.9884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.6937)  time: 0.1842  data: 0.0005  max mem: 12911
Epoch: [144]  [ 400/1251]  eta: 0:02:44  lr: 0.002350  min_lr: 0.002350  loss: 2.4391 (2.9854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (0.7871)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [144]  [ 600/1251]  eta: 0:02:04  lr: 0.002347  min_lr: 0.002347  loss: 2.4543 (3.0019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6614 (0.7905)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [144]  [ 800/1251]  eta: 0:01:25  lr: 0.002343  min_lr: 0.002343  loss: 2.4123 (2.9936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7165 (0.7697)  time: 0.1879  data: 0.0006  max mem: 12911
Epoch: [144]  [1000/1251]  eta: 0:00:47  lr: 0.002340  min_lr: 0.002340  loss: 3.4868 (2.9991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8031 (0.7685)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [144]  [1200/1251]  eta: 0:00:09  lr: 0.002336  min_lr: 0.002336  loss: 2.8231 (2.9941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6305 (0.7720)  time: 0.1916  data: 0.0005  max mem: 12911
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.3179 (2.9966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6259 (0.7671)  time: 0.1470  data: 0.0012  max mem: 12911
Epoch: [144] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.3179 (3.0069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6259 (0.7671)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7485 (0.7485)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.4876  data: 5.3806  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9943 (0.9568)  acc1: 81.2000 (80.1455)  acc5: 96.8000 (96.0364)  time: 0.7374  data: 0.6441  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1893 (1.1565)  acc1: 73.6000 (75.5619)  acc5: 92.0000 (93.3905)  time: 0.2084  data: 0.1212  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2779 (1.1648)  acc1: 70.8000 (75.2160)  acc5: 91.6000 (93.3920)  time: 0.2108  data: 0.1246  max mem: 12911
Test: Total time: 0:00:10 (0.4076 s / it)
* Acc@1 75.406 Acc@5 93.134 loss 1.162
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.54%
Epoch: [145]  [   0/1251]  eta: 1:02:05  lr: 0.002335  min_lr: 0.002335  loss: 2.2320 (2.2320)  weight_decay: 0.0500 (0.0500)  time: 2.9780  data: 2.6333  max mem: 12911
Epoch: [145]  [ 200/1251]  eta: 0:03:40  lr: 0.002332  min_lr: 0.002332  loss: 2.6901 (2.9330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6977 (0.7501)  time: 0.1953  data: 0.0005  max mem: 12911
Epoch: [145]  [ 400/1251]  eta: 0:02:49  lr: 0.002328  min_lr: 0.002328  loss: 2.8568 (2.9166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9105 (0.7837)  time: 0.1905  data: 0.0005  max mem: 12911
Epoch: [145]  [ 600/1251]  eta: 0:02:07  lr: 0.002325  min_lr: 0.002325  loss: 2.5692 (2.9560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7484 (0.7659)  time: 0.1878  data: 0.0006  max mem: 12911
Epoch: [145]  [ 800/1251]  eta: 0:01:27  lr: 0.002321  min_lr: 0.002321  loss: 2.3638 (2.9513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8735 (0.7727)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [145]  [1000/1251]  eta: 0:00:48  lr: 0.002318  min_lr: 0.002318  loss: 2.5994 (2.9773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7905 (0.7849)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [145]  [1200/1251]  eta: 0:00:09  lr: 0.002314  min_lr: 0.002314  loss: 2.8887 (3.0023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.7913)  time: 0.1919  data: 0.0008  max mem: 12911
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 2.4202 (3.0008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.7903)  time: 0.1469  data: 0.0006  max mem: 12911
Epoch: [145] Total time: 0:04:00 (0.1922 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 2.4202 (3.0113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.7903)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6805 (0.6805)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 5.5607  data: 5.4669  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8528 (0.8795)  acc1: 81.2000 (80.3273)  acc5: 95.6000 (95.7455)  time: 0.7645  data: 0.6665  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0870 (1.0837)  acc1: 73.6000 (75.5429)  acc5: 92.4000 (92.9143)  time: 0.2121  data: 0.1201  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1927 (1.0916)  acc1: 73.6000 (75.2320)  acc5: 92.0000 (92.9280)  time: 0.2124  data: 0.1242  max mem: 12911
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 75.718 Acc@5 93.098 loss 1.078
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.72%
Epoch: [146]  [   0/1251]  eta: 1:00:57  lr: 0.002313  min_lr: 0.002313  loss: 4.0899 (4.0899)  weight_decay: 0.0500 (0.0500)  time: 2.9240  data: 2.6804  max mem: 12911
Epoch: [146]  [ 200/1251]  eta: 0:03:33  lr: 0.002310  min_lr: 0.002310  loss: 3.4763 (3.0401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7080 (0.7219)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [146]  [ 400/1251]  eta: 0:02:46  lr: 0.002306  min_lr: 0.002306  loss: 3.0320 (3.0396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7906 (0.7354)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [146]  [ 600/1251]  eta: 0:02:05  lr: 0.002303  min_lr: 0.002303  loss: 2.9369 (3.0358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7024 (0.7384)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [146]  [ 800/1251]  eta: 0:01:26  lr: 0.002299  min_lr: 0.002299  loss: 2.3777 (3.0294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5765 (0.7311)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [146]  [1000/1251]  eta: 0:00:48  lr: 0.002296  min_lr: 0.002296  loss: 2.6369 (3.0197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.7346)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [146]  [1200/1251]  eta: 0:00:09  lr: 0.002292  min_lr: 0.002292  loss: 2.4385 (2.9994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.7442)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 3.7900 (3.0071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.7432)  time: 0.1484  data: 0.0013  max mem: 12911
Epoch: [146] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 3.7900 (2.9938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.7432)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8308 (0.8308)  acc1: 84.0000 (84.0000)  acc5: 98.4000 (98.4000)  time: 5.7542  data: 5.6627  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9422 (0.9918)  acc1: 81.6000 (80.3636)  acc5: 96.4000 (96.0000)  time: 0.7048  data: 0.6109  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2244 (1.1872)  acc1: 72.8000 (75.6000)  acc5: 92.8000 (93.2952)  time: 0.1832  data: 0.0949  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3418 (1.2004)  acc1: 72.8000 (75.1360)  acc5: 91.6000 (93.2000)  time: 0.2005  data: 0.1139  max mem: 12911
Test: Total time: 0:00:10 (0.4082 s / it)
* Acc@1 75.314 Acc@5 92.948 loss 1.199
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.72%
Epoch: [147]  [   0/1251]  eta: 0:58:28  lr: 0.002291  min_lr: 0.002291  loss: 2.7571 (2.7571)  weight_decay: 0.0500 (0.0500)  time: 2.8043  data: 1.6791  max mem: 12911
Epoch: [147]  [ 200/1251]  eta: 0:03:35  lr: 0.002288  min_lr: 0.002288  loss: 3.0580 (3.0154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7231 (0.8844)  time: 0.1855  data: 0.0005  max mem: 12911
Epoch: [147]  [ 400/1251]  eta: 0:02:46  lr: 0.002284  min_lr: 0.002284  loss: 2.4177 (3.0149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.8088)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [147]  [ 600/1251]  eta: 0:02:05  lr: 0.002280  min_lr: 0.002280  loss: 3.0708 (3.0221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (0.7815)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [147]  [ 800/1251]  eta: 0:01:26  lr: 0.002277  min_lr: 0.002277  loss: 2.4596 (3.0119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7174 (0.7647)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [147]  [1000/1251]  eta: 0:00:47  lr: 0.002273  min_lr: 0.002273  loss: 2.5617 (3.0181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.7702)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [147]  [1200/1251]  eta: 0:00:09  lr: 0.002270  min_lr: 0.002270  loss: 2.5576 (3.0138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.7569)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 2.5903 (3.0154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6228 (0.7531)  time: 0.1475  data: 0.0006  max mem: 12911
Epoch: [147] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 2.5903 (2.9991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6228 (0.7531)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7687 (0.7687)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.7774  data: 5.6814  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8906 (0.9341)  acc1: 82.8000 (79.5273)  acc5: 95.6000 (95.8546)  time: 0.7402  data: 0.6429  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1592 (1.1170)  acc1: 73.2000 (75.7333)  acc5: 92.4000 (93.4476)  time: 0.1921  data: 0.1036  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2478 (1.1325)  acc1: 73.2000 (75.4080)  acc5: 91.6000 (93.3440)  time: 0.1932  data: 0.1086  max mem: 12911
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 75.286 Acc@5 93.082 loss 1.132
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.72%
Epoch: [148]  [   0/1251]  eta: 1:02:24  lr: 0.002269  min_lr: 0.002269  loss: 2.3874 (2.3874)  weight_decay: 0.0500 (0.0500)  time: 2.9933  data: 1.9109  max mem: 12911
Epoch: [148]  [ 200/1251]  eta: 0:03:34  lr: 0.002265  min_lr: 0.002265  loss: 2.6228 (3.0648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8267 (0.9069)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [148]  [ 400/1251]  eta: 0:02:45  lr: 0.002262  min_lr: 0.002262  loss: 2.7735 (3.0490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6381 (0.8076)  time: 0.1845  data: 0.0004  max mem: 12911
Epoch: [148]  [ 600/1251]  eta: 0:02:04  lr: 0.002258  min_lr: 0.002258  loss: 2.3893 (2.9932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6701 (0.7808)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [148]  [ 800/1251]  eta: 0:01:26  lr: 0.002255  min_lr: 0.002255  loss: 2.3935 (2.9669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7322 (0.7820)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [148]  [1000/1251]  eta: 0:00:47  lr: 0.002251  min_lr: 0.002251  loss: 2.9650 (2.9614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.7567)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [148]  [1200/1251]  eta: 0:00:09  lr: 0.002248  min_lr: 0.002248  loss: 2.4702 (2.9641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.7600)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 2.3392 (2.9627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.7563)  time: 0.1459  data: 0.0009  max mem: 12911
Epoch: [148] Total time: 0:03:56 (0.1892 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 2.3392 (2.9848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.7563)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.7007 (0.7007)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.2568  data: 5.1568  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8031 (0.8693)  acc1: 82.8000 (81.0182)  acc5: 96.4000 (95.6000)  time: 0.6815  data: 0.5836  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1280 (1.0599)  acc1: 73.2000 (76.3048)  acc5: 92.0000 (93.1619)  time: 0.2042  data: 0.1155  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2354 (1.0710)  acc1: 72.8000 (75.9520)  acc5: 91.6000 (93.1200)  time: 0.2177  data: 0.1329  max mem: 12911
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 75.864 Acc@5 93.242 loss 1.068
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.86%
Epoch: [149]  [   0/1251]  eta: 0:59:20  lr: 0.002247  min_lr: 0.002247  loss: 3.7751 (3.7751)  weight_decay: 0.0500 (0.0500)  time: 2.8461  data: 1.9274  max mem: 12911
Epoch: [149]  [ 200/1251]  eta: 0:03:31  lr: 0.002243  min_lr: 0.002243  loss: 2.3311 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7692 (0.8414)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [149]  [ 400/1251]  eta: 0:02:45  lr: 0.002240  min_lr: 0.002240  loss: 3.0973 (2.9231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8715 (0.8259)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [149]  [ 600/1251]  eta: 0:02:05  lr: 0.002236  min_lr: 0.002236  loss: 2.3632 (2.9132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7124 (0.8159)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [149]  [ 800/1251]  eta: 0:01:26  lr: 0.002232  min_lr: 0.002232  loss: 2.4972 (2.9258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6758 (0.8068)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [149]  [1000/1251]  eta: 0:00:47  lr: 0.002229  min_lr: 0.002229  loss: 2.5957 (2.9361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (0.8299)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [149]  [1200/1251]  eta: 0:00:09  lr: 0.002225  min_lr: 0.002225  loss: 2.3092 (2.9320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (0.8255)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 2.8620 (2.9343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6434 (0.8203)  time: 0.1466  data: 0.0012  max mem: 12911
Epoch: [149] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 2.8620 (2.9885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6434 (0.8203)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7898 (0.7898)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.6426  data: 5.5413  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9932 (0.9698)  acc1: 80.8000 (80.3636)  acc5: 96.4000 (96.1091)  time: 0.7213  data: 0.6233  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1848 (1.1727)  acc1: 73.2000 (75.6571)  acc5: 92.0000 (93.1048)  time: 0.1945  data: 0.1054  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3142 (1.1834)  acc1: 72.0000 (75.5840)  acc5: 91.6000 (93.0400)  time: 0.1908  data: 0.1053  max mem: 12911
Test: Total time: 0:00:09 (0.3998 s / it)
* Acc@1 75.568 Acc@5 93.060 loss 1.174
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.86%
Epoch: [150]  [   0/1251]  eta: 1:09:46  lr: 0.002224  min_lr: 0.002224  loss: 4.0594 (4.0594)  weight_decay: 0.0500 (0.0500)  time: 3.3466  data: 1.4465  max mem: 12911
Epoch: [150]  [ 200/1251]  eta: 0:03:35  lr: 0.002221  min_lr: 0.002221  loss: 3.3735 (2.8702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6561 (0.7038)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [150]  [ 400/1251]  eta: 0:02:46  lr: 0.002217  min_lr: 0.002217  loss: 3.7003 (2.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6810 (0.7389)  time: 0.1846  data: 0.0004  max mem: 12911
Epoch: [150]  [ 600/1251]  eta: 0:02:05  lr: 0.002214  min_lr: 0.002214  loss: 2.4292 (2.9558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6384 (0.7297)  time: 0.1947  data: 0.0005  max mem: 12911
Epoch: [150]  [ 800/1251]  eta: 0:01:26  lr: 0.002210  min_lr: 0.002210  loss: 2.7503 (2.9600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6042 (0.7343)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [150]  [1000/1251]  eta: 0:00:48  lr: 0.002207  min_lr: 0.002207  loss: 2.6188 (2.9612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7519 (0.7601)  time: 0.1895  data: 0.0006  max mem: 12911
Epoch: [150]  [1200/1251]  eta: 0:00:09  lr: 0.002203  min_lr: 0.002203  loss: 3.4281 (2.9818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7764 (0.7630)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 2.8857 (2.9860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.7650)  time: 0.1477  data: 0.0009  max mem: 12911
Epoch: [150] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 2.8857 (2.9818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.7650)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7265 (0.7265)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.4893  data: 5.3977  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8532 (0.9241)  acc1: 81.6000 (80.2545)  acc5: 96.4000 (95.8909)  time: 0.7486  data: 0.6546  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1700 (1.1301)  acc1: 74.4000 (75.9619)  acc5: 92.4000 (93.2000)  time: 0.2066  data: 0.1190  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2746 (1.1371)  acc1: 72.4000 (75.5680)  acc5: 90.8000 (93.1040)  time: 0.2042  data: 0.1190  max mem: 12911
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 75.568 Acc@5 93.036 loss 1.140
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.86%
Epoch: [151]  [   0/1251]  eta: 0:59:13  lr: 0.002202  min_lr: 0.002202  loss: 2.5925 (2.5925)  weight_decay: 0.0500 (0.0500)  time: 2.8405  data: 1.5072  max mem: 12911
Epoch: [151]  [ 200/1251]  eta: 0:03:35  lr: 0.002198  min_lr: 0.002198  loss: 2.6029 (3.0210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (0.7832)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [151]  [ 400/1251]  eta: 0:02:46  lr: 0.002195  min_lr: 0.002195  loss: 2.6326 (3.0300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (0.7807)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [151]  [ 600/1251]  eta: 0:02:05  lr: 0.002191  min_lr: 0.002191  loss: 3.2018 (3.0351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7841 (0.8164)  time: 0.1855  data: 0.0003  max mem: 12911
Epoch: [151]  [ 800/1251]  eta: 0:01:26  lr: 0.002188  min_lr: 0.002188  loss: 2.4528 (3.0327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8443 (0.8078)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [151]  [1000/1251]  eta: 0:00:47  lr: 0.002184  min_lr: 0.002184  loss: 2.4812 (2.9964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.8114)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [151]  [1200/1251]  eta: 0:00:09  lr: 0.002181  min_lr: 0.002181  loss: 2.9660 (2.9892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6774 (0.7984)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 3.0878 (2.9917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6755 (0.7945)  time: 0.1481  data: 0.0008  max mem: 12911
Epoch: [151] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 3.0878 (2.9892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6755 (0.7945)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7866 (0.7866)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.2961  data: 5.2039  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.9041 (0.9500)  acc1: 80.0000 (80.1818)  acc5: 96.4000 (95.7818)  time: 0.6450  data: 0.5484  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1871 (1.1489)  acc1: 74.4000 (75.8857)  acc5: 92.0000 (93.3143)  time: 0.1874  data: 0.0977  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2662 (1.1590)  acc1: 72.8000 (75.5200)  acc5: 92.0000 (93.2320)  time: 0.2083  data: 0.1242  max mem: 12911
Test: Total time: 0:00:10 (0.4052 s / it)
* Acc@1 75.562 Acc@5 93.016 loss 1.162
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.86%
Epoch: [152]  [   0/1251]  eta: 1:05:55  lr: 0.002180  min_lr: 0.002180  loss: 3.4082 (3.4082)  weight_decay: 0.0500 (0.0500)  time: 3.1620  data: 1.5877  max mem: 12911
Epoch: [152]  [ 200/1251]  eta: 0:03:34  lr: 0.002176  min_lr: 0.002176  loss: 2.5489 (2.9834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.7126)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [152]  [ 400/1251]  eta: 0:02:46  lr: 0.002173  min_lr: 0.002173  loss: 3.1043 (2.9829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7614 (0.7348)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [152]  [ 600/1251]  eta: 0:02:05  lr: 0.002169  min_lr: 0.002169  loss: 2.4172 (2.9511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.7338)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [152]  [ 800/1251]  eta: 0:01:26  lr: 0.002165  min_lr: 0.002165  loss: 2.4079 (2.9513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7237 (0.7355)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [152]  [1000/1251]  eta: 0:00:47  lr: 0.002162  min_lr: 0.002162  loss: 2.4373 (2.9581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9499 (0.7536)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [152]  [1200/1251]  eta: 0:00:09  lr: 0.002158  min_lr: 0.002158  loss: 2.8646 (2.9616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.7640)  time: 0.1916  data: 0.0004  max mem: 12911
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 3.2309 (2.9696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (0.7645)  time: 0.1464  data: 0.0006  max mem: 12911
Epoch: [152] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 3.2309 (2.9812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (0.7645)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7555 (0.7555)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.5462  data: 5.4545  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8293 (0.8966)  acc1: 82.4000 (81.1273)  acc5: 96.4000 (96.1091)  time: 0.7599  data: 0.6626  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1226 (1.0999)  acc1: 72.8000 (75.9048)  acc5: 93.6000 (93.3714)  time: 0.2080  data: 0.1193  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2158 (1.1096)  acc1: 71.6000 (75.5840)  acc5: 91.6000 (93.3120)  time: 0.2043  data: 0.1192  max mem: 12911
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 75.682 Acc@5 93.228 loss 1.108
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.86%
Epoch: [153]  [   0/1251]  eta: 1:05:18  lr: 0.002157  min_lr: 0.002157  loss: 3.8703 (3.8703)  weight_decay: 0.0500 (0.0500)  time: 3.1325  data: 2.8843  max mem: 12911
Epoch: [153]  [ 200/1251]  eta: 0:03:35  lr: 0.002154  min_lr: 0.002154  loss: 2.5137 (2.9878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8220 (0.8266)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [153]  [ 400/1251]  eta: 0:02:46  lr: 0.002150  min_lr: 0.002150  loss: 3.0217 (2.9643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.7704)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [153]  [ 600/1251]  eta: 0:02:05  lr: 0.002147  min_lr: 0.002147  loss: 2.5203 (2.9673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7909)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [153]  [ 800/1251]  eta: 0:01:26  lr: 0.002143  min_lr: 0.002143  loss: 2.3207 (2.9757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7919)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [153]  [1000/1251]  eta: 0:00:47  lr: 0.002139  min_lr: 0.002139  loss: 2.5341 (2.9828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.7830)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [153]  [1200/1251]  eta: 0:00:09  lr: 0.002136  min_lr: 0.002136  loss: 3.1040 (2.9914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7234 (0.7814)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 2.7348 (2.9896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8024 (0.7833)  time: 0.1456  data: 0.0009  max mem: 12911
Epoch: [153] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 2.7348 (2.9802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8024 (0.7833)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7399 (0.7399)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.8653  data: 5.7738  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8590 (0.8799)  acc1: 82.4000 (81.4545)  acc5: 96.4000 (96.0364)  time: 0.7595  data: 0.6676  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0937 (1.0818)  acc1: 74.0000 (76.4571)  acc5: 93.2000 (93.6571)  time: 0.1984  data: 0.1121  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2449 (1.0939)  acc1: 72.8000 (76.1760)  acc5: 92.0000 (93.5360)  time: 0.1977  data: 0.1120  max mem: 12911
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 76.022 Acc@5 93.270 loss 1.085
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.02%
Epoch: [154]  [   0/1251]  eta: 0:59:39  lr: 0.002135  min_lr: 0.002135  loss: 3.8707 (3.8707)  weight_decay: 0.0500 (0.0500)  time: 2.8610  data: 2.5934  max mem: 12911
Epoch: [154]  [ 200/1251]  eta: 0:03:31  lr: 0.002131  min_lr: 0.002131  loss: 2.5641 (2.9151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8166 (0.7646)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [154]  [ 400/1251]  eta: 0:02:45  lr: 0.002128  min_lr: 0.002128  loss: 2.3298 (2.9316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7205 (0.7532)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [154]  [ 600/1251]  eta: 0:02:05  lr: 0.002124  min_lr: 0.002124  loss: 3.5180 (2.9611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7690 (0.7737)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [154]  [ 800/1251]  eta: 0:01:26  lr: 0.002121  min_lr: 0.002121  loss: 2.7995 (2.9518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7432 (0.7728)  time: 0.1921  data: 0.0004  max mem: 12911
Epoch: [154]  [1000/1251]  eta: 0:00:48  lr: 0.002117  min_lr: 0.002117  loss: 2.8605 (2.9688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9510 (0.7904)  time: 0.1922  data: 0.0005  max mem: 12911
Epoch: [154]  [1200/1251]  eta: 0:00:09  lr: 0.002113  min_lr: 0.002113  loss: 2.5140 (2.9667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6239 (0.7929)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 3.7698 (2.9736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6910 (0.7930)  time: 0.1464  data: 0.0009  max mem: 12911
Epoch: [154] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 3.7698 (2.9783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6910 (0.7930)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8297 (0.8297)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 5.7595  data: 5.6678  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9334 (0.9891)  acc1: 81.2000 (80.4000)  acc5: 96.4000 (96.1818)  time: 0.6828  data: 0.5876  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.2133 (1.1958)  acc1: 74.0000 (75.8857)  acc5: 93.2000 (93.3333)  time: 0.1726  data: 0.0851  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.3306 (1.2031)  acc1: 72.0000 (75.6480)  acc5: 91.2000 (93.1360)  time: 0.1988  data: 0.1133  max mem: 12911
Test: Total time: 0:00:10 (0.4078 s / it)
* Acc@1 75.430 Acc@5 93.042 loss 1.202
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 76.02%
Epoch: [155]  [   0/1251]  eta: 1:02:10  lr: 0.002113  min_lr: 0.002113  loss: 3.8767 (3.8767)  weight_decay: 0.0500 (0.0500)  time: 2.9818  data: 2.6102  max mem: 12911
Epoch: [155]  [ 200/1251]  eta: 0:03:34  lr: 0.002109  min_lr: 0.002109  loss: 2.5452 (2.9332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7605 (0.8271)  time: 0.1838  data: 0.0004  max mem: 12911
Epoch: [155]  [ 400/1251]  eta: 0:02:46  lr: 0.002105  min_lr: 0.002105  loss: 2.4083 (2.9017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6999 (0.8139)  time: 0.1977  data: 0.0004  max mem: 12911
Epoch: [155]  [ 600/1251]  eta: 0:02:06  lr: 0.002102  min_lr: 0.002102  loss: 2.5756 (2.9487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8656 (0.9599)  time: 0.1915  data: 0.0006  max mem: 12911
Epoch: [155]  [ 800/1251]  eta: 0:01:26  lr: 0.002098  min_lr: 0.002098  loss: 2.6171 (2.9699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6540 (0.8927)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [155]  [1000/1251]  eta: 0:00:48  lr: 0.002095  min_lr: 0.002095  loss: 3.3819 (2.9785)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8181 (0.8592)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [155]  [1200/1251]  eta: 0:00:09  lr: 0.002091  min_lr: 0.002091  loss: 2.2879 (2.9865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7300 (0.8471)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 2.4442 (2.9868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (0.8392)  time: 0.1460  data: 0.0007  max mem: 12911
Epoch: [155] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 2.4442 (2.9651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (0.8392)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7666 (0.7666)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 5.3778  data: 5.2786  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.9433 (0.9390)  acc1: 81.6000 (80.2182)  acc5: 96.4000 (95.6364)  time: 0.7219  data: 0.6276  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1813 (1.1255)  acc1: 74.0000 (75.8476)  acc5: 92.0000 (93.1619)  time: 0.2065  data: 0.1198  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2555 (1.1340)  acc1: 71.6000 (75.6800)  acc5: 91.6000 (93.0720)  time: 0.2120  data: 0.1277  max mem: 12911
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 75.910 Acc@5 93.132 loss 1.128
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.02%
Epoch: [156]  [   0/1251]  eta: 1:03:48  lr: 0.002090  min_lr: 0.002090  loss: 3.5374 (3.5374)  weight_decay: 0.0500 (0.0500)  time: 3.0602  data: 2.6885  max mem: 12911
Epoch: [156]  [ 200/1251]  eta: 0:03:34  lr: 0.002087  min_lr: 0.002087  loss: 2.5424 (2.8948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7698 (0.8364)  time: 0.1830  data: 0.0004  max mem: 12911
Epoch: [156]  [ 400/1251]  eta: 0:02:45  lr: 0.002083  min_lr: 0.002083  loss: 2.5030 (2.9264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.7962)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [156]  [ 600/1251]  eta: 0:02:05  lr: 0.002079  min_lr: 0.002079  loss: 2.6079 (2.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7461 (0.7778)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [156]  [ 800/1251]  eta: 0:01:26  lr: 0.002076  min_lr: 0.002076  loss: 2.5489 (2.9617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6020 (0.7572)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [156]  [1000/1251]  eta: 0:00:47  lr: 0.002072  min_lr: 0.002072  loss: 2.7193 (2.9702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7519 (0.7560)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [156]  [1200/1251]  eta: 0:00:09  lr: 0.002069  min_lr: 0.002069  loss: 2.6598 (2.9855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6074 (0.7517)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 3.5724 (2.9932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.7519)  time: 0.1451  data: 0.0007  max mem: 12911
Epoch: [156] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 3.5724 (2.9824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.7519)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8318 (0.8318)  acc1: 84.4000 (84.4000)  acc5: 98.0000 (98.0000)  time: 5.5988  data: 5.5072  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8937 (0.9859)  acc1: 81.6000 (80.4000)  acc5: 95.6000 (95.6364)  time: 0.7107  data: 0.6178  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1969 (1.1697)  acc1: 74.8000 (76.0000)  acc5: 93.6000 (93.3333)  time: 0.1956  data: 0.1093  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2797 (1.1800)  acc1: 73.6000 (75.7600)  acc5: 92.0000 (93.2640)  time: 0.2025  data: 0.1179  max mem: 12911
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 75.742 Acc@5 93.244 loss 1.180
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 76.02%
Epoch: [157]  [   0/1251]  eta: 1:06:26  lr: 0.002068  min_lr: 0.002068  loss: 4.1247 (4.1247)  weight_decay: 0.0500 (0.0500)  time: 3.1870  data: 2.2934  max mem: 12911
Epoch: [157]  [ 200/1251]  eta: 0:03:32  lr: 0.002064  min_lr: 0.002064  loss: 2.7491 (3.0023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6780 (0.7378)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [157]  [ 400/1251]  eta: 0:02:45  lr: 0.002061  min_lr: 0.002061  loss: 2.6548 (2.9649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7219 (0.7519)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [157]  [ 600/1251]  eta: 0:02:04  lr: 0.002057  min_lr: 0.002057  loss: 2.7806 (2.9720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6869 (0.7488)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [157]  [ 800/1251]  eta: 0:01:25  lr: 0.002053  min_lr: 0.002053  loss: 3.4770 (2.9969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7794 (0.7504)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [157]  [1000/1251]  eta: 0:00:47  lr: 0.002050  min_lr: 0.002050  loss: 2.7090 (2.9835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6415 (0.7599)  time: 0.1916  data: 0.0004  max mem: 12911
Epoch: [157]  [1200/1251]  eta: 0:00:09  lr: 0.002046  min_lr: 0.002046  loss: 2.5025 (2.9825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7040 (0.7609)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 2.5261 (2.9809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8159 (0.7654)  time: 0.1474  data: 0.0011  max mem: 12911
Epoch: [157] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 2.5261 (2.9615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8159 (0.7654)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7606 (0.7606)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.4047  data: 5.3013  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8886 (0.9206)  acc1: 81.6000 (80.6545)  acc5: 96.4000 (95.8545)  time: 0.7116  data: 0.6161  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1413 (1.1089)  acc1: 73.6000 (76.3238)  acc5: 92.8000 (93.2381)  time: 0.2027  data: 0.1155  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2662 (1.1200)  acc1: 72.8000 (76.0160)  acc5: 91.2000 (93.1840)  time: 0.2051  data: 0.1204  max mem: 12911
Test: Total time: 0:00:09 (0.3981 s / it)
* Acc@1 76.068 Acc@5 93.286 loss 1.116
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.07%
Epoch: [158]  [   0/1251]  eta: 1:08:44  lr: 0.002045  min_lr: 0.002045  loss: 2.1654 (2.1654)  weight_decay: 0.0500 (0.0500)  time: 3.2968  data: 3.0930  max mem: 12911
Epoch: [158]  [ 200/1251]  eta: 0:03:33  lr: 0.002042  min_lr: 0.002042  loss: 2.4341 (2.8937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (0.8339)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [158]  [ 400/1251]  eta: 0:02:47  lr: 0.002038  min_lr: 0.002038  loss: 2.3846 (2.8972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7077 (0.8025)  time: 0.1932  data: 0.0004  max mem: 12911
Epoch: [158]  [ 600/1251]  eta: 0:02:06  lr: 0.002035  min_lr: 0.002035  loss: 2.6981 (2.9375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7794 (0.7974)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [158]  [ 800/1251]  eta: 0:01:27  lr: 0.002031  min_lr: 0.002031  loss: 2.5505 (2.9690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7441 (0.7841)  time: 0.1932  data: 0.0005  max mem: 12911
Epoch: [158]  [1000/1251]  eta: 0:00:48  lr: 0.002027  min_lr: 0.002027  loss: 2.5629 (2.9664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7248 (0.7787)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [158]  [1200/1251]  eta: 0:00:09  lr: 0.002024  min_lr: 0.002024  loss: 2.5038 (2.9640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7272 (0.7696)  time: 0.1904  data: 0.0006  max mem: 12911
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 2.5790 (2.9662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.7712)  time: 0.1475  data: 0.0014  max mem: 12911
Epoch: [158] Total time: 0:04:01 (0.1927 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 2.5790 (2.9654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.7712)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6929 (0.6929)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 5.7281  data: 5.6365  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.9165 (0.9297)  acc1: 80.4000 (80.4364)  acc5: 96.8000 (96.0000)  time: 0.7504  data: 0.6547  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1579 (1.1190)  acc1: 74.4000 (76.2667)  acc5: 92.8000 (93.4857)  time: 0.2030  data: 0.1152  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2417 (1.1230)  acc1: 72.8000 (75.9200)  acc5: 91.6000 (93.3600)  time: 0.1997  data: 0.1152  max mem: 12911
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 75.990 Acc@5 93.286 loss 1.127
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.07%
Epoch: [159]  [   0/1251]  eta: 1:08:42  lr: 0.002023  min_lr: 0.002023  loss: 2.1680 (2.1680)  weight_decay: 0.0500 (0.0500)  time: 3.2952  data: 2.0347  max mem: 12911
Epoch: [159]  [ 200/1251]  eta: 0:03:34  lr: 0.002019  min_lr: 0.002019  loss: 2.8898 (3.0132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7717 (0.7992)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [159]  [ 400/1251]  eta: 0:02:46  lr: 0.002016  min_lr: 0.002016  loss: 2.2865 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6836 (0.8710)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [159]  [ 600/1251]  eta: 0:02:05  lr: 0.002012  min_lr: 0.002012  loss: 2.5614 (3.0191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6905 (0.8443)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [159]  [ 800/1251]  eta: 0:01:26  lr: 0.002009  min_lr: 0.002009  loss: 2.6774 (3.0057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8804 (0.8436)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [159]  [1000/1251]  eta: 0:00:47  lr: 0.002005  min_lr: 0.002005  loss: 3.2377 (2.9830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6555 (0.8186)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [159]  [1200/1251]  eta: 0:00:09  lr: 0.002001  min_lr: 0.002001  loss: 2.6739 (2.9895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.8166)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 3.4498 (2.9899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.8174)  time: 0.1471  data: 0.0010  max mem: 12911
Epoch: [159] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 3.4498 (2.9553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.8174)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6992 (0.6992)  acc1: 84.8000 (84.8000)  acc5: 99.2000 (99.2000)  time: 5.5871  data: 5.4954  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8905 (0.8943)  acc1: 82.4000 (81.2000)  acc5: 96.4000 (96.3273)  time: 0.6887  data: 0.5920  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1090 (1.0990)  acc1: 75.2000 (76.9333)  acc5: 92.4000 (93.4667)  time: 0.1804  data: 0.0911  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2405 (1.1141)  acc1: 73.2000 (76.4960)  acc5: 91.6000 (93.2960)  time: 0.1906  data: 0.1042  max mem: 12911
Test: Total time: 0:00:09 (0.3997 s / it)
* Acc@1 75.930 Acc@5 93.206 loss 1.117
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.07%
Epoch: [160]  [   0/1251]  eta: 1:04:01  lr: 0.002001  min_lr: 0.002001  loss: 2.2407 (2.2407)  weight_decay: 0.0500 (0.0500)  time: 3.0707  data: 2.7542  max mem: 12911
Epoch: [160]  [ 200/1251]  eta: 0:03:34  lr: 0.001997  min_lr: 0.001997  loss: 2.4606 (2.9673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7271 (0.8104)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [160]  [ 400/1251]  eta: 0:02:46  lr: 0.001993  min_lr: 0.001993  loss: 2.6410 (2.9604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6963 (0.7781)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [160]  [ 600/1251]  eta: 0:02:05  lr: 0.001990  min_lr: 0.001990  loss: 2.4834 (2.9313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7163 (0.7845)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [160]  [ 800/1251]  eta: 0:01:26  lr: 0.001986  min_lr: 0.001986  loss: 2.4069 (2.9194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7410 (0.7863)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [160]  [1000/1251]  eta: 0:00:47  lr: 0.001983  min_lr: 0.001983  loss: 2.4434 (2.9253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6482 (0.7762)  time: 0.1925  data: 0.0004  max mem: 12911
Epoch: [160]  [1200/1251]  eta: 0:00:09  lr: 0.001979  min_lr: 0.001979  loss: 2.4503 (2.9506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.7741)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 2.4254 (2.9400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.7715)  time: 0.1468  data: 0.0006  max mem: 12911
Epoch: [160] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 2.4254 (2.9664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.7715)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6949 (0.6949)  acc1: 83.2000 (83.2000)  acc5: 98.8000 (98.8000)  time: 5.4962  data: 5.4045  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8799 (0.8718)  acc1: 81.6000 (80.8727)  acc5: 96.8000 (96.0000)  time: 0.7427  data: 0.6461  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1245 (1.0735)  acc1: 73.2000 (76.4762)  acc5: 91.6000 (93.5810)  time: 0.2050  data: 0.1157  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2530 (1.0861)  acc1: 72.4000 (76.0640)  acc5: 91.6000 (93.5840)  time: 0.2019  data: 0.1156  max mem: 12911
Test: Total time: 0:00:09 (0.3999 s / it)
* Acc@1 75.922 Acc@5 93.396 loss 1.088
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.07%
Epoch: [161]  [   0/1251]  eta: 1:06:25  lr: 0.001978  min_lr: 0.001978  loss: 4.1474 (4.1474)  weight_decay: 0.0500 (0.0500)  time: 3.1859  data: 2.5919  max mem: 12911
Epoch: [161]  [ 200/1251]  eta: 0:03:34  lr: 0.001974  min_lr: 0.001974  loss: 2.3429 (2.9104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7614 (0.8611)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [161]  [ 400/1251]  eta: 0:02:46  lr: 0.001971  min_lr: 0.001971  loss: 2.4469 (2.9515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.8139)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [161]  [ 600/1251]  eta: 0:02:05  lr: 0.001967  min_lr: 0.001967  loss: 2.7198 (2.9953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8258)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [161]  [ 800/1251]  eta: 0:01:26  lr: 0.001964  min_lr: 0.001964  loss: 2.3070 (2.9933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7209 (0.8074)  time: 0.1838  data: 0.0005  max mem: 12911
Epoch: [161]  [1000/1251]  eta: 0:00:47  lr: 0.001960  min_lr: 0.001960  loss: 3.4205 (3.0047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6688 (0.7910)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [161]  [1200/1251]  eta: 0:00:09  lr: 0.001956  min_lr: 0.001956  loss: 2.5316 (2.9945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8181 (0.7856)  time: 0.1848  data: 0.0006  max mem: 12911
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 2.4790 (2.9973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6708 (0.7806)  time: 0.1455  data: 0.0008  max mem: 12911
Epoch: [161] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 2.4790 (2.9795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6708 (0.7806)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.6867 (0.6867)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.1129  data: 4.9771  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8658 (0.8864)  acc1: 80.8000 (80.5818)  acc5: 96.0000 (95.8546)  time: 0.7256  data: 0.6273  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1193 (1.0878)  acc1: 74.8000 (76.0571)  acc5: 93.2000 (93.4667)  time: 0.2199  data: 0.1323  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1602 (1.0940)  acc1: 73.2000 (75.7920)  acc5: 92.4000 (93.4400)  time: 0.2167  data: 0.1305  max mem: 12911
Test: Total time: 0:00:09 (0.3982 s / it)
* Acc@1 76.068 Acc@5 93.358 loss 1.087
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.07%
Epoch: [162]  [   0/1251]  eta: 1:05:56  lr: 0.001956  min_lr: 0.001956  loss: 2.2693 (2.2693)  weight_decay: 0.0500 (0.0500)  time: 3.1630  data: 2.2804  max mem: 12911
Epoch: [162]  [ 200/1251]  eta: 0:03:33  lr: 0.001952  min_lr: 0.001952  loss: 3.7299 (3.0025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6982 (0.7472)  time: 0.1858  data: 0.0006  max mem: 12911
Epoch: [162]  [ 400/1251]  eta: 0:02:46  lr: 0.001948  min_lr: 0.001948  loss: 2.4160 (2.9461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.7735)  time: 0.1851  data: 0.0005  max mem: 12911
Epoch: [162]  [ 600/1251]  eta: 0:02:05  lr: 0.001945  min_lr: 0.001945  loss: 2.2981 (2.9132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7424 (0.8076)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [162]  [ 800/1251]  eta: 0:01:26  lr: 0.001941  min_lr: 0.001941  loss: 2.8040 (2.9222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (0.8219)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [162]  [1000/1251]  eta: 0:00:47  lr: 0.001938  min_lr: 0.001938  loss: 2.5169 (2.9151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6150 (0.8064)  time: 0.1929  data: 0.0005  max mem: 12911
Epoch: [162]  [1200/1251]  eta: 0:00:09  lr: 0.001934  min_lr: 0.001934  loss: 2.4718 (2.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8108 (0.8169)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 2.4491 (2.9348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6421 (0.8135)  time: 0.1468  data: 0.0009  max mem: 12911
Epoch: [162] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 2.4491 (2.9383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6421 (0.8135)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6969 (0.6969)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 5.8034  data: 5.7117  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8851 (0.9045)  acc1: 80.8000 (81.0182)  acc5: 96.8000 (95.8909)  time: 0.7536  data: 0.6589  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1405 (1.1007)  acc1: 74.4000 (76.4191)  acc5: 92.4000 (93.3333)  time: 0.2070  data: 0.1197  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2030 (1.1122)  acc1: 72.8000 (75.6480)  acc5: 91.2000 (93.3920)  time: 0.2042  data: 0.1197  max mem: 12911
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 75.938 Acc@5 93.162 loss 1.115
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.07%
Epoch: [163]  [   0/1251]  eta: 1:02:22  lr: 0.001933  min_lr: 0.001933  loss: 4.0139 (4.0139)  weight_decay: 0.0500 (0.0500)  time: 2.9913  data: 2.6607  max mem: 12911
Epoch: [163]  [ 200/1251]  eta: 0:03:36  lr: 0.001930  min_lr: 0.001930  loss: 2.4913 (2.9410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8061 (0.8260)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [163]  [ 400/1251]  eta: 0:02:48  lr: 0.001926  min_lr: 0.001926  loss: 2.7524 (2.9654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7764 (0.8649)  time: 0.1890  data: 0.0006  max mem: 12911
Epoch: [163]  [ 600/1251]  eta: 0:02:06  lr: 0.001922  min_lr: 0.001922  loss: 2.3621 (2.9537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6634 (0.8359)  time: 0.1921  data: 0.0005  max mem: 12911
Epoch: [163]  [ 800/1251]  eta: 0:01:26  lr: 0.001919  min_lr: 0.001919  loss: 2.4161 (2.9526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6882 (0.8136)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [163]  [1000/1251]  eta: 0:00:48  lr: 0.001915  min_lr: 0.001915  loss: 2.5280 (2.9650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7422 (0.8094)  time: 0.1862  data: 0.0003  max mem: 12911
Epoch: [163]  [1200/1251]  eta: 0:00:09  lr: 0.001912  min_lr: 0.001912  loss: 2.5514 (2.9640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7963 (0.8120)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 2.3575 (2.9634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.8092)  time: 0.1465  data: 0.0007  max mem: 12911
Epoch: [163] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 2.3575 (2.9491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.8092)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6554 (0.6554)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.3520  data: 5.2604  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8226 (0.8389)  acc1: 80.8000 (80.8364)  acc5: 96.8000 (96.1091)  time: 0.6857  data: 0.5917  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0663 (1.0293)  acc1: 74.4000 (76.5905)  acc5: 92.8000 (93.5810)  time: 0.2021  data: 0.1151  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1651 (1.0362)  acc1: 73.2000 (76.3680)  acc5: 92.0000 (93.4720)  time: 0.1997  data: 0.1151  max mem: 12911
Test: Total time: 0:00:09 (0.3921 s / it)
* Acc@1 76.220 Acc@5 93.428 loss 1.033
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.22%
Epoch: [164]  [   0/1251]  eta: 0:57:59  lr: 0.001911  min_lr: 0.001911  loss: 3.9640 (3.9640)  weight_decay: 0.0500 (0.0500)  time: 2.7812  data: 2.5170  max mem: 12911
Epoch: [164]  [ 200/1251]  eta: 0:03:31  lr: 0.001907  min_lr: 0.001907  loss: 2.6579 (2.8765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.7077)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [164]  [ 400/1251]  eta: 0:02:45  lr: 0.001904  min_lr: 0.001904  loss: 2.4194 (2.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7448 (0.7720)  time: 0.1911  data: 0.0005  max mem: 12911
Epoch: [164]  [ 600/1251]  eta: 0:02:05  lr: 0.001900  min_lr: 0.001900  loss: 2.4390 (2.9089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8006 (0.7789)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [164]  [ 800/1251]  eta: 0:01:26  lr: 0.001896  min_lr: 0.001896  loss: 2.3964 (2.9016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.7835)  time: 0.1933  data: 0.0005  max mem: 12911
Epoch: [164]  [1000/1251]  eta: 0:00:47  lr: 0.001893  min_lr: 0.001893  loss: 2.5924 (2.9014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.7741)  time: 0.1839  data: 0.0004  max mem: 12911
Epoch: [164]  [1200/1251]  eta: 0:00:09  lr: 0.001889  min_lr: 0.001889  loss: 2.7751 (2.9205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8556 (nan)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 2.3437 (2.9213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (nan)  time: 0.1461  data: 0.0007  max mem: 12911
Epoch: [164] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 2.3437 (2.9448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6241 (0.6241)  acc1: 85.2000 (85.2000)  acc5: 98.8000 (98.8000)  time: 5.4769  data: 5.3850  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8762 (0.8558)  acc1: 82.4000 (81.0545)  acc5: 96.8000 (96.3273)  time: 0.7282  data: 0.6328  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1440 (1.0465)  acc1: 74.0000 (76.4952)  acc5: 93.6000 (93.9429)  time: 0.2049  data: 0.1173  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1770 (1.0562)  acc1: 72.8000 (76.2560)  acc5: 92.4000 (93.8560)  time: 0.2020  data: 0.1172  max mem: 12911
Test: Total time: 0:00:09 (0.3998 s / it)
* Acc@1 76.194 Acc@5 93.446 loss 1.058
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.22%
Epoch: [165]  [   0/1251]  eta: 1:05:52  lr: 0.001888  min_lr: 0.001888  loss: 3.8769 (3.8769)  weight_decay: 0.0500 (0.0500)  time: 3.1594  data: 2.1104  max mem: 12911
Epoch: [165]  [ 200/1251]  eta: 0:03:35  lr: 0.001885  min_lr: 0.001885  loss: 2.5517 (2.9216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7701 (0.8062)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [165]  [ 400/1251]  eta: 0:02:47  lr: 0.001881  min_lr: 0.001881  loss: 2.4060 (2.9665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7631 (0.7747)  time: 0.1988  data: 0.0005  max mem: 12911
Epoch: [165]  [ 600/1251]  eta: 0:02:06  lr: 0.001878  min_lr: 0.001878  loss: 2.4461 (2.9690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7112 (0.7688)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [165]  [ 800/1251]  eta: 0:01:26  lr: 0.001874  min_lr: 0.001874  loss: 2.9812 (2.9528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (0.7739)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [165]  [1000/1251]  eta: 0:00:47  lr: 0.001870  min_lr: 0.001870  loss: 2.9357 (2.9384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.7841)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [165]  [1200/1251]  eta: 0:00:09  lr: 0.001867  min_lr: 0.001867  loss: 2.2591 (2.9424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (0.7926)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 2.4892 (2.9429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.7899)  time: 0.1465  data: 0.0012  max mem: 12911
Epoch: [165] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 2.4892 (2.9381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.7899)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7210 (0.7210)  acc1: 83.2000 (83.2000)  acc5: 98.0000 (98.0000)  time: 5.6365  data: 5.5447  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8245 (0.8831)  acc1: 82.8000 (80.7636)  acc5: 96.4000 (95.8546)  time: 0.7433  data: 0.6466  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1048 (1.0712)  acc1: 74.0000 (76.7238)  acc5: 92.8000 (93.5238)  time: 0.2014  data: 0.1131  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2091 (1.0835)  acc1: 74.0000 (76.2400)  acc5: 92.0000 (93.3120)  time: 0.1975  data: 0.1131  max mem: 12911
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 76.428 Acc@5 93.410 loss 1.071
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.43%
Epoch: [166]  [   0/1251]  eta: 1:06:04  lr: 0.001866  min_lr: 0.001866  loss: 2.4779 (2.4779)  weight_decay: 0.0500 (0.0500)  time: 3.1692  data: 2.9293  max mem: 12911
Epoch: [166]  [ 200/1251]  eta: 0:03:32  lr: 0.001862  min_lr: 0.001862  loss: 2.5896 (3.0064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6949 (0.7482)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [166]  [ 400/1251]  eta: 0:02:45  lr: 0.001859  min_lr: 0.001859  loss: 3.7633 (2.9834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6424 (0.7150)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [166]  [ 600/1251]  eta: 0:02:05  lr: 0.001855  min_lr: 0.001855  loss: 2.6461 (2.9779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8756 (0.7505)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [166]  [ 800/1251]  eta: 0:01:26  lr: 0.001852  min_lr: 0.001852  loss: 3.3584 (2.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.7682)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [166]  [1000/1251]  eta: 0:00:47  lr: 0.001848  min_lr: 0.001848  loss: 2.7738 (2.9651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7627 (0.7762)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [166]  [1200/1251]  eta: 0:00:09  lr: 0.001844  min_lr: 0.001844  loss: 2.4894 (2.9664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8386 (0.7776)  time: 0.1914  data: 0.0005  max mem: 12911
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 2.8398 (2.9628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8663 (0.7837)  time: 0.1469  data: 0.0011  max mem: 12911
Epoch: [166] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 2.8398 (2.9475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8663 (0.7837)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7120 (0.7120)  acc1: 83.6000 (83.6000)  acc5: 99.2000 (99.2000)  time: 5.6237  data: 5.5319  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8972 (0.8684)  acc1: 81.2000 (81.3818)  acc5: 96.0000 (96.3273)  time: 0.7467  data: 0.6526  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0900 (1.0687)  acc1: 74.8000 (76.6095)  acc5: 93.6000 (93.7333)  time: 0.2163  data: 0.1294  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2472 (1.0840)  acc1: 74.0000 (76.3040)  acc5: 91.6000 (93.5040)  time: 0.2138  data: 0.1293  max mem: 12911
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 76.404 Acc@5 93.464 loss 1.083
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.43%
Epoch: [167]  [   0/1251]  eta: 1:04:53  lr: 0.001844  min_lr: 0.001844  loss: 3.5555 (3.5555)  weight_decay: 0.0500 (0.0500)  time: 3.1123  data: 2.1435  max mem: 12911
Epoch: [167]  [ 200/1251]  eta: 0:03:34  lr: 0.001840  min_lr: 0.001840  loss: 2.6667 (2.9498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6452 (0.8529)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [167]  [ 400/1251]  eta: 0:02:46  lr: 0.001836  min_lr: 0.001836  loss: 3.7152 (2.9419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7257 (0.8105)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [167]  [ 600/1251]  eta: 0:02:05  lr: 0.001833  min_lr: 0.001833  loss: 2.6653 (2.9660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7833 (0.7916)  time: 0.1877  data: 0.0006  max mem: 12911
Epoch: [167]  [ 800/1251]  eta: 0:01:26  lr: 0.001829  min_lr: 0.001829  loss: 3.1423 (2.9828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7426 (0.7884)  time: 0.1906  data: 0.0005  max mem: 12911
Epoch: [167]  [1000/1251]  eta: 0:00:48  lr: 0.001826  min_lr: 0.001826  loss: 2.3276 (2.9760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.7964)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [167]  [1200/1251]  eta: 0:00:09  lr: 0.001822  min_lr: 0.001822  loss: 2.7143 (2.9771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7906 (0.8041)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 2.4418 (2.9766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7630 (0.8005)  time: 0.1462  data: 0.0011  max mem: 12911
Epoch: [167] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 2.4418 (2.9352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7630 (0.8005)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7138 (0.7138)  acc1: 84.0000 (84.0000)  acc5: 98.8000 (98.8000)  time: 5.5952  data: 5.4600  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8338 (0.8871)  acc1: 81.6000 (80.8000)  acc5: 96.4000 (96.3273)  time: 0.7334  data: 0.6339  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1315 (1.0862)  acc1: 73.6000 (76.3810)  acc5: 93.6000 (93.9238)  time: 0.1945  data: 0.1060  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2398 (1.1025)  acc1: 73.2000 (76.1120)  acc5: 92.0000 (93.7600)  time: 0.1905  data: 0.1060  max mem: 12911
Test: Total time: 0:00:09 (0.3968 s / it)
* Acc@1 76.094 Acc@5 93.380 loss 1.107
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.43%
Epoch: [168]  [   0/1251]  eta: 1:07:28  lr: 0.001821  min_lr: 0.001821  loss: 2.1095 (2.1095)  weight_decay: 0.0500 (0.0500)  time: 3.2358  data: 2.0247  max mem: 12911
Epoch: [168]  [ 200/1251]  eta: 0:03:34  lr: 0.001818  min_lr: 0.001818  loss: 2.3125 (2.8880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7199 (0.8073)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [168]  [ 400/1251]  eta: 0:02:46  lr: 0.001814  min_lr: 0.001814  loss: 2.3447 (2.8797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7092 (0.8152)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [168]  [ 600/1251]  eta: 0:02:05  lr: 0.001811  min_lr: 0.001811  loss: 2.5029 (2.8934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.8168)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [168]  [ 800/1251]  eta: 0:01:26  lr: 0.001807  min_lr: 0.001807  loss: 2.7001 (2.9202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7939 (0.8832)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [168]  [1000/1251]  eta: 0:00:48  lr: 0.001803  min_lr: 0.001803  loss: 2.5035 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.8513)  time: 0.2075  data: 0.0004  max mem: 12911
Epoch: [168]  [1200/1251]  eta: 0:00:09  lr: 0.001800  min_lr: 0.001800  loss: 3.0464 (2.9189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7688 (0.8374)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 2.7510 (2.9258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.8493)  time: 0.1474  data: 0.0011  max mem: 12911
Epoch: [168] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 2.7510 (2.9195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.8493)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7392 (0.7392)  acc1: 85.2000 (85.2000)  acc5: 98.8000 (98.8000)  time: 5.3635  data: 5.2718  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8926 (0.9102)  acc1: 84.0000 (81.4545)  acc5: 96.0000 (96.1091)  time: 0.7450  data: 0.6482  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1490 (1.1215)  acc1: 74.4000 (76.8191)  acc5: 93.2000 (93.5238)  time: 0.2298  data: 0.1408  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2719 (1.1313)  acc1: 73.2000 (76.4800)  acc5: 92.0000 (93.5040)  time: 0.2264  data: 0.1407  max mem: 12911
Test: Total time: 0:00:10 (0.4142 s / it)
* Acc@1 76.158 Acc@5 93.320 loss 1.139
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.43%
Epoch: [169]  [   0/1251]  eta: 1:03:41  lr: 0.001799  min_lr: 0.001799  loss: 3.8695 (3.8695)  weight_decay: 0.0500 (0.0500)  time: 3.0551  data: 1.8230  max mem: 12911
Epoch: [169]  [ 200/1251]  eta: 0:03:35  lr: 0.001795  min_lr: 0.001795  loss: 2.4143 (2.9236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (1.1203)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [169]  [ 400/1251]  eta: 0:02:46  lr: 0.001792  min_lr: 0.001792  loss: 2.5187 (2.9365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7344 (0.9912)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [169]  [ 600/1251]  eta: 0:02:05  lr: 0.001788  min_lr: 0.001788  loss: 3.4219 (2.9169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7726 (0.9090)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [169]  [ 800/1251]  eta: 0:01:26  lr: 0.001785  min_lr: 0.001785  loss: 2.4152 (2.9422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6782 (0.8621)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [169]  [1000/1251]  eta: 0:00:47  lr: 0.001781  min_lr: 0.001781  loss: 2.2862 (2.9308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7260 (0.8412)  time: 0.1842  data: 0.0004  max mem: 12911
Epoch: [169]  [1200/1251]  eta: 0:00:09  lr: 0.001777  min_lr: 0.001777  loss: 3.3582 (2.9581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8712 (0.8337)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 2.3909 (2.9597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.8296)  time: 0.1462  data: 0.0006  max mem: 12911
Epoch: [169] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 2.3909 (2.9469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.8296)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.7484 (0.7484)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 4.5099  data: 4.4097  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8826 (0.9107)  acc1: 82.0000 (81.1636)  acc5: 96.0000 (95.7818)  time: 0.6489  data: 0.5587  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1390 (1.1008)  acc1: 75.6000 (76.7429)  acc5: 92.8000 (93.2191)  time: 0.2191  data: 0.1348  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2047 (1.1014)  acc1: 72.0000 (76.4480)  acc5: 92.4000 (93.2480)  time: 0.2050  data: 0.1225  max mem: 12911
Test: Total time: 0:00:09 (0.3968 s / it)
* Acc@1 76.192 Acc@5 93.438 loss 1.094
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.43%
Epoch: [170]  [   0/1251]  eta: 1:03:46  lr: 0.001777  min_lr: 0.001777  loss: 2.3515 (2.3515)  weight_decay: 0.0500 (0.0500)  time: 3.0584  data: 2.3799  max mem: 12911
Epoch: [170]  [ 200/1251]  eta: 0:03:34  lr: 0.001773  min_lr: 0.001773  loss: 2.3333 (2.8780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.6997)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [170]  [ 400/1251]  eta: 0:02:46  lr: 0.001769  min_lr: 0.001769  loss: 2.8276 (2.9704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7574)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [170]  [ 600/1251]  eta: 0:02:05  lr: 0.001766  min_lr: 0.001766  loss: 2.7025 (2.9686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7579 (0.7646)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [170]  [ 800/1251]  eta: 0:01:26  lr: 0.001762  min_lr: 0.001762  loss: 2.2091 (2.9628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8297 (0.7692)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [170]  [1000/1251]  eta: 0:00:47  lr: 0.001759  min_lr: 0.001759  loss: 2.4442 (2.9375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7861 (0.7679)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [170]  [1200/1251]  eta: 0:00:09  lr: 0.001755  min_lr: 0.001755  loss: 2.3976 (2.9336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6724 (0.7787)  time: 0.1902  data: 0.0004  max mem: 12911
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 2.6556 (2.9365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6998 (0.7759)  time: 0.1477  data: 0.0010  max mem: 12911
Epoch: [170] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 2.6556 (2.9260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6998 (0.7759)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6736 (0.6736)  acc1: 85.2000 (85.2000)  acc5: 98.4000 (98.4000)  time: 5.3568  data: 5.2549  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8681 (0.8523)  acc1: 80.4000 (80.5455)  acc5: 96.4000 (95.9636)  time: 0.7510  data: 0.6529  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0737 (1.0500)  acc1: 74.8000 (76.6286)  acc5: 92.4000 (93.8857)  time: 0.2235  data: 0.1346  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1731 (1.0649)  acc1: 73.2000 (76.3200)  acc5: 92.4000 (93.8080)  time: 0.2205  data: 0.1345  max mem: 12911
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 76.564 Acc@5 93.620 loss 1.060
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.56%
Epoch: [171]  [   0/1251]  eta: 0:56:23  lr: 0.001754  min_lr: 0.001754  loss: 2.3435 (2.3435)  weight_decay: 0.0500 (0.0500)  time: 2.7044  data: 2.4368  max mem: 12911
Epoch: [171]  [ 200/1251]  eta: 0:03:34  lr: 0.001751  min_lr: 0.001751  loss: 2.4590 (2.8052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8110 (0.8293)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [171]  [ 400/1251]  eta: 0:02:47  lr: 0.001747  min_lr: 0.001747  loss: 2.4984 (2.8357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8239 (0.8309)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [171]  [ 600/1251]  eta: 0:02:06  lr: 0.001744  min_lr: 0.001744  loss: 2.5290 (2.8565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7598 (0.8202)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [171]  [ 800/1251]  eta: 0:01:27  lr: 0.001740  min_lr: 0.001740  loss: 2.4672 (2.8880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7796 (0.8150)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [171]  [1000/1251]  eta: 0:00:48  lr: 0.001737  min_lr: 0.001737  loss: 2.3303 (2.9113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7008 (0.8074)  time: 0.1844  data: 0.0004  max mem: 12911
Epoch: [171]  [1200/1251]  eta: 0:00:09  lr: 0.001733  min_lr: 0.001733  loss: 2.3245 (2.9194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.8010)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 2.3547 (2.9260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7416 (0.8032)  time: 0.1460  data: 0.0007  max mem: 12911
Epoch: [171] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 2.3547 (2.9367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7416 (0.8032)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5987 (0.5987)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.2090  data: 5.1100  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8509 (0.8115)  acc1: 82.0000 (81.3455)  acc5: 96.0000 (96.1455)  time: 0.7076  data: 0.6125  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9953 (1.0140)  acc1: 74.4000 (76.9143)  acc5: 93.6000 (93.7905)  time: 0.2097  data: 0.1202  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1688 (1.0259)  acc1: 73.2000 (76.5440)  acc5: 92.8000 (93.7600)  time: 0.2083  data: 0.1209  max mem: 12911
Test: Total time: 0:00:09 (0.3930 s / it)
* Acc@1 76.744 Acc@5 93.644 loss 1.020
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.74%
Epoch: [172]  [   0/1251]  eta: 0:59:58  lr: 0.001732  min_lr: 0.001732  loss: 2.1673 (2.1673)  weight_decay: 0.0500 (0.0500)  time: 2.8763  data: 2.5778  max mem: 12911
Epoch: [172]  [ 200/1251]  eta: 0:03:35  lr: 0.001729  min_lr: 0.001729  loss: 2.5740 (2.8624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6982 (0.7729)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [172]  [ 400/1251]  eta: 0:02:47  lr: 0.001725  min_lr: 0.001725  loss: 2.4720 (2.9044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7022 (0.8205)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [172]  [ 600/1251]  eta: 0:02:05  lr: 0.001721  min_lr: 0.001721  loss: 2.4695 (2.8998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6835 (0.7802)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [172]  [ 800/1251]  eta: 0:01:26  lr: 0.001718  min_lr: 0.001718  loss: 2.2281 (2.8904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8279 (0.8154)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [172]  [1000/1251]  eta: 0:00:47  lr: 0.001714  min_lr: 0.001714  loss: 2.5051 (2.8977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7765 (0.8459)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [172]  [1200/1251]  eta: 0:00:09  lr: 0.001711  min_lr: 0.001711  loss: 3.2410 (2.9106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (0.8366)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 2.7680 (2.9145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (0.8393)  time: 0.1461  data: 0.0007  max mem: 12911
Epoch: [172] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 2.7680 (2.9243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (0.8393)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6732 (0.6732)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.7501  data: 5.6584  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8528 (0.8825)  acc1: 81.2000 (80.9091)  acc5: 96.8000 (95.8182)  time: 0.7648  data: 0.6669  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1153 (1.0748)  acc1: 74.8000 (76.4762)  acc5: 92.4000 (93.6381)  time: 0.1976  data: 0.1086  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2029 (1.0843)  acc1: 74.4000 (76.2560)  acc5: 92.4000 (93.5040)  time: 0.2118  data: 0.1267  max mem: 12911
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 76.562 Acc@5 93.612 loss 1.075
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.74%
Epoch: [173]  [   0/1251]  eta: 1:06:50  lr: 0.001710  min_lr: 0.001710  loss: 2.7780 (2.7780)  weight_decay: 0.0500 (0.0500)  time: 3.2062  data: 2.5109  max mem: 12911
Epoch: [173]  [ 200/1251]  eta: 0:03:32  lr: 0.001706  min_lr: 0.001706  loss: 2.5845 (2.9083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6880 (0.7385)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [173]  [ 400/1251]  eta: 0:02:46  lr: 0.001703  min_lr: 0.001703  loss: 2.4295 (2.8624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8271 (0.7917)  time: 0.1832  data: 0.0005  max mem: 12911
Epoch: [173]  [ 600/1251]  eta: 0:02:05  lr: 0.001699  min_lr: 0.001699  loss: 2.3441 (2.8787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6685 (0.7830)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [173]  [ 800/1251]  eta: 0:01:26  lr: 0.001696  min_lr: 0.001696  loss: 2.5447 (2.8792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (0.7691)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [173]  [1000/1251]  eta: 0:00:48  lr: 0.001692  min_lr: 0.001692  loss: 2.6987 (2.9005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8192 (0.7727)  time: 0.1858  data: 0.0005  max mem: 12911
Epoch: [173]  [1200/1251]  eta: 0:00:09  lr: 0.001689  min_lr: 0.001689  loss: 2.4014 (2.8871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7632 (0.7868)  time: 0.1913  data: 0.0006  max mem: 12911
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 2.5420 (2.8924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7189 (0.7875)  time: 0.1459  data: 0.0007  max mem: 12911
Epoch: [173] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 2.5420 (2.9125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7189 (0.7875)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7116 (0.7116)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.7402  data: 5.6483  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8880 (0.9135)  acc1: 82.0000 (80.1818)  acc5: 96.8000 (95.9273)  time: 0.7529  data: 0.6579  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1444 (1.1073)  acc1: 73.6000 (76.1333)  acc5: 92.0000 (93.5810)  time: 0.2078  data: 0.1204  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2468 (1.1205)  acc1: 72.8000 (75.8560)  acc5: 91.6000 (93.4560)  time: 0.2048  data: 0.1203  max mem: 12911
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 76.220 Acc@5 93.528 loss 1.106
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.74%
Epoch: [174]  [   0/1251]  eta: 1:04:14  lr: 0.001688  min_lr: 0.001688  loss: 2.0919 (2.0919)  weight_decay: 0.0500 (0.0500)  time: 3.0809  data: 1.7243  max mem: 12911
Epoch: [174]  [ 200/1251]  eta: 0:03:32  lr: 0.001684  min_lr: 0.001684  loss: 2.7991 (2.9963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7872 (0.7585)  time: 0.1846  data: 0.0005  max mem: 12911
Epoch: [174]  [ 400/1251]  eta: 0:02:44  lr: 0.001681  min_lr: 0.001681  loss: 2.9009 (2.9477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7467 (0.7747)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [174]  [ 600/1251]  eta: 0:02:04  lr: 0.001677  min_lr: 0.001677  loss: 2.3834 (2.9582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9367 (0.8611)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [174]  [ 800/1251]  eta: 0:01:25  lr: 0.001674  min_lr: 0.001674  loss: 2.4777 (2.9481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.8561)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [174]  [1000/1251]  eta: 0:00:47  lr: 0.001670  min_lr: 0.001670  loss: 2.6437 (2.9460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7682 (0.8393)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [174]  [1200/1251]  eta: 0:00:09  lr: 0.001666  min_lr: 0.001666  loss: 3.1171 (2.9414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.8494)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.4263 (2.9397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.8492)  time: 0.1463  data: 0.0010  max mem: 12911
Epoch: [174] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.4263 (2.9243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.8492)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6411 (0.6411)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.6680  data: 5.5684  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8612 (0.8485)  acc1: 80.8000 (81.2364)  acc5: 96.4000 (96.0000)  time: 0.7516  data: 0.6539  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0469 (1.0464)  acc1: 75.2000 (77.1429)  acc5: 93.2000 (93.5810)  time: 0.2060  data: 0.1176  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1602 (1.0637)  acc1: 74.0000 (76.6400)  acc5: 92.8000 (93.5520)  time: 0.2020  data: 0.1175  max mem: 12911
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 76.496 Acc@5 93.542 loss 1.058
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.74%
Epoch: [175]  [   0/1251]  eta: 1:05:47  lr: 0.001666  min_lr: 0.001666  loss: 2.2187 (2.2187)  weight_decay: 0.0500 (0.0500)  time: 3.1556  data: 2.5541  max mem: 12911
Epoch: [175]  [ 200/1251]  eta: 0:03:35  lr: 0.001662  min_lr: 0.001662  loss: 3.3591 (2.9952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7412 (0.7969)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [175]  [ 400/1251]  eta: 0:02:46  lr: 0.001658  min_lr: 0.001658  loss: 2.3515 (2.9230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7493 (0.8237)  time: 0.1894  data: 0.0006  max mem: 12911
Epoch: [175]  [ 600/1251]  eta: 0:02:05  lr: 0.001655  min_lr: 0.001655  loss: 2.2917 (2.9063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.8449)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [175]  [ 800/1251]  eta: 0:01:26  lr: 0.001651  min_lr: 0.001651  loss: 2.4020 (2.9241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7879 (0.8429)  time: 0.1817  data: 0.0005  max mem: 12911
Epoch: [175]  [1000/1251]  eta: 0:00:47  lr: 0.001648  min_lr: 0.001648  loss: 3.0294 (2.9282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6726 (0.8271)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [175]  [1200/1251]  eta: 0:00:09  lr: 0.001644  min_lr: 0.001644  loss: 3.7419 (2.9331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7074 (0.8223)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 2.2873 (2.9297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.8169)  time: 0.1452  data: 0.0006  max mem: 12911
Epoch: [175] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 2.2873 (2.9125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.8169)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6369 (0.6369)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.6463  data: 5.5103  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8334 (0.8386)  acc1: 82.4000 (81.2364)  acc5: 96.4000 (96.1091)  time: 0.7496  data: 0.6479  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0975 (1.0303)  acc1: 74.0000 (77.1619)  acc5: 92.8000 (93.6762)  time: 0.2017  data: 0.1129  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1293 (1.0370)  acc1: 74.0000 (76.9760)  acc5: 92.4000 (93.6160)  time: 0.2021  data: 0.1172  max mem: 12911
Test: Total time: 0:00:10 (0.4067 s / it)
* Acc@1 76.744 Acc@5 93.598 loss 1.032
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.74%
Epoch: [176]  [   0/1251]  eta: 1:04:04  lr: 0.001643  min_lr: 0.001643  loss: 3.0035 (3.0035)  weight_decay: 0.0500 (0.0500)  time: 3.0735  data: 2.3984  max mem: 12911
Epoch: [176]  [ 200/1251]  eta: 0:03:33  lr: 0.001640  min_lr: 0.001640  loss: 2.6431 (2.9082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8372 (0.8006)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [176]  [ 400/1251]  eta: 0:02:46  lr: 0.001636  min_lr: 0.001636  loss: 2.7572 (2.8967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.8060)  time: 0.1873  data: 0.0007  max mem: 12911
Epoch: [176]  [ 600/1251]  eta: 0:02:06  lr: 0.001633  min_lr: 0.001633  loss: 2.2819 (2.8684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7919 (0.8196)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [176]  [ 800/1251]  eta: 0:01:26  lr: 0.001629  min_lr: 0.001629  loss: 2.3903 (2.8627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6730 (0.8284)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [176]  [1000/1251]  eta: 0:00:47  lr: 0.001626  min_lr: 0.001626  loss: 2.5296 (2.8697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7920 (0.8791)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [176]  [1200/1251]  eta: 0:00:09  lr: 0.001622  min_lr: 0.001622  loss: 2.4292 (2.8915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.8529)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 2.4332 (2.8968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.8475)  time: 0.1461  data: 0.0011  max mem: 12911
Epoch: [176] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 2.4332 (2.9190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.8475)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6896 (0.6896)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.5695  data: 5.4754  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8252 (0.8462)  acc1: 83.6000 (81.3455)  acc5: 97.2000 (96.2909)  time: 0.7548  data: 0.6603  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0815 (1.0401)  acc1: 74.4000 (76.8952)  acc5: 92.8000 (93.9238)  time: 0.2063  data: 0.1183  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1998 (1.0509)  acc1: 73.6000 (76.4640)  acc5: 92.4000 (93.8400)  time: 0.2045  data: 0.1183  max mem: 12911
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 76.458 Acc@5 93.546 loss 1.052
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.74%
Epoch: [177]  [   0/1251]  eta: 0:59:11  lr: 0.001621  min_lr: 0.001621  loss: 4.0253 (4.0253)  weight_decay: 0.0500 (0.0500)  time: 2.8390  data: 2.5403  max mem: 12911
Epoch: [177]  [ 200/1251]  eta: 0:03:36  lr: 0.001618  min_lr: 0.001618  loss: 2.7402 (2.8677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.7308)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [177]  [ 400/1251]  eta: 0:02:48  lr: 0.001614  min_lr: 0.001614  loss: 2.3439 (2.9149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7488)  time: 0.1915  data: 0.0004  max mem: 12911
Epoch: [177]  [ 600/1251]  eta: 0:02:06  lr: 0.001611  min_lr: 0.001611  loss: 3.4155 (2.9345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.7713)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [177]  [ 800/1251]  eta: 0:01:26  lr: 0.001607  min_lr: 0.001607  loss: 2.3441 (2.9256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7212 (0.7958)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [177]  [1000/1251]  eta: 0:00:48  lr: 0.001604  min_lr: 0.001604  loss: 3.0929 (2.9167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7630 (0.7812)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [177]  [1200/1251]  eta: 0:00:09  lr: 0.001600  min_lr: 0.001600  loss: 2.4068 (2.9067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6494 (0.7815)  time: 0.1874  data: 0.0006  max mem: 12911
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 2.5001 (2.9045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7498 (0.7834)  time: 0.1477  data: 0.0010  max mem: 12911
Epoch: [177] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 2.5001 (2.9100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7498 (0.7834)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6585 (0.6585)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.3747  data: 5.2678  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8603 (0.8749)  acc1: 82.0000 (81.3455)  acc5: 96.8000 (96.1455)  time: 0.7171  data: 0.6237  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1365 (1.0817)  acc1: 72.8000 (76.7429)  acc5: 93.2000 (93.7714)  time: 0.2087  data: 0.1229  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2375 (1.0959)  acc1: 72.8000 (76.3840)  acc5: 92.4000 (93.5840)  time: 0.2074  data: 0.1228  max mem: 12911
Test: Total time: 0:00:09 (0.3976 s / it)
* Acc@1 76.390 Acc@5 93.606 loss 1.091
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.74%
Epoch: [178]  [   0/1251]  eta: 1:08:03  lr: 0.001599  min_lr: 0.001599  loss: 2.4337 (2.4337)  weight_decay: 0.0500 (0.0500)  time: 3.2642  data: 2.2475  max mem: 12911
Epoch: [178]  [ 200/1251]  eta: 0:03:33  lr: 0.001596  min_lr: 0.001596  loss: 2.3413 (2.9442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7447 (0.8510)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [178]  [ 400/1251]  eta: 0:02:47  lr: 0.001592  min_lr: 0.001592  loss: 2.3895 (2.9079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1799 (1.0045)  time: 0.1883  data: 0.0007  max mem: 12911
Epoch: [178]  [ 600/1251]  eta: 0:02:05  lr: 0.001589  min_lr: 0.001589  loss: 3.4077 (2.9443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6932 (0.9589)  time: 0.1904  data: 0.0004  max mem: 12911
Epoch: [178]  [ 800/1251]  eta: 0:01:26  lr: 0.001585  min_lr: 0.001585  loss: 2.3930 (2.9235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6977 (0.8936)  time: 0.1838  data: 0.0004  max mem: 12911
Epoch: [178]  [1000/1251]  eta: 0:00:47  lr: 0.001582  min_lr: 0.001582  loss: 2.6284 (2.9198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7307 (0.8639)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [178]  [1200/1251]  eta: 0:00:09  lr: 0.001578  min_lr: 0.001578  loss: 2.9936 (2.9116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7682 (0.8493)  time: 0.2010  data: 0.0006  max mem: 12911
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 2.5088 (2.9107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8149 (0.8533)  time: 0.1476  data: 0.0013  max mem: 12911
Epoch: [178] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 2.5088 (2.9148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8149 (0.8533)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.6509 (0.6509)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 4.9961  data: 4.8962  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8073 (0.8431)  acc1: 83.6000 (81.3091)  acc5: 96.4000 (96.1091)  time: 0.6656  data: 0.5732  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0972 (1.0487)  acc1: 74.0000 (76.8571)  acc5: 92.8000 (93.8095)  time: 0.2141  data: 0.1271  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1894 (1.0574)  acc1: 74.0000 (76.7200)  acc5: 92.4000 (93.6640)  time: 0.2267  data: 0.1427  max mem: 12911
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 76.642 Acc@5 93.628 loss 1.052
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.74%
Epoch: [179]  [   0/1251]  eta: 1:00:35  lr: 0.001577  min_lr: 0.001577  loss: 3.7294 (3.7294)  weight_decay: 0.0500 (0.0500)  time: 2.9057  data: 1.7967  max mem: 12911
Epoch: [179]  [ 200/1251]  eta: 0:03:32  lr: 0.001574  min_lr: 0.001574  loss: 2.2441 (2.9467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.8265)  time: 0.1857  data: 0.0004  max mem: 12911
Epoch: [179]  [ 400/1251]  eta: 0:02:45  lr: 0.001570  min_lr: 0.001570  loss: 2.5470 (2.8860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7888 (0.8068)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [179]  [ 600/1251]  eta: 0:02:04  lr: 0.001567  min_lr: 0.001567  loss: 2.3003 (2.8763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (0.8388)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [179]  [ 800/1251]  eta: 0:01:25  lr: 0.001563  min_lr: 0.001563  loss: 2.6898 (2.8640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.8343)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [179]  [1000/1251]  eta: 0:00:47  lr: 0.001560  min_lr: 0.001560  loss: 2.9126 (2.8702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (0.8176)  time: 0.1975  data: 0.0004  max mem: 12911
Epoch: [179]  [1200/1251]  eta: 0:00:09  lr: 0.001556  min_lr: 0.001556  loss: 2.4851 (2.8596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.8168)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 2.4440 (2.8664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.8144)  time: 0.1463  data: 0.0007  max mem: 12911
Epoch: [179] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 2.4440 (2.8903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.8144)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6747 (0.6747)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.8033  data: 5.6887  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8383 (0.8524)  acc1: 83.6000 (81.6364)  acc5: 96.4000 (95.8545)  time: 0.7496  data: 0.6578  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0598 (1.0517)  acc1: 75.2000 (77.2952)  acc5: 93.2000 (93.7143)  time: 0.1953  data: 0.1108  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1929 (1.0652)  acc1: 74.4000 (76.7680)  acc5: 92.8000 (93.6320)  time: 0.1943  data: 0.1107  max mem: 12911
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 76.754 Acc@5 93.648 loss 1.053
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.75%
Epoch: [180]  [   0/1251]  eta: 0:57:10  lr: 0.001556  min_lr: 0.001556  loss: 2.4027 (2.4027)  weight_decay: 0.0500 (0.0500)  time: 2.7423  data: 2.4802  max mem: 12911
Epoch: [180]  [ 200/1251]  eta: 0:03:31  lr: 0.001552  min_lr: 0.001552  loss: 2.3275 (2.9281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8260 (0.7759)  time: 0.1834  data: 0.0005  max mem: 12911
Epoch: [180]  [ 400/1251]  eta: 0:02:45  lr: 0.001549  min_lr: 0.001549  loss: 2.2883 (2.9342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7654 (0.7879)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [180]  [ 600/1251]  eta: 0:02:04  lr: 0.001545  min_lr: 0.001545  loss: 2.9815 (2.9443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.7904)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [180]  [ 800/1251]  eta: 0:01:26  lr: 0.001542  min_lr: 0.001542  loss: 2.4439 (2.9005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.8059)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [180]  [1000/1251]  eta: 0:00:47  lr: 0.001538  min_lr: 0.001538  loss: 2.3906 (2.8978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (nan)  time: 0.1885  data: 0.0006  max mem: 12911
Epoch: [180]  [1200/1251]  eta: 0:00:09  lr: 0.001535  min_lr: 0.001535  loss: 2.4333 (2.9069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (nan)  time: 0.1865  data: 0.0006  max mem: 12911
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 2.7234 (2.9069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8404 (nan)  time: 0.1468  data: 0.0011  max mem: 12911
Epoch: [180] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 2.7234 (2.8924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8404 (nan)
Test:  [ 0/25]  eta: 0:01:22  loss: 0.6858 (0.6858)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 3.2920  data: 3.1966  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 0.8490 (0.8516)  acc1: 83.2000 (81.3818)  acc5: 96.8000 (96.2546)  time: 0.5887  data: 0.5019  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0841 (1.0631)  acc1: 74.0000 (76.9905)  acc5: 92.4000 (93.7333)  time: 0.2647  data: 0.1819  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2423 (1.0728)  acc1: 73.6000 (76.7040)  acc5: 92.0000 (93.6480)  time: 0.2254  data: 0.1433  max mem: 12911
Test: Total time: 0:00:09 (0.3917 s / it)
* Acc@1 76.822 Acc@5 93.720 loss 1.061
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.82%
Epoch: [181]  [   0/1251]  eta: 1:06:01  lr: 0.001534  min_lr: 0.001534  loss: 2.1923 (2.1923)  weight_decay: 0.0500 (0.0500)  time: 3.1667  data: 2.9232  max mem: 12911
Epoch: [181]  [ 200/1251]  eta: 0:03:33  lr: 0.001530  min_lr: 0.001530  loss: 2.2048 (2.8350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9154 (0.9035)  time: 0.1919  data: 0.0005  max mem: 12911
Epoch: [181]  [ 400/1251]  eta: 0:02:45  lr: 0.001527  min_lr: 0.001527  loss: 2.3656 (2.8480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8145 (0.9048)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [181]  [ 600/1251]  eta: 0:02:04  lr: 0.001523  min_lr: 0.001523  loss: 2.2753 (2.8428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8242 (0.8804)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [181]  [ 800/1251]  eta: 0:01:25  lr: 0.001520  min_lr: 0.001520  loss: 2.3790 (2.8789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8579 (nan)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [181]  [1000/1251]  eta: 0:00:47  lr: 0.001516  min_lr: 0.001516  loss: 2.6982 (2.8781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7400 (nan)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [181]  [1200/1251]  eta: 0:00:09  lr: 0.001513  min_lr: 0.001513  loss: 2.7396 (2.8820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9990 (nan)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 2.3252 (2.8800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (nan)  time: 0.1469  data: 0.0011  max mem: 12911
Epoch: [181] Total time: 0:03:57 (0.1901 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 2.3252 (2.8897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6429 (0.6429)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.6891  data: 5.5579  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8312 (0.8082)  acc1: 82.4000 (81.6000)  acc5: 96.4000 (96.0000)  time: 0.7271  data: 0.6357  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0512 (1.0102)  acc1: 74.0000 (77.0667)  acc5: 92.8000 (93.4667)  time: 0.1939  data: 0.1102  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1373 (1.0216)  acc1: 72.8000 (76.7840)  acc5: 92.4000 (93.3440)  time: 0.1964  data: 0.1139  max mem: 12911
Test: Total time: 0:00:10 (0.4013 s / it)
* Acc@1 76.888 Acc@5 93.576 loss 1.006
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.89%
Epoch: [182]  [   0/1251]  eta: 1:02:27  lr: 0.001512  min_lr: 0.001512  loss: 2.1203 (2.1203)  weight_decay: 0.0500 (0.0500)  time: 2.9955  data: 2.7269  max mem: 12911
Epoch: [182]  [ 200/1251]  eta: 0:03:28  lr: 0.001508  min_lr: 0.001508  loss: 2.2815 (2.7390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (0.7960)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [182]  [ 400/1251]  eta: 0:02:44  lr: 0.001505  min_lr: 0.001505  loss: 3.5642 (2.7715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (0.7991)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [182]  [ 600/1251]  eta: 0:02:04  lr: 0.001501  min_lr: 0.001501  loss: 2.3596 (2.8142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (nan)  time: 0.1876  data: 0.0003  max mem: 12911
Epoch: [182]  [ 800/1251]  eta: 0:01:25  lr: 0.001498  min_lr: 0.001498  loss: 2.2611 (2.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8238 (nan)  time: 0.1832  data: 0.0005  max mem: 12911
Epoch: [182]  [1000/1251]  eta: 0:00:47  lr: 0.001495  min_lr: 0.001495  loss: 2.2846 (2.8219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (nan)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [182]  [1200/1251]  eta: 0:00:09  lr: 0.001491  min_lr: 0.001491  loss: 3.0566 (2.8452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8810 (nan)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 2.5572 (2.8452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8946 (nan)  time: 0.1464  data: 0.0008  max mem: 12911
Epoch: [182] Total time: 0:03:56 (0.1892 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 2.5572 (2.8845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8946 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6373 (0.6373)  acc1: 86.4000 (86.4000)  acc5: 99.2000 (99.2000)  time: 5.4776  data: 5.3531  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8681 (0.8346)  acc1: 80.0000 (80.5818)  acc5: 96.4000 (96.1818)  time: 0.6237  data: 0.5265  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0886 (1.0168)  acc1: 74.0000 (76.4952)  acc5: 92.4000 (93.7905)  time: 0.1647  data: 0.0776  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1304 (1.0251)  acc1: 73.6000 (76.2880)  acc5: 92.0000 (93.7920)  time: 0.2110  data: 0.1275  max mem: 12911
Test: Total time: 0:00:10 (0.4093 s / it)
* Acc@1 76.722 Acc@5 93.598 loss 1.017
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.89%
Epoch: [183]  [   0/1251]  eta: 1:04:52  lr: 0.001490  min_lr: 0.001490  loss: 2.2652 (2.2652)  weight_decay: 0.0500 (0.0500)  time: 3.1115  data: 2.2224  max mem: 12911
Epoch: [183]  [ 200/1251]  eta: 0:03:36  lr: 0.001487  min_lr: 0.001487  loss: 2.4151 (2.9423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (0.7797)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [183]  [ 400/1251]  eta: 0:02:47  lr: 0.001483  min_lr: 0.001483  loss: 2.3481 (2.8815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7292 (0.7857)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [183]  [ 600/1251]  eta: 0:02:06  lr: 0.001480  min_lr: 0.001480  loss: 2.2634 (2.9124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9259 (0.8376)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [183]  [ 800/1251]  eta: 0:01:26  lr: 0.001476  min_lr: 0.001476  loss: 2.5201 (2.9175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (0.8378)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [183]  [1000/1251]  eta: 0:00:48  lr: 0.001473  min_lr: 0.001473  loss: 2.2751 (2.9170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7823 (0.8240)  time: 0.1939  data: 0.0004  max mem: 12911
Epoch: [183]  [1200/1251]  eta: 0:00:09  lr: 0.001469  min_lr: 0.001469  loss: 2.3419 (2.9102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6778 (0.8143)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 2.6777 (2.9094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.8201)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [183] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 2.6777 (2.8811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.8201)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7030 (0.7030)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.5913  data: 5.4997  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8447 (0.8896)  acc1: 81.6000 (81.4909)  acc5: 96.0000 (95.6727)  time: 0.7385  data: 0.6419  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1570 (1.0769)  acc1: 76.0000 (77.1619)  acc5: 92.0000 (93.5429)  time: 0.2010  data: 0.1126  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2426 (1.0920)  acc1: 72.8000 (76.6240)  acc5: 92.0000 (93.4240)  time: 0.1972  data: 0.1126  max mem: 12911
Test: Total time: 0:00:10 (0.4016 s / it)
* Acc@1 76.842 Acc@5 93.716 loss 1.079
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.89%
Epoch: [184]  [   0/1251]  eta: 1:01:44  lr: 0.001469  min_lr: 0.001469  loss: 3.3085 (3.3085)  weight_decay: 0.0500 (0.0500)  time: 2.9611  data: 2.6791  max mem: 12911
Epoch: [184]  [ 200/1251]  eta: 0:03:35  lr: 0.001465  min_lr: 0.001465  loss: 2.2792 (2.7987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.8188)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [184]  [ 400/1251]  eta: 0:02:46  lr: 0.001462  min_lr: 0.001462  loss: 2.3695 (2.8063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9124 (0.8804)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [184]  [ 600/1251]  eta: 0:02:05  lr: 0.001458  min_lr: 0.001458  loss: 3.0203 (2.8611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9054 (0.8746)  time: 0.1844  data: 0.0005  max mem: 12911
Epoch: [184]  [ 800/1251]  eta: 0:01:25  lr: 0.001455  min_lr: 0.001455  loss: 2.2855 (2.8641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9135 (0.9716)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [184]  [1000/1251]  eta: 0:00:47  lr: 0.001451  min_lr: 0.001451  loss: 2.3570 (2.8679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7359 (0.9256)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [184]  [1200/1251]  eta: 0:00:09  lr: 0.001448  min_lr: 0.001448  loss: 2.4751 (2.8677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.8975)  time: 0.1910  data: 0.0006  max mem: 12911
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 2.4292 (2.8683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.8948)  time: 0.1456  data: 0.0008  max mem: 12911
Epoch: [184] Total time: 0:03:57 (0.1895 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 2.4292 (2.8872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.8948)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6861 (0.6861)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.5045  data: 5.4126  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8416 (0.8577)  acc1: 81.2000 (80.7273)  acc5: 96.8000 (96.2182)  time: 0.7346  data: 0.6386  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0879 (1.0496)  acc1: 74.8000 (76.8571)  acc5: 93.2000 (93.7714)  time: 0.2213  data: 0.1323  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1391 (1.0622)  acc1: 74.0000 (76.4800)  acc5: 92.4000 (93.5680)  time: 0.2217  data: 0.1361  max mem: 12911
Test: Total time: 0:00:10 (0.4162 s / it)
* Acc@1 77.048 Acc@5 93.740 loss 1.052
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.05%
Epoch: [185]  [   0/1251]  eta: 1:05:32  lr: 0.001447  min_lr: 0.001447  loss: 3.0507 (3.0507)  weight_decay: 0.0500 (0.0500)  time: 3.1431  data: 2.8929  max mem: 12911
Epoch: [185]  [ 200/1251]  eta: 0:03:32  lr: 0.001444  min_lr: 0.001444  loss: 2.3801 (2.8177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8226 (0.8642)  time: 0.1876  data: 0.0003  max mem: 12911
Epoch: [185]  [ 400/1251]  eta: 0:02:46  lr: 0.001440  min_lr: 0.001440  loss: 2.4240 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7490 (0.9359)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [185]  [ 600/1251]  eta: 0:02:05  lr: 0.001437  min_lr: 0.001437  loss: 2.3603 (2.8856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6783 (0.9599)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [185]  [ 800/1251]  eta: 0:01:26  lr: 0.001433  min_lr: 0.001433  loss: 2.5164 (2.8987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.8966)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [185]  [1000/1251]  eta: 0:00:47  lr: 0.001430  min_lr: 0.001430  loss: 2.3337 (2.8892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.8769)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [185]  [1200/1251]  eta: 0:00:09  lr: 0.001426  min_lr: 0.001426  loss: 2.3140 (2.8951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8720)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 2.2154 (2.8912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8710)  time: 0.1466  data: 0.0007  max mem: 12911
Epoch: [185] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 2.2154 (2.8868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8710)
Test:  [ 0/25]  eta: 0:01:54  loss: 0.6922 (0.6922)  acc1: 86.4000 (86.4000)  acc5: 97.2000 (97.2000)  time: 4.5798  data: 4.4799  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8571 (0.8709)  acc1: 80.4000 (81.6364)  acc5: 96.4000 (96.0727)  time: 0.7528  data: 0.6614  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1053 (1.0556)  acc1: 75.6000 (77.2191)  acc5: 92.8000 (93.7714)  time: 0.2543  data: 0.1684  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1688 (1.0698)  acc1: 74.4000 (76.6880)  acc5: 92.8000 (93.8080)  time: 0.2103  data: 0.1277  max mem: 12911
Test: Total time: 0:00:10 (0.4023 s / it)
* Acc@1 76.966 Acc@5 93.720 loss 1.061
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.05%
Epoch: [186]  [   0/1251]  eta: 1:03:49  lr: 0.001425  min_lr: 0.001425  loss: 4.0267 (4.0267)  weight_decay: 0.0500 (0.0500)  time: 3.0609  data: 2.2778  max mem: 12911
Epoch: [186]  [ 200/1251]  eta: 0:03:35  lr: 0.001422  min_lr: 0.001422  loss: 2.4527 (2.8868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9815 (0.9591)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [186]  [ 400/1251]  eta: 0:02:46  lr: 0.001419  min_lr: 0.001419  loss: 2.7083 (2.9060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8550 (0.8951)  time: 0.1901  data: 0.0006  max mem: 12911
Epoch: [186]  [ 600/1251]  eta: 0:02:05  lr: 0.001415  min_lr: 0.001415  loss: 2.3426 (2.8947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7281 (0.8719)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [186]  [ 800/1251]  eta: 0:01:26  lr: 0.001412  min_lr: 0.001412  loss: 2.4756 (2.8981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.8453)  time: 0.1882  data: 0.0004  max mem: 12911
Epoch: [186]  [1000/1251]  eta: 0:00:48  lr: 0.001408  min_lr: 0.001408  loss: 3.5636 (2.9069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7484 (0.8353)  time: 0.1875  data: 0.0006  max mem: 12911
Epoch: [186]  [1200/1251]  eta: 0:00:09  lr: 0.001405  min_lr: 0.001405  loss: 2.1890 (2.8819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9067 (0.8401)  time: 0.1864  data: 0.0006  max mem: 12911
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 2.3656 (2.8826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7559 (0.8374)  time: 0.1484  data: 0.0008  max mem: 12911
Epoch: [186] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 2.3656 (2.8784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7559 (0.8374)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6129 (0.6129)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.5539  data: 5.4268  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7673 (0.8272)  acc1: 82.8000 (81.8545)  acc5: 97.2000 (96.5091)  time: 0.7434  data: 0.6552  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1322 (1.0224)  acc1: 75.2000 (77.0476)  acc5: 93.6000 (94.1714)  time: 0.2161  data: 0.1335  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1390 (1.0342)  acc1: 73.2000 (76.5440)  acc5: 92.8000 (94.0160)  time: 0.2147  data: 0.1334  max mem: 12911
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 77.074 Acc@5 93.878 loss 1.029
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.07%
Epoch: [187]  [   0/1251]  eta: 1:08:17  lr: 0.001404  min_lr: 0.001404  loss: 2.6963 (2.6963)  weight_decay: 0.0500 (0.0500)  time: 3.2755  data: 3.0602  max mem: 12911
Epoch: [187]  [ 200/1251]  eta: 0:03:32  lr: 0.001401  min_lr: 0.001401  loss: 2.4281 (2.8716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.8409)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [187]  [ 400/1251]  eta: 0:02:46  lr: 0.001397  min_lr: 0.001397  loss: 2.2722 (2.8537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7108 (0.8131)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [187]  [ 600/1251]  eta: 0:02:05  lr: 0.001394  min_lr: 0.001394  loss: 2.1574 (2.8448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6836 (0.7960)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [187]  [ 800/1251]  eta: 0:01:26  lr: 0.001390  min_lr: 0.001390  loss: 2.3852 (2.8432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7969 (0.8133)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [187]  [1000/1251]  eta: 0:00:47  lr: 0.001387  min_lr: 0.001387  loss: 2.2790 (2.8389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7010 (0.8350)  time: 0.1894  data: 0.0004  max mem: 12911
Epoch: [187]  [1200/1251]  eta: 0:00:09  lr: 0.001383  min_lr: 0.001383  loss: 2.3959 (2.8485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7605 (0.8239)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 2.3469 (2.8479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.8314)  time: 0.1460  data: 0.0008  max mem: 12911
Epoch: [187] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 2.3469 (2.8628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.8314)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6129 (0.6129)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6894  data: 5.5639  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8533 (0.8585)  acc1: 82.4000 (81.7091)  acc5: 96.8000 (96.2545)  time: 0.6625  data: 0.5624  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0645 (1.0623)  acc1: 75.6000 (77.2762)  acc5: 93.2000 (93.8095)  time: 0.1672  data: 0.0780  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1901 (1.0739)  acc1: 74.0000 (76.6400)  acc5: 93.2000 (93.7920)  time: 0.2067  data: 0.1215  max mem: 12911
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 76.802 Acc@5 93.704 loss 1.067
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 77.07%
Epoch: [188]  [   0/1251]  eta: 1:08:40  lr: 0.001383  min_lr: 0.001383  loss: 3.8909 (3.8909)  weight_decay: 0.0500 (0.0500)  time: 3.2940  data: 1.6480  max mem: 12911
Epoch: [188]  [ 200/1251]  eta: 0:03:35  lr: 0.001379  min_lr: 0.001379  loss: 2.3325 (2.8682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7470 (0.8602)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [188]  [ 400/1251]  eta: 0:02:47  lr: 0.001376  min_lr: 0.001376  loss: 2.4076 (2.8527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8054 (0.8592)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [188]  [ 600/1251]  eta: 0:02:06  lr: 0.001372  min_lr: 0.001372  loss: 2.3888 (2.8724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7207 (0.8590)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [188]  [ 800/1251]  eta: 0:01:26  lr: 0.001369  min_lr: 0.001369  loss: 2.9677 (2.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7655 (0.8443)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [188]  [1000/1251]  eta: 0:00:47  lr: 0.001366  min_lr: 0.001366  loss: 2.8577 (2.8835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.8372)  time: 0.1843  data: 0.0005  max mem: 12911
Epoch: [188]  [1200/1251]  eta: 0:00:09  lr: 0.001362  min_lr: 0.001362  loss: 2.4772 (2.8806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7237 (0.8401)  time: 0.1991  data: 0.0004  max mem: 12911
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 2.4114 (2.8792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (0.8407)  time: 0.1529  data: 0.0008  max mem: 12911
Epoch: [188] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 2.4114 (2.8676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (0.8407)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6988 (0.6988)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.4822  data: 5.3678  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8698 (0.8642)  acc1: 82.8000 (81.6364)  acc5: 96.4000 (96.1455)  time: 0.7087  data: 0.6227  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0879 (1.0588)  acc1: 74.8000 (77.3905)  acc5: 93.2000 (93.9048)  time: 0.2117  data: 0.1293  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.2036 (1.0672)  acc1: 73.2000 (77.1520)  acc5: 92.4000 (93.8880)  time: 0.2202  data: 0.1386  max mem: 12911
Test: Total time: 0:00:10 (0.4121 s / it)
* Acc@1 77.380 Acc@5 93.932 loss 1.057
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.38%
Epoch: [189]  [   0/1251]  eta: 1:00:32  lr: 0.001361  min_lr: 0.001361  loss: 3.5765 (3.5765)  weight_decay: 0.0500 (0.0500)  time: 2.9035  data: 2.6382  max mem: 12911
Epoch: [189]  [ 200/1251]  eta: 0:03:34  lr: 0.001358  min_lr: 0.001358  loss: 2.3239 (2.8921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.8038)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [189]  [ 400/1251]  eta: 0:02:46  lr: 0.001355  min_lr: 0.001355  loss: 2.3651 (2.8862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (0.8261)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [189]  [ 600/1251]  eta: 0:02:05  lr: 0.001351  min_lr: 0.001351  loss: 2.3021 (2.8891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7195 (0.8627)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [189]  [ 800/1251]  eta: 0:01:26  lr: 0.001348  min_lr: 0.001348  loss: 2.6550 (2.8853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7583 (0.8736)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [189]  [1000/1251]  eta: 0:00:47  lr: 0.001344  min_lr: 0.001344  loss: 2.2841 (2.8761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7655 (0.8622)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [189]  [1200/1251]  eta: 0:00:09  lr: 0.001341  min_lr: 0.001341  loss: 2.6558 (2.8770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (0.8733)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 2.1683 (2.8718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7908 (0.8712)  time: 0.1459  data: 0.0005  max mem: 12911
Epoch: [189] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 2.1683 (2.8639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7908 (0.8712)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.5715 (0.5715)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.0472  data: 4.9489  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8144 (0.8151)  acc1: 81.6000 (82.5091)  acc5: 96.4000 (96.3636)  time: 0.6501  data: 0.5571  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0621 (1.0057)  acc1: 76.4000 (77.6952)  acc5: 93.2000 (94.0191)  time: 0.1933  data: 0.1051  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1403 (1.0170)  acc1: 75.2000 (77.3760)  acc5: 92.8000 (93.8400)  time: 0.2170  data: 0.1302  max mem: 12911
Test: Total time: 0:00:10 (0.4002 s / it)
* Acc@1 77.358 Acc@5 93.924 loss 1.012
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.38%
Epoch: [190]  [   0/1251]  eta: 1:07:57  lr: 0.001340  min_lr: 0.001340  loss: 3.7088 (3.7088)  weight_decay: 0.0500 (0.0500)  time: 3.2597  data: 1.5686  max mem: 12911
Epoch: [190]  [ 200/1251]  eta: 0:03:35  lr: 0.001337  min_lr: 0.001337  loss: 2.7019 (2.8677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7742 (0.9658)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [190]  [ 400/1251]  eta: 0:02:48  lr: 0.001333  min_lr: 0.001333  loss: 2.2463 (2.8580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9388 (0.8958)  time: 0.1903  data: 0.0004  max mem: 12911
Epoch: [190]  [ 600/1251]  eta: 0:02:07  lr: 0.001330  min_lr: 0.001330  loss: 3.6403 (2.8841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8820 (0.8895)  time: 0.1935  data: 0.0005  max mem: 12911
Epoch: [190]  [ 800/1251]  eta: 0:01:27  lr: 0.001327  min_lr: 0.001327  loss: 2.8930 (2.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7435 (0.8728)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [190]  [1000/1251]  eta: 0:00:48  lr: 0.001323  min_lr: 0.001323  loss: 2.4642 (2.8964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7964 (0.8554)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [190]  [1200/1251]  eta: 0:00:09  lr: 0.001320  min_lr: 0.001320  loss: 2.6692 (2.8908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8286 (0.8436)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 2.8296 (2.8873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8394 (0.8458)  time: 0.1464  data: 0.0011  max mem: 12911
Epoch: [190] Total time: 0:03:59 (0.1917 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 2.8296 (2.8596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8394 (0.8458)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7125 (0.7125)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.5092  data: 5.4133  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8441 (0.8737)  acc1: 82.4000 (82.1455)  acc5: 96.4000 (96.5818)  time: 0.7328  data: 0.6344  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1096 (1.0545)  acc1: 76.0000 (77.4476)  acc5: 93.6000 (94.1143)  time: 0.2008  data: 0.1087  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1571 (1.0623)  acc1: 76.0000 (76.9920)  acc5: 92.8000 (94.0480)  time: 0.1968  data: 0.1086  max mem: 12911
Test: Total time: 0:00:09 (0.3970 s / it)
* Acc@1 77.170 Acc@5 93.876 loss 1.054
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.38%
Epoch: [191]  [   0/1251]  eta: 1:05:28  lr: 0.001319  min_lr: 0.001319  loss: 3.7631 (3.7631)  weight_decay: 0.0500 (0.0500)  time: 3.1404  data: 2.8672  max mem: 12911
Epoch: [191]  [ 200/1251]  eta: 0:03:35  lr: 0.001316  min_lr: 0.001316  loss: 2.2905 (2.8058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7286 (0.8723)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [191]  [ 400/1251]  eta: 0:02:47  lr: 0.001312  min_lr: 0.001312  loss: 2.7547 (2.8483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8000 (0.8697)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [191]  [ 600/1251]  eta: 0:02:06  lr: 0.001309  min_lr: 0.001309  loss: 2.2647 (2.8472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8447 (0.8624)  time: 0.1931  data: 0.0004  max mem: 12911
Epoch: [191]  [ 800/1251]  eta: 0:01:27  lr: 0.001305  min_lr: 0.001305  loss: 3.1605 (2.8650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.8484)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [191]  [1000/1251]  eta: 0:00:48  lr: 0.001302  min_lr: 0.001302  loss: 2.3323 (2.8867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7859 (0.8623)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [191]  [1200/1251]  eta: 0:00:09  lr: 0.001299  min_lr: 0.001299  loss: 2.2739 (2.8677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7528 (0.8695)  time: 0.1899  data: 0.0005  max mem: 12911
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 2.5866 (2.8672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8671)  time: 0.1468  data: 0.0011  max mem: 12911
Epoch: [191] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 2.5866 (2.8559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8671)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6741 (0.6741)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.6091  data: 5.5111  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8601 (0.8417)  acc1: 83.2000 (81.7091)  acc5: 96.4000 (96.0727)  time: 0.7599  data: 0.6638  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0673 (1.0288)  acc1: 74.4000 (77.2952)  acc5: 92.8000 (93.8286)  time: 0.2134  data: 0.1253  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1787 (1.0384)  acc1: 74.4000 (77.1520)  acc5: 92.4000 (93.7440)  time: 0.2109  data: 0.1252  max mem: 12911
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 77.258 Acc@5 93.858 loss 1.034
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.38%
Epoch: [192]  [   0/1251]  eta: 1:04:50  lr: 0.001298  min_lr: 0.001298  loss: 1.8933 (1.8933)  weight_decay: 0.0500 (0.0500)  time: 3.1095  data: 2.6494  max mem: 12911
Epoch: [192]  [ 200/1251]  eta: 0:03:34  lr: 0.001295  min_lr: 0.001295  loss: 2.5199 (2.8132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.8816)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [192]  [ 400/1251]  eta: 0:02:47  lr: 0.001291  min_lr: 0.001291  loss: 2.7453 (2.8308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (0.8368)  time: 0.1976  data: 0.0004  max mem: 12911
Epoch: [192]  [ 600/1251]  eta: 0:02:06  lr: 0.001288  min_lr: 0.001288  loss: 2.8283 (2.8509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7887 (0.8452)  time: 0.1832  data: 0.0004  max mem: 12911
Epoch: [192]  [ 800/1251]  eta: 0:01:26  lr: 0.001284  min_lr: 0.001284  loss: 2.3543 (2.8612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7875 (0.8270)  time: 0.1882  data: 0.0003  max mem: 12911
Epoch: [192]  [1000/1251]  eta: 0:00:48  lr: 0.001281  min_lr: 0.001281  loss: 2.7010 (2.8589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.8444)  time: 0.1873  data: 0.0006  max mem: 12911
Epoch: [192]  [1200/1251]  eta: 0:00:09  lr: 0.001278  min_lr: 0.001278  loss: 3.2795 (2.8700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7604 (0.8526)  time: 0.1911  data: 0.0005  max mem: 12911
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 2.4183 (2.8679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8234 (0.8604)  time: 0.1470  data: 0.0008  max mem: 12911
Epoch: [192] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 2.4183 (2.8497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8234 (0.8604)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6806 (0.6806)  acc1: 83.6000 (83.6000)  acc5: 98.4000 (98.4000)  time: 5.4220  data: 5.3166  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8219 (0.8281)  acc1: 81.6000 (82.1455)  acc5: 97.2000 (96.5091)  time: 0.7391  data: 0.6436  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0040 (1.0246)  acc1: 74.8000 (77.5238)  acc5: 93.6000 (94.4571)  time: 0.2091  data: 0.1221  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1479 (1.0342)  acc1: 74.0000 (77.2000)  acc5: 93.2000 (94.3520)  time: 0.2086  data: 0.1241  max mem: 12911
Test: Total time: 0:00:10 (0.4020 s / it)
* Acc@1 77.344 Acc@5 94.002 loss 1.032
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.38%
Epoch: [193]  [   0/1251]  eta: 1:07:16  lr: 0.001277  min_lr: 0.001277  loss: 2.2598 (2.2598)  weight_decay: 0.0500 (0.0500)  time: 3.2269  data: 2.9303  max mem: 12911
Epoch: [193]  [ 200/1251]  eta: 0:03:35  lr: 0.001274  min_lr: 0.001274  loss: 2.7634 (2.7566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0410 (0.9358)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [193]  [ 400/1251]  eta: 0:02:47  lr: 0.001270  min_lr: 0.001270  loss: 2.2743 (2.8065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7481 (0.8838)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [193]  [ 600/1251]  eta: 0:02:05  lr: 0.001267  min_lr: 0.001267  loss: 2.8026 (2.8699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8111 (0.8741)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [193]  [ 800/1251]  eta: 0:01:26  lr: 0.001264  min_lr: 0.001264  loss: 2.3248 (2.8736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7817 (0.8794)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [193]  [1000/1251]  eta: 0:00:48  lr: 0.001260  min_lr: 0.001260  loss: 2.8135 (2.8655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8638 (0.8784)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [193]  [1200/1251]  eta: 0:00:09  lr: 0.001257  min_lr: 0.001257  loss: 2.5511 (2.8608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8032 (0.8871)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 2.2359 (2.8583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.8851)  time: 0.1472  data: 0.0009  max mem: 12911
Epoch: [193] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 2.2359 (2.8511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.8851)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6476 (0.6476)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.5540  data: 5.4623  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8321 (0.8167)  acc1: 82.4000 (82.1091)  acc5: 96.8000 (96.3273)  time: 0.7007  data: 0.6066  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0574 (1.0088)  acc1: 76.0000 (77.7333)  acc5: 94.0000 (94.1524)  time: 0.1942  data: 0.1040  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1325 (1.0218)  acc1: 74.8000 (77.2480)  acc5: 93.6000 (93.9360)  time: 0.1924  data: 0.1040  max mem: 12911
Test: Total time: 0:00:09 (0.3949 s / it)
* Acc@1 77.502 Acc@5 94.088 loss 1.012
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.50%
Epoch: [194]  [   0/1251]  eta: 1:05:23  lr: 0.001256  min_lr: 0.001256  loss: 2.1856 (2.1856)  weight_decay: 0.0500 (0.0500)  time: 3.1362  data: 2.8925  max mem: 12911
Epoch: [194]  [ 200/1251]  eta: 0:03:32  lr: 0.001253  min_lr: 0.001253  loss: 2.1784 (2.7830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7602 (0.8320)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [194]  [ 400/1251]  eta: 0:02:45  lr: 0.001249  min_lr: 0.001249  loss: 3.7105 (2.8767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (0.8349)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [194]  [ 600/1251]  eta: 0:02:05  lr: 0.001246  min_lr: 0.001246  loss: 2.2361 (2.8691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8614 (0.8493)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [194]  [ 800/1251]  eta: 0:01:26  lr: 0.001243  min_lr: 0.001243  loss: 2.1950 (2.8376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8647 (0.8845)  time: 0.1869  data: 0.0004  max mem: 12911
Epoch: [194]  [1000/1251]  eta: 0:00:47  lr: 0.001239  min_lr: 0.001239  loss: 2.3610 (2.8450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (0.8827)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [194]  [1200/1251]  eta: 0:00:09  lr: 0.001236  min_lr: 0.001236  loss: 2.2769 (2.8304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8682 (0.8805)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.0441 (2.8368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8124 (0.8777)  time: 0.1465  data: 0.0007  max mem: 12911
Epoch: [194] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.0441 (2.8332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8124 (0.8777)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6751 (0.6751)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.4697  data: 5.3357  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8222 (0.8586)  acc1: 82.0000 (81.4182)  acc5: 96.0000 (96.1091)  time: 0.7324  data: 0.6334  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0538 (1.0409)  acc1: 74.8000 (77.2381)  acc5: 93.6000 (93.8857)  time: 0.2060  data: 0.1185  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1265 (1.0501)  acc1: 74.8000 (77.2160)  acc5: 92.4000 (93.7280)  time: 0.2020  data: 0.1184  max mem: 12911
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 77.348 Acc@5 93.944 loss 1.040
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.50%
Epoch: [195]  [   0/1251]  eta: 1:06:46  lr: 0.001235  min_lr: 0.001235  loss: 3.8589 (3.8589)  weight_decay: 0.0500 (0.0500)  time: 3.2025  data: 1.5150  max mem: 12911
Epoch: [195]  [ 200/1251]  eta: 0:03:35  lr: 0.001232  min_lr: 0.001232  loss: 2.5204 (2.8644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7867 (0.8453)  time: 0.1868  data: 0.0003  max mem: 12911
Epoch: [195]  [ 400/1251]  eta: 0:02:46  lr: 0.001229  min_lr: 0.001229  loss: 3.0022 (2.8786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7751 (0.8619)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [195]  [ 600/1251]  eta: 0:02:05  lr: 0.001225  min_lr: 0.001225  loss: 3.2855 (2.8787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7638 (0.8329)  time: 0.1862  data: 0.0003  max mem: 12911
Epoch: [195]  [ 800/1251]  eta: 0:01:26  lr: 0.001222  min_lr: 0.001222  loss: 2.6747 (2.8831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7265 (0.8286)  time: 0.1888  data: 0.0004  max mem: 12911
Epoch: [195]  [1000/1251]  eta: 0:00:48  lr: 0.001219  min_lr: 0.001219  loss: 2.2235 (2.8699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8326 (0.8472)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [195]  [1200/1251]  eta: 0:00:09  lr: 0.001215  min_lr: 0.001215  loss: 2.3807 (2.8653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8000 (0.8488)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 2.1627 (2.8590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.8474)  time: 0.1471  data: 0.0006  max mem: 12911
Epoch: [195] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 2.1627 (2.8324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.8474)
Test:  [ 0/25]  eta: 0:01:34  loss: 0.6589 (0.6589)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 3.7632  data: 3.6700  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 0.8264 (0.8158)  acc1: 83.2000 (81.6000)  acc5: 96.8000 (96.6182)  time: 0.5957  data: 0.5048  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0269 (1.0037)  acc1: 75.6000 (77.3714)  acc5: 94.4000 (94.3810)  time: 0.2441  data: 0.1574  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1440 (1.0151)  acc1: 74.4000 (77.1360)  acc5: 92.8000 (94.1280)  time: 0.2501  data: 0.1649  max mem: 12911
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 77.548 Acc@5 94.050 loss 1.011
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.55%
Epoch: [196]  [   0/1251]  eta: 0:57:31  lr: 0.001215  min_lr: 0.001215  loss: 3.5525 (3.5525)  weight_decay: 0.0500 (0.0500)  time: 2.7591  data: 2.5018  max mem: 12911
Epoch: [196]  [ 200/1251]  eta: 0:03:33  lr: 0.001211  min_lr: 0.001211  loss: 2.3195 (2.8892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8740 (0.8508)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [196]  [ 400/1251]  eta: 0:02:46  lr: 0.001208  min_lr: 0.001208  loss: 2.2550 (2.8555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8267 (0.8501)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [196]  [ 600/1251]  eta: 0:02:05  lr: 0.001205  min_lr: 0.001205  loss: 2.2771 (2.8315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7796 (0.8430)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [196]  [ 800/1251]  eta: 0:01:25  lr: 0.001201  min_lr: 0.001201  loss: 2.4272 (2.8418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (0.8485)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [196]  [1000/1251]  eta: 0:00:47  lr: 0.001198  min_lr: 0.001198  loss: 2.5408 (2.8507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8043 (0.8359)  time: 0.1909  data: 0.0009  max mem: 12911
Epoch: [196]  [1200/1251]  eta: 0:00:09  lr: 0.001195  min_lr: 0.001195  loss: 2.2821 (2.8500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8660 (0.8384)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 3.4356 (2.8462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9361 (0.8426)  time: 0.1467  data: 0.0007  max mem: 12911
Epoch: [196] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 3.4356 (2.8374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9361 (0.8426)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6668 (0.6668)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.7308  data: 5.6391  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8415 (0.8528)  acc1: 82.4000 (82.0000)  acc5: 96.8000 (96.5455)  time: 0.7528  data: 0.6584  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0638 (1.0476)  acc1: 77.2000 (77.7714)  acc5: 94.0000 (94.3429)  time: 0.2088  data: 0.1217  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1671 (1.0629)  acc1: 74.0000 (77.4240)  acc5: 92.8000 (94.1120)  time: 0.2062  data: 0.1216  max mem: 12911
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 77.578 Acc@5 94.044 loss 1.054
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.58%
Epoch: [197]  [   0/1251]  eta: 0:57:00  lr: 0.001194  min_lr: 0.001194  loss: 4.5319 (4.5319)  weight_decay: 0.0500 (0.0500)  time: 2.7339  data: 2.4980  max mem: 12911
Epoch: [197]  [ 200/1251]  eta: 0:03:33  lr: 0.001191  min_lr: 0.001191  loss: 2.2838 (2.8925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8430 (0.9238)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [197]  [ 400/1251]  eta: 0:02:45  lr: 0.001187  min_lr: 0.001187  loss: 3.2377 (2.8776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7688 (0.8837)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [197]  [ 600/1251]  eta: 0:02:05  lr: 0.001184  min_lr: 0.001184  loss: 2.6707 (2.8876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8080 (nan)  time: 0.1905  data: 0.0003  max mem: 12911
Epoch: [197]  [ 800/1251]  eta: 0:01:26  lr: 0.001181  min_lr: 0.001181  loss: 2.5384 (2.8860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8966 (nan)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [197]  [1000/1251]  eta: 0:00:47  lr: 0.001178  min_lr: 0.001178  loss: 2.1646 (2.8746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (nan)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [197]  [1200/1251]  eta: 0:00:09  lr: 0.001174  min_lr: 0.001174  loss: 2.2358 (2.8556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8679 (nan)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 2.2639 (2.8584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8821 (nan)  time: 0.1470  data: 0.0009  max mem: 12911
Epoch: [197] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 2.2639 (2.8426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8821 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6870 (0.6870)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.5505  data: 5.4517  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8212 (0.8436)  acc1: 82.0000 (81.4545)  acc5: 96.4000 (96.4364)  time: 0.7639  data: 0.6674  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0603 (1.0284)  acc1: 74.0000 (77.1048)  acc5: 93.6000 (94.2667)  time: 0.2187  data: 0.1286  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1874 (1.0424)  acc1: 73.6000 (76.6720)  acc5: 93.2000 (94.1120)  time: 0.2160  data: 0.1285  max mem: 12911
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 77.526 Acc@5 94.112 loss 1.038
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.58%
Epoch: [198]  [   0/1251]  eta: 1:01:22  lr: 0.001174  min_lr: 0.001174  loss: 2.0316 (2.0316)  weight_decay: 0.0500 (0.0500)  time: 2.9435  data: 2.6692  max mem: 12911
Epoch: [198]  [ 200/1251]  eta: 0:03:33  lr: 0.001170  min_lr: 0.001170  loss: 2.7820 (2.8306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8035 (0.8156)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [198]  [ 400/1251]  eta: 0:02:46  lr: 0.001167  min_lr: 0.001167  loss: 2.7309 (2.8361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7849 (0.8453)  time: 0.1931  data: 0.0006  max mem: 12911
Epoch: [198]  [ 600/1251]  eta: 0:02:05  lr: 0.001164  min_lr: 0.001164  loss: 2.2789 (2.8369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8291 (0.8435)  time: 0.1920  data: 0.0005  max mem: 12911
Epoch: [198]  [ 800/1251]  eta: 0:01:26  lr: 0.001161  min_lr: 0.001161  loss: 2.7298 (2.8486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8688 (0.8584)  time: 0.1837  data: 0.0005  max mem: 12911
Epoch: [198]  [1000/1251]  eta: 0:00:47  lr: 0.001157  min_lr: 0.001157  loss: 2.2872 (2.8514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (0.8703)  time: 0.1826  data: 0.0004  max mem: 12911
Epoch: [198]  [1200/1251]  eta: 0:00:09  lr: 0.001154  min_lr: 0.001154  loss: 2.1787 (2.8453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7688 (0.8647)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 3.1750 (2.8478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7676 (0.8623)  time: 0.1468  data: 0.0011  max mem: 12911
Epoch: [198] Total time: 0:03:57 (0.1895 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 3.1750 (2.8390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7676 (0.8623)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7504 (0.7504)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.5305  data: 5.4386  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8649 (0.9008)  acc1: 81.2000 (82.1818)  acc5: 96.8000 (96.6545)  time: 0.7165  data: 0.6311  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1871 (1.0905)  acc1: 76.0000 (77.9429)  acc5: 94.0000 (94.3238)  time: 0.2021  data: 0.1200  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1871 (1.0991)  acc1: 75.6000 (77.5520)  acc5: 92.0000 (94.1920)  time: 0.2030  data: 0.1225  max mem: 12911
Test: Total time: 0:00:10 (0.4019 s / it)
* Acc@1 77.778 Acc@5 94.070 loss 1.090
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.78%
Epoch: [199]  [   0/1251]  eta: 1:03:56  lr: 0.001153  min_lr: 0.001153  loss: 3.8170 (3.8170)  weight_decay: 0.0500 (0.0500)  time: 3.0666  data: 2.8148  max mem: 12911
Epoch: [199]  [ 200/1251]  eta: 0:03:36  lr: 0.001150  min_lr: 0.001150  loss: 2.2354 (2.8871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8848 (0.8558)  time: 0.1916  data: 0.0004  max mem: 12911
Epoch: [199]  [ 400/1251]  eta: 0:02:47  lr: 0.001147  min_lr: 0.001147  loss: 3.0582 (2.8497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.8367)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [199]  [ 600/1251]  eta: 0:02:06  lr: 0.001143  min_lr: 0.001143  loss: 2.3404 (2.8355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8595 (0.8451)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [199]  [ 800/1251]  eta: 0:01:26  lr: 0.001140  min_lr: 0.001140  loss: 2.1475 (2.8343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7383 (0.8474)  time: 0.1844  data: 0.0004  max mem: 12911
Epoch: [199]  [1000/1251]  eta: 0:00:47  lr: 0.001137  min_lr: 0.001137  loss: 2.2967 (2.8271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8761 (0.8735)  time: 0.1828  data: 0.0005  max mem: 12911
Epoch: [199]  [1200/1251]  eta: 0:00:09  lr: 0.001134  min_lr: 0.001134  loss: 2.6034 (2.8417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8758 (0.8842)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 2.4713 (2.8446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8446 (0.8835)  time: 0.1458  data: 0.0010  max mem: 12911
Epoch: [199] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 2.4713 (2.8309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8446 (0.8835)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7013 (0.7013)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 5.5523  data: 5.4244  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7895 (0.8449)  acc1: 84.4000 (82.6182)  acc5: 96.4000 (96.5455)  time: 0.7419  data: 0.6417  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1085 (1.0388)  acc1: 75.6000 (78.1333)  acc5: 93.6000 (94.3429)  time: 0.2061  data: 0.1177  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1631 (1.0498)  acc1: 74.8000 (77.6640)  acc5: 92.8000 (94.2400)  time: 0.2021  data: 0.1176  max mem: 12911
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 77.774 Acc@5 94.046 loss 1.046
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.78%
Epoch: [200]  [   0/1251]  eta: 1:05:07  lr: 0.001133  min_lr: 0.001133  loss: 2.9306 (2.9306)  weight_decay: 0.0500 (0.0500)  time: 3.1234  data: 2.2210  max mem: 12911
Epoch: [200]  [ 200/1251]  eta: 0:03:34  lr: 0.001130  min_lr: 0.001130  loss: 2.2920 (2.7707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8103 (0.8667)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [200]  [ 400/1251]  eta: 0:02:46  lr: 0.001126  min_lr: 0.001126  loss: 2.5517 (2.8156)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0701 (0.9851)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [200]  [ 600/1251]  eta: 0:02:06  lr: 0.001123  min_lr: 0.001123  loss: 2.4439 (2.8270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.9275)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [200]  [ 800/1251]  eta: 0:01:26  lr: 0.001120  min_lr: 0.001120  loss: 3.1363 (2.8558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7353 (0.9187)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [200]  [1000/1251]  eta: 0:00:48  lr: 0.001117  min_lr: 0.001117  loss: 2.2573 (2.8328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (0.9177)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [200]  [1200/1251]  eta: 0:00:09  lr: 0.001114  min_lr: 0.001114  loss: 2.6165 (2.8515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8811 (0.9127)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.2430 (2.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (0.9116)  time: 0.1458  data: 0.0008  max mem: 12911
Epoch: [200] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.2430 (2.8313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (0.9116)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6695 (0.6695)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.5509  data: 5.4486  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8056 (0.8200)  acc1: 83.6000 (82.6182)  acc5: 97.2000 (96.5818)  time: 0.7037  data: 0.6132  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0267 (1.0070)  acc1: 76.0000 (78.6095)  acc5: 94.4000 (94.4762)  time: 0.1924  data: 0.1065  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1450 (1.0210)  acc1: 75.6000 (78.1120)  acc5: 93.2000 (94.3520)  time: 0.2067  data: 0.1215  max mem: 12911
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 77.654 Acc@5 94.146 loss 1.026
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.78%
Epoch: [201]  [   0/1251]  eta: 1:05:56  lr: 0.001113  min_lr: 0.001113  loss: 3.7381 (3.7381)  weight_decay: 0.0500 (0.0500)  time: 3.1628  data: 2.2640  max mem: 12911
Epoch: [201]  [ 200/1251]  eta: 0:03:35  lr: 0.001110  min_lr: 0.001110  loss: 2.2405 (2.8121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.8280)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [201]  [ 400/1251]  eta: 0:02:46  lr: 0.001106  min_lr: 0.001106  loss: 2.9874 (2.8820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9050 (0.9547)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [201]  [ 600/1251]  eta: 0:02:05  lr: 0.001103  min_lr: 0.001103  loss: 2.4927 (2.8563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7993 (0.9164)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [201]  [ 800/1251]  eta: 0:01:26  lr: 0.001100  min_lr: 0.001100  loss: 3.3159 (2.8554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8308 (0.8975)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [201]  [1000/1251]  eta: 0:00:48  lr: 0.001097  min_lr: 0.001097  loss: 2.8690 (2.8527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8522 (0.8970)  time: 0.1869  data: 0.0012  max mem: 12911
Epoch: [201]  [1200/1251]  eta: 0:00:09  lr: 0.001094  min_lr: 0.001094  loss: 2.7971 (2.8384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8102 (0.8926)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 2.9297 (2.8404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7937 (0.8933)  time: 0.1475  data: 0.0008  max mem: 12911
Epoch: [201] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 2.9297 (2.8262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7937 (0.8933)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7465 (0.7465)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.3507  data: 5.2249  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8425 (0.8762)  acc1: 83.2000 (81.4545)  acc5: 96.8000 (96.5818)  time: 0.7279  data: 0.6279  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.1011 (1.0625)  acc1: 75.2000 (77.5810)  acc5: 94.8000 (94.6857)  time: 0.2157  data: 0.1268  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1486 (1.0721)  acc1: 75.2000 (77.4240)  acc5: 93.6000 (94.4960)  time: 0.2117  data: 0.1268  max mem: 12911
Test: Total time: 0:00:10 (0.4035 s / it)
* Acc@1 77.608 Acc@5 94.106 loss 1.068
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.78%
Epoch: [202]  [   0/1251]  eta: 1:03:39  lr: 0.001093  min_lr: 0.001093  loss: 3.9052 (3.9052)  weight_decay: 0.0500 (0.0500)  time: 3.0529  data: 2.7568  max mem: 12911
Epoch: [202]  [ 200/1251]  eta: 0:03:35  lr: 0.001090  min_lr: 0.001090  loss: 2.6909 (2.7877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8286)  time: 0.1933  data: 0.0004  max mem: 12911
Epoch: [202]  [ 400/1251]  eta: 0:02:47  lr: 0.001086  min_lr: 0.001086  loss: 2.2913 (2.7715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8261 (0.8375)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [202]  [ 600/1251]  eta: 0:02:06  lr: 0.001083  min_lr: 0.001083  loss: 2.2036 (2.7977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9118 (0.8465)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [202]  [ 800/1251]  eta: 0:01:27  lr: 0.001080  min_lr: 0.001080  loss: 3.0809 (2.7893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8575 (0.8627)  time: 0.1899  data: 0.0006  max mem: 12911
Epoch: [202]  [1000/1251]  eta: 0:00:48  lr: 0.001077  min_lr: 0.001077  loss: 2.5381 (2.7958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8222 (0.8724)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [202]  [1200/1251]  eta: 0:00:09  lr: 0.001074  min_lr: 0.001074  loss: 2.9252 (2.8091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8855 (0.8707)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 2.3848 (2.8101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8867 (0.8724)  time: 0.1477  data: 0.0006  max mem: 12911
Epoch: [202] Total time: 0:04:00 (0.1922 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 2.3848 (2.8075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8867 (0.8724)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6415 (0.6415)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.5049  data: 5.4117  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7945 (0.8095)  acc1: 82.4000 (81.7818)  acc5: 97.2000 (96.5091)  time: 0.7432  data: 0.6501  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0726 (0.9860)  acc1: 76.0000 (78.0000)  acc5: 94.0000 (94.4000)  time: 0.2172  data: 0.1302  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0833 (0.9949)  acc1: 74.4000 (77.6960)  acc5: 92.8000 (94.2400)  time: 0.2154  data: 0.1300  max mem: 12911
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 77.614 Acc@5 94.186 loss 0.999
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.78%
Epoch: [203]  [   0/1251]  eta: 1:01:19  lr: 0.001073  min_lr: 0.001073  loss: 1.8582 (1.8582)  weight_decay: 0.0500 (0.0500)  time: 2.9416  data: 1.7448  max mem: 12911
Epoch: [203]  [ 200/1251]  eta: 0:03:33  lr: 0.001070  min_lr: 0.001070  loss: 2.1886 (2.7912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8590 (0.8778)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [203]  [ 400/1251]  eta: 0:02:46  lr: 0.001066  min_lr: 0.001066  loss: 2.4379 (2.8182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7808 (0.8802)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [203]  [ 600/1251]  eta: 0:02:05  lr: 0.001063  min_lr: 0.001063  loss: 2.4289 (2.8155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8601 (0.8757)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [203]  [ 800/1251]  eta: 0:01:26  lr: 0.001060  min_lr: 0.001060  loss: 3.0148 (2.8307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8788 (0.8996)  time: 0.1834  data: 0.0005  max mem: 12911
Epoch: [203]  [1000/1251]  eta: 0:00:47  lr: 0.001057  min_lr: 0.001057  loss: 2.0709 (2.7964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (0.8978)  time: 0.1918  data: 0.0005  max mem: 12911
Epoch: [203]  [1200/1251]  eta: 0:00:09  lr: 0.001054  min_lr: 0.001054  loss: 2.2438 (2.8096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7810 (0.9185)  time: 0.1913  data: 0.0005  max mem: 12911
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 2.4112 (2.8119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7781 (0.9169)  time: 0.1462  data: 0.0006  max mem: 12911
Epoch: [203] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 2.4112 (2.8016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7781 (0.9169)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6765 (0.6765)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6856  data: 5.5938  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8276 (0.8393)  acc1: 83.2000 (82.4000)  acc5: 96.8000 (96.5091)  time: 0.6905  data: 0.5953  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0382 (1.0285)  acc1: 74.8000 (77.9048)  acc5: 93.6000 (94.4191)  time: 0.1767  data: 0.0891  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1359 (1.0418)  acc1: 74.4000 (77.6960)  acc5: 92.8000 (94.2240)  time: 0.1863  data: 0.1012  max mem: 12911
Test: Total time: 0:00:09 (0.3966 s / it)
* Acc@1 77.648 Acc@5 94.058 loss 1.045
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.78%
Epoch: [204]  [   0/1251]  eta: 1:08:02  lr: 0.001053  min_lr: 0.001053  loss: 1.9512 (1.9512)  weight_decay: 0.0500 (0.0500)  time: 3.2635  data: 1.5735  max mem: 12911
Epoch: [204]  [ 200/1251]  eta: 0:03:31  lr: 0.001050  min_lr: 0.001050  loss: 2.2251 (2.7535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7795 (0.8136)  time: 0.1821  data: 0.0004  max mem: 12911
Epoch: [204]  [ 400/1251]  eta: 0:02:43  lr: 0.001047  min_lr: 0.001047  loss: 2.4959 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7825 (0.8721)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [204]  [ 600/1251]  eta: 0:02:03  lr: 0.001044  min_lr: 0.001044  loss: 2.7848 (2.7914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0031 (0.9061)  time: 0.1837  data: 0.0004  max mem: 12911
Epoch: [204]  [ 800/1251]  eta: 0:01:24  lr: 0.001040  min_lr: 0.001040  loss: 2.4597 (2.7848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9882 (0.9063)  time: 0.1903  data: 0.0005  max mem: 12911
Epoch: [204]  [1000/1251]  eta: 0:00:47  lr: 0.001037  min_lr: 0.001037  loss: 2.2714 (2.7838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9037 (0.9118)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [204]  [1200/1251]  eta: 0:00:09  lr: 0.001034  min_lr: 0.001034  loss: 3.3345 (2.7778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8192 (0.9021)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 2.4944 (2.7801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8192 (0.9010)  time: 0.1464  data: 0.0010  max mem: 12911
Epoch: [204] Total time: 0:03:55 (0.1884 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 2.4944 (2.8014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8192 (0.9010)
Test:  [ 0/25]  eta: 0:01:24  loss: 0.6247 (0.6247)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 3.3769  data: 3.2852  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8315 (0.8363)  acc1: 82.0000 (82.2182)  acc5: 96.8000 (96.5091)  time: 0.6333  data: 0.5386  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0750 (1.0215)  acc1: 76.0000 (77.8476)  acc5: 94.4000 (94.6095)  time: 0.2782  data: 0.1902  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1562 (1.0319)  acc1: 74.4000 (77.5680)  acc5: 93.6000 (94.4960)  time: 0.2225  data: 0.1370  max mem: 12911
Test: Total time: 0:00:09 (0.3974 s / it)
* Acc@1 77.756 Acc@5 94.168 loss 1.040
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.78%
Epoch: [205]  [   0/1251]  eta: 1:01:29  lr: 0.001033  min_lr: 0.001033  loss: 3.3941 (3.3941)  weight_decay: 0.0500 (0.0500)  time: 2.9489  data: 2.3737  max mem: 12911
Epoch: [205]  [ 200/1251]  eta: 0:03:37  lr: 0.001030  min_lr: 0.001030  loss: 2.3576 (2.8123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8333 (0.8801)  time: 0.1936  data: 0.0004  max mem: 12911
Epoch: [205]  [ 400/1251]  eta: 0:02:48  lr: 0.001027  min_lr: 0.001027  loss: 2.1415 (2.8301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8338 (0.8908)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [205]  [ 600/1251]  eta: 0:02:07  lr: 0.001024  min_lr: 0.001024  loss: 2.2133 (2.8038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7666 (0.8662)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [205]  [ 800/1251]  eta: 0:01:27  lr: 0.001021  min_lr: 0.001021  loss: 2.4523 (2.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8642 (0.8737)  time: 0.1907  data: 0.0006  max mem: 12911
Epoch: [205]  [1000/1251]  eta: 0:00:48  lr: 0.001018  min_lr: 0.001018  loss: 2.2835 (2.7865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8601 (0.8742)  time: 0.1907  data: 0.0005  max mem: 12911
Epoch: [205]  [1200/1251]  eta: 0:00:09  lr: 0.001014  min_lr: 0.001014  loss: 2.4949 (2.7815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8223 (0.8811)  time: 0.1911  data: 0.0005  max mem: 12911
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 2.2881 (2.7763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8980 (0.8835)  time: 0.1477  data: 0.0009  max mem: 12911
Epoch: [205] Total time: 0:04:01 (0.1927 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 2.2881 (2.7974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8980 (0.8835)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5937 (0.5937)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.7450  data: 5.6534  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8232 (0.8025)  acc1: 82.4000 (82.8364)  acc5: 96.4000 (96.3636)  time: 0.7533  data: 0.6570  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0610 (0.9924)  acc1: 76.4000 (78.3238)  acc5: 94.0000 (94.3238)  time: 0.2012  data: 0.1128  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1370 (1.0001)  acc1: 73.2000 (78.0000)  acc5: 94.0000 (94.2560)  time: 0.2030  data: 0.1182  max mem: 12911
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 77.954 Acc@5 94.106 loss 0.991
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 77.95%
Epoch: [206]  [   0/1251]  eta: 0:50:36  lr: 0.001014  min_lr: 0.001014  loss: 2.0168 (2.0168)  weight_decay: 0.0500 (0.0500)  time: 2.4270  data: 1.7390  max mem: 12911
Epoch: [206]  [ 200/1251]  eta: 0:03:33  lr: 0.001011  min_lr: 0.001011  loss: 2.1782 (2.7257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9594 (0.9352)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [206]  [ 400/1251]  eta: 0:02:45  lr: 0.001007  min_lr: 0.001007  loss: 2.6296 (2.8070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.9054)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [206]  [ 600/1251]  eta: 0:02:05  lr: 0.001004  min_lr: 0.001004  loss: 2.2600 (2.7809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9302)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [206]  [ 800/1251]  eta: 0:01:26  lr: 0.001001  min_lr: 0.001001  loss: 2.2968 (2.7801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8079 (0.9215)  time: 0.1838  data: 0.0004  max mem: 12911
Epoch: [206]  [1000/1251]  eta: 0:00:47  lr: 0.000998  min_lr: 0.000998  loss: 2.1213 (2.7926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8456 (0.9153)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [206]  [1200/1251]  eta: 0:00:09  lr: 0.000995  min_lr: 0.000995  loss: 2.3814 (2.7843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7740 (0.9155)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 2.2418 (2.7847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7365 (0.9129)  time: 0.1467  data: 0.0011  max mem: 12911
Epoch: [206] Total time: 0:03:57 (0.1902 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 2.2418 (2.8088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7365 (0.9129)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5894 (0.5894)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.2845  data: 5.1927  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8303 (0.7886)  acc1: 81.6000 (82.5091)  acc5: 97.2000 (96.6909)  time: 0.7470  data: 0.6499  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0296 (0.9649)  acc1: 76.0000 (78.5714)  acc5: 94.4000 (94.8000)  time: 0.2263  data: 0.1364  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0852 (0.9764)  acc1: 74.4000 (78.2400)  acc5: 93.6000 (94.5760)  time: 0.2231  data: 0.1363  max mem: 12911
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 77.970 Acc@5 94.332 loss 0.975
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 77.97%
Epoch: [207]  [   0/1251]  eta: 1:05:30  lr: 0.000994  min_lr: 0.000994  loss: 2.0021 (2.0021)  weight_decay: 0.0500 (0.0500)  time: 3.1421  data: 2.9215  max mem: 12911
Epoch: [207]  [ 200/1251]  eta: 0:03:30  lr: 0.000991  min_lr: 0.000991  loss: 2.3414 (2.6829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8404 (0.8506)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [207]  [ 400/1251]  eta: 0:02:44  lr: 0.000988  min_lr: 0.000988  loss: 2.8832 (2.7127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8768 (0.8878)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [207]  [ 600/1251]  eta: 0:02:04  lr: 0.000985  min_lr: 0.000985  loss: 2.3319 (2.7595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8733 (0.8983)  time: 0.1913  data: 0.0004  max mem: 12911
Epoch: [207]  [ 800/1251]  eta: 0:01:26  lr: 0.000982  min_lr: 0.000982  loss: 2.3615 (2.7917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8464 (0.8986)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [207]  [1000/1251]  eta: 0:00:47  lr: 0.000979  min_lr: 0.000979  loss: 2.2218 (2.7914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8389 (0.8909)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [207]  [1200/1251]  eta: 0:00:09  lr: 0.000976  min_lr: 0.000976  loss: 3.0222 (2.8129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0288 (0.9062)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 2.2323 (2.8070)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0530 (0.9181)  time: 0.1458  data: 0.0012  max mem: 12911
Epoch: [207] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 2.2323 (2.7884)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0530 (0.9181)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6918 (0.6918)  acc1: 85.2000 (85.2000)  acc5: 96.4000 (96.4000)  time: 5.6764  data: 5.5782  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8492 (0.8464)  acc1: 82.4000 (82.0000)  acc5: 96.8000 (96.4000)  time: 0.7714  data: 0.6733  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0588 (1.0238)  acc1: 76.4000 (77.7143)  acc5: 93.6000 (94.5333)  time: 0.2154  data: 0.1257  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1333 (1.0402)  acc1: 75.2000 (77.4400)  acc5: 93.6000 (94.4480)  time: 0.2114  data: 0.1257  max mem: 12911
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 77.716 Acc@5 94.066 loss 1.037
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.97%
Epoch: [208]  [   0/1251]  eta: 1:02:23  lr: 0.000975  min_lr: 0.000975  loss: 2.1462 (2.1462)  weight_decay: 0.0500 (0.0500)  time: 2.9923  data: 2.6923  max mem: 12911
Epoch: [208]  [ 200/1251]  eta: 0:03:35  lr: 0.000972  min_lr: 0.000972  loss: 2.1305 (2.8192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8357 (0.9209)  time: 0.1883  data: 0.0003  max mem: 12911
Epoch: [208]  [ 400/1251]  eta: 0:02:45  lr: 0.000969  min_lr: 0.000969  loss: 2.2032 (2.7699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8818 (0.9473)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [208]  [ 600/1251]  eta: 0:02:05  lr: 0.000966  min_lr: 0.000966  loss: 2.6976 (2.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9397 (0.9580)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [208]  [ 800/1251]  eta: 0:01:25  lr: 0.000963  min_lr: 0.000963  loss: 2.2292 (2.7631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8477 (0.9881)  time: 0.1834  data: 0.0004  max mem: 12911
Epoch: [208]  [1000/1251]  eta: 0:00:47  lr: 0.000960  min_lr: 0.000960  loss: 2.5392 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7513 (0.9695)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [208]  [1200/1251]  eta: 0:00:09  lr: 0.000956  min_lr: 0.000956  loss: 2.3618 (2.7645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8507 (0.9606)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 2.6716 (2.7746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8933 (0.9631)  time: 0.1482  data: 0.0011  max mem: 12911
Epoch: [208] Total time: 0:03:56 (0.1889 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 2.6716 (2.8015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8933 (0.9631)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.6655 (0.6655)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.1123  data: 4.9721  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8724 (0.8349)  acc1: 83.2000 (82.8000)  acc5: 96.8000 (96.6909)  time: 0.6659  data: 0.5753  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0652 (1.0276)  acc1: 76.0000 (78.4381)  acc5: 94.0000 (94.5333)  time: 0.1947  data: 0.1120  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1683 (1.0385)  acc1: 75.6000 (78.1760)  acc5: 92.8000 (94.3360)  time: 0.2030  data: 0.1220  max mem: 12911
Test: Total time: 0:00:10 (0.4048 s / it)
* Acc@1 77.940 Acc@5 94.222 loss 1.037
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.97%
Epoch: [209]  [   0/1251]  eta: 1:06:32  lr: 0.000956  min_lr: 0.000956  loss: 2.1562 (2.1562)  weight_decay: 0.0500 (0.0500)  time: 3.1912  data: 1.8907  max mem: 12911
Epoch: [209]  [ 200/1251]  eta: 0:03:37  lr: 0.000953  min_lr: 0.000953  loss: 3.0143 (2.8189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9004 (0.9460)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [209]  [ 400/1251]  eta: 0:02:50  lr: 0.000950  min_lr: 0.000950  loss: 2.4084 (2.7486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8185 (0.9133)  time: 0.1979  data: 0.0004  max mem: 12911
Epoch: [209]  [ 600/1251]  eta: 0:02:08  lr: 0.000947  min_lr: 0.000947  loss: 2.1068 (2.7500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8438 (0.9061)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [209]  [ 800/1251]  eta: 0:01:27  lr: 0.000944  min_lr: 0.000944  loss: 2.7177 (2.7674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8567 (0.8884)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [209]  [1000/1251]  eta: 0:00:48  lr: 0.000940  min_lr: 0.000940  loss: 2.2822 (2.7681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (0.8876)  time: 0.1881  data: 0.0005  max mem: 12911
Epoch: [209]  [1200/1251]  eta: 0:00:09  lr: 0.000937  min_lr: 0.000937  loss: 2.2234 (2.7805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.8925)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 2.2858 (2.7845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.8910)  time: 0.1465  data: 0.0011  max mem: 12911
Epoch: [209] Total time: 0:04:00 (0.1920 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 2.2858 (2.7940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.8910)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6711 (0.6711)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.3122  data: 5.2134  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7725 (0.8007)  acc1: 82.4000 (82.2182)  acc5: 96.8000 (96.5818)  time: 0.7141  data: 0.6164  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0056 (0.9817)  acc1: 76.8000 (77.8667)  acc5: 93.6000 (94.4762)  time: 0.2025  data: 0.1136  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1090 (0.9960)  acc1: 75.2000 (77.5200)  acc5: 93.2000 (94.2560)  time: 0.2274  data: 0.1424  max mem: 12911
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 77.934 Acc@5 94.270 loss 0.984
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.97%
Epoch: [210]  [   0/1251]  eta: 0:55:44  lr: 0.000937  min_lr: 0.000937  loss: 3.9053 (3.9053)  weight_decay: 0.0500 (0.0500)  time: 2.6731  data: 2.3133  max mem: 12911
Epoch: [210]  [ 200/1251]  eta: 0:03:35  lr: 0.000934  min_lr: 0.000934  loss: 2.2486 (2.6949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9049 (0.9372)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [210]  [ 400/1251]  eta: 0:02:47  lr: 0.000931  min_lr: 0.000931  loss: 2.2339 (2.7358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8166 (0.9137)  time: 0.1886  data: 0.0006  max mem: 12911
Epoch: [210]  [ 600/1251]  eta: 0:02:06  lr: 0.000928  min_lr: 0.000928  loss: 2.4214 (2.7556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8383 (0.9822)  time: 0.1862  data: 0.0006  max mem: 12911
Epoch: [210]  [ 800/1251]  eta: 0:01:26  lr: 0.000925  min_lr: 0.000925  loss: 2.2898 (2.7639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8439 (0.9640)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [210]  [1000/1251]  eta: 0:00:47  lr: 0.000922  min_lr: 0.000922  loss: 2.6304 (2.7612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.9467)  time: 0.1848  data: 0.0005  max mem: 12911
Epoch: [210]  [1200/1251]  eta: 0:00:09  lr: 0.000918  min_lr: 0.000918  loss: 2.2350 (2.7692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9048 (0.9422)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 2.2170 (2.7680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8703 (0.9413)  time: 0.1475  data: 0.0009  max mem: 12911
Epoch: [210] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 2.2170 (2.7921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8703 (0.9413)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6250 (0.6250)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.7801  data: 5.6854  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8077 (0.8037)  acc1: 83.2000 (83.4546)  acc5: 96.4000 (96.4364)  time: 0.7372  data: 0.6401  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0473 (0.9961)  acc1: 76.4000 (78.6857)  acc5: 94.0000 (94.4762)  time: 0.1958  data: 0.1069  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1456 (1.0065)  acc1: 74.0000 (78.3040)  acc5: 93.6000 (94.4160)  time: 0.1925  data: 0.1075  max mem: 12911
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 77.868 Acc@5 94.276 loss 1.003
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.97%
Epoch: [211]  [   0/1251]  eta: 1:09:17  lr: 0.000918  min_lr: 0.000918  loss: 1.9576 (1.9576)  weight_decay: 0.0500 (0.0500)  time: 3.3232  data: 1.8721  max mem: 12911
Epoch: [211]  [ 200/1251]  eta: 0:03:35  lr: 0.000915  min_lr: 0.000915  loss: 2.1138 (2.7741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8005 (0.9045)  time: 0.1917  data: 0.0004  max mem: 12911
Epoch: [211]  [ 400/1251]  eta: 0:02:47  lr: 0.000912  min_lr: 0.000912  loss: 2.0993 (2.7545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8740 (0.8926)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [211]  [ 600/1251]  eta: 0:02:06  lr: 0.000909  min_lr: 0.000909  loss: 2.7985 (2.7524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7864 (0.8583)  time: 0.1898  data: 0.0004  max mem: 12911
Epoch: [211]  [ 800/1251]  eta: 0:01:26  lr: 0.000906  min_lr: 0.000906  loss: 2.5783 (2.7780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9960 (0.9006)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [211]  [1000/1251]  eta: 0:00:48  lr: 0.000903  min_lr: 0.000903  loss: 2.1814 (2.7703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8205 (0.9023)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [211]  [1200/1251]  eta: 0:00:09  lr: 0.000900  min_lr: 0.000900  loss: 2.2078 (2.7802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8933 (0.8940)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 2.3184 (2.7808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8385 (0.8919)  time: 0.1472  data: 0.0008  max mem: 12911
Epoch: [211] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 2.3184 (2.7747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8385 (0.8919)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6392 (0.6392)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5819  data: 5.4903  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8111 (0.8211)  acc1: 83.2000 (82.2182)  acc5: 96.8000 (96.5455)  time: 0.7321  data: 0.6391  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0764 (0.9911)  acc1: 76.0000 (78.2857)  acc5: 93.2000 (94.5143)  time: 0.2043  data: 0.1175  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0651 (0.9993)  acc1: 74.8000 (78.0800)  acc5: 93.2000 (94.3680)  time: 0.2034  data: 0.1175  max mem: 12911
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 78.124 Acc@5 94.250 loss 1.005
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.12%
Epoch: [212]  [   0/1251]  eta: 1:05:33  lr: 0.000899  min_lr: 0.000899  loss: 3.3920 (3.3920)  weight_decay: 0.0500 (0.0500)  time: 3.1445  data: 2.9190  max mem: 12911
Epoch: [212]  [ 200/1251]  eta: 0:03:29  lr: 0.000896  min_lr: 0.000896  loss: 2.1108 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (0.9236)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [212]  [ 400/1251]  eta: 0:02:43  lr: 0.000893  min_lr: 0.000893  loss: 2.2470 (2.6879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (0.9258)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [212]  [ 600/1251]  eta: 0:02:04  lr: 0.000890  min_lr: 0.000890  loss: 2.9334 (2.7677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7891 (0.9282)  time: 0.1955  data: 0.0005  max mem: 12911
Epoch: [212]  [ 800/1251]  eta: 0:01:26  lr: 0.000887  min_lr: 0.000887  loss: 2.1691 (2.7770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (0.9232)  time: 0.1833  data: 0.0004  max mem: 12911
Epoch: [212]  [1000/1251]  eta: 0:00:47  lr: 0.000884  min_lr: 0.000884  loss: 2.9772 (2.7897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.9338)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [212]  [1200/1251]  eta: 0:00:09  lr: 0.000881  min_lr: 0.000881  loss: 2.7891 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8342 (0.9369)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 2.2597 (2.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8324 (0.9392)  time: 0.1483  data: 0.0008  max mem: 12911
Epoch: [212] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 2.2597 (2.7596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8324 (0.9392)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6202 (0.6202)  acc1: 85.2000 (85.2000)  acc5: 98.4000 (98.4000)  time: 5.3919  data: 5.3003  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8104 (0.8082)  acc1: 83.2000 (82.9091)  acc5: 96.8000 (96.5091)  time: 0.7153  data: 0.6219  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0436 (0.9972)  acc1: 75.2000 (78.5143)  acc5: 94.4000 (94.2857)  time: 0.2070  data: 0.1204  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1338 (1.0115)  acc1: 74.4000 (78.1440)  acc5: 93.6000 (94.2080)  time: 0.2102  data: 0.1256  max mem: 12911
Test: Total time: 0:00:10 (0.4021 s / it)
* Acc@1 78.264 Acc@5 94.254 loss 1.007
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.26%
Epoch: [213]  [   0/1251]  eta: 1:01:13  lr: 0.000880  min_lr: 0.000880  loss: 3.9476 (3.9476)  weight_decay: 0.0500 (0.0500)  time: 2.9364  data: 2.6718  max mem: 12911
Epoch: [213]  [ 200/1251]  eta: 0:03:32  lr: 0.000877  min_lr: 0.000877  loss: 2.2785 (2.7648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8681 (0.9013)  time: 0.1871  data: 0.0003  max mem: 12911
Epoch: [213]  [ 400/1251]  eta: 0:02:46  lr: 0.000874  min_lr: 0.000874  loss: 3.5363 (2.7740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8440 (0.9044)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [213]  [ 600/1251]  eta: 0:02:05  lr: 0.000871  min_lr: 0.000871  loss: 2.2251 (2.7657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8906 (0.8979)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [213]  [ 800/1251]  eta: 0:01:26  lr: 0.000868  min_lr: 0.000868  loss: 2.3801 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9946 (0.9105)  time: 0.1858  data: 0.0005  max mem: 12911
Epoch: [213]  [1000/1251]  eta: 0:00:47  lr: 0.000865  min_lr: 0.000865  loss: 2.2619 (2.7592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8843 (0.9145)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [213]  [1200/1251]  eta: 0:00:09  lr: 0.000863  min_lr: 0.000863  loss: 3.3096 (2.7469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8226 (nan)  time: 0.1888  data: 0.0005  max mem: 12911
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 2.2137 (2.7478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8850 (nan)  time: 0.1465  data: 0.0010  max mem: 12911
Epoch: [213] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 2.2137 (2.7679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8850 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6119 (0.6119)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.5861  data: 5.4944  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8354 (0.7903)  acc1: 83.2000 (83.0909)  acc5: 96.8000 (96.6546)  time: 0.7096  data: 0.6133  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0094 (0.9680)  acc1: 77.6000 (78.9143)  acc5: 94.0000 (94.4571)  time: 0.2008  data: 0.1106  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0430 (0.9866)  acc1: 76.4000 (78.3040)  acc5: 92.8000 (94.2560)  time: 0.2070  data: 0.1204  max mem: 12911
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 78.222 Acc@5 94.272 loss 0.984
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.26%
Epoch: [214]  [   0/1251]  eta: 1:04:23  lr: 0.000862  min_lr: 0.000862  loss: 1.8821 (1.8821)  weight_decay: 0.0500 (0.0500)  time: 3.0884  data: 1.5054  max mem: 12911
Epoch: [214]  [ 200/1251]  eta: 0:03:34  lr: 0.000859  min_lr: 0.000859  loss: 2.3241 (2.7342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8494 (0.9349)  time: 0.1899  data: 0.0005  max mem: 12911
Epoch: [214]  [ 400/1251]  eta: 0:02:47  lr: 0.000856  min_lr: 0.000856  loss: 2.2234 (2.7597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9655 (0.9490)  time: 0.1911  data: 0.0005  max mem: 12911
Epoch: [214]  [ 600/1251]  eta: 0:02:06  lr: 0.000853  min_lr: 0.000853  loss: 2.1759 (2.7285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8626 (0.9319)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [214]  [ 800/1251]  eta: 0:01:26  lr: 0.000850  min_lr: 0.000850  loss: 2.1939 (2.7288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9272 (0.9400)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [214]  [1000/1251]  eta: 0:00:47  lr: 0.000847  min_lr: 0.000847  loss: 3.4448 (2.7317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8966 (0.9476)  time: 0.1845  data: 0.0004  max mem: 12911
Epoch: [214]  [1200/1251]  eta: 0:00:09  lr: 0.000844  min_lr: 0.000844  loss: 2.2247 (2.7329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8516 (0.9446)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 2.0842 (2.7375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9359 (0.9456)  time: 0.1470  data: 0.0006  max mem: 12911
Epoch: [214] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 2.0842 (2.7731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9359 (0.9456)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6589 (0.6589)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.7330  data: 5.6414  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8399 (0.8266)  acc1: 84.0000 (83.3455)  acc5: 96.8000 (96.5818)  time: 0.7578  data: 0.6712  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0376 (1.0079)  acc1: 76.8000 (78.7238)  acc5: 93.6000 (94.3238)  time: 0.2020  data: 0.1183  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1086 (1.0204)  acc1: 76.8000 (78.5440)  acc5: 93.2000 (94.2720)  time: 0.2009  data: 0.1183  max mem: 12911
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 78.316 Acc@5 94.362 loss 1.018
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.32%
Epoch: [215]  [   0/1251]  eta: 0:59:43  lr: 0.000843  min_lr: 0.000843  loss: 1.9231 (1.9231)  weight_decay: 0.0500 (0.0500)  time: 2.8648  data: 2.5874  max mem: 12911
Epoch: [215]  [ 200/1251]  eta: 0:03:34  lr: 0.000841  min_lr: 0.000841  loss: 2.3341 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8190 (0.8782)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [215]  [ 400/1251]  eta: 0:02:46  lr: 0.000838  min_lr: 0.000838  loss: 3.4798 (2.7874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9742 (0.9273)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [215]  [ 600/1251]  eta: 0:02:05  lr: 0.000835  min_lr: 0.000835  loss: 2.2170 (2.7700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9322 (0.9163)  time: 0.1855  data: 0.0005  max mem: 12911
Epoch: [215]  [ 800/1251]  eta: 0:01:26  lr: 0.000832  min_lr: 0.000832  loss: 2.3848 (2.7649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8353 (0.9248)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [215]  [1000/1251]  eta: 0:00:47  lr: 0.000829  min_lr: 0.000829  loss: 2.4658 (2.7797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9187 (0.9233)  time: 0.1882  data: 0.0006  max mem: 12911
Epoch: [215]  [1200/1251]  eta: 0:00:09  lr: 0.000826  min_lr: 0.000826  loss: 2.2451 (2.7779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (0.9257)  time: 0.1881  data: 0.0004  max mem: 12911
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 2.3190 (2.7724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8003 (0.9208)  time: 0.1464  data: 0.0009  max mem: 12911
Epoch: [215] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 2.3190 (2.7664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8003 (0.9208)
Test:  [ 0/25]  eta: 0:01:45  loss: 0.6375 (0.6375)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 4.2136  data: 4.0989  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8401 (0.8067)  acc1: 81.2000 (82.7273)  acc5: 96.4000 (96.6182)  time: 0.6468  data: 0.5490  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0255 (0.9844)  acc1: 77.6000 (78.6857)  acc5: 94.0000 (94.6286)  time: 0.2354  data: 0.1464  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1467 (0.9945)  acc1: 75.6000 (78.3520)  acc5: 93.6000 (94.5440)  time: 0.2228  data: 0.1370  max mem: 12911
Test: Total time: 0:00:09 (0.3934 s / it)
* Acc@1 78.218 Acc@5 94.404 loss 0.988
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.32%
Epoch: [216]  [   0/1251]  eta: 1:04:38  lr: 0.000825  min_lr: 0.000825  loss: 3.6103 (3.6103)  weight_decay: 0.0500 (0.0500)  time: 3.1003  data: 2.0143  max mem: 12911
Epoch: [216]  [ 200/1251]  eta: 0:03:37  lr: 0.000822  min_lr: 0.000822  loss: 3.4769 (2.8210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9981 (0.9279)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [216]  [ 400/1251]  eta: 0:02:47  lr: 0.000819  min_lr: 0.000819  loss: 2.4597 (2.7908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (0.9289)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [216]  [ 600/1251]  eta: 0:02:06  lr: 0.000817  min_lr: 0.000817  loss: 2.2055 (2.7618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9459 (0.9281)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [216]  [ 800/1251]  eta: 0:01:26  lr: 0.000814  min_lr: 0.000814  loss: 2.2929 (2.7677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9495 (0.9194)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [216]  [1000/1251]  eta: 0:00:48  lr: 0.000811  min_lr: 0.000811  loss: 2.2384 (2.7577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (0.9332)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [216]  [1200/1251]  eta: 0:00:09  lr: 0.000808  min_lr: 0.000808  loss: 2.1881 (2.7476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8127 (0.9200)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.3818 (2.7415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8909 (0.9212)  time: 0.1468  data: 0.0008  max mem: 12911
Epoch: [216] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.3818 (2.7645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8909 (0.9212)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6239 (0.6239)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.5077  data: 5.4084  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8370 (0.8072)  acc1: 84.0000 (83.1273)  acc5: 96.4000 (96.4727)  time: 0.7193  data: 0.6217  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0254 (0.9897)  acc1: 75.6000 (78.9714)  acc5: 94.0000 (94.3238)  time: 0.1968  data: 0.1081  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1102 (0.9999)  acc1: 75.6000 (78.6560)  acc5: 93.6000 (94.2880)  time: 0.2038  data: 0.1190  max mem: 12911
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 78.400 Acc@5 94.266 loss 0.996
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.40%
Epoch: [217]  [   0/1251]  eta: 0:58:50  lr: 0.000807  min_lr: 0.000807  loss: 2.2289 (2.2289)  weight_decay: 0.0500 (0.0500)  time: 2.8224  data: 2.4602  max mem: 12911
Epoch: [217]  [ 200/1251]  eta: 0:03:32  lr: 0.000804  min_lr: 0.000804  loss: 2.1707 (2.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8573 (0.9928)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [217]  [ 400/1251]  eta: 0:02:45  lr: 0.000801  min_lr: 0.000801  loss: 2.1049 (2.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8866 (0.9773)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [217]  [ 600/1251]  eta: 0:02:04  lr: 0.000799  min_lr: 0.000799  loss: 3.1265 (2.7686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9199 (0.9718)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [217]  [ 800/1251]  eta: 0:01:26  lr: 0.000796  min_lr: 0.000796  loss: 2.2423 (2.7842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9545 (0.9770)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [217]  [1000/1251]  eta: 0:00:47  lr: 0.000793  min_lr: 0.000793  loss: 2.3277 (2.7818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (0.9869)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [217]  [1200/1251]  eta: 0:00:09  lr: 0.000790  min_lr: 0.000790  loss: 2.5946 (2.7684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8283 (0.9768)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 2.3189 (2.7669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (0.9702)  time: 0.1456  data: 0.0008  max mem: 12911
Epoch: [217] Total time: 0:03:57 (0.1901 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 2.3189 (2.7657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (0.9702)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6020 (0.6020)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.5128  data: 5.4211  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7986 (0.7983)  acc1: 84.4000 (82.8364)  acc5: 97.2000 (96.2909)  time: 0.7498  data: 0.6543  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0542 (0.9703)  acc1: 76.0000 (78.4952)  acc5: 93.6000 (94.3619)  time: 0.2170  data: 0.1289  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0542 (0.9804)  acc1: 76.0000 (78.2560)  acc5: 93.2000 (94.2080)  time: 0.2184  data: 0.1335  max mem: 12911
Test: Total time: 0:00:10 (0.4137 s / it)
* Acc@1 78.362 Acc@5 94.240 loss 0.982
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.40%
Epoch: [218]  [   0/1251]  eta: 1:04:35  lr: 0.000789  min_lr: 0.000789  loss: 3.8669 (3.8669)  weight_decay: 0.0500 (0.0500)  time: 3.0979  data: 2.3791  max mem: 12911
Epoch: [218]  [ 200/1251]  eta: 0:03:33  lr: 0.000786  min_lr: 0.000786  loss: 2.1653 (2.6809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9167 (0.9832)  time: 0.1851  data: 0.0005  max mem: 12911
Epoch: [218]  [ 400/1251]  eta: 0:02:45  lr: 0.000784  min_lr: 0.000784  loss: 2.2572 (2.7221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8516 (0.9407)  time: 0.1830  data: 0.0005  max mem: 12911
Epoch: [218]  [ 600/1251]  eta: 0:02:04  lr: 0.000781  min_lr: 0.000781  loss: 2.2103 (2.7291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (0.9528)  time: 0.1839  data: 0.0004  max mem: 12911
Epoch: [218]  [ 800/1251]  eta: 0:01:25  lr: 0.000778  min_lr: 0.000778  loss: 2.4357 (2.7440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (0.9657)  time: 0.1854  data: 0.0006  max mem: 12911
Epoch: [218]  [1000/1251]  eta: 0:00:47  lr: 0.000775  min_lr: 0.000775  loss: 2.1392 (2.7351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9755 (0.9884)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [218]  [1200/1251]  eta: 0:00:09  lr: 0.000772  min_lr: 0.000772  loss: 2.2012 (2.7423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8194 (0.9840)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 3.4902 (2.7518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8668 (0.9817)  time: 0.1472  data: 0.0007  max mem: 12911
Epoch: [218] Total time: 0:03:56 (0.1888 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 3.4902 (2.7481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8668 (0.9817)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6896 (0.6896)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.6214  data: 5.5254  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8803 (0.8752)  acc1: 84.4000 (82.9455)  acc5: 96.8000 (96.4364)  time: 0.7168  data: 0.6235  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0844 (1.0523)  acc1: 76.4000 (78.6476)  acc5: 93.6000 (94.5333)  time: 0.1955  data: 0.1087  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1845 (1.0632)  acc1: 75.6000 (78.2720)  acc5: 93.2000 (94.3840)  time: 0.1980  data: 0.1128  max mem: 12911
Test: Total time: 0:00:10 (0.4002 s / it)
* Acc@1 78.244 Acc@5 94.386 loss 1.062
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.40%
Epoch: [219]  [   0/1251]  eta: 1:06:02  lr: 0.000771  min_lr: 0.000771  loss: 2.9694 (2.9694)  weight_decay: 0.0500 (0.0500)  time: 3.1672  data: 2.0871  max mem: 12911
Epoch: [219]  [ 200/1251]  eta: 0:03:33  lr: 0.000769  min_lr: 0.000769  loss: 2.4525 (2.8391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8496 (0.9320)  time: 0.1832  data: 0.0003  max mem: 12911
Epoch: [219]  [ 400/1251]  eta: 0:02:45  lr: 0.000766  min_lr: 0.000766  loss: 2.9186 (2.7313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7990 (0.9022)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [219]  [ 600/1251]  eta: 0:02:06  lr: 0.000763  min_lr: 0.000763  loss: 2.5003 (2.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9426 (0.9056)  time: 0.1967  data: 0.0005  max mem: 12911
Epoch: [219]  [ 800/1251]  eta: 0:01:26  lr: 0.000760  min_lr: 0.000760  loss: 2.1930 (2.7353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8437 (0.9002)  time: 0.1936  data: 0.0004  max mem: 12911
Epoch: [219]  [1000/1251]  eta: 0:00:48  lr: 0.000757  min_lr: 0.000757  loss: 2.1855 (2.7314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9349 (0.9062)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [219]  [1200/1251]  eta: 0:00:09  lr: 0.000755  min_lr: 0.000755  loss: 2.1808 (2.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (0.9257)  time: 0.1848  data: 0.0004  max mem: 12911
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.1851 (2.7224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (0.9301)  time: 0.1458  data: 0.0006  max mem: 12911
Epoch: [219] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.1851 (2.7421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (0.9301)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6297 (0.6297)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.6098  data: 5.5155  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7596 (0.7790)  acc1: 83.2000 (82.9455)  acc5: 96.8000 (96.4364)  time: 0.7483  data: 0.6505  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0004 (0.9664)  acc1: 76.4000 (78.4381)  acc5: 94.4000 (94.4952)  time: 0.2044  data: 0.1153  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1259 (0.9773)  acc1: 74.4000 (78.0640)  acc5: 93.6000 (94.4480)  time: 0.2004  data: 0.1152  max mem: 12911
Test: Total time: 0:00:10 (0.4038 s / it)
* Acc@1 78.250 Acc@5 94.366 loss 0.971
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.40%
Epoch: [220]  [   0/1251]  eta: 1:05:56  lr: 0.000754  min_lr: 0.000754  loss: 2.1477 (2.1477)  weight_decay: 0.0500 (0.0500)  time: 3.1630  data: 2.1434  max mem: 12911
Epoch: [220]  [ 200/1251]  eta: 0:03:33  lr: 0.000751  min_lr: 0.000751  loss: 2.1998 (2.7190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8131 (0.9424)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [220]  [ 400/1251]  eta: 0:02:45  lr: 0.000748  min_lr: 0.000748  loss: 2.0597 (2.7068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9395 (0.9622)  time: 0.1839  data: 0.0004  max mem: 12911
Epoch: [220]  [ 600/1251]  eta: 0:02:05  lr: 0.000745  min_lr: 0.000745  loss: 2.4407 (2.7105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (0.9665)  time: 0.1904  data: 0.0005  max mem: 12911
Epoch: [220]  [ 800/1251]  eta: 0:01:26  lr: 0.000743  min_lr: 0.000743  loss: 3.2240 (2.7144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.9570)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [220]  [1000/1251]  eta: 0:00:47  lr: 0.000740  min_lr: 0.000740  loss: 2.4560 (2.7221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (0.9421)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [220]  [1200/1251]  eta: 0:00:09  lr: 0.000737  min_lr: 0.000737  loss: 2.3983 (2.7093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8428 (0.9295)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.3610 (2.7174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.9275)  time: 0.1467  data: 0.0006  max mem: 12911
Epoch: [220] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.3610 (2.7346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.9275)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6128 (0.6128)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.4530  data: 5.3434  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7972 (0.7830)  acc1: 82.4000 (83.0546)  acc5: 97.2000 (96.5818)  time: 0.7144  data: 0.6216  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0012 (0.9610)  acc1: 78.4000 (78.8762)  acc5: 94.4000 (94.4000)  time: 0.2033  data: 0.1161  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0548 (0.9733)  acc1: 75.6000 (78.6080)  acc5: 93.2000 (94.2880)  time: 0.2025  data: 0.1160  max mem: 12911
Test: Total time: 0:00:09 (0.3996 s / it)
* Acc@1 78.422 Acc@5 94.418 loss 0.975
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.42%
Epoch: [221]  [   0/1251]  eta: 0:51:54  lr: 0.000736  min_lr: 0.000736  loss: 2.2868 (2.2868)  weight_decay: 0.0500 (0.0500)  time: 2.4893  data: 2.1426  max mem: 12911
Epoch: [221]  [ 200/1251]  eta: 0:03:35  lr: 0.000734  min_lr: 0.000734  loss: 2.4061 (2.6778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8953 (0.9418)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [221]  [ 400/1251]  eta: 0:02:47  lr: 0.000731  min_lr: 0.000731  loss: 2.2430 (2.7035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9524 (nan)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [221]  [ 600/1251]  eta: 0:02:06  lr: 0.000728  min_lr: 0.000728  loss: 2.1125 (2.6705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9311 (nan)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [221]  [ 800/1251]  eta: 0:01:26  lr: 0.000725  min_lr: 0.000725  loss: 2.5811 (2.6948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9287 (nan)  time: 0.1843  data: 0.0004  max mem: 12911
Epoch: [221]  [1000/1251]  eta: 0:00:48  lr: 0.000722  min_lr: 0.000722  loss: 2.8513 (2.6872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8835 (nan)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [221]  [1200/1251]  eta: 0:00:09  lr: 0.000720  min_lr: 0.000720  loss: 2.0958 (2.6871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (nan)  time: 0.1961  data: 0.0005  max mem: 12911
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.9155 (2.6937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8874 (nan)  time: 0.1467  data: 0.0012  max mem: 12911
Epoch: [221] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.9155 (2.7270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8874 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6769 (0.6769)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.5839  data: 5.4921  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8640 (0.8476)  acc1: 83.2000 (82.5818)  acc5: 96.4000 (96.3273)  time: 0.7496  data: 0.6537  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0585 (1.0275)  acc1: 76.8000 (78.4952)  acc5: 94.4000 (94.5905)  time: 0.2127  data: 0.1248  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1805 (1.0411)  acc1: 75.2000 (78.0480)  acc5: 93.6000 (94.3680)  time: 0.2092  data: 0.1248  max mem: 12911
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 78.336 Acc@5 94.406 loss 1.046
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.42%
Epoch: [222]  [   0/1251]  eta: 0:58:13  lr: 0.000719  min_lr: 0.000719  loss: 2.2475 (2.2475)  weight_decay: 0.0500 (0.0500)  time: 2.7926  data: 2.4399  max mem: 12911
Epoch: [222]  [ 200/1251]  eta: 0:03:35  lr: 0.000716  min_lr: 0.000716  loss: 2.2410 (2.8053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9707 (0.9153)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [222]  [ 400/1251]  eta: 0:02:46  lr: 0.000714  min_lr: 0.000714  loss: 2.3059 (2.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8819 (0.9082)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [222]  [ 600/1251]  eta: 0:02:05  lr: 0.000711  min_lr: 0.000711  loss: 2.1415 (2.7872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9992 (0.9392)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [222]  [ 800/1251]  eta: 0:01:26  lr: 0.000708  min_lr: 0.000708  loss: 2.9853 (2.7611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9271 (0.9480)  time: 0.1906  data: 0.0004  max mem: 12911
Epoch: [222]  [1000/1251]  eta: 0:00:47  lr: 0.000705  min_lr: 0.000705  loss: 2.4973 (2.7482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9761 (0.9435)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [222]  [1200/1251]  eta: 0:00:09  lr: 0.000703  min_lr: 0.000703  loss: 2.8287 (2.7394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8853 (0.9371)  time: 0.1842  data: 0.0004  max mem: 12911
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 3.3852 (2.7431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8935 (0.9345)  time: 0.1470  data: 0.0010  max mem: 12911
Epoch: [222] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 3.3852 (2.7427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8935 (0.9345)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7004 (0.7004)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6023  data: 5.5093  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8859 (0.8554)  acc1: 82.8000 (82.7636)  acc5: 96.8000 (96.6546)  time: 0.7571  data: 0.6622  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0816 (1.0305)  acc1: 76.4000 (78.6667)  acc5: 94.0000 (94.6667)  time: 0.2291  data: 0.1415  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1678 (1.0439)  acc1: 76.4000 (78.4160)  acc5: 93.6000 (94.4960)  time: 0.2263  data: 0.1414  max mem: 12911
Test: Total time: 0:00:10 (0.4230 s / it)
* Acc@1 78.554 Acc@5 94.414 loss 1.047
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.55%
Epoch: [223]  [   0/1251]  eta: 0:55:59  lr: 0.000702  min_lr: 0.000702  loss: 3.7205 (3.7205)  weight_decay: 0.0500 (0.0500)  time: 2.6853  data: 2.4127  max mem: 12911
Epoch: [223]  [ 200/1251]  eta: 0:03:32  lr: 0.000699  min_lr: 0.000699  loss: 2.8228 (2.8070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9995 (1.0634)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [223]  [ 400/1251]  eta: 0:02:45  lr: 0.000696  min_lr: 0.000696  loss: 2.0702 (2.7621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8743 (0.9884)  time: 0.1861  data: 0.0003  max mem: 12911
Epoch: [223]  [ 600/1251]  eta: 0:02:04  lr: 0.000694  min_lr: 0.000694  loss: 2.5191 (2.7544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8148 (0.9737)  time: 0.1836  data: 0.0004  max mem: 12911
Epoch: [223]  [ 800/1251]  eta: 0:01:25  lr: 0.000691  min_lr: 0.000691  loss: 2.2899 (2.7469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9024 (0.9550)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [223]  [1000/1251]  eta: 0:00:47  lr: 0.000688  min_lr: 0.000688  loss: 2.7642 (2.7483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8644 (0.9598)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [223]  [1200/1251]  eta: 0:00:09  lr: 0.000686  min_lr: 0.000686  loss: 2.1878 (2.7353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9324 (0.9659)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 2.7223 (2.7436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9718 (0.9654)  time: 0.1472  data: 0.0011  max mem: 12911
Epoch: [223] Total time: 0:03:56 (0.1889 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 2.7223 (2.7290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9718 (0.9654)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6760 (0.6760)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.1924  data: 5.0957  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8519 (0.8345)  acc1: 83.6000 (83.2727)  acc5: 96.8000 (96.6182)  time: 0.6929  data: 0.5982  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0522 (1.0106)  acc1: 76.4000 (78.6667)  acc5: 94.0000 (94.5905)  time: 0.1985  data: 0.1092  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1393 (1.0224)  acc1: 75.2000 (78.3840)  acc5: 94.0000 (94.4960)  time: 0.2028  data: 0.1161  max mem: 12911
Test: Total time: 0:00:09 (0.3895 s / it)
* Acc@1 78.588 Acc@5 94.460 loss 1.020
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.59%
Epoch: [224]  [   0/1251]  eta: 1:06:02  lr: 0.000685  min_lr: 0.000685  loss: 2.7607 (2.7607)  weight_decay: 0.0500 (0.0500)  time: 3.1676  data: 2.9298  max mem: 12911
Epoch: [224]  [ 200/1251]  eta: 0:03:31  lr: 0.000682  min_lr: 0.000682  loss: 2.1296 (2.6283)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0099 (1.0260)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [224]  [ 400/1251]  eta: 0:02:45  lr: 0.000680  min_lr: 0.000680  loss: 2.1420 (2.6705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8758 (0.9671)  time: 0.1940  data: 0.0004  max mem: 12911
Epoch: [224]  [ 600/1251]  eta: 0:02:05  lr: 0.000677  min_lr: 0.000677  loss: 2.2606 (2.6927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8687 (0.9640)  time: 0.1946  data: 0.0005  max mem: 12911
Epoch: [224]  [ 800/1251]  eta: 0:01:26  lr: 0.000674  min_lr: 0.000674  loss: 2.2031 (2.7090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9149 (0.9636)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [224]  [1000/1251]  eta: 0:00:47  lr: 0.000671  min_lr: 0.000671  loss: 2.3643 (2.7048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9409 (nan)  time: 0.1903  data: 0.0009  max mem: 12911
Epoch: [224]  [1200/1251]  eta: 0:00:09  lr: 0.000669  min_lr: 0.000669  loss: 2.1434 (2.6923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9711 (nan)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.3590 (2.6946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9159 (nan)  time: 0.1466  data: 0.0010  max mem: 12911
Epoch: [224] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.3590 (2.7410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9159 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6603 (0.6603)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.5820  data: 5.4903  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8192 (0.8184)  acc1: 82.8000 (82.5455)  acc5: 97.2000 (96.8364)  time: 0.7365  data: 0.6404  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0561 (0.9882)  acc1: 76.8000 (78.4191)  acc5: 94.0000 (94.8000)  time: 0.1989  data: 0.1103  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0775 (0.9958)  acc1: 76.8000 (78.2560)  acc5: 93.6000 (94.6720)  time: 0.1967  data: 0.1102  max mem: 12911
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 78.478 Acc@5 94.534 loss 0.997
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.59%
Epoch: [225]  [   0/1251]  eta: 1:02:49  lr: 0.000668  min_lr: 0.000668  loss: 1.9017 (1.9017)  weight_decay: 0.0500 (0.0500)  time: 3.0132  data: 2.3613  max mem: 12911
Epoch: [225]  [ 200/1251]  eta: 0:03:34  lr: 0.000665  min_lr: 0.000665  loss: 3.0431 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8553 (0.9227)  time: 0.1907  data: 0.0004  max mem: 12911
Epoch: [225]  [ 400/1251]  eta: 0:02:45  lr: 0.000663  min_lr: 0.000663  loss: 3.5760 (2.7591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0077 (0.9720)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [225]  [ 600/1251]  eta: 0:02:04  lr: 0.000660  min_lr: 0.000660  loss: 2.2500 (2.7327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0106 (nan)  time: 0.1816  data: 0.0004  max mem: 12911
Epoch: [225]  [ 800/1251]  eta: 0:01:25  lr: 0.000657  min_lr: 0.000657  loss: 2.2155 (2.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0104 (nan)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [225]  [1000/1251]  eta: 0:00:47  lr: 0.000655  min_lr: 0.000655  loss: 2.2537 (2.7566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0843 (nan)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [225]  [1200/1251]  eta: 0:00:09  lr: 0.000652  min_lr: 0.000652  loss: 2.7213 (2.7569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9698 (nan)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.2228 (2.7548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8861 (nan)  time: 0.1493  data: 0.0007  max mem: 12911
Epoch: [225] Total time: 0:03:56 (0.1891 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.2228 (2.7220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8861 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5962 (0.5962)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.4434  data: 5.3362  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7384 (0.7542)  acc1: 83.2000 (82.7273)  acc5: 97.2000 (96.8364)  time: 0.7301  data: 0.6376  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9833 (0.9346)  acc1: 77.2000 (79.0095)  acc5: 94.0000 (94.6476)  time: 0.2097  data: 0.1240  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0828 (0.9480)  acc1: 76.4000 (78.6560)  acc5: 94.0000 (94.5120)  time: 0.2094  data: 0.1239  max mem: 12911
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 78.782 Acc@5 94.486 loss 0.945
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.78%
Epoch: [226]  [   0/1251]  eta: 1:05:52  lr: 0.000651  min_lr: 0.000651  loss: 2.2029 (2.2029)  weight_decay: 0.0500 (0.0500)  time: 3.1591  data: 2.9451  max mem: 12911
Epoch: [226]  [ 200/1251]  eta: 0:03:34  lr: 0.000649  min_lr: 0.000649  loss: 2.5181 (2.7608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0177 (0.9871)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [226]  [ 400/1251]  eta: 0:02:46  lr: 0.000646  min_lr: 0.000646  loss: 2.2403 (2.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9178 (1.0138)  time: 0.1847  data: 0.0006  max mem: 12911
Epoch: [226]  [ 600/1251]  eta: 0:02:05  lr: 0.000644  min_lr: 0.000644  loss: 2.1553 (2.7253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8502 (0.9838)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [226]  [ 800/1251]  eta: 0:01:26  lr: 0.000641  min_lr: 0.000641  loss: 2.0902 (2.7201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9606 (0.9660)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [226]  [1000/1251]  eta: 0:00:47  lr: 0.000638  min_lr: 0.000638  loss: 2.1136 (2.7172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9615 (0.9855)  time: 0.1906  data: 0.0004  max mem: 12911
Epoch: [226]  [1200/1251]  eta: 0:00:09  lr: 0.000636  min_lr: 0.000636  loss: 2.2831 (2.7041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8156 (0.9725)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 2.3690 (2.7044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.9654)  time: 0.1462  data: 0.0015  max mem: 12911
Epoch: [226] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 2.3690 (2.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.9654)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6157 (0.6157)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.6449  data: 5.5531  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8566 (0.8060)  acc1: 82.4000 (82.8364)  acc5: 96.8000 (96.8364)  time: 0.7792  data: 0.6854  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0170 (0.9800)  acc1: 76.4000 (78.6476)  acc5: 94.8000 (94.8381)  time: 0.2193  data: 0.1314  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1271 (0.9920)  acc1: 75.6000 (78.2240)  acc5: 94.0000 (94.6560)  time: 0.2181  data: 0.1313  max mem: 12911
Test: Total time: 0:00:10 (0.4173 s / it)
* Acc@1 78.700 Acc@5 94.588 loss 0.993
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.78%
Epoch: [227]  [   0/1251]  eta: 1:05:19  lr: 0.000635  min_lr: 0.000635  loss: 2.1135 (2.1135)  weight_decay: 0.0500 (0.0500)  time: 3.1334  data: 1.8685  max mem: 12911
Epoch: [227]  [ 200/1251]  eta: 0:03:36  lr: 0.000632  min_lr: 0.000632  loss: 2.8413 (2.7385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9277 (1.0082)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [227]  [ 400/1251]  eta: 0:02:47  lr: 0.000630  min_lr: 0.000630  loss: 2.6979 (2.7502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0299 (0.9999)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [227]  [ 600/1251]  eta: 0:02:06  lr: 0.000627  min_lr: 0.000627  loss: 2.1285 (2.7308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8457 (0.9714)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [227]  [ 800/1251]  eta: 0:01:26  lr: 0.000625  min_lr: 0.000625  loss: 2.2817 (2.7212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0302 (0.9832)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [227]  [1000/1251]  eta: 0:00:48  lr: 0.000622  min_lr: 0.000622  loss: 2.2651 (2.7033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9938 (0.9829)  time: 0.1857  data: 0.0004  max mem: 12911
Epoch: [227]  [1200/1251]  eta: 0:00:09  lr: 0.000619  min_lr: 0.000619  loss: 2.3048 (2.6963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9268 (0.9705)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 2.5043 (2.6997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9960 (0.9721)  time: 0.1465  data: 0.0007  max mem: 12911
Epoch: [227] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 2.5043 (2.7236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9960 (0.9721)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6657 (0.6657)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.5710  data: 5.4769  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8746 (0.8486)  acc1: 83.2000 (83.0182)  acc5: 96.8000 (96.9091)  time: 0.7682  data: 0.6702  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0663 (1.0200)  acc1: 76.4000 (78.6095)  acc5: 94.4000 (94.8952)  time: 0.2175  data: 0.1283  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1522 (1.0304)  acc1: 74.4000 (78.3040)  acc5: 94.0000 (94.6400)  time: 0.2135  data: 0.1282  max mem: 12911
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 78.670 Acc@5 94.574 loss 1.026
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.78%
Epoch: [228]  [   0/1251]  eta: 1:04:30  lr: 0.000619  min_lr: 0.000619  loss: 2.0908 (2.0908)  weight_decay: 0.0500 (0.0500)  time: 3.0937  data: 2.6902  max mem: 12911
Epoch: [228]  [ 200/1251]  eta: 0:03:36  lr: 0.000616  min_lr: 0.000616  loss: 2.3519 (2.5972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9442 (0.9348)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [228]  [ 400/1251]  eta: 0:02:48  lr: 0.000614  min_lr: 0.000614  loss: 2.3399 (2.6683)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0000 (0.9760)  time: 0.1924  data: 0.0005  max mem: 12911
Epoch: [228]  [ 600/1251]  eta: 0:02:07  lr: 0.000611  min_lr: 0.000611  loss: 3.1803 (2.7324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9273 (0.9927)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [228]  [ 800/1251]  eta: 0:01:27  lr: 0.000608  min_lr: 0.000608  loss: 2.2430 (2.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8770 (0.9953)  time: 0.1895  data: 0.0005  max mem: 12911
Epoch: [228]  [1000/1251]  eta: 0:00:48  lr: 0.000606  min_lr: 0.000606  loss: 2.2711 (2.7427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8503 (0.9727)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [228]  [1200/1251]  eta: 0:00:09  lr: 0.000603  min_lr: 0.000603  loss: 2.8495 (2.7471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9341 (0.9710)  time: 0.1846  data: 0.0005  max mem: 12911
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 2.4364 (2.7515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1094 (0.9775)  time: 0.1465  data: 0.0008  max mem: 12911
Epoch: [228] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 2.4364 (2.7150)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1094 (0.9775)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.6530 (0.6530)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.1465  data: 5.0101  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8386 (0.8106)  acc1: 81.2000 (82.0000)  acc5: 96.8000 (96.7273)  time: 0.7464  data: 0.6458  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0247 (0.9785)  acc1: 76.4000 (78.4191)  acc5: 94.8000 (94.8000)  time: 0.2388  data: 0.1502  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0976 (0.9920)  acc1: 76.4000 (78.0160)  acc5: 93.2000 (94.5760)  time: 0.2347  data: 0.1494  max mem: 12911
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 78.656 Acc@5 94.612 loss 0.985
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.78%
Epoch: [229]  [   0/1251]  eta: 1:07:49  lr: 0.000603  min_lr: 0.000603  loss: 1.7935 (1.7935)  weight_decay: 0.0500 (0.0500)  time: 3.2527  data: 2.4323  max mem: 12911
Epoch: [229]  [ 200/1251]  eta: 0:03:35  lr: 0.000600  min_lr: 0.000600  loss: 2.1280 (2.7407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9271 (1.0106)  time: 0.1923  data: 0.0004  max mem: 12911
Epoch: [229]  [ 400/1251]  eta: 0:02:46  lr: 0.000597  min_lr: 0.000597  loss: 2.3689 (2.7438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9476 (1.0095)  time: 0.1840  data: 0.0005  max mem: 12911
Epoch: [229]  [ 600/1251]  eta: 0:02:05  lr: 0.000595  min_lr: 0.000595  loss: 2.3908 (2.7465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8903 (1.0004)  time: 0.1984  data: 0.0004  max mem: 12911
Epoch: [229]  [ 800/1251]  eta: 0:01:26  lr: 0.000592  min_lr: 0.000592  loss: 3.0113 (2.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8726 (1.0040)  time: 0.1951  data: 0.0004  max mem: 12911
Epoch: [229]  [1000/1251]  eta: 0:00:48  lr: 0.000590  min_lr: 0.000590  loss: 2.0999 (2.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8190 (0.9860)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [229]  [1200/1251]  eta: 0:00:09  lr: 0.000587  min_lr: 0.000587  loss: 2.2188 (2.7332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8739 (0.9722)  time: 0.1851  data: 0.0005  max mem: 12911
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.2093 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9405 (0.9748)  time: 0.1465  data: 0.0012  max mem: 12911
Epoch: [229] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.2093 (2.7187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9405 (0.9748)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5614 (0.5614)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.7450  data: 5.6533  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7738 (0.7740)  acc1: 83.6000 (83.4545)  acc5: 96.8000 (96.6545)  time: 0.7503  data: 0.6537  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0013 (0.9540)  acc1: 76.8000 (79.0095)  acc5: 94.0000 (94.8000)  time: 0.2028  data: 0.1146  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0766 (0.9650)  acc1: 76.8000 (78.6560)  acc5: 93.6000 (94.6240)  time: 0.1991  data: 0.1145  max mem: 12911
Test: Total time: 0:00:10 (0.4118 s / it)
* Acc@1 78.830 Acc@5 94.574 loss 0.964
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.83%
Epoch: [230]  [   0/1251]  eta: 0:54:41  lr: 0.000587  min_lr: 0.000587  loss: 2.2959 (2.2959)  weight_decay: 0.0500 (0.0500)  time: 2.6235  data: 2.0644  max mem: 12911
Epoch: [230]  [ 200/1251]  eta: 0:03:33  lr: 0.000584  min_lr: 0.000584  loss: 2.1032 (2.7598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9069 (0.9921)  time: 0.1877  data: 0.0004  max mem: 12911
Epoch: [230]  [ 400/1251]  eta: 0:02:46  lr: 0.000582  min_lr: 0.000582  loss: 2.5013 (2.7123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0133 (0.9963)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [230]  [ 600/1251]  eta: 0:02:05  lr: 0.000579  min_lr: 0.000579  loss: 2.0974 (2.7424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9468 (1.0095)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [230]  [ 800/1251]  eta: 0:01:26  lr: 0.000577  min_lr: 0.000577  loss: 2.3730 (2.7454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9701 (1.0001)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [230]  [1000/1251]  eta: 0:00:48  lr: 0.000574  min_lr: 0.000574  loss: 2.1903 (2.7309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9666 (0.9954)  time: 0.1903  data: 0.0003  max mem: 12911
Epoch: [230]  [1200/1251]  eta: 0:00:09  lr: 0.000571  min_lr: 0.000571  loss: 2.5618 (2.7366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (1.0008)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 2.5689 (2.7412)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (1.0025)  time: 0.1476  data: 0.0009  max mem: 12911
Epoch: [230] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 2.5689 (2.7186)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (1.0025)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6688 (0.6688)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.7745  data: 5.6570  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8401 (0.8325)  acc1: 82.4000 (82.5091)  acc5: 96.4000 (96.6546)  time: 0.7582  data: 0.6590  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0454 (1.0002)  acc1: 76.0000 (78.5714)  acc5: 94.4000 (94.6476)  time: 0.2226  data: 0.1328  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1119 (1.0100)  acc1: 76.0000 (78.2560)  acc5: 93.6000 (94.4960)  time: 0.2185  data: 0.1327  max mem: 12911
Test: Total time: 0:00:10 (0.4251 s / it)
* Acc@1 78.762 Acc@5 94.618 loss 1.006
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.83%
Epoch: [231]  [   0/1251]  eta: 1:01:30  lr: 0.000571  min_lr: 0.000571  loss: 3.6611 (3.6611)  weight_decay: 0.0500 (0.0500)  time: 2.9502  data: 1.7047  max mem: 12911
Epoch: [231]  [ 200/1251]  eta: 0:03:31  lr: 0.000568  min_lr: 0.000568  loss: 2.8943 (2.6658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9511 (0.9845)  time: 0.1819  data: 0.0005  max mem: 12911
Epoch: [231]  [ 400/1251]  eta: 0:02:44  lr: 0.000566  min_lr: 0.000566  loss: 2.4004 (2.6785)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0204 (1.0160)  time: 0.1851  data: 0.0004  max mem: 12911
Epoch: [231]  [ 600/1251]  eta: 0:02:04  lr: 0.000563  min_lr: 0.000563  loss: 2.3459 (2.7112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9349 (0.9942)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [231]  [ 800/1251]  eta: 0:01:25  lr: 0.000561  min_lr: 0.000561  loss: 2.1585 (2.6982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (0.9924)  time: 0.1841  data: 0.0005  max mem: 12911
Epoch: [231]  [1000/1251]  eta: 0:00:47  lr: 0.000558  min_lr: 0.000558  loss: 2.5006 (2.6973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.9941)  time: 0.1827  data: 0.0004  max mem: 12911
Epoch: [231]  [1200/1251]  eta: 0:00:09  lr: 0.000556  min_lr: 0.000556  loss: 2.1175 (2.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9090 (0.9957)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 2.0685 (2.7010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8966 (0.9923)  time: 0.1454  data: 0.0009  max mem: 12911
Epoch: [231] Total time: 0:03:54 (0.1876 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 2.0685 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8966 (0.9923)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6324 (0.6324)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.4877  data: 5.3958  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8204 (0.7941)  acc1: 82.8000 (83.0545)  acc5: 97.2000 (96.6909)  time: 0.7593  data: 0.6629  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0092 (0.9598)  acc1: 77.2000 (78.8000)  acc5: 94.0000 (94.6286)  time: 0.2092  data: 0.1198  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0926 (0.9717)  acc1: 75.2000 (78.3520)  acc5: 93.2000 (94.4800)  time: 0.2101  data: 0.1238  max mem: 12911
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 78.824 Acc@5 94.564 loss 0.972
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.83%
Epoch: [232]  [   0/1251]  eta: 1:02:42  lr: 0.000555  min_lr: 0.000555  loss: 2.0431 (2.0431)  weight_decay: 0.0500 (0.0500)  time: 3.0077  data: 1.5787  max mem: 12911
Epoch: [232]  [ 200/1251]  eta: 0:03:37  lr: 0.000553  min_lr: 0.000553  loss: 3.2254 (2.7431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9444 (0.9688)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [232]  [ 400/1251]  eta: 0:02:48  lr: 0.000550  min_lr: 0.000550  loss: 2.1040 (2.7025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8764 (0.9645)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [232]  [ 600/1251]  eta: 0:02:06  lr: 0.000548  min_lr: 0.000548  loss: 2.2905 (2.6999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9109 (0.9557)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [232]  [ 800/1251]  eta: 0:01:26  lr: 0.000545  min_lr: 0.000545  loss: 2.1269 (2.6919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0752 (0.9718)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [232]  [1000/1251]  eta: 0:00:48  lr: 0.000543  min_lr: 0.000543  loss: 2.2272 (2.6939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0885 (0.9974)  time: 0.1865  data: 0.0003  max mem: 12911
Epoch: [232]  [1200/1251]  eta: 0:00:09  lr: 0.000540  min_lr: 0.000540  loss: 2.0525 (2.6949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (0.9910)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.0723 (2.6964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9458 (0.9913)  time: 0.1465  data: 0.0009  max mem: 12911
Epoch: [232] Total time: 0:03:59 (0.1917 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.0723 (2.7070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9458 (0.9913)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6095 (0.6095)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5992  data: 5.4651  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7825 (0.7846)  acc1: 82.4000 (83.0182)  acc5: 96.4000 (96.4364)  time: 0.7440  data: 0.6446  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9884 (0.9523)  acc1: 76.4000 (79.1429)  acc5: 94.4000 (94.7810)  time: 0.2046  data: 0.1163  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0525 (0.9604)  acc1: 76.4000 (78.6880)  acc5: 94.0000 (94.6720)  time: 0.2006  data: 0.1162  max mem: 12911
Test: Total time: 0:00:10 (0.4032 s / it)
* Acc@1 78.870 Acc@5 94.598 loss 0.961
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.87%
Epoch: [233]  [   0/1251]  eta: 0:55:39  lr: 0.000540  min_lr: 0.000540  loss: 4.4142 (4.4142)  weight_decay: 0.0500 (0.0500)  time: 2.6691  data: 2.3950  max mem: 12911
Epoch: [233]  [ 200/1251]  eta: 0:03:34  lr: 0.000537  min_lr: 0.000537  loss: 2.6941 (2.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9561 (1.0775)  time: 0.1891  data: 0.0004  max mem: 12911
Epoch: [233]  [ 400/1251]  eta: 0:02:46  lr: 0.000535  min_lr: 0.000535  loss: 2.2171 (2.7293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.0957)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [233]  [ 600/1251]  eta: 0:02:05  lr: 0.000533  min_lr: 0.000533  loss: 2.1446 (2.7273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9413 (1.0404)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [233]  [ 800/1251]  eta: 0:01:26  lr: 0.000530  min_lr: 0.000530  loss: 2.1849 (2.7234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (1.0412)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [233]  [1000/1251]  eta: 0:00:47  lr: 0.000528  min_lr: 0.000528  loss: 3.5012 (2.7178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9794 (1.0326)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [233]  [1200/1251]  eta: 0:00:09  lr: 0.000525  min_lr: 0.000525  loss: 2.1438 (2.7159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9002 (1.0177)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 2.1507 (2.7148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9300 (1.0181)  time: 0.1456  data: 0.0010  max mem: 12911
Epoch: [233] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 2.1507 (2.7283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9300 (1.0181)
Test:  [ 0/25]  eta: 0:01:44  loss: 0.6152 (0.6152)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 4.1855  data: 4.0882  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7959 (0.7818)  acc1: 82.8000 (83.0182)  acc5: 96.8000 (96.6909)  time: 0.6967  data: 0.6035  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0283 (0.9510)  acc1: 76.0000 (78.9143)  acc5: 94.0000 (94.7429)  time: 0.2572  data: 0.1695  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0803 (0.9617)  acc1: 75.2000 (78.4640)  acc5: 93.6000 (94.6720)  time: 0.2378  data: 0.1519  max mem: 12911
Test: Total time: 0:00:09 (0.3971 s / it)
* Acc@1 78.930 Acc@5 94.542 loss 0.953
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.93%
Epoch: [234]  [   0/1251]  eta: 1:00:57  lr: 0.000525  min_lr: 0.000525  loss: 1.9302 (1.9302)  weight_decay: 0.0500 (0.0500)  time: 2.9234  data: 2.6541  max mem: 12911
Epoch: [234]  [ 200/1251]  eta: 0:03:32  lr: 0.000522  min_lr: 0.000522  loss: 2.2197 (2.7270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (1.0227)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [234]  [ 400/1251]  eta: 0:02:45  lr: 0.000520  min_lr: 0.000520  loss: 2.2666 (2.7757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8664 (1.0304)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [234]  [ 600/1251]  eta: 0:02:05  lr: 0.000517  min_lr: 0.000517  loss: 2.0758 (2.7679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (1.0021)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [234]  [ 800/1251]  eta: 0:01:26  lr: 0.000515  min_lr: 0.000515  loss: 2.9304 (2.7445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0347 (1.0092)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [234]  [1000/1251]  eta: 0:00:47  lr: 0.000513  min_lr: 0.000513  loss: 2.8946 (2.7592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (1.0045)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [234]  [1200/1251]  eta: 0:00:09  lr: 0.000510  min_lr: 0.000510  loss: 2.3349 (2.7528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9971 (1.0050)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.1043 (2.7446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9700 (1.0077)  time: 0.1468  data: 0.0009  max mem: 12911
Epoch: [234] Total time: 0:03:57 (0.1899 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.1043 (2.7023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9700 (1.0077)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5703 (0.5703)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.5631  data: 5.4713  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7814 (0.7598)  acc1: 84.0000 (83.3455)  acc5: 96.8000 (96.7273)  time: 0.7397  data: 0.6459  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0006 (0.9323)  acc1: 76.8000 (79.5048)  acc5: 94.8000 (94.8952)  time: 0.2124  data: 0.1256  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0581 (0.9426)  acc1: 76.8000 (79.0240)  acc5: 94.4000 (94.7680)  time: 0.2102  data: 0.1255  max mem: 12911
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 78.904 Acc@5 94.674 loss 0.943
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.93%
Epoch: [235]  [   0/1251]  eta: 1:08:40  lr: 0.000510  min_lr: 0.000510  loss: 2.4064 (2.4064)  weight_decay: 0.0500 (0.0500)  time: 3.2938  data: 1.6992  max mem: 12911
Epoch: [235]  [ 200/1251]  eta: 0:03:34  lr: 0.000507  min_lr: 0.000507  loss: 2.1496 (2.7608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0281 (1.1011)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [235]  [ 400/1251]  eta: 0:02:45  lr: 0.000505  min_lr: 0.000505  loss: 2.7848 (2.7610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9164 (1.0613)  time: 0.1834  data: 0.0005  max mem: 12911
Epoch: [235]  [ 600/1251]  eta: 0:02:04  lr: 0.000502  min_lr: 0.000502  loss: 2.2369 (2.7380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9404 (1.0558)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [235]  [ 800/1251]  eta: 0:01:25  lr: 0.000500  min_lr: 0.000500  loss: 2.2900 (2.7429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8747 (1.0227)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [235]  [1000/1251]  eta: 0:00:47  lr: 0.000498  min_lr: 0.000498  loss: 2.6850 (2.7420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1281 (1.0381)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [235]  [1200/1251]  eta: 0:00:09  lr: 0.000495  min_lr: 0.000495  loss: 2.8749 (2.7399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9454 (1.0421)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 2.1186 (2.7364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9036 (1.0405)  time: 0.1457  data: 0.0008  max mem: 12911
Epoch: [235] Total time: 0:03:56 (0.1888 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 2.1186 (2.6848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9036 (1.0405)
Test:  [ 0/25]  eta: 0:01:17  loss: 0.5923 (0.5923)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 3.0806  data: 2.9889  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 0.7496 (0.7461)  acc1: 84.4000 (83.7091)  acc5: 97.2000 (96.8364)  time: 0.5681  data: 0.4790  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9622 (0.9184)  acc1: 78.4000 (79.6381)  acc5: 94.4000 (94.9333)  time: 0.2819  data: 0.1931  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0394 (0.9278)  acc1: 77.6000 (79.3760)  acc5: 94.0000 (94.8000)  time: 0.2197  data: 0.1323  max mem: 12911
Test: Total time: 0:00:09 (0.3934 s / it)
* Acc@1 79.128 Acc@5 94.694 loss 0.928
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.13%
Epoch: [236]  [   0/1251]  eta: 0:57:28  lr: 0.000495  min_lr: 0.000495  loss: 3.8421 (3.8421)  weight_decay: 0.0500 (0.0500)  time: 2.7565  data: 2.4763  max mem: 12911
Epoch: [236]  [ 200/1251]  eta: 0:03:32  lr: 0.000492  min_lr: 0.000492  loss: 2.3553 (2.6225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9645 (0.9617)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [236]  [ 400/1251]  eta: 0:02:46  lr: 0.000490  min_lr: 0.000490  loss: 2.1191 (2.6372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8732 (0.9419)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [236]  [ 600/1251]  eta: 0:02:06  lr: 0.000488  min_lr: 0.000488  loss: 2.0934 (2.6580)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0627 (0.9708)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [236]  [ 800/1251]  eta: 0:01:26  lr: 0.000485  min_lr: 0.000485  loss: 2.3633 (2.6662)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1301 (1.0288)  time: 0.1899  data: 0.0004  max mem: 12911
Epoch: [236]  [1000/1251]  eta: 0:00:48  lr: 0.000483  min_lr: 0.000483  loss: 2.1842 (2.6563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8732 (1.0100)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [236]  [1200/1251]  eta: 0:00:09  lr: 0.000481  min_lr: 0.000481  loss: 2.1489 (2.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9711 (1.0003)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 2.3609 (2.6703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9439 (0.9962)  time: 0.1460  data: 0.0010  max mem: 12911
Epoch: [236] Total time: 0:03:59 (0.1913 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 2.3609 (2.6970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9439 (0.9962)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6487 (0.6487)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.1645  data: 5.0590  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8378 (0.8218)  acc1: 83.2000 (83.1636)  acc5: 96.8000 (96.8000)  time: 0.6645  data: 0.5688  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0428 (0.9986)  acc1: 77.6000 (78.9143)  acc5: 95.2000 (95.0286)  time: 0.2000  data: 0.1119  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1005 (1.0042)  acc1: 76.4000 (78.6720)  acc5: 94.4000 (94.9280)  time: 0.2168  data: 0.1323  max mem: 12911
Test: Total time: 0:00:10 (0.4025 s / it)
* Acc@1 78.994 Acc@5 94.616 loss 1.006
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.13%
Epoch: [237]  [   0/1251]  eta: 1:06:51  lr: 0.000480  min_lr: 0.000480  loss: 2.0235 (2.0235)  weight_decay: 0.0500 (0.0500)  time: 3.2066  data: 2.8546  max mem: 12911
Epoch: [237]  [ 200/1251]  eta: 0:03:34  lr: 0.000478  min_lr: 0.000478  loss: 3.3234 (2.6403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0234 (0.9683)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [237]  [ 400/1251]  eta: 0:02:47  lr: 0.000475  min_lr: 0.000475  loss: 2.1276 (2.6850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9518 (0.9804)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [237]  [ 600/1251]  eta: 0:02:05  lr: 0.000473  min_lr: 0.000473  loss: 2.6001 (2.6663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.9675)  time: 0.1892  data: 0.0004  max mem: 12911
Epoch: [237]  [ 800/1251]  eta: 0:01:26  lr: 0.000471  min_lr: 0.000471  loss: 2.0232 (2.6761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9887 (0.9828)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [237]  [1000/1251]  eta: 0:00:47  lr: 0.000468  min_lr: 0.000468  loss: 2.0836 (2.6851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9249 (0.9836)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [237]  [1200/1251]  eta: 0:00:09  lr: 0.000466  min_lr: 0.000466  loss: 2.1380 (2.6845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9030 (0.9876)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.6014 (2.6863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9603 (0.9894)  time: 0.1459  data: 0.0011  max mem: 12911
Epoch: [237] Total time: 0:03:58 (0.1910 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.6014 (2.6974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9603 (0.9894)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6696 (0.6696)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.7643  data: 5.6688  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8058 (0.8100)  acc1: 84.8000 (83.7091)  acc5: 96.8000 (96.8727)  time: 0.7196  data: 0.6213  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0405 (0.9891)  acc1: 77.2000 (78.9524)  acc5: 94.8000 (95.0476)  time: 0.1907  data: 0.0991  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1482 (0.9996)  acc1: 76.0000 (78.5120)  acc5: 94.0000 (94.8320)  time: 0.1867  data: 0.0990  max mem: 12911
Test: Total time: 0:00:09 (0.3995 s / it)
* Acc@1 79.034 Acc@5 94.638 loss 0.995
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.13%
Epoch: [238]  [   0/1251]  eta: 1:05:44  lr: 0.000466  min_lr: 0.000466  loss: 2.7383 (2.7383)  weight_decay: 0.0500 (0.0500)  time: 3.1527  data: 2.3663  max mem: 12911
Epoch: [238]  [ 200/1251]  eta: 0:03:34  lr: 0.000463  min_lr: 0.000463  loss: 2.4909 (2.6756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9720 (1.0322)  time: 0.1876  data: 0.0006  max mem: 12911
Epoch: [238]  [ 400/1251]  eta: 0:02:46  lr: 0.000461  min_lr: 0.000461  loss: 2.1862 (2.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9213 (1.0289)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [238]  [ 600/1251]  eta: 0:02:05  lr: 0.000459  min_lr: 0.000459  loss: 2.1454 (2.6770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9907 (1.0575)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [238]  [ 800/1251]  eta: 0:01:26  lr: 0.000456  min_lr: 0.000456  loss: 2.1538 (2.6560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9134 (1.0245)  time: 0.1879  data: 0.0006  max mem: 12911
Epoch: [238]  [1000/1251]  eta: 0:00:47  lr: 0.000454  min_lr: 0.000454  loss: 2.3418 (2.6583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0022 (1.0201)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [238]  [1200/1251]  eta: 0:00:09  lr: 0.000452  min_lr: 0.000452  loss: 2.1450 (2.6588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1176 (1.0259)  time: 0.1878  data: 0.0006  max mem: 12911
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.2939 (2.6621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1283 (1.0311)  time: 0.1461  data: 0.0008  max mem: 12911
Epoch: [238] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.2939 (2.6938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1283 (1.0311)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5994 (0.5994)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.6526  data: 5.5554  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8073 (0.7896)  acc1: 84.4000 (83.6000)  acc5: 96.4000 (96.6546)  time: 0.7047  data: 0.6101  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9850 (0.9645)  acc1: 77.6000 (79.3905)  acc5: 94.4000 (94.7619)  time: 0.1938  data: 0.1063  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0556 (0.9754)  acc1: 75.2000 (78.8640)  acc5: 93.6000 (94.6400)  time: 0.1980  data: 0.1130  max mem: 12911
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 79.062 Acc@5 94.598 loss 0.969
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.13%
Epoch: [239]  [   0/1251]  eta: 1:04:12  lr: 0.000451  min_lr: 0.000451  loss: 2.1809 (2.1809)  weight_decay: 0.0500 (0.0500)  time: 3.0799  data: 2.6158  max mem: 12911
Epoch: [239]  [ 200/1251]  eta: 0:03:35  lr: 0.000449  min_lr: 0.000449  loss: 2.1713 (2.5975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9252 (0.9791)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [239]  [ 400/1251]  eta: 0:02:46  lr: 0.000447  min_lr: 0.000447  loss: 2.3908 (2.6267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8704 (0.9536)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [239]  [ 600/1251]  eta: 0:02:06  lr: 0.000445  min_lr: 0.000445  loss: 2.5252 (2.6675)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0072 (0.9669)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [239]  [ 800/1251]  eta: 0:01:26  lr: 0.000442  min_lr: 0.000442  loss: 2.0120 (2.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9808 (0.9868)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [239]  [1000/1251]  eta: 0:00:48  lr: 0.000440  min_lr: 0.000440  loss: 2.2467 (2.6757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9117 (0.9866)  time: 0.1959  data: 0.0004  max mem: 12911
Epoch: [239]  [1200/1251]  eta: 0:00:09  lr: 0.000438  min_lr: 0.000438  loss: 2.9130 (2.6736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0721 (0.9964)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 3.2298 (2.6788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0702 (0.9995)  time: 0.1471  data: 0.0007  max mem: 12911
Epoch: [239] Total time: 0:04:00 (0.1919 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 3.2298 (2.6868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0702 (0.9995)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6428 (0.6428)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 5.6793  data: 5.5874  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8115 (0.7996)  acc1: 83.2000 (83.3455)  acc5: 97.2000 (97.0545)  time: 0.7525  data: 0.6598  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0135 (0.9721)  acc1: 77.2000 (79.2952)  acc5: 94.8000 (95.1048)  time: 0.1928  data: 0.1068  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1028 (0.9845)  acc1: 77.2000 (78.8640)  acc5: 93.6000 (94.9440)  time: 0.2128  data: 0.1283  max mem: 12911
Test: Total time: 0:00:10 (0.4143 s / it)
* Acc@1 79.084 Acc@5 94.754 loss 0.981
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.13%
Epoch: [240]  [   0/1251]  eta: 1:05:25  lr: 0.000437  min_lr: 0.000437  loss: 2.1780 (2.1780)  weight_decay: 0.0500 (0.0500)  time: 3.1379  data: 2.4977  max mem: 12911
Epoch: [240]  [ 200/1251]  eta: 0:03:33  lr: 0.000435  min_lr: 0.000435  loss: 2.1231 (2.7076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (1.0616)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [240]  [ 400/1251]  eta: 0:02:45  lr: 0.000433  min_lr: 0.000433  loss: 2.5877 (2.6783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9652 (1.0044)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [240]  [ 600/1251]  eta: 0:02:04  lr: 0.000431  min_lr: 0.000431  loss: 2.2576 (2.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (1.0535)  time: 0.1928  data: 0.0004  max mem: 12911
Epoch: [240]  [ 800/1251]  eta: 0:01:25  lr: 0.000428  min_lr: 0.000428  loss: 2.2732 (2.6693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9954 (1.0396)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [240]  [1000/1251]  eta: 0:00:47  lr: 0.000426  min_lr: 0.000426  loss: 2.6748 (2.6794)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0764 (1.0347)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [240]  [1200/1251]  eta: 0:00:09  lr: 0.000424  min_lr: 0.000424  loss: 2.6850 (2.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1567 (1.0588)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.6439 (2.6712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2014 (1.0647)  time: 0.1456  data: 0.0007  max mem: 12911
Epoch: [240] Total time: 0:03:57 (0.1895 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.6439 (2.6784)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2014 (1.0647)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6302 (0.6302)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.5655  data: 5.4737  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8268 (0.7942)  acc1: 84.4000 (83.3455)  acc5: 97.2000 (96.7273)  time: 0.7331  data: 0.6400  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0017 (0.9670)  acc1: 76.4000 (79.2191)  acc5: 94.8000 (94.8571)  time: 0.1964  data: 0.1097  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0751 (0.9783)  acc1: 76.4000 (78.8160)  acc5: 94.0000 (94.6880)  time: 0.1963  data: 0.1096  max mem: 12911
Test: Total time: 0:00:09 (0.3999 s / it)
* Acc@1 79.240 Acc@5 94.680 loss 0.972
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.24%
Epoch: [241]  [   0/1251]  eta: 1:04:06  lr: 0.000423  min_lr: 0.000423  loss: 1.8612 (1.8612)  weight_decay: 0.0500 (0.0500)  time: 3.0747  data: 2.7994  max mem: 12911
Epoch: [241]  [ 200/1251]  eta: 0:03:31  lr: 0.000421  min_lr: 0.000421  loss: 2.5341 (2.7922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9893 (1.0899)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [241]  [ 400/1251]  eta: 0:02:45  lr: 0.000419  min_lr: 0.000419  loss: 2.7458 (2.7556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8613 (1.0361)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [241]  [ 600/1251]  eta: 0:02:04  lr: 0.000417  min_lr: 0.000417  loss: 2.1190 (2.7009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9649 (1.0069)  time: 0.1898  data: 0.0006  max mem: 12911
Epoch: [241]  [ 800/1251]  eta: 0:01:25  lr: 0.000415  min_lr: 0.000415  loss: 2.0756 (2.7011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8963 (0.9908)  time: 0.1877  data: 0.0008  max mem: 12911
Epoch: [241]  [1000/1251]  eta: 0:00:47  lr: 0.000412  min_lr: 0.000412  loss: 2.1971 (2.6931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9487 (0.9898)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [241]  [1200/1251]  eta: 0:00:09  lr: 0.000410  min_lr: 0.000410  loss: 2.1217 (2.6783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9908 (0.9970)  time: 0.1866  data: 0.0006  max mem: 12911
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 2.1550 (2.6774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0845 (1.0026)  time: 0.1466  data: 0.0013  max mem: 12911
Epoch: [241] Total time: 0:03:57 (0.1897 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 2.1550 (2.6881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0845 (1.0026)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6008 (0.6008)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.6531  data: 5.5615  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7951 (0.7716)  acc1: 84.0000 (83.3091)  acc5: 96.8000 (96.7636)  time: 0.7333  data: 0.6391  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9843 (0.9438)  acc1: 76.8000 (79.0667)  acc5: 94.4000 (95.0857)  time: 0.1935  data: 0.1060  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1027 (0.9569)  acc1: 76.0000 (78.8640)  acc5: 94.0000 (94.8800)  time: 0.1928  data: 0.1059  max mem: 12911
Test: Total time: 0:00:09 (0.3986 s / it)
* Acc@1 79.186 Acc@5 94.658 loss 0.953
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.24%
Epoch: [242]  [   0/1251]  eta: 1:07:50  lr: 0.000410  min_lr: 0.000410  loss: 3.2567 (3.2567)  weight_decay: 0.0500 (0.0500)  time: 3.2539  data: 1.7907  max mem: 12911
Epoch: [242]  [ 200/1251]  eta: 0:03:35  lr: 0.000407  min_lr: 0.000407  loss: 2.7428 (2.6995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9075 (0.9879)  time: 0.1879  data: 0.0006  max mem: 12911
Epoch: [242]  [ 400/1251]  eta: 0:02:47  lr: 0.000405  min_lr: 0.000405  loss: 2.5927 (2.6631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9606 (0.9900)  time: 0.1876  data: 0.0005  max mem: 12911
Epoch: [242]  [ 600/1251]  eta: 0:02:06  lr: 0.000403  min_lr: 0.000403  loss: 2.5607 (2.6687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (0.9924)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [242]  [ 800/1251]  eta: 0:01:26  lr: 0.000401  min_lr: 0.000401  loss: 2.0676 (2.6386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0226 (0.9871)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [242]  [1000/1251]  eta: 0:00:48  lr: 0.000399  min_lr: 0.000399  loss: 2.8806 (2.6486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9120 (0.9919)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [242]  [1200/1251]  eta: 0:00:09  lr: 0.000397  min_lr: 0.000397  loss: 2.6555 (2.6502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9162 (0.9906)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.3879 (2.6503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9311 (0.9899)  time: 0.1531  data: 0.0006  max mem: 12911
Epoch: [242] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.3879 (2.6664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9311 (0.9899)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6254 (0.6254)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 5.3152  data: 5.2099  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7763 (0.7785)  acc1: 84.8000 (83.4545)  acc5: 96.8000 (96.8000)  time: 0.7490  data: 0.6552  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9784 (0.9615)  acc1: 77.2000 (79.1048)  acc5: 94.8000 (94.6095)  time: 0.2194  data: 0.1330  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0968 (0.9717)  acc1: 74.8000 (78.7680)  acc5: 94.0000 (94.5440)  time: 0.2182  data: 0.1328  max mem: 12911
Test: Total time: 0:00:10 (0.4038 s / it)
* Acc@1 79.284 Acc@5 94.648 loss 0.958
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.28%
Epoch: [243]  [   0/1251]  eta: 0:54:27  lr: 0.000396  min_lr: 0.000396  loss: 3.4144 (3.4144)  weight_decay: 0.0500 (0.0500)  time: 2.6120  data: 2.3187  max mem: 12911
Epoch: [243]  [ 200/1251]  eta: 0:03:34  lr: 0.000394  min_lr: 0.000394  loss: 2.4571 (2.6395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8699 (0.9429)  time: 0.1892  data: 0.0004  max mem: 12911
Epoch: [243]  [ 400/1251]  eta: 0:02:47  lr: 0.000392  min_lr: 0.000392  loss: 2.1981 (2.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9253 (1.0406)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [243]  [ 600/1251]  eta: 0:02:06  lr: 0.000390  min_lr: 0.000390  loss: 2.1914 (2.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9889 (1.0565)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [243]  [ 800/1251]  eta: 0:01:26  lr: 0.000388  min_lr: 0.000388  loss: 2.1497 (2.6576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9067 (1.0372)  time: 0.1845  data: 0.0004  max mem: 12911
Epoch: [243]  [1000/1251]  eta: 0:00:47  lr: 0.000385  min_lr: 0.000385  loss: 2.5519 (2.6466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1950 (1.0865)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [243]  [1200/1251]  eta: 0:00:09  lr: 0.000383  min_lr: 0.000383  loss: 2.0770 (2.6548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (1.0726)  time: 0.1885  data: 0.0005  max mem: 12911
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.1546 (2.6539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0689)  time: 0.1474  data: 0.0006  max mem: 12911
Epoch: [243] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.1546 (2.6684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0689)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5899 (0.5899)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.1971  data: 5.0976  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7752 (0.7618)  acc1: 84.0000 (83.9273)  acc5: 96.8000 (96.8364)  time: 0.7378  data: 0.6392  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9601 (0.9405)  acc1: 77.6000 (79.5810)  acc5: 94.4000 (94.9524)  time: 0.2234  data: 0.1342  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0746 (0.9501)  acc1: 76.4000 (79.2320)  acc5: 94.4000 (94.8800)  time: 0.2221  data: 0.1352  max mem: 12911
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 79.204 Acc@5 94.766 loss 0.946
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.28%
Epoch: [244]  [   0/1251]  eta: 1:08:09  lr: 0.000383  min_lr: 0.000383  loss: 3.5871 (3.5871)  weight_decay: 0.0500 (0.0500)  time: 3.2688  data: 1.8885  max mem: 12911
Epoch: [244]  [ 200/1251]  eta: 0:03:34  lr: 0.000381  min_lr: 0.000381  loss: 2.0558 (2.6532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9351 (0.9674)  time: 0.1846  data: 0.0005  max mem: 12911
Epoch: [244]  [ 400/1251]  eta: 0:02:46  lr: 0.000379  min_lr: 0.000379  loss: 3.4856 (2.6831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9916 (0.9828)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [244]  [ 600/1251]  eta: 0:02:05  lr: 0.000377  min_lr: 0.000377  loss: 2.0651 (2.6738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0463 (0.9990)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [244]  [ 800/1251]  eta: 0:01:26  lr: 0.000374  min_lr: 0.000374  loss: 2.0433 (2.6741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9401 (1.0157)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [244]  [1000/1251]  eta: 0:00:47  lr: 0.000372  min_lr: 0.000372  loss: 2.2179 (2.6788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9120 (1.0129)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [244]  [1200/1251]  eta: 0:00:09  lr: 0.000370  min_lr: 0.000370  loss: 3.0695 (2.6873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9175 (1.0098)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.1896 (2.6857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9686 (1.0088)  time: 0.1472  data: 0.0009  max mem: 12911
Epoch: [244] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.1896 (2.6727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9686 (1.0088)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6076 (0.6076)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.5514  data: 5.4213  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8083 (0.7803)  acc1: 83.6000 (83.1273)  acc5: 96.8000 (96.6909)  time: 0.7264  data: 0.6278  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9949 (0.9453)  acc1: 76.4000 (79.2000)  acc5: 94.8000 (94.8381)  time: 0.1978  data: 0.1098  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0351 (0.9561)  acc1: 76.4000 (78.9760)  acc5: 94.4000 (94.7200)  time: 0.1993  data: 0.1137  max mem: 12911
Test: Total time: 0:00:10 (0.4006 s / it)
* Acc@1 79.240 Acc@5 94.770 loss 0.946
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.28%
Epoch: [245]  [   0/1251]  eta: 1:03:11  lr: 0.000370  min_lr: 0.000370  loss: 1.9632 (1.9632)  weight_decay: 0.0500 (0.0500)  time: 3.0309  data: 2.3601  max mem: 12911
Epoch: [245]  [ 200/1251]  eta: 0:03:33  lr: 0.000368  min_lr: 0.000368  loss: 2.5748 (2.6470)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0902 (1.0792)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [245]  [ 400/1251]  eta: 0:02:46  lr: 0.000366  min_lr: 0.000366  loss: 2.3660 (2.6474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9765 (1.0639)  time: 0.1892  data: 0.0004  max mem: 12911
Epoch: [245]  [ 600/1251]  eta: 0:02:05  lr: 0.000364  min_lr: 0.000364  loss: 3.1123 (2.6478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1352 (1.0496)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [245]  [ 800/1251]  eta: 0:01:26  lr: 0.000362  min_lr: 0.000362  loss: 2.0871 (2.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0717 (1.0682)  time: 0.1893  data: 0.0004  max mem: 12911
Epoch: [245]  [1000/1251]  eta: 0:00:48  lr: 0.000359  min_lr: 0.000359  loss: 2.0266 (2.6573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.0567)  time: 0.1870  data: 0.0005  max mem: 12911
Epoch: [245]  [1200/1251]  eta: 0:00:09  lr: 0.000357  min_lr: 0.000357  loss: 2.1447 (2.6593)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0003 (1.0571)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 2.2196 (2.6586)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0387 (1.0600)  time: 0.1467  data: 0.0008  max mem: 12911
Epoch: [245] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 2.2196 (2.6637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0387 (1.0600)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5646 (0.5646)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.6434  data: 5.5519  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7309 (0.7498)  acc1: 84.0000 (83.7091)  acc5: 97.2000 (96.9091)  time: 0.7737  data: 0.6771  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9730 (0.9181)  acc1: 77.2000 (79.5238)  acc5: 94.8000 (94.9333)  time: 0.2212  data: 0.1328  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0239 (0.9294)  acc1: 76.8000 (79.1200)  acc5: 93.6000 (94.7840)  time: 0.2179  data: 0.1327  max mem: 12911
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 79.372 Acc@5 94.762 loss 0.924
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.37%
Epoch: [246]  [   0/1251]  eta: 0:57:55  lr: 0.000357  min_lr: 0.000357  loss: 2.2125 (2.2125)  weight_decay: 0.0500 (0.0500)  time: 2.7785  data: 1.7902  max mem: 12911
Epoch: [246]  [ 200/1251]  eta: 0:03:33  lr: 0.000355  min_lr: 0.000355  loss: 2.1513 (2.6889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0007 (1.0989)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [246]  [ 400/1251]  eta: 0:02:46  lr: 0.000353  min_lr: 0.000353  loss: 1.9891 (2.6632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9580 (1.0798)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [246]  [ 600/1251]  eta: 0:02:06  lr: 0.000351  min_lr: 0.000351  loss: 2.0320 (2.6325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9899 (1.0700)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [246]  [ 800/1251]  eta: 0:01:27  lr: 0.000349  min_lr: 0.000349  loss: 2.0148 (2.6468)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0197 (1.0490)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [246]  [1000/1251]  eta: 0:00:48  lr: 0.000347  min_lr: 0.000347  loss: 3.1744 (2.6492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0465 (1.0716)  time: 0.1882  data: 0.0004  max mem: 12911
Epoch: [246]  [1200/1251]  eta: 0:00:09  lr: 0.000345  min_lr: 0.000345  loss: 2.9329 (2.6687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0885 (1.0762)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 1.9943 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9323 (1.0715)  time: 0.1464  data: 0.0011  max mem: 12911
Epoch: [246] Total time: 0:04:00 (0.1924 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 1.9943 (2.6427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9323 (1.0715)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.5748 (0.5748)  acc1: 86.4000 (86.4000)  acc5: 99.2000 (99.2000)  time: 4.3034  data: 4.1978  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7077 (0.7404)  acc1: 84.4000 (82.9091)  acc5: 96.8000 (97.0182)  time: 0.6660  data: 0.5798  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9594 (0.9112)  acc1: 77.6000 (79.2762)  acc5: 95.2000 (95.0095)  time: 0.2373  data: 0.1547  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9835 (0.9230)  acc1: 77.6000 (78.9760)  acc5: 93.6000 (94.9280)  time: 0.2087  data: 0.1265  max mem: 12911
Test: Total time: 0:00:10 (0.4011 s / it)
* Acc@1 79.456 Acc@5 94.846 loss 0.918
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.46%
Epoch: [247]  [   0/1251]  eta: 0:59:29  lr: 0.000344  min_lr: 0.000344  loss: 2.0525 (2.0525)  weight_decay: 0.0500 (0.0500)  time: 2.8530  data: 2.5981  max mem: 12911
Epoch: [247]  [ 200/1251]  eta: 0:03:33  lr: 0.000342  min_lr: 0.000342  loss: 2.1742 (2.6037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9925 (nan)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [247]  [ 400/1251]  eta: 0:02:46  lr: 0.000340  min_lr: 0.000340  loss: 2.2672 (2.6402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8981 (nan)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [247]  [ 600/1251]  eta: 0:02:05  lr: 0.000338  min_lr: 0.000338  loss: 2.2728 (2.6519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9876 (nan)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [247]  [ 800/1251]  eta: 0:01:26  lr: 0.000336  min_lr: 0.000336  loss: 2.1026 (2.6578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9588 (nan)  time: 0.1920  data: 0.0005  max mem: 12911
Epoch: [247]  [1000/1251]  eta: 0:00:48  lr: 0.000334  min_lr: 0.000334  loss: 2.8236 (2.6442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1315 (nan)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [247]  [1200/1251]  eta: 0:00:09  lr: 0.000332  min_lr: 0.000332  loss: 2.2230 (2.6317)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0415 (nan)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.3806 (2.6396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1181 (nan)  time: 0.1470  data: 0.0013  max mem: 12911
Epoch: [247] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.3806 (2.6502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1181 (nan)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6061 (0.6061)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.3153  data: 5.2164  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7612 (0.7617)  acc1: 83.6000 (83.4909)  acc5: 97.2000 (96.9455)  time: 0.7402  data: 0.6462  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0007 (0.9384)  acc1: 77.2000 (79.4095)  acc5: 94.4000 (95.0476)  time: 0.2202  data: 0.1333  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0461 (0.9476)  acc1: 76.8000 (79.1200)  acc5: 94.0000 (94.9760)  time: 0.2187  data: 0.1332  max mem: 12911
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 79.528 Acc@5 94.870 loss 0.942
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.53%
Epoch: [248]  [   0/1251]  eta: 1:00:23  lr: 0.000332  min_lr: 0.000332  loss: 3.6839 (3.6839)  weight_decay: 0.0500 (0.0500)  time: 2.8968  data: 2.6464  max mem: 12911
Epoch: [248]  [ 200/1251]  eta: 0:03:31  lr: 0.000330  min_lr: 0.000330  loss: 2.1752 (2.6352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9315 (1.0219)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [248]  [ 400/1251]  eta: 0:02:45  lr: 0.000328  min_lr: 0.000328  loss: 2.0743 (2.6366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9360 (1.0219)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [248]  [ 600/1251]  eta: 0:02:05  lr: 0.000326  min_lr: 0.000326  loss: 2.1721 (2.6263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0072 (1.0261)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [248]  [ 800/1251]  eta: 0:01:26  lr: 0.000324  min_lr: 0.000324  loss: 2.0610 (2.6244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0579 (1.0486)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [248]  [1000/1251]  eta: 0:00:47  lr: 0.000322  min_lr: 0.000322  loss: 2.0427 (2.6295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0016 (1.0470)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [248]  [1200/1251]  eta: 0:00:09  lr: 0.000320  min_lr: 0.000320  loss: 2.3268 (2.6428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0359)  time: 0.1882  data: 0.0005  max mem: 12911
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 2.1660 (2.6397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9639 (1.0306)  time: 0.1463  data: 0.0013  max mem: 12911
Epoch: [248] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 2.1660 (2.6502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9639 (1.0306)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6159 (0.6159)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.6530  data: 5.5615  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7736 (0.7719)  acc1: 84.4000 (83.7091)  acc5: 97.2000 (97.0182)  time: 0.7519  data: 0.6553  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9823 (0.9420)  acc1: 77.6000 (79.7143)  acc5: 95.2000 (95.1619)  time: 0.2186  data: 0.1303  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0547 (0.9531)  acc1: 76.8000 (79.2800)  acc5: 94.4000 (95.0720)  time: 0.2147  data: 0.1302  max mem: 12911
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 79.560 Acc@5 94.908 loss 0.950
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.56%
Epoch: [249]  [   0/1251]  eta: 1:04:08  lr: 0.000320  min_lr: 0.000320  loss: 2.0199 (2.0199)  weight_decay: 0.0500 (0.0500)  time: 3.0762  data: 2.8164  max mem: 12911
Epoch: [249]  [ 200/1251]  eta: 0:03:32  lr: 0.000318  min_lr: 0.000318  loss: 2.1831 (2.6035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9902 (0.9797)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [249]  [ 400/1251]  eta: 0:02:46  lr: 0.000316  min_lr: 0.000316  loss: 1.9289 (2.5960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0048 (1.0171)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [249]  [ 600/1251]  eta: 0:02:05  lr: 0.000314  min_lr: 0.000314  loss: 2.2278 (2.6363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2823 (1.0472)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [249]  [ 800/1251]  eta: 0:01:26  lr: 0.000312  min_lr: 0.000312  loss: 2.2872 (2.6469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9137 (1.0492)  time: 0.1930  data: 0.0005  max mem: 12911
Epoch: [249]  [1000/1251]  eta: 0:00:48  lr: 0.000310  min_lr: 0.000310  loss: 2.0346 (2.6271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9728 (1.0352)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [249]  [1200/1251]  eta: 0:00:09  lr: 0.000308  min_lr: 0.000308  loss: 2.3412 (2.6353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0198 (1.0474)  time: 0.1852  data: 0.0007  max mem: 12911
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.0360 (2.6303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0551 (1.0524)  time: 0.1465  data: 0.0013  max mem: 12911
Epoch: [249] Total time: 0:03:59 (0.1918 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.0360 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0551 (1.0524)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6127 (0.6127)  acc1: 86.4000 (86.4000)  acc5: 99.2000 (99.2000)  time: 5.7215  data: 5.6299  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7394 (0.7546)  acc1: 84.0000 (83.3455)  acc5: 96.8000 (96.8000)  time: 0.7581  data: 0.6666  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9400 (0.9222)  acc1: 78.4000 (79.2571)  acc5: 94.8000 (95.0476)  time: 0.2078  data: 0.1216  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9809 (0.9340)  acc1: 76.8000 (78.9920)  acc5: 94.4000 (95.0080)  time: 0.2062  data: 0.1216  max mem: 12911
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 79.440 Acc@5 94.864 loss 0.928
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.56%
Epoch: [250]  [   0/1251]  eta: 1:06:28  lr: 0.000307  min_lr: 0.000307  loss: 1.7319 (1.7319)  weight_decay: 0.0500 (0.0500)  time: 3.1882  data: 2.3007  max mem: 12911
Epoch: [250]  [ 200/1251]  eta: 0:03:34  lr: 0.000306  min_lr: 0.000306  loss: 2.3518 (2.7050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9716 (1.0458)  time: 0.1894  data: 0.0005  max mem: 12911
Epoch: [250]  [ 400/1251]  eta: 0:02:47  lr: 0.000304  min_lr: 0.000304  loss: 2.5913 (2.6525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9392 (1.0383)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [250]  [ 600/1251]  eta: 0:02:06  lr: 0.000302  min_lr: 0.000302  loss: 2.0708 (2.6594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9798 (1.0295)  time: 0.1941  data: 0.0006  max mem: 12911
Epoch: [250]  [ 800/1251]  eta: 0:01:27  lr: 0.000300  min_lr: 0.000300  loss: 2.1910 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9092 (1.0156)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [250]  [1000/1251]  eta: 0:00:48  lr: 0.000298  min_lr: 0.000298  loss: 2.0998 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0192 (1.0229)  time: 0.1903  data: 0.0005  max mem: 12911
Epoch: [250]  [1200/1251]  eta: 0:00:09  lr: 0.000296  min_lr: 0.000296  loss: 2.1810 (2.6505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.0193)  time: 0.1873  data: 0.0006  max mem: 12911
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.8484 (2.6487)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [250] Total time: 0:03:59 (0.1916 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.8484 (2.6435)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6216 (0.6216)  acc1: 87.6000 (87.6000)  acc5: 99.6000 (99.6000)  time: 5.2960  data: 5.2045  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7958 (0.7950)  acc1: 84.0000 (83.8182)  acc5: 96.8000 (96.8727)  time: 0.6961  data: 0.6007  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0061 (0.9684)  acc1: 78.4000 (79.6952)  acc5: 94.8000 (95.0095)  time: 0.1988  data: 0.1110  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0718 (0.9786)  acc1: 78.4000 (79.4240)  acc5: 94.0000 (94.8640)  time: 0.2212  data: 0.1366  max mem: 12911
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 79.482 Acc@5 94.804 loss 0.974
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.56%
Epoch: [251]  [   0/1251]  eta: 1:01:14  lr: 0.000296  min_lr: 0.000296  loss: 1.9022 (1.9022)  weight_decay: 0.0500 (0.0500)  time: 2.9376  data: 1.7347  max mem: 12911
Epoch: [251]  [ 200/1251]  eta: 0:03:34  lr: 0.000294  min_lr: 0.000294  loss: 2.3535 (2.6807)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1293 (1.0992)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [251]  [ 400/1251]  eta: 0:02:46  lr: 0.000292  min_lr: 0.000292  loss: 2.0056 (2.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0579 (1.0777)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [251]  [ 600/1251]  eta: 0:02:05  lr: 0.000290  min_lr: 0.000290  loss: 2.3951 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9713 (1.0613)  time: 0.1846  data: 0.0004  max mem: 12911
Epoch: [251]  [ 800/1251]  eta: 0:01:26  lr: 0.000288  min_lr: 0.000288  loss: 2.1466 (2.6623)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0463 (1.0714)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [251]  [1000/1251]  eta: 0:00:47  lr: 0.000286  min_lr: 0.000286  loss: 2.6170 (2.6588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9746 (1.0755)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [251]  [1200/1251]  eta: 0:00:09  lr: 0.000284  min_lr: 0.000284  loss: 2.1028 (2.6587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0391 (1.0973)  time: 0.1907  data: 0.0006  max mem: 12911
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.1261 (2.6520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0591 (1.0972)  time: 0.1465  data: 0.0009  max mem: 12911
Epoch: [251] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.1261 (2.6420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0591 (1.0972)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5488 (0.5488)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.5585  data: 5.4614  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7669 (0.7403)  acc1: 82.4000 (83.3818)  acc5: 96.8000 (96.8727)  time: 0.6791  data: 0.5845  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9625 (0.9125)  acc1: 77.6000 (79.4857)  acc5: 95.2000 (95.1810)  time: 0.1884  data: 0.1010  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9884 (0.9212)  acc1: 77.2000 (79.0880)  acc5: 94.4000 (95.0560)  time: 0.1937  data: 0.1087  max mem: 12911
Test: Total time: 0:00:09 (0.3962 s / it)
* Acc@1 79.508 Acc@5 94.844 loss 0.919
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.56%
Epoch: [252]  [   0/1251]  eta: 1:03:08  lr: 0.000284  min_lr: 0.000284  loss: 2.2832 (2.2832)  weight_decay: 0.0500 (0.0500)  time: 3.0285  data: 1.5756  max mem: 12911
Epoch: [252]  [ 200/1251]  eta: 0:03:35  lr: 0.000282  min_lr: 0.000282  loss: 1.9812 (2.6435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0732 (1.2063)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [252]  [ 400/1251]  eta: 0:02:47  lr: 0.000280  min_lr: 0.000280  loss: 1.9040 (2.5687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0093 (1.1235)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [252]  [ 600/1251]  eta: 0:02:05  lr: 0.000279  min_lr: 0.000279  loss: 2.0733 (2.5454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8967 (1.0756)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [252]  [ 800/1251]  eta: 0:01:26  lr: 0.000277  min_lr: 0.000277  loss: 2.1109 (2.5968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0445 (1.0670)  time: 0.1943  data: 0.0005  max mem: 12911
Epoch: [252]  [1000/1251]  eta: 0:00:47  lr: 0.000275  min_lr: 0.000275  loss: 2.2569 (2.6145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9307 (1.0592)  time: 0.1912  data: 0.0005  max mem: 12911
Epoch: [252]  [1200/1251]  eta: 0:00:09  lr: 0.000273  min_lr: 0.000273  loss: 2.2245 (2.6082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0209 (1.0530)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.0254 (2.6050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0057 (1.0526)  time: 0.1469  data: 0.0007  max mem: 12911
Epoch: [252] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.0254 (2.6490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0057 (1.0526)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5597 (0.5597)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6403  data: 5.5219  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7654 (0.7485)  acc1: 84.0000 (83.4546)  acc5: 96.8000 (96.7636)  time: 0.7464  data: 0.6597  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9659 (0.9163)  acc1: 77.2000 (79.4857)  acc5: 94.8000 (94.9905)  time: 0.2191  data: 0.1372  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0229 (0.9242)  acc1: 77.2000 (79.1680)  acc5: 94.4000 (95.0080)  time: 0.2170  data: 0.1372  max mem: 12911
Test: Total time: 0:00:10 (0.4164 s / it)
* Acc@1 79.532 Acc@5 94.904 loss 0.923
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.56%
Epoch: [253]  [   0/1251]  eta: 1:07:30  lr: 0.000273  min_lr: 0.000273  loss: 2.4893 (2.4893)  weight_decay: 0.0500 (0.0500)  time: 3.2378  data: 2.6376  max mem: 12911
Epoch: [253]  [ 200/1251]  eta: 0:03:35  lr: 0.000271  min_lr: 0.000271  loss: 3.4991 (2.7774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9905 (1.0648)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [253]  [ 400/1251]  eta: 0:02:48  lr: 0.000269  min_lr: 0.000269  loss: 2.9161 (2.7048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0018 (1.0427)  time: 0.1899  data: 0.0005  max mem: 12911
Epoch: [253]  [ 600/1251]  eta: 0:02:07  lr: 0.000267  min_lr: 0.000267  loss: 2.2833 (2.6600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0073 (1.0308)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [253]  [ 800/1251]  eta: 0:01:27  lr: 0.000265  min_lr: 0.000265  loss: 2.0335 (2.6473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0212 (1.0427)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [253]  [1000/1251]  eta: 0:00:48  lr: 0.000264  min_lr: 0.000264  loss: 2.0890 (2.6249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9735 (1.0335)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [253]  [1200/1251]  eta: 0:00:09  lr: 0.000262  min_lr: 0.000262  loss: 2.1639 (2.6194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9904 (1.0274)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.6491 (2.6203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0267)  time: 0.1467  data: 0.0009  max mem: 12911
Epoch: [253] Total time: 0:04:00 (0.1919 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.6491 (2.6268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0267)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6329 (0.6329)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.2714  data: 5.1801  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8146 (0.7904)  acc1: 83.6000 (84.1091)  acc5: 97.2000 (96.9091)  time: 0.7400  data: 0.6474  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9941 (0.9566)  acc1: 78.0000 (79.9810)  acc5: 94.8000 (94.9714)  time: 0.2185  data: 0.1316  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0595 (0.9662)  acc1: 77.2000 (79.5680)  acc5: 94.4000 (94.9600)  time: 0.2159  data: 0.1322  max mem: 12911
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 79.596 Acc@5 94.874 loss 0.965
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.60%
Epoch: [254]  [   0/1251]  eta: 0:59:24  lr: 0.000261  min_lr: 0.000261  loss: 1.8373 (1.8373)  weight_decay: 0.0500 (0.0500)  time: 2.8490  data: 2.6044  max mem: 12911
Epoch: [254]  [ 200/1251]  eta: 0:03:31  lr: 0.000260  min_lr: 0.000260  loss: 2.1861 (2.6346)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0137 (1.0675)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [254]  [ 400/1251]  eta: 0:02:45  lr: 0.000258  min_lr: 0.000258  loss: 2.8759 (2.6432)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.0514)  time: 0.1909  data: 0.0005  max mem: 12911
Epoch: [254]  [ 600/1251]  eta: 0:02:05  lr: 0.000256  min_lr: 0.000256  loss: 2.6233 (2.6566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (1.0349)  time: 0.1882  data: 0.0006  max mem: 12911
Epoch: [254]  [ 800/1251]  eta: 0:01:25  lr: 0.000254  min_lr: 0.000254  loss: 2.5432 (2.6188)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0420 (1.0349)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [254]  [1000/1251]  eta: 0:00:47  lr: 0.000253  min_lr: 0.000253  loss: 2.0644 (2.6021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9605 (1.0503)  time: 0.1918  data: 0.0005  max mem: 12911
Epoch: [254]  [1200/1251]  eta: 0:00:09  lr: 0.000251  min_lr: 0.000251  loss: 2.6209 (2.6135)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0160 (1.0455)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.1493 (2.6091)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0410 (1.0457)  time: 0.1456  data: 0.0008  max mem: 12911
Epoch: [254] Total time: 0:03:57 (0.1896 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.1493 (2.6166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0410 (1.0457)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6045 (0.6045)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5993  data: 5.5077  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7633 (0.7721)  acc1: 82.8000 (83.2727)  acc5: 97.2000 (96.9455)  time: 0.7493  data: 0.6545  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9850 (0.9373)  acc1: 78.0000 (79.8857)  acc5: 94.0000 (95.0095)  time: 0.2199  data: 0.1321  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0525 (0.9463)  acc1: 78.0000 (79.4080)  acc5: 94.0000 (94.9120)  time: 0.2179  data: 0.1320  max mem: 12911
Test: Total time: 0:00:10 (0.4160 s / it)
* Acc@1 79.602 Acc@5 94.906 loss 0.943
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.60%
Epoch: [255]  [   0/1251]  eta: 0:49:06  lr: 0.000250  min_lr: 0.000250  loss: 1.8749 (1.8749)  weight_decay: 0.0500 (0.0500)  time: 2.3553  data: 1.7187  max mem: 12911
Epoch: [255]  [ 200/1251]  eta: 0:03:34  lr: 0.000249  min_lr: 0.000249  loss: 2.2319 (2.5771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8771 (1.0087)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [255]  [ 400/1251]  eta: 0:02:47  lr: 0.000247  min_lr: 0.000247  loss: 2.1889 (2.5885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9406 (1.0018)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [255]  [ 600/1251]  eta: 0:02:06  lr: 0.000245  min_lr: 0.000245  loss: 2.2583 (2.6063)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0705 (1.0464)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [255]  [ 800/1251]  eta: 0:01:27  lr: 0.000244  min_lr: 0.000244  loss: 2.3252 (2.6155)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0396 (1.0434)  time: 0.1869  data: 0.0005  max mem: 12911
Epoch: [255]  [1000/1251]  eta: 0:00:48  lr: 0.000242  min_lr: 0.000242  loss: 2.0916 (2.6026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0244 (1.0522)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [255]  [1200/1251]  eta: 0:00:09  lr: 0.000240  min_lr: 0.000240  loss: 2.0015 (2.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0361 (1.0490)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.0234 (2.6043)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0068 (1.0490)  time: 0.1467  data: 0.0011  max mem: 12911
Epoch: [255] Total time: 0:03:59 (0.1912 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.0234 (2.6256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0068 (1.0490)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5403 (0.5403)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.1917  data: 5.0939  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7317 (0.7374)  acc1: 84.0000 (83.8545)  acc5: 96.8000 (96.9455)  time: 0.7139  data: 0.6182  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9695 (0.9016)  acc1: 78.4000 (80.0952)  acc5: 95.2000 (95.1048)  time: 0.2134  data: 0.1241  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9981 (0.9122)  acc1: 76.4000 (79.6640)  acc5: 94.4000 (95.0080)  time: 0.2137  data: 0.1270  max mem: 12911
Test: Total time: 0:00:09 (0.3981 s / it)
* Acc@1 79.650 Acc@5 94.882 loss 0.915
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.65%
Epoch: [256]  [   0/1251]  eta: 1:00:10  lr: 0.000240  min_lr: 0.000240  loss: 1.9866 (1.9866)  weight_decay: 0.0500 (0.0500)  time: 2.8861  data: 2.6069  max mem: 12911
Epoch: [256]  [ 200/1251]  eta: 0:03:30  lr: 0.000238  min_lr: 0.000238  loss: 2.0525 (2.5275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0279 (1.0462)  time: 0.1855  data: 0.0005  max mem: 12911
Epoch: [256]  [ 400/1251]  eta: 0:02:44  lr: 0.000236  min_lr: 0.000236  loss: 2.1011 (2.5823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9030 (1.0260)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [256]  [ 600/1251]  eta: 0:02:04  lr: 0.000235  min_lr: 0.000235  loss: 2.0119 (2.5834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0071 (1.0310)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [256]  [ 800/1251]  eta: 0:01:26  lr: 0.000233  min_lr: 0.000233  loss: 2.0356 (2.5993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0568 (1.0493)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [256]  [1000/1251]  eta: 0:00:47  lr: 0.000231  min_lr: 0.000231  loss: 2.2317 (2.6080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9799 (1.0467)  time: 0.1970  data: 0.0005  max mem: 12911
Epoch: [256]  [1200/1251]  eta: 0:00:09  lr: 0.000230  min_lr: 0.000230  loss: 2.1760 (2.5996)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0027 (1.0430)  time: 0.1916  data: 0.0005  max mem: 12911
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.2687 (2.6002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0208 (1.0433)  time: 0.1470  data: 0.0009  max mem: 12911
Epoch: [256] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.2687 (2.6306)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0208 (1.0433)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5721 (0.5721)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.1724  data: 5.0338  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7554 (0.7481)  acc1: 83.2000 (83.4182)  acc5: 97.2000 (96.8364)  time: 0.7074  data: 0.6061  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9813 (0.9102)  acc1: 77.2000 (79.7905)  acc5: 94.4000 (94.8762)  time: 0.2009  data: 0.1124  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0138 (0.9209)  acc1: 77.2000 (79.3120)  acc5: 94.0000 (94.8800)  time: 0.1979  data: 0.1134  max mem: 12911
Test: Total time: 0:00:09 (0.3850 s / it)
* Acc@1 79.656 Acc@5 94.902 loss 0.920
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.66%
Epoch: [257]  [   0/1251]  eta: 0:54:55  lr: 0.000229  min_lr: 0.000229  loss: 1.8234 (1.8234)  weight_decay: 0.0500 (0.0500)  time: 2.6342  data: 2.3551  max mem: 12911
Epoch: [257]  [ 200/1251]  eta: 0:03:33  lr: 0.000228  min_lr: 0.000228  loss: 2.0689 (2.6256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1356 (1.1568)  time: 0.1909  data: 0.0004  max mem: 12911
Epoch: [257]  [ 400/1251]  eta: 0:02:46  lr: 0.000226  min_lr: 0.000226  loss: 2.2933 (2.6705)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0106 (1.1450)  time: 0.1896  data: 0.0004  max mem: 12911
Epoch: [257]  [ 600/1251]  eta: 0:02:05  lr: 0.000224  min_lr: 0.000224  loss: 2.0197 (2.6862)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.1073)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [257]  [ 800/1251]  eta: 0:01:26  lr: 0.000223  min_lr: 0.000223  loss: 2.0497 (2.6791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1078 (1.1012)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [257]  [1000/1251]  eta: 0:00:47  lr: 0.000221  min_lr: 0.000221  loss: 2.3044 (2.6600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1187 (1.0939)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [257]  [1200/1251]  eta: 0:00:09  lr: 0.000219  min_lr: 0.000219  loss: 1.9907 (2.6488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (1.0854)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.1048 (2.6451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9476 (1.0810)  time: 0.1462  data: 0.0008  max mem: 12911
Epoch: [257] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.1048 (2.6206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9476 (1.0810)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5503 (0.5503)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6501  data: 5.5585  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7407 (0.7403)  acc1: 84.4000 (83.7091)  acc5: 97.2000 (96.8000)  time: 0.6818  data: 0.5859  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9570 (0.9073)  acc1: 78.0000 (79.6952)  acc5: 94.4000 (95.0286)  time: 0.1781  data: 0.0902  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0153 (0.9167)  acc1: 77.2000 (79.3600)  acc5: 94.4000 (94.9600)  time: 0.1981  data: 0.1136  max mem: 12911
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 79.876 Acc@5 94.904 loss 0.917
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.88%
Epoch: [258]  [   0/1251]  eta: 1:00:09  lr: 0.000219  min_lr: 0.000219  loss: 2.0240 (2.0240)  weight_decay: 0.0500 (0.0500)  time: 2.8850  data: 2.6182  max mem: 12911
Epoch: [258]  [ 200/1251]  eta: 0:03:32  lr: 0.000217  min_lr: 0.000217  loss: 2.3563 (2.5716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9309 (1.0233)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [258]  [ 400/1251]  eta: 0:02:45  lr: 0.000216  min_lr: 0.000216  loss: 2.2343 (2.5871)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (1.0477)  time: 0.1883  data: 0.0005  max mem: 12911
Epoch: [258]  [ 600/1251]  eta: 0:02:04  lr: 0.000214  min_lr: 0.000214  loss: 2.1767 (2.5997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9572 (1.0296)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [258]  [ 800/1251]  eta: 0:01:25  lr: 0.000212  min_lr: 0.000212  loss: 2.4783 (2.6136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0239 (1.0348)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [258]  [1000/1251]  eta: 0:00:47  lr: 0.000211  min_lr: 0.000211  loss: 2.4587 (2.6120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9342 (1.0373)  time: 0.1849  data: 0.0005  max mem: 12911
Epoch: [258]  [1200/1251]  eta: 0:00:09  lr: 0.000209  min_lr: 0.000209  loss: 2.2648 (2.6227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0096 (1.0385)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.3244 (2.6260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0026 (1.0389)  time: 0.1475  data: 0.0007  max mem: 12911
Epoch: [258] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.3244 (2.6131)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0026 (1.0389)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5920 (0.5920)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6113  data: 5.4906  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7978 (0.7644)  acc1: 83.6000 (83.8182)  acc5: 96.8000 (96.7636)  time: 0.7599  data: 0.6622  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9607 (0.9336)  acc1: 78.0000 (79.4667)  acc5: 94.8000 (94.8952)  time: 0.2212  data: 0.1333  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0476 (0.9447)  acc1: 76.4000 (79.0560)  acc5: 93.6000 (94.8480)  time: 0.2172  data: 0.1333  max mem: 12911
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 79.678 Acc@5 94.950 loss 0.938
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.88%
Epoch: [259]  [   0/1251]  eta: 1:02:38  lr: 0.000209  min_lr: 0.000209  loss: 3.6095 (3.6095)  weight_decay: 0.0500 (0.0500)  time: 3.0046  data: 1.7207  max mem: 12911
Epoch: [259]  [ 200/1251]  eta: 0:03:38  lr: 0.000207  min_lr: 0.000207  loss: 2.2173 (2.6477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9555 (1.0016)  time: 0.1890  data: 0.0005  max mem: 12911
Epoch: [259]  [ 400/1251]  eta: 0:02:48  lr: 0.000206  min_lr: 0.000206  loss: 1.9378 (2.6279)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0536 (1.0383)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [259]  [ 600/1251]  eta: 0:02:07  lr: 0.000204  min_lr: 0.000204  loss: 2.0494 (2.6437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0830 (1.0687)  time: 0.1893  data: 0.0004  max mem: 12911
Epoch: [259]  [ 800/1251]  eta: 0:01:27  lr: 0.000203  min_lr: 0.000203  loss: 3.0255 (2.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9828 (1.0603)  time: 0.1927  data: 0.0005  max mem: 12911
Epoch: [259]  [1000/1251]  eta: 0:00:48  lr: 0.000201  min_lr: 0.000201  loss: 2.6566 (2.6496)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0630 (1.0684)  time: 0.1903  data: 0.0005  max mem: 12911
Epoch: [259]  [1200/1251]  eta: 0:00:09  lr: 0.000199  min_lr: 0.000199  loss: 2.1389 (2.6546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0312 (1.0695)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.7453 (2.6604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0312 (1.0699)  time: 0.1520  data: 0.0005  max mem: 12911
Epoch: [259] Total time: 0:04:01 (0.1931 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.7453 (2.6157)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0312 (1.0699)
Test:  [ 0/25]  eta: 0:01:40  loss: 0.6489 (0.6489)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 4.0316  data: 3.9206  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.8152 (0.8148)  acc1: 84.0000 (83.5273)  acc5: 97.2000 (96.8364)  time: 0.6629  data: 0.5725  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0153 (0.9827)  acc1: 77.6000 (79.3714)  acc5: 94.8000 (95.2000)  time: 0.2508  data: 0.1658  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0881 (0.9937)  acc1: 75.6000 (79.0400)  acc5: 94.4000 (95.0240)  time: 0.2261  data: 0.1434  max mem: 12911
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 79.480 Acc@5 94.926 loss 0.994
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.88%
Epoch: [260]  [   0/1251]  eta: 1:05:00  lr: 0.000199  min_lr: 0.000199  loss: 2.1791 (2.1791)  weight_decay: 0.0500 (0.0500)  time: 3.1176  data: 2.0224  max mem: 12911
Epoch: [260]  [ 200/1251]  eta: 0:03:37  lr: 0.000197  min_lr: 0.000197  loss: 2.3956 (2.6376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9566 (1.0202)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [260]  [ 400/1251]  eta: 0:02:48  lr: 0.000196  min_lr: 0.000196  loss: 2.2230 (2.6604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9973 (1.0396)  time: 0.1866  data: 0.0003  max mem: 12911
Epoch: [260]  [ 600/1251]  eta: 0:02:06  lr: 0.000194  min_lr: 0.000194  loss: 2.3458 (2.6452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9566 (1.0428)  time: 0.1898  data: 0.0005  max mem: 12911
Epoch: [260]  [ 800/1251]  eta: 0:01:27  lr: 0.000193  min_lr: 0.000193  loss: 2.5098 (2.6061)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0331 (1.0520)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [260]  [1000/1251]  eta: 0:00:48  lr: 0.000191  min_lr: 0.000191  loss: 2.0769 (2.6134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9126 (1.0463)  time: 0.1914  data: 0.0005  max mem: 12911
Epoch: [260]  [1200/1251]  eta: 0:00:09  lr: 0.000190  min_lr: 0.000190  loss: 2.4514 (2.6049)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0201 (1.0439)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.2338 (2.6003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0137 (1.0433)  time: 0.1493  data: 0.0006  max mem: 12911
Epoch: [260] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.2338 (2.6110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0137 (1.0433)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5811 (0.5811)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.4972  data: 5.4054  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7510 (0.7549)  acc1: 84.0000 (83.5273)  acc5: 97.2000 (97.0546)  time: 0.7402  data: 0.6463  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9715 (0.9228)  acc1: 78.0000 (79.6000)  acc5: 94.4000 (95.1429)  time: 0.2233  data: 0.1366  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0154 (0.9321)  acc1: 76.8000 (79.2960)  acc5: 94.0000 (95.0240)  time: 0.2210  data: 0.1365  max mem: 12911
Test: Total time: 0:00:10 (0.4143 s / it)
* Acc@1 79.832 Acc@5 94.968 loss 0.931
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.88%
Epoch: [261]  [   0/1251]  eta: 1:05:55  lr: 0.000189  min_lr: 0.000189  loss: 1.8401 (1.8401)  weight_decay: 0.0500 (0.0500)  time: 3.1623  data: 2.9173  max mem: 12911
Epoch: [261]  [ 200/1251]  eta: 0:03:34  lr: 0.000188  min_lr: 0.000188  loss: 1.9725 (2.5298)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0103 (1.0064)  time: 0.1859  data: 0.0006  max mem: 12911
Epoch: [261]  [ 400/1251]  eta: 0:02:47  lr: 0.000186  min_lr: 0.000186  loss: 2.2398 (2.5231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (1.0363)  time: 0.1898  data: 0.0004  max mem: 12911
Epoch: [261]  [ 600/1251]  eta: 0:02:06  lr: 0.000185  min_lr: 0.000185  loss: 2.0739 (2.5517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0980 (1.0812)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [261]  [ 800/1251]  eta: 0:01:27  lr: 0.000183  min_lr: 0.000183  loss: 2.2444 (2.5763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9953 (1.0766)  time: 0.1896  data: 0.0005  max mem: 12911
Epoch: [261]  [1000/1251]  eta: 0:00:48  lr: 0.000182  min_lr: 0.000182  loss: 2.4819 (2.5757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0395 (1.0684)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [261]  [1200/1251]  eta: 0:00:09  lr: 0.000180  min_lr: 0.000180  loss: 2.2902 (2.5976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9740 (1.0639)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.0268 (2.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (1.0590)  time: 0.1461  data: 0.0009  max mem: 12911
Epoch: [261] Total time: 0:03:59 (0.1917 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.0268 (2.6009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (1.0590)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5555 (0.5555)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.2960  data: 5.2004  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7180 (0.7320)  acc1: 83.6000 (83.4545)  acc5: 97.2000 (96.8727)  time: 0.7499  data: 0.6521  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9343 (0.8960)  acc1: 77.6000 (79.6952)  acc5: 95.2000 (95.1810)  time: 0.2193  data: 0.1298  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9993 (0.9059)  acc1: 77.2000 (79.4240)  acc5: 94.8000 (95.0400)  time: 0.2155  data: 0.1297  max mem: 12911
Test: Total time: 0:00:10 (0.4039 s / it)
* Acc@1 79.774 Acc@5 94.990 loss 0.904
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.88%
Epoch: [262]  [   0/1251]  eta: 1:03:58  lr: 0.000180  min_lr: 0.000180  loss: 2.9363 (2.9363)  weight_decay: 0.0500 (0.0500)  time: 3.0680  data: 2.4708  max mem: 12911
Epoch: [262]  [ 200/1251]  eta: 0:03:36  lr: 0.000179  min_lr: 0.000179  loss: 2.1199 (2.6078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0068 (1.0246)  time: 0.1930  data: 0.0005  max mem: 12911
Epoch: [262]  [ 400/1251]  eta: 0:02:48  lr: 0.000177  min_lr: 0.000177  loss: 2.3636 (2.6281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9789 (1.0353)  time: 0.1874  data: 0.0005  max mem: 12911
Epoch: [262]  [ 600/1251]  eta: 0:02:06  lr: 0.000176  min_lr: 0.000176  loss: 2.0015 (2.6022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0239 (1.0545)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [262]  [ 800/1251]  eta: 0:01:26  lr: 0.000174  min_lr: 0.000174  loss: 1.9548 (2.5943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9296 (1.0352)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [262]  [1000/1251]  eta: 0:00:48  lr: 0.000173  min_lr: 0.000173  loss: 2.0689 (2.5980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9667 (1.0233)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [262]  [1200/1251]  eta: 0:00:09  lr: 0.000171  min_lr: 0.000171  loss: 2.6531 (2.5930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (1.0313)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 3.2022 (2.5976)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0605 (1.0332)  time: 0.1468  data: 0.0008  max mem: 12911
Epoch: [262] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 3.2022 (2.6013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0605 (1.0332)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6736 (0.6736)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.3618  data: 5.2703  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8424 (0.8318)  acc1: 84.0000 (83.4545)  acc5: 96.8000 (97.0182)  time: 0.7002  data: 0.6049  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0457 (1.0028)  acc1: 78.4000 (79.7524)  acc5: 94.8000 (95.2191)  time: 0.1975  data: 0.1099  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1140 (1.0132)  acc1: 78.4000 (79.3120)  acc5: 94.0000 (95.0720)  time: 0.2033  data: 0.1189  max mem: 12911
Test: Total time: 0:00:09 (0.3955 s / it)
* Acc@1 79.728 Acc@5 94.964 loss 1.017
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.88%
Epoch: [263]  [   0/1251]  eta: 1:04:06  lr: 0.000171  min_lr: 0.000171  loss: 2.9040 (2.9040)  weight_decay: 0.0500 (0.0500)  time: 3.0745  data: 2.0780  max mem: 12911
Epoch: [263]  [ 200/1251]  eta: 0:03:34  lr: 0.000169  min_lr: 0.000169  loss: 2.3238 (2.7533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0202 (1.0627)  time: 0.1850  data: 0.0004  max mem: 12911
Epoch: [263]  [ 400/1251]  eta: 0:02:46  lr: 0.000168  min_lr: 0.000168  loss: 1.9347 (2.6978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (1.0495)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [263]  [ 600/1251]  eta: 0:02:07  lr: 0.000167  min_lr: 0.000167  loss: 1.9795 (2.6860)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0450 (1.0388)  time: 0.1917  data: 0.0005  max mem: 12911
Epoch: [263]  [ 800/1251]  eta: 0:01:27  lr: 0.000165  min_lr: 0.000165  loss: 2.0308 (2.6873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (1.0411)  time: 0.1911  data: 0.0004  max mem: 12911
Epoch: [263]  [1000/1251]  eta: 0:00:48  lr: 0.000164  min_lr: 0.000164  loss: 3.0309 (2.6570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1088 (1.0482)  time: 0.1909  data: 0.0005  max mem: 12911
Epoch: [263]  [1200/1251]  eta: 0:00:09  lr: 0.000162  min_lr: 0.000162  loss: 2.1025 (2.6697)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0083 (1.0565)  time: 0.1873  data: 0.0005  max mem: 12911
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.3564 (2.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (1.0674)  time: 0.1474  data: 0.0011  max mem: 12911
Epoch: [263] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.3564 (2.5978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (1.0674)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5839 (0.5839)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.6621  data: 5.5606  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7656 (0.7594)  acc1: 82.8000 (83.4909)  acc5: 97.2000 (96.9818)  time: 0.7419  data: 0.6456  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9660 (0.9257)  acc1: 78.0000 (79.6952)  acc5: 94.8000 (95.1238)  time: 0.1983  data: 0.1073  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0233 (0.9353)  acc1: 78.0000 (79.2160)  acc5: 94.8000 (95.0240)  time: 0.2045  data: 0.1160  max mem: 12911
Test: Total time: 0:00:10 (0.4101 s / it)
* Acc@1 79.762 Acc@5 94.954 loss 0.932
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.88%
Epoch: [264]  [   0/1251]  eta: 1:02:38  lr: 0.000162  min_lr: 0.000162  loss: 1.8966 (1.8966)  weight_decay: 0.0500 (0.0500)  time: 3.0043  data: 2.2834  max mem: 12911
Epoch: [264]  [ 200/1251]  eta: 0:03:34  lr: 0.000160  min_lr: 0.000160  loss: 2.1257 (2.7728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9529 (1.0268)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [264]  [ 400/1251]  eta: 0:02:46  lr: 0.000159  min_lr: 0.000159  loss: 2.1855 (2.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0116 (1.0345)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [264]  [ 600/1251]  eta: 0:02:05  lr: 0.000158  min_lr: 0.000158  loss: 3.1465 (2.6709)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0938 (1.0526)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [264]  [ 800/1251]  eta: 0:01:26  lr: 0.000156  min_lr: 0.000156  loss: 3.3202 (2.6782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9799 (1.0444)  time: 0.1997  data: 0.0005  max mem: 12911
Epoch: [264]  [1000/1251]  eta: 0:00:47  lr: 0.000155  min_lr: 0.000155  loss: 2.3860 (2.6615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (nan)  time: 0.1864  data: 0.0004  max mem: 12911
Epoch: [264]  [1200/1251]  eta: 0:00:09  lr: 0.000154  min_lr: 0.000154  loss: 2.0584 (2.6566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0283 (nan)  time: 0.1906  data: 0.0004  max mem: 12911
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 1.9995 (2.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0118 (nan)  time: 0.1465  data: 0.0011  max mem: 12911
Epoch: [264] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 1.9995 (2.6009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0118 (nan)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5654 (0.5654)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.1800  data: 5.0478  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7385 (0.7456)  acc1: 82.8000 (83.4182)  acc5: 97.2000 (96.9818)  time: 0.7538  data: 0.6539  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9615 (0.9067)  acc1: 78.0000 (79.8095)  acc5: 94.8000 (95.2191)  time: 0.2315  data: 0.1427  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0192 (0.9163)  acc1: 77.2000 (79.5040)  acc5: 94.0000 (94.9760)  time: 0.2278  data: 0.1426  max mem: 12911
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 79.890 Acc@5 94.968 loss 0.917
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.89%
Epoch: [265]  [   0/1251]  eta: 1:00:12  lr: 0.000153  min_lr: 0.000153  loss: 1.8551 (1.8551)  weight_decay: 0.0500 (0.0500)  time: 2.8879  data: 2.6252  max mem: 12911
Epoch: [265]  [ 200/1251]  eta: 0:03:33  lr: 0.000152  min_lr: 0.000152  loss: 2.1413 (2.5341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0231 (1.0449)  time: 0.1853  data: 0.0005  max mem: 12911
Epoch: [265]  [ 400/1251]  eta: 0:02:46  lr: 0.000150  min_lr: 0.000150  loss: 2.8577 (2.5542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9815 (1.0366)  time: 0.1860  data: 0.0005  max mem: 12911
Epoch: [265]  [ 600/1251]  eta: 0:02:05  lr: 0.000149  min_lr: 0.000149  loss: 2.0578 (2.5652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0357 (1.0481)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [265]  [ 800/1251]  eta: 0:01:26  lr: 0.000148  min_lr: 0.000148  loss: 2.0200 (2.5787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0103 (1.0497)  time: 0.1863  data: 0.0004  max mem: 12911
Epoch: [265]  [1000/1251]  eta: 0:00:47  lr: 0.000146  min_lr: 0.000146  loss: 2.1073 (2.5897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9636 (1.0358)  time: 0.1843  data: 0.0004  max mem: 12911
Epoch: [265]  [1200/1251]  eta: 0:00:09  lr: 0.000145  min_lr: 0.000145  loss: 2.0136 (2.6031)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0245 (1.0361)  time: 0.1886  data: 0.0003  max mem: 12911
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.1001 (2.6091)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0518 (1.0362)  time: 0.1464  data: 0.0008  max mem: 12911
Epoch: [265] Total time: 0:03:56 (0.1891 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.1001 (2.6069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0518 (1.0362)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.5993 (0.5993)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.1508  data: 5.0525  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7733 (0.7670)  acc1: 84.0000 (83.5273)  acc5: 97.2000 (97.0546)  time: 0.7161  data: 0.6213  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9736 (0.9345)  acc1: 78.0000 (79.9238)  acc5: 94.8000 (95.2000)  time: 0.2295  data: 0.1425  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0471 (0.9446)  acc1: 76.8000 (79.6480)  acc5: 94.4000 (95.0720)  time: 0.2272  data: 0.1425  max mem: 12911
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 79.934 Acc@5 94.998 loss 0.944
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [266]  [   0/1251]  eta: 1:05:54  lr: 0.000145  min_lr: 0.000145  loss: 2.8486 (2.8486)  weight_decay: 0.0500 (0.0500)  time: 3.1608  data: 2.8853  max mem: 12911
Epoch: [266]  [ 200/1251]  eta: 0:03:33  lr: 0.000143  min_lr: 0.000143  loss: 2.1324 (2.5781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0255 (1.0572)  time: 0.1912  data: 0.0006  max mem: 12911
Epoch: [266]  [ 400/1251]  eta: 0:02:45  lr: 0.000142  min_lr: 0.000142  loss: 2.2172 (2.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0262 (1.0316)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [266]  [ 600/1251]  eta: 0:02:05  lr: 0.000141  min_lr: 0.000141  loss: 2.0346 (2.5799)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0310 (1.0457)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [266]  [ 800/1251]  eta: 0:01:26  lr: 0.000139  min_lr: 0.000139  loss: 1.9998 (2.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1817 (1.0693)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [266]  [1000/1251]  eta: 0:00:47  lr: 0.000138  min_lr: 0.000138  loss: 2.0454 (2.6023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0137 (1.0592)  time: 0.1859  data: 0.0004  max mem: 12911
Epoch: [266]  [1200/1251]  eta: 0:00:09  lr: 0.000137  min_lr: 0.000137  loss: 3.3167 (2.6038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9850 (1.0539)  time: 0.1832  data: 0.0004  max mem: 12911
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 1.9247 (2.5970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9767 (1.0530)  time: 0.1524  data: 0.0012  max mem: 12911
Epoch: [266] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 1.9247 (2.5895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9767 (1.0530)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5399 (0.5399)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.7056  data: 5.6138  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7243 (0.7237)  acc1: 83.6000 (83.8182)  acc5: 97.2000 (97.0546)  time: 0.7565  data: 0.6601  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9196 (0.8846)  acc1: 78.0000 (80.0571)  acc5: 94.8000 (95.2762)  time: 0.2177  data: 0.1295  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9548 (0.8950)  acc1: 78.0000 (79.6960)  acc5: 94.0000 (95.0720)  time: 0.2143  data: 0.1294  max mem: 12911
Test: Total time: 0:00:10 (0.4181 s / it)
* Acc@1 79.902 Acc@5 95.014 loss 0.896
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [267]  [   0/1251]  eta: 1:08:08  lr: 0.000136  min_lr: 0.000136  loss: 2.8076 (2.8076)  weight_decay: 0.0500 (0.0500)  time: 3.2678  data: 2.7880  max mem: 12911
Epoch: [267]  [ 200/1251]  eta: 0:03:32  lr: 0.000135  min_lr: 0.000135  loss: 2.1600 (2.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0231 (1.0801)  time: 0.1859  data: 0.0006  max mem: 12911
Epoch: [267]  [ 400/1251]  eta: 0:02:46  lr: 0.000134  min_lr: 0.000134  loss: 2.4856 (2.6060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0083 (1.0637)  time: 0.1931  data: 0.0004  max mem: 12911
Epoch: [267]  [ 600/1251]  eta: 0:02:05  lr: 0.000133  min_lr: 0.000133  loss: 2.1131 (2.5997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0658 (1.0566)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [267]  [ 800/1251]  eta: 0:01:26  lr: 0.000131  min_lr: 0.000131  loss: 2.4484 (2.5873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9838 (1.0489)  time: 0.1881  data: 0.0005  max mem: 12911
Epoch: [267]  [1000/1251]  eta: 0:00:47  lr: 0.000130  min_lr: 0.000130  loss: 2.0729 (2.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0040 (1.0506)  time: 0.1898  data: 0.0006  max mem: 12911
Epoch: [267]  [1200/1251]  eta: 0:00:09  lr: 0.000129  min_lr: 0.000129  loss: 2.0930 (2.5894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0995 (1.0547)  time: 0.1880  data: 0.0004  max mem: 12911
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.3539 (2.5959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9775 (1.0527)  time: 0.1469  data: 0.0008  max mem: 12911
Epoch: [267] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.3539 (2.5974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9775 (1.0527)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6130 (0.6130)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5056  data: 5.3745  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7787 (0.7835)  acc1: 83.6000 (84.1818)  acc5: 97.2000 (96.9818)  time: 0.7370  data: 0.6383  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9857 (0.9486)  acc1: 78.8000 (80.2095)  acc5: 94.8000 (95.3333)  time: 0.2146  data: 0.1271  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0432 (0.9583)  acc1: 78.4000 (79.8560)  acc5: 94.8000 (95.2000)  time: 0.2105  data: 0.1270  max mem: 12911
Test: Total time: 0:00:10 (0.4079 s / it)
* Acc@1 80.008 Acc@5 94.996 loss 0.963
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.01%
Epoch: [268]  [   0/1251]  eta: 0:58:15  lr: 0.000128  min_lr: 0.000128  loss: 1.7843 (1.7843)  weight_decay: 0.0500 (0.0500)  time: 2.7939  data: 2.4942  max mem: 12911
Epoch: [268]  [ 200/1251]  eta: 0:03:31  lr: 0.000127  min_lr: 0.000127  loss: 2.6085 (2.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0482 (1.0864)  time: 0.1828  data: 0.0004  max mem: 12911
Epoch: [268]  [ 400/1251]  eta: 0:02:45  lr: 0.000126  min_lr: 0.000126  loss: 2.6493 (2.6049)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0128 (1.1096)  time: 0.1927  data: 0.0004  max mem: 12911
Epoch: [268]  [ 600/1251]  eta: 0:02:05  lr: 0.000125  min_lr: 0.000125  loss: 1.9066 (2.5609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9736 (1.0848)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [268]  [ 800/1251]  eta: 0:01:26  lr: 0.000123  min_lr: 0.000123  loss: 1.9724 (2.5515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (1.0752)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [268]  [1000/1251]  eta: 0:00:47  lr: 0.000122  min_lr: 0.000122  loss: 1.9303 (2.5526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0139 (1.0744)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [268]  [1200/1251]  eta: 0:00:09  lr: 0.000121  min_lr: 0.000121  loss: 1.8972 (2.5650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0167 (1.0718)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.0042 (2.5663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0414 (1.0757)  time: 0.1474  data: 0.0011  max mem: 12911
Epoch: [268] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.0042 (2.5778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0414 (1.0757)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5653 (0.5653)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.3546  data: 5.2616  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7228 (0.7435)  acc1: 84.0000 (84.2909)  acc5: 97.2000 (96.9818)  time: 0.6376  data: 0.5528  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9574 (0.9084)  acc1: 78.0000 (80.1714)  acc5: 94.8000 (95.1810)  time: 0.1906  data: 0.1072  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0111 (0.9184)  acc1: 77.2000 (79.7920)  acc5: 94.4000 (95.0720)  time: 0.2096  data: 0.1264  max mem: 12911
Test: Total time: 0:00:09 (0.3997 s / it)
* Acc@1 80.092 Acc@5 95.038 loss 0.919
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.09%
Epoch: [269]  [   0/1251]  eta: 0:57:57  lr: 0.000121  min_lr: 0.000121  loss: 2.0112 (2.0112)  weight_decay: 0.0500 (0.0500)  time: 2.7801  data: 2.3978  max mem: 12911
Epoch: [269]  [ 200/1251]  eta: 0:03:33  lr: 0.000120  min_lr: 0.000120  loss: 2.0403 (2.5184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (1.0303)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [269]  [ 400/1251]  eta: 0:02:46  lr: 0.000118  min_lr: 0.000118  loss: 2.9442 (2.5538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0077 (1.0242)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [269]  [ 600/1251]  eta: 0:02:04  lr: 0.000117  min_lr: 0.000117  loss: 2.0706 (2.5439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (1.0248)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [269]  [ 800/1251]  eta: 0:01:25  lr: 0.000116  min_lr: 0.000116  loss: 3.0714 (2.5536)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0290 (1.0255)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [269]  [1000/1251]  eta: 0:00:47  lr: 0.000115  min_lr: 0.000115  loss: 2.2529 (2.5670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0612 (1.0332)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [269]  [1200/1251]  eta: 0:00:09  lr: 0.000113  min_lr: 0.000113  loss: 2.3453 (2.5794)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0314 (1.0309)  time: 0.1964  data: 0.0005  max mem: 12911
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.8166 (2.5878)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0673 (1.0327)  time: 0.1480  data: 0.0009  max mem: 12911
Epoch: [269] Total time: 0:03:56 (0.1891 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.8166 (2.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0673 (1.0327)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6538 (0.6538)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.2952  data: 5.2036  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.8287 (0.8287)  acc1: 83.6000 (83.7818)  acc5: 96.8000 (97.0182)  time: 0.7500  data: 0.6533  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 1.0222 (0.9947)  acc1: 77.6000 (79.8476)  acc5: 95.2000 (95.2000)  time: 0.2276  data: 0.1393  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.1035 (1.0072)  acc1: 76.8000 (79.3920)  acc5: 94.4000 (95.0240)  time: 0.2240  data: 0.1392  max mem: 12911
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 79.844 Acc@5 95.028 loss 1.008
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 80.09%
Epoch: [270]  [   0/1251]  eta: 1:01:41  lr: 0.000113  min_lr: 0.000113  loss: 2.1399 (2.1399)  weight_decay: 0.0500 (0.0500)  time: 2.9592  data: 2.5593  max mem: 12911
Epoch: [270]  [ 200/1251]  eta: 0:03:39  lr: 0.000112  min_lr: 0.000112  loss: 2.7988 (2.5649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9496 (1.0005)  time: 0.1890  data: 0.0004  max mem: 12911
Epoch: [270]  [ 400/1251]  eta: 0:02:49  lr: 0.000111  min_lr: 0.000111  loss: 2.4184 (2.6124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9631 (1.0172)  time: 0.1912  data: 0.0004  max mem: 12911
Epoch: [270]  [ 600/1251]  eta: 0:02:07  lr: 0.000110  min_lr: 0.000110  loss: 2.0734 (2.5796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9709 (1.0221)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [270]  [ 800/1251]  eta: 0:01:27  lr: 0.000109  min_lr: 0.000109  loss: 2.0310 (2.5798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9892 (1.0212)  time: 0.1846  data: 0.0004  max mem: 12911
Epoch: [270]  [1000/1251]  eta: 0:00:48  lr: 0.000107  min_lr: 0.000107  loss: 2.0243 (2.5806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0639 (1.0236)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [270]  [1200/1251]  eta: 0:00:09  lr: 0.000106  min_lr: 0.000106  loss: 1.9132 (2.5742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0316 (1.0276)  time: 0.1875  data: 0.0004  max mem: 12911
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.0083 (2.5707)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0052 (1.0268)  time: 0.1483  data: 0.0008  max mem: 12911
Epoch: [270] Total time: 0:04:00 (0.1924 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.0083 (2.5886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0052 (1.0268)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5224 (0.5224)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.2540  data: 5.1566  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7029 (0.7203)  acc1: 83.6000 (83.6364)  acc5: 97.2000 (96.9091)  time: 0.6229  data: 0.5352  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9430 (0.8823)  acc1: 78.0000 (80.0191)  acc5: 95.2000 (95.2381)  time: 0.1677  data: 0.0818  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9796 (0.8928)  acc1: 77.2000 (79.6640)  acc5: 94.4000 (95.0560)  time: 0.1796  data: 0.0942  max mem: 12911
Test: Total time: 0:00:09 (0.3866 s / it)
* Acc@1 79.952 Acc@5 95.016 loss 0.892
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.09%
Epoch: [271]  [   0/1251]  eta: 1:10:16  lr: 0.000106  min_lr: 0.000106  loss: 1.9176 (1.9176)  weight_decay: 0.0500 (0.0500)  time: 3.3706  data: 2.2538  max mem: 12911
Epoch: [271]  [ 200/1251]  eta: 0:03:33  lr: 0.000105  min_lr: 0.000105  loss: 2.0777 (2.5247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0130 (1.0274)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [271]  [ 400/1251]  eta: 0:02:46  lr: 0.000104  min_lr: 0.000104  loss: 2.0251 (2.5376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0922 (1.0376)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [271]  [ 600/1251]  eta: 0:02:05  lr: 0.000102  min_lr: 0.000102  loss: 2.6961 (2.6031)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0383 (1.0392)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [271]  [ 800/1251]  eta: 0:01:26  lr: 0.000101  min_lr: 0.000101  loss: 2.0052 (2.5856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9765 (1.0371)  time: 0.1854  data: 0.0005  max mem: 12911
Epoch: [271]  [1000/1251]  eta: 0:00:47  lr: 0.000100  min_lr: 0.000100  loss: 1.9627 (2.5590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0917 (1.0446)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [271]  [1200/1251]  eta: 0:00:09  lr: 0.000099  min_lr: 0.000099  loss: 2.1176 (2.5559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9875 (1.0527)  time: 0.1871  data: 0.0005  max mem: 12911
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 1.9336 (2.5495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.0516)  time: 0.1464  data: 0.0007  max mem: 12911
Epoch: [271] Total time: 0:03:57 (0.1895 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 1.9336 (2.5721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.0516)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5461 (0.5461)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.6707  data: 5.5790  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7168 (0.7305)  acc1: 83.6000 (84.2909)  acc5: 97.2000 (97.0546)  time: 0.7495  data: 0.6557  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9326 (0.8942)  acc1: 78.0000 (80.4571)  acc5: 95.2000 (95.2000)  time: 0.2124  data: 0.1256  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9995 (0.9042)  acc1: 77.6000 (80.0480)  acc5: 94.8000 (95.1040)  time: 0.2101  data: 0.1255  max mem: 12911
Test: Total time: 0:00:10 (0.4123 s / it)
* Acc@1 80.078 Acc@5 95.026 loss 0.905
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.09%
Epoch: [272]  [   0/1251]  eta: 1:04:50  lr: 0.000099  min_lr: 0.000099  loss: 3.4014 (3.4014)  weight_decay: 0.0500 (0.0500)  time: 3.1103  data: 2.6715  max mem: 12911
Epoch: [272]  [ 200/1251]  eta: 0:03:33  lr: 0.000098  min_lr: 0.000098  loss: 1.9134 (2.6046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0374 (1.1035)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [272]  [ 400/1251]  eta: 0:02:46  lr: 0.000097  min_lr: 0.000097  loss: 2.0602 (2.5320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9933 (1.0439)  time: 0.1925  data: 0.0004  max mem: 12911
Epoch: [272]  [ 600/1251]  eta: 0:02:05  lr: 0.000096  min_lr: 0.000096  loss: 2.8851 (2.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1268 (1.0462)  time: 0.1884  data: 0.0004  max mem: 12911
Epoch: [272]  [ 800/1251]  eta: 0:01:26  lr: 0.000094  min_lr: 0.000094  loss: 2.6507 (2.5324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0693 (1.0533)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [272]  [1000/1251]  eta: 0:00:47  lr: 0.000093  min_lr: 0.000093  loss: 2.0044 (2.5308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0533 (1.0517)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [272]  [1200/1251]  eta: 0:00:09  lr: 0.000092  min_lr: 0.000092  loss: 2.0338 (2.5520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (1.0476)  time: 0.1855  data: 0.0005  max mem: 12911
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.0605 (2.5521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9896 (1.0485)  time: 0.1461  data: 0.0010  max mem: 12911
Epoch: [272] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.0605 (2.5790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9896 (1.0485)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5979 (0.5979)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.4198  data: 5.3160  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7928 (0.7794)  acc1: 84.0000 (83.7091)  acc5: 97.2000 (97.0546)  time: 0.7137  data: 0.6214  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9834 (0.9433)  acc1: 78.0000 (80.0952)  acc5: 95.2000 (95.1810)  time: 0.2047  data: 0.1194  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0578 (0.9545)  acc1: 77.2000 (79.7600)  acc5: 94.8000 (95.0880)  time: 0.2038  data: 0.1193  max mem: 12911
Test: Total time: 0:00:09 (0.3959 s / it)
* Acc@1 80.004 Acc@5 95.016 loss 0.955
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.09%
Epoch: [273]  [   0/1251]  eta: 1:03:38  lr: 0.000092  min_lr: 0.000092  loss: 1.9086 (1.9086)  weight_decay: 0.0500 (0.0500)  time: 3.0524  data: 2.3446  max mem: 12911
Epoch: [273]  [ 200/1251]  eta: 0:03:35  lr: 0.000091  min_lr: 0.000091  loss: 2.2288 (2.6840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9788 (1.0259)  time: 0.1844  data: 0.0004  max mem: 12911
Epoch: [273]  [ 400/1251]  eta: 0:02:47  lr: 0.000090  min_lr: 0.000090  loss: 2.8535 (2.6173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0497 (1.0569)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [273]  [ 600/1251]  eta: 0:02:06  lr: 0.000089  min_lr: 0.000089  loss: 2.0759 (2.5951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9791 (1.0400)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [273]  [ 800/1251]  eta: 0:01:26  lr: 0.000088  min_lr: 0.000088  loss: 2.1006 (2.5795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0657 (1.0485)  time: 0.1840  data: 0.0005  max mem: 12911
Epoch: [273]  [1000/1251]  eta: 0:00:48  lr: 0.000087  min_lr: 0.000087  loss: 2.6402 (2.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0061 (1.0451)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [273]  [1200/1251]  eta: 0:00:09  lr: 0.000086  min_lr: 0.000086  loss: 2.2663 (2.5661)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0269 (1.0437)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.0096 (2.5661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9941 (1.0430)  time: 0.1457  data: 0.0008  max mem: 12911
Epoch: [273] Total time: 0:03:58 (0.1908 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.0096 (2.5703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9941 (1.0430)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5429 (0.5429)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.7606  data: 5.6689  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7172 (0.7304)  acc1: 83.2000 (84.1818)  acc5: 97.2000 (97.2727)  time: 0.7596  data: 0.6658  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9483 (0.8950)  acc1: 78.0000 (80.3238)  acc5: 95.6000 (95.3905)  time: 0.2059  data: 0.1192  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0019 (0.9077)  acc1: 78.0000 (79.9360)  acc5: 94.0000 (95.1520)  time: 0.2036  data: 0.1191  max mem: 12911
Test: Total time: 0:00:10 (0.4113 s / it)
* Acc@1 80.148 Acc@5 95.022 loss 0.908
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [274]  [   0/1251]  eta: 0:59:40  lr: 0.000085  min_lr: 0.000085  loss: 2.6358 (2.6358)  weight_decay: 0.0500 (0.0500)  time: 2.8619  data: 2.5840  max mem: 12911
Epoch: [274]  [ 200/1251]  eta: 0:03:32  lr: 0.000084  min_lr: 0.000084  loss: 2.0329 (2.5751)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (1.0171)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [274]  [ 400/1251]  eta: 0:02:45  lr: 0.000083  min_lr: 0.000083  loss: 1.9536 (2.5789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9814 (1.0147)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [274]  [ 600/1251]  eta: 0:02:05  lr: 0.000082  min_lr: 0.000082  loss: 2.2190 (2.5966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9736 (1.0297)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [274]  [ 800/1251]  eta: 0:01:26  lr: 0.000081  min_lr: 0.000081  loss: 2.0409 (2.6189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0151 (1.0342)  time: 0.1872  data: 0.0006  max mem: 12911
Epoch: [274]  [1000/1251]  eta: 0:00:47  lr: 0.000080  min_lr: 0.000080  loss: 2.1935 (2.6097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0355)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [274]  [1200/1251]  eta: 0:00:09  lr: 0.000079  min_lr: 0.000079  loss: 2.1236 (2.6081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0528 (1.0503)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.1592 (2.6045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0241 (1.0494)  time: 0.1472  data: 0.0006  max mem: 12911
Epoch: [274] Total time: 0:03:57 (0.1898 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.1592 (2.5901)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0241 (1.0494)
Test:  [ 0/25]  eta: 0:01:45  loss: 0.5408 (0.5408)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 4.2150  data: 4.1138  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7227 (0.7368)  acc1: 83.2000 (84.2909)  acc5: 97.2000 (96.9091)  time: 0.6344  data: 0.5513  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9443 (0.9008)  acc1: 77.6000 (80.1905)  acc5: 95.2000 (95.3143)  time: 0.2268  data: 0.1464  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0095 (0.9120)  acc1: 76.8000 (79.7280)  acc5: 94.8000 (95.1360)  time: 0.2151  data: 0.1341  max mem: 12911
Test: Total time: 0:00:10 (0.4015 s / it)
* Acc@1 80.004 Acc@5 95.000 loss 0.914
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.15%
Epoch: [275]  [   0/1251]  eta: 1:06:21  lr: 0.000079  min_lr: 0.000079  loss: 1.7617 (1.7617)  weight_decay: 0.0500 (0.0500)  time: 3.1830  data: 2.5622  max mem: 12911
Epoch: [275]  [ 200/1251]  eta: 0:03:34  lr: 0.000078  min_lr: 0.000078  loss: 2.1624 (2.5904)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0419 (1.0292)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [275]  [ 400/1251]  eta: 0:02:47  lr: 0.000077  min_lr: 0.000077  loss: 2.0842 (2.5695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9500 (1.0233)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [275]  [ 600/1251]  eta: 0:02:06  lr: 0.000076  min_lr: 0.000076  loss: 2.6296 (2.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.1891  data: 0.0005  max mem: 12911
Epoch: [275]  [ 800/1251]  eta: 0:01:26  lr: 0.000075  min_lr: 0.000075  loss: 2.1714 (2.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (nan)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [275]  [1000/1251]  eta: 0:00:48  lr: 0.000074  min_lr: 0.000074  loss: 2.2426 (2.5809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9687 (nan)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [275]  [1200/1251]  eta: 0:00:09  lr: 0.000073  min_lr: 0.000073  loss: 2.3413 (2.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0462 (nan)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.8686 (2.5883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9629 (nan)  time: 0.1460  data: 0.0006  max mem: 12911
Epoch: [275] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.8686 (2.5917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9629 (nan)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6251 (0.6251)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.3220  data: 5.1775  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7888 (0.8005)  acc1: 84.0000 (84.1818)  acc5: 97.2000 (96.9818)  time: 0.7189  data: 0.6172  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9927 (0.9668)  acc1: 78.0000 (80.2857)  acc5: 94.8000 (95.3714)  time: 0.2063  data: 0.1178  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0997 (0.9785)  acc1: 77.6000 (79.7920)  acc5: 94.8000 (95.2000)  time: 0.2146  data: 0.1299  max mem: 12911
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 79.988 Acc@5 95.036 loss 0.981
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.15%
Epoch: [276]  [   0/1251]  eta: 1:09:26  lr: 0.000073  min_lr: 0.000073  loss: 1.9377 (1.9377)  weight_decay: 0.0500 (0.0500)  time: 3.3308  data: 2.3291  max mem: 12911
Epoch: [276]  [ 200/1251]  eta: 0:03:33  lr: 0.000072  min_lr: 0.000072  loss: 2.2909 (2.4793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9549 (1.0431)  time: 0.1860  data: 0.0004  max mem: 12911
Epoch: [276]  [ 400/1251]  eta: 0:02:45  lr: 0.000071  min_lr: 0.000071  loss: 2.1426 (2.5163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9929 (1.0436)  time: 0.1866  data: 0.0004  max mem: 12911
Epoch: [276]  [ 600/1251]  eta: 0:02:05  lr: 0.000070  min_lr: 0.000070  loss: 2.1355 (2.5393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9312 (1.0352)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [276]  [ 800/1251]  eta: 0:01:26  lr: 0.000069  min_lr: 0.000069  loss: 1.9861 (2.5257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0889 (1.0418)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [276]  [1000/1251]  eta: 0:00:47  lr: 0.000068  min_lr: 0.000068  loss: 3.0101 (2.5565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0826 (1.0432)  time: 0.1901  data: 0.0005  max mem: 12911
Epoch: [276]  [1200/1251]  eta: 0:00:09  lr: 0.000067  min_lr: 0.000067  loss: 1.9723 (2.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0421 (1.0405)  time: 0.1844  data: 0.0003  max mem: 12911
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.0749 (2.5473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0461 (1.0410)  time: 0.1469  data: 0.0008  max mem: 12911
Epoch: [276] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.0749 (2.5602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0461 (1.0410)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.5686 (0.5686)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 3.2753  data: 3.1770  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7495 (0.7564)  acc1: 83.2000 (84.0364)  acc5: 97.2000 (97.2000)  time: 0.6330  data: 0.5378  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9616 (0.9240)  acc1: 77.6000 (80.3810)  acc5: 95.2000 (95.3333)  time: 0.2860  data: 0.1981  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0362 (0.9355)  acc1: 77.2000 (79.9200)  acc5: 94.8000 (95.1840)  time: 0.2369  data: 0.1515  max mem: 12911
Test: Total time: 0:00:10 (0.4064 s / it)
* Acc@1 80.142 Acc@5 95.100 loss 0.936
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [277]  [   0/1251]  eta: 1:08:19  lr: 0.000067  min_lr: 0.000067  loss: 4.0848 (4.0848)  weight_decay: 0.0500 (0.0500)  time: 3.2768  data: 1.7365  max mem: 12911
Epoch: [277]  [ 200/1251]  eta: 0:03:34  lr: 0.000066  min_lr: 0.000066  loss: 2.7434 (2.6004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0359 (1.0721)  time: 0.1924  data: 0.0004  max mem: 12911
Epoch: [277]  [ 400/1251]  eta: 0:02:46  lr: 0.000065  min_lr: 0.000065  loss: 2.0813 (2.6096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9791 (1.0597)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [277]  [ 600/1251]  eta: 0:02:06  lr: 0.000064  min_lr: 0.000064  loss: 1.9984 (2.5821)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0067 (1.0525)  time: 0.1879  data: 0.0004  max mem: 12911
Epoch: [277]  [ 800/1251]  eta: 0:01:26  lr: 0.000064  min_lr: 0.000064  loss: 2.0704 (2.5814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0287 (1.0502)  time: 0.1880  data: 0.0006  max mem: 12911
Epoch: [277]  [1000/1251]  eta: 0:00:47  lr: 0.000063  min_lr: 0.000063  loss: 2.7094 (2.5940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9859 (1.0462)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [277]  [1200/1251]  eta: 0:00:09  lr: 0.000062  min_lr: 0.000062  loss: 2.0084 (2.5979)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0253 (1.0486)  time: 0.1922  data: 0.0004  max mem: 12911
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.5198 (2.5960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9672 (1.0459)  time: 0.1474  data: 0.0007  max mem: 12911
Epoch: [277] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.5198 (2.5711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9672 (1.0459)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6083 (0.6083)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.3835  data: 5.2881  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7800 (0.7854)  acc1: 83.2000 (83.7818)  acc5: 96.8000 (97.0182)  time: 0.7314  data: 0.6386  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9917 (0.9474)  acc1: 77.6000 (79.9238)  acc5: 94.8000 (95.2381)  time: 0.2051  data: 0.1182  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0679 (0.9581)  acc1: 77.2000 (79.5200)  acc5: 94.4000 (95.1360)  time: 0.2095  data: 0.1234  max mem: 12911
Test: Total time: 0:00:10 (0.4006 s / it)
* Acc@1 80.078 Acc@5 95.050 loss 0.961
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [278]  [   0/1251]  eta: 1:00:36  lr: 0.000062  min_lr: 0.000062  loss: 1.8389 (1.8389)  weight_decay: 0.0500 (0.0500)  time: 2.9065  data: 1.6234  max mem: 12911
Epoch: [278]  [ 200/1251]  eta: 0:03:35  lr: 0.000061  min_lr: 0.000061  loss: 2.1665 (2.5848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9683 (1.0180)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [278]  [ 400/1251]  eta: 0:02:47  lr: 0.000060  min_lr: 0.000060  loss: 2.5544 (2.5678)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0495 (1.0307)  time: 0.1897  data: 0.0005  max mem: 12911
Epoch: [278]  [ 600/1251]  eta: 0:02:06  lr: 0.000059  min_lr: 0.000059  loss: 2.1185 (2.5945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0010 (1.0206)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [278]  [ 800/1251]  eta: 0:01:27  lr: 0.000058  min_lr: 0.000058  loss: 3.0511 (2.6031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9515 (1.0233)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [278]  [1000/1251]  eta: 0:00:48  lr: 0.000057  min_lr: 0.000057  loss: 2.0637 (2.5980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0358 (nan)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [278]  [1200/1251]  eta: 0:00:09  lr: 0.000056  min_lr: 0.000056  loss: 2.1700 (2.5854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0921 (nan)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.0389 (2.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9961 (nan)  time: 0.1463  data: 0.0008  max mem: 12911
Epoch: [278] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.0389 (2.5704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9961 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5733 (0.5733)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.5107  data: 5.4191  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7616 (0.7609)  acc1: 84.0000 (84.1455)  acc5: 97.2000 (97.0909)  time: 0.7558  data: 0.6613  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9605 (0.9235)  acc1: 78.4000 (80.4191)  acc5: 95.2000 (95.3143)  time: 0.2146  data: 0.1260  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0283 (0.9348)  acc1: 77.6000 (79.9040)  acc5: 94.4000 (95.1520)  time: 0.2129  data: 0.1260  max mem: 12911
Test: Total time: 0:00:10 (0.4087 s / it)
* Acc@1 80.082 Acc@5 95.066 loss 0.936
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [279]  [   0/1251]  eta: 1:06:00  lr: 0.000056  min_lr: 0.000056  loss: 3.1121 (3.1121)  weight_decay: 0.0500 (0.0500)  time: 3.1662  data: 2.7520  max mem: 12911
Epoch: [279]  [ 200/1251]  eta: 0:03:36  lr: 0.000055  min_lr: 0.000055  loss: 3.3029 (2.5138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0243 (1.0608)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [279]  [ 400/1251]  eta: 0:02:47  lr: 0.000055  min_lr: 0.000055  loss: 2.2786 (2.5259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1277 (1.0673)  time: 0.1861  data: 0.0005  max mem: 12911
Epoch: [279]  [ 600/1251]  eta: 0:02:06  lr: 0.000054  min_lr: 0.000054  loss: 2.0119 (2.5236)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1170 (1.0737)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [279]  [ 800/1251]  eta: 0:01:26  lr: 0.000053  min_lr: 0.000053  loss: 3.1465 (2.5386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (1.0705)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [279]  [1000/1251]  eta: 0:00:48  lr: 0.000052  min_lr: 0.000052  loss: 3.0114 (2.5423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9996 (1.0629)  time: 0.1917  data: 0.0004  max mem: 12911
Epoch: [279]  [1200/1251]  eta: 0:00:09  lr: 0.000051  min_lr: 0.000051  loss: 2.4855 (2.5543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9820 (1.0626)  time: 0.1955  data: 0.0005  max mem: 12911
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 1.9503 (2.5502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9786 (1.0597)  time: 0.1468  data: 0.0010  max mem: 12911
Epoch: [279] Total time: 0:04:00 (0.1922 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 1.9503 (2.5787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9786 (1.0597)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5207 (0.5207)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6836  data: 5.5563  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7000 (0.7154)  acc1: 82.8000 (84.3636)  acc5: 96.8000 (97.0182)  time: 0.7697  data: 0.6714  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9199 (0.8776)  acc1: 78.0000 (80.4191)  acc5: 95.2000 (95.3714)  time: 0.2106  data: 0.1231  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9921 (0.8882)  acc1: 78.0000 (79.9360)  acc5: 94.8000 (95.1680)  time: 0.2066  data: 0.1231  max mem: 12911
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 80.160 Acc@5 95.032 loss 0.890
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.16%
Epoch: [280]  [   0/1251]  eta: 1:02:06  lr: 0.000051  min_lr: 0.000051  loss: 1.7104 (1.7104)  weight_decay: 0.0500 (0.0500)  time: 2.9791  data: 2.7117  max mem: 12911
Epoch: [280]  [ 200/1251]  eta: 0:03:34  lr: 0.000050  min_lr: 0.000050  loss: 2.0619 (2.6160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (1.0485)  time: 0.1961  data: 0.0004  max mem: 12911
Epoch: [280]  [ 400/1251]  eta: 0:02:47  lr: 0.000050  min_lr: 0.000050  loss: 1.9953 (2.5618)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.0540)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [280]  [ 600/1251]  eta: 0:02:05  lr: 0.000049  min_lr: 0.000049  loss: 2.0294 (2.5837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0307 (1.0614)  time: 0.1852  data: 0.0005  max mem: 12911
Epoch: [280]  [ 800/1251]  eta: 0:01:26  lr: 0.000048  min_lr: 0.000048  loss: 1.9910 (2.5898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9926 (1.0556)  time: 0.1839  data: 0.0005  max mem: 12911
Epoch: [280]  [1000/1251]  eta: 0:00:47  lr: 0.000047  min_lr: 0.000047  loss: 2.2357 (2.5723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0323 (1.0535)  time: 0.1865  data: 0.0005  max mem: 12911
Epoch: [280]  [1200/1251]  eta: 0:00:09  lr: 0.000046  min_lr: 0.000046  loss: 2.9625 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9682 (1.0482)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 1.9309 (2.5648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0326 (1.0494)  time: 0.1472  data: 0.0010  max mem: 12911
Epoch: [280] Total time: 0:03:58 (0.1905 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 1.9309 (2.5836)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0326 (1.0494)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5505 (0.5505)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.6612  data: 5.5695  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7332 (0.7411)  acc1: 83.2000 (84.3636)  acc5: 97.2000 (97.0909)  time: 0.7800  data: 0.6833  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9454 (0.9013)  acc1: 78.8000 (80.3429)  acc5: 95.6000 (95.2762)  time: 0.2288  data: 0.1389  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9957 (0.9130)  acc1: 77.2000 (79.9520)  acc5: 94.4000 (95.1520)  time: 0.2259  data: 0.1388  max mem: 12911
Test: Total time: 0:00:10 (0.4257 s / it)
* Acc@1 80.096 Acc@5 95.046 loss 0.915
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.16%
Epoch: [281]  [   0/1251]  eta: 1:09:02  lr: 0.000046  min_lr: 0.000046  loss: 4.3086 (4.3086)  weight_decay: 0.0500 (0.0500)  time: 3.3113  data: 2.5939  max mem: 12911
Epoch: [281]  [ 200/1251]  eta: 0:03:32  lr: 0.000046  min_lr: 0.000046  loss: 1.9981 (2.4285)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0067 (1.0260)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [281]  [ 400/1251]  eta: 0:02:45  lr: 0.000045  min_lr: 0.000045  loss: 2.0918 (2.4834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1093 (1.0505)  time: 0.1836  data: 0.0005  max mem: 12911
Epoch: [281]  [ 600/1251]  eta: 0:02:04  lr: 0.000044  min_lr: 0.000044  loss: 1.9019 (2.4900)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0158 (1.0423)  time: 0.1892  data: 0.0004  max mem: 12911
Epoch: [281]  [ 800/1251]  eta: 0:01:26  lr: 0.000043  min_lr: 0.000043  loss: 1.9501 (2.5157)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0720 (1.0454)  time: 0.1908  data: 0.0005  max mem: 12911
Epoch: [281]  [1000/1251]  eta: 0:00:47  lr: 0.000043  min_lr: 0.000043  loss: 2.0450 (2.5161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9712 (1.0441)  time: 0.1914  data: 0.0005  max mem: 12911
Epoch: [281]  [1200/1251]  eta: 0:00:09  lr: 0.000042  min_lr: 0.000042  loss: 2.1185 (2.5426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9598 (1.0418)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.1785 (2.5403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.0437)  time: 0.1465  data: 0.0010  max mem: 12911
Epoch: [281] Total time: 0:03:58 (0.1907 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.1785 (2.5651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.0437)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.5678 (0.5678)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 3.3503  data: 3.2586  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 0.7582 (0.7552)  acc1: 84.0000 (84.2546)  acc5: 97.2000 (97.0546)  time: 0.5885  data: 0.5001  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9613 (0.9179)  acc1: 78.8000 (80.3619)  acc5: 95.6000 (95.3333)  time: 0.2849  data: 0.1988  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0324 (0.9301)  acc1: 77.2000 (79.9360)  acc5: 94.0000 (95.1520)  time: 0.2132  data: 0.1304  max mem: 12911
Test: Total time: 0:00:09 (0.3988 s / it)
* Acc@1 80.120 Acc@5 95.074 loss 0.930
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.16%
Epoch: [282]  [   0/1251]  eta: 1:08:05  lr: 0.000042  min_lr: 0.000042  loss: 4.2736 (4.2736)  weight_decay: 0.0500 (0.0500)  time: 3.2659  data: 2.2452  max mem: 12911
Epoch: [282]  [ 200/1251]  eta: 0:03:33  lr: 0.000041  min_lr: 0.000041  loss: 2.0109 (2.4859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0882 (1.0383)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [282]  [ 400/1251]  eta: 0:02:46  lr: 0.000040  min_lr: 0.000040  loss: 1.9874 (2.4812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9532 (1.0079)  time: 0.1906  data: 0.0004  max mem: 12911
Epoch: [282]  [ 600/1251]  eta: 0:02:05  lr: 0.000040  min_lr: 0.000040  loss: 2.5482 (2.4973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9876 (1.0087)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [282]  [ 800/1251]  eta: 0:01:26  lr: 0.000039  min_lr: 0.000039  loss: 2.6566 (2.5179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9532 (1.0359)  time: 0.1904  data: 0.0005  max mem: 12911
Epoch: [282]  [1000/1251]  eta: 0:00:47  lr: 0.000038  min_lr: 0.000038  loss: 1.9668 (2.5208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9931 (1.0403)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [282]  [1200/1251]  eta: 0:00:09  lr: 0.000037  min_lr: 0.000037  loss: 2.1139 (2.5385)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0244 (1.0400)  time: 0.1878  data: 0.0004  max mem: 12911
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.0503 (2.5391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0276 (1.0386)  time: 0.1470  data: 0.0008  max mem: 12911
Epoch: [282] Total time: 0:03:57 (0.1902 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.0503 (2.5662)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0276 (1.0386)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5639 (0.5639)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.5285  data: 5.4283  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7353 (0.7423)  acc1: 84.4000 (84.4364)  acc5: 97.2000 (96.9091)  time: 0.7058  data: 0.6080  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9466 (0.9035)  acc1: 78.4000 (80.3238)  acc5: 95.2000 (95.2571)  time: 0.1956  data: 0.1070  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0110 (0.9138)  acc1: 76.8000 (79.9200)  acc5: 94.4000 (95.1680)  time: 0.1967  data: 0.1117  max mem: 12911
Test: Total time: 0:00:09 (0.4000 s / it)
* Acc@1 80.218 Acc@5 95.082 loss 0.915
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.22%
Epoch: [283]  [   0/1251]  eta: 0:53:19  lr: 0.000037  min_lr: 0.000037  loss: 3.3937 (3.3937)  weight_decay: 0.0500 (0.0500)  time: 2.5579  data: 2.2936  max mem: 12911
Epoch: [283]  [ 200/1251]  eta: 0:03:31  lr: 0.000037  min_lr: 0.000037  loss: 2.1132 (2.5257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9944 (1.0127)  time: 0.1953  data: 0.0005  max mem: 12911
Epoch: [283]  [ 400/1251]  eta: 0:02:46  lr: 0.000036  min_lr: 0.000036  loss: 2.3887 (2.5451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9968 (1.0258)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [283]  [ 600/1251]  eta: 0:02:05  lr: 0.000035  min_lr: 0.000035  loss: 3.3138 (2.5548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9889 (1.0147)  time: 0.1872  data: 0.0004  max mem: 12911
Epoch: [283]  [ 800/1251]  eta: 0:01:26  lr: 0.000035  min_lr: 0.000035  loss: 2.1312 (2.5466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0438 (1.0274)  time: 0.1922  data: 0.0005  max mem: 12911
Epoch: [283]  [1000/1251]  eta: 0:00:48  lr: 0.000034  min_lr: 0.000034  loss: 1.9643 (2.5540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0348 (1.0366)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [283]  [1200/1251]  eta: 0:00:09  lr: 0.000033  min_lr: 0.000033  loss: 1.9426 (2.5548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (1.0399)  time: 0.1868  data: 0.0005  max mem: 12911
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 1.9360 (2.5522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (1.0390)  time: 0.1464  data: 0.0008  max mem: 12911
Epoch: [283] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 1.9360 (2.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (1.0390)
Test:  [ 0/25]  eta: 0:01:20  loss: 0.5359 (0.5359)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.2276  data: 3.1361  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7114 (0.7234)  acc1: 82.8000 (84.0000)  acc5: 96.8000 (96.9091)  time: 0.6049  data: 0.5176  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9327 (0.8802)  acc1: 78.0000 (80.4381)  acc5: 94.8000 (95.3143)  time: 0.2765  data: 0.1932  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9717 (0.8917)  acc1: 77.6000 (80.0160)  acc5: 94.4000 (95.1200)  time: 0.2103  data: 0.1307  max mem: 12911
Test: Total time: 0:00:09 (0.3993 s / it)
* Acc@1 80.228 Acc@5 95.098 loss 0.891
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.23%
Epoch: [284]  [   0/1251]  eta: 1:02:54  lr: 0.000033  min_lr: 0.000033  loss: 2.4082 (2.4082)  weight_decay: 0.0500 (0.0500)  time: 3.0171  data: 2.7260  max mem: 12911
Epoch: [284]  [ 200/1251]  eta: 0:03:32  lr: 0.000032  min_lr: 0.000032  loss: 1.9416 (2.5622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9894 (1.0715)  time: 0.1868  data: 0.0003  max mem: 12911
Epoch: [284]  [ 400/1251]  eta: 0:02:46  lr: 0.000032  min_lr: 0.000032  loss: 2.0402 (2.5767)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0262 (1.0498)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [284]  [ 600/1251]  eta: 0:02:05  lr: 0.000031  min_lr: 0.000031  loss: 2.9164 (2.5924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (1.0463)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [284]  [ 800/1251]  eta: 0:01:26  lr: 0.000031  min_lr: 0.000031  loss: 1.9824 (2.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0546 (1.0410)  time: 0.1842  data: 0.0004  max mem: 12911
Epoch: [284]  [1000/1251]  eta: 0:00:48  lr: 0.000030  min_lr: 0.000030  loss: 2.0916 (2.5894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9595 (1.0415)  time: 0.1868  data: 0.0004  max mem: 12911
Epoch: [284]  [1200/1251]  eta: 0:00:09  lr: 0.000029  min_lr: 0.000029  loss: 1.9025 (2.5906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9834 (1.0357)  time: 0.1918  data: 0.0004  max mem: 12911
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 1.9844 (2.5873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (1.0343)  time: 0.1462  data: 0.0010  max mem: 12911
Epoch: [284] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 1.9844 (2.5630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (1.0343)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5805 (0.5805)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.7203  data: 5.6286  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7644 (0.7597)  acc1: 84.0000 (84.2182)  acc5: 97.2000 (97.0546)  time: 0.7630  data: 0.6677  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9658 (0.9216)  acc1: 78.4000 (80.2857)  acc5: 94.8000 (95.2191)  time: 0.2148  data: 0.1273  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0296 (0.9320)  acc1: 77.6000 (79.9200)  acc5: 94.4000 (95.1040)  time: 0.2117  data: 0.1272  max mem: 12911
Test: Total time: 0:00:10 (0.4166 s / it)
* Acc@1 80.244 Acc@5 95.088 loss 0.931
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [285]  [   0/1251]  eta: 0:59:55  lr: 0.000029  min_lr: 0.000029  loss: 1.7569 (1.7569)  weight_decay: 0.0500 (0.0500)  time: 2.8744  data: 2.6294  max mem: 12911
Epoch: [285]  [ 200/1251]  eta: 0:03:33  lr: 0.000029  min_lr: 0.000029  loss: 3.3107 (2.6046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9615 (1.0397)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [285]  [ 400/1251]  eta: 0:02:46  lr: 0.000028  min_lr: 0.000028  loss: 2.1472 (2.5597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0330 (1.0540)  time: 0.1880  data: 0.0008  max mem: 12911
Epoch: [285]  [ 600/1251]  eta: 0:02:05  lr: 0.000027  min_lr: 0.000027  loss: 1.9915 (2.5255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (1.0362)  time: 0.1875  data: 0.0005  max mem: 12911
Epoch: [285]  [ 800/1251]  eta: 0:01:26  lr: 0.000027  min_lr: 0.000027  loss: 2.0689 (2.5261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0047 (1.0335)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [285]  [1000/1251]  eta: 0:00:47  lr: 0.000026  min_lr: 0.000026  loss: 1.9880 (2.5297)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0248 (1.0334)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [285]  [1200/1251]  eta: 0:00:09  lr: 0.000026  min_lr: 0.000026  loss: 2.3193 (2.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9637 (1.0292)  time: 0.1969  data: 0.0004  max mem: 12911
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.3605 (2.5449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0551 (1.0314)  time: 0.1472  data: 0.0007  max mem: 12911
Epoch: [285] Total time: 0:03:58 (0.1909 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.3605 (2.5563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0551 (1.0314)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5677 (0.5677)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.4623  data: 5.3707  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7550 (0.7559)  acc1: 84.0000 (84.1091)  acc5: 97.2000 (96.9455)  time: 0.7651  data: 0.6682  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9629 (0.9169)  acc1: 78.0000 (80.3048)  acc5: 94.8000 (95.2762)  time: 0.2152  data: 0.1241  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0176 (0.9273)  acc1: 77.2000 (79.9680)  acc5: 94.4000 (95.1200)  time: 0.2116  data: 0.1240  max mem: 12911
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 80.240 Acc@5 95.102 loss 0.927
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [286]  [   0/1251]  eta: 1:04:30  lr: 0.000026  min_lr: 0.000026  loss: 1.9553 (1.9553)  weight_decay: 0.0500 (0.0500)  time: 3.0941  data: 1.9242  max mem: 12911
Epoch: [286]  [ 200/1251]  eta: 0:03:36  lr: 0.000025  min_lr: 0.000025  loss: 2.1462 (2.5980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9886 (1.0241)  time: 0.1854  data: 0.0004  max mem: 12911
Epoch: [286]  [ 400/1251]  eta: 0:02:47  lr: 0.000025  min_lr: 0.000025  loss: 2.0132 (2.5618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9586 (1.0121)  time: 0.1877  data: 0.0005  max mem: 12911
Epoch: [286]  [ 600/1251]  eta: 0:02:05  lr: 0.000024  min_lr: 0.000024  loss: 1.8646 (2.5296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0193 (1.0117)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [286]  [ 800/1251]  eta: 0:01:26  lr: 0.000023  min_lr: 0.000023  loss: 2.2447 (2.5224)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0130 (1.0235)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [286]  [1000/1251]  eta: 0:00:47  lr: 0.000023  min_lr: 0.000023  loss: 1.9536 (2.5331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0546 (1.0339)  time: 0.1880  data: 0.0006  max mem: 12911
Epoch: [286]  [1200/1251]  eta: 0:00:09  lr: 0.000022  min_lr: 0.000022  loss: 1.9410 (2.5489)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0050 (1.0370)  time: 0.1887  data: 0.0004  max mem: 12911
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.1386 (2.5514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0260 (1.0368)  time: 0.1481  data: 0.0006  max mem: 12911
Epoch: [286] Total time: 0:03:59 (0.1911 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.1386 (2.5610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0260 (1.0368)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5711 (0.5711)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 5.5605  data: 5.4689  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7673 (0.7609)  acc1: 83.2000 (84.4000)  acc5: 96.8000 (96.9455)  time: 0.7074  data: 0.6112  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9606 (0.9222)  acc1: 79.2000 (80.3429)  acc5: 95.2000 (95.2381)  time: 0.1924  data: 0.1038  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0443 (0.9328)  acc1: 77.6000 (79.9520)  acc5: 94.8000 (95.1200)  time: 0.2109  data: 0.1259  max mem: 12911
Test: Total time: 0:00:10 (0.4099 s / it)
* Acc@1 80.134 Acc@5 95.120 loss 0.933
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.24%
Epoch: [287]  [   0/1251]  eta: 1:03:43  lr: 0.000022  min_lr: 0.000022  loss: 1.7780 (1.7780)  weight_decay: 0.0500 (0.0500)  time: 3.0567  data: 1.5229  max mem: 12911
Epoch: [287]  [ 200/1251]  eta: 0:03:37  lr: 0.000022  min_lr: 0.000022  loss: 2.0497 (2.5485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9761 (1.0280)  time: 0.1881  data: 0.0005  max mem: 12911
Epoch: [287]  [ 400/1251]  eta: 0:02:48  lr: 0.000021  min_lr: 0.000021  loss: 2.0167 (2.5444)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0093 (1.0300)  time: 0.1950  data: 0.0005  max mem: 12911
Epoch: [287]  [ 600/1251]  eta: 0:02:06  lr: 0.000021  min_lr: 0.000021  loss: 2.8193 (2.5571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0259 (1.0294)  time: 0.1843  data: 0.0004  max mem: 12911
Epoch: [287]  [ 800/1251]  eta: 0:01:26  lr: 0.000020  min_lr: 0.000020  loss: 2.0311 (2.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0035 (1.0249)  time: 0.1900  data: 0.0005  max mem: 12911
Epoch: [287]  [1000/1251]  eta: 0:00:48  lr: 0.000020  min_lr: 0.000020  loss: 2.0514 (2.5575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0167 (1.0307)  time: 0.1851  data: 0.0005  max mem: 12911
Epoch: [287]  [1200/1251]  eta: 0:00:09  lr: 0.000019  min_lr: 0.000019  loss: 2.0239 (2.5395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9980 (1.0297)  time: 0.1879  data: 0.0005  max mem: 12911
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.0130 (2.5409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0115 (1.0331)  time: 0.1465  data: 0.0008  max mem: 12911
Epoch: [287] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.0130 (2.5685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0115 (1.0331)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5675 (0.5675)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.4331  data: 5.3326  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7357 (0.7454)  acc1: 82.8000 (84.2545)  acc5: 97.2000 (96.9818)  time: 0.7234  data: 0.6297  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9429 (0.9043)  acc1: 78.4000 (80.4191)  acc5: 95.2000 (95.2952)  time: 0.2082  data: 0.1213  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0141 (0.9153)  acc1: 77.2000 (79.9840)  acc5: 94.8000 (95.1360)  time: 0.2063  data: 0.1212  max mem: 12911
Test: Total time: 0:00:10 (0.4026 s / it)
* Acc@1 80.212 Acc@5 95.150 loss 0.914
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [288]  [   0/1251]  eta: 1:03:12  lr: 0.000019  min_lr: 0.000019  loss: 2.8103 (2.8103)  weight_decay: 0.0500 (0.0500)  time: 3.0317  data: 2.5957  max mem: 12911
Epoch: [288]  [ 200/1251]  eta: 0:03:34  lr: 0.000019  min_lr: 0.000019  loss: 1.9362 (2.6112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0269 (1.0433)  time: 0.1913  data: 0.0004  max mem: 12911
Epoch: [288]  [ 400/1251]  eta: 0:02:47  lr: 0.000018  min_lr: 0.000018  loss: 2.2515 (2.6285)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0098 (1.0586)  time: 0.1845  data: 0.0005  max mem: 12911
Epoch: [288]  [ 600/1251]  eta: 0:02:05  lr: 0.000018  min_lr: 0.000018  loss: 2.6391 (2.5853)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0229 (nan)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [288]  [ 800/1251]  eta: 0:01:26  lr: 0.000017  min_lr: 0.000017  loss: 2.1194 (2.6011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9863 (nan)  time: 0.1847  data: 0.0005  max mem: 12911
Epoch: [288]  [1000/1251]  eta: 0:00:47  lr: 0.000017  min_lr: 0.000017  loss: 1.8977 (2.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0481 (nan)  time: 0.1865  data: 0.0004  max mem: 12911
Epoch: [288]  [1200/1251]  eta: 0:00:09  lr: 0.000016  min_lr: 0.000016  loss: 1.9290 (2.5967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9759 (nan)  time: 0.1839  data: 0.0004  max mem: 12911
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.1631 (2.5956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0039 (nan)  time: 0.1467  data: 0.0011  max mem: 12911
Epoch: [288] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.1631 (2.5674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0039 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5656 (0.5656)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5578  data: 5.4297  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7520 (0.7561)  acc1: 82.8000 (84.3273)  acc5: 97.2000 (97.1636)  time: 0.6757  data: 0.5898  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9556 (0.9172)  acc1: 78.8000 (80.4191)  acc5: 95.2000 (95.4476)  time: 0.1840  data: 0.1034  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0207 (0.9283)  acc1: 78.0000 (79.9840)  acc5: 94.0000 (95.2800)  time: 0.2214  data: 0.1419  max mem: 12911
Test: Total time: 0:00:10 (0.4164 s / it)
* Acc@1 80.202 Acc@5 95.148 loss 0.928
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [289]  [   0/1251]  eta: 1:05:55  lr: 0.000016  min_lr: 0.000016  loss: 2.8287 (2.8287)  weight_decay: 0.0500 (0.0500)  time: 3.1617  data: 2.4984  max mem: 12911
Epoch: [289]  [ 200/1251]  eta: 0:03:34  lr: 0.000016  min_lr: 0.000016  loss: 2.8031 (2.6265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9864 (1.0125)  time: 0.1873  data: 0.0004  max mem: 12911
Epoch: [289]  [ 400/1251]  eta: 0:02:46  lr: 0.000015  min_lr: 0.000015  loss: 2.4655 (2.6212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9866 (1.0141)  time: 0.1880  data: 0.0006  max mem: 12911
Epoch: [289]  [ 600/1251]  eta: 0:02:05  lr: 0.000015  min_lr: 0.000015  loss: 1.9201 (2.5903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0054 (1.0208)  time: 0.1927  data: 0.0004  max mem: 12911
Epoch: [289]  [ 800/1251]  eta: 0:01:26  lr: 0.000014  min_lr: 0.000014  loss: 2.3730 (2.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0188 (1.0242)  time: 0.1872  data: 0.0005  max mem: 12911
Epoch: [289]  [1000/1251]  eta: 0:00:47  lr: 0.000014  min_lr: 0.000014  loss: 2.1565 (2.5669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0508 (1.0230)  time: 0.1853  data: 0.0004  max mem: 12911
Epoch: [289]  [1200/1251]  eta: 0:00:09  lr: 0.000014  min_lr: 0.000014  loss: 2.1462 (2.5681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0403 (1.0236)  time: 0.1889  data: 0.0005  max mem: 12911
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 1.9482 (2.5690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9788 (1.0237)  time: 0.1476  data: 0.0008  max mem: 12911
Epoch: [289] Total time: 0:03:57 (0.1900 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 1.9482 (2.5539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9788 (1.0237)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5396 (0.5396)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4672  data: 5.3685  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7196 (0.7326)  acc1: 83.6000 (84.1455)  acc5: 96.8000 (96.9818)  time: 0.7347  data: 0.6371  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9429 (0.8919)  acc1: 78.8000 (80.3048)  acc5: 95.2000 (95.1619)  time: 0.2089  data: 0.1199  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9912 (0.9026)  acc1: 76.8000 (79.9520)  acc5: 94.4000 (95.0240)  time: 0.2233  data: 0.1382  max mem: 12911
Test: Total time: 0:00:10 (0.4162 s / it)
* Acc@1 80.212 Acc@5 95.138 loss 0.901
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [290]  [   0/1251]  eta: 1:09:59  lr: 0.000014  min_lr: 0.000014  loss: 3.6269 (3.6269)  weight_decay: 0.0500 (0.0500)  time: 3.3572  data: 2.4822  max mem: 12911
Epoch: [290]  [ 200/1251]  eta: 0:03:33  lr: 0.000013  min_lr: 0.000013  loss: 1.8892 (2.5492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9838 (0.9993)  time: 0.1886  data: 0.0004  max mem: 12911
Epoch: [290]  [ 400/1251]  eta: 0:02:47  lr: 0.000013  min_lr: 0.000013  loss: 2.0762 (2.5385)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0372 (1.0257)  time: 0.1908  data: 0.0004  max mem: 12911
Epoch: [290]  [ 600/1251]  eta: 0:02:06  lr: 0.000012  min_lr: 0.000012  loss: 2.8056 (2.5310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (1.0217)  time: 0.1870  data: 0.0004  max mem: 12911
Epoch: [290]  [ 800/1251]  eta: 0:01:26  lr: 0.000012  min_lr: 0.000012  loss: 2.1830 (2.5278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0239 (1.0238)  time: 0.1858  data: 0.0005  max mem: 12911
Epoch: [290]  [1000/1251]  eta: 0:00:48  lr: 0.000012  min_lr: 0.000012  loss: 2.3986 (2.5372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0257 (1.0268)  time: 0.1902  data: 0.0005  max mem: 12911
Epoch: [290]  [1200/1251]  eta: 0:00:09  lr: 0.000011  min_lr: 0.000011  loss: 1.8620 (2.5212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9936 (1.0273)  time: 0.1912  data: 0.0005  max mem: 12911
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.0346 (2.5208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9788 (1.0260)  time: 0.1466  data: 0.0009  max mem: 12911
Epoch: [290] Total time: 0:03:59 (0.1918 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.0346 (2.5605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9788 (1.0260)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5195 (0.5195)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5676  data: 5.4759  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7028 (0.7123)  acc1: 83.6000 (83.9636)  acc5: 97.2000 (97.0182)  time: 0.7068  data: 0.6100  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9192 (0.8727)  acc1: 77.6000 (80.2476)  acc5: 95.2000 (95.3143)  time: 0.1916  data: 0.1031  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9769 (0.8829)  acc1: 77.6000 (79.8720)  acc5: 94.8000 (95.1680)  time: 0.1923  data: 0.1078  max mem: 12911
Test: Total time: 0:00:09 (0.3988 s / it)
* Acc@1 80.214 Acc@5 95.134 loss 0.883
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [291]  [   0/1251]  eta: 1:06:46  lr: 0.000011  min_lr: 0.000011  loss: 2.8730 (2.8730)  weight_decay: 0.0500 (0.0500)  time: 3.2030  data: 1.5320  max mem: 12911
Epoch: [291]  [ 200/1251]  eta: 0:03:37  lr: 0.000011  min_lr: 0.000011  loss: 1.8834 (2.5690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9338 (1.0266)  time: 0.1888  data: 0.0006  max mem: 12911
Epoch: [291]  [ 400/1251]  eta: 0:02:47  lr: 0.000010  min_lr: 0.000010  loss: 2.0590 (2.5635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9813 (1.0316)  time: 0.1876  data: 0.0004  max mem: 12911
Epoch: [291]  [ 600/1251]  eta: 0:02:06  lr: 0.000010  min_lr: 0.000010  loss: 2.6053 (2.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0490 (1.0706)  time: 0.1846  data: 0.0005  max mem: 12911
Epoch: [291]  [ 800/1251]  eta: 0:01:26  lr: 0.000010  min_lr: 0.000010  loss: 2.0350 (2.5971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0209 (1.0607)  time: 0.1852  data: 0.0004  max mem: 12911
Epoch: [291]  [1000/1251]  eta: 0:00:47  lr: 0.000009  min_lr: 0.000009  loss: 2.2685 (2.5655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0156 (1.0576)  time: 0.1846  data: 0.0005  max mem: 12911
Epoch: [291]  [1200/1251]  eta: 0:00:09  lr: 0.000009  min_lr: 0.000009  loss: 2.1033 (2.5585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (1.0558)  time: 0.1893  data: 0.0005  max mem: 12911
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.0782 (2.5660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (1.0540)  time: 0.1459  data: 0.0009  max mem: 12911
Epoch: [291] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.0782 (2.5585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (1.0540)
Test:  [ 0/25]  eta: 0:01:53  loss: 0.5844 (0.5844)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 4.5454  data: 4.4495  max mem: 12911
Test:  [10/25]  eta: 0:00:09  loss: 0.7580 (0.7643)  acc1: 83.2000 (84.3636)  acc5: 97.2000 (97.3091)  time: 0.6063  data: 0.5131  max mem: 12911
Test:  [20/25]  eta: 0:00:01  loss: 0.9589 (0.9244)  acc1: 78.4000 (80.4762)  acc5: 95.6000 (95.2762)  time: 0.1910  data: 0.1043  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0343 (0.9344)  acc1: 77.6000 (80.0160)  acc5: 94.4000 (95.1520)  time: 0.1847  data: 0.1036  max mem: 12911
Test: Total time: 0:00:09 (0.3710 s / it)
* Acc@1 80.186 Acc@5 95.110 loss 0.934
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [292]  [   0/1251]  eta: 1:03:05  lr: 0.000009  min_lr: 0.000009  loss: 3.6712 (3.6712)  weight_decay: 0.0500 (0.0500)  time: 3.0261  data: 2.1261  max mem: 12911
Epoch: [292]  [ 200/1251]  eta: 0:03:34  lr: 0.000009  min_lr: 0.000009  loss: 2.1876 (2.5819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9637 (0.9895)  time: 0.1856  data: 0.0004  max mem: 12911
Epoch: [292]  [ 400/1251]  eta: 0:02:46  lr: 0.000008  min_lr: 0.000008  loss: 2.3401 (2.5551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0228 (1.0128)  time: 0.1884  data: 0.0003  max mem: 12911
Epoch: [292]  [ 600/1251]  eta: 0:02:06  lr: 0.000008  min_lr: 0.000008  loss: 2.7350 (2.5365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9979 (1.0300)  time: 0.1874  data: 0.0004  max mem: 12911
Epoch: [292]  [ 800/1251]  eta: 0:01:26  lr: 0.000008  min_lr: 0.000008  loss: 2.2194 (2.5667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (1.0233)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [292]  [1000/1251]  eta: 0:00:47  lr: 0.000008  min_lr: 0.000008  loss: 2.5238 (2.5476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9512 (1.0190)  time: 0.1858  data: 0.0004  max mem: 12911
Epoch: [292]  [1200/1251]  eta: 0:00:09  lr: 0.000007  min_lr: 0.000007  loss: 2.1299 (2.5475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0178 (1.0223)  time: 0.1900  data: 0.0004  max mem: 12911
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.2613 (2.5431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0171 (1.0228)  time: 0.1470  data: 0.0006  max mem: 12911
Epoch: [292] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.2613 (2.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0171 (1.0228)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.5643 (0.5643)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.0974  data: 5.0058  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7529 (0.7568)  acc1: 82.8000 (84.1091)  acc5: 97.2000 (96.9818)  time: 0.7207  data: 0.6251  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9611 (0.9173)  acc1: 78.4000 (80.3619)  acc5: 94.4000 (95.2381)  time: 0.2247  data: 0.1360  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0218 (0.9274)  acc1: 77.6000 (80.0000)  acc5: 94.4000 (95.0880)  time: 0.2217  data: 0.1359  max mem: 12911
Test: Total time: 0:00:10 (0.4001 s / it)
* Acc@1 80.218 Acc@5 95.120 loss 0.927
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [293]  [   0/1251]  eta: 1:08:06  lr: 0.000007  min_lr: 0.000007  loss: 1.7539 (1.7539)  weight_decay: 0.0500 (0.0500)  time: 3.2663  data: 1.6045  max mem: 12911
Epoch: [293]  [ 200/1251]  eta: 0:03:35  lr: 0.000007  min_lr: 0.000007  loss: 2.3236 (2.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9974 (1.0217)  time: 0.1906  data: 0.0005  max mem: 12911
Epoch: [293]  [ 400/1251]  eta: 0:02:47  lr: 0.000007  min_lr: 0.000007  loss: 2.1817 (2.5542)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0190 (1.0267)  time: 0.1878  data: 0.0005  max mem: 12911
Epoch: [293]  [ 600/1251]  eta: 0:02:06  lr: 0.000006  min_lr: 0.000006  loss: 2.2459 (2.5497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (1.0344)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [293]  [ 800/1251]  eta: 0:01:26  lr: 0.000006  min_lr: 0.000006  loss: 2.0946 (2.5593)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0125 (1.0276)  time: 0.1904  data: 0.0004  max mem: 12911
Epoch: [293]  [1000/1251]  eta: 0:00:48  lr: 0.000006  min_lr: 0.000006  loss: 2.6070 (2.5485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0024 (1.0266)  time: 0.1867  data: 0.0005  max mem: 12911
Epoch: [293]  [1200/1251]  eta: 0:00:09  lr: 0.000006  min_lr: 0.000006  loss: 1.9581 (2.5568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9536 (1.0261)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 1.9967 (2.5515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9731 (1.0264)  time: 0.1452  data: 0.0006  max mem: 12911
Epoch: [293] Total time: 0:03:59 (0.1914 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 1.9967 (2.5521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9731 (1.0264)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5745 (0.5745)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.5920  data: 5.5004  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7544 (0.7565)  acc1: 83.2000 (84.0364)  acc5: 97.2000 (97.0546)  time: 0.7381  data: 0.6420  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9599 (0.9169)  acc1: 77.6000 (80.4191)  acc5: 95.2000 (95.3905)  time: 0.1986  data: 0.1106  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0217 (0.9276)  acc1: 77.6000 (79.9520)  acc5: 94.8000 (95.2000)  time: 0.2010  data: 0.1149  max mem: 12911
Test: Total time: 0:00:10 (0.4039 s / it)
* Acc@1 80.166 Acc@5 95.132 loss 0.927
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [294]  [   0/1251]  eta: 1:03:03  lr: 0.000006  min_lr: 0.000006  loss: 1.7608 (1.7608)  weight_decay: 0.0500 (0.0500)  time: 3.0246  data: 1.5917  max mem: 12911
Epoch: [294]  [ 200/1251]  eta: 0:03:36  lr: 0.000005  min_lr: 0.000005  loss: 2.2570 (2.4719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9922 (1.0451)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [294]  [ 400/1251]  eta: 0:02:48  lr: 0.000005  min_lr: 0.000005  loss: 3.0527 (2.5110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (1.0257)  time: 0.1886  data: 0.0005  max mem: 12911
Epoch: [294]  [ 600/1251]  eta: 0:02:06  lr: 0.000005  min_lr: 0.000005  loss: 1.9004 (2.5179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0178 (1.0236)  time: 0.1913  data: 0.0004  max mem: 12911
Epoch: [294]  [ 800/1251]  eta: 0:01:27  lr: 0.000005  min_lr: 0.000005  loss: 2.0246 (2.5284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (1.0261)  time: 0.1866  data: 0.0005  max mem: 12911
Epoch: [294]  [1000/1251]  eta: 0:00:48  lr: 0.000004  min_lr: 0.000004  loss: 2.7640 (2.5287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9597 (1.0247)  time: 0.1864  data: 0.0005  max mem: 12911
Epoch: [294]  [1200/1251]  eta: 0:00:09  lr: 0.000004  min_lr: 0.000004  loss: 2.0939 (2.5247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9810 (1.0211)  time: 0.1925  data: 0.0005  max mem: 12911
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 1.8886 (2.5276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9363 (1.0212)  time: 0.1475  data: 0.0012  max mem: 12911
Epoch: [294] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 1.8886 (2.5443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9363 (1.0212)
Test:  [ 0/25]  eta: 0:01:17  loss: 0.5869 (0.5869)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.0861  data: 2.9944  max mem: 12911
Test:  [10/25]  eta: 0:00:08  loss: 0.7652 (0.7646)  acc1: 83.6000 (84.2182)  acc5: 96.8000 (96.8727)  time: 0.5719  data: 0.4834  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9658 (0.9250)  acc1: 78.0000 (80.3619)  acc5: 95.2000 (95.0667)  time: 0.2925  data: 0.2076  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0343 (0.9362)  acc1: 77.6000 (79.9040)  acc5: 94.0000 (94.9120)  time: 0.2058  data: 0.1226  max mem: 12911
Test: Total time: 0:00:09 (0.3979 s / it)
* Acc@1 80.144 Acc@5 95.120 loss 0.936
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.24%
Epoch: [295]  [   0/1251]  eta: 1:06:28  lr: 0.000004  min_lr: 0.000004  loss: 1.7823 (1.7823)  weight_decay: 0.0500 (0.0500)  time: 3.1880  data: 2.3826  max mem: 12911
Epoch: [295]  [ 200/1251]  eta: 0:03:32  lr: 0.000004  min_lr: 0.000004  loss: 2.2487 (2.5476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0491 (1.0518)  time: 0.1827  data: 0.0004  max mem: 12911
Epoch: [295]  [ 400/1251]  eta: 0:02:46  lr: 0.000004  min_lr: 0.000004  loss: 1.9863 (2.5457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9665 (1.0370)  time: 0.1887  data: 0.0005  max mem: 12911
Epoch: [295]  [ 600/1251]  eta: 0:02:05  lr: 0.000004  min_lr: 0.000004  loss: 2.4605 (2.5349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9507 (1.0341)  time: 0.1902  data: 0.0004  max mem: 12911
Epoch: [295]  [ 800/1251]  eta: 0:01:26  lr: 0.000003  min_lr: 0.000003  loss: 1.8835 (2.5211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0174 (1.0297)  time: 0.1847  data: 0.0004  max mem: 12911
Epoch: [295]  [1000/1251]  eta: 0:00:47  lr: 0.000003  min_lr: 0.000003  loss: 1.8775 (2.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9939 (1.0305)  time: 0.1912  data: 0.0005  max mem: 12911
Epoch: [295]  [1200/1251]  eta: 0:00:09  lr: 0.000003  min_lr: 0.000003  loss: 2.0542 (2.5316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0153 (1.0281)  time: 0.1846  data: 0.0006  max mem: 12911
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.8530 (2.5362)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0073 (1.0293)  time: 0.1471  data: 0.0012  max mem: 12911
Epoch: [295] Total time: 0:03:58 (0.1906 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.8530 (2.5554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0073 (1.0293)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5882 (0.5882)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.8063  data: 5.7145  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7850 (0.7767)  acc1: 82.8000 (84.2182)  acc5: 97.2000 (97.0182)  time: 0.6926  data: 0.5967  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9731 (0.9384)  acc1: 78.4000 (80.4762)  acc5: 95.2000 (95.2762)  time: 0.1841  data: 0.0956  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0475 (0.9492)  acc1: 77.2000 (80.0000)  acc5: 94.4000 (95.0720)  time: 0.2038  data: 0.1182  max mem: 12911
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 80.096 Acc@5 95.130 loss 0.949
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.24%
Epoch: [296]  [   0/1251]  eta: 1:05:56  lr: 0.000003  min_lr: 0.000003  loss: 1.7745 (1.7745)  weight_decay: 0.0500 (0.0500)  time: 3.1625  data: 2.6648  max mem: 12911
Epoch: [296]  [ 200/1251]  eta: 0:03:33  lr: 0.000003  min_lr: 0.000003  loss: 2.0630 (2.6020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9423 (1.0089)  time: 0.1846  data: 0.0004  max mem: 12911
Epoch: [296]  [ 400/1251]  eta: 0:02:46  lr: 0.000003  min_lr: 0.000003  loss: 2.0619 (2.5692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9346 (1.0236)  time: 0.1918  data: 0.0005  max mem: 12911
Epoch: [296]  [ 600/1251]  eta: 0:02:05  lr: 0.000003  min_lr: 0.000003  loss: 2.1748 (2.5746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9053 (1.0072)  time: 0.1884  data: 0.0005  max mem: 12911
Epoch: [296]  [ 800/1251]  eta: 0:01:26  lr: 0.000002  min_lr: 0.000002  loss: 1.8901 (2.5598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (1.0047)  time: 0.1889  data: 0.0004  max mem: 12911
Epoch: [296]  [1000/1251]  eta: 0:00:47  lr: 0.000002  min_lr: 0.000002  loss: 1.9587 (2.5642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9256 (1.0050)  time: 0.1859  data: 0.0005  max mem: 12911
Epoch: [296]  [1200/1251]  eta: 0:00:09  lr: 0.000002  min_lr: 0.000002  loss: 2.2224 (2.5658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9831 (1.0160)  time: 0.1895  data: 0.0004  max mem: 12911
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.0138 (2.5578)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0319 (1.0158)  time: 0.1465  data: 0.0011  max mem: 12911
Epoch: [296] Total time: 0:03:58 (0.1903 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.0138 (2.5440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0319 (1.0158)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5450 (0.5450)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4329  data: 5.3410  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7269 (0.7308)  acc1: 83.2000 (84.0364)  acc5: 97.2000 (97.0909)  time: 0.7482  data: 0.6542  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9365 (0.8913)  acc1: 78.4000 (80.3810)  acc5: 95.2000 (95.3714)  time: 0.2097  data: 0.1214  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9931 (0.9020)  acc1: 77.6000 (79.9840)  acc5: 94.8000 (95.2320)  time: 0.2136  data: 0.1268  max mem: 12911
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 80.198 Acc@5 95.140 loss 0.902
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [297]  [   0/1251]  eta: 1:03:28  lr: 0.000002  min_lr: 0.000002  loss: 2.0295 (2.0295)  weight_decay: 0.0500 (0.0500)  time: 3.0446  data: 2.3449  max mem: 12911
Epoch: [297]  [ 200/1251]  eta: 0:03:34  lr: 0.000002  min_lr: 0.000002  loss: 1.9370 (2.5039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9207 (1.0040)  time: 0.1892  data: 0.0005  max mem: 12911
Epoch: [297]  [ 400/1251]  eta: 0:02:46  lr: 0.000002  min_lr: 0.000002  loss: 1.9276 (2.5014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9753 (1.0017)  time: 0.1856  data: 0.0005  max mem: 12911
Epoch: [297]  [ 600/1251]  eta: 0:02:06  lr: 0.000002  min_lr: 0.000002  loss: 1.9777 (2.5078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9984 (1.0003)  time: 0.1862  data: 0.0005  max mem: 12911
Epoch: [297]  [ 800/1251]  eta: 0:01:26  lr: 0.000002  min_lr: 0.000002  loss: 2.0318 (2.5177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9684 (0.9948)  time: 0.2025  data: 0.0004  max mem: 12911
Epoch: [297]  [1000/1251]  eta: 0:00:48  lr: 0.000002  min_lr: 0.000002  loss: 2.0886 (2.5399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9951 (0.9978)  time: 0.1885  data: 0.0004  max mem: 12911
Epoch: [297]  [1200/1251]  eta: 0:00:09  lr: 0.000002  min_lr: 0.000002  loss: 2.0845 (2.5367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9707 (1.0043)  time: 0.1850  data: 0.0005  max mem: 12911
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 1.9055 (2.5392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0562 (1.0081)  time: 0.1470  data: 0.0009  max mem: 12911
Epoch: [297] Total time: 0:03:59 (0.1915 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 1.9055 (2.5505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0562 (1.0081)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5557 (0.5557)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7543  data: 5.6273  max mem: 12911
Test:  [10/25]  eta: 0:00:11  loss: 0.7452 (0.7480)  acc1: 83.2000 (84.4727)  acc5: 97.2000 (97.0182)  time: 0.7560  data: 0.6578  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9555 (0.9043)  acc1: 78.4000 (80.5714)  acc5: 95.2000 (95.2762)  time: 0.2034  data: 0.1160  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 0.9967 (0.9152)  acc1: 77.2000 (80.0000)  acc5: 94.4000 (95.1040)  time: 0.1994  data: 0.1159  max mem: 12911
Test: Total time: 0:00:10 (0.4085 s / it)
* Acc@1 80.228 Acc@5 95.106 loss 0.915
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [298]  [   0/1251]  eta: 1:07:30  lr: 0.000002  min_lr: 0.000002  loss: 3.7513 (3.7513)  weight_decay: 0.0500 (0.0500)  time: 3.2380  data: 1.8504  max mem: 12911
Epoch: [298]  [ 200/1251]  eta: 0:03:35  lr: 0.000001  min_lr: 0.000001  loss: 1.8987 (2.5356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0209 (1.0399)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [298]  [ 400/1251]  eta: 0:02:47  lr: 0.000001  min_lr: 0.000001  loss: 3.3092 (2.5952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (1.0272)  time: 0.1883  data: 0.0004  max mem: 12911
Epoch: [298]  [ 600/1251]  eta: 0:02:06  lr: 0.000001  min_lr: 0.000001  loss: 1.8878 (2.5717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9967 (1.0126)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [298]  [ 800/1251]  eta: 0:01:26  lr: 0.000001  min_lr: 0.000001  loss: 2.3379 (2.5728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0119 (1.0185)  time: 0.1857  data: 0.0005  max mem: 12911
Epoch: [298]  [1000/1251]  eta: 0:00:47  lr: 0.000001  min_lr: 0.000001  loss: 2.9018 (2.5657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9429 (1.0126)  time: 0.1855  data: 0.0004  max mem: 12911
Epoch: [298]  [1200/1251]  eta: 0:00:09  lr: 0.000001  min_lr: 0.000001  loss: 2.2020 (2.5662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9360 (1.0152)  time: 0.1849  data: 0.0004  max mem: 12911
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 1.9518 (2.5681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0175 (1.0165)  time: 0.1464  data: 0.0007  max mem: 12911
Epoch: [298] Total time: 0:03:57 (0.1902 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 1.9518 (2.5543)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0175 (1.0165)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6259 (0.6259)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.6812  data: 5.5895  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.8043 (0.7941)  acc1: 83.6000 (84.1818)  acc5: 97.2000 (97.0546)  time: 0.7307  data: 0.6338  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9959 (0.9554)  acc1: 78.4000 (80.2667)  acc5: 94.8000 (95.2952)  time: 0.2078  data: 0.1185  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0673 (0.9666)  acc1: 77.2000 (79.8240)  acc5: 94.4000 (95.1520)  time: 0.2041  data: 0.1184  max mem: 12911
Test: Total time: 0:00:10 (0.4091 s / it)
* Acc@1 80.110 Acc@5 95.082 loss 0.968
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.24%
Epoch: [299]  [   0/1251]  eta: 1:04:33  lr: 0.000001  min_lr: 0.000001  loss: 3.2507 (3.2507)  weight_decay: 0.0500 (0.0500)  time: 3.0962  data: 2.2778  max mem: 12911
Epoch: [299]  [ 200/1251]  eta: 0:03:35  lr: 0.000001  min_lr: 0.000001  loss: 1.9396 (2.5477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0276 (1.0504)  time: 0.1880  data: 0.0005  max mem: 12911
Epoch: [299]  [ 400/1251]  eta: 0:02:47  lr: 0.000001  min_lr: 0.000001  loss: 2.0079 (2.5533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0002 (1.0313)  time: 0.1863  data: 0.0005  max mem: 12911
Epoch: [299]  [ 600/1251]  eta: 0:02:05  lr: 0.000001  min_lr: 0.000001  loss: 2.0383 (2.5453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9936 (1.0374)  time: 0.1862  data: 0.0004  max mem: 12911
Epoch: [299]  [ 800/1251]  eta: 0:01:26  lr: 0.000001  min_lr: 0.000001  loss: 2.4395 (2.5648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0557 (1.0275)  time: 0.1871  data: 0.0004  max mem: 12911
Epoch: [299]  [1000/1251]  eta: 0:00:47  lr: 0.000001  min_lr: 0.000001  loss: 2.1319 (2.5548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9755 (1.0214)  time: 0.1867  data: 0.0004  max mem: 12911
Epoch: [299]  [1200/1251]  eta: 0:00:09  lr: 0.000001  min_lr: 0.000001  loss: 1.9871 (2.5594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9507 (1.0199)  time: 0.1861  data: 0.0004  max mem: 12911
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.1651 (2.5540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9904 (1.0193)  time: 0.1456  data: 0.0006  max mem: 12911
Epoch: [299] Total time: 0:03:58 (0.1904 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.1651 (2.5449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9904 (1.0193)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5775 (0.5775)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.3566  data: 5.2625  max mem: 12911
Test:  [10/25]  eta: 0:00:10  loss: 0.7674 (0.7666)  acc1: 82.4000 (84.1091)  acc5: 97.2000 (97.2000)  time: 0.7298  data: 0.6354  max mem: 12911
Test:  [20/25]  eta: 0:00:02  loss: 0.9612 (0.9273)  acc1: 78.0000 (80.4191)  acc5: 95.6000 (95.3333)  time: 0.2127  data: 0.1257  max mem: 12911
Test:  [24/25]  eta: 0:00:00  loss: 1.0281 (0.9383)  acc1: 77.2000 (79.9680)  acc5: 94.0000 (95.1840)  time: 0.2102  data: 0.1256  max mem: 12911
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 80.184 Acc@5 95.126 loss 0.938
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Training time 20:46:13
