| distributed init (rank 0): env://, gpu 0
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 7): env://, gpu 7
Namespace(batch_size=128, epochs=300, update_freq=4, model='tiny', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.4, cutmix=0.5, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_tiny_2.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7eff5959f590>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.005)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.010)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.015)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.020)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.025)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.030)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.035)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.040)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.045)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.050)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.055)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.060)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.065)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.070)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.075)
    )
    (8): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.080)
    )
    (9): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.085)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.090)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.095)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.100)
    )
  )
  (head): ConvX(
    (conv): Conv2d(384, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 18807596
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer3.8.mlp.conv_in.conv.weight",
      "layer3.8.mlp.dw.conv.weight",
      "layer3.8.mlp.re.region.0.weight",
      "layer3.8.mlp.re.region.3.weight",
      "layer3.8.mlp.proj.conv.weight",
      "layer3.8.dcnn.conv_in.conv.weight",
      "layer3.8.dcnn.spe.conv.weight",
      "layer3.8.dcnn.att.logit_scale",
      "layer3.8.dcnn.proj.conv.weight",
      "layer3.9.mlp.conv_in.conv.weight",
      "layer3.9.mlp.dw.conv.weight",
      "layer3.9.mlp.re.region.0.weight",
      "layer3.9.mlp.re.region.3.weight",
      "layer3.9.mlp.proj.conv.weight",
      "layer3.9.dcnn.conv_in.conv.weight",
      "layer3.9.dcnn.spe.conv.weight",
      "layer3.9.dcnn.att.logit_scale",
      "layer3.9.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer3.8.mlp.conv_in.norm.weight",
      "layer3.8.mlp.conv_in.norm.bias",
      "layer3.8.mlp.dw.norm.weight",
      "layer3.8.mlp.dw.norm.bias",
      "layer3.8.mlp.re.region.1.weight",
      "layer3.8.mlp.re.region.1.bias",
      "layer3.8.mlp.re.region.3.bias",
      "layer3.8.mlp.proj.norm.weight",
      "layer3.8.mlp.proj.norm.bias",
      "layer3.8.dcnn.conv_in.norm.weight",
      "layer3.8.dcnn.conv_in.norm.bias",
      "layer3.8.dcnn.spe.norm.weight",
      "layer3.8.dcnn.spe.norm.bias",
      "layer3.8.dcnn.proj.norm.weight",
      "layer3.8.dcnn.proj.norm.bias",
      "layer3.9.mlp.conv_in.norm.weight",
      "layer3.9.mlp.conv_in.norm.bias",
      "layer3.9.mlp.dw.norm.weight",
      "layer3.9.mlp.dw.norm.bias",
      "layer3.9.mlp.re.region.1.weight",
      "layer3.9.mlp.re.region.1.bias",
      "layer3.9.mlp.re.region.3.bias",
      "layer3.9.mlp.proj.norm.weight",
      "layer3.9.mlp.proj.norm.bias",
      "layer3.9.dcnn.conv_in.norm.weight",
      "layer3.9.dcnn.conv_in.norm.bias",
      "layer3.9.dcnn.spe.norm.weight",
      "layer3.9.dcnn.spe.norm.bias",
      "layer3.9.dcnn.proj.norm.weight",
      "layer3.9.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 4:16:03  lr: 0.000000  min_lr: 0.000000  loss: 6.9582 (6.9582)  weight_decay: 0.0500 (0.0500)  time: 12.2811  data: 3.0605  max mem: 21847
Epoch: [0]  [ 200/1251]  eta: 0:05:51  lr: 0.000032  min_lr: 0.000032  loss: 6.9397 (6.9535)  weight_decay: 0.0500 (0.0500)  grad_norm: 29.0637 (nan)  time: 0.2737  data: 0.0005  max mem: 21847
Epoch: [0]  [ 400/1251]  eta: 0:04:18  lr: 0.000064  min_lr: 0.000064  loss: 6.8184 (6.9162)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.8304 (nan)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [0]  [ 600/1251]  eta: 0:03:12  lr: 0.000096  min_lr: 0.000096  loss: 6.6278 (6.8571)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0944 (nan)  time: 0.2778  data: 0.0005  max mem: 21847
Epoch: [0]  [ 800/1251]  eta: 0:02:11  lr: 0.000128  min_lr: 0.000128  loss: 6.5202 (6.7963)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3809 (nan)  time: 0.2738  data: 0.0005  max mem: 21847
Epoch: [0]  [1000/1251]  eta: 0:01:12  lr: 0.000160  min_lr: 0.000160  loss: 6.5114 (6.7402)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.6575 (nan)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [0]  [1200/1251]  eta: 0:00:14  lr: 0.000192  min_lr: 0.000192  loss: 6.4720 (6.6852)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.5260 (nan)  time: 0.2793  data: 0.0005  max mem: 21847
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.4895 (6.6766)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0630 (nan)  time: 0.2376  data: 0.0006  max mem: 21847
Epoch: [0] Total time: 0:05:57 (0.2858 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.4895 (6.6737)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0630 (nan)
Test:  [ 0/25]  eta: 0:04:25  loss: 5.4070 (5.4070)  acc1: 5.2000 (5.2000)  acc5: 16.0000 (16.0000)  time: 10.6044  data: 6.8536  max mem: 21847
Test:  [10/25]  eta: 0:00:16  loss: 5.4781 (5.4706)  acc1: 3.6000 (4.6909)  acc5: 15.6000 (16.2545)  time: 1.0802  data: 0.6233  max mem: 21847
Test:  [20/25]  eta: 0:00:03  loss: 5.4781 (5.4883)  acc1: 4.0000 (5.1429)  acc5: 15.6000 (16.8762)  time: 0.1276  data: 0.0002  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 5.4781 (5.4241)  acc1: 6.0000 (5.9360)  acc5: 17.2000 (17.9840)  time: 0.1276  data: 0.0001  max mem: 21847
Test: Total time: 0:00:13 (0.5498 s / it)
* Acc@1 5.878 Acc@5 17.390 loss 5.437
Accuracy of the model on the 50000 test images: 5.9%
Max accuracy: 5.88%
Epoch: [1]  [   0/1251]  eta: 1:12:15  lr: 0.000200  min_lr: 0.000200  loss: 5.9125 (5.9125)  weight_decay: 0.0500 (0.0500)  time: 3.4653  data: 3.0434  max mem: 21847
Epoch: [1]  [ 200/1251]  eta: 0:05:05  lr: 0.000232  min_lr: 0.000232  loss: 6.2974 (6.3001)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9129 (4.3375)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [1]  [ 400/1251]  eta: 0:03:59  lr: 0.000264  min_lr: 0.000264  loss: 6.0417 (6.2585)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.4410 (4.3264)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [1]  [ 600/1251]  eta: 0:03:01  lr: 0.000296  min_lr: 0.000296  loss: 6.1161 (6.2062)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0153 (4.2791)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [1]  [ 800/1251]  eta: 0:02:05  lr: 0.000328  min_lr: 0.000328  loss: 5.9050 (6.1590)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8826 (4.2058)  time: 0.2776  data: 0.0004  max mem: 21847
Epoch: [1]  [1000/1251]  eta: 0:01:09  lr: 0.000360  min_lr: 0.000360  loss: 5.9039 (6.1186)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5662 (4.1668)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [1]  [1200/1251]  eta: 0:00:14  lr: 0.000392  min_lr: 0.000392  loss: 5.8509 (6.0709)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7560 (4.1609)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 5.4319 (6.0588)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6318 (4.1535)  time: 0.2280  data: 0.0005  max mem: 21847
Epoch: [1] Total time: 0:05:45 (0.2760 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 5.4319 (6.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6318 (4.1535)
Test:  [ 0/25]  eta: 0:02:18  loss: 4.0126 (4.0126)  acc1: 16.4000 (16.4000)  acc5: 44.4000 (44.4000)  time: 5.5280  data: 5.3677  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 4.0126 (4.0252)  acc1: 17.2000 (18.4727)  acc5: 44.4000 (43.5636)  time: 0.7515  data: 0.6168  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 4.3703 (4.2447)  acc1: 16.8000 (17.2190)  acc5: 38.4000 (40.0571)  time: 0.2091  data: 0.0793  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 4.4106 (4.2171)  acc1: 16.8000 (18.0160)  acc5: 38.0000 (40.8000)  time: 0.2083  data: 0.0792  max mem: 21847
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 18.326 Acc@5 40.702 loss 4.202
Accuracy of the model on the 50000 test images: 18.3%
Max accuracy: 18.33%
Epoch: [2]  [   0/1251]  eta: 1:01:44  lr: 0.000400  min_lr: 0.000400  loss: 5.3348 (5.3348)  weight_decay: 0.0500 (0.0500)  time: 2.9611  data: 2.6089  max mem: 21847
Epoch: [2]  [ 200/1251]  eta: 0:05:03  lr: 0.000432  min_lr: 0.000432  loss: 5.9739 (5.8153)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5941 (3.9544)  time: 0.2718  data: 0.0005  max mem: 21847
Epoch: [2]  [ 400/1251]  eta: 0:04:00  lr: 0.000464  min_lr: 0.000464  loss: 5.6185 (5.7455)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7156 (4.0232)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [2]  [ 600/1251]  eta: 0:03:01  lr: 0.000496  min_lr: 0.000496  loss: 5.6297 (5.7104)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6687 (4.0125)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [2]  [ 800/1251]  eta: 0:02:05  lr: 0.000528  min_lr: 0.000528  loss: 5.5078 (5.6785)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9799 (3.9609)  time: 0.2752  data: 0.0004  max mem: 21847
Epoch: [2]  [1000/1251]  eta: 0:01:09  lr: 0.000560  min_lr: 0.000560  loss: 5.7855 (5.6481)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7453 (3.9308)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [2]  [1200/1251]  eta: 0:00:14  lr: 0.000592  min_lr: 0.000592  loss: 5.6436 (5.6107)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4096 (3.9248)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 5.3853 (5.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5694 (3.9310)  time: 0.2286  data: 0.0007  max mem: 21847
Epoch: [2] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 5.3853 (5.5898)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5694 (3.9310)
Test:  [ 0/25]  eta: 0:02:20  loss: 3.1125 (3.1125)  acc1: 35.2000 (35.2000)  acc5: 63.2000 (63.2000)  time: 5.6216  data: 5.4519  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 3.1125 (3.1694)  acc1: 33.6000 (33.9273)  acc5: 63.2000 (61.8545)  time: 0.7534  data: 0.6169  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 3.6185 (3.4684)  acc1: 27.2000 (30.1905)  acc5: 53.2000 (56.0952)  time: 0.2003  data: 0.0699  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 3.6488 (3.4584)  acc1: 27.2000 (30.6560)  acc5: 51.6000 (56.3520)  time: 0.1982  data: 0.0699  max mem: 21847
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 30.254 Acc@5 56.492 loss 3.456
Accuracy of the model on the 50000 test images: 30.3%
Max accuracy: 30.25%
Epoch: [3]  [   0/1251]  eta: 1:07:39  lr: 0.000600  min_lr: 0.000600  loss: 4.4647 (4.4647)  weight_decay: 0.0500 (0.0500)  time: 3.2451  data: 2.9544  max mem: 21847
Epoch: [3]  [ 200/1251]  eta: 0:05:03  lr: 0.000632  min_lr: 0.000632  loss: 5.6815 (5.3553)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3492 (3.6785)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [3]  [ 400/1251]  eta: 0:03:59  lr: 0.000664  min_lr: 0.000664  loss: 5.0869 (5.3131)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6538 (3.7534)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [3]  [ 600/1251]  eta: 0:03:02  lr: 0.000696  min_lr: 0.000696  loss: 5.0708 (5.2845)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5390 (3.6597)  time: 0.2805  data: 0.0005  max mem: 21847
Epoch: [3]  [ 800/1251]  eta: 0:02:05  lr: 0.000728  min_lr: 0.000728  loss: 5.1608 (5.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9667 (3.5694)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [3]  [1000/1251]  eta: 0:01:09  lr: 0.000760  min_lr: 0.000760  loss: 4.6082 (5.2644)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2501 (3.5735)  time: 0.2731  data: 0.0005  max mem: 21847
Epoch: [3]  [1200/1251]  eta: 0:00:14  lr: 0.000792  min_lr: 0.000792  loss: 5.3729 (5.2377)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5300 (3.5668)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.2158 (5.2369)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1208 (3.5540)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [3] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.2158 (5.2416)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1208 (3.5540)
Test:  [ 0/25]  eta: 0:02:18  loss: 2.6208 (2.6208)  acc1: 48.4000 (48.4000)  acc5: 70.4000 (70.4000)  time: 5.5589  data: 5.4102  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 2.5869 (2.5985)  acc1: 46.8000 (43.8909)  acc5: 74.8000 (73.0545)  time: 0.7196  data: 0.5834  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 3.0428 (2.9045)  acc1: 36.4000 (39.0095)  acc5: 61.6000 (66.0000)  time: 0.1969  data: 0.0657  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 3.0729 (2.8913)  acc1: 34.8000 (39.2640)  acc5: 59.2000 (66.0800)  time: 0.2119  data: 0.0828  max mem: 21847
Test: Total time: 0:00:10 (0.4181 s / it)
* Acc@1 39.246 Acc@5 66.542 loss 2.875
Accuracy of the model on the 50000 test images: 39.2%
Max accuracy: 39.25%
Epoch: [4]  [   0/1251]  eta: 1:03:43  lr: 0.000800  min_lr: 0.000800  loss: 5.2300 (5.2300)  weight_decay: 0.0500 (0.0500)  time: 3.0561  data: 2.7193  max mem: 21847
Epoch: [4]  [ 200/1251]  eta: 0:05:04  lr: 0.000832  min_lr: 0.000832  loss: 5.1115 (5.0451)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1890 (3.3842)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [4]  [ 400/1251]  eta: 0:03:59  lr: 0.000864  min_lr: 0.000864  loss: 4.8620 (5.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9139 (3.2002)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [4]  [ 600/1251]  eta: 0:03:01  lr: 0.000896  min_lr: 0.000896  loss: 5.3171 (5.0123)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9373 (3.1760)  time: 0.2845  data: 0.0004  max mem: 21847
Epoch: [4]  [ 800/1251]  eta: 0:02:05  lr: 0.000928  min_lr: 0.000928  loss: 5.3713 (5.0080)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8260 (3.1827)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [4]  [1000/1251]  eta: 0:01:09  lr: 0.000960  min_lr: 0.000960  loss: 4.4794 (5.0025)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5513 (3.0898)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [4]  [1200/1251]  eta: 0:00:14  lr: 0.000992  min_lr: 0.000992  loss: 5.0484 (5.0004)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3864 (3.0346)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 4.4545 (4.9943)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7716 (3.0274)  time: 0.2281  data: 0.0005  max mem: 21847
Epoch: [4] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 4.4545 (5.0068)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7716 (3.0274)
Test:  [ 0/25]  eta: 0:01:32  loss: 2.3400 (2.3400)  acc1: 53.2000 (53.2000)  acc5: 74.4000 (74.4000)  time: 3.7038  data: 3.5542  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 2.3400 (2.3657)  acc1: 50.4000 (49.9636)  acc5: 80.0000 (77.5636)  time: 0.6527  data: 0.5188  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 2.8203 (2.6524)  acc1: 41.6000 (45.2381)  acc5: 68.8000 (72.0000)  time: 0.2729  data: 0.1422  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 2.8926 (2.6403)  acc1: 42.4000 (45.6160)  acc5: 67.2000 (71.9520)  time: 0.2144  data: 0.0861  max mem: 21847
Test: Total time: 0:00:10 (0.4167 s / it)
* Acc@1 45.352 Acc@5 71.964 loss 2.641
Accuracy of the model on the 50000 test images: 45.4%
Max accuracy: 45.35%
Epoch: [5]  [   0/1251]  eta: 1:02:26  lr: 0.001000  min_lr: 0.001000  loss: 4.6248 (4.6248)  weight_decay: 0.0500 (0.0500)  time: 2.9949  data: 2.5949  max mem: 21847
Epoch: [5]  [ 200/1251]  eta: 0:05:01  lr: 0.001032  min_lr: 0.001032  loss: 4.5408 (4.8883)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5566 (2.7722)  time: 0.2700  data: 0.0004  max mem: 21847
Epoch: [5]  [ 400/1251]  eta: 0:03:59  lr: 0.001064  min_lr: 0.001064  loss: 4.8026 (4.8777)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5348 (2.8225)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [5]  [ 600/1251]  eta: 0:03:01  lr: 0.001096  min_lr: 0.001096  loss: 4.8154 (4.8398)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5201 (2.7095)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [5]  [ 800/1251]  eta: 0:02:04  lr: 0.001128  min_lr: 0.001128  loss: 4.5881 (4.8303)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3167 (2.6851)  time: 0.2804  data: 0.0004  max mem: 21847
Epoch: [5]  [1000/1251]  eta: 0:01:09  lr: 0.001160  min_lr: 0.001160  loss: 5.0252 (4.8345)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5055 (2.6797)  time: 0.2960  data: 0.0004  max mem: 21847
Epoch: [5]  [1200/1251]  eta: 0:00:14  lr: 0.001192  min_lr: 0.001192  loss: 4.7953 (4.8118)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1351 (2.6208)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 4.7441 (4.8113)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0936 (2.5987)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [5] Total time: 0:05:45 (0.2759 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 4.7441 (4.8100)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0936 (2.5987)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.8349 (1.8349)  acc1: 64.8000 (64.8000)  acc5: 84.8000 (84.8000)  time: 5.4884  data: 5.3275  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.8995 (1.9959)  acc1: 57.2000 (56.3636)  acc5: 84.4000 (81.8909)  time: 0.7496  data: 0.6152  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 2.4445 (2.3546)  acc1: 44.8000 (50.5524)  acc5: 72.4000 (75.7905)  time: 0.2214  data: 0.0916  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 2.6509 (2.3546)  acc1: 44.8000 (50.4800)  acc5: 70.8000 (75.6320)  time: 0.2208  data: 0.0915  max mem: 21847
Test: Total time: 0:00:10 (0.4206 s / it)
* Acc@1 50.332 Acc@5 76.248 loss 2.337
Accuracy of the model on the 50000 test images: 50.3%
Max accuracy: 50.33%
Epoch: [6]  [   0/1251]  eta: 1:12:53  lr: 0.001200  min_lr: 0.001200  loss: 4.4929 (4.4929)  weight_decay: 0.0500 (0.0500)  time: 3.4964  data: 3.2126  max mem: 21847
Epoch: [6]  [ 200/1251]  eta: 0:05:05  lr: 0.001232  min_lr: 0.001232  loss: 4.9244 (4.7309)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1769 (2.3411)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [6]  [ 400/1251]  eta: 0:04:00  lr: 0.001264  min_lr: 0.001264  loss: 4.5888 (4.7313)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2112 (2.3944)  time: 0.2743  data: 0.0003  max mem: 21847
Epoch: [6]  [ 600/1251]  eta: 0:03:02  lr: 0.001296  min_lr: 0.001296  loss: 5.0294 (4.7072)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4321 (2.3585)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [6]  [ 800/1251]  eta: 0:02:05  lr: 0.001328  min_lr: 0.001328  loss: 5.0862 (4.7070)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9182 (2.2553)  time: 0.2792  data: 0.0004  max mem: 21847
Epoch: [6]  [1000/1251]  eta: 0:01:09  lr: 0.001360  min_lr: 0.001360  loss: 5.0459 (4.6819)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9911 (2.2175)  time: 0.2711  data: 0.0005  max mem: 21847
Epoch: [6]  [1200/1251]  eta: 0:00:14  lr: 0.001393  min_lr: 0.001393  loss: 5.0090 (4.6707)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9604 (2.1872)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 4.3791 (4.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9463 (2.1855)  time: 0.2359  data: 0.0005  max mem: 21847
Epoch: [6] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 4.3791 (4.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9463 (2.1855)
Test:  [ 0/25]  eta: 0:01:50  loss: 1.7479 (1.7479)  acc1: 65.2000 (65.2000)  acc5: 85.6000 (85.6000)  time: 4.4173  data: 4.2591  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.7479 (1.8806)  acc1: 60.4000 (58.6545)  acc5: 86.4000 (84.3273)  time: 0.6976  data: 0.5632  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 2.3376 (2.1593)  acc1: 48.0000 (53.2381)  acc5: 76.4000 (79.2381)  time: 0.2431  data: 0.1133  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 2.3986 (2.1550)  acc1: 47.6000 (52.9920)  acc5: 74.8000 (79.1680)  time: 0.2085  data: 0.0794  max mem: 21847
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 53.796 Acc@5 79.394 loss 2.144
Accuracy of the model on the 50000 test images: 53.8%
Max accuracy: 53.80%
Epoch: [7]  [   0/1251]  eta: 0:58:48  lr: 0.001400  min_lr: 0.001400  loss: 4.9706 (4.9706)  weight_decay: 0.0500 (0.0500)  time: 2.8206  data: 2.4061  max mem: 21847
Epoch: [7]  [ 200/1251]  eta: 0:05:00  lr: 0.001432  min_lr: 0.001432  loss: 4.2479 (4.4801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8288 (1.8835)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [7]  [ 400/1251]  eta: 0:03:58  lr: 0.001464  min_lr: 0.001464  loss: 4.7869 (4.5485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6419 (1.8780)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [7]  [ 600/1251]  eta: 0:03:00  lr: 0.001496  min_lr: 0.001496  loss: 5.0023 (4.5293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6918 (1.8433)  time: 0.2702  data: 0.0005  max mem: 21847
Epoch: [7]  [ 800/1251]  eta: 0:02:05  lr: 0.001528  min_lr: 0.001528  loss: 3.8653 (4.5225)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1612 (1.8840)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [7]  [1000/1251]  eta: 0:01:09  lr: 0.001561  min_lr: 0.001561  loss: 4.5926 (4.5237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7911 (1.8631)  time: 0.2711  data: 0.0005  max mem: 21847
Epoch: [7]  [1200/1251]  eta: 0:00:14  lr: 0.001593  min_lr: 0.001593  loss: 4.5770 (4.5120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4700 (1.8436)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 3.9797 (4.5065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5584 (1.8452)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [7] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 3.9797 (4.5273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5584 (1.8452)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.5118 (1.5118)  acc1: 69.6000 (69.6000)  acc5: 87.2000 (87.2000)  time: 5.5834  data: 5.4256  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.5123 (1.6854)  acc1: 64.8000 (62.1455)  acc5: 88.4000 (86.6182)  time: 0.7177  data: 0.5828  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 2.0989 (1.9981)  acc1: 52.0000 (56.5143)  acc5: 76.4000 (80.5714)  time: 0.1936  data: 0.0627  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 2.2786 (2.0059)  acc1: 52.0000 (56.3520)  acc5: 76.4000 (80.3360)  time: 0.1921  data: 0.0618  max mem: 21847
Test: Total time: 0:00:10 (0.4030 s / it)
* Acc@1 56.732 Acc@5 80.966 loss 2.000
Accuracy of the model on the 50000 test images: 56.7%
Max accuracy: 56.73%
Epoch: [8]  [   0/1251]  eta: 0:59:42  lr: 0.001600  min_lr: 0.001600  loss: 4.9324 (4.9324)  weight_decay: 0.0500 (0.0500)  time: 2.8639  data: 2.4899  max mem: 21847
Epoch: [8]  [ 200/1251]  eta: 0:05:05  lr: 0.001632  min_lr: 0.001632  loss: 4.6555 (4.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6644 (1.7380)  time: 0.2797  data: 0.0004  max mem: 21847
Epoch: [8]  [ 400/1251]  eta: 0:04:00  lr: 0.001664  min_lr: 0.001664  loss: 4.2517 (4.5139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6856 (1.7028)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [8]  [ 600/1251]  eta: 0:03:02  lr: 0.001696  min_lr: 0.001696  loss: 4.4437 (4.4539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2999 (1.6600)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [8]  [ 800/1251]  eta: 0:02:05  lr: 0.001728  min_lr: 0.001728  loss: 4.5471 (4.4545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6750 (1.6586)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [8]  [1000/1251]  eta: 0:01:09  lr: 0.001761  min_lr: 0.001761  loss: 4.7733 (4.4351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3877 (1.6414)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [8]  [1200/1251]  eta: 0:00:14  lr: 0.001793  min_lr: 0.001793  loss: 4.1582 (4.4322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3603 (1.6302)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 4.7667 (4.4314)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4428 (1.6226)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [8] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 4.7667 (4.4118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4428 (1.6226)
Test:  [ 0/25]  eta: 0:01:26  loss: 1.5951 (1.5951)  acc1: 72.4000 (72.4000)  acc5: 88.0000 (88.0000)  time: 3.4650  data: 3.3175  max mem: 21847
Test:  [10/25]  eta: 0:00:08  loss: 1.5951 (1.6802)  acc1: 69.2000 (65.1273)  acc5: 90.4000 (88.0364)  time: 0.5923  data: 0.4572  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 2.1553 (1.9710)  acc1: 56.8000 (58.7619)  acc5: 80.0000 (82.9524)  time: 0.2905  data: 0.1574  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 2.1631 (1.9713)  acc1: 54.8000 (58.7360)  acc5: 79.2000 (82.9440)  time: 0.2232  data: 0.0916  max mem: 21847
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 58.766 Acc@5 82.758 loss 1.957
Accuracy of the model on the 50000 test images: 58.8%
Max accuracy: 58.77%
Epoch: [9]  [   0/1251]  eta: 0:59:45  lr: 0.001800  min_lr: 0.001800  loss: 4.9693 (4.9693)  weight_decay: 0.0500 (0.0500)  time: 2.8662  data: 2.5068  max mem: 21847
Epoch: [9]  [ 200/1251]  eta: 0:05:00  lr: 0.001832  min_lr: 0.001832  loss: 3.5681 (4.3881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4532 (1.4420)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [9]  [ 400/1251]  eta: 0:03:59  lr: 0.001864  min_lr: 0.001864  loss: 3.9091 (4.3847)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2205 (1.4199)  time: 0.2713  data: 0.0005  max mem: 21847
Epoch: [9]  [ 600/1251]  eta: 0:03:01  lr: 0.001896  min_lr: 0.001896  loss: 4.6329 (4.3729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4930 (1.4564)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [9]  [ 800/1251]  eta: 0:02:05  lr: 0.001929  min_lr: 0.001929  loss: 4.4640 (4.3674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3311 (1.4431)  time: 0.2793  data: 0.0004  max mem: 21847
Epoch: [9]  [1000/1251]  eta: 0:01:09  lr: 0.001961  min_lr: 0.001961  loss: 4.3641 (4.3553)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2443 (1.4335)  time: 0.2803  data: 0.0004  max mem: 21847
Epoch: [9]  [1200/1251]  eta: 0:00:14  lr: 0.001993  min_lr: 0.001993  loss: 4.4287 (4.3677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2186 (1.4155)  time: 0.2738  data: 0.0003  max mem: 21847
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 4.8012 (4.3666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2657 (1.4080)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [9] Total time: 0:05:45 (0.2765 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 4.8012 (4.3500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2657 (1.4080)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.4439 (1.4439)  acc1: 68.8000 (68.8000)  acc5: 88.4000 (88.4000)  time: 5.7186  data: 5.5710  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.4754 (1.6213)  acc1: 67.6000 (65.5273)  acc5: 91.2000 (89.1636)  time: 0.7573  data: 0.6240  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 2.0627 (1.9262)  acc1: 57.6000 (60.2286)  acc5: 82.0000 (83.6762)  time: 0.2009  data: 0.0712  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 2.1592 (1.9295)  acc1: 57.6000 (60.0320)  acc5: 79.2000 (83.5840)  time: 0.2025  data: 0.0741  max mem: 21847
Test: Total time: 0:00:10 (0.4153 s / it)
* Acc@1 60.270 Acc@5 83.696 loss 1.914
Accuracy of the model on the 50000 test images: 60.3%
Max accuracy: 60.27%
Epoch: [10]  [   0/1251]  eta: 1:06:28  lr: 0.002000  min_lr: 0.002000  loss: 3.5754 (3.5754)  weight_decay: 0.0500 (0.0500)  time: 3.1882  data: 2.8567  max mem: 21847
Epoch: [10]  [ 200/1251]  eta: 0:05:03  lr: 0.002032  min_lr: 0.002032  loss: 4.6114 (4.3308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3571 (1.3575)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [10]  [ 400/1251]  eta: 0:03:59  lr: 0.002064  min_lr: 0.002064  loss: 4.5145 (4.2949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2021 (1.2957)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [10]  [ 600/1251]  eta: 0:03:02  lr: 0.002096  min_lr: 0.002096  loss: 4.7300 (4.2945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2736 (1.2973)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [10]  [ 800/1251]  eta: 0:02:05  lr: 0.002129  min_lr: 0.002129  loss: 4.3099 (4.2984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1767 (1.2624)  time: 0.2893  data: 0.0004  max mem: 21847
Epoch: [10]  [1000/1251]  eta: 0:01:09  lr: 0.002161  min_lr: 0.002161  loss: 3.8470 (4.2910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1393 (1.2417)  time: 0.2809  data: 0.0004  max mem: 21847
Epoch: [10]  [1200/1251]  eta: 0:00:14  lr: 0.002193  min_lr: 0.002193  loss: 4.2395 (4.2782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0453 (1.2411)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 4.7542 (4.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0214 (1.2333)  time: 0.2279  data: 0.0005  max mem: 21847
Epoch: [10] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 4.7542 (4.2657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0214 (1.2333)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.4399 (1.4399)  acc1: 73.2000 (73.2000)  acc5: 90.4000 (90.4000)  time: 5.6605  data: 5.4901  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.5263 (1.5689)  acc1: 70.0000 (67.3455)  acc5: 90.4000 (89.6727)  time: 0.7431  data: 0.6091  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.9226 (1.8407)  acc1: 58.8000 (61.4667)  acc5: 83.6000 (84.8191)  time: 0.2031  data: 0.0742  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.9809 (1.8470)  acc1: 56.4000 (61.3120)  acc5: 80.8000 (84.5920)  time: 0.2029  data: 0.0741  max mem: 21847
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 61.694 Acc@5 84.610 loss 1.850
Accuracy of the model on the 50000 test images: 61.7%
Max accuracy: 61.69%
Epoch: [11]  [   0/1251]  eta: 0:59:36  lr: 0.002200  min_lr: 0.002200  loss: 5.5083 (5.5083)  weight_decay: 0.0500 (0.0500)  time: 2.8587  data: 2.4696  max mem: 21847
Epoch: [11]  [ 200/1251]  eta: 0:05:03  lr: 0.002232  min_lr: 0.002232  loss: 4.3613 (4.2267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2298 (1.2650)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [11]  [ 400/1251]  eta: 0:03:59  lr: 0.002264  min_lr: 0.002264  loss: 4.8518 (4.2554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0664 (1.2102)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [11]  [ 600/1251]  eta: 0:03:02  lr: 0.002297  min_lr: 0.002297  loss: 4.1292 (4.2326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0474 (1.1881)  time: 0.2830  data: 0.0004  max mem: 21847
Epoch: [11]  [ 800/1251]  eta: 0:02:05  lr: 0.002329  min_lr: 0.002329  loss: 4.1267 (4.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1455 (1.2051)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [11]  [1000/1251]  eta: 0:01:09  lr: 0.002361  min_lr: 0.002361  loss: 4.4313 (4.2211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9421 (1.1733)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [11]  [1200/1251]  eta: 0:00:14  lr: 0.002393  min_lr: 0.002393  loss: 4.5636 (4.2219)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0249 (1.1671)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 4.4256 (4.2203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1028 (1.1669)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [11] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 4.4256 (4.2052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1028 (1.1669)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.3426 (1.3426)  acc1: 75.6000 (75.6000)  acc5: 88.4000 (88.4000)  time: 5.5877  data: 5.4271  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.3703 (1.4764)  acc1: 70.0000 (68.8364)  acc5: 91.6000 (90.1818)  time: 0.7347  data: 0.6024  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.8344 (1.7631)  acc1: 58.8000 (63.2952)  acc5: 83.2000 (85.8286)  time: 0.1946  data: 0.0656  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.9841 (1.7774)  acc1: 58.4000 (62.7360)  acc5: 82.8000 (85.7760)  time: 0.2156  data: 0.0868  max mem: 21847
Test: Total time: 0:00:10 (0.4205 s / it)
* Acc@1 62.714 Acc@5 85.644 loss 1.774
Accuracy of the model on the 50000 test images: 62.7%
Max accuracy: 62.71%
Epoch: [12]  [   0/1251]  eta: 1:08:24  lr: 0.002400  min_lr: 0.002400  loss: 5.1265 (5.1265)  weight_decay: 0.0500 (0.0500)  time: 3.2811  data: 2.9106  max mem: 21847
Epoch: [12]  [ 200/1251]  eta: 0:05:04  lr: 0.002432  min_lr: 0.002432  loss: 3.9858 (4.0905)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1180 (1.1076)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [12]  [ 400/1251]  eta: 0:04:00  lr: 0.002464  min_lr: 0.002464  loss: 4.1599 (4.1338)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0409 (1.0993)  time: 0.2742  data: 0.0005  max mem: 21847
Epoch: [12]  [ 600/1251]  eta: 0:03:02  lr: 0.002497  min_lr: 0.002497  loss: 3.9069 (4.1062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (1.0806)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [12]  [ 800/1251]  eta: 0:02:05  lr: 0.002529  min_lr: 0.002529  loss: 3.5525 (4.1096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9407 (1.0786)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [12]  [1000/1251]  eta: 0:01:09  lr: 0.002561  min_lr: 0.002561  loss: 4.2969 (4.1159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0488 (1.0778)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [12]  [1200/1251]  eta: 0:00:14  lr: 0.002593  min_lr: 0.002593  loss: 3.8988 (4.1141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9604 (1.0683)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 3.7880 (4.1132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9375 (1.0676)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [12] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 3.7880 (4.1501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9375 (1.0676)
Test:  [ 0/25]  eta: 0:01:29  loss: 1.2594 (1.2594)  acc1: 77.6000 (77.6000)  acc5: 91.2000 (91.2000)  time: 3.5750  data: 3.3867  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 1.3431 (1.4248)  acc1: 71.6000 (70.2909)  acc5: 91.6000 (90.9455)  time: 0.6183  data: 0.4820  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.7477 (1.7151)  acc1: 62.0000 (64.4191)  acc5: 87.6000 (86.8762)  time: 0.2641  data: 0.1348  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.9010 (1.7307)  acc1: 62.0000 (64.1600)  acc5: 83.2000 (86.8320)  time: 0.2176  data: 0.0890  max mem: 21847
Test: Total time: 0:00:09 (0.3909 s / it)
* Acc@1 64.106 Acc@5 86.472 loss 1.725
Accuracy of the model on the 50000 test images: 64.1%
Max accuracy: 64.11%
Epoch: [13]  [   0/1251]  eta: 1:01:35  lr: 0.002600  min_lr: 0.002600  loss: 2.9881 (2.9881)  weight_decay: 0.0500 (0.0500)  time: 2.9541  data: 2.5745  max mem: 21847
Epoch: [13]  [ 200/1251]  eta: 0:05:04  lr: 0.002632  min_lr: 0.002632  loss: 4.1095 (4.1895)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0291 (1.0774)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [13]  [ 400/1251]  eta: 0:03:59  lr: 0.002665  min_lr: 0.002665  loss: 4.1827 (4.1232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8651 (0.9726)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [13]  [ 600/1251]  eta: 0:03:02  lr: 0.002697  min_lr: 0.002697  loss: 4.7397 (4.1164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9285 (1.0023)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [13]  [ 800/1251]  eta: 0:02:05  lr: 0.002729  min_lr: 0.002729  loss: 4.3781 (4.1231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8467 (0.9874)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [13]  [1000/1251]  eta: 0:01:09  lr: 0.002761  min_lr: 0.002761  loss: 3.9743 (4.1269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9806 (0.9706)  time: 0.2722  data: 0.0005  max mem: 21847
Epoch: [13]  [1200/1251]  eta: 0:00:14  lr: 0.002793  min_lr: 0.002793  loss: 3.6881 (4.1166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9289 (0.9700)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 3.5898 (4.1043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (0.9671)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [13] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 3.5898 (4.1152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (0.9671)
Test:  [ 0/25]  eta: 0:02:18  loss: 1.2453 (1.2453)  acc1: 74.0000 (74.0000)  acc5: 93.2000 (93.2000)  time: 5.5425  data: 5.3915  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.2453 (1.3869)  acc1: 74.0000 (71.4545)  acc5: 92.8000 (91.8182)  time: 0.7224  data: 0.5861  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.7547 (1.6570)  acc1: 62.0000 (65.7333)  acc5: 86.0000 (87.0667)  time: 0.1994  data: 0.0682  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.8135 (1.6709)  acc1: 60.4000 (65.3280)  acc5: 84.0000 (86.9600)  time: 0.2111  data: 0.0820  max mem: 21847
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 65.240 Acc@5 87.132 loss 1.663
Accuracy of the model on the 50000 test images: 65.2%
Max accuracy: 65.24%
Epoch: [14]  [   0/1251]  eta: 1:08:50  lr: 0.002800  min_lr: 0.002800  loss: 4.3522 (4.3522)  weight_decay: 0.0500 (0.0500)  time: 3.3021  data: 3.0176  max mem: 21847
Epoch: [14]  [ 200/1251]  eta: 0:05:03  lr: 0.002833  min_lr: 0.002833  loss: 4.0175 (4.0820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (0.9320)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [14]  [ 400/1251]  eta: 0:04:00  lr: 0.002865  min_lr: 0.002865  loss: 3.6415 (4.0903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8743 (0.8951)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [14]  [ 600/1251]  eta: 0:03:01  lr: 0.002897  min_lr: 0.002897  loss: 3.3548 (4.0678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8600 (0.9043)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [14]  [ 800/1251]  eta: 0:02:05  lr: 0.002929  min_lr: 0.002929  loss: 4.2196 (4.0818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8551 (0.9024)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [14]  [1000/1251]  eta: 0:01:09  lr: 0.002961  min_lr: 0.002961  loss: 4.1246 (4.0627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9269 (0.8983)  time: 0.2760  data: 0.0005  max mem: 21847
Epoch: [14]  [1200/1251]  eta: 0:00:14  lr: 0.002993  min_lr: 0.002993  loss: 4.3978 (4.0584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7498 (0.8933)  time: 0.2808  data: 0.0004  max mem: 21847
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 3.9577 (4.0583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7065 (0.8861)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [14] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 3.9577 (4.0621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7065 (0.8861)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.1620 (1.1620)  acc1: 79.2000 (79.2000)  acc5: 93.6000 (93.6000)  time: 5.5983  data: 5.4295  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.3025 (1.3728)  acc1: 72.4000 (71.5636)  acc5: 92.4000 (91.3091)  time: 0.7537  data: 0.6181  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.7407 (1.6241)  acc1: 60.8000 (65.6762)  acc5: 84.8000 (87.4095)  time: 0.2231  data: 0.0930  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.8441 (1.6312)  acc1: 59.6000 (65.2960)  acc5: 84.4000 (87.3120)  time: 0.2223  data: 0.0929  max mem: 21847
Test: Total time: 0:00:10 (0.4261 s / it)
* Acc@1 65.636 Acc@5 87.418 loss 1.627
Accuracy of the model on the 50000 test images: 65.6%
Max accuracy: 65.64%
Epoch: [15]  [   0/1251]  eta: 1:10:30  lr: 0.003000  min_lr: 0.003000  loss: 3.1871 (3.1871)  weight_decay: 0.0500 (0.0500)  time: 3.3821  data: 3.1071  max mem: 21847
Epoch: [15]  [ 200/1251]  eta: 0:05:05  lr: 0.003033  min_lr: 0.003033  loss: 4.3832 (4.0518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (0.8201)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [15]  [ 400/1251]  eta: 0:03:59  lr: 0.003065  min_lr: 0.003065  loss: 4.2990 (4.0124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7550 (0.8032)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [15]  [ 600/1251]  eta: 0:03:02  lr: 0.003097  min_lr: 0.003097  loss: 4.6975 (4.0278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.7948)  time: 0.2716  data: 0.0003  max mem: 21847
Epoch: [15]  [ 800/1251]  eta: 0:02:05  lr: 0.003129  min_lr: 0.003129  loss: 4.6208 (4.0082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8989 (0.8239)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [15]  [1000/1251]  eta: 0:01:09  lr: 0.003161  min_lr: 0.003161  loss: 4.3641 (4.0198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6942 (0.8164)  time: 0.2771  data: 0.0004  max mem: 21847
Epoch: [15]  [1200/1251]  eta: 0:00:14  lr: 0.003193  min_lr: 0.003193  loss: 4.0951 (4.0269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8155)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 3.7915 (4.0304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7159 (0.8190)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [15] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 3.7915 (4.0317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7159 (0.8190)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.0810 (1.0810)  acc1: 77.6000 (77.6000)  acc5: 92.8000 (92.8000)  time: 5.6549  data: 5.4863  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.1940 (1.3017)  acc1: 72.0000 (71.7818)  acc5: 92.8000 (92.2909)  time: 0.6805  data: 0.5463  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.6119 (1.5537)  acc1: 61.6000 (66.1905)  acc5: 87.2000 (88.2286)  time: 0.1745  data: 0.0453  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.7511 (1.5653)  acc1: 61.6000 (65.7440)  acc5: 85.6000 (88.2240)  time: 0.1781  data: 0.0498  max mem: 21847
Test: Total time: 0:00:09 (0.3935 s / it)
* Acc@1 66.274 Acc@5 88.004 loss 1.559
Accuracy of the model on the 50000 test images: 66.3%
Max accuracy: 66.27%
Epoch: [16]  [   0/1251]  eta: 1:12:24  lr: 0.003201  min_lr: 0.003201  loss: 4.7001 (4.7001)  weight_decay: 0.0500 (0.0500)  time: 3.4729  data: 3.1968  max mem: 21847
Epoch: [16]  [ 200/1251]  eta: 0:05:06  lr: 0.003233  min_lr: 0.003233  loss: 3.9527 (3.9307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7052 (0.7327)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [16]  [ 400/1251]  eta: 0:04:00  lr: 0.003265  min_lr: 0.003265  loss: 3.7543 (3.9403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7054 (0.7606)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [16]  [ 600/1251]  eta: 0:03:01  lr: 0.003297  min_lr: 0.003297  loss: 3.6780 (3.9687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8972 (0.7873)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [16]  [ 800/1251]  eta: 0:02:05  lr: 0.003329  min_lr: 0.003329  loss: 3.6277 (3.9498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.8067)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [16]  [1000/1251]  eta: 0:01:09  lr: 0.003361  min_lr: 0.003361  loss: 3.8627 (3.9435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7752 (0.7979)  time: 0.2743  data: 0.0005  max mem: 21847
Epoch: [16]  [1200/1251]  eta: 0:00:14  lr: 0.003393  min_lr: 0.003393  loss: 4.4194 (3.9698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.7935)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 4.2121 (3.9711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7391 (0.7933)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [16] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 4.2121 (3.9896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7391 (0.7933)
Test:  [ 0/25]  eta: 0:02:26  loss: 1.1768 (1.1768)  acc1: 80.0000 (80.0000)  acc5: 91.6000 (91.6000)  time: 5.8648  data: 5.7193  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.1995 (1.2953)  acc1: 74.0000 (73.0545)  acc5: 93.2000 (91.6364)  time: 0.7291  data: 0.5982  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.6733 (1.5843)  acc1: 64.8000 (67.0476)  acc5: 87.2000 (87.7143)  time: 0.1715  data: 0.0431  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.7479 (1.5942)  acc1: 64.8000 (67.1040)  acc5: 86.0000 (87.8400)  time: 0.1706  data: 0.0430  max mem: 21847
Test: Total time: 0:00:09 (0.3959 s / it)
* Acc@1 66.856 Acc@5 88.050 loss 1.593
Accuracy of the model on the 50000 test images: 66.9%
Max accuracy: 66.86%
Epoch: [17]  [   0/1251]  eta: 1:02:55  lr: 0.003401  min_lr: 0.003401  loss: 2.8543 (2.8543)  weight_decay: 0.0500 (0.0500)  time: 3.0176  data: 2.6532  max mem: 21847
Epoch: [17]  [ 200/1251]  eta: 0:05:04  lr: 0.003433  min_lr: 0.003433  loss: 3.9052 (3.9018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6486 (0.7518)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [17]  [ 400/1251]  eta: 0:04:00  lr: 0.003465  min_lr: 0.003465  loss: 4.1813 (3.9405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7294)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [17]  [ 600/1251]  eta: 0:03:02  lr: 0.003497  min_lr: 0.003497  loss: 3.8907 (3.9036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6546 (0.7331)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [17]  [ 800/1251]  eta: 0:02:05  lr: 0.003529  min_lr: 0.003529  loss: 4.0872 (3.9171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7315 (0.7454)  time: 0.2789  data: 0.0004  max mem: 21847
Epoch: [17]  [1000/1251]  eta: 0:01:09  lr: 0.003561  min_lr: 0.003561  loss: 3.8107 (3.9365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6614 (0.7412)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [17]  [1200/1251]  eta: 0:00:14  lr: 0.003593  min_lr: 0.003593  loss: 3.7057 (3.9353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7131 (0.7380)  time: 0.2846  data: 0.0004  max mem: 21847
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 4.2590 (3.9363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7131 (0.7380)  time: 0.2280  data: 0.0005  max mem: 21847
Epoch: [17] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 4.2590 (3.9481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7131 (0.7380)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.1221 (1.1221)  acc1: 79.6000 (79.6000)  acc5: 93.6000 (93.6000)  time: 5.5757  data: 5.4081  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.2923 (1.2966)  acc1: 73.6000 (72.2909)  acc5: 92.0000 (91.9273)  time: 0.7356  data: 0.6000  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.6209 (1.5462)  acc1: 63.6000 (66.9905)  acc5: 87.2000 (88.2667)  time: 0.2138  data: 0.0808  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.7351 (1.5528)  acc1: 62.4000 (66.6880)  acc5: 85.6000 (88.0800)  time: 0.2131  data: 0.0807  max mem: 21847
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 67.302 Acc@5 88.556 loss 1.542
Accuracy of the model on the 50000 test images: 67.3%
Max accuracy: 67.30%
Epoch: [18]  [   0/1251]  eta: 1:06:33  lr: 0.003601  min_lr: 0.003601  loss: 3.6784 (3.6784)  weight_decay: 0.0500 (0.0500)  time: 3.1919  data: 2.8826  max mem: 21847
Epoch: [18]  [ 200/1251]  eta: 0:05:03  lr: 0.003633  min_lr: 0.003633  loss: 4.5099 (3.9044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6968 (0.6969)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [18]  [ 400/1251]  eta: 0:04:00  lr: 0.003665  min_lr: 0.003665  loss: 3.9463 (3.9535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (0.6967)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [18]  [ 600/1251]  eta: 0:03:02  lr: 0.003697  min_lr: 0.003697  loss: 3.3954 (3.9504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.7002)  time: 0.2857  data: 0.0005  max mem: 21847
Epoch: [18]  [ 800/1251]  eta: 0:02:05  lr: 0.003729  min_lr: 0.003729  loss: 3.9791 (3.9235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6188 (0.7047)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [18]  [1000/1251]  eta: 0:01:09  lr: 0.003761  min_lr: 0.003761  loss: 3.9728 (3.9133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5900 (0.7023)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [18]  [1200/1251]  eta: 0:00:14  lr: 0.003793  min_lr: 0.003793  loss: 3.8727 (3.9105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6038 (0.7023)  time: 0.2756  data: 0.0005  max mem: 21847
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 3.9262 (3.9079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6631 (0.7010)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [18] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 3.9262 (3.9288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6631 (0.7010)
Test:  [ 0/25]  eta: 0:02:12  loss: 1.1246 (1.1246)  acc1: 79.6000 (79.6000)  acc5: 93.2000 (93.2000)  time: 5.3199  data: 5.1455  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.2219 (1.3081)  acc1: 73.6000 (72.7636)  acc5: 92.8000 (91.9636)  time: 0.7493  data: 0.6121  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.6090 (1.5572)  acc1: 64.4000 (67.9619)  acc5: 88.4000 (88.4571)  time: 0.2293  data: 0.0954  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.6558 (1.5651)  acc1: 64.4000 (67.7440)  acc5: 86.4000 (88.4960)  time: 0.2285  data: 0.0953  max mem: 21847
Test: Total time: 0:00:10 (0.4203 s / it)
* Acc@1 67.552 Acc@5 88.694 loss 1.555
Accuracy of the model on the 50000 test images: 67.6%
Max accuracy: 67.55%
Epoch: [19]  [   0/1251]  eta: 1:09:00  lr: 0.003801  min_lr: 0.003801  loss: 2.8093 (2.8093)  weight_decay: 0.0500 (0.0500)  time: 3.3100  data: 3.0099  max mem: 21847
Epoch: [19]  [ 200/1251]  eta: 0:05:05  lr: 0.003833  min_lr: 0.003833  loss: 3.6855 (3.9388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7171 (0.6711)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [19]  [ 400/1251]  eta: 0:04:01  lr: 0.003865  min_lr: 0.003865  loss: 3.8052 (3.9245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6830 (0.6854)  time: 0.2819  data: 0.0004  max mem: 21847
Epoch: [19]  [ 600/1251]  eta: 0:03:02  lr: 0.003897  min_lr: 0.003897  loss: 3.5224 (3.9018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6631 (0.6865)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [19]  [ 800/1251]  eta: 0:02:05  lr: 0.003929  min_lr: 0.003929  loss: 4.3587 (3.9193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6306 (0.7147)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [19]  [1000/1251]  eta: 0:01:09  lr: 0.003961  min_lr: 0.003961  loss: 3.7797 (3.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5900 (0.6993)  time: 0.2756  data: 0.0004  max mem: 21847
Epoch: [19]  [1200/1251]  eta: 0:00:14  lr: 0.003993  min_lr: 0.003993  loss: 3.6230 (3.9317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6326 (0.6915)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.3792 (3.9336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5645 (0.6882)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [19] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.3792 (3.9144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5645 (0.6882)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.0753 (1.0753)  acc1: 76.8000 (76.8000)  acc5: 94.4000 (94.4000)  time: 5.7968  data: 5.6511  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.1542 (1.2569)  acc1: 75.2000 (73.6364)  acc5: 94.4000 (93.2000)  time: 0.7713  data: 0.6378  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.5648 (1.5178)  acc1: 65.6000 (68.6286)  acc5: 87.6000 (89.3714)  time: 0.2082  data: 0.0767  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.7214 (1.5314)  acc1: 65.6000 (68.4960)  acc5: 86.8000 (89.1200)  time: 0.2070  data: 0.0766  max mem: 21847
Test: Total time: 0:00:10 (0.4220 s / it)
* Acc@1 67.884 Acc@5 89.026 loss 1.540
Accuracy of the model on the 50000 test images: 67.9%
Max accuracy: 67.88%
Epoch: [20]  [   0/1251]  eta: 1:04:22  lr: 0.004000  min_lr: 0.004000  loss: 3.2641 (3.2641)  weight_decay: 0.0500 (0.0500)  time: 3.0875  data: 2.7509  max mem: 21847
Epoch: [20]  [ 200/1251]  eta: 0:05:02  lr: 0.004000  min_lr: 0.004000  loss: 3.8563 (3.8965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6944 (0.7151)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [20]  [ 400/1251]  eta: 0:03:59  lr: 0.004000  min_lr: 0.004000  loss: 4.5200 (3.8809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5692 (0.6729)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [20]  [ 600/1251]  eta: 0:03:02  lr: 0.004000  min_lr: 0.004000  loss: 3.5041 (3.8923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5519 (0.6588)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [20]  [ 800/1251]  eta: 0:02:05  lr: 0.004000  min_lr: 0.004000  loss: 4.4504 (3.9061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6307 (0.6665)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [20]  [1000/1251]  eta: 0:01:09  lr: 0.004000  min_lr: 0.004000  loss: 3.3923 (3.8919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6112 (0.6581)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [20]  [1200/1251]  eta: 0:00:14  lr: 0.004000  min_lr: 0.004000  loss: 3.8758 (3.9043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5874 (0.6529)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.1646 (3.9118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5613 (0.6488)  time: 0.2285  data: 0.0005  max mem: 21847
Epoch: [20] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.1646 (3.8880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5613 (0.6488)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9817 (0.9817)  acc1: 78.0000 (78.0000)  acc5: 94.8000 (94.8000)  time: 5.5140  data: 5.3585  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0917 (1.1420)  acc1: 75.2000 (73.7818)  acc5: 94.4000 (93.2364)  time: 0.7461  data: 0.6119  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4662 (1.4089)  acc1: 66.0000 (69.0095)  acc5: 89.2000 (89.3143)  time: 0.2277  data: 0.0974  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.5783 (1.4165)  acc1: 66.0000 (68.9920)  acc5: 86.0000 (89.2320)  time: 0.2264  data: 0.0973  max mem: 21847
Test: Total time: 0:00:10 (0.4268 s / it)
* Acc@1 68.310 Acc@5 89.208 loss 1.416
Accuracy of the model on the 50000 test images: 68.3%
Max accuracy: 68.31%
Epoch: [21]  [   0/1251]  eta: 1:03:15  lr: 0.004000  min_lr: 0.004000  loss: 3.7167 (3.7167)  weight_decay: 0.0500 (0.0500)  time: 3.0341  data: 2.6692  max mem: 21847
Epoch: [21]  [ 200/1251]  eta: 0:05:02  lr: 0.004000  min_lr: 0.004000  loss: 4.1074 (3.8516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7064 (0.6632)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [21]  [ 400/1251]  eta: 0:03:59  lr: 0.004000  min_lr: 0.004000  loss: 3.4138 (3.8112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5729 (0.6327)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [21]  [ 600/1251]  eta: 0:03:02  lr: 0.004000  min_lr: 0.004000  loss: 3.9240 (3.8394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6178 (0.6514)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [21]  [ 800/1251]  eta: 0:02:05  lr: 0.004000  min_lr: 0.004000  loss: 3.6170 (3.8412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6340 (0.6350)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [21]  [1000/1251]  eta: 0:01:09  lr: 0.004000  min_lr: 0.004000  loss: 4.1193 (3.8397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6103 (0.6233)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [21]  [1200/1251]  eta: 0:00:14  lr: 0.004000  min_lr: 0.004000  loss: 3.9572 (3.8363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5739 (0.6188)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.1522 (3.8381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5810 (0.6197)  time: 0.2280  data: 0.0005  max mem: 21847
Epoch: [21] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.1522 (3.8552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5810 (0.6197)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.9413 (0.9413)  acc1: 80.0000 (80.0000)  acc5: 96.0000 (96.0000)  time: 5.0880  data: 4.9229  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0884 (1.1294)  acc1: 72.8000 (74.9091)  acc5: 94.4000 (93.4546)  time: 0.6723  data: 0.5372  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4233 (1.3916)  acc1: 68.0000 (69.1619)  acc5: 89.6000 (89.6000)  time: 0.1945  data: 0.0645  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.5796 (1.4006)  acc1: 64.8000 (68.9920)  acc5: 86.8000 (89.5040)  time: 0.2069  data: 0.0776  max mem: 21847
Test: Total time: 0:00:09 (0.3941 s / it)
* Acc@1 69.220 Acc@5 89.700 loss 1.394
Accuracy of the model on the 50000 test images: 69.2%
Max accuracy: 69.22%
Epoch: [22]  [   0/1251]  eta: 1:10:07  lr: 0.003999  min_lr: 0.003999  loss: 3.2399 (3.2399)  weight_decay: 0.0500 (0.0500)  time: 3.3631  data: 3.0800  max mem: 21847
Epoch: [22]  [ 200/1251]  eta: 0:05:06  lr: 0.003999  min_lr: 0.003999  loss: 4.4356 (3.9082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5202 (0.5799)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [22]  [ 400/1251]  eta: 0:04:00  lr: 0.003999  min_lr: 0.003999  loss: 4.1243 (3.8517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6209 (0.5874)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [22]  [ 600/1251]  eta: 0:03:02  lr: 0.003999  min_lr: 0.003999  loss: 3.7109 (3.8411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.5996)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [22]  [ 800/1251]  eta: 0:02:05  lr: 0.003999  min_lr: 0.003999  loss: 4.2614 (3.8360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5556 (0.6029)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [22]  [1000/1251]  eta: 0:01:09  lr: 0.003999  min_lr: 0.003999  loss: 3.6912 (3.8393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5732 (0.6191)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [22]  [1200/1251]  eta: 0:00:14  lr: 0.003999  min_lr: 0.003999  loss: 4.2342 (3.8335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6593 (0.6219)  time: 0.2702  data: 0.0004  max mem: 21847
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 3.6825 (3.8322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6831 (0.6219)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [22] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 3.6825 (3.8253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6831 (0.6219)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9812 (0.9812)  acc1: 78.8000 (78.8000)  acc5: 95.2000 (95.2000)  time: 5.7800  data: 5.6291  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.1350 (1.1664)  acc1: 75.6000 (74.6182)  acc5: 94.8000 (94.0727)  time: 0.7314  data: 0.5989  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4229 (1.4219)  acc1: 67.2000 (70.3810)  acc5: 91.6000 (90.1143)  time: 0.1855  data: 0.0565  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.6237 (1.4319)  acc1: 66.4000 (69.9360)  acc5: 87.2000 (90.1920)  time: 0.1927  data: 0.0643  max mem: 21847
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 69.672 Acc@5 89.872 loss 1.436
Accuracy of the model on the 50000 test images: 69.7%
Max accuracy: 69.67%
Epoch: [23]  [   0/1251]  eta: 1:01:14  lr: 0.003999  min_lr: 0.003999  loss: 4.6626 (4.6626)  weight_decay: 0.0500 (0.0500)  time: 2.9375  data: 2.5631  max mem: 21847
Epoch: [23]  [ 200/1251]  eta: 0:05:05  lr: 0.003999  min_lr: 0.003999  loss: 4.2412 (3.8490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6046 (0.6501)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [23]  [ 400/1251]  eta: 0:04:00  lr: 0.003999  min_lr: 0.003999  loss: 3.5638 (3.8383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5680 (0.6237)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [23]  [ 600/1251]  eta: 0:03:01  lr: 0.003998  min_lr: 0.003998  loss: 4.0594 (3.8220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5984 (0.6224)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [23]  [ 800/1251]  eta: 0:02:05  lr: 0.003998  min_lr: 0.003998  loss: 3.6556 (3.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6337 (0.6131)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [23]  [1000/1251]  eta: 0:01:09  lr: 0.003998  min_lr: 0.003998  loss: 3.3221 (3.7872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6509 (0.6148)  time: 0.2811  data: 0.0004  max mem: 21847
Epoch: [23]  [1200/1251]  eta: 0:00:14  lr: 0.003998  min_lr: 0.003998  loss: 3.5633 (3.7940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6634 (0.6228)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 3.2943 (3.7955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6230 (0.6229)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [23] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 3.2943 (3.7962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6230 (0.6229)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9404 (0.9404)  acc1: 83.6000 (83.6000)  acc5: 95.2000 (95.2000)  time: 5.5639  data: 5.3816  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.1119 (1.1711)  acc1: 73.2000 (74.2909)  acc5: 94.4000 (93.3455)  time: 0.7124  data: 0.5763  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4393 (1.3984)  acc1: 65.2000 (69.5238)  acc5: 89.2000 (90.0762)  time: 0.1971  data: 0.0653  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4815 (1.3956)  acc1: 67.2000 (69.6000)  acc5: 88.0000 (90.1440)  time: 0.2173  data: 0.0859  max mem: 21847
Test: Total time: 0:00:10 (0.4203 s / it)
* Acc@1 69.846 Acc@5 90.008 loss 1.398
Accuracy of the model on the 50000 test images: 69.8%
Max accuracy: 69.85%
Epoch: [24]  [   0/1251]  eta: 1:03:12  lr: 0.003998  min_lr: 0.003998  loss: 4.6681 (4.6681)  weight_decay: 0.0500 (0.0500)  time: 3.0319  data: 2.6993  max mem: 21847
Epoch: [24]  [ 200/1251]  eta: 0:05:03  lr: 0.003998  min_lr: 0.003998  loss: 3.6051 (3.8093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5779 (0.5804)  time: 0.2857  data: 0.0004  max mem: 21847
Epoch: [24]  [ 400/1251]  eta: 0:03:59  lr: 0.003998  min_lr: 0.003998  loss: 4.1926 (3.8273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6023 (0.6192)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [24]  [ 600/1251]  eta: 0:03:01  lr: 0.003997  min_lr: 0.003997  loss: 3.5138 (3.8276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5755 (0.5960)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [24]  [ 800/1251]  eta: 0:02:05  lr: 0.003997  min_lr: 0.003997  loss: 3.8925 (3.8093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.5911)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [24]  [1000/1251]  eta: 0:01:09  lr: 0.003997  min_lr: 0.003997  loss: 3.4997 (3.7826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4594 (0.5889)  time: 0.2719  data: 0.0005  max mem: 21847
Epoch: [24]  [1200/1251]  eta: 0:00:14  lr: 0.003997  min_lr: 0.003997  loss: 3.6094 (3.7821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5382 (0.5884)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 3.8753 (3.7812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4790 (0.5858)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [24] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 3.8753 (3.7745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4790 (0.5858)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.9979 (0.9979)  acc1: 79.2000 (79.2000)  acc5: 94.4000 (94.4000)  time: 5.4769  data: 5.3091  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.2096 (1.1908)  acc1: 74.4000 (74.8364)  acc5: 94.4000 (93.6000)  time: 0.6988  data: 0.5627  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4691 (1.4204)  acc1: 67.2000 (70.1524)  acc5: 90.0000 (90.4191)  time: 0.1919  data: 0.0618  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.5300 (1.4267)  acc1: 67.2000 (70.0640)  acc5: 87.2000 (90.3360)  time: 0.1976  data: 0.0694  max mem: 21847
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 70.258 Acc@5 90.448 loss 1.414
Accuracy of the model on the 50000 test images: 70.3%
Max accuracy: 70.26%
Epoch: [25]  [   0/1251]  eta: 1:09:41  lr: 0.003997  min_lr: 0.003997  loss: 2.5155 (2.5155)  weight_decay: 0.0500 (0.0500)  time: 3.3424  data: 3.0581  max mem: 21847
Epoch: [25]  [ 200/1251]  eta: 0:05:06  lr: 0.003997  min_lr: 0.003997  loss: 4.0702 (3.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6512 (0.6082)  time: 0.2791  data: 0.0004  max mem: 21847
Epoch: [25]  [ 400/1251]  eta: 0:04:00  lr: 0.003996  min_lr: 0.003996  loss: 3.0724 (3.7365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5358 (0.6089)  time: 0.2841  data: 0.0004  max mem: 21847
Epoch: [25]  [ 600/1251]  eta: 0:03:02  lr: 0.003996  min_lr: 0.003996  loss: 4.0603 (3.7538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6214 (0.6070)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [25]  [ 800/1251]  eta: 0:02:05  lr: 0.003996  min_lr: 0.003996  loss: 3.6537 (3.7650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6306 (0.6099)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [25]  [1000/1251]  eta: 0:01:09  lr: 0.003996  min_lr: 0.003996  loss: 3.9108 (3.7565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5628 (0.6052)  time: 0.2752  data: 0.0005  max mem: 21847
Epoch: [25]  [1200/1251]  eta: 0:00:14  lr: 0.003996  min_lr: 0.003996  loss: 3.0979 (3.7476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6943 (0.6090)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 3.7070 (3.7441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5681 (0.6058)  time: 0.2380  data: 0.0006  max mem: 21847
Epoch: [25] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 3.7070 (3.7584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5681 (0.6058)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.0300 (1.0300)  acc1: 79.2000 (79.2000)  acc5: 94.0000 (94.0000)  time: 5.7647  data: 5.6141  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0300 (1.1356)  acc1: 76.8000 (75.4182)  acc5: 94.0000 (93.9273)  time: 0.7458  data: 0.6143  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4396 (1.3658)  acc1: 67.6000 (70.6095)  acc5: 89.6000 (90.6095)  time: 0.1983  data: 0.0698  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.5032 (1.3737)  acc1: 67.2000 (70.3200)  acc5: 88.8000 (90.5440)  time: 0.1972  data: 0.0697  max mem: 21847
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 70.410 Acc@5 90.342 loss 1.369
Accuracy of the model on the 50000 test images: 70.4%
Max accuracy: 70.41%
Epoch: [26]  [   0/1251]  eta: 1:07:11  lr: 0.003995  min_lr: 0.003995  loss: 2.8165 (2.8165)  weight_decay: 0.0500 (0.0500)  time: 3.2223  data: 2.8899  max mem: 21847
Epoch: [26]  [ 200/1251]  eta: 0:05:03  lr: 0.003995  min_lr: 0.003995  loss: 3.8717 (3.7497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5593 (0.5935)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [26]  [ 400/1251]  eta: 0:04:00  lr: 0.003995  min_lr: 0.003995  loss: 3.3223 (3.7197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4856 (0.5670)  time: 0.2721  data: 0.0005  max mem: 21847
Epoch: [26]  [ 600/1251]  eta: 0:03:01  lr: 0.003995  min_lr: 0.003995  loss: 3.8978 (3.7444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (0.6019)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [26]  [ 800/1251]  eta: 0:02:05  lr: 0.003994  min_lr: 0.003994  loss: 3.7771 (3.7214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6286 (0.6039)  time: 0.2835  data: 0.0004  max mem: 21847
Epoch: [26]  [1000/1251]  eta: 0:01:09  lr: 0.003994  min_lr: 0.003994  loss: 3.8820 (3.7263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5465 (0.6148)  time: 0.2750  data: 0.0005  max mem: 21847
Epoch: [26]  [1200/1251]  eta: 0:00:14  lr: 0.003994  min_lr: 0.003994  loss: 3.7189 (3.7303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5089 (0.6028)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 3.9986 (3.7302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5486 (0.6040)  time: 0.2283  data: 0.0006  max mem: 21847
Epoch: [26] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 3.9986 (3.7289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5486 (0.6040)
Test:  [ 0/25]  eta: 0:01:49  loss: 1.0137 (1.0137)  acc1: 79.2000 (79.2000)  acc5: 94.0000 (94.0000)  time: 4.3961  data: 4.2445  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 1.0786 (1.1409)  acc1: 77.2000 (75.7455)  acc5: 94.0000 (93.7091)  time: 0.6436  data: 0.5110  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4376 (1.3848)  acc1: 68.0000 (70.8762)  acc5: 90.0000 (90.2095)  time: 0.2230  data: 0.0940  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4519 (1.3819)  acc1: 68.8000 (70.9120)  acc5: 88.4000 (90.3200)  time: 0.1981  data: 0.0688  max mem: 21847
Test: Total time: 0:00:09 (0.3935 s / it)
* Acc@1 70.656 Acc@5 90.576 loss 1.375
Accuracy of the model on the 50000 test images: 70.7%
Max accuracy: 70.66%
Epoch: [27]  [   0/1251]  eta: 1:00:03  lr: 0.003994  min_lr: 0.003994  loss: 3.2359 (3.2359)  weight_decay: 0.0500 (0.0500)  time: 2.8809  data: 2.4813  max mem: 21847
Epoch: [27]  [ 200/1251]  eta: 0:05:05  lr: 0.003994  min_lr: 0.003994  loss: 3.1617 (3.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4265 (0.5489)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [27]  [ 400/1251]  eta: 0:04:00  lr: 0.003993  min_lr: 0.003993  loss: 3.9052 (3.7324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5482 (0.5749)  time: 0.2836  data: 0.0004  max mem: 21847
Epoch: [27]  [ 600/1251]  eta: 0:03:02  lr: 0.003993  min_lr: 0.003993  loss: 3.4839 (3.7321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5001 (0.5755)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [27]  [ 800/1251]  eta: 0:02:05  lr: 0.003993  min_lr: 0.003993  loss: 3.9803 (3.7276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5467 (0.5934)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [27]  [1000/1251]  eta: 0:01:09  lr: 0.003992  min_lr: 0.003992  loss: 4.2499 (3.7328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4734 (0.5875)  time: 0.2811  data: 0.0005  max mem: 21847
Epoch: [27]  [1200/1251]  eta: 0:00:14  lr: 0.003992  min_lr: 0.003992  loss: 3.8960 (3.7342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5121 (0.5914)  time: 0.2785  data: 0.0004  max mem: 21847
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 4.2733 (3.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5019 (0.5909)  time: 0.2284  data: 0.0007  max mem: 21847
Epoch: [27] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 4.2733 (3.7087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5019 (0.5909)
Test:  [ 0/25]  eta: 0:01:50  loss: 0.8908 (0.8908)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 4.4254  data: 4.2459  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.1300 (1.1639)  acc1: 75.6000 (75.4909)  acc5: 94.8000 (94.0364)  time: 0.7010  data: 0.5675  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.4330 (1.3725)  acc1: 68.8000 (71.1810)  acc5: 90.4000 (90.8952)  time: 0.2454  data: 0.1172  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.5104 (1.3804)  acc1: 69.2000 (71.1040)  acc5: 88.0000 (90.7520)  time: 0.2127  data: 0.0852  max mem: 21847
Test: Total time: 0:00:09 (0.3998 s / it)
* Acc@1 71.092 Acc@5 90.912 loss 1.374
Accuracy of the model on the 50000 test images: 71.1%
Max accuracy: 71.09%
Epoch: [28]  [   0/1251]  eta: 0:50:31  lr: 0.003992  min_lr: 0.003992  loss: 3.8581 (3.8581)  weight_decay: 0.0500 (0.0500)  time: 2.4232  data: 1.8795  max mem: 21847
Epoch: [28]  [ 200/1251]  eta: 0:05:00  lr: 0.003992  min_lr: 0.003992  loss: 3.8517 (3.7093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5207 (0.5792)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [28]  [ 400/1251]  eta: 0:03:59  lr: 0.003991  min_lr: 0.003991  loss: 3.7958 (3.7165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5615 (0.5800)  time: 0.2789  data: 0.0004  max mem: 21847
Epoch: [28]  [ 600/1251]  eta: 0:03:01  lr: 0.003991  min_lr: 0.003991  loss: 3.9869 (3.7006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5738 (0.5897)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [28]  [ 800/1251]  eta: 0:02:05  lr: 0.003991  min_lr: 0.003991  loss: 3.5661 (3.6868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5582 (0.5804)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [28]  [1000/1251]  eta: 0:01:09  lr: 0.003990  min_lr: 0.003990  loss: 3.5008 (3.6668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5677 (0.5769)  time: 0.2898  data: 0.0004  max mem: 21847
Epoch: [28]  [1200/1251]  eta: 0:00:14  lr: 0.003990  min_lr: 0.003990  loss: 4.2374 (3.6572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5408 (0.5817)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 3.4112 (3.6566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5488 (0.5806)  time: 0.2283  data: 0.0005  max mem: 21847
Epoch: [28] Total time: 0:05:45 (0.2761 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 3.4112 (3.6940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5488 (0.5806)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.9917 (0.9917)  acc1: 80.0000 (80.0000)  acc5: 95.6000 (95.6000)  time: 5.8350  data: 5.6846  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0876 (1.0906)  acc1: 75.2000 (75.5273)  acc5: 94.0000 (93.8909)  time: 0.6995  data: 0.5649  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3250 (1.3333)  acc1: 68.0000 (71.0857)  acc5: 89.6000 (90.6857)  time: 0.2016  data: 0.0714  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.5314 (1.3487)  acc1: 67.6000 (70.7520)  acc5: 88.4000 (90.5760)  time: 0.1996  data: 0.0713  max mem: 21847
Test: Total time: 0:00:10 (0.4188 s / it)
* Acc@1 71.270 Acc@5 90.902 loss 1.336
Accuracy of the model on the 50000 test images: 71.3%
Max accuracy: 71.27%
Epoch: [29]  [   0/1251]  eta: 1:05:07  lr: 0.003990  min_lr: 0.003990  loss: 4.1425 (4.1425)  weight_decay: 0.0500 (0.0500)  time: 3.1235  data: 2.8104  max mem: 21847
Epoch: [29]  [ 200/1251]  eta: 0:05:04  lr: 0.003989  min_lr: 0.003989  loss: 3.6572 (3.5820)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [29]  [ 400/1251]  eta: 0:03:59  lr: 0.003989  min_lr: 0.003989  loss: 3.1766 (3.6163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5185 (nan)  time: 0.2817  data: 0.0004  max mem: 21847
Epoch: [29]  [ 600/1251]  eta: 0:03:02  lr: 0.003989  min_lr: 0.003989  loss: 3.4842 (3.6201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4419 (nan)  time: 0.2734  data: 0.0005  max mem: 21847
Epoch: [29]  [ 800/1251]  eta: 0:02:05  lr: 0.003988  min_lr: 0.003988  loss: 3.7294 (3.6226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6131 (nan)  time: 0.2753  data: 0.0005  max mem: 21847
Epoch: [29]  [1000/1251]  eta: 0:01:09  lr: 0.003988  min_lr: 0.003988  loss: 3.5898 (3.6278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6250 (nan)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [29]  [1200/1251]  eta: 0:00:14  lr: 0.003988  min_lr: 0.003988  loss: 3.2084 (3.6404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5591 (nan)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 3.7288 (3.6417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4551 (nan)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [29] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 3.7288 (3.6651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4551 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9424 (0.9424)  acc1: 80.8000 (80.8000)  acc5: 94.8000 (94.8000)  time: 5.6655  data: 5.5148  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0798 (1.1129)  acc1: 78.4000 (75.7091)  acc5: 94.8000 (94.4000)  time: 0.7478  data: 0.6152  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3799 (1.3388)  acc1: 67.6000 (71.0476)  acc5: 89.6000 (91.2191)  time: 0.2018  data: 0.0727  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4843 (1.3496)  acc1: 68.0000 (70.8640)  acc5: 89.6000 (91.0720)  time: 0.2008  data: 0.0726  max mem: 21847
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 71.396 Acc@5 91.094 loss 1.338
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.40%
Epoch: [30]  [   0/1251]  eta: 1:05:11  lr: 0.003987  min_lr: 0.003987  loss: 3.3360 (3.3360)  weight_decay: 0.0500 (0.0500)  time: 3.1270  data: 2.8468  max mem: 21847
Epoch: [30]  [ 200/1251]  eta: 0:05:03  lr: 0.003987  min_lr: 0.003987  loss: 3.5189 (3.6715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6255 (0.6507)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [30]  [ 400/1251]  eta: 0:03:59  lr: 0.003987  min_lr: 0.003987  loss: 3.9592 (3.6629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6318 (0.6147)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [30]  [ 600/1251]  eta: 0:03:02  lr: 0.003986  min_lr: 0.003986  loss: 3.2795 (3.6628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6091 (0.5968)  time: 0.2830  data: 0.0004  max mem: 21847
Epoch: [30]  [ 800/1251]  eta: 0:02:05  lr: 0.003986  min_lr: 0.003986  loss: 3.7723 (3.6553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7102 (0.6196)  time: 0.2841  data: 0.0005  max mem: 21847
Epoch: [30]  [1000/1251]  eta: 0:01:09  lr: 0.003985  min_lr: 0.003985  loss: 3.3111 (3.6651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4967 (0.6044)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [30]  [1200/1251]  eta: 0:00:14  lr: 0.003985  min_lr: 0.003985  loss: 4.0023 (3.6533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5442 (0.5975)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.3462 (3.6483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5375 (0.5996)  time: 0.2285  data: 0.0005  max mem: 21847
Epoch: [30] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.3462 (3.6637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5375 (0.5996)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8968 (0.8968)  acc1: 82.4000 (82.4000)  acc5: 97.6000 (97.6000)  time: 5.5533  data: 5.4000  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0580 (1.0849)  acc1: 77.6000 (76.9455)  acc5: 95.2000 (94.7273)  time: 0.7295  data: 0.5951  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3678 (1.3103)  acc1: 68.4000 (72.0762)  acc5: 92.4000 (91.5810)  time: 0.2016  data: 0.0717  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4991 (1.3233)  acc1: 67.6000 (71.4880)  acc5: 90.4000 (91.4720)  time: 0.1997  data: 0.0716  max mem: 21847
Test: Total time: 0:00:10 (0.4072 s / it)
* Acc@1 71.892 Acc@5 91.390 loss 1.311
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.89%
Epoch: [31]  [   0/1251]  eta: 1:07:07  lr: 0.003985  min_lr: 0.003985  loss: 3.9154 (3.9154)  weight_decay: 0.0500 (0.0500)  time: 3.2192  data: 2.8770  max mem: 21847
Epoch: [31]  [ 200/1251]  eta: 0:05:03  lr: 0.003984  min_lr: 0.003984  loss: 3.5607 (3.7519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5819 (0.6229)  time: 0.2782  data: 0.0004  max mem: 21847
Epoch: [31]  [ 400/1251]  eta: 0:03:59  lr: 0.003984  min_lr: 0.003984  loss: 3.4975 (3.6687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5651 (0.5966)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [31]  [ 600/1251]  eta: 0:03:01  lr: 0.003983  min_lr: 0.003983  loss: 3.3798 (3.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6232 (0.5913)  time: 0.2716  data: 0.0003  max mem: 21847
Epoch: [31]  [ 800/1251]  eta: 0:02:05  lr: 0.003983  min_lr: 0.003983  loss: 3.7469 (3.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5133 (0.5833)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [31]  [1000/1251]  eta: 0:01:09  lr: 0.003982  min_lr: 0.003982  loss: 3.5301 (3.6533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5353 (0.5866)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [31]  [1200/1251]  eta: 0:00:14  lr: 0.003982  min_lr: 0.003982  loss: 3.2969 (3.6578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5739 (0.5957)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 3.0449 (3.6491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5739 (0.5962)  time: 0.2334  data: 0.0005  max mem: 21847
Epoch: [31] Total time: 0:05:45 (0.2765 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 3.0449 (3.6328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5739 (0.5962)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8500 (0.8500)  acc1: 81.2000 (81.2000)  acc5: 95.6000 (95.6000)  time: 5.6041  data: 5.4384  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9914 (1.0517)  acc1: 78.0000 (76.0727)  acc5: 94.8000 (94.4364)  time: 0.7101  data: 0.5748  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2565 (1.2644)  acc1: 69.2000 (71.6191)  acc5: 91.2000 (91.5429)  time: 0.1936  data: 0.0638  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4050 (1.2680)  acc1: 69.2000 (71.4720)  acc5: 89.6000 (91.5200)  time: 0.1959  data: 0.0668  max mem: 21847
Test: Total time: 0:00:10 (0.4051 s / it)
* Acc@1 72.192 Acc@5 91.406 loss 1.254
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.19%
Epoch: [32]  [   0/1251]  eta: 1:08:07  lr: 0.003982  min_lr: 0.003982  loss: 4.0456 (4.0456)  weight_decay: 0.0500 (0.0500)  time: 3.2673  data: 2.9395  max mem: 21847
Epoch: [32]  [ 200/1251]  eta: 0:05:02  lr: 0.003981  min_lr: 0.003981  loss: 3.3368 (3.4926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5890 (0.6554)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [32]  [ 400/1251]  eta: 0:03:59  lr: 0.003981  min_lr: 0.003981  loss: 3.2763 (3.5307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4875 (0.6259)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [32]  [ 600/1251]  eta: 0:03:02  lr: 0.003980  min_lr: 0.003980  loss: 3.4765 (3.5264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.6337)  time: 0.2740  data: 0.0005  max mem: 21847
Epoch: [32]  [ 800/1251]  eta: 0:02:05  lr: 0.003980  min_lr: 0.003980  loss: 3.6054 (3.5538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4678 (0.6214)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [32]  [1000/1251]  eta: 0:01:09  lr: 0.003979  min_lr: 0.003979  loss: 3.4510 (3.5592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6237 (0.6289)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [32]  [1200/1251]  eta: 0:00:14  lr: 0.003979  min_lr: 0.003979  loss: 3.0192 (3.5607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5367 (0.6288)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 3.6066 (3.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6518 (0.6296)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [32] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 3.6066 (3.6171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6518 (0.6296)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8347 (0.8347)  acc1: 82.0000 (82.0000)  acc5: 95.2000 (95.2000)  time: 5.4363  data: 5.2891  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0455 (1.0310)  acc1: 78.4000 (77.4909)  acc5: 94.8000 (94.3636)  time: 0.7426  data: 0.6058  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2942 (1.2537)  acc1: 70.4000 (72.7238)  acc5: 91.6000 (91.5429)  time: 0.2146  data: 0.0815  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3313 (1.2598)  acc1: 71.2000 (72.5760)  acc5: 91.2000 (91.5360)  time: 0.2127  data: 0.0814  max mem: 21847
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 72.406 Acc@5 91.632 loss 1.254
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.41%
Epoch: [33]  [   0/1251]  eta: 1:03:15  lr: 0.003979  min_lr: 0.003979  loss: 3.9474 (3.9474)  weight_decay: 0.0500 (0.0500)  time: 3.0344  data: 2.6977  max mem: 21847
Epoch: [33]  [ 200/1251]  eta: 0:05:01  lr: 0.003978  min_lr: 0.003978  loss: 3.7274 (3.7053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5675 (0.5987)  time: 0.2715  data: 0.0003  max mem: 21847
Epoch: [33]  [ 400/1251]  eta: 0:03:59  lr: 0.003978  min_lr: 0.003978  loss: 3.7916 (3.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5843 (0.6041)  time: 0.2916  data: 0.0004  max mem: 21847
Epoch: [33]  [ 600/1251]  eta: 0:03:01  lr: 0.003977  min_lr: 0.003977  loss: 3.8803 (3.6461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6056 (0.6058)  time: 0.2818  data: 0.0003  max mem: 21847
Epoch: [33]  [ 800/1251]  eta: 0:02:05  lr: 0.003977  min_lr: 0.003977  loss: 3.7739 (3.6473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5133 (0.6039)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [33]  [1000/1251]  eta: 0:01:09  lr: 0.003976  min_lr: 0.003976  loss: 3.0663 (3.6401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5107 (0.5984)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [33]  [1200/1251]  eta: 0:00:14  lr: 0.003976  min_lr: 0.003976  loss: 3.6066 (3.6499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.6105)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 2.9655 (3.6443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5841 (0.6112)  time: 0.2283  data: 0.0005  max mem: 21847
Epoch: [33] Total time: 0:05:45 (0.2765 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 2.9655 (3.6165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5841 (0.6112)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7607 (0.7607)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 5.5431  data: 5.3869  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9672 (0.9994)  acc1: 79.2000 (76.9455)  acc5: 94.8000 (94.5455)  time: 0.7127  data: 0.5776  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2770 (1.2171)  acc1: 69.6000 (72.3429)  acc5: 90.0000 (92.0000)  time: 0.2038  data: 0.0723  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3472 (1.2237)  acc1: 70.8000 (72.0160)  acc5: 89.6000 (91.8400)  time: 0.2034  data: 0.0723  max mem: 21847
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 72.742 Acc@5 91.684 loss 1.210
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.74%
Epoch: [34]  [   0/1251]  eta: 1:04:50  lr: 0.003975  min_lr: 0.003975  loss: 4.0337 (4.0337)  weight_decay: 0.0500 (0.0500)  time: 3.1097  data: 2.7670  max mem: 21847
Epoch: [34]  [ 200/1251]  eta: 0:05:04  lr: 0.003975  min_lr: 0.003975  loss: 3.1375 (3.5445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4804 (0.5160)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [34]  [ 400/1251]  eta: 0:04:00  lr: 0.003974  min_lr: 0.003974  loss: 3.9598 (3.5571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5379 (0.5595)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [34]  [ 600/1251]  eta: 0:03:02  lr: 0.003974  min_lr: 0.003974  loss: 3.6589 (3.5820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.5839)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [34]  [ 800/1251]  eta: 0:02:05  lr: 0.003973  min_lr: 0.003973  loss: 3.7571 (3.5926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5904 (0.6018)  time: 0.2709  data: 0.0003  max mem: 21847
Epoch: [34]  [1000/1251]  eta: 0:01:09  lr: 0.003972  min_lr: 0.003972  loss: 3.8712 (3.6026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4854 (0.5965)  time: 0.2792  data: 0.0004  max mem: 21847
Epoch: [34]  [1200/1251]  eta: 0:00:14  lr: 0.003972  min_lr: 0.003972  loss: 3.9268 (3.6060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5267 (0.5989)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 3.6325 (3.6017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5234 (0.5990)  time: 0.2373  data: 0.0007  max mem: 21847
Epoch: [34] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 3.6325 (3.6025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5234 (0.5990)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7968 (0.7968)  acc1: 83.2000 (83.2000)  acc5: 96.0000 (96.0000)  time: 5.8141  data: 5.6476  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9848 (1.0188)  acc1: 76.8000 (77.6364)  acc5: 94.8000 (94.3273)  time: 0.7397  data: 0.6046  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3070 (1.2388)  acc1: 70.4000 (72.8000)  acc5: 90.8000 (91.4476)  time: 0.1962  data: 0.0651  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4030 (1.2448)  acc1: 69.2000 (72.4960)  acc5: 90.4000 (91.4880)  time: 0.1943  data: 0.0650  max mem: 21847
Test: Total time: 0:00:10 (0.4137 s / it)
* Acc@1 72.914 Acc@5 91.776 loss 1.231
Accuracy of the model on the 50000 test images: 72.9%
Max accuracy: 72.91%
Epoch: [35]  [   0/1251]  eta: 1:06:16  lr: 0.003972  min_lr: 0.003972  loss: 4.1789 (4.1789)  weight_decay: 0.0500 (0.0500)  time: 3.1785  data: 2.8677  max mem: 21847
Epoch: [35]  [ 200/1251]  eta: 0:05:02  lr: 0.003971  min_lr: 0.003971  loss: 3.7169 (3.5643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5706 (0.5823)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [35]  [ 400/1251]  eta: 0:03:59  lr: 0.003971  min_lr: 0.003971  loss: 3.8403 (3.5977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5823 (0.6349)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [35]  [ 600/1251]  eta: 0:03:01  lr: 0.003970  min_lr: 0.003970  loss: 3.9870 (3.5799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5050 (0.6239)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [35]  [ 800/1251]  eta: 0:02:05  lr: 0.003969  min_lr: 0.003969  loss: 3.1744 (3.5680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5816 (0.6402)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [35]  [1000/1251]  eta: 0:01:09  lr: 0.003969  min_lr: 0.003969  loss: 3.0915 (3.5709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6050 (0.6398)  time: 0.2732  data: 0.0005  max mem: 21847
Epoch: [35]  [1200/1251]  eta: 0:00:14  lr: 0.003968  min_lr: 0.003968  loss: 3.5380 (3.5830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4790 (0.6367)  time: 0.2812  data: 0.0005  max mem: 21847
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 3.6823 (3.5829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5215 (0.6369)  time: 0.2284  data: 0.0006  max mem: 21847
Epoch: [35] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 3.6823 (3.5766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5215 (0.6369)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8548 (0.8548)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.7229  data: 5.5722  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0930 (1.0750)  acc1: 77.6000 (78.2182)  acc5: 95.2000 (95.0909)  time: 0.6975  data: 0.5657  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3587 (1.2947)  acc1: 70.8000 (72.7619)  acc5: 91.2000 (92.0000)  time: 0.1766  data: 0.0479  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.4407 (1.3031)  acc1: 69.6000 (72.5120)  acc5: 90.4000 (91.9840)  time: 0.1754  data: 0.0478  max mem: 21847
Test: Total time: 0:00:09 (0.3940 s / it)
* Acc@1 72.712 Acc@5 91.762 loss 1.295
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.91%
Epoch: [36]  [   0/1251]  eta: 1:13:52  lr: 0.003968  min_lr: 0.003968  loss: 3.8677 (3.8677)  weight_decay: 0.0500 (0.0500)  time: 3.5432  data: 2.3742  max mem: 21847
Epoch: [36]  [ 200/1251]  eta: 0:05:07  lr: 0.003967  min_lr: 0.003967  loss: 3.4924 (3.5737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.6784)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [36]  [ 400/1251]  eta: 0:04:01  lr: 0.003967  min_lr: 0.003967  loss: 3.7390 (3.5840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7313 (0.6847)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [36]  [ 600/1251]  eta: 0:03:02  lr: 0.003966  min_lr: 0.003966  loss: 3.6244 (3.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5185 (0.6539)  time: 0.2785  data: 0.0004  max mem: 21847
Epoch: [36]  [ 800/1251]  eta: 0:02:06  lr: 0.003965  min_lr: 0.003965  loss: 3.8655 (3.5703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5697 (0.6357)  time: 0.2722  data: 0.0005  max mem: 21847
Epoch: [36]  [1000/1251]  eta: 0:01:09  lr: 0.003965  min_lr: 0.003965  loss: 2.7692 (3.5642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7298 (0.6548)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [36]  [1200/1251]  eta: 0:00:14  lr: 0.003964  min_lr: 0.003964  loss: 2.7562 (3.5691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5624 (0.6501)  time: 0.2796  data: 0.0005  max mem: 21847
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 3.8779 (3.5712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5462 (0.6460)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [36] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 3.8779 (3.5776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5462 (0.6460)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8114 (0.8114)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.5292  data: 5.3524  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0229 (1.0481)  acc1: 79.2000 (77.4909)  acc5: 94.8000 (94.7636)  time: 0.7265  data: 0.5926  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3474 (1.2808)  acc1: 69.6000 (72.8762)  acc5: 90.8000 (91.7333)  time: 0.2063  data: 0.0778  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3522 (1.2828)  acc1: 69.6000 (72.6720)  acc5: 90.4000 (91.7760)  time: 0.2099  data: 0.0818  max mem: 21847
Test: Total time: 0:00:10 (0.4136 s / it)
* Acc@1 73.018 Acc@5 91.962 loss 1.275
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.02%
Epoch: [37]  [   0/1251]  eta: 1:07:50  lr: 0.003964  min_lr: 0.003964  loss: 3.7393 (3.7393)  weight_decay: 0.0500 (0.0500)  time: 3.2536  data: 2.9657  max mem: 21847
Epoch: [37]  [ 200/1251]  eta: 0:05:04  lr: 0.003963  min_lr: 0.003963  loss: 3.7702 (3.6278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5372 (nan)  time: 0.2873  data: 0.0004  max mem: 21847
Epoch: [37]  [ 400/1251]  eta: 0:04:00  lr: 0.003962  min_lr: 0.003962  loss: 3.6284 (3.6448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5483 (nan)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [37]  [ 600/1251]  eta: 0:03:01  lr: 0.003962  min_lr: 0.003962  loss: 3.9633 (3.6039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5954 (nan)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [37]  [ 800/1251]  eta: 0:02:05  lr: 0.003961  min_lr: 0.003961  loss: 4.1509 (3.6078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5706 (nan)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [37]  [1000/1251]  eta: 0:01:09  lr: 0.003960  min_lr: 0.003960  loss: 3.9647 (3.5956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5648 (nan)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [37]  [1200/1251]  eta: 0:00:14  lr: 0.003960  min_lr: 0.003960  loss: 3.5214 (3.5958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7509 (nan)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 3.2837 (3.5959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6569 (nan)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [37] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 3.2837 (3.5805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6569 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8642 (0.8642)  acc1: 82.0000 (82.0000)  acc5: 97.6000 (97.6000)  time: 5.6263  data: 5.4762  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0614 (1.0576)  acc1: 78.8000 (77.7818)  acc5: 95.2000 (94.7273)  time: 0.7213  data: 0.5888  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.3089 (1.2763)  acc1: 71.2000 (73.0476)  acc5: 91.2000 (92.0952)  time: 0.1900  data: 0.0609  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3336 (1.2845)  acc1: 68.4000 (72.7680)  acc5: 90.4000 (92.0480)  time: 0.1918  data: 0.0608  max mem: 21847
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 73.248 Acc@5 92.002 loss 1.280
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.25%
Epoch: [38]  [   0/1251]  eta: 0:57:36  lr: 0.003959  min_lr: 0.003959  loss: 3.8650 (3.8650)  weight_decay: 0.0500 (0.0500)  time: 2.7632  data: 2.4312  max mem: 21847
Epoch: [38]  [ 200/1251]  eta: 0:05:01  lr: 0.003959  min_lr: 0.003959  loss: 3.8265 (3.5771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6806 (0.6593)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [38]  [ 400/1251]  eta: 0:03:58  lr: 0.003958  min_lr: 0.003958  loss: 3.5157 (3.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6169 (0.6760)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [38]  [ 600/1251]  eta: 0:03:01  lr: 0.003957  min_lr: 0.003957  loss: 3.2582 (3.5509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6082 (0.6912)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [38]  [ 800/1251]  eta: 0:02:05  lr: 0.003956  min_lr: 0.003956  loss: 3.7903 (3.5539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6038 (0.6700)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [38]  [1000/1251]  eta: 0:01:09  lr: 0.003956  min_lr: 0.003956  loss: 4.0390 (3.5495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6750 (0.6709)  time: 0.2756  data: 0.0004  max mem: 21847
Epoch: [38]  [1200/1251]  eta: 0:00:14  lr: 0.003955  min_lr: 0.003955  loss: 3.5881 (3.5598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6514 (0.6779)  time: 0.2712  data: 0.0005  max mem: 21847
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 3.6564 (3.5599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7080 (0.6761)  time: 0.2363  data: 0.0006  max mem: 21847
Epoch: [38] Total time: 0:05:45 (0.2762 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 3.6564 (3.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7080 (0.6761)
Test:  [ 0/25]  eta: 0:01:38  loss: 0.8192 (0.8192)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 3.9250  data: 3.7491  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9827 (1.0415)  acc1: 79.2000 (77.8545)  acc5: 95.2000 (94.8727)  time: 0.6356  data: 0.4969  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2916 (1.2507)  acc1: 71.2000 (73.1429)  acc5: 91.2000 (92.2286)  time: 0.2711  data: 0.1392  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3173 (1.2606)  acc1: 70.4000 (72.7840)  acc5: 90.8000 (92.1760)  time: 0.2286  data: 0.0988  max mem: 21847
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 73.346 Acc@5 92.100 loss 1.254
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.35%
Epoch: [39]  [   0/1251]  eta: 1:00:04  lr: 0.003955  min_lr: 0.003955  loss: 4.3509 (4.3509)  weight_decay: 0.0500 (0.0500)  time: 2.8810  data: 2.5342  max mem: 21847
Epoch: [39]  [ 200/1251]  eta: 0:05:01  lr: 0.003954  min_lr: 0.003954  loss: 3.4970 (3.5698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5615 (0.6634)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [39]  [ 400/1251]  eta: 0:03:59  lr: 0.003953  min_lr: 0.003953  loss: 3.4202 (3.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5855 (0.7002)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [39]  [ 600/1251]  eta: 0:03:01  lr: 0.003952  min_lr: 0.003952  loss: 2.8575 (3.5396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6166 (0.6891)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [39]  [ 800/1251]  eta: 0:02:05  lr: 0.003952  min_lr: 0.003952  loss: 3.4509 (3.5418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5090 (0.6734)  time: 0.2829  data: 0.0004  max mem: 21847
Epoch: [39]  [1000/1251]  eta: 0:01:09  lr: 0.003951  min_lr: 0.003951  loss: 3.3793 (3.5272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6313 (0.6835)  time: 0.2876  data: 0.0004  max mem: 21847
Epoch: [39]  [1200/1251]  eta: 0:00:14  lr: 0.003950  min_lr: 0.003950  loss: 3.4379 (3.5351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7866 (0.6921)  time: 0.2820  data: 0.0005  max mem: 21847
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 3.1110 (3.5318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6540 (0.6925)  time: 0.2284  data: 0.0006  max mem: 21847
Epoch: [39] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 3.1110 (3.5366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6540 (0.6925)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7678 (0.7678)  acc1: 84.8000 (84.8000)  acc5: 96.4000 (96.4000)  time: 5.7495  data: 5.5888  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9734 (0.9741)  acc1: 78.0000 (77.3455)  acc5: 95.2000 (94.8364)  time: 0.7270  data: 0.5922  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1929 (1.1845)  acc1: 71.2000 (73.2381)  acc5: 91.6000 (92.0381)  time: 0.1970  data: 0.0671  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3044 (1.1871)  acc1: 71.6000 (73.2640)  acc5: 89.6000 (92.0960)  time: 0.2067  data: 0.0776  max mem: 21847
Test: Total time: 0:00:10 (0.4201 s / it)
* Acc@1 73.784 Acc@5 92.272 loss 1.182
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 73.78%
Epoch: [40]  [   0/1251]  eta: 0:59:45  lr: 0.003950  min_lr: 0.003950  loss: 3.4225 (3.4225)  weight_decay: 0.0500 (0.0500)  time: 2.8663  data: 2.5371  max mem: 21847
Epoch: [40]  [ 200/1251]  eta: 0:05:03  lr: 0.003949  min_lr: 0.003949  loss: 3.3676 (3.4777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.7335)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [40]  [ 400/1251]  eta: 0:04:00  lr: 0.003948  min_lr: 0.003948  loss: 2.9994 (3.4880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.7307)  time: 0.2744  data: 0.0003  max mem: 21847
Epoch: [40]  [ 600/1251]  eta: 0:03:02  lr: 0.003947  min_lr: 0.003947  loss: 3.0813 (3.4614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5947 (0.6955)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [40]  [ 800/1251]  eta: 0:02:05  lr: 0.003947  min_lr: 0.003947  loss: 3.6406 (3.4800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5931 (0.6916)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [40]  [1000/1251]  eta: 0:01:09  lr: 0.003946  min_lr: 0.003946  loss: 3.7965 (3.4842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.6951)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [40]  [1200/1251]  eta: 0:00:14  lr: 0.003945  min_lr: 0.003945  loss: 3.5711 (3.4934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (0.6930)  time: 0.2713  data: 0.0005  max mem: 21847
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 3.6400 (3.4946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5432 (0.6860)  time: 0.2359  data: 0.0006  max mem: 21847
Epoch: [40] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 3.6400 (3.5227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5432 (0.6860)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8133 (0.8133)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.3663  data: 5.2009  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0077 (1.0539)  acc1: 78.4000 (77.5273)  acc5: 95.6000 (95.1636)  time: 0.7341  data: 0.5992  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2959 (1.2446)  acc1: 72.0000 (73.6952)  acc5: 91.2000 (92.3619)  time: 0.2181  data: 0.0885  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3853 (1.2453)  acc1: 70.4000 (73.4080)  acc5: 90.4000 (92.2560)  time: 0.2175  data: 0.0884  max mem: 21847
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 73.608 Acc@5 92.150 loss 1.231
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.78%
Epoch: [41]  [   0/1251]  eta: 0:59:28  lr: 0.003945  min_lr: 0.003945  loss: 3.0087 (3.0087)  weight_decay: 0.0500 (0.0500)  time: 2.8523  data: 1.7086  max mem: 21847
Epoch: [41]  [ 200/1251]  eta: 0:05:02  lr: 0.003944  min_lr: 0.003944  loss: 3.2859 (3.5755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6557 (0.7201)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [41]  [ 400/1251]  eta: 0:03:58  lr: 0.003943  min_lr: 0.003943  loss: 3.7343 (3.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5902 (0.7359)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [41]  [ 600/1251]  eta: 0:03:01  lr: 0.003942  min_lr: 0.003942  loss: 2.8841 (3.5374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6563 (0.7444)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [41]  [ 800/1251]  eta: 0:02:05  lr: 0.003941  min_lr: 0.003941  loss: 3.1273 (3.5581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5650 (0.7263)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [41]  [1000/1251]  eta: 0:01:09  lr: 0.003940  min_lr: 0.003940  loss: 3.9791 (3.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4915 (0.7159)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [41]  [1200/1251]  eta: 0:00:14  lr: 0.003940  min_lr: 0.003940  loss: 3.1214 (3.5542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (0.7226)  time: 0.2708  data: 0.0005  max mem: 21847
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 3.8186 (3.5533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5881 (0.7160)  time: 0.2334  data: 0.0006  max mem: 21847
Epoch: [41] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 3.8186 (3.5289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5881 (0.7160)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.8477 (0.8477)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.7662  data: 5.6003  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9839 (1.0302)  acc1: 78.8000 (78.3636)  acc5: 96.0000 (95.4182)  time: 0.7580  data: 0.6251  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2586 (1.2454)  acc1: 72.4000 (73.7524)  acc5: 92.4000 (92.5524)  time: 0.2220  data: 0.0935  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3680 (1.2559)  acc1: 72.0000 (73.3760)  acc5: 91.2000 (92.4160)  time: 0.2215  data: 0.0934  max mem: 21847
Test: Total time: 0:00:10 (0.4323 s / it)
* Acc@1 73.892 Acc@5 92.302 loss 1.243
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.89%
Epoch: [42]  [   0/1251]  eta: 1:04:57  lr: 0.003939  min_lr: 0.003939  loss: 4.1524 (4.1524)  weight_decay: 0.0500 (0.0500)  time: 3.1158  data: 2.8078  max mem: 21847
Epoch: [42]  [ 200/1251]  eta: 0:05:03  lr: 0.003939  min_lr: 0.003939  loss: 2.9361 (3.5235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.8059)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [42]  [ 400/1251]  eta: 0:03:59  lr: 0.003938  min_lr: 0.003938  loss: 3.9474 (3.5489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8271 (0.7658)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [42]  [ 600/1251]  eta: 0:03:02  lr: 0.003937  min_lr: 0.003937  loss: 3.2617 (3.5536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6140 (0.7807)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [42]  [ 800/1251]  eta: 0:02:05  lr: 0.003936  min_lr: 0.003936  loss: 3.5900 (3.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6280 (0.7709)  time: 0.2738  data: 0.0003  max mem: 21847
Epoch: [42]  [1000/1251]  eta: 0:01:09  lr: 0.003935  min_lr: 0.003935  loss: 3.5677 (3.5545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6589 (0.7597)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [42]  [1200/1251]  eta: 0:00:14  lr: 0.003934  min_lr: 0.003934  loss: 3.7488 (3.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5782 (0.7502)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 3.9935 (3.5486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6623 (0.7504)  time: 0.2282  data: 0.0008  max mem: 21847
Epoch: [42] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 3.9935 (3.5359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6623 (0.7504)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.8371 (0.8371)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 3.3531  data: 3.2001  max mem: 21847
Test:  [10/25]  eta: 0:00:08  loss: 0.9687 (1.0343)  acc1: 79.6000 (78.7636)  acc5: 95.6000 (95.0546)  time: 0.5979  data: 0.4644  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2559 (1.2277)  acc1: 70.4000 (74.0000)  acc5: 92.0000 (92.2095)  time: 0.2906  data: 0.1611  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3544 (1.2301)  acc1: 70.0000 (73.8400)  acc5: 90.0000 (92.0960)  time: 0.2132  data: 0.0847  max mem: 21847
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 74.088 Acc@5 92.450 loss 1.215
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.09%
Epoch: [43]  [   0/1251]  eta: 1:06:59  lr: 0.003934  min_lr: 0.003934  loss: 3.8910 (3.8910)  weight_decay: 0.0500 (0.0500)  time: 3.2131  data: 2.8804  max mem: 21847
Epoch: [43]  [ 200/1251]  eta: 0:05:04  lr: 0.003933  min_lr: 0.003933  loss: 3.6562 (3.4820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6230 (0.7587)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [43]  [ 400/1251]  eta: 0:03:59  lr: 0.003932  min_lr: 0.003932  loss: 3.4482 (3.4671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.7373)  time: 0.2728  data: 0.0003  max mem: 21847
Epoch: [43]  [ 600/1251]  eta: 0:03:02  lr: 0.003931  min_lr: 0.003931  loss: 3.1423 (3.4792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7118 (0.7456)  time: 0.2713  data: 0.0005  max mem: 21847
Epoch: [43]  [ 800/1251]  eta: 0:02:05  lr: 0.003930  min_lr: 0.003930  loss: 3.1522 (3.5040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6815 (0.7338)  time: 0.2734  data: 0.0005  max mem: 21847
Epoch: [43]  [1000/1251]  eta: 0:01:09  lr: 0.003929  min_lr: 0.003929  loss: 3.2909 (3.5015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6913 (0.7222)  time: 0.2728  data: 0.0003  max mem: 21847
Epoch: [43]  [1200/1251]  eta: 0:00:14  lr: 0.003928  min_lr: 0.003928  loss: 3.0888 (3.4861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6187 (0.7288)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 3.4497 (3.4904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7208 (0.7345)  time: 0.2282  data: 0.0004  max mem: 21847
Epoch: [43] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 3.4497 (3.4946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7208 (0.7345)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.8889 (0.8889)  acc1: 82.4000 (82.4000)  acc5: 96.0000 (96.0000)  time: 3.2521  data: 3.0954  max mem: 21847
Test:  [10/25]  eta: 0:00:08  loss: 1.0266 (1.0455)  acc1: 78.4000 (78.8364)  acc5: 95.2000 (95.3818)  time: 0.5978  data: 0.4603  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2939 (1.2504)  acc1: 72.8000 (74.2286)  acc5: 92.4000 (92.4952)  time: 0.2755  data: 0.1440  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3366 (1.2579)  acc1: 70.0000 (73.8240)  acc5: 91.2000 (92.4160)  time: 0.1992  data: 0.0698  max mem: 21847
Test: Total time: 0:00:09 (0.3958 s / it)
* Acc@1 74.002 Acc@5 92.436 loss 1.250
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.09%
Epoch: [44]  [   0/1251]  eta: 1:07:52  lr: 0.003928  min_lr: 0.003928  loss: 4.0925 (4.0925)  weight_decay: 0.0500 (0.0500)  time: 3.2557  data: 2.8168  max mem: 21847
Epoch: [44]  [ 200/1251]  eta: 0:05:06  lr: 0.003927  min_lr: 0.003927  loss: 3.0214 (3.4880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7238 (0.8276)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [44]  [ 400/1251]  eta: 0:04:01  lr: 0.003926  min_lr: 0.003926  loss: 3.6836 (3.5311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6074 (nan)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [44]  [ 600/1251]  eta: 0:03:02  lr: 0.003925  min_lr: 0.003925  loss: 3.3700 (3.4978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (nan)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [44]  [ 800/1251]  eta: 0:02:06  lr: 0.003924  min_lr: 0.003924  loss: 3.6764 (3.4987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6450 (nan)  time: 0.2803  data: 0.0004  max mem: 21847
Epoch: [44]  [1000/1251]  eta: 0:01:09  lr: 0.003923  min_lr: 0.003923  loss: 3.6993 (3.4943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6877 (nan)  time: 0.2777  data: 0.0004  max mem: 21847
Epoch: [44]  [1200/1251]  eta: 0:00:14  lr: 0.003922  min_lr: 0.003922  loss: 3.7858 (3.5021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7199 (nan)  time: 0.2745  data: 0.0005  max mem: 21847
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 3.9926 (3.5087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (nan)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [44] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 3.9926 (3.4971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7833 (0.7833)  acc1: 88.4000 (88.4000)  acc5: 96.0000 (96.0000)  time: 5.5019  data: 5.3281  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0974 (1.0224)  acc1: 77.2000 (79.1636)  acc5: 95.6000 (95.1636)  time: 0.7077  data: 0.5735  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2358 (1.2324)  acc1: 72.4000 (74.6667)  acc5: 92.0000 (92.4000)  time: 0.1999  data: 0.0711  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3892 (1.2403)  acc1: 70.8000 (74.1760)  acc5: 91.2000 (92.3680)  time: 0.1998  data: 0.0710  max mem: 21847
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 74.278 Acc@5 92.480 loss 1.231
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.28%
Epoch: [45]  [   0/1251]  eta: 1:06:06  lr: 0.003922  min_lr: 0.003922  loss: 4.5014 (4.5014)  weight_decay: 0.0500 (0.0500)  time: 3.1705  data: 2.8606  max mem: 21847
Epoch: [45]  [ 200/1251]  eta: 0:05:03  lr: 0.003921  min_lr: 0.003921  loss: 3.4975 (3.4376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6405 (0.7314)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [45]  [ 400/1251]  eta: 0:04:00  lr: 0.003920  min_lr: 0.003920  loss: 2.9684 (3.4230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8433 (0.7638)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [45]  [ 600/1251]  eta: 0:03:02  lr: 0.003919  min_lr: 0.003919  loss: 3.9639 (3.4429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6239 (0.7885)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [45]  [ 800/1251]  eta: 0:02:05  lr: 0.003918  min_lr: 0.003918  loss: 3.2741 (3.4437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5636 (0.7587)  time: 0.2721  data: 0.0005  max mem: 21847
Epoch: [45]  [1000/1251]  eta: 0:01:09  lr: 0.003917  min_lr: 0.003917  loss: 3.3503 (3.4546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6193 (0.7499)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [45]  [1200/1251]  eta: 0:00:14  lr: 0.003916  min_lr: 0.003916  loss: 2.8916 (3.4620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6173 (0.7617)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.0468 (3.4608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5915 (0.7583)  time: 0.2282  data: 0.0008  max mem: 21847
Epoch: [45] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.0468 (3.4894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5915 (0.7583)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8551 (0.8551)  acc1: 82.4000 (82.4000)  acc5: 96.8000 (96.8000)  time: 5.3348  data: 5.1758  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9795 (0.9955)  acc1: 78.4000 (78.8727)  acc5: 95.6000 (95.0545)  time: 0.6594  data: 0.5259  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2220 (1.2067)  acc1: 73.6000 (74.4571)  acc5: 91.2000 (92.1714)  time: 0.1899  data: 0.0597  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3182 (1.2108)  acc1: 72.0000 (74.1280)  acc5: 90.4000 (92.2560)  time: 0.2028  data: 0.0734  max mem: 21847
Test: Total time: 0:00:09 (0.3998 s / it)
* Acc@1 74.342 Acc@5 92.528 loss 1.206
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.34%
Epoch: [46]  [   0/1251]  eta: 1:05:18  lr: 0.003916  min_lr: 0.003916  loss: 2.2863 (2.2863)  weight_decay: 0.0500 (0.0500)  time: 3.1320  data: 2.7829  max mem: 21847
Epoch: [46]  [ 200/1251]  eta: 0:05:02  lr: 0.003914  min_lr: 0.003914  loss: 3.4639 (3.5213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6726 (0.8035)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [46]  [ 400/1251]  eta: 0:03:59  lr: 0.003913  min_lr: 0.003913  loss: 3.5496 (3.4814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5430 (0.7623)  time: 0.2839  data: 0.0004  max mem: 21847
Epoch: [46]  [ 600/1251]  eta: 0:03:01  lr: 0.003912  min_lr: 0.003912  loss: 3.9317 (3.4847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9516 (0.7953)  time: 0.2814  data: 0.0005  max mem: 21847
Epoch: [46]  [ 800/1251]  eta: 0:02:05  lr: 0.003911  min_lr: 0.003911  loss: 3.7310 (3.4730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5822 (0.7961)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [46]  [1000/1251]  eta: 0:01:09  lr: 0.003910  min_lr: 0.003910  loss: 3.8073 (3.4993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.8074)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [46]  [1200/1251]  eta: 0:00:14  lr: 0.003909  min_lr: 0.003909  loss: 3.3171 (3.5042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.8000)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 3.3873 (3.5038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8029)  time: 0.2286  data: 0.0009  max mem: 21847
Epoch: [46] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 3.3873 (3.4912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8029)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8000 (0.8000)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 5.7184  data: 5.5730  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9941 (0.9837)  acc1: 77.6000 (78.6182)  acc5: 96.0000 (95.2000)  time: 0.7578  data: 0.6219  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2023 (1.2009)  acc1: 72.0000 (74.2476)  acc5: 92.0000 (92.3810)  time: 0.2073  data: 0.0761  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2615 (1.2049)  acc1: 70.8000 (74.0000)  acc5: 90.4000 (92.4160)  time: 0.2051  data: 0.0760  max mem: 21847
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 74.466 Acc@5 92.472 loss 1.190
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.47%
Epoch: [47]  [   0/1251]  eta: 1:00:17  lr: 0.003909  min_lr: 0.003909  loss: 3.9725 (3.9725)  weight_decay: 0.0500 (0.0500)  time: 2.8916  data: 2.5032  max mem: 21847
Epoch: [47]  [ 200/1251]  eta: 0:05:02  lr: 0.003908  min_lr: 0.003908  loss: 3.3552 (3.4999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6151 (0.7163)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [47]  [ 400/1251]  eta: 0:03:58  lr: 0.003907  min_lr: 0.003907  loss: 3.5757 (3.4941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8655 (0.8177)  time: 0.2752  data: 0.0004  max mem: 21847
Epoch: [47]  [ 600/1251]  eta: 0:03:01  lr: 0.003906  min_lr: 0.003906  loss: 3.4963 (3.5067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7437 (0.7974)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [47]  [ 800/1251]  eta: 0:02:04  lr: 0.003905  min_lr: 0.003905  loss: 3.4709 (3.5167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7251 (0.7888)  time: 0.2710  data: 0.0005  max mem: 21847
Epoch: [47]  [1000/1251]  eta: 0:01:09  lr: 0.003904  min_lr: 0.003904  loss: 3.6816 (3.5070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.7942)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [47]  [1200/1251]  eta: 0:00:14  lr: 0.003902  min_lr: 0.003902  loss: 3.1112 (3.4867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5531 (0.7685)  time: 0.2713  data: 0.0005  max mem: 21847
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 3.3327 (3.4846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6795 (0.7815)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [47] Total time: 0:05:44 (0.2756 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 3.3327 (3.4748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6795 (0.7815)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8527 (0.8527)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.5844  data: 5.4352  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0320 (1.0087)  acc1: 78.8000 (78.7273)  acc5: 96.0000 (95.7091)  time: 0.7163  data: 0.5700  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1997 (1.2171)  acc1: 73.2000 (74.3238)  acc5: 92.0000 (92.5905)  time: 0.1972  data: 0.0605  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3373 (1.2276)  acc1: 71.2000 (73.8720)  acc5: 92.0000 (92.6400)  time: 0.1963  data: 0.0636  max mem: 21847
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 74.534 Acc@5 92.672 loss 1.204
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.53%
Epoch: [48]  [   0/1251]  eta: 1:06:52  lr: 0.003902  min_lr: 0.003902  loss: 3.4355 (3.4355)  weight_decay: 0.0500 (0.0500)  time: 3.2078  data: 2.8954  max mem: 21847
Epoch: [48]  [ 200/1251]  eta: 0:05:03  lr: 0.003901  min_lr: 0.003901  loss: 3.3751 (3.5064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6855 (0.8233)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [48]  [ 400/1251]  eta: 0:03:59  lr: 0.003900  min_lr: 0.003900  loss: 3.5533 (3.4831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.7993)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [48]  [ 600/1251]  eta: 0:03:01  lr: 0.003899  min_lr: 0.003899  loss: 3.8407 (3.4930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6226 (0.8018)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [48]  [ 800/1251]  eta: 0:02:05  lr: 0.003898  min_lr: 0.003898  loss: 3.0848 (3.4963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8522 (0.7891)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [48]  [1000/1251]  eta: 0:01:09  lr: 0.003897  min_lr: 0.003897  loss: 3.5529 (3.4961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8593 (0.7949)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [48]  [1200/1251]  eta: 0:00:14  lr: 0.003895  min_lr: 0.003895  loss: 2.8540 (3.4878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6651 (0.7989)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 3.3071 (3.4872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5307 (0.7923)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [48] Total time: 0:05:45 (0.2759 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 3.3071 (3.4856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5307 (0.7923)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7250 (0.7250)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.6943  data: 5.5405  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9467 (0.9504)  acc1: 79.2000 (78.2182)  acc5: 96.0000 (95.4909)  time: 0.7307  data: 0.5954  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1955 (1.1547)  acc1: 72.8000 (74.4381)  acc5: 92.4000 (92.7048)  time: 0.1960  data: 0.0656  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2769 (1.1604)  acc1: 72.8000 (74.3200)  acc5: 91.2000 (92.6880)  time: 0.1947  data: 0.0655  max mem: 21847
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 74.724 Acc@5 92.822 loss 1.144
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.72%
Epoch: [49]  [   0/1251]  eta: 1:00:56  lr: 0.003895  min_lr: 0.003895  loss: 3.8872 (3.8872)  weight_decay: 0.0500 (0.0500)  time: 2.9226  data: 2.5384  max mem: 21847
Epoch: [49]  [ 200/1251]  eta: 0:05:02  lr: 0.003894  min_lr: 0.003894  loss: 3.5757 (3.4774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0234 (0.9553)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [49]  [ 400/1251]  eta: 0:03:59  lr: 0.003893  min_lr: 0.003893  loss: 3.0931 (3.4790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.8795)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [49]  [ 600/1251]  eta: 0:03:01  lr: 0.003892  min_lr: 0.003892  loss: 3.8091 (3.4452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8490 (0.9132)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [49]  [ 800/1251]  eta: 0:02:05  lr: 0.003890  min_lr: 0.003890  loss: 3.0627 (3.4323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6455 (0.8491)  time: 0.2752  data: 0.0004  max mem: 21847
Epoch: [49]  [1000/1251]  eta: 0:01:09  lr: 0.003889  min_lr: 0.003889  loss: 3.7583 (3.4318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7361 (0.8293)  time: 0.2844  data: 0.0005  max mem: 21847
Epoch: [49]  [1200/1251]  eta: 0:00:14  lr: 0.003888  min_lr: 0.003888  loss: 2.9903 (3.4461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7482 (0.8136)  time: 0.2822  data: 0.0005  max mem: 21847
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 3.5559 (3.4511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (0.8290)  time: 0.2281  data: 0.0005  max mem: 21847
Epoch: [49] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 3.5559 (3.4556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (0.8290)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8497 (0.8497)  acc1: 82.4000 (82.4000)  acc5: 97.2000 (97.2000)  time: 5.3275  data: 5.1788  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0141 (1.0252)  acc1: 80.4000 (78.8727)  acc5: 96.4000 (95.9273)  time: 0.6765  data: 0.5427  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2807 (1.2284)  acc1: 70.8000 (74.3238)  acc5: 92.4000 (92.8381)  time: 0.2036  data: 0.0737  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2952 (1.2291)  acc1: 72.8000 (74.3200)  acc5: 91.2000 (92.8000)  time: 0.2020  data: 0.0736  max mem: 21847
Test: Total time: 0:00:10 (0.4001 s / it)
* Acc@1 74.640 Acc@5 92.892 loss 1.218
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.72%
Epoch: [50]  [   0/1251]  eta: 1:08:39  lr: 0.003888  min_lr: 0.003888  loss: 3.1888 (3.1888)  weight_decay: 0.0500 (0.0500)  time: 3.2929  data: 2.9440  max mem: 21847
Epoch: [50]  [ 200/1251]  eta: 0:05:04  lr: 0.003887  min_lr: 0.003887  loss: 3.6300 (3.4942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8740 (0.8369)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [50]  [ 400/1251]  eta: 0:04:00  lr: 0.003885  min_lr: 0.003885  loss: 2.6576 (3.4817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7074 (0.8064)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [50]  [ 600/1251]  eta: 0:03:02  lr: 0.003884  min_lr: 0.003884  loss: 3.6767 (3.4682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6071 (0.8476)  time: 0.2801  data: 0.0004  max mem: 21847
Epoch: [50]  [ 800/1251]  eta: 0:02:05  lr: 0.003883  min_lr: 0.003883  loss: 3.1806 (3.4586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6023 (0.8123)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [50]  [1000/1251]  eta: 0:01:09  lr: 0.003882  min_lr: 0.003882  loss: 3.3125 (3.4440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6179 (0.7977)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [50]  [1200/1251]  eta: 0:00:14  lr: 0.003881  min_lr: 0.003881  loss: 3.4430 (3.4550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9011 (0.8055)  time: 0.2740  data: 0.0005  max mem: 21847
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 3.2830 (3.4510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6723 (0.8048)  time: 0.2285  data: 0.0009  max mem: 21847
Epoch: [50] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 3.2830 (3.4532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6723 (0.8048)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7555 (0.7555)  acc1: 85.2000 (85.2000)  acc5: 96.4000 (96.4000)  time: 5.8786  data: 5.7311  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9071 (0.9310)  acc1: 78.0000 (78.9455)  acc5: 96.4000 (96.1091)  time: 0.7026  data: 0.5722  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1323 (1.1356)  acc1: 72.4000 (74.8762)  acc5: 92.0000 (93.2571)  time: 0.1654  data: 0.0374  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3022 (1.1470)  acc1: 72.4000 (74.3360)  acc5: 92.0000 (93.0720)  time: 0.1647  data: 0.0373  max mem: 21847
Test: Total time: 0:00:09 (0.3910 s / it)
* Acc@1 74.910 Acc@5 93.022 loss 1.140
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.91%
Epoch: [51]  [   0/1251]  eta: 1:02:43  lr: 0.003880  min_lr: 0.003880  loss: 3.9960 (3.9960)  weight_decay: 0.0500 (0.0500)  time: 3.0087  data: 2.6228  max mem: 21847
Epoch: [51]  [ 200/1251]  eta: 0:05:03  lr: 0.003879  min_lr: 0.003879  loss: 3.9324 (3.4214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5760 (0.7532)  time: 0.2839  data: 0.0004  max mem: 21847
Epoch: [51]  [ 400/1251]  eta: 0:03:59  lr: 0.003878  min_lr: 0.003878  loss: 3.6837 (3.4557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.7635)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [51]  [ 600/1251]  eta: 0:03:01  lr: 0.003877  min_lr: 0.003877  loss: 3.3932 (3.4775)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1992 (0.8150)  time: 0.2862  data: 0.0005  max mem: 21847
Epoch: [51]  [ 800/1251]  eta: 0:02:05  lr: 0.003875  min_lr: 0.003875  loss: 3.0262 (3.4573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8534 (0.8025)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [51]  [1000/1251]  eta: 0:01:09  lr: 0.003874  min_lr: 0.003874  loss: 3.9542 (3.4559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7315 (0.8070)  time: 0.2741  data: 0.0005  max mem: 21847
Epoch: [51]  [1200/1251]  eta: 0:00:14  lr: 0.003873  min_lr: 0.003873  loss: 3.8199 (3.4452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7773 (0.7968)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 3.1439 (3.4450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8137 (0.7995)  time: 0.2294  data: 0.0006  max mem: 21847
Epoch: [51] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 3.1439 (3.4552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8137 (0.7995)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7735 (0.7735)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.6882  data: 5.5377  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8676 (0.9483)  acc1: 77.6000 (78.4727)  acc5: 96.0000 (95.6000)  time: 0.7003  data: 0.5655  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1547 (1.1437)  acc1: 73.6000 (74.7810)  acc5: 92.8000 (93.1238)  time: 0.1771  data: 0.0468  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2565 (1.1485)  acc1: 72.0000 (74.5760)  acc5: 91.6000 (93.1520)  time: 0.1919  data: 0.0631  max mem: 21847
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 74.986 Acc@5 92.912 loss 1.141
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 74.99%
Epoch: [52]  [   0/1251]  eta: 0:53:24  lr: 0.003873  min_lr: 0.003873  loss: 3.3666 (3.3666)  weight_decay: 0.0500 (0.0500)  time: 2.5617  data: 2.1809  max mem: 21847
Epoch: [52]  [ 200/1251]  eta: 0:05:03  lr: 0.003871  min_lr: 0.003871  loss: 3.1725 (3.4116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7010 (nan)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [52]  [ 400/1251]  eta: 0:03:58  lr: 0.003870  min_lr: 0.003870  loss: 3.6890 (3.4325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (nan)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [52]  [ 600/1251]  eta: 0:03:01  lr: 0.003869  min_lr: 0.003869  loss: 3.4218 (3.4451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8081 (nan)  time: 0.2737  data: 0.0003  max mem: 21847
Epoch: [52]  [ 800/1251]  eta: 0:02:05  lr: 0.003867  min_lr: 0.003867  loss: 3.3221 (3.4351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7191 (nan)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [52]  [1000/1251]  eta: 0:01:09  lr: 0.003866  min_lr: 0.003866  loss: 3.1132 (3.4283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (nan)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [52]  [1200/1251]  eta: 0:00:14  lr: 0.003865  min_lr: 0.003865  loss: 3.2537 (3.4381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5937 (nan)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 3.6570 (3.4462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (nan)  time: 0.2368  data: 0.0006  max mem: 21847
Epoch: [52] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 3.6570 (3.4375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8049 (0.8049)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.6609  data: 5.5088  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9981 (1.0280)  acc1: 78.8000 (78.9091)  acc5: 96.4000 (95.8909)  time: 0.7648  data: 0.6307  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2410 (1.2347)  acc1: 72.8000 (74.6667)  acc5: 92.8000 (93.0095)  time: 0.2180  data: 0.0882  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3705 (1.2439)  acc1: 72.0000 (74.4640)  acc5: 91.6000 (93.0080)  time: 0.2163  data: 0.0881  max mem: 21847
Test: Total time: 0:00:10 (0.4245 s / it)
* Acc@1 74.862 Acc@5 92.798 loss 1.246
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.99%
Epoch: [53]  [   0/1251]  eta: 1:06:03  lr: 0.003865  min_lr: 0.003865  loss: 3.4393 (3.4393)  weight_decay: 0.0500 (0.0500)  time: 3.1680  data: 2.7527  max mem: 21847
Epoch: [53]  [ 200/1251]  eta: 0:05:02  lr: 0.003863  min_lr: 0.003863  loss: 3.3221 (3.4758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6670 (0.8112)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [53]  [ 400/1251]  eta: 0:03:59  lr: 0.003862  min_lr: 0.003862  loss: 3.0221 (3.4261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (0.8743)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [53]  [ 600/1251]  eta: 0:03:02  lr: 0.003861  min_lr: 0.003861  loss: 2.7731 (3.4104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6966 (0.8734)  time: 0.2854  data: 0.0005  max mem: 21847
Epoch: [53]  [ 800/1251]  eta: 0:02:05  lr: 0.003859  min_lr: 0.003859  loss: 3.6572 (3.4175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8053 (0.8696)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [53]  [1000/1251]  eta: 0:01:09  lr: 0.003858  min_lr: 0.003858  loss: 3.8304 (3.4228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6701 (0.8731)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [53]  [1200/1251]  eta: 0:00:14  lr: 0.003857  min_lr: 0.003857  loss: 3.7659 (3.4320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6688 (0.8566)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 3.6267 (3.4365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (0.8706)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [53] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 3.6267 (3.4354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (0.8706)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.8034 (0.8034)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 4.3995  data: 4.2429  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9382 (0.9936)  acc1: 80.8000 (79.5273)  acc5: 96.4000 (95.8909)  time: 0.6662  data: 0.5329  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2051 (1.2084)  acc1: 71.6000 (74.7238)  acc5: 92.4000 (93.1048)  time: 0.2307  data: 0.1016  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3151 (1.2153)  acc1: 71.6000 (74.6400)  acc5: 90.8000 (92.9760)  time: 0.1894  data: 0.0619  max mem: 21847
Test: Total time: 0:00:09 (0.3858 s / it)
* Acc@1 74.950 Acc@5 92.938 loss 1.215
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 74.99%
Epoch: [54]  [   0/1251]  eta: 1:04:15  lr: 0.003856  min_lr: 0.003856  loss: 2.4898 (2.4898)  weight_decay: 0.0500 (0.0500)  time: 3.0823  data: 2.6097  max mem: 21847
Epoch: [54]  [ 200/1251]  eta: 0:05:07  lr: 0.003855  min_lr: 0.003855  loss: 3.4915 (3.3616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.7695)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [54]  [ 400/1251]  eta: 0:04:01  lr: 0.003854  min_lr: 0.003854  loss: 3.4193 (3.3979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7278 (0.7490)  time: 0.2731  data: 0.0003  max mem: 21847
Epoch: [54]  [ 600/1251]  eta: 0:03:02  lr: 0.003852  min_lr: 0.003852  loss: 3.6908 (3.4136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8415 (0.8294)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [54]  [ 800/1251]  eta: 0:02:05  lr: 0.003851  min_lr: 0.003851  loss: 3.2300 (3.4189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.8619)  time: 0.2775  data: 0.0004  max mem: 21847
Epoch: [54]  [1000/1251]  eta: 0:01:09  lr: 0.003849  min_lr: 0.003849  loss: 2.8768 (3.3965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7498 (0.8399)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [54]  [1200/1251]  eta: 0:00:14  lr: 0.003848  min_lr: 0.003848  loss: 3.8427 (3.4012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8800 (0.8304)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 3.7976 (3.4082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9681 (0.8400)  time: 0.2282  data: 0.0010  max mem: 21847
Epoch: [54] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 3.7976 (3.4246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9681 (0.8400)
Test:  [ 0/25]  eta: 0:01:41  loss: 0.7726 (0.7726)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 4.0572  data: 3.8799  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9161 (0.9928)  acc1: 77.6000 (78.7636)  acc5: 96.8000 (96.0000)  time: 0.6240  data: 0.4889  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2748 (1.2052)  acc1: 70.0000 (73.8476)  acc5: 91.2000 (93.0667)  time: 0.2445  data: 0.1153  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3560 (1.2140)  acc1: 70.4000 (73.5680)  acc5: 90.8000 (93.0240)  time: 0.2129  data: 0.0826  max mem: 21847
Test: Total time: 0:00:10 (0.4018 s / it)
* Acc@1 74.784 Acc@5 92.880 loss 1.205
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.99%
Epoch: [55]  [   0/1251]  eta: 1:09:15  lr: 0.003848  min_lr: 0.003848  loss: 3.3324 (3.3324)  weight_decay: 0.0500 (0.0500)  time: 3.3220  data: 2.9229  max mem: 21847
Epoch: [55]  [ 200/1251]  eta: 0:05:06  lr: 0.003846  min_lr: 0.003846  loss: 3.7154 (3.3990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.8432)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [55]  [ 400/1251]  eta: 0:04:01  lr: 0.003845  min_lr: 0.003845  loss: 3.8618 (3.3950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9347 (0.8619)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [55]  [ 600/1251]  eta: 0:03:02  lr: 0.003844  min_lr: 0.003844  loss: 3.5492 (3.4027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8043 (0.8292)  time: 0.2824  data: 0.0005  max mem: 21847
Epoch: [55]  [ 800/1251]  eta: 0:02:06  lr: 0.003842  min_lr: 0.003842  loss: 3.6943 (3.4020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.8500)  time: 0.2826  data: 0.0005  max mem: 21847
Epoch: [55]  [1000/1251]  eta: 0:01:09  lr: 0.003841  min_lr: 0.003841  loss: 3.8416 (3.4029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6829 (0.8529)  time: 0.2799  data: 0.0004  max mem: 21847
Epoch: [55]  [1200/1251]  eta: 0:00:14  lr: 0.003839  min_lr: 0.003839  loss: 3.8284 (3.4089)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0200 (0.8743)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 3.6410 (3.4103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9544 (0.8790)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [55] Total time: 0:05:48 (0.2784 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 3.6410 (3.4391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9544 (0.8790)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8153 (0.8153)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 5.8259  data: 5.6648  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0182 (1.0356)  acc1: 77.6000 (79.1273)  acc5: 95.6000 (95.7818)  time: 0.7309  data: 0.5983  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2407 (1.2512)  acc1: 72.8000 (74.9524)  acc5: 92.8000 (92.7429)  time: 0.1865  data: 0.0573  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3765 (1.2601)  acc1: 71.6000 (74.7360)  acc5: 90.8000 (92.7520)  time: 0.1969  data: 0.0680  max mem: 21847
Test: Total time: 0:00:10 (0.4147 s / it)
* Acc@1 75.078 Acc@5 92.896 loss 1.250
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.08%
Epoch: [56]  [   0/1251]  eta: 1:02:30  lr: 0.003839  min_lr: 0.003839  loss: 4.0236 (4.0236)  weight_decay: 0.0500 (0.0500)  time: 2.9982  data: 2.6599  max mem: 21847
Epoch: [56]  [ 200/1251]  eta: 0:05:03  lr: 0.003838  min_lr: 0.003838  loss: 3.8779 (3.4968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.7928)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [56]  [ 400/1251]  eta: 0:03:59  lr: 0.003836  min_lr: 0.003836  loss: 3.8122 (3.4850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8294 (0.7846)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [56]  [ 600/1251]  eta: 0:03:01  lr: 0.003835  min_lr: 0.003835  loss: 3.4763 (3.4615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (0.8424)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [56]  [ 800/1251]  eta: 0:02:05  lr: 0.003833  min_lr: 0.003833  loss: 3.0030 (3.4491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7707 (nan)  time: 0.2795  data: 0.0004  max mem: 21847
Epoch: [56]  [1000/1251]  eta: 0:01:09  lr: 0.003832  min_lr: 0.003832  loss: 3.7337 (3.4444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6082 (nan)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [56]  [1200/1251]  eta: 0:00:14  lr: 0.003831  min_lr: 0.003831  loss: 3.7812 (3.4442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (nan)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 3.7066 (3.4463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6856 (nan)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [56] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 3.7066 (3.4270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6856 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8723 (0.8723)  acc1: 82.0000 (82.0000)  acc5: 97.6000 (97.6000)  time: 5.7202  data: 5.5688  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9802 (0.9783)  acc1: 78.8000 (79.4182)  acc5: 96.4000 (95.4909)  time: 0.7403  data: 0.6038  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2147 (1.1657)  acc1: 74.0000 (75.1238)  acc5: 92.0000 (93.1048)  time: 0.2156  data: 0.0843  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2589 (1.1798)  acc1: 73.2000 (74.8320)  acc5: 91.6000 (92.9280)  time: 0.2133  data: 0.0842  max mem: 21847
Test: Total time: 0:00:10 (0.4249 s / it)
* Acc@1 75.334 Acc@5 93.086 loss 1.163
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.33%
Epoch: [57]  [   0/1251]  eta: 1:01:29  lr: 0.003830  min_lr: 0.003830  loss: 3.6996 (3.6996)  weight_decay: 0.0500 (0.0500)  time: 2.9489  data: 2.6054  max mem: 21847
Epoch: [57]  [ 200/1251]  eta: 0:05:04  lr: 0.003829  min_lr: 0.003829  loss: 3.5650 (3.4454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7491 (0.8857)  time: 0.2811  data: 0.0004  max mem: 21847
Epoch: [57]  [ 400/1251]  eta: 0:04:00  lr: 0.003827  min_lr: 0.003827  loss: 3.6755 (3.4097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8003 (0.8209)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [57]  [ 600/1251]  eta: 0:03:01  lr: 0.003826  min_lr: 0.003826  loss: 3.5737 (3.4261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9745 (0.8863)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [57]  [ 800/1251]  eta: 0:02:05  lr: 0.003824  min_lr: 0.003824  loss: 3.5320 (3.4205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6781 (0.8867)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [57]  [1000/1251]  eta: 0:01:09  lr: 0.003823  min_lr: 0.003823  loss: 3.6880 (3.4198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8772 (0.8777)  time: 0.2835  data: 0.0004  max mem: 21847
Epoch: [57]  [1200/1251]  eta: 0:00:14  lr: 0.003821  min_lr: 0.003821  loss: 3.1503 (3.4260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8029 (0.8840)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 3.5114 (3.4268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (0.8873)  time: 0.2278  data: 0.0010  max mem: 21847
Epoch: [57] Total time: 0:05:45 (0.2766 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 3.5114 (3.4244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (0.8873)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.7472 (0.7472)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 4.3766  data: 4.2010  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9747 (0.9421)  acc1: 79.2000 (79.2727)  acc5: 96.4000 (96.0364)  time: 0.6916  data: 0.5518  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1501 (1.1338)  acc1: 72.8000 (75.6381)  acc5: 92.4000 (93.3905)  time: 0.2446  data: 0.1098  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2174 (1.1423)  acc1: 72.8000 (75.5360)  acc5: 91.6000 (93.3440)  time: 0.2440  data: 0.1103  max mem: 21847
Test: Total time: 0:00:10 (0.4014 s / it)
* Acc@1 75.546 Acc@5 93.248 loss 1.140
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.55%
Epoch: [58]  [   0/1251]  eta: 1:07:07  lr: 0.003821  min_lr: 0.003821  loss: 2.9326 (2.9326)  weight_decay: 0.0500 (0.0500)  time: 3.2195  data: 2.9254  max mem: 21847
Epoch: [58]  [ 200/1251]  eta: 0:05:04  lr: 0.003820  min_lr: 0.003820  loss: 3.6775 (3.4287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8480 (0.9794)  time: 0.2782  data: 0.0003  max mem: 21847
Epoch: [58]  [ 400/1251]  eta: 0:04:00  lr: 0.003818  min_lr: 0.003818  loss: 3.7380 (3.4572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (0.8874)  time: 0.2838  data: 0.0004  max mem: 21847
Epoch: [58]  [ 600/1251]  eta: 0:03:02  lr: 0.003817  min_lr: 0.003817  loss: 3.8155 (3.4623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (0.8921)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [58]  [ 800/1251]  eta: 0:02:05  lr: 0.003815  min_lr: 0.003815  loss: 3.0218 (3.4562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7964 (0.8639)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [58]  [1000/1251]  eta: 0:01:09  lr: 0.003813  min_lr: 0.003813  loss: 3.7057 (3.4444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.8551)  time: 0.2702  data: 0.0004  max mem: 21847
Epoch: [58]  [1200/1251]  eta: 0:00:14  lr: 0.003812  min_lr: 0.003812  loss: 2.8922 (3.4230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8543 (0.8564)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 2.9804 (3.4150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.8612)  time: 0.2279  data: 0.0009  max mem: 21847
Epoch: [58] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 2.9804 (3.4156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.8612)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7863 (0.7863)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.3582  data: 5.1981  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9273 (0.9327)  acc1: 79.6000 (79.4182)  acc5: 96.0000 (95.8545)  time: 0.6936  data: 0.5602  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1129 (1.1332)  acc1: 72.0000 (74.9524)  acc5: 92.4000 (93.0286)  time: 0.1907  data: 0.0616  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2241 (1.1420)  acc1: 73.2000 (74.7680)  acc5: 90.8000 (92.9440)  time: 0.1899  data: 0.0615  max mem: 21847
Test: Total time: 0:00:09 (0.3914 s / it)
* Acc@1 75.526 Acc@5 93.126 loss 1.123
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.55%
Epoch: [59]  [   0/1251]  eta: 1:08:32  lr: 0.003812  min_lr: 0.003812  loss: 2.3555 (2.3555)  weight_decay: 0.0500 (0.0500)  time: 3.2871  data: 1.7041  max mem: 21847
Epoch: [59]  [ 200/1251]  eta: 0:05:06  lr: 0.003810  min_lr: 0.003810  loss: 3.5485 (3.4064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7526 (0.9523)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [59]  [ 400/1251]  eta: 0:04:00  lr: 0.003809  min_lr: 0.003809  loss: 3.4277 (3.3975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8742 (0.9512)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [59]  [ 600/1251]  eta: 0:03:02  lr: 0.003807  min_lr: 0.003807  loss: 3.4618 (3.4242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7757 (0.9187)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [59]  [ 800/1251]  eta: 0:02:05  lr: 0.003805  min_lr: 0.003805  loss: 3.2307 (3.4146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.8936)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [59]  [1000/1251]  eta: 0:01:09  lr: 0.003804  min_lr: 0.003804  loss: 2.7840 (3.4068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6300 (0.8687)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [59]  [1200/1251]  eta: 0:00:14  lr: 0.003802  min_lr: 0.003802  loss: 2.9918 (3.4130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7627 (0.8619)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 3.6265 (3.4167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.8628)  time: 0.2286  data: 0.0007  max mem: 21847
Epoch: [59] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 3.6265 (3.4137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.8628)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7358 (0.7358)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.3789  data: 5.2264  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9604 (0.9389)  acc1: 80.8000 (79.2000)  acc5: 95.6000 (95.7455)  time: 0.7468  data: 0.6127  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1316 (1.1373)  acc1: 73.2000 (75.0857)  acc5: 93.6000 (92.9714)  time: 0.2216  data: 0.0917  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2707 (1.1376)  acc1: 72.0000 (75.0080)  acc5: 91.6000 (93.0560)  time: 0.2208  data: 0.0916  max mem: 21847
Test: Total time: 0:00:10 (0.4164 s / it)
* Acc@1 75.594 Acc@5 93.350 loss 1.124
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.59%
Epoch: [60]  [   0/1251]  eta: 1:00:53  lr: 0.003802  min_lr: 0.003802  loss: 3.8539 (3.8539)  weight_decay: 0.0500 (0.0500)  time: 2.9202  data: 2.5398  max mem: 21847
Epoch: [60]  [ 200/1251]  eta: 0:05:05  lr: 0.003800  min_lr: 0.003800  loss: 3.5501 (3.4372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8763 (0.9177)  time: 0.2702  data: 0.0004  max mem: 21847
Epoch: [60]  [ 400/1251]  eta: 0:03:59  lr: 0.003799  min_lr: 0.003799  loss: 3.0491 (3.4204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8550 (0.9194)  time: 0.2809  data: 0.0004  max mem: 21847
Epoch: [60]  [ 600/1251]  eta: 0:03:01  lr: 0.003797  min_lr: 0.003797  loss: 3.1913 (3.3952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6087 (0.8695)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [60]  [ 800/1251]  eta: 0:02:05  lr: 0.003796  min_lr: 0.003796  loss: 3.3046 (3.3880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7713 (0.8977)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [60]  [1000/1251]  eta: 0:01:09  lr: 0.003794  min_lr: 0.003794  loss: 3.3318 (3.4094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8639 (0.8928)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [60]  [1200/1251]  eta: 0:00:14  lr: 0.003793  min_lr: 0.003793  loss: 3.2485 (3.4177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8474 (0.8961)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 3.3731 (3.4157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8537 (0.8974)  time: 0.2279  data: 0.0009  max mem: 21847
Epoch: [60] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 3.3731 (3.3945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8537 (0.8974)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.8659 (0.8659)  acc1: 82.8000 (82.8000)  acc5: 96.8000 (96.8000)  time: 5.8504  data: 5.7000  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0299 (1.0097)  acc1: 80.8000 (79.0182)  acc5: 95.6000 (95.7818)  time: 0.7696  data: 0.6369  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2414 (1.2021)  acc1: 72.4000 (75.0095)  acc5: 92.0000 (92.9333)  time: 0.1945  data: 0.0653  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3356 (1.2096)  acc1: 72.4000 (74.6880)  acc5: 92.0000 (92.9280)  time: 0.1935  data: 0.0652  max mem: 21847
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 75.536 Acc@5 93.128 loss 1.189
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.59%
Epoch: [61]  [   0/1251]  eta: 1:04:13  lr: 0.003792  min_lr: 0.003792  loss: 2.9275 (2.9275)  weight_decay: 0.0500 (0.0500)  time: 3.0800  data: 2.5609  max mem: 21847
Epoch: [61]  [ 200/1251]  eta: 0:05:03  lr: 0.003791  min_lr: 0.003791  loss: 3.7527 (3.4630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8076 (0.8673)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [61]  [ 400/1251]  eta: 0:04:00  lr: 0.003789  min_lr: 0.003789  loss: 2.5594 (3.4270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.8412)  time: 0.2805  data: 0.0004  max mem: 21847
Epoch: [61]  [ 600/1251]  eta: 0:03:02  lr: 0.003787  min_lr: 0.003787  loss: 3.1774 (3.4318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9816 (0.8710)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [61]  [ 800/1251]  eta: 0:02:05  lr: 0.003786  min_lr: 0.003786  loss: 3.7398 (3.4536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8682 (0.8641)  time: 0.2824  data: 0.0005  max mem: 21847
Epoch: [61]  [1000/1251]  eta: 0:01:09  lr: 0.003784  min_lr: 0.003784  loss: 3.3679 (3.4370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0789 (0.8670)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [61]  [1200/1251]  eta: 0:00:14  lr: 0.003782  min_lr: 0.003782  loss: 2.8763 (3.4283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7144 (0.8708)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 3.7767 (3.4269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9092 (0.8841)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [61] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 3.7767 (3.4091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9092 (0.8841)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8445 (0.8445)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.3793  data: 5.2024  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0395 (0.9911)  acc1: 78.4000 (79.7455)  acc5: 96.0000 (96.0364)  time: 0.6858  data: 0.5501  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1560 (1.1707)  acc1: 74.0000 (75.6191)  acc5: 92.0000 (93.3333)  time: 0.2022  data: 0.0724  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3200 (1.1809)  acc1: 72.0000 (75.2000)  acc5: 92.0000 (93.3280)  time: 0.2098  data: 0.0805  max mem: 21847
Test: Total time: 0:00:10 (0.4131 s / it)
* Acc@1 75.692 Acc@5 93.398 loss 1.168
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.69%
Epoch: [62]  [   0/1251]  eta: 1:02:40  lr: 0.003782  min_lr: 0.003782  loss: 4.1473 (4.1473)  weight_decay: 0.0500 (0.0500)  time: 3.0064  data: 2.6473  max mem: 21847
Epoch: [62]  [ 200/1251]  eta: 0:05:02  lr: 0.003780  min_lr: 0.003780  loss: 3.2632 (3.3782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.8326)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [62]  [ 400/1251]  eta: 0:03:58  lr: 0.003779  min_lr: 0.003779  loss: 2.8935 (3.4060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8307 (0.9142)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [62]  [ 600/1251]  eta: 0:03:01  lr: 0.003777  min_lr: 0.003777  loss: 3.6910 (3.4203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8806 (0.8950)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [62]  [ 800/1251]  eta: 0:02:05  lr: 0.003775  min_lr: 0.003775  loss: 3.5785 (3.4122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6188 (0.8986)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [62]  [1000/1251]  eta: 0:01:09  lr: 0.003774  min_lr: 0.003774  loss: 3.7990 (3.4111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7633 (0.8642)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [62]  [1200/1251]  eta: 0:00:14  lr: 0.003772  min_lr: 0.003772  loss: 2.9688 (3.3975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5822 (0.8601)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 3.0191 (3.3948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (0.8708)  time: 0.2275  data: 0.0007  max mem: 21847
Epoch: [62] Total time: 0:05:45 (0.2762 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 3.0191 (3.3995)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (0.8708)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7011 (0.7011)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.6085  data: 5.4640  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8912 (0.8783)  acc1: 79.6000 (79.6727)  acc5: 96.0000 (95.9636)  time: 0.7504  data: 0.6178  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0634 (1.0863)  acc1: 74.4000 (75.4476)  acc5: 92.8000 (93.4095)  time: 0.2015  data: 0.0721  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2170 (1.0943)  acc1: 71.6000 (75.1680)  acc5: 91.6000 (93.3920)  time: 0.1995  data: 0.0712  max mem: 21847
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 75.658 Acc@5 93.330 loss 1.087
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.69%
Epoch: [63]  [   0/1251]  eta: 1:08:26  lr: 0.003772  min_lr: 0.003772  loss: 3.0940 (3.0940)  weight_decay: 0.0500 (0.0500)  time: 3.2822  data: 1.5743  max mem: 21847
Epoch: [63]  [ 200/1251]  eta: 0:05:08  lr: 0.003770  min_lr: 0.003770  loss: 3.5750 (3.4056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6920 (0.8997)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [63]  [ 400/1251]  eta: 0:04:01  lr: 0.003768  min_lr: 0.003768  loss: 2.8941 (3.3843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8625 (0.8584)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [63]  [ 600/1251]  eta: 0:03:02  lr: 0.003767  min_lr: 0.003767  loss: 2.7462 (3.3811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8235 (0.9063)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [63]  [ 800/1251]  eta: 0:02:05  lr: 0.003765  min_lr: 0.003765  loss: 2.9592 (3.3642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.8896)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [63]  [1000/1251]  eta: 0:01:09  lr: 0.003763  min_lr: 0.003763  loss: 3.6004 (3.3742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9519 (0.9175)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [63]  [1200/1251]  eta: 0:00:14  lr: 0.003762  min_lr: 0.003762  loss: 3.3211 (3.3870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6517 (0.9011)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 3.6000 (3.3943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6956 (0.8972)  time: 0.2276  data: 0.0006  max mem: 21847
Epoch: [63] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 3.6000 (3.3918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6956 (0.8972)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8323 (0.8323)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 5.6091  data: 5.4380  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0108 (0.9745)  acc1: 79.6000 (79.6000)  acc5: 96.4000 (95.7091)  time: 0.6757  data: 0.5422  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2304 (1.1896)  acc1: 74.4000 (75.9429)  acc5: 92.8000 (93.2191)  time: 0.1714  data: 0.0428  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3534 (1.2022)  acc1: 72.4000 (75.4240)  acc5: 91.2000 (93.0400)  time: 0.1993  data: 0.0709  max mem: 21847
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 75.646 Acc@5 93.248 loss 1.197
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.69%
Epoch: [64]  [   0/1251]  eta: 1:06:08  lr: 0.003761  min_lr: 0.003761  loss: 4.0237 (4.0237)  weight_decay: 0.0500 (0.0500)  time: 3.1722  data: 2.7164  max mem: 21847
Epoch: [64]  [ 200/1251]  eta: 0:05:04  lr: 0.003760  min_lr: 0.003760  loss: 3.7559 (3.3422)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [64]  [ 400/1251]  eta: 0:04:00  lr: 0.003758  min_lr: 0.003758  loss: 3.4194 (3.4014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7588 (nan)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [64]  [ 600/1251]  eta: 0:03:02  lr: 0.003756  min_lr: 0.003756  loss: 3.3722 (3.4027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (nan)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [64]  [ 800/1251]  eta: 0:02:05  lr: 0.003754  min_lr: 0.003754  loss: 3.2586 (3.3828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7112 (nan)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [64]  [1000/1251]  eta: 0:01:09  lr: 0.003753  min_lr: 0.003753  loss: 3.8526 (3.3908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6749 (nan)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [64]  [1200/1251]  eta: 0:00:14  lr: 0.003751  min_lr: 0.003751  loss: 3.7447 (3.4024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8250 (nan)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 2.5457 (3.3956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9010 (nan)  time: 0.2283  data: 0.0009  max mem: 21847
Epoch: [64] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 2.5457 (3.3751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9010 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.8133 (0.8133)  acc1: 81.2000 (81.2000)  acc5: 97.6000 (97.6000)  time: 5.8559  data: 5.6939  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8596 (0.9002)  acc1: 81.2000 (80.6909)  acc5: 96.8000 (96.5091)  time: 0.7524  data: 0.6187  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1268 (1.1052)  acc1: 74.4000 (76.2095)  acc5: 92.8000 (93.6952)  time: 0.2082  data: 0.0790  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2674 (1.1145)  acc1: 72.8000 (75.8240)  acc5: 92.0000 (93.5520)  time: 0.2073  data: 0.0789  max mem: 21847
Test: Total time: 0:00:10 (0.4246 s / it)
* Acc@1 75.814 Acc@5 93.290 loss 1.112
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.81%
Epoch: [65]  [   0/1251]  eta: 1:04:02  lr: 0.003751  min_lr: 0.003751  loss: 3.6406 (3.6406)  weight_decay: 0.0500 (0.0500)  time: 3.0714  data: 2.7293  max mem: 21847
Epoch: [65]  [ 200/1251]  eta: 0:05:02  lr: 0.003749  min_lr: 0.003749  loss: 3.4679 (3.3616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0416 (0.9251)  time: 0.2715  data: 0.0005  max mem: 21847
Epoch: [65]  [ 400/1251]  eta: 0:03:59  lr: 0.003747  min_lr: 0.003747  loss: 3.2844 (3.3747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.9016)  time: 0.2749  data: 0.0005  max mem: 21847
Epoch: [65]  [ 600/1251]  eta: 0:03:01  lr: 0.003745  min_lr: 0.003745  loss: 2.8308 (3.3590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.8919)  time: 0.2806  data: 0.0004  max mem: 21847
Epoch: [65]  [ 800/1251]  eta: 0:02:05  lr: 0.003744  min_lr: 0.003744  loss: 3.5962 (3.3342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (0.8785)  time: 0.2719  data: 0.0005  max mem: 21847
Epoch: [65]  [1000/1251]  eta: 0:01:09  lr: 0.003742  min_lr: 0.003742  loss: 3.0984 (3.3415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (0.9022)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [65]  [1200/1251]  eta: 0:00:14  lr: 0.003740  min_lr: 0.003740  loss: 3.3806 (3.3576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7851 (0.8984)  time: 0.2744  data: 0.0005  max mem: 21847
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 2.9831 (3.3566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8087 (0.8951)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [65] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 2.9831 (3.3766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8087 (0.8951)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7982 (0.7982)  acc1: 84.8000 (84.8000)  acc5: 96.4000 (96.4000)  time: 5.5982  data: 5.4451  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 1.0069 (0.9454)  acc1: 80.0000 (80.1091)  acc5: 96.0000 (95.8545)  time: 0.7314  data: 0.5994  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1682 (1.1387)  acc1: 74.4000 (75.6762)  acc5: 93.6000 (93.5238)  time: 0.2092  data: 0.0799  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2369 (1.1560)  acc1: 73.2000 (75.3760)  acc5: 92.0000 (93.2960)  time: 0.2080  data: 0.0798  max mem: 21847
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 75.848 Acc@5 93.422 loss 1.140
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.85%
Epoch: [66]  [   0/1251]  eta: 1:02:31  lr: 0.003740  min_lr: 0.003740  loss: 2.9134 (2.9134)  weight_decay: 0.0500 (0.0500)  time: 2.9987  data: 2.6076  max mem: 21847
Epoch: [66]  [ 200/1251]  eta: 0:05:02  lr: 0.003738  min_lr: 0.003738  loss: 3.4692 (3.4068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8803 (0.8674)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [66]  [ 400/1251]  eta: 0:03:58  lr: 0.003736  min_lr: 0.003736  loss: 3.3694 (3.4192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7287 (0.8848)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [66]  [ 600/1251]  eta: 0:03:01  lr: 0.003734  min_lr: 0.003734  loss: 3.3345 (3.4253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7891 (0.8823)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [66]  [ 800/1251]  eta: 0:02:05  lr: 0.003732  min_lr: 0.003732  loss: 3.2956 (3.4053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8607 (0.8867)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [66]  [1000/1251]  eta: 0:01:09  lr: 0.003731  min_lr: 0.003731  loss: 3.6463 (3.4210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7964 (0.9004)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [66]  [1200/1251]  eta: 0:00:14  lr: 0.003729  min_lr: 0.003729  loss: 3.8067 (3.4085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5941 (0.8822)  time: 0.2795  data: 0.0005  max mem: 21847
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 3.4689 (3.4117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.8799)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [66] Total time: 0:05:45 (0.2759 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 3.4689 (3.3849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.8799)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8382 (0.8382)  acc1: 82.0000 (82.0000)  acc5: 96.8000 (96.8000)  time: 5.8397  data: 5.6673  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9447 (0.9340)  acc1: 79.6000 (79.2000)  acc5: 96.8000 (96.1091)  time: 0.7243  data: 0.5870  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1236 (1.1244)  acc1: 72.4000 (75.8476)  acc5: 93.6000 (93.6000)  time: 0.1870  data: 0.0563  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2539 (1.1352)  acc1: 72.0000 (75.5200)  acc5: 92.8000 (93.5040)  time: 0.2018  data: 0.0735  max mem: 21847
Test: Total time: 0:00:10 (0.4211 s / it)
* Acc@1 76.008 Acc@5 93.432 loss 1.130
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.01%
Epoch: [67]  [   0/1251]  eta: 1:08:51  lr: 0.003728  min_lr: 0.003728  loss: 2.9648 (2.9648)  weight_decay: 0.0500 (0.0500)  time: 3.3027  data: 3.0051  max mem: 21847
Epoch: [67]  [ 200/1251]  eta: 0:05:04  lr: 0.003727  min_lr: 0.003727  loss: 3.4982 (3.3993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8756 (0.9508)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [67]  [ 400/1251]  eta: 0:03:59  lr: 0.003725  min_lr: 0.003725  loss: 3.3146 (3.3917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8443 (0.9337)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [67]  [ 600/1251]  eta: 0:03:02  lr: 0.003723  min_lr: 0.003723  loss: 3.4928 (3.3755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9473 (0.9739)  time: 0.2766  data: 0.0004  max mem: 21847
Epoch: [67]  [ 800/1251]  eta: 0:02:05  lr: 0.003721  min_lr: 0.003721  loss: 3.6825 (3.3871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9170 (0.9381)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [67]  [1000/1251]  eta: 0:01:09  lr: 0.003719  min_lr: 0.003719  loss: 3.7937 (3.3758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.9249)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [67]  [1200/1251]  eta: 0:00:14  lr: 0.003717  min_lr: 0.003717  loss: 2.8984 (3.3694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7970 (0.9303)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 3.8327 (3.3698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7863 (0.9288)  time: 0.2336  data: 0.0007  max mem: 21847
Epoch: [67] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 3.8327 (3.3684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7863 (0.9288)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8501 (0.8501)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 5.3217  data: 5.1553  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 1.0061 (1.0017)  acc1: 80.8000 (79.7091)  acc5: 95.6000 (95.6364)  time: 0.7351  data: 0.5990  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1832 (1.1950)  acc1: 74.4000 (75.2571)  acc5: 93.2000 (92.9333)  time: 0.2122  data: 0.0801  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3079 (1.2075)  acc1: 72.0000 (75.0560)  acc5: 90.8000 (92.7840)  time: 0.2122  data: 0.0800  max mem: 21847
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 75.888 Acc@5 93.406 loss 1.189
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.01%
Epoch: [68]  [   0/1251]  eta: 1:08:27  lr: 0.003717  min_lr: 0.003717  loss: 3.4795 (3.4795)  weight_decay: 0.0500 (0.0500)  time: 3.2838  data: 2.8974  max mem: 21847
Epoch: [68]  [ 200/1251]  eta: 0:05:04  lr: 0.003715  min_lr: 0.003715  loss: 3.5037 (3.3239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6241 (0.7347)  time: 0.2823  data: 0.0004  max mem: 21847
Epoch: [68]  [ 400/1251]  eta: 0:04:01  lr: 0.003713  min_lr: 0.003713  loss: 3.1306 (3.3301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.8396)  time: 0.2818  data: 0.0004  max mem: 21847
Epoch: [68]  [ 600/1251]  eta: 0:03:02  lr: 0.003711  min_lr: 0.003711  loss: 3.5038 (3.3335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6175 (0.8317)  time: 0.2710  data: 0.0005  max mem: 21847
Epoch: [68]  [ 800/1251]  eta: 0:02:05  lr: 0.003710  min_lr: 0.003710  loss: 3.4532 (3.3435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7544 (0.8465)  time: 0.2726  data: 0.0003  max mem: 21847
Epoch: [68]  [1000/1251]  eta: 0:01:09  lr: 0.003708  min_lr: 0.003708  loss: 3.3059 (3.3486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7605 (0.8317)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [68]  [1200/1251]  eta: 0:00:14  lr: 0.003706  min_lr: 0.003706  loss: 3.2633 (3.3404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8551 (0.8520)  time: 0.2757  data: 0.0004  max mem: 21847
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.9314 (3.3430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9817 (0.8721)  time: 0.2311  data: 0.0007  max mem: 21847
Epoch: [68] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.9314 (3.3651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9817 (0.8721)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7168 (0.7168)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.3530  data: 5.1834  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9288 (0.9289)  acc1: 79.2000 (80.2545)  acc5: 96.4000 (95.7818)  time: 0.7299  data: 0.5951  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1431 (1.1201)  acc1: 74.4000 (76.1333)  acc5: 92.8000 (93.4476)  time: 0.2108  data: 0.0812  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2074 (1.1251)  acc1: 72.8000 (75.7760)  acc5: 92.8000 (93.3920)  time: 0.2110  data: 0.0811  max mem: 21847
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 76.022 Acc@5 93.438 loss 1.120
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.02%
Epoch: [69]  [   0/1251]  eta: 1:02:09  lr: 0.003705  min_lr: 0.003705  loss: 2.3981 (2.3981)  weight_decay: 0.0500 (0.0500)  time: 2.9813  data: 2.6055  max mem: 21847
Epoch: [69]  [ 200/1251]  eta: 0:05:02  lr: 0.003703  min_lr: 0.003703  loss: 3.1143 (3.3993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7830 (0.8680)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [69]  [ 400/1251]  eta: 0:04:00  lr: 0.003702  min_lr: 0.003702  loss: 3.2708 (3.3507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8781 (0.8719)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [69]  [ 600/1251]  eta: 0:03:02  lr: 0.003700  min_lr: 0.003700  loss: 3.4443 (3.3517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0165 (0.9158)  time: 0.2839  data: 0.0004  max mem: 21847
Epoch: [69]  [ 800/1251]  eta: 0:02:05  lr: 0.003698  min_lr: 0.003698  loss: 3.8137 (3.3601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7739 (0.8826)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [69]  [1000/1251]  eta: 0:01:09  lr: 0.003696  min_lr: 0.003696  loss: 3.6571 (3.3523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8285 (0.8962)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [69]  [1200/1251]  eta: 0:00:14  lr: 0.003694  min_lr: 0.003694  loss: 3.0817 (3.3598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9020 (0.8971)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 3.8105 (3.3627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9599 (0.8996)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [69] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 3.8105 (3.3566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9599 (0.8996)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7480 (0.7480)  acc1: 83.6000 (83.6000)  acc5: 96.0000 (96.0000)  time: 5.6798  data: 5.5186  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9343 (0.9383)  acc1: 79.6000 (79.6364)  acc5: 96.0000 (95.5273)  time: 0.7463  data: 0.6128  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1430 (1.1215)  acc1: 72.4000 (75.8667)  acc5: 92.8000 (93.1238)  time: 0.2074  data: 0.0783  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2320 (1.1281)  acc1: 72.4000 (75.3920)  acc5: 92.4000 (93.1040)  time: 0.2104  data: 0.0821  max mem: 21847
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 76.074 Acc@5 93.444 loss 1.112
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.07%
Epoch: [70]  [   0/1251]  eta: 0:58:17  lr: 0.003694  min_lr: 0.003694  loss: 3.7936 (3.7936)  weight_decay: 0.0500 (0.0500)  time: 2.7957  data: 2.4560  max mem: 21847
Epoch: [70]  [ 200/1251]  eta: 0:05:05  lr: 0.003692  min_lr: 0.003692  loss: 3.2294 (3.3458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (0.7424)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [70]  [ 400/1251]  eta: 0:04:00  lr: 0.003690  min_lr: 0.003690  loss: 3.6573 (3.3826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.8128)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [70]  [ 600/1251]  eta: 0:03:02  lr: 0.003688  min_lr: 0.003688  loss: 3.0381 (3.3756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9247 (0.8608)  time: 0.2913  data: 0.0004  max mem: 21847
Epoch: [70]  [ 800/1251]  eta: 0:02:05  lr: 0.003686  min_lr: 0.003686  loss: 3.3304 (3.3662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6535 (0.8313)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [70]  [1000/1251]  eta: 0:01:09  lr: 0.003684  min_lr: 0.003684  loss: 3.7716 (3.3789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7712 (0.8519)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [70]  [1200/1251]  eta: 0:00:14  lr: 0.003682  min_lr: 0.003682  loss: 3.6680 (3.3877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0890 (0.8628)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 3.0833 (3.3828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8555 (0.8617)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [70] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 3.0833 (3.3594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8555 (0.8617)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7490 (0.7490)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.6614  data: 5.5004  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8616 (0.9274)  acc1: 81.6000 (80.5091)  acc5: 96.4000 (96.0727)  time: 0.7000  data: 0.5669  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1690 (1.1192)  acc1: 74.8000 (76.3619)  acc5: 92.0000 (93.2762)  time: 0.1791  data: 0.0484  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2334 (1.1262)  acc1: 72.8000 (76.1280)  acc5: 91.6000 (93.2960)  time: 0.1782  data: 0.0483  max mem: 21847
Test: Total time: 0:00:09 (0.3935 s / it)
* Acc@1 76.072 Acc@5 93.540 loss 1.119
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.07%
Epoch: [71]  [   0/1251]  eta: 1:05:43  lr: 0.003681  min_lr: 0.003681  loss: 2.5505 (2.5505)  weight_decay: 0.0500 (0.0500)  time: 3.1520  data: 2.3737  max mem: 21847
Epoch: [71]  [ 200/1251]  eta: 0:05:05  lr: 0.003680  min_lr: 0.003680  loss: 2.9270 (3.3034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0205 (0.9568)  time: 0.2798  data: 0.0005  max mem: 21847
Epoch: [71]  [ 400/1251]  eta: 0:04:00  lr: 0.003678  min_lr: 0.003678  loss: 3.7244 (3.3607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8421 (0.9676)  time: 0.2810  data: 0.0004  max mem: 21847
Epoch: [71]  [ 600/1251]  eta: 0:03:01  lr: 0.003676  min_lr: 0.003676  loss: 3.6590 (3.3303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0725 (0.9596)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [71]  [ 800/1251]  eta: 0:02:05  lr: 0.003674  min_lr: 0.003674  loss: 3.1596 (3.3480)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [71]  [1000/1251]  eta: 0:01:09  lr: 0.003672  min_lr: 0.003672  loss: 3.5356 (3.3577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7851 (nan)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [71]  [1200/1251]  eta: 0:00:14  lr: 0.003670  min_lr: 0.003670  loss: 3.6587 (3.3747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (nan)  time: 0.2817  data: 0.0005  max mem: 21847
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.6993 (3.3742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0396 (nan)  time: 0.2365  data: 0.0007  max mem: 21847
Epoch: [71] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.6993 (3.3574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0396 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8813 (0.8813)  acc1: 82.8000 (82.8000)  acc5: 97.6000 (97.6000)  time: 5.4922  data: 5.3094  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9558 (1.0135)  acc1: 78.0000 (79.3455)  acc5: 96.8000 (95.9273)  time: 0.7345  data: 0.5968  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2560 (1.1909)  acc1: 74.0000 (75.2762)  acc5: 92.8000 (93.5619)  time: 0.2100  data: 0.0794  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2801 (1.1951)  acc1: 72.8000 (75.1200)  acc5: 91.2000 (93.4400)  time: 0.2078  data: 0.0793  max mem: 21847
Test: Total time: 0:00:10 (0.4113 s / it)
* Acc@1 75.794 Acc@5 93.440 loss 1.185
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.07%
Epoch: [72]  [   0/1251]  eta: 1:11:11  lr: 0.003669  min_lr: 0.003669  loss: 3.8443 (3.8443)  weight_decay: 0.0500 (0.0500)  time: 3.4143  data: 3.0073  max mem: 21847
Epoch: [72]  [ 200/1251]  eta: 0:05:05  lr: 0.003667  min_lr: 0.003667  loss: 2.8898 (3.3147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8516 (0.8092)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [72]  [ 400/1251]  eta: 0:04:00  lr: 0.003665  min_lr: 0.003665  loss: 3.4241 (3.3292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.8981)  time: 0.2761  data: 0.0004  max mem: 21847
Epoch: [72]  [ 600/1251]  eta: 0:03:02  lr: 0.003663  min_lr: 0.003663  loss: 3.6324 (3.3118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9376 (0.8676)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [72]  [ 800/1251]  eta: 0:02:05  lr: 0.003661  min_lr: 0.003661  loss: 3.5316 (3.3295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (0.8874)  time: 0.2737  data: 0.0005  max mem: 21847
Epoch: [72]  [1000/1251]  eta: 0:01:09  lr: 0.003659  min_lr: 0.003659  loss: 3.6316 (3.3337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8905 (0.8877)  time: 0.2839  data: 0.0005  max mem: 21847
Epoch: [72]  [1200/1251]  eta: 0:00:14  lr: 0.003657  min_lr: 0.003657  loss: 3.8572 (3.3373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7963 (0.8758)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 3.3586 (3.3305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (0.8784)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [72] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 3.3586 (3.3477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (0.8784)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7328 (0.7328)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.4823  data: 5.3262  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9221 (0.9232)  acc1: 80.0000 (79.9273)  acc5: 96.8000 (96.4364)  time: 0.7634  data: 0.6285  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1234 (1.1171)  acc1: 72.4000 (75.4667)  acc5: 93.6000 (93.6381)  time: 0.2096  data: 0.0795  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2744 (1.1320)  acc1: 71.6000 (74.9280)  acc5: 92.0000 (93.4400)  time: 0.2011  data: 0.0728  max mem: 21847
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 75.794 Acc@5 93.476 loss 1.121
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.07%
Epoch: [73]  [   0/1251]  eta: 1:08:09  lr: 0.003657  min_lr: 0.003657  loss: 2.7675 (2.7675)  weight_decay: 0.0500 (0.0500)  time: 3.2692  data: 2.8308  max mem: 21847
Epoch: [73]  [ 200/1251]  eta: 0:05:03  lr: 0.003655  min_lr: 0.003655  loss: 3.7252 (3.4252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7836 (0.9552)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [73]  [ 400/1251]  eta: 0:04:00  lr: 0.003653  min_lr: 0.003653  loss: 3.5092 (3.3878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9411 (0.9749)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [73]  [ 600/1251]  eta: 0:03:02  lr: 0.003651  min_lr: 0.003651  loss: 3.3933 (3.3567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (0.9056)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [73]  [ 800/1251]  eta: 0:02:05  lr: 0.003649  min_lr: 0.003649  loss: 3.0384 (3.3812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.9067)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [73]  [1000/1251]  eta: 0:01:09  lr: 0.003647  min_lr: 0.003647  loss: 3.5734 (3.3827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7235 (0.9080)  time: 0.2809  data: 0.0004  max mem: 21847
Epoch: [73]  [1200/1251]  eta: 0:00:14  lr: 0.003645  min_lr: 0.003645  loss: 3.0536 (3.3753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7907 (0.9133)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 3.0013 (3.3729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7816 (0.9106)  time: 0.2281  data: 0.0013  max mem: 21847
Epoch: [73] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 3.0013 (3.3402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7816 (0.9106)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7169 (0.7169)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.6455  data: 5.4821  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8999 (0.8702)  acc1: 79.2000 (80.4000)  acc5: 96.8000 (96.4000)  time: 0.7219  data: 0.5883  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0951 (1.0622)  acc1: 74.4000 (76.4571)  acc5: 93.2000 (93.8476)  time: 0.2047  data: 0.0756  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1413 (1.0743)  acc1: 74.4000 (75.8560)  acc5: 92.4000 (93.7760)  time: 0.2039  data: 0.0755  max mem: 21847
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 76.128 Acc@5 93.622 loss 1.067
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.13%
Epoch: [74]  [   0/1251]  eta: 1:09:34  lr: 0.003644  min_lr: 0.003644  loss: 3.7116 (3.7116)  weight_decay: 0.0500 (0.0500)  time: 3.3368  data: 3.0556  max mem: 21847
Epoch: [74]  [ 200/1251]  eta: 0:05:04  lr: 0.003642  min_lr: 0.003642  loss: 3.7094 (3.4287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7571 (0.7838)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [74]  [ 400/1251]  eta: 0:03:59  lr: 0.003640  min_lr: 0.003640  loss: 3.1948 (3.3773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (0.9396)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [74]  [ 600/1251]  eta: 0:03:01  lr: 0.003638  min_lr: 0.003638  loss: 3.5110 (3.3585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (0.8899)  time: 0.2717  data: 0.0003  max mem: 21847
Epoch: [74]  [ 800/1251]  eta: 0:02:05  lr: 0.003636  min_lr: 0.003636  loss: 3.7898 (3.3520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8282 (0.8880)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [74]  [1000/1251]  eta: 0:01:09  lr: 0.003634  min_lr: 0.003634  loss: 3.6989 (3.3689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6206 (0.8536)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [74]  [1200/1251]  eta: 0:00:14  lr: 0.003632  min_lr: 0.003632  loss: 3.1002 (3.3629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8633 (0.8637)  time: 0.2749  data: 0.0004  max mem: 21847
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 2.9631 (3.3619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6791 (0.8616)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [74] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 2.9631 (3.3422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6791 (0.8616)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7087 (0.7087)  acc1: 85.2000 (85.2000)  acc5: 95.6000 (95.6000)  time: 5.4073  data: 5.2544  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9032 (0.8798)  acc1: 81.2000 (80.5091)  acc5: 96.4000 (96.1091)  time: 0.7004  data: 0.5667  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1248 (1.0622)  acc1: 74.0000 (76.3810)  acc5: 93.2000 (93.4857)  time: 0.1958  data: 0.0660  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1556 (1.0687)  acc1: 74.0000 (76.3040)  acc5: 92.4000 (93.4080)  time: 0.1944  data: 0.0660  max mem: 21847
Test: Total time: 0:00:09 (0.3964 s / it)
* Acc@1 76.316 Acc@5 93.598 loss 1.063
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.32%
Epoch: [75]  [   0/1251]  eta: 1:05:17  lr: 0.003631  min_lr: 0.003631  loss: 3.6099 (3.6099)  weight_decay: 0.0500 (0.0500)  time: 3.1318  data: 2.8202  max mem: 21847
Epoch: [75]  [ 200/1251]  eta: 0:05:04  lr: 0.003629  min_lr: 0.003629  loss: 3.7055 (3.3007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9305 (0.8820)  time: 0.2799  data: 0.0004  max mem: 21847
Epoch: [75]  [ 400/1251]  eta: 0:04:00  lr: 0.003627  min_lr: 0.003627  loss: 3.2863 (3.2888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7357 (0.8903)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [75]  [ 600/1251]  eta: 0:03:01  lr: 0.003625  min_lr: 0.003625  loss: 3.5960 (3.3083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9232 (0.8828)  time: 0.2706  data: 0.0005  max mem: 21847
Epoch: [75]  [ 800/1251]  eta: 0:02:05  lr: 0.003623  min_lr: 0.003623  loss: 3.5783 (3.3124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7040 (0.8907)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [75]  [1000/1251]  eta: 0:01:09  lr: 0.003621  min_lr: 0.003621  loss: 3.0595 (3.3191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9707 (0.8993)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [75]  [1200/1251]  eta: 0:00:14  lr: 0.003619  min_lr: 0.003619  loss: 3.2531 (3.3304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6254 (0.8734)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 3.1883 (3.3293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7279 (0.8696)  time: 0.2292  data: 0.0007  max mem: 21847
Epoch: [75] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 3.1883 (3.3378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7279 (0.8696)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8001 (0.8001)  acc1: 84.0000 (84.0000)  acc5: 96.4000 (96.4000)  time: 5.4758  data: 5.3005  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9197 (0.9584)  acc1: 78.8000 (79.6727)  acc5: 96.4000 (95.9636)  time: 0.7314  data: 0.5963  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1258 (1.1369)  acc1: 74.8000 (76.4762)  acc5: 92.4000 (93.5048)  time: 0.2092  data: 0.0799  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2469 (1.1548)  acc1: 73.6000 (75.6960)  acc5: 92.0000 (93.3920)  time: 0.2088  data: 0.0798  max mem: 21847
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 76.062 Acc@5 93.614 loss 1.140
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.32%
Epoch: [76]  [   0/1251]  eta: 1:04:23  lr: 0.003618  min_lr: 0.003618  loss: 3.5297 (3.5297)  weight_decay: 0.0500 (0.0500)  time: 3.0882  data: 2.3273  max mem: 21847
Epoch: [76]  [ 200/1251]  eta: 0:05:04  lr: 0.003616  min_lr: 0.003616  loss: 3.0311 (3.3536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8143 (0.8516)  time: 0.2721  data: 0.0005  max mem: 21847
Epoch: [76]  [ 400/1251]  eta: 0:04:01  lr: 0.003614  min_lr: 0.003614  loss: 3.6760 (3.3394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (0.9328)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [76]  [ 600/1251]  eta: 0:03:02  lr: 0.003612  min_lr: 0.003612  loss: 3.6499 (3.3336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8482 (0.8913)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [76]  [ 800/1251]  eta: 0:02:05  lr: 0.003610  min_lr: 0.003610  loss: 3.6964 (3.3287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7958 (0.8937)  time: 0.2730  data: 0.0010  max mem: 21847
Epoch: [76]  [1000/1251]  eta: 0:01:09  lr: 0.003607  min_lr: 0.003607  loss: 3.0729 (3.3339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (0.8632)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [76]  [1200/1251]  eta: 0:00:14  lr: 0.003605  min_lr: 0.003605  loss: 3.2194 (3.3258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7259 (0.8906)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 3.5302 (3.3248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6817 (0.8920)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [76] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 3.5302 (3.3277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6817 (0.8920)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7719 (0.7719)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.7266  data: 5.5645  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8688 (0.9433)  acc1: 80.0000 (80.0727)  acc5: 96.4000 (95.9273)  time: 0.7430  data: 0.6095  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1358 (1.1214)  acc1: 74.4000 (76.3238)  acc5: 92.4000 (93.6000)  time: 0.2064  data: 0.0770  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1358 (1.1359)  acc1: 73.6000 (75.8240)  acc5: 92.0000 (93.5200)  time: 0.2055  data: 0.0769  max mem: 21847
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 76.042 Acc@5 93.606 loss 1.127
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.32%
Epoch: [77]  [   0/1251]  eta: 0:59:57  lr: 0.003605  min_lr: 0.003605  loss: 3.2542 (3.2542)  weight_decay: 0.0500 (0.0500)  time: 2.8754  data: 1.8015  max mem: 21847
Epoch: [77]  [ 200/1251]  eta: 0:05:02  lr: 0.003603  min_lr: 0.003603  loss: 3.3625 (3.3402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6963 (0.8590)  time: 0.2713  data: 0.0005  max mem: 21847
Epoch: [77]  [ 400/1251]  eta: 0:03:59  lr: 0.003601  min_lr: 0.003601  loss: 3.4765 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6340 (0.8362)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [77]  [ 600/1251]  eta: 0:03:01  lr: 0.003598  min_lr: 0.003598  loss: 3.2948 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.8471)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [77]  [ 800/1251]  eta: 0:02:05  lr: 0.003596  min_lr: 0.003596  loss: 2.6961 (3.2824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9142 (0.8641)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [77]  [1000/1251]  eta: 0:01:09  lr: 0.003594  min_lr: 0.003594  loss: 3.2852 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.8529)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [77]  [1200/1251]  eta: 0:00:14  lr: 0.003592  min_lr: 0.003592  loss: 3.1432 (3.3014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.8714)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 3.7879 (3.3027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7382 (0.8706)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [77] Total time: 0:05:45 (0.2761 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 3.7879 (3.3405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7382 (0.8706)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7865 (0.7865)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.4803  data: 5.3103  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9480 (0.9398)  acc1: 80.4000 (80.3273)  acc5: 96.0000 (95.7818)  time: 0.7483  data: 0.6126  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1652 (1.1263)  acc1: 74.0000 (76.3429)  acc5: 92.4000 (93.3143)  time: 0.2123  data: 0.0825  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2749 (1.1381)  acc1: 73.2000 (76.0640)  acc5: 92.4000 (93.3280)  time: 0.2114  data: 0.0824  max mem: 21847
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 76.334 Acc@5 93.458 loss 1.131
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.33%
Epoch: [78]  [   0/1251]  eta: 1:02:34  lr: 0.003591  min_lr: 0.003591  loss: 3.1430 (3.1430)  weight_decay: 0.0500 (0.0500)  time: 3.0014  data: 2.6226  max mem: 21847
Epoch: [78]  [ 200/1251]  eta: 0:05:04  lr: 0.003589  min_lr: 0.003589  loss: 3.7118 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8046 (0.8322)  time: 0.2744  data: 0.0005  max mem: 21847
Epoch: [78]  [ 400/1251]  eta: 0:04:00  lr: 0.003587  min_lr: 0.003587  loss: 3.5304 (3.2937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7431 (0.8572)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [78]  [ 600/1251]  eta: 0:03:01  lr: 0.003585  min_lr: 0.003585  loss: 3.0147 (3.3286)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [78]  [ 800/1251]  eta: 0:02:05  lr: 0.003583  min_lr: 0.003583  loss: 2.9750 (3.3271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6612 (nan)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [78]  [1000/1251]  eta: 0:01:09  lr: 0.003580  min_lr: 0.003580  loss: 3.0291 (3.3231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9172 (nan)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [78]  [1200/1251]  eta: 0:00:14  lr: 0.003578  min_lr: 0.003578  loss: 3.7000 (3.3286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (nan)  time: 0.2719  data: 0.0005  max mem: 21847
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 3.6457 (3.3279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (nan)  time: 0.2283  data: 0.0006  max mem: 21847
Epoch: [78] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 3.6457 (3.3173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7193 (0.7193)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.4983  data: 5.3330  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8870 (0.9144)  acc1: 80.0000 (80.2909)  acc5: 96.0000 (96.1818)  time: 0.7090  data: 0.5766  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1261 (1.0955)  acc1: 74.8000 (76.4762)  acc5: 93.6000 (93.7905)  time: 0.1889  data: 0.0606  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2217 (1.1061)  acc1: 74.4000 (76.1760)  acc5: 92.4000 (93.7280)  time: 0.1887  data: 0.0605  max mem: 21847
Test: Total time: 0:00:09 (0.3948 s / it)
* Acc@1 76.448 Acc@5 93.642 loss 1.091
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.45%
Epoch: [79]  [   0/1251]  eta: 1:02:12  lr: 0.003578  min_lr: 0.003578  loss: 2.4820 (2.4820)  weight_decay: 0.0500 (0.0500)  time: 2.9834  data: 2.5818  max mem: 21847
Epoch: [79]  [ 200/1251]  eta: 0:05:01  lr: 0.003575  min_lr: 0.003575  loss: 3.4371 (3.2983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.7801)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [79]  [ 400/1251]  eta: 0:04:00  lr: 0.003573  min_lr: 0.003573  loss: 2.9326 (3.3121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6619 (0.8345)  time: 0.2753  data: 0.0004  max mem: 21847
Epoch: [79]  [ 600/1251]  eta: 0:03:01  lr: 0.003571  min_lr: 0.003571  loss: 3.7300 (3.3180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7407 (0.8349)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [79]  [ 800/1251]  eta: 0:02:05  lr: 0.003569  min_lr: 0.003569  loss: 3.4458 (3.3317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7158 (0.8143)  time: 0.2707  data: 0.0005  max mem: 21847
Epoch: [79]  [1000/1251]  eta: 0:01:09  lr: 0.003567  min_lr: 0.003567  loss: 3.5260 (3.3290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9609 (0.8419)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [79]  [1200/1251]  eta: 0:00:14  lr: 0.003564  min_lr: 0.003564  loss: 3.0366 (3.3039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.8414)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 3.7464 (3.3089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.8395)  time: 0.2285  data: 0.0007  max mem: 21847
Epoch: [79] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 3.7464 (3.3250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.8395)
Test:  [ 0/25]  eta: 0:02:02  loss: 0.8133 (0.8133)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 4.8892  data: 4.7216  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9723 (1.0101)  acc1: 81.6000 (80.1091)  acc5: 96.0000 (95.8546)  time: 0.6611  data: 0.5262  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.2206 (1.1804)  acc1: 73.6000 (76.3238)  acc5: 93.6000 (93.8857)  time: 0.2149  data: 0.0854  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.3050 (1.1876)  acc1: 73.2000 (76.0320)  acc5: 92.4000 (93.7760)  time: 0.2075  data: 0.0793  max mem: 21847
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 76.288 Acc@5 93.726 loss 1.189
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.45%
Epoch: [80]  [   0/1251]  eta: 1:01:19  lr: 0.003564  min_lr: 0.003564  loss: 2.9613 (2.9613)  weight_decay: 0.0500 (0.0500)  time: 2.9416  data: 2.3968  max mem: 21847
Epoch: [80]  [ 200/1251]  eta: 0:05:04  lr: 0.003562  min_lr: 0.003562  loss: 3.2018 (3.2747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7747 (0.9973)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [80]  [ 400/1251]  eta: 0:04:00  lr: 0.003559  min_lr: 0.003559  loss: 3.2116 (3.2956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.9079)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [80]  [ 600/1251]  eta: 0:03:01  lr: 0.003557  min_lr: 0.003557  loss: 3.7227 (3.2848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6445 (0.8799)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [80]  [ 800/1251]  eta: 0:02:05  lr: 0.003555  min_lr: 0.003555  loss: 3.1357 (3.2921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7188 (0.8723)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [80]  [1000/1251]  eta: 0:01:09  lr: 0.003553  min_lr: 0.003553  loss: 3.1033 (3.2883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7549 (0.8719)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [80]  [1200/1251]  eta: 0:00:14  lr: 0.003550  min_lr: 0.003550  loss: 3.7309 (3.3062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7584 (0.8637)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 3.3042 (3.3015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7868 (0.8685)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [80] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 3.3042 (3.3104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7868 (0.8685)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6649 (0.6649)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.7686  data: 5.5976  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8955 (0.8774)  acc1: 80.8000 (79.7091)  acc5: 96.8000 (96.1455)  time: 0.7186  data: 0.5851  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0974 (1.0482)  acc1: 74.4000 (76.6286)  acc5: 94.0000 (93.7524)  time: 0.1881  data: 0.0594  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1874 (1.0606)  acc1: 74.4000 (76.1600)  acc5: 91.6000 (93.6160)  time: 0.2029  data: 0.0749  max mem: 21847
Test: Total time: 0:00:10 (0.4177 s / it)
* Acc@1 76.426 Acc@5 93.702 loss 1.056
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.45%
Epoch: [81]  [   0/1251]  eta: 1:07:37  lr: 0.003550  min_lr: 0.003550  loss: 2.7235 (2.7235)  weight_decay: 0.0500 (0.0500)  time: 3.2435  data: 2.9146  max mem: 21847
Epoch: [81]  [ 200/1251]  eta: 0:05:04  lr: 0.003547  min_lr: 0.003547  loss: 3.3133 (3.2930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8530 (0.9317)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [81]  [ 400/1251]  eta: 0:04:00  lr: 0.003545  min_lr: 0.003545  loss: 3.4567 (3.3204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6178 (0.8733)  time: 0.2741  data: 0.0005  max mem: 21847
Epoch: [81]  [ 600/1251]  eta: 0:03:02  lr: 0.003543  min_lr: 0.003543  loss: 3.0539 (3.3110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7908 (0.8570)  time: 0.2831  data: 0.0004  max mem: 21847
Epoch: [81]  [ 800/1251]  eta: 0:02:05  lr: 0.003541  min_lr: 0.003541  loss: 3.0209 (3.2965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7897 (0.8511)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [81]  [1000/1251]  eta: 0:01:09  lr: 0.003538  min_lr: 0.003538  loss: 3.7749 (3.3086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.8333)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [81]  [1200/1251]  eta: 0:00:14  lr: 0.003536  min_lr: 0.003536  loss: 3.5434 (3.3201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7530 (0.8286)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 3.5553 (3.3212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6033 (0.8209)  time: 0.2292  data: 0.0005  max mem: 21847
Epoch: [81] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 3.5553 (3.3058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6033 (0.8209)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7363 (0.7363)  acc1: 84.4000 (84.4000)  acc5: 96.0000 (96.0000)  time: 5.3646  data: 5.1974  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9242 (0.9117)  acc1: 81.6000 (80.1091)  acc5: 96.0000 (95.9273)  time: 0.7305  data: 0.5944  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1392 (1.0706)  acc1: 73.6000 (76.1333)  acc5: 93.2000 (93.7333)  time: 0.2219  data: 0.0915  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1855 (1.0846)  acc1: 72.8000 (75.8720)  acc5: 92.8000 (93.6800)  time: 0.2197  data: 0.0904  max mem: 21847
Test: Total time: 0:00:10 (0.4158 s / it)
* Acc@1 76.508 Acc@5 93.704 loss 1.075
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.51%
Epoch: [82]  [   0/1251]  eta: 1:08:47  lr: 0.003535  min_lr: 0.003535  loss: 3.7241 (3.7241)  weight_decay: 0.0500 (0.0500)  time: 3.2997  data: 3.0025  max mem: 21847
Epoch: [82]  [ 200/1251]  eta: 0:05:08  lr: 0.003533  min_lr: 0.003533  loss: 2.8061 (3.3589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6601 (0.9664)  time: 0.2865  data: 0.0004  max mem: 21847
Epoch: [82]  [ 400/1251]  eta: 0:04:00  lr: 0.003531  min_lr: 0.003531  loss: 3.0628 (3.3139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6780 (0.8616)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [82]  [ 600/1251]  eta: 0:03:02  lr: 0.003528  min_lr: 0.003528  loss: 2.9924 (3.3053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.8789)  time: 0.2808  data: 0.0004  max mem: 21847
Epoch: [82]  [ 800/1251]  eta: 0:02:05  lr: 0.003526  min_lr: 0.003526  loss: 3.1191 (3.3078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7933 (0.8664)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [82]  [1000/1251]  eta: 0:01:09  lr: 0.003524  min_lr: 0.003524  loss: 3.5275 (3.3074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (0.8847)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [82]  [1200/1251]  eta: 0:00:14  lr: 0.003521  min_lr: 0.003521  loss: 2.7786 (3.3087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (0.8658)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 3.5992 (3.3087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8471 (0.8726)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [82] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 3.5992 (3.3139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8471 (0.8726)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.7197 (0.7197)  acc1: 86.4000 (86.4000)  acc5: 97.2000 (97.2000)  time: 4.5161  data: 4.3493  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9572 (0.9230)  acc1: 79.6000 (80.3636)  acc5: 96.0000 (96.1455)  time: 0.6115  data: 0.4764  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0957 (1.1063)  acc1: 75.2000 (76.3619)  acc5: 92.8000 (93.6000)  time: 0.2107  data: 0.0810  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2409 (1.1148)  acc1: 73.2000 (76.2080)  acc5: 92.0000 (93.5200)  time: 0.1956  data: 0.0682  max mem: 21847
Test: Total time: 0:00:09 (0.3885 s / it)
* Acc@1 76.560 Acc@5 93.628 loss 1.110
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.56%
Epoch: [83]  [   0/1251]  eta: 1:01:30  lr: 0.003521  min_lr: 0.003521  loss: 4.0570 (4.0570)  weight_decay: 0.0500 (0.0500)  time: 2.9499  data: 2.5757  max mem: 21847
Epoch: [83]  [ 200/1251]  eta: 0:05:01  lr: 0.003519  min_lr: 0.003519  loss: 2.8780 (3.2840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7738 (0.7639)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [83]  [ 400/1251]  eta: 0:03:59  lr: 0.003516  min_lr: 0.003516  loss: 2.8884 (3.2907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6464 (0.7917)  time: 0.2816  data: 0.0004  max mem: 21847
Epoch: [83]  [ 600/1251]  eta: 0:03:01  lr: 0.003514  min_lr: 0.003514  loss: 3.2931 (3.3113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8430 (0.7996)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [83]  [ 800/1251]  eta: 0:02:05  lr: 0.003512  min_lr: 0.003512  loss: 3.6912 (3.3316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.8194)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [83]  [1000/1251]  eta: 0:01:09  lr: 0.003509  min_lr: 0.003509  loss: 3.3894 (3.3240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (0.8216)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [83]  [1200/1251]  eta: 0:00:14  lr: 0.003507  min_lr: 0.003507  loss: 3.3085 (3.3290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7922 (0.8203)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 2.6815 (3.3222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.8195)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [83] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 2.6815 (3.3106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.8195)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7460 (0.7460)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.6253  data: 5.4701  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8588 (0.8734)  acc1: 81.6000 (81.0545)  acc5: 96.0000 (96.1818)  time: 0.7786  data: 0.6432  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0777 (1.0520)  acc1: 74.0000 (77.3905)  acc5: 93.2000 (93.6952)  time: 0.2176  data: 0.0860  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1279 (1.0583)  acc1: 74.8000 (77.0720)  acc5: 92.4000 (93.6480)  time: 0.2170  data: 0.0859  max mem: 21847
Test: Total time: 0:00:10 (0.4240 s / it)
* Acc@1 76.934 Acc@5 93.732 loss 1.049
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.93%
Epoch: [84]  [   0/1251]  eta: 1:05:35  lr: 0.003506  min_lr: 0.003506  loss: 3.5461 (3.5461)  weight_decay: 0.0500 (0.0500)  time: 3.1462  data: 2.8320  max mem: 21847
Epoch: [84]  [ 200/1251]  eta: 0:05:03  lr: 0.003504  min_lr: 0.003504  loss: 3.3361 (3.3192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.7300)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [84]  [ 400/1251]  eta: 0:03:59  lr: 0.003502  min_lr: 0.003502  loss: 3.1082 (3.2878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6673 (0.7862)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [84]  [ 600/1251]  eta: 0:03:01  lr: 0.003499  min_lr: 0.003499  loss: 3.7327 (3.2896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6281 (0.8284)  time: 0.2839  data: 0.0004  max mem: 21847
Epoch: [84]  [ 800/1251]  eta: 0:02:05  lr: 0.003497  min_lr: 0.003497  loss: 3.4863 (3.2881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8444 (0.8200)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [84]  [1000/1251]  eta: 0:01:09  lr: 0.003494  min_lr: 0.003494  loss: 3.5341 (3.2959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5770 (0.8172)  time: 0.2807  data: 0.0004  max mem: 21847
Epoch: [84]  [1200/1251]  eta: 0:00:14  lr: 0.003492  min_lr: 0.003492  loss: 3.1089 (3.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8244 (0.8256)  time: 0.2718  data: 0.0003  max mem: 21847
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 3.4935 (3.2941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.8215)  time: 0.2276  data: 0.0005  max mem: 21847
Epoch: [84] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 3.4935 (3.3050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.8215)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7855 (0.7855)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.6289  data: 5.4801  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9901 (0.9613)  acc1: 81.6000 (81.3818)  acc5: 96.4000 (96.2182)  time: 0.7580  data: 0.6238  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1266 (1.1494)  acc1: 76.0000 (76.7810)  acc5: 93.2000 (93.6571)  time: 0.2150  data: 0.0849  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2893 (1.1604)  acc1: 73.2000 (76.3200)  acc5: 92.8000 (93.6160)  time: 0.2130  data: 0.0848  max mem: 21847
Test: Total time: 0:00:10 (0.4208 s / it)
* Acc@1 76.574 Acc@5 93.714 loss 1.148
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.93%
Epoch: [85]  [   0/1251]  eta: 1:06:18  lr: 0.003491  min_lr: 0.003491  loss: 4.4463 (4.4463)  weight_decay: 0.0500 (0.0500)  time: 3.1805  data: 2.5247  max mem: 21847
Epoch: [85]  [ 200/1251]  eta: 0:05:08  lr: 0.003489  min_lr: 0.003489  loss: 3.7199 (3.3095)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [85]  [ 400/1251]  eta: 0:04:02  lr: 0.003487  min_lr: 0.003487  loss: 2.9519 (3.2880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7592 (nan)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [85]  [ 600/1251]  eta: 0:03:02  lr: 0.003484  min_lr: 0.003484  loss: 3.6982 (3.2884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (nan)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [85]  [ 800/1251]  eta: 0:02:06  lr: 0.003482  min_lr: 0.003482  loss: 3.0860 (3.2894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7172 (nan)  time: 0.2803  data: 0.0005  max mem: 21847
Epoch: [85]  [1000/1251]  eta: 0:01:09  lr: 0.003479  min_lr: 0.003479  loss: 2.8138 (3.2827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (nan)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [85]  [1200/1251]  eta: 0:00:14  lr: 0.003477  min_lr: 0.003477  loss: 3.3510 (3.2823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8425 (nan)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 3.3232 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9700 (nan)  time: 0.2346  data: 0.0007  max mem: 21847
Epoch: [85] Total time: 0:05:47 (0.2781 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 3.3232 (3.3040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9700 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6792 (0.6792)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.6086  data: 5.4573  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9170 (0.8882)  acc1: 80.0000 (80.0727)  acc5: 96.4000 (96.0000)  time: 0.6982  data: 0.5622  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0770 (1.0652)  acc1: 73.6000 (76.6667)  acc5: 93.6000 (93.6000)  time: 0.1910  data: 0.0600  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1450 (1.0715)  acc1: 73.6000 (76.5280)  acc5: 91.6000 (93.4720)  time: 0.2122  data: 0.0820  max mem: 21847
Test: Total time: 0:00:10 (0.4199 s / it)
* Acc@1 76.740 Acc@5 93.752 loss 1.058
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.93%
Epoch: [86]  [   0/1251]  eta: 1:08:09  lr: 0.003476  min_lr: 0.003476  loss: 2.6113 (2.6113)  weight_decay: 0.0500 (0.0500)  time: 3.2691  data: 2.7553  max mem: 21847
Epoch: [86]  [ 200/1251]  eta: 0:05:02  lr: 0.003474  min_lr: 0.003474  loss: 3.5650 (3.3098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8811 (0.8274)  time: 0.2723  data: 0.0003  max mem: 21847
Epoch: [86]  [ 400/1251]  eta: 0:03:59  lr: 0.003472  min_lr: 0.003472  loss: 3.3133 (3.3095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9035 (0.8451)  time: 0.2792  data: 0.0003  max mem: 21847
Epoch: [86]  [ 600/1251]  eta: 0:03:01  lr: 0.003469  min_lr: 0.003469  loss: 3.6285 (3.3177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6730 (0.7946)  time: 0.2748  data: 0.0005  max mem: 21847
Epoch: [86]  [ 800/1251]  eta: 0:02:05  lr: 0.003467  min_lr: 0.003467  loss: 3.1990 (3.2942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9408 (0.8411)  time: 0.2715  data: 0.0005  max mem: 21847
Epoch: [86]  [1000/1251]  eta: 0:01:09  lr: 0.003464  min_lr: 0.003464  loss: 3.4720 (3.2905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5552 (0.8218)  time: 0.2771  data: 0.0005  max mem: 21847
Epoch: [86]  [1200/1251]  eta: 0:00:14  lr: 0.003462  min_lr: 0.003462  loss: 3.6155 (3.2932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7530 (0.8209)  time: 0.2703  data: 0.0004  max mem: 21847
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 3.6036 (3.2936)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0359 (0.8266)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [86] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 3.6036 (3.3009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0359 (0.8266)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7139 (0.7139)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.5972  data: 5.4441  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9422 (0.9378)  acc1: 81.2000 (80.6182)  acc5: 96.8000 (96.2909)  time: 0.6994  data: 0.5652  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1423 (1.1147)  acc1: 75.2000 (76.7619)  acc5: 93.6000 (93.9619)  time: 0.1880  data: 0.0581  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2656 (1.1276)  acc1: 72.8000 (76.2720)  acc5: 92.0000 (93.8720)  time: 0.1864  data: 0.0580  max mem: 21847
Test: Total time: 0:00:09 (0.3986 s / it)
* Acc@1 76.744 Acc@5 93.892 loss 1.112
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.93%
Epoch: [87]  [   0/1251]  eta: 1:10:14  lr: 0.003461  min_lr: 0.003461  loss: 3.3391 (3.3391)  weight_decay: 0.0500 (0.0500)  time: 3.3688  data: 2.9932  max mem: 21847
Epoch: [87]  [ 200/1251]  eta: 0:05:04  lr: 0.003459  min_lr: 0.003459  loss: 3.2189 (3.2523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (0.8001)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [87]  [ 400/1251]  eta: 0:04:00  lr: 0.003456  min_lr: 0.003456  loss: 3.5736 (3.2370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7085 (0.7655)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [87]  [ 600/1251]  eta: 0:03:01  lr: 0.003454  min_lr: 0.003454  loss: 3.6382 (3.2399)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [87]  [ 800/1251]  eta: 0:02:05  lr: 0.003451  min_lr: 0.003451  loss: 3.4441 (3.2671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8029 (nan)  time: 0.2845  data: 0.0004  max mem: 21847
Epoch: [87]  [1000/1251]  eta: 0:01:09  lr: 0.003449  min_lr: 0.003449  loss: 3.3121 (3.2636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8162 (nan)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [87]  [1200/1251]  eta: 0:00:14  lr: 0.003446  min_lr: 0.003446  loss: 3.5528 (3.2679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6410 (nan)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 3.4130 (3.2700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6870 (nan)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [87] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 3.4130 (3.2864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6870 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6372 (0.6372)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.5756  data: 5.4073  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8498 (0.8932)  acc1: 80.8000 (80.7273)  acc5: 96.4000 (96.5091)  time: 0.7456  data: 0.6115  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1354 (1.0814)  acc1: 75.2000 (76.7619)  acc5: 94.4000 (93.9429)  time: 0.2181  data: 0.0882  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2016 (1.0882)  acc1: 74.4000 (76.5280)  acc5: 91.6000 (93.7760)  time: 0.2172  data: 0.0881  max mem: 21847
Test: Total time: 0:00:10 (0.4211 s / it)
* Acc@1 76.820 Acc@5 93.784 loss 1.088
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.93%
Epoch: [88]  [   0/1251]  eta: 1:11:52  lr: 0.003446  min_lr: 0.003446  loss: 2.9553 (2.9553)  weight_decay: 0.0500 (0.0500)  time: 3.4473  data: 2.7083  max mem: 21847
Epoch: [88]  [ 200/1251]  eta: 0:05:04  lr: 0.003443  min_lr: 0.003443  loss: 3.5812 (3.2609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8578 (0.8186)  time: 0.2723  data: 0.0003  max mem: 21847
Epoch: [88]  [ 400/1251]  eta: 0:04:01  lr: 0.003441  min_lr: 0.003441  loss: 3.4856 (3.2411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7399 (0.7790)  time: 0.2831  data: 0.0004  max mem: 21847
Epoch: [88]  [ 600/1251]  eta: 0:03:02  lr: 0.003438  min_lr: 0.003438  loss: 2.7367 (3.2325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6692 (0.7942)  time: 0.2711  data: 0.0003  max mem: 21847
Epoch: [88]  [ 800/1251]  eta: 0:02:05  lr: 0.003436  min_lr: 0.003436  loss: 3.0090 (3.2328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7866 (0.8163)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [88]  [1000/1251]  eta: 0:01:09  lr: 0.003433  min_lr: 0.003433  loss: 3.6104 (3.2510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.8203)  time: 0.2930  data: 0.0003  max mem: 21847
Epoch: [88]  [1200/1251]  eta: 0:00:14  lr: 0.003431  min_lr: 0.003431  loss: 3.5449 (3.2600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.8057)  time: 0.2801  data: 0.0004  max mem: 21847
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 3.0848 (3.2564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.8084)  time: 0.2359  data: 0.0005  max mem: 21847
Epoch: [88] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 3.0848 (3.2839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.8084)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6727 (0.6727)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.2337  data: 5.0479  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8139 (0.8537)  acc1: 82.0000 (80.5818)  acc5: 95.2000 (95.7818)  time: 0.7071  data: 0.5620  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0919 (1.0307)  acc1: 75.2000 (76.9905)  acc5: 93.6000 (93.9238)  time: 0.2063  data: 0.0721  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1686 (1.0420)  acc1: 73.2000 (76.6240)  acc5: 93.2000 (93.8240)  time: 0.2137  data: 0.0834  max mem: 21847
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 76.986 Acc@5 93.988 loss 1.030
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 76.99%
Epoch: [89]  [   0/1251]  eta: 1:05:31  lr: 0.003430  min_lr: 0.003430  loss: 2.3115 (2.3115)  weight_decay: 0.0500 (0.0500)  time: 3.1429  data: 2.8402  max mem: 21847
Epoch: [89]  [ 200/1251]  eta: 0:05:01  lr: 0.003428  min_lr: 0.003428  loss: 2.9398 (3.2479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6435 (0.7036)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [89]  [ 400/1251]  eta: 0:03:58  lr: 0.003425  min_lr: 0.003425  loss: 3.1193 (3.2125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9852 (0.7996)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [89]  [ 600/1251]  eta: 0:03:01  lr: 0.003423  min_lr: 0.003423  loss: 2.9777 (3.2451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7509 (0.8169)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [89]  [ 800/1251]  eta: 0:02:05  lr: 0.003420  min_lr: 0.003420  loss: 3.1627 (3.2672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7146 (0.8122)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [89]  [1000/1251]  eta: 0:01:09  lr: 0.003418  min_lr: 0.003418  loss: 3.4834 (3.2783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7314 (0.8043)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [89]  [1200/1251]  eta: 0:00:14  lr: 0.003415  min_lr: 0.003415  loss: 3.5985 (3.2852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7197 (0.8434)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.6428 (3.2886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.8340)  time: 0.2306  data: 0.0006  max mem: 21847
Epoch: [89] Total time: 0:05:45 (0.2761 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.6428 (3.2929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.8340)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7673 (0.7673)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.8296  data: 5.6878  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8885 (0.9157)  acc1: 80.0000 (80.2182)  acc5: 96.8000 (96.4364)  time: 0.7387  data: 0.6073  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1459 (1.0974)  acc1: 74.4000 (76.5905)  acc5: 92.8000 (93.9810)  time: 0.1943  data: 0.0653  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1627 (1.1025)  acc1: 74.4000 (76.3840)  acc5: 92.4000 (93.9360)  time: 0.1928  data: 0.0652  max mem: 21847
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 76.822 Acc@5 93.684 loss 1.093
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.99%
Epoch: [90]  [   0/1251]  eta: 1:12:01  lr: 0.003414  min_lr: 0.003414  loss: 3.0989 (3.0989)  weight_decay: 0.0500 (0.0500)  time: 3.4547  data: 3.0732  max mem: 21847
Epoch: [90]  [ 200/1251]  eta: 0:05:04  lr: 0.003412  min_lr: 0.003412  loss: 3.0597 (3.4078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6379 (0.6591)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [90]  [ 400/1251]  eta: 0:04:00  lr: 0.003409  min_lr: 0.003409  loss: 3.6068 (3.3421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6976 (0.7102)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [90]  [ 600/1251]  eta: 0:03:01  lr: 0.003407  min_lr: 0.003407  loss: 3.6719 (3.3202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6412 (0.7547)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [90]  [ 800/1251]  eta: 0:02:05  lr: 0.003404  min_lr: 0.003404  loss: 3.5264 (3.2936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8481 (0.7523)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [90]  [1000/1251]  eta: 0:01:09  lr: 0.003402  min_lr: 0.003402  loss: 3.3712 (3.2962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6621 (0.7623)  time: 0.2848  data: 0.0004  max mem: 21847
Epoch: [90]  [1200/1251]  eta: 0:00:14  lr: 0.003399  min_lr: 0.003399  loss: 3.7957 (3.3076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6843 (0.7591)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 3.0941 (3.3054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6843 (0.7604)  time: 0.2284  data: 0.0007  max mem: 21847
Epoch: [90] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 3.0941 (3.2952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6843 (0.7604)
Test:  [ 0/25]  eta: 0:01:40  loss: 0.6183 (0.6183)  acc1: 88.8000 (88.8000)  acc5: 97.6000 (97.6000)  time: 4.0071  data: 3.8394  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8638 (0.8546)  acc1: 81.2000 (81.2727)  acc5: 95.2000 (95.9273)  time: 0.6062  data: 0.4730  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0305 (1.0334)  acc1: 75.6000 (77.2571)  acc5: 94.4000 (93.9429)  time: 0.2435  data: 0.1141  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1159 (1.0458)  acc1: 74.8000 (77.0560)  acc5: 92.0000 (93.7760)  time: 0.2119  data: 0.0837  max mem: 21847
Test: Total time: 0:00:09 (0.3964 s / it)
* Acc@1 77.170 Acc@5 93.956 loss 1.033
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.17%
Epoch: [91]  [   0/1251]  eta: 1:08:34  lr: 0.003398  min_lr: 0.003398  loss: 2.2666 (2.2666)  weight_decay: 0.0500 (0.0500)  time: 3.2892  data: 2.9908  max mem: 21847
Epoch: [91]  [ 200/1251]  eta: 0:05:06  lr: 0.003396  min_lr: 0.003396  loss: 3.4828 (3.2455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.8016)  time: 0.2904  data: 0.0004  max mem: 21847
Epoch: [91]  [ 400/1251]  eta: 0:04:00  lr: 0.003393  min_lr: 0.003393  loss: 3.3851 (3.2677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (0.8016)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [91]  [ 600/1251]  eta: 0:03:02  lr: 0.003391  min_lr: 0.003391  loss: 2.7353 (3.2727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6508 (0.7659)  time: 0.2833  data: 0.0005  max mem: 21847
Epoch: [91]  [ 800/1251]  eta: 0:02:05  lr: 0.003388  min_lr: 0.003388  loss: 3.2149 (3.2705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7539 (0.7803)  time: 0.2743  data: 0.0005  max mem: 21847
Epoch: [91]  [1000/1251]  eta: 0:01:09  lr: 0.003385  min_lr: 0.003385  loss: 2.7734 (3.2506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7631 (0.7824)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [91]  [1200/1251]  eta: 0:00:14  lr: 0.003383  min_lr: 0.003383  loss: 3.0058 (3.2566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7295 (0.7718)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 3.2389 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7771)  time: 0.2288  data: 0.0006  max mem: 21847
Epoch: [91] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 3.2389 (3.2751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7771)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7311 (0.7311)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.4531  data: 5.2836  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9687 (0.9425)  acc1: 79.6000 (80.6909)  acc5: 96.4000 (96.4000)  time: 0.7346  data: 0.5995  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1095 (1.0992)  acc1: 74.0000 (77.1429)  acc5: 94.0000 (94.1143)  time: 0.2004  data: 0.0708  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1479 (1.1146)  acc1: 74.0000 (76.5280)  acc5: 92.0000 (93.9840)  time: 0.1998  data: 0.0708  max mem: 21847
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 76.734 Acc@5 93.886 loss 1.109
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 77.17%
Epoch: [92]  [   0/1251]  eta: 1:08:00  lr: 0.003382  min_lr: 0.003382  loss: 3.0772 (3.0772)  weight_decay: 0.0500 (0.0500)  time: 3.2618  data: 2.5824  max mem: 21847
Epoch: [92]  [ 200/1251]  eta: 0:05:05  lr: 0.003380  min_lr: 0.003380  loss: 3.4591 (3.2596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0094 (0.8875)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [92]  [ 400/1251]  eta: 0:04:00  lr: 0.003377  min_lr: 0.003377  loss: 2.7797 (3.2273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6555 (0.8407)  time: 0.2734  data: 0.0003  max mem: 21847
Epoch: [92]  [ 600/1251]  eta: 0:03:02  lr: 0.003374  min_lr: 0.003374  loss: 3.2198 (3.2231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.8246)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [92]  [ 800/1251]  eta: 0:02:05  lr: 0.003372  min_lr: 0.003372  loss: 3.1276 (3.2289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.8021)  time: 0.2716  data: 0.0006  max mem: 21847
Epoch: [92]  [1000/1251]  eta: 0:01:09  lr: 0.003369  min_lr: 0.003369  loss: 3.0882 (3.2287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6002 (0.7939)  time: 0.2765  data: 0.0005  max mem: 21847
Epoch: [92]  [1200/1251]  eta: 0:00:14  lr: 0.003367  min_lr: 0.003367  loss: 3.3750 (3.2434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.7891)  time: 0.2761  data: 0.0004  max mem: 21847
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 3.5379 (3.2519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.7835)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [92] Total time: 0:05:47 (0.2780 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 3.5379 (3.2739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.7835)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7993 (0.7993)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 5.5153  data: 5.3676  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9480 (0.9754)  acc1: 80.0000 (80.2909)  acc5: 96.8000 (96.1818)  time: 0.7014  data: 0.5682  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1201 (1.1549)  acc1: 74.4000 (76.8571)  acc5: 92.8000 (93.7714)  time: 0.1851  data: 0.0548  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2939 (1.1675)  acc1: 74.4000 (76.6400)  acc5: 92.8000 (93.6640)  time: 0.1909  data: 0.0618  max mem: 21847
Test: Total time: 0:00:09 (0.3985 s / it)
* Acc@1 76.954 Acc@5 93.922 loss 1.153
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.17%
Epoch: [93]  [   0/1251]  eta: 1:10:01  lr: 0.003366  min_lr: 0.003366  loss: 3.5715 (3.5715)  weight_decay: 0.0500 (0.0500)  time: 3.3585  data: 2.9605  max mem: 21847
Epoch: [93]  [ 200/1251]  eta: 0:05:04  lr: 0.003363  min_lr: 0.003363  loss: 2.8987 (3.2375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6454 (0.7501)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [93]  [ 400/1251]  eta: 0:04:01  lr: 0.003361  min_lr: 0.003361  loss: 3.4813 (3.2745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6858 (0.7978)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [93]  [ 600/1251]  eta: 0:03:02  lr: 0.003358  min_lr: 0.003358  loss: 3.3693 (3.2614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.7719)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [93]  [ 800/1251]  eta: 0:02:05  lr: 0.003355  min_lr: 0.003355  loss: 3.6945 (3.2664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8237 (0.8027)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [93]  [1000/1251]  eta: 0:01:09  lr: 0.003353  min_lr: 0.003353  loss: 3.7012 (3.2649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5871 (0.8206)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [93]  [1200/1251]  eta: 0:00:14  lr: 0.003350  min_lr: 0.003350  loss: 2.8067 (3.2781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6865 (0.8092)  time: 0.2781  data: 0.0004  max mem: 21847
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 2.5770 (3.2761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6812 (0.8040)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [93] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 2.5770 (3.2705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6812 (0.8040)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6325 (0.6325)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.3549  data: 5.2022  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8186 (0.8267)  acc1: 81.6000 (81.5636)  acc5: 97.2000 (96.6545)  time: 0.7335  data: 0.5989  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0649 (1.0093)  acc1: 76.0000 (77.6191)  acc5: 94.0000 (94.1333)  time: 0.2132  data: 0.0822  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1439 (1.0227)  acc1: 74.4000 (77.2640)  acc5: 92.4000 (93.9840)  time: 0.2114  data: 0.0813  max mem: 21847
Test: Total time: 0:00:10 (0.4084 s / it)
* Acc@1 77.290 Acc@5 93.992 loss 1.017
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [94]  [   0/1251]  eta: 0:57:51  lr: 0.003350  min_lr: 0.003350  loss: 2.2622 (2.2622)  weight_decay: 0.0500 (0.0500)  time: 2.7750  data: 2.4224  max mem: 21847
Epoch: [94]  [ 200/1251]  eta: 0:05:01  lr: 0.003347  min_lr: 0.003347  loss: 2.5851 (3.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7253 (0.7431)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [94]  [ 400/1251]  eta: 0:03:58  lr: 0.003344  min_lr: 0.003344  loss: 3.0758 (3.2568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7080 (0.7966)  time: 0.2804  data: 0.0004  max mem: 21847
Epoch: [94]  [ 600/1251]  eta: 0:03:01  lr: 0.003342  min_lr: 0.003342  loss: 3.3950 (3.2666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7121 (0.7805)  time: 0.2703  data: 0.0004  max mem: 21847
Epoch: [94]  [ 800/1251]  eta: 0:02:05  lr: 0.003339  min_lr: 0.003339  loss: 3.1632 (3.2583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7733 (0.7952)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [94]  [1000/1251]  eta: 0:01:09  lr: 0.003336  min_lr: 0.003336  loss: 3.3533 (3.2661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.7906)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [94]  [1200/1251]  eta: 0:00:14  lr: 0.003334  min_lr: 0.003334  loss: 3.6250 (3.2767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9660 (nan)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 3.2018 (3.2798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (nan)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [94] Total time: 0:05:45 (0.2758 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 3.2018 (3.2777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (nan)
Test:  [ 0/25]  eta: 0:01:59  loss: 0.7407 (0.7407)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 4.7926  data: 4.6055  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8406 (0.9041)  acc1: 81.6000 (80.3273)  acc5: 96.4000 (96.5818)  time: 0.6395  data: 0.5061  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0780 (1.0744)  acc1: 75.2000 (76.4000)  acc5: 94.4000 (94.5143)  time: 0.2220  data: 0.0940  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1554 (1.0805)  acc1: 74.4000 (76.2080)  acc5: 93.6000 (94.4000)  time: 0.2066  data: 0.0780  max mem: 21847
Test: Total time: 0:00:10 (0.4098 s / it)
* Acc@1 77.016 Acc@5 94.180 loss 1.068
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.29%
Epoch: [95]  [   0/1251]  eta: 1:10:10  lr: 0.003333  min_lr: 0.003333  loss: 2.1366 (2.1366)  weight_decay: 0.0500 (0.0500)  time: 3.3658  data: 2.1418  max mem: 21847
Epoch: [95]  [ 200/1251]  eta: 0:05:07  lr: 0.003330  min_lr: 0.003330  loss: 3.4704 (3.2985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (0.8849)  time: 0.2767  data: 0.0005  max mem: 21847
Epoch: [95]  [ 400/1251]  eta: 0:04:02  lr: 0.003327  min_lr: 0.003327  loss: 3.6291 (3.2927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.8577)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [95]  [ 600/1251]  eta: 0:03:02  lr: 0.003325  min_lr: 0.003325  loss: 3.5413 (3.2642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6286 (0.8187)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [95]  [ 800/1251]  eta: 0:02:06  lr: 0.003322  min_lr: 0.003322  loss: 3.4172 (3.2661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8157 (0.8270)  time: 0.2920  data: 0.0004  max mem: 21847
Epoch: [95]  [1000/1251]  eta: 0:01:09  lr: 0.003319  min_lr: 0.003319  loss: 3.6676 (3.2749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7880 (0.8202)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [95]  [1200/1251]  eta: 0:00:14  lr: 0.003317  min_lr: 0.003317  loss: 3.5674 (3.2679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8165 (0.8179)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 2.8692 (3.2688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6655 (0.8110)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [95] Total time: 0:05:47 (0.2779 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 2.8692 (3.2652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6655 (0.8110)
Test:  [ 0/25]  eta: 0:01:33  loss: 0.6379 (0.6379)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 3.7326  data: 3.5518  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8195 (0.8550)  acc1: 80.4000 (79.7455)  acc5: 96.0000 (96.1091)  time: 0.6576  data: 0.5174  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0422 (1.0372)  acc1: 74.0000 (76.1905)  acc5: 93.6000 (93.8857)  time: 0.2594  data: 0.1250  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1822 (1.0469)  acc1: 73.6000 (75.8880)  acc5: 92.4000 (93.7280)  time: 0.2016  data: 0.0707  max mem: 21847
Test: Total time: 0:00:09 (0.3943 s / it)
* Acc@1 77.226 Acc@5 93.970 loss 1.024
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.29%
Epoch: [96]  [   0/1251]  eta: 1:08:12  lr: 0.003316  min_lr: 0.003316  loss: 3.2925 (3.2925)  weight_decay: 0.0500 (0.0500)  time: 3.2714  data: 2.5860  max mem: 21847
Epoch: [96]  [ 200/1251]  eta: 0:05:04  lr: 0.003313  min_lr: 0.003313  loss: 2.8797 (3.2598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7242 (0.7181)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [96]  [ 400/1251]  eta: 0:04:00  lr: 0.003311  min_lr: 0.003311  loss: 3.1059 (3.2704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7466 (0.7740)  time: 0.2755  data: 0.0004  max mem: 21847
Epoch: [96]  [ 600/1251]  eta: 0:03:02  lr: 0.003308  min_lr: 0.003308  loss: 3.5105 (3.3050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9024 (0.8050)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [96]  [ 800/1251]  eta: 0:02:05  lr: 0.003305  min_lr: 0.003305  loss: 3.3808 (3.3043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8728 (0.8228)  time: 0.2835  data: 0.0005  max mem: 21847
Epoch: [96]  [1000/1251]  eta: 0:01:09  lr: 0.003302  min_lr: 0.003302  loss: 3.2859 (3.3100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7688 (0.8096)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [96]  [1200/1251]  eta: 0:00:14  lr: 0.003300  min_lr: 0.003300  loss: 3.2427 (3.3005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7853 (0.8108)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 3.6762 (3.2956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8142)  time: 0.2283  data: 0.0007  max mem: 21847
Epoch: [96] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 3.6762 (3.2665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8142)
Test:  [ 0/25]  eta: 0:01:38  loss: 0.7177 (0.7177)  acc1: 87.6000 (87.6000)  acc5: 96.8000 (96.8000)  time: 3.9596  data: 3.8079  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9535 (0.9416)  acc1: 81.2000 (81.6364)  acc5: 96.4000 (96.2182)  time: 0.6195  data: 0.4884  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1481 (1.1084)  acc1: 76.0000 (77.5048)  acc5: 93.2000 (94.0381)  time: 0.2519  data: 0.1216  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1900 (1.1152)  acc1: 75.2000 (77.1840)  acc5: 92.8000 (94.0800)  time: 0.1956  data: 0.0659  max mem: 21847
Test: Total time: 0:00:09 (0.3959 s / it)
* Acc@1 77.254 Acc@5 94.076 loss 1.105
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [97]  [   0/1251]  eta: 1:07:41  lr: 0.003299  min_lr: 0.003299  loss: 4.0010 (4.0010)  weight_decay: 0.0500 (0.0500)  time: 3.2465  data: 2.6022  max mem: 21847
Epoch: [97]  [ 200/1251]  eta: 0:05:02  lr: 0.003296  min_lr: 0.003296  loss: 3.3156 (3.3053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7561 (0.7952)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [97]  [ 400/1251]  eta: 0:03:59  lr: 0.003294  min_lr: 0.003294  loss: 3.6860 (3.3221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6212 (0.8277)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [97]  [ 600/1251]  eta: 0:03:01  lr: 0.003291  min_lr: 0.003291  loss: 3.5994 (3.3251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7067 (0.8394)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [97]  [ 800/1251]  eta: 0:02:05  lr: 0.003288  min_lr: 0.003288  loss: 3.4596 (3.3194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (0.8494)  time: 0.2742  data: 0.0005  max mem: 21847
Epoch: [97]  [1000/1251]  eta: 0:01:09  lr: 0.003285  min_lr: 0.003285  loss: 3.4781 (3.3077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6896 (0.8319)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [97]  [1200/1251]  eta: 0:00:14  lr: 0.003283  min_lr: 0.003283  loss: 3.6034 (3.2989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8271 (0.8328)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.5950 (3.2980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6367 (0.8271)  time: 0.2283  data: 0.0005  max mem: 21847
Epoch: [97] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.5950 (3.2598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6367 (0.8271)
Test:  [ 0/25]  eta: 0:02:01  loss: 0.7175 (0.7175)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 4.8540  data: 4.6967  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9306 (0.9503)  acc1: 80.8000 (80.6545)  acc5: 96.4000 (96.3636)  time: 0.6405  data: 0.5070  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1282 (1.1271)  acc1: 75.6000 (76.7238)  acc5: 94.4000 (93.9429)  time: 0.2087  data: 0.0794  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2013 (1.1359)  acc1: 74.0000 (76.3040)  acc5: 92.4000 (93.8400)  time: 0.2153  data: 0.0871  max mem: 21847
Test: Total time: 0:00:09 (0.3974 s / it)
* Acc@1 77.170 Acc@5 94.028 loss 1.119
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.29%
Epoch: [98]  [   0/1251]  eta: 1:04:04  lr: 0.003282  min_lr: 0.003282  loss: 2.7475 (2.7475)  weight_decay: 0.0500 (0.0500)  time: 3.0730  data: 1.7290  max mem: 21847
Epoch: [98]  [ 200/1251]  eta: 0:05:08  lr: 0.003279  min_lr: 0.003279  loss: 3.5257 (3.1963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8727 (0.8495)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [98]  [ 400/1251]  eta: 0:04:00  lr: 0.003276  min_lr: 0.003276  loss: 3.1848 (3.2437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.7977)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [98]  [ 600/1251]  eta: 0:03:02  lr: 0.003274  min_lr: 0.003274  loss: 3.3361 (3.2493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7487 (0.8116)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [98]  [ 800/1251]  eta: 0:02:05  lr: 0.003271  min_lr: 0.003271  loss: 3.4960 (3.2533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7294 (0.8044)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [98]  [1000/1251]  eta: 0:01:09  lr: 0.003268  min_lr: 0.003268  loss: 2.6255 (3.2498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.7916)  time: 0.2804  data: 0.0004  max mem: 21847
Epoch: [98]  [1200/1251]  eta: 0:00:14  lr: 0.003265  min_lr: 0.003265  loss: 3.5967 (3.2595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.8108)  time: 0.2722  data: 0.0005  max mem: 21847
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 3.5891 (3.2616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.8128)  time: 0.2283  data: 0.0010  max mem: 21847
Epoch: [98] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 3.5891 (3.2482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.8128)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7488 (0.7488)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.7299  data: 5.5627  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9192 (0.9049)  acc1: 80.0000 (80.5455)  acc5: 96.4000 (96.0364)  time: 0.7328  data: 0.5988  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0553 (1.0733)  acc1: 74.8000 (77.0286)  acc5: 93.2000 (93.9238)  time: 0.1940  data: 0.0649  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1749 (1.0813)  acc1: 74.0000 (76.7360)  acc5: 92.4000 (93.8080)  time: 0.1940  data: 0.0648  max mem: 21847
Test: Total time: 0:00:10 (0.4087 s / it)
* Acc@1 77.270 Acc@5 94.026 loss 1.074
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [99]  [   0/1251]  eta: 1:09:28  lr: 0.003265  min_lr: 0.003265  loss: 4.0376 (4.0376)  weight_decay: 0.0500 (0.0500)  time: 3.3320  data: 2.9323  max mem: 21847
Epoch: [99]  [ 200/1251]  eta: 0:05:05  lr: 0.003262  min_lr: 0.003262  loss: 3.3698 (3.2099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6945 (0.7524)  time: 0.2766  data: 0.0004  max mem: 21847
Epoch: [99]  [ 400/1251]  eta: 0:04:01  lr: 0.003259  min_lr: 0.003259  loss: 3.2229 (3.2410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6249 (0.7643)  time: 0.2827  data: 0.0010  max mem: 21847
Epoch: [99]  [ 600/1251]  eta: 0:03:02  lr: 0.003256  min_lr: 0.003256  loss: 3.5222 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8900 (0.7781)  time: 0.2710  data: 0.0003  max mem: 21847
Epoch: [99]  [ 800/1251]  eta: 0:02:05  lr: 0.003253  min_lr: 0.003253  loss: 3.2024 (3.2595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7894 (0.8031)  time: 0.2733  data: 0.0005  max mem: 21847
Epoch: [99]  [1000/1251]  eta: 0:01:09  lr: 0.003251  min_lr: 0.003251  loss: 3.6980 (3.2549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6267 (0.8037)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [99]  [1200/1251]  eta: 0:00:14  lr: 0.003248  min_lr: 0.003248  loss: 3.2056 (3.2558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6936 (0.8070)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 3.3125 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5829 (0.7987)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [99] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 3.3125 (3.2568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5829 (0.7987)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7252 (0.7252)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.5005  data: 5.3525  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8988 (0.9177)  acc1: 82.0000 (80.8000)  acc5: 96.8000 (96.3273)  time: 0.7043  data: 0.5705  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1010 (1.0736)  acc1: 75.2000 (77.3524)  acc5: 93.6000 (94.1524)  time: 0.1868  data: 0.0569  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1430 (1.0823)  acc1: 74.8000 (76.9600)  acc5: 92.8000 (94.0640)  time: 0.1864  data: 0.0567  max mem: 21847
Test: Total time: 0:00:09 (0.3943 s / it)
* Acc@1 77.274 Acc@5 94.138 loss 1.072
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [100]  [   0/1251]  eta: 1:02:42  lr: 0.003247  min_lr: 0.003247  loss: 2.4348 (2.4348)  weight_decay: 0.0500 (0.0500)  time: 3.0073  data: 1.6846  max mem: 21847
Epoch: [100]  [ 200/1251]  eta: 0:05:04  lr: 0.003244  min_lr: 0.003244  loss: 2.9616 (3.2053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6988 (0.8046)  time: 0.2817  data: 0.0004  max mem: 21847
Epoch: [100]  [ 400/1251]  eta: 0:03:59  lr: 0.003242  min_lr: 0.003242  loss: 3.1814 (3.2218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6788 (0.8091)  time: 0.2713  data: 0.0003  max mem: 21847
Epoch: [100]  [ 600/1251]  eta: 0:03:01  lr: 0.003239  min_lr: 0.003239  loss: 3.5037 (3.2264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8366)  time: 0.2817  data: 0.0004  max mem: 21847
Epoch: [100]  [ 800/1251]  eta: 0:02:05  lr: 0.003236  min_lr: 0.003236  loss: 3.2566 (3.2290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7186 (0.8615)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [100]  [1000/1251]  eta: 0:01:09  lr: 0.003233  min_lr: 0.003233  loss: 3.1107 (3.2496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.8445)  time: 0.2725  data: 0.0003  max mem: 21847
Epoch: [100]  [1200/1251]  eta: 0:00:14  lr: 0.003230  min_lr: 0.003230  loss: 3.1712 (3.2340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8447)  time: 0.2796  data: 0.0005  max mem: 21847
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 3.5887 (3.2384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8302 (0.8516)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [100] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 3.5887 (3.2522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8302 (0.8516)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7801 (0.7801)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.4048  data: 5.2518  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9740 (0.9418)  acc1: 81.2000 (81.1636)  acc5: 96.8000 (96.4364)  time: 0.6566  data: 0.5236  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1440 (1.1109)  acc1: 76.4000 (77.1810)  acc5: 94.0000 (94.2095)  time: 0.1736  data: 0.0442  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1850 (1.1211)  acc1: 74.4000 (76.9280)  acc5: 93.2000 (94.1280)  time: 0.1913  data: 0.0627  max mem: 21847
Test: Total time: 0:00:09 (0.3943 s / it)
* Acc@1 77.290 Acc@5 94.108 loss 1.106
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [101]  [   0/1251]  eta: 1:06:10  lr: 0.003230  min_lr: 0.003230  loss: 2.3902 (2.3902)  weight_decay: 0.0500 (0.0500)  time: 3.1742  data: 2.0908  max mem: 21847
Epoch: [101]  [ 200/1251]  eta: 0:05:04  lr: 0.003227  min_lr: 0.003227  loss: 2.4516 (3.2016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7343 (0.7785)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [101]  [ 400/1251]  eta: 0:04:00  lr: 0.003224  min_lr: 0.003224  loss: 3.4556 (3.2284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7515 (0.7824)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [101]  [ 600/1251]  eta: 0:03:01  lr: 0.003221  min_lr: 0.003221  loss: 3.0158 (3.2195)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0293 (0.8269)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [101]  [ 800/1251]  eta: 0:02:05  lr: 0.003218  min_lr: 0.003218  loss: 3.7561 (3.2275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6388 (0.8055)  time: 0.2796  data: 0.0003  max mem: 21847
Epoch: [101]  [1000/1251]  eta: 0:01:09  lr: 0.003215  min_lr: 0.003215  loss: 2.8336 (3.2324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8104 (0.8089)  time: 0.2815  data: 0.0005  max mem: 21847
Epoch: [101]  [1200/1251]  eta: 0:00:14  lr: 0.003212  min_lr: 0.003212  loss: 3.5459 (3.2373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8493 (0.8201)  time: 0.2740  data: 0.0005  max mem: 21847
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 3.6026 (3.2381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.8159)  time: 0.2283  data: 0.0008  max mem: 21847
Epoch: [101] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 3.6026 (3.2488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.8159)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7006 (0.7006)  acc1: 85.2000 (85.2000)  acc5: 99.2000 (99.2000)  time: 5.7156  data: 5.5716  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8788 (0.8640)  acc1: 82.0000 (81.2727)  acc5: 96.8000 (96.7636)  time: 0.7675  data: 0.6346  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0587 (1.0398)  acc1: 75.2000 (77.2000)  acc5: 94.0000 (94.4571)  time: 0.2204  data: 0.0908  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1673 (1.0487)  acc1: 74.4000 (76.9920)  acc5: 93.2000 (94.2720)  time: 0.2190  data: 0.0907  max mem: 21847
Test: Total time: 0:00:10 (0.4287 s / it)
* Acc@1 77.252 Acc@5 94.168 loss 1.045
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [102]  [   0/1251]  eta: 1:08:32  lr: 0.003212  min_lr: 0.003212  loss: 3.6685 (3.6685)  weight_decay: 0.0500 (0.0500)  time: 3.2871  data: 2.7503  max mem: 21847
Epoch: [102]  [ 200/1251]  eta: 0:05:02  lr: 0.003209  min_lr: 0.003209  loss: 3.7350 (3.2533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7551 (0.7724)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [102]  [ 400/1251]  eta: 0:03:59  lr: 0.003206  min_lr: 0.003206  loss: 3.3453 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.7598)  time: 0.2705  data: 0.0005  max mem: 21847
Epoch: [102]  [ 600/1251]  eta: 0:03:01  lr: 0.003203  min_lr: 0.003203  loss: 3.5234 (3.2534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6486 (0.7804)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [102]  [ 800/1251]  eta: 0:02:05  lr: 0.003200  min_lr: 0.003200  loss: 3.6795 (3.2488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.7790)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [102]  [1000/1251]  eta: 0:01:09  lr: 0.003197  min_lr: 0.003197  loss: 2.8146 (3.2471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9226 (0.7992)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [102]  [1200/1251]  eta: 0:00:14  lr: 0.003195  min_lr: 0.003195  loss: 3.4198 (3.2547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6816 (0.7777)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 3.4371 (3.2592)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2285  data: 0.0008  max mem: 21847
Epoch: [102] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 3.4371 (3.2445)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7953 (0.7953)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.7332  data: 5.5735  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9788 (0.9528)  acc1: 80.4000 (80.6545)  acc5: 96.4000 (96.4364)  time: 0.7693  data: 0.6346  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1157 (1.1185)  acc1: 73.2000 (76.9333)  acc5: 93.6000 (94.2476)  time: 0.2091  data: 0.0792  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2762 (1.1273)  acc1: 72.8000 (76.5760)  acc5: 92.8000 (94.1920)  time: 0.2083  data: 0.0791  max mem: 21847
Test: Total time: 0:00:10 (0.4205 s / it)
* Acc@1 77.366 Acc@5 94.132 loss 1.116
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.37%
Epoch: [103]  [   0/1251]  eta: 1:05:04  lr: 0.003194  min_lr: 0.003194  loss: 2.2212 (2.2212)  weight_decay: 0.0500 (0.0500)  time: 3.1211  data: 2.7916  max mem: 21847
Epoch: [103]  [ 200/1251]  eta: 0:05:03  lr: 0.003191  min_lr: 0.003191  loss: 3.3574 (3.2392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7261 (0.7722)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [103]  [ 400/1251]  eta: 0:03:59  lr: 0.003188  min_lr: 0.003188  loss: 3.4765 (3.2121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7644 (0.8438)  time: 0.2811  data: 0.0004  max mem: 21847
Epoch: [103]  [ 600/1251]  eta: 0:03:01  lr: 0.003185  min_lr: 0.003185  loss: 3.0509 (3.2263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6866 (0.8665)  time: 0.2805  data: 0.0004  max mem: 21847
Epoch: [103]  [ 800/1251]  eta: 0:02:04  lr: 0.003182  min_lr: 0.003182  loss: 3.7613 (3.2389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7315 (0.8298)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [103]  [1000/1251]  eta: 0:01:09  lr: 0.003179  min_lr: 0.003179  loss: 3.0487 (3.2579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6352 (0.8418)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [103]  [1200/1251]  eta: 0:00:14  lr: 0.003176  min_lr: 0.003176  loss: 3.3655 (3.2543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (0.8340)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 2.8445 (3.2515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5779 (0.8290)  time: 0.2279  data: 0.0005  max mem: 21847
Epoch: [103] Total time: 0:05:44 (0.2757 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 2.8445 (3.2447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5779 (0.8290)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6547 (0.6547)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.2270  data: 5.0813  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8890 (0.8625)  acc1: 80.4000 (81.6000)  acc5: 96.8000 (96.5091)  time: 0.7640  data: 0.6193  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0479 (1.0304)  acc1: 76.8000 (77.7333)  acc5: 94.4000 (94.4000)  time: 0.2456  data: 0.1095  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0839 (1.0486)  acc1: 74.0000 (77.0240)  acc5: 92.8000 (94.1280)  time: 0.2418  data: 0.1094  max mem: 21847
Test: Total time: 0:00:10 (0.4294 s / it)
* Acc@1 77.366 Acc@5 94.272 loss 1.031
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.37%
Epoch: [104]  [   0/1251]  eta: 1:09:46  lr: 0.003176  min_lr: 0.003176  loss: 3.8733 (3.8733)  weight_decay: 0.0500 (0.0500)  time: 3.3464  data: 3.0361  max mem: 21847
Epoch: [104]  [ 200/1251]  eta: 0:05:02  lr: 0.003173  min_lr: 0.003173  loss: 3.3877 (3.2657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.7474)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [104]  [ 400/1251]  eta: 0:03:58  lr: 0.003170  min_lr: 0.003170  loss: 3.2577 (3.2537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8057 (0.7890)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [104]  [ 600/1251]  eta: 0:03:00  lr: 0.003167  min_lr: 0.003167  loss: 3.3498 (3.2369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (0.8356)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [104]  [ 800/1251]  eta: 0:02:05  lr: 0.003164  min_lr: 0.003164  loss: 3.2811 (3.2415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7956 (0.8307)  time: 0.2852  data: 0.0004  max mem: 21847
Epoch: [104]  [1000/1251]  eta: 0:01:09  lr: 0.003161  min_lr: 0.003161  loss: 2.9612 (3.2432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8305 (0.8352)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [104]  [1200/1251]  eta: 0:00:14  lr: 0.003158  min_lr: 0.003158  loss: 3.2295 (3.2409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6273 (0.8359)  time: 0.2759  data: 0.0005  max mem: 21847
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 3.3012 (3.2417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5872 (0.8288)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [104] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 3.3012 (3.2356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5872 (0.8288)
Test:  [ 0/25]  eta: 0:01:56  loss: 0.7172 (0.7172)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 4.6753  data: 4.5069  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9479 (0.9321)  acc1: 80.0000 (81.2000)  acc5: 97.2000 (96.5455)  time: 0.6868  data: 0.5527  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1105 (1.0961)  acc1: 75.2000 (77.3333)  acc5: 92.8000 (94.2476)  time: 0.2330  data: 0.1039  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1882 (1.1062)  acc1: 74.8000 (76.9440)  acc5: 92.4000 (94.1440)  time: 0.1896  data: 0.0611  max mem: 21847
Test: Total time: 0:00:10 (0.4049 s / it)
* Acc@1 77.344 Acc@5 94.114 loss 1.091
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.37%
Epoch: [105]  [   0/1251]  eta: 1:11:52  lr: 0.003158  min_lr: 0.003158  loss: 3.6825 (3.6825)  weight_decay: 0.0500 (0.0500)  time: 3.4475  data: 2.9605  max mem: 21847
Epoch: [105]  [ 200/1251]  eta: 0:05:07  lr: 0.003155  min_lr: 0.003155  loss: 3.6786 (3.2500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7837 (0.8243)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [105]  [ 400/1251]  eta: 0:04:01  lr: 0.003152  min_lr: 0.003152  loss: 2.8426 (3.2227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (0.8039)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [105]  [ 600/1251]  eta: 0:03:02  lr: 0.003149  min_lr: 0.003149  loss: 3.3747 (3.2347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7886)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [105]  [ 800/1251]  eta: 0:02:05  lr: 0.003146  min_lr: 0.003146  loss: 3.4206 (3.2384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6677 (0.7807)  time: 0.2779  data: 0.0004  max mem: 21847
Epoch: [105]  [1000/1251]  eta: 0:01:09  lr: 0.003143  min_lr: 0.003143  loss: 3.3919 (3.2482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5767 (0.7671)  time: 0.2836  data: 0.0004  max mem: 21847
Epoch: [105]  [1200/1251]  eta: 0:00:14  lr: 0.003140  min_lr: 0.003140  loss: 3.1233 (3.2296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6063 (0.7770)  time: 0.2836  data: 0.0005  max mem: 21847
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 3.5930 (3.2308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6063 (0.7768)  time: 0.2283  data: 0.0007  max mem: 21847
Epoch: [105] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 3.5930 (3.2346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6063 (0.7768)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6787 (0.6787)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.6851  data: 5.5179  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8331 (0.8483)  acc1: 81.6000 (81.0545)  acc5: 96.4000 (96.4364)  time: 0.7714  data: 0.6344  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9979 (1.0225)  acc1: 75.2000 (77.2952)  acc5: 94.0000 (93.9048)  time: 0.2178  data: 0.0872  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1355 (1.0322)  acc1: 75.2000 (76.9920)  acc5: 92.8000 (94.0000)  time: 0.2168  data: 0.0871  max mem: 21847
Test: Total time: 0:00:10 (0.4252 s / it)
* Acc@1 77.520 Acc@5 94.230 loss 1.021
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.52%
Epoch: [106]  [   0/1251]  eta: 1:02:14  lr: 0.003139  min_lr: 0.003139  loss: 2.3570 (2.3570)  weight_decay: 0.0500 (0.0500)  time: 2.9854  data: 2.6899  max mem: 21847
Epoch: [106]  [ 200/1251]  eta: 0:05:01  lr: 0.003136  min_lr: 0.003136  loss: 3.6044 (3.1606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6693 (0.7287)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [106]  [ 400/1251]  eta: 0:03:59  lr: 0.003133  min_lr: 0.003133  loss: 3.5315 (3.2144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.7443)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [106]  [ 600/1251]  eta: 0:03:01  lr: 0.003130  min_lr: 0.003130  loss: 3.1012 (3.2099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.7339)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [106]  [ 800/1251]  eta: 0:02:05  lr: 0.003127  min_lr: 0.003127  loss: 3.5158 (3.2263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7627 (0.7613)  time: 0.2744  data: 0.0004  max mem: 21847
Epoch: [106]  [1000/1251]  eta: 0:01:09  lr: 0.003124  min_lr: 0.003124  loss: 3.0691 (3.2277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.7579)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [106]  [1200/1251]  eta: 0:00:14  lr: 0.003121  min_lr: 0.003121  loss: 3.3314 (3.2242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9484 (0.7684)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 3.3515 (3.2238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9170 (0.7715)  time: 0.2337  data: 0.0005  max mem: 21847
Epoch: [106] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 3.3515 (3.2241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9170 (0.7715)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7439 (0.7439)  acc1: 85.2000 (85.2000)  acc5: 98.8000 (98.8000)  time: 5.6177  data: 5.4676  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9396 (0.9280)  acc1: 81.6000 (81.3818)  acc5: 96.8000 (96.5818)  time: 0.7492  data: 0.6170  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1393 (1.0933)  acc1: 74.4000 (77.5810)  acc5: 93.6000 (94.3810)  time: 0.2092  data: 0.0803  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1829 (1.1010)  acc1: 73.6000 (77.2800)  acc5: 92.8000 (94.3360)  time: 0.2084  data: 0.0802  max mem: 21847
Test: Total time: 0:00:10 (0.4160 s / it)
* Acc@1 77.724 Acc@5 94.158 loss 1.098
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.72%
Epoch: [107]  [   0/1251]  eta: 1:07:45  lr: 0.003121  min_lr: 0.003121  loss: 3.0779 (3.0779)  weight_decay: 0.0500 (0.0500)  time: 3.2501  data: 2.9415  max mem: 21847
Epoch: [107]  [ 200/1251]  eta: 0:05:02  lr: 0.003118  min_lr: 0.003118  loss: 3.3914 (3.2038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7675 (0.8418)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [107]  [ 400/1251]  eta: 0:03:59  lr: 0.003115  min_lr: 0.003115  loss: 3.6497 (3.2483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8784 (0.8042)  time: 0.2820  data: 0.0003  max mem: 21847
Epoch: [107]  [ 600/1251]  eta: 0:03:01  lr: 0.003112  min_lr: 0.003112  loss: 3.4659 (3.2529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6102 (0.8005)  time: 0.2747  data: 0.0003  max mem: 21847
Epoch: [107]  [ 800/1251]  eta: 0:02:05  lr: 0.003109  min_lr: 0.003109  loss: 3.4802 (3.2572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7712 (0.8174)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [107]  [1000/1251]  eta: 0:01:09  lr: 0.003106  min_lr: 0.003106  loss: 2.9694 (3.2402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.8052)  time: 0.2726  data: 0.0005  max mem: 21847
Epoch: [107]  [1200/1251]  eta: 0:00:14  lr: 0.003103  min_lr: 0.003103  loss: 3.2308 (3.2326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.8052)  time: 0.2847  data: 0.0004  max mem: 21847
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 3.1879 (3.2349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.8053)  time: 0.2337  data: 0.0006  max mem: 21847
Epoch: [107] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 3.1879 (3.2219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.8053)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6578 (0.6578)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.7691  data: 5.6242  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9361 (0.9036)  acc1: 84.0000 (81.8909)  acc5: 96.4000 (96.4727)  time: 0.7755  data: 0.6403  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1011 (1.0624)  acc1: 76.8000 (78.0952)  acc5: 94.4000 (94.3619)  time: 0.2036  data: 0.0728  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1542 (1.0761)  acc1: 75.2000 (77.6800)  acc5: 93.2000 (94.2880)  time: 0.2025  data: 0.0727  max mem: 21847
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 77.682 Acc@5 94.220 loss 1.068
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.72%
Epoch: [108]  [   0/1251]  eta: 1:03:25  lr: 0.003102  min_lr: 0.003102  loss: 2.3121 (2.3121)  weight_decay: 0.0500 (0.0500)  time: 3.0419  data: 2.6286  max mem: 21847
Epoch: [108]  [ 200/1251]  eta: 0:05:05  lr: 0.003099  min_lr: 0.003099  loss: 3.5007 (3.2976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6892 (0.8512)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [108]  [ 400/1251]  eta: 0:04:00  lr: 0.003096  min_lr: 0.003096  loss: 3.6051 (3.2845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6642 (0.7833)  time: 0.2858  data: 0.0004  max mem: 21847
Epoch: [108]  [ 600/1251]  eta: 0:03:02  lr: 0.003093  min_lr: 0.003093  loss: 3.5315 (3.2534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8529 (0.8324)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [108]  [ 800/1251]  eta: 0:02:05  lr: 0.003090  min_lr: 0.003090  loss: 3.6453 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8010 (0.8172)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [108]  [1000/1251]  eta: 0:01:09  lr: 0.003087  min_lr: 0.003087  loss: 3.1224 (3.2404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7599 (0.8135)  time: 0.2806  data: 0.0004  max mem: 21847
Epoch: [108]  [1200/1251]  eta: 0:00:14  lr: 0.003084  min_lr: 0.003084  loss: 3.2223 (3.2499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6565 (0.8162)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 3.2855 (3.2472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6486 (0.8108)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [108] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 3.2855 (3.2336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6486 (0.8108)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6609 (0.6609)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.5631  data: 5.4037  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8448 (0.8668)  acc1: 83.2000 (81.7818)  acc5: 96.8000 (96.4000)  time: 0.7357  data: 0.6003  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0811 (1.0490)  acc1: 76.0000 (77.8667)  acc5: 94.0000 (94.3048)  time: 0.2151  data: 0.0827  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1859 (1.0589)  acc1: 74.4000 (77.3920)  acc5: 93.6000 (94.3040)  time: 0.2143  data: 0.0826  max mem: 21847
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 77.622 Acc@5 94.224 loss 1.045
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.72%
Epoch: [109]  [   0/1251]  eta: 1:09:58  lr: 0.003083  min_lr: 0.003083  loss: 2.3360 (2.3360)  weight_decay: 0.0500 (0.0500)  time: 3.3559  data: 2.8338  max mem: 21847
Epoch: [109]  [ 200/1251]  eta: 0:05:07  lr: 0.003080  min_lr: 0.003080  loss: 3.1484 (3.2009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6632 (0.7111)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [109]  [ 400/1251]  eta: 0:04:00  lr: 0.003077  min_lr: 0.003077  loss: 3.3645 (3.2221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9295 (0.7436)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [109]  [ 600/1251]  eta: 0:03:02  lr: 0.003074  min_lr: 0.003074  loss: 3.5592 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.7726)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [109]  [ 800/1251]  eta: 0:02:05  lr: 0.003071  min_lr: 0.003071  loss: 3.5814 (3.2473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7402 (0.7782)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [109]  [1000/1251]  eta: 0:01:09  lr: 0.003068  min_lr: 0.003068  loss: 3.3784 (3.2418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (0.7930)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [109]  [1200/1251]  eta: 0:00:14  lr: 0.003065  min_lr: 0.003065  loss: 3.4220 (3.2438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8443 (0.8077)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 2.8481 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6802 (0.8026)  time: 0.2275  data: 0.0005  max mem: 21847
Epoch: [109] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 2.8481 (3.2254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6802 (0.8026)
Test:  [ 0/25]  eta: 0:01:46  loss: 0.6278 (0.6278)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 4.2584  data: 4.0763  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8522 (0.8340)  acc1: 82.0000 (81.7091)  acc5: 96.8000 (96.1455)  time: 0.6476  data: 0.5118  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0196 (0.9966)  acc1: 75.6000 (78.0381)  acc5: 93.2000 (94.2476)  time: 0.2349  data: 0.1056  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1241 (1.0109)  acc1: 74.8000 (77.5360)  acc5: 93.2000 (94.0960)  time: 0.1947  data: 0.0664  max mem: 21847
Test: Total time: 0:00:09 (0.3940 s / it)
* Acc@1 77.804 Acc@5 94.286 loss 1.002
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.80%
Epoch: [110]  [   0/1251]  eta: 1:07:47  lr: 0.003064  min_lr: 0.003064  loss: 2.6211 (2.6211)  weight_decay: 0.0500 (0.0500)  time: 3.2513  data: 2.9607  max mem: 21847
Epoch: [110]  [ 200/1251]  eta: 0:05:06  lr: 0.003061  min_lr: 0.003061  loss: 3.3373 (3.2470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.8304)  time: 0.2718  data: 0.0005  max mem: 21847
Epoch: [110]  [ 400/1251]  eta: 0:04:00  lr: 0.003058  min_lr: 0.003058  loss: 3.7053 (3.2161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9855 (0.8680)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [110]  [ 600/1251]  eta: 0:03:02  lr: 0.003055  min_lr: 0.003055  loss: 3.4730 (3.1822)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2722  data: 0.0005  max mem: 21847
Epoch: [110]  [ 800/1251]  eta: 0:02:05  lr: 0.003052  min_lr: 0.003052  loss: 3.3003 (3.1839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (nan)  time: 0.2707  data: 0.0003  max mem: 21847
Epoch: [110]  [1000/1251]  eta: 0:01:09  lr: 0.003049  min_lr: 0.003049  loss: 3.2564 (3.1992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (nan)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [110]  [1200/1251]  eta: 0:00:14  lr: 0.003046  min_lr: 0.003046  loss: 3.3960 (3.2011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7538 (nan)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 2.9921 (3.2015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (nan)  time: 0.2276  data: 0.0006  max mem: 21847
Epoch: [110] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 2.9921 (3.2228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (nan)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6801 (0.6801)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.9137  data: 5.7376  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8778 (0.8724)  acc1: 82.8000 (81.5636)  acc5: 97.2000 (96.4364)  time: 0.7163  data: 0.5806  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0269 (1.0425)  acc1: 77.6000 (77.7905)  acc5: 93.6000 (94.2857)  time: 0.1859  data: 0.0563  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1407 (1.0514)  acc1: 75.2000 (77.4560)  acc5: 93.2000 (94.2400)  time: 0.1851  data: 0.0562  max mem: 21847
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 77.686 Acc@5 94.258 loss 1.039
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.80%
Epoch: [111]  [   0/1251]  eta: 1:06:07  lr: 0.003045  min_lr: 0.003045  loss: 3.8220 (3.8220)  weight_decay: 0.0500 (0.0500)  time: 3.1714  data: 1.8266  max mem: 21847
Epoch: [111]  [ 200/1251]  eta: 0:05:06  lr: 0.003042  min_lr: 0.003042  loss: 3.1401 (3.2048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6836 (0.7732)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [111]  [ 400/1251]  eta: 0:04:01  lr: 0.003039  min_lr: 0.003039  loss: 2.8331 (3.2048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8379 (0.7730)  time: 0.2800  data: 0.0005  max mem: 21847
Epoch: [111]  [ 600/1251]  eta: 0:03:02  lr: 0.003036  min_lr: 0.003036  loss: 3.4012 (3.2162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6663 (0.7462)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [111]  [ 800/1251]  eta: 0:02:05  lr: 0.003033  min_lr: 0.003033  loss: 3.1310 (3.2119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0049 (0.7798)  time: 0.2707  data: 0.0003  max mem: 21847
Epoch: [111]  [1000/1251]  eta: 0:01:09  lr: 0.003030  min_lr: 0.003030  loss: 3.4364 (3.2035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7517 (0.7712)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [111]  [1200/1251]  eta: 0:00:14  lr: 0.003027  min_lr: 0.003027  loss: 3.3248 (3.1890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8915 (0.8193)  time: 0.2786  data: 0.0004  max mem: 21847
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 3.1588 (3.1877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8915 (0.8239)  time: 0.2278  data: 0.0005  max mem: 21847
Epoch: [111] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 3.1588 (3.2092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8915 (0.8239)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6669 (0.6669)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.6946  data: 5.5432  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8784 (0.8471)  acc1: 81.2000 (81.9273)  acc5: 96.8000 (96.6546)  time: 0.7481  data: 0.6148  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0156 (1.0216)  acc1: 76.0000 (78.0571)  acc5: 94.4000 (94.6286)  time: 0.1991  data: 0.0692  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1259 (1.0361)  acc1: 75.6000 (77.8240)  acc5: 94.0000 (94.5120)  time: 0.1980  data: 0.0691  max mem: 21847
Test: Total time: 0:00:10 (0.4109 s / it)
* Acc@1 77.832 Acc@5 94.370 loss 1.029
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.83%
Epoch: [112]  [   0/1251]  eta: 1:05:57  lr: 0.003026  min_lr: 0.003026  loss: 2.7793 (2.7793)  weight_decay: 0.0500 (0.0500)  time: 3.1631  data: 2.8665  max mem: 21847
Epoch: [112]  [ 200/1251]  eta: 0:05:02  lr: 0.003023  min_lr: 0.003023  loss: 2.9725 (3.2060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5917 (0.6764)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [112]  [ 400/1251]  eta: 0:03:59  lr: 0.003020  min_lr: 0.003020  loss: 3.0003 (3.2161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.7198)  time: 0.2829  data: 0.0004  max mem: 21847
Epoch: [112]  [ 600/1251]  eta: 0:03:01  lr: 0.003017  min_lr: 0.003017  loss: 3.4237 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5445 (0.7460)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [112]  [ 800/1251]  eta: 0:02:05  lr: 0.003014  min_lr: 0.003014  loss: 3.1081 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7457 (0.7640)  time: 0.2741  data: 0.0005  max mem: 21847
Epoch: [112]  [1000/1251]  eta: 0:01:09  lr: 0.003011  min_lr: 0.003011  loss: 3.0396 (3.1851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5854 (0.7443)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [112]  [1200/1251]  eta: 0:00:14  lr: 0.003007  min_lr: 0.003007  loss: 3.3664 (3.1915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (0.7596)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 3.5809 (3.1940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6071 (0.7553)  time: 0.2347  data: 0.0009  max mem: 21847
Epoch: [112] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 3.5809 (3.2061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6071 (0.7553)
Test:  [ 0/25]  eta: 0:01:28  loss: 0.6336 (0.6336)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 3.5370  data: 3.3643  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8629 (0.8706)  acc1: 82.0000 (82.4364)  acc5: 96.4000 (96.3636)  time: 0.6422  data: 0.5071  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0640 (1.0549)  acc1: 75.2000 (78.2476)  acc5: 94.4000 (94.2857)  time: 0.2664  data: 0.1370  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1694 (1.0645)  acc1: 74.8000 (77.6960)  acc5: 94.0000 (94.3040)  time: 0.2108  data: 0.0815  max mem: 21847
Test: Total time: 0:00:09 (0.3936 s / it)
* Acc@1 77.840 Acc@5 94.294 loss 1.059
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.84%
Epoch: [113]  [   0/1251]  eta: 1:03:41  lr: 0.003007  min_lr: 0.003007  loss: 3.8960 (3.8960)  weight_decay: 0.0500 (0.0500)  time: 3.0544  data: 2.6766  max mem: 21847
Epoch: [113]  [ 200/1251]  eta: 0:05:07  lr: 0.003004  min_lr: 0.003004  loss: 3.4980 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.7484)  time: 0.2834  data: 0.0004  max mem: 21847
Epoch: [113]  [ 400/1251]  eta: 0:04:01  lr: 0.003000  min_lr: 0.003000  loss: 3.4431 (3.1795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7477 (0.8361)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [113]  [ 600/1251]  eta: 0:03:02  lr: 0.002997  min_lr: 0.002997  loss: 3.1087 (3.1831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.8360)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [113]  [ 800/1251]  eta: 0:02:05  lr: 0.002994  min_lr: 0.002994  loss: 2.8140 (3.2008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7559 (0.8330)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [113]  [1000/1251]  eta: 0:01:09  lr: 0.002991  min_lr: 0.002991  loss: 3.5506 (3.1853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8962 (0.8516)  time: 0.2778  data: 0.0004  max mem: 21847
Epoch: [113]  [1200/1251]  eta: 0:00:14  lr: 0.002988  min_lr: 0.002988  loss: 3.6897 (3.1887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8389 (0.8444)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 3.0873 (3.1936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7472 (0.8401)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [113] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 3.0873 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7472 (0.8401)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7457 (0.7457)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.4469  data: 5.2900  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9501 (0.9498)  acc1: 81.2000 (80.7273)  acc5: 96.8000 (96.2909)  time: 0.7625  data: 0.6261  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1427 (1.0904)  acc1: 75.6000 (77.3333)  acc5: 94.0000 (94.7429)  time: 0.2273  data: 0.0964  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1427 (1.0994)  acc1: 75.2000 (77.0080)  acc5: 93.6000 (94.6880)  time: 0.2254  data: 0.0963  max mem: 21847
Test: Total time: 0:00:10 (0.4238 s / it)
* Acc@1 77.926 Acc@5 94.436 loss 1.094
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.93%
Epoch: [114]  [   0/1251]  eta: 1:08:52  lr: 0.002987  min_lr: 0.002987  loss: 2.6200 (2.6200)  weight_decay: 0.0500 (0.0500)  time: 3.3030  data: 3.0039  max mem: 21847
Epoch: [114]  [ 200/1251]  eta: 0:05:03  lr: 0.002984  min_lr: 0.002984  loss: 3.2873 (3.0920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.7578)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [114]  [ 400/1251]  eta: 0:04:00  lr: 0.002981  min_lr: 0.002981  loss: 3.5739 (3.1398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.7724)  time: 0.2717  data: 0.0003  max mem: 21847
Epoch: [114]  [ 600/1251]  eta: 0:03:02  lr: 0.002978  min_lr: 0.002978  loss: 3.2786 (3.1486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6739 (0.7644)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [114]  [ 800/1251]  eta: 0:02:05  lr: 0.002975  min_lr: 0.002975  loss: 3.2608 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6747 (0.7556)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [114]  [1000/1251]  eta: 0:01:09  lr: 0.002972  min_lr: 0.002972  loss: 2.9830 (3.1597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.7695)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [114]  [1200/1251]  eta: 0:00:14  lr: 0.002968  min_lr: 0.002968  loss: 3.4888 (3.1637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6300 (0.7741)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 3.1786 (3.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6251 (0.7733)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [114] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 3.1786 (3.1916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6251 (0.7733)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6726 (0.6726)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 5.7011  data: 5.5410  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9785 (0.9031)  acc1: 81.6000 (81.8182)  acc5: 96.4000 (96.6182)  time: 0.7075  data: 0.5720  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0637 (1.0776)  acc1: 74.8000 (77.7714)  acc5: 94.4000 (94.2857)  time: 0.1872  data: 0.0569  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1424 (1.0872)  acc1: 74.8000 (77.5360)  acc5: 93.2000 (94.2240)  time: 0.1860  data: 0.0568  max mem: 21847
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 77.902 Acc@5 94.358 loss 1.078
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.93%
Epoch: [115]  [   0/1251]  eta: 1:06:10  lr: 0.002968  min_lr: 0.002968  loss: 2.4471 (2.4471)  weight_decay: 0.0500 (0.0500)  time: 3.1739  data: 1.6678  max mem: 21847
Epoch: [115]  [ 200/1251]  eta: 0:05:05  lr: 0.002965  min_lr: 0.002965  loss: 3.2678 (3.1223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7972)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [115]  [ 400/1251]  eta: 0:03:59  lr: 0.002961  min_lr: 0.002961  loss: 3.3440 (3.1496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (0.8011)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [115]  [ 600/1251]  eta: 0:03:02  lr: 0.002958  min_lr: 0.002958  loss: 3.3619 (3.1856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.7767)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [115]  [ 800/1251]  eta: 0:02:05  lr: 0.002955  min_lr: 0.002955  loss: 3.3102 (3.2031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6443 (0.7838)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [115]  [1000/1251]  eta: 0:01:09  lr: 0.002952  min_lr: 0.002952  loss: 3.7402 (3.2147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7433 (0.7856)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [115]  [1200/1251]  eta: 0:00:14  lr: 0.002949  min_lr: 0.002949  loss: 3.3025 (3.2222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8334 (0.8333)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 3.5133 (3.2164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8347)  time: 0.2341  data: 0.0007  max mem: 21847
Epoch: [115] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 3.5133 (3.1998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8347)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7310 (0.7310)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.4976  data: 5.3462  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9317 (0.8862)  acc1: 80.8000 (80.9818)  acc5: 96.0000 (96.4000)  time: 0.7416  data: 0.6086  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0713 (1.0597)  acc1: 75.2000 (77.0857)  acc5: 94.8000 (94.2857)  time: 0.2108  data: 0.0815  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1289 (1.0641)  acc1: 74.0000 (77.0240)  acc5: 93.6000 (94.4000)  time: 0.2097  data: 0.0814  max mem: 21847
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 77.824 Acc@5 94.338 loss 1.051
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.93%
Epoch: [116]  [   0/1251]  eta: 1:01:01  lr: 0.002948  min_lr: 0.002948  loss: 3.6953 (3.6953)  weight_decay: 0.0500 (0.0500)  time: 2.9271  data: 2.5197  max mem: 21847
Epoch: [116]  [ 200/1251]  eta: 0:05:06  lr: 0.002945  min_lr: 0.002945  loss: 3.4289 (3.2551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.7369)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [116]  [ 400/1251]  eta: 0:04:00  lr: 0.002942  min_lr: 0.002942  loss: 3.1939 (3.2401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7138 (0.7589)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [116]  [ 600/1251]  eta: 0:03:02  lr: 0.002938  min_lr: 0.002938  loss: 3.1724 (3.2246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6898 (0.7598)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [116]  [ 800/1251]  eta: 0:02:05  lr: 0.002935  min_lr: 0.002935  loss: 3.1504 (3.2178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6670 (0.7574)  time: 0.2799  data: 0.0004  max mem: 21847
Epoch: [116]  [1000/1251]  eta: 0:01:09  lr: 0.002932  min_lr: 0.002932  loss: 3.1705 (3.2115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8055 (0.7738)  time: 0.2730  data: 0.0005  max mem: 21847
Epoch: [116]  [1200/1251]  eta: 0:00:14  lr: 0.002929  min_lr: 0.002929  loss: 2.8621 (3.2125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6487 (0.7682)  time: 0.2754  data: 0.0004  max mem: 21847
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 3.6633 (3.2183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.7705)  time: 0.2282  data: 0.0010  max mem: 21847
Epoch: [116] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 3.6633 (3.1894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.7705)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7843 (0.7843)  acc1: 83.6000 (83.6000)  acc5: 98.4000 (98.4000)  time: 5.4009  data: 5.2505  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9834 (0.9458)  acc1: 83.2000 (81.7091)  acc5: 96.8000 (96.6182)  time: 0.6993  data: 0.5654  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0900 (1.1141)  acc1: 76.8000 (77.7905)  acc5: 94.0000 (94.3238)  time: 0.1948  data: 0.0649  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2510 (1.1203)  acc1: 76.4000 (77.6800)  acc5: 93.2000 (94.2400)  time: 0.1939  data: 0.0649  max mem: 21847
Test: Total time: 0:00:09 (0.3970 s / it)
* Acc@1 77.830 Acc@5 94.360 loss 1.118
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.93%
Epoch: [117]  [   0/1251]  eta: 1:04:07  lr: 0.002928  min_lr: 0.002928  loss: 3.5438 (3.5438)  weight_decay: 0.0500 (0.0500)  time: 3.0754  data: 1.8219  max mem: 21847
Epoch: [117]  [ 200/1251]  eta: 0:05:04  lr: 0.002925  min_lr: 0.002925  loss: 3.0089 (3.1722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.9198)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [117]  [ 400/1251]  eta: 0:03:59  lr: 0.002922  min_lr: 0.002922  loss: 3.3485 (3.2186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6778 (0.8035)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [117]  [ 600/1251]  eta: 0:03:01  lr: 0.002919  min_lr: 0.002919  loss: 3.1060 (3.1820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7401 (0.8324)  time: 0.2722  data: 0.0003  max mem: 21847
Epoch: [117]  [ 800/1251]  eta: 0:02:05  lr: 0.002915  min_lr: 0.002915  loss: 3.3360 (3.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7477 (0.8112)  time: 0.2709  data: 0.0003  max mem: 21847
Epoch: [117]  [1000/1251]  eta: 0:01:09  lr: 0.002912  min_lr: 0.002912  loss: 3.2353 (3.1895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (0.8079)  time: 0.2731  data: 0.0005  max mem: 21847
Epoch: [117]  [1200/1251]  eta: 0:00:14  lr: 0.002909  min_lr: 0.002909  loss: 3.3480 (3.1976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.7959)  time: 0.2716  data: 0.0003  max mem: 21847
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 3.5204 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.7921)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [117] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 3.5204 (3.2010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.7921)
Test:  [ 0/25]  eta: 0:01:59  loss: 0.6811 (0.6811)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 4.7748  data: 4.5976  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9121 (0.8846)  acc1: 81.6000 (81.3818)  acc5: 96.8000 (96.7273)  time: 0.6944  data: 0.5572  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0817 (1.0486)  acc1: 76.0000 (77.9429)  acc5: 94.0000 (94.6857)  time: 0.2359  data: 0.1053  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1621 (1.0644)  acc1: 74.4000 (77.5200)  acc5: 92.8000 (94.4800)  time: 0.2253  data: 0.0960  max mem: 21847
Test: Total time: 0:00:10 (0.4034 s / it)
* Acc@1 77.776 Acc@5 94.336 loss 1.059
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.93%
Epoch: [118]  [   0/1251]  eta: 1:09:09  lr: 0.002908  min_lr: 0.002908  loss: 3.8724 (3.8724)  weight_decay: 0.0500 (0.0500)  time: 3.3167  data: 2.9789  max mem: 21847
Epoch: [118]  [ 200/1251]  eta: 0:05:02  lr: 0.002905  min_lr: 0.002905  loss: 3.5396 (3.2243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7721 (0.9099)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [118]  [ 400/1251]  eta: 0:03:59  lr: 0.002902  min_lr: 0.002902  loss: 2.7189 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6958 (0.8236)  time: 0.2844  data: 0.0003  max mem: 21847
Epoch: [118]  [ 600/1251]  eta: 0:03:01  lr: 0.002899  min_lr: 0.002899  loss: 3.3502 (3.2023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5887 (0.8326)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [118]  [ 800/1251]  eta: 0:02:05  lr: 0.002895  min_lr: 0.002895  loss: 2.9345 (3.1725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8192)  time: 0.2761  data: 0.0005  max mem: 21847
Epoch: [118]  [1000/1251]  eta: 0:01:09  lr: 0.002892  min_lr: 0.002892  loss: 2.6941 (3.1825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7140 (0.8115)  time: 0.2817  data: 0.0004  max mem: 21847
Epoch: [118]  [1200/1251]  eta: 0:00:14  lr: 0.002889  min_lr: 0.002889  loss: 3.5786 (3.1962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.7997)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 3.3779 (3.1996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.8042)  time: 0.2328  data: 0.0007  max mem: 21847
Epoch: [118] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 3.3779 (3.1924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.8042)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6387 (0.6387)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.6752  data: 5.5028  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8690 (0.8654)  acc1: 82.0000 (82.3273)  acc5: 96.8000 (96.6909)  time: 0.7117  data: 0.5758  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0449 (1.0450)  acc1: 76.8000 (78.0000)  acc5: 93.6000 (94.5333)  time: 0.1929  data: 0.0597  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1753 (1.0593)  acc1: 74.8000 (77.5360)  acc5: 93.2000 (94.4960)  time: 0.2309  data: 0.0985  max mem: 21847
Test: Total time: 0:00:10 (0.4361 s / it)
* Acc@1 78.194 Acc@5 94.540 loss 1.043
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.19%
Epoch: [119]  [   0/1251]  eta: 1:01:38  lr: 0.002888  min_lr: 0.002888  loss: 2.9728 (2.9728)  weight_decay: 0.0500 (0.0500)  time: 2.9565  data: 2.6409  max mem: 21847
Epoch: [119]  [ 200/1251]  eta: 0:05:02  lr: 0.002885  min_lr: 0.002885  loss: 3.6441 (3.2347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6887 (0.8152)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [119]  [ 400/1251]  eta: 0:03:59  lr: 0.002882  min_lr: 0.002882  loss: 3.3902 (3.2205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6683 (0.7934)  time: 0.2817  data: 0.0004  max mem: 21847
Epoch: [119]  [ 600/1251]  eta: 0:03:01  lr: 0.002879  min_lr: 0.002879  loss: 3.0438 (3.2227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6169 (0.7670)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [119]  [ 800/1251]  eta: 0:02:05  lr: 0.002875  min_lr: 0.002875  loss: 3.7640 (3.2254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7088 (0.7716)  time: 0.2794  data: 0.0004  max mem: 21847
Epoch: [119]  [1000/1251]  eta: 0:01:09  lr: 0.002872  min_lr: 0.002872  loss: 3.6747 (3.2076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (0.7858)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [119]  [1200/1251]  eta: 0:00:14  lr: 0.002869  min_lr: 0.002869  loss: 3.4783 (3.2109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9331 (0.8162)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 2.6361 (3.2015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7427 (0.8119)  time: 0.2335  data: 0.0005  max mem: 21847
Epoch: [119] Total time: 0:05:45 (0.2766 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 2.6361 (3.1781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7427 (0.8119)
Test:  [ 0/25]  eta: 0:02:01  loss: 0.6172 (0.6172)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 4.8719  data: 4.7120  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7929 (0.8267)  acc1: 80.4000 (81.7818)  acc5: 96.8000 (96.3636)  time: 0.7299  data: 0.5953  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0096 (0.9920)  acc1: 75.6000 (77.8667)  acc5: 94.4000 (94.4191)  time: 0.2490  data: 0.1192  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1114 (1.0005)  acc1: 75.6000 (77.7440)  acc5: 93.6000 (94.3520)  time: 0.2118  data: 0.0835  max mem: 21847
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 78.204 Acc@5 94.552 loss 0.983
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.20%
Epoch: [120]  [   0/1251]  eta: 1:08:44  lr: 0.002868  min_lr: 0.002868  loss: 3.8932 (3.8932)  weight_decay: 0.0500 (0.0500)  time: 3.2970  data: 2.9913  max mem: 21847
Epoch: [120]  [ 200/1251]  eta: 0:05:03  lr: 0.002865  min_lr: 0.002865  loss: 3.6018 (3.1886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7598)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [120]  [ 400/1251]  eta: 0:04:00  lr: 0.002862  min_lr: 0.002862  loss: 3.5303 (3.1426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7053 (nan)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [120]  [ 600/1251]  eta: 0:03:01  lr: 0.002858  min_lr: 0.002858  loss: 3.4026 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (nan)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [120]  [ 800/1251]  eta: 0:02:05  lr: 0.002855  min_lr: 0.002855  loss: 3.2684 (3.1791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8928 (nan)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [120]  [1000/1251]  eta: 0:01:09  lr: 0.002852  min_lr: 0.002852  loss: 3.4507 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8187 (nan)  time: 0.2704  data: 0.0004  max mem: 21847
Epoch: [120]  [1200/1251]  eta: 0:00:14  lr: 0.002849  min_lr: 0.002849  loss: 3.5170 (3.2045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7992 (nan)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 3.1942 (3.2063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6621 (nan)  time: 0.2284  data: 0.0006  max mem: 21847
Epoch: [120] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 3.1942 (3.1797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6621 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6447 (0.6447)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 5.5903  data: 5.4329  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8171 (0.8496)  acc1: 82.8000 (82.0364)  acc5: 97.2000 (96.8000)  time: 0.7354  data: 0.5983  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0215 (1.0285)  acc1: 75.2000 (78.2667)  acc5: 94.8000 (94.8762)  time: 0.2028  data: 0.0715  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1518 (1.0400)  acc1: 74.8000 (78.0960)  acc5: 94.0000 (94.7360)  time: 0.2019  data: 0.0714  max mem: 21847
Test: Total time: 0:00:10 (0.4093 s / it)
* Acc@1 78.378 Acc@5 94.506 loss 1.025
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.38%
Epoch: [121]  [   0/1251]  eta: 1:11:54  lr: 0.002848  min_lr: 0.002848  loss: 3.0296 (3.0296)  weight_decay: 0.0500 (0.0500)  time: 3.4489  data: 3.1425  max mem: 21847
Epoch: [121]  [ 200/1251]  eta: 0:05:05  lr: 0.002845  min_lr: 0.002845  loss: 3.3369 (3.2068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0485 (0.9151)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [121]  [ 400/1251]  eta: 0:04:00  lr: 0.002841  min_lr: 0.002841  loss: 3.0399 (3.2082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6395 (0.8675)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [121]  [ 600/1251]  eta: 0:03:02  lr: 0.002838  min_lr: 0.002838  loss: 3.4555 (3.2249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (0.8640)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [121]  [ 800/1251]  eta: 0:02:05  lr: 0.002835  min_lr: 0.002835  loss: 3.2198 (3.2130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7796 (0.8399)  time: 0.2824  data: 0.0004  max mem: 21847
Epoch: [121]  [1000/1251]  eta: 0:01:09  lr: 0.002831  min_lr: 0.002831  loss: 3.4613 (3.2016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.8540)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [121]  [1200/1251]  eta: 0:00:14  lr: 0.002828  min_lr: 0.002828  loss: 3.1719 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6500 (0.8319)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 3.4274 (3.2002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6552 (0.8284)  time: 0.2352  data: 0.0005  max mem: 21847
Epoch: [121] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 3.4274 (3.1942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6552 (0.8284)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6300 (0.6300)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.7130  data: 5.5630  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8902 (0.8450)  acc1: 80.4000 (82.1455)  acc5: 97.2000 (96.9455)  time: 0.7564  data: 0.6225  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0223 (1.0246)  acc1: 75.6000 (77.7714)  acc5: 94.0000 (94.4000)  time: 0.2048  data: 0.0749  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1544 (1.0338)  acc1: 74.4000 (77.4080)  acc5: 92.8000 (94.2400)  time: 0.2038  data: 0.0748  max mem: 21847
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 78.122 Acc@5 94.490 loss 1.020
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.38%
Epoch: [122]  [   0/1251]  eta: 1:00:50  lr: 0.002827  min_lr: 0.002827  loss: 4.1164 (4.1164)  weight_decay: 0.0500 (0.0500)  time: 2.9185  data: 2.4339  max mem: 21847
Epoch: [122]  [ 200/1251]  eta: 0:05:03  lr: 0.002824  min_lr: 0.002824  loss: 2.9372 (3.1290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8851 (0.8107)  time: 0.2718  data: 0.0003  max mem: 21847
Epoch: [122]  [ 400/1251]  eta: 0:03:59  lr: 0.002821  min_lr: 0.002821  loss: 3.2389 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6988 (0.7762)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [122]  [ 600/1251]  eta: 0:03:01  lr: 0.002818  min_lr: 0.002818  loss: 3.1145 (3.1496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.8289)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [122]  [ 800/1251]  eta: 0:02:05  lr: 0.002814  min_lr: 0.002814  loss: 3.4609 (3.1648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.8152)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [122]  [1000/1251]  eta: 0:01:09  lr: 0.002811  min_lr: 0.002811  loss: 3.4033 (3.1675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (0.8007)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [122]  [1200/1251]  eta: 0:00:14  lr: 0.002808  min_lr: 0.002808  loss: 2.9115 (3.1627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6667 (0.7965)  time: 0.2732  data: 0.0003  max mem: 21847
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 3.2579 (3.1619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (0.7932)  time: 0.2274  data: 0.0006  max mem: 21847
Epoch: [122] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 3.2579 (3.1698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (0.7932)
Test:  [ 0/25]  eta: 0:01:28  loss: 0.6621 (0.6621)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 3.5375  data: 3.3525  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9080 (0.8523)  acc1: 81.2000 (81.5636)  acc5: 96.8000 (96.8727)  time: 0.6531  data: 0.5162  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0276 (1.0248)  acc1: 76.4000 (78.2095)  acc5: 94.0000 (94.6286)  time: 0.2822  data: 0.1524  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1384 (1.0401)  acc1: 76.4000 (77.6480)  acc5: 93.2000 (94.5440)  time: 0.2185  data: 0.0894  max mem: 21847
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 78.156 Acc@5 94.556 loss 1.025
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.38%
Epoch: [123]  [   0/1251]  eta: 0:57:40  lr: 0.002807  min_lr: 0.002807  loss: 3.5742 (3.5742)  weight_decay: 0.0500 (0.0500)  time: 2.7664  data: 2.3181  max mem: 21847
Epoch: [123]  [ 200/1251]  eta: 0:05:07  lr: 0.002804  min_lr: 0.002804  loss: 3.1428 (3.1459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9047 (0.9451)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [123]  [ 400/1251]  eta: 0:04:01  lr: 0.002800  min_lr: 0.002800  loss: 2.4726 (3.0718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.8486)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [123]  [ 600/1251]  eta: 0:03:03  lr: 0.002797  min_lr: 0.002797  loss: 3.6383 (3.1203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7602 (0.8487)  time: 0.2754  data: 0.0004  max mem: 21847
Epoch: [123]  [ 800/1251]  eta: 0:02:06  lr: 0.002794  min_lr: 0.002794  loss: 3.0664 (3.1342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.8383)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [123]  [1000/1251]  eta: 0:01:09  lr: 0.002790  min_lr: 0.002790  loss: 3.4106 (3.1461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7873 (0.8366)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [123]  [1200/1251]  eta: 0:00:14  lr: 0.002787  min_lr: 0.002787  loss: 3.3560 (3.1491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.8317)  time: 0.2795  data: 0.0004  max mem: 21847
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 3.0921 (3.1501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8028 (0.8345)  time: 0.2278  data: 0.0005  max mem: 21847
Epoch: [123] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 3.0921 (3.1704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8028 (0.8345)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5750 (0.5750)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5486  data: 5.3755  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8292 (0.8161)  acc1: 81.2000 (81.9636)  acc5: 96.8000 (96.4727)  time: 0.7404  data: 0.6059  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9996 (0.9837)  acc1: 76.0000 (78.2667)  acc5: 93.6000 (94.4000)  time: 0.2145  data: 0.0855  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0870 (0.9934)  acc1: 76.0000 (77.9200)  acc5: 93.6000 (94.3520)  time: 0.2136  data: 0.0854  max mem: 21847
Test: Total time: 0:00:10 (0.4175 s / it)
* Acc@1 78.118 Acc@5 94.496 loss 0.983
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.38%
Epoch: [124]  [   0/1251]  eta: 1:01:01  lr: 0.002786  min_lr: 0.002786  loss: 2.5212 (2.5212)  weight_decay: 0.0500 (0.0500)  time: 2.9268  data: 2.3695  max mem: 21847
Epoch: [124]  [ 200/1251]  eta: 0:05:04  lr: 0.002783  min_lr: 0.002783  loss: 3.1275 (3.1453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7827 (0.7687)  time: 0.2737  data: 0.0005  max mem: 21847
Epoch: [124]  [ 400/1251]  eta: 0:04:00  lr: 0.002780  min_lr: 0.002780  loss: 3.4675 (3.1423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.7712)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [124]  [ 600/1251]  eta: 0:03:02  lr: 0.002776  min_lr: 0.002776  loss: 3.4359 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6699 (0.7819)  time: 0.2711  data: 0.0003  max mem: 21847
Epoch: [124]  [ 800/1251]  eta: 0:02:05  lr: 0.002773  min_lr: 0.002773  loss: 2.6693 (3.1570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7150 (0.7920)  time: 0.2710  data: 0.0005  max mem: 21847
Epoch: [124]  [1000/1251]  eta: 0:01:09  lr: 0.002770  min_lr: 0.002770  loss: 3.3388 (3.1614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8855 (0.8085)  time: 0.2766  data: 0.0004  max mem: 21847
Epoch: [124]  [1200/1251]  eta: 0:00:14  lr: 0.002766  min_lr: 0.002766  loss: 3.5976 (3.1567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6831 (0.8075)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 3.4055 (3.1551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7961 (0.8092)  time: 0.2291  data: 0.0005  max mem: 21847
Epoch: [124] Total time: 0:05:45 (0.2766 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 3.4055 (3.1657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7961 (0.8092)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6630 (0.6630)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.8494  data: 5.6970  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8708 (0.8599)  acc1: 81.6000 (81.3818)  acc5: 97.2000 (96.6909)  time: 0.7197  data: 0.5872  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0527 (1.0278)  acc1: 76.4000 (78.0381)  acc5: 93.6000 (94.3429)  time: 0.1853  data: 0.0563  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1516 (1.0385)  acc1: 75.6000 (77.5680)  acc5: 93.6000 (94.4160)  time: 0.1847  data: 0.0564  max mem: 21847
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 78.204 Acc@5 94.510 loss 1.023
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.38%
Epoch: [125]  [   0/1251]  eta: 1:07:46  lr: 0.002766  min_lr: 0.002766  loss: 3.7161 (3.7161)  weight_decay: 0.0500 (0.0500)  time: 3.2507  data: 2.6377  max mem: 21847
Epoch: [125]  [ 200/1251]  eta: 0:05:06  lr: 0.002762  min_lr: 0.002762  loss: 3.1835 (3.0993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5915 (0.7781)  time: 0.2709  data: 0.0005  max mem: 21847
Epoch: [125]  [ 400/1251]  eta: 0:04:00  lr: 0.002759  min_lr: 0.002759  loss: 3.4597 (3.1368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7680 (0.7946)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [125]  [ 600/1251]  eta: 0:03:02  lr: 0.002756  min_lr: 0.002756  loss: 3.3262 (3.1506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.7946)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [125]  [ 800/1251]  eta: 0:02:05  lr: 0.002752  min_lr: 0.002752  loss: 2.8120 (3.1633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9150 (0.8093)  time: 0.2765  data: 0.0004  max mem: 21847
Epoch: [125]  [1000/1251]  eta: 0:01:09  lr: 0.002749  min_lr: 0.002749  loss: 3.3669 (3.1559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7740 (0.8040)  time: 0.2722  data: 0.0005  max mem: 21847
Epoch: [125]  [1200/1251]  eta: 0:00:14  lr: 0.002746  min_lr: 0.002746  loss: 3.1589 (3.1702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7602 (0.8158)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 3.6267 (3.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7602 (0.8138)  time: 0.2283  data: 0.0008  max mem: 21847
Epoch: [125] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 3.6267 (3.1724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7602 (0.8138)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6952 (0.6952)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.5322  data: 5.3784  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8730 (0.8555)  acc1: 83.6000 (82.6546)  acc5: 96.8000 (96.8364)  time: 0.6739  data: 0.5397  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0512 (1.0228)  acc1: 77.2000 (78.5714)  acc5: 94.4000 (94.8762)  time: 0.1921  data: 0.0616  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1084 (1.0376)  acc1: 76.0000 (78.0160)  acc5: 94.0000 (94.7360)  time: 0.2124  data: 0.0826  max mem: 21847
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 78.278 Acc@5 94.664 loss 1.031
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.38%
Epoch: [126]  [   0/1251]  eta: 1:12:43  lr: 0.002745  min_lr: 0.002745  loss: 3.1675 (3.1675)  weight_decay: 0.0500 (0.0500)  time: 3.4881  data: 2.4513  max mem: 21847
Epoch: [126]  [ 200/1251]  eta: 0:05:06  lr: 0.002742  min_lr: 0.002742  loss: 3.3548 (3.1283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.7917)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [126]  [ 400/1251]  eta: 0:04:01  lr: 0.002738  min_lr: 0.002738  loss: 2.6552 (3.1447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7256 (0.7597)  time: 0.3036  data: 0.0004  max mem: 21847
Epoch: [126]  [ 600/1251]  eta: 0:03:02  lr: 0.002735  min_lr: 0.002735  loss: 3.2131 (3.1422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6580 (0.8228)  time: 0.2745  data: 0.0005  max mem: 21847
Epoch: [126]  [ 800/1251]  eta: 0:02:05  lr: 0.002732  min_lr: 0.002732  loss: 2.8437 (3.1508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8326 (0.8093)  time: 0.2742  data: 0.0005  max mem: 21847
Epoch: [126]  [1000/1251]  eta: 0:01:09  lr: 0.002728  min_lr: 0.002728  loss: 3.0776 (3.1644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7787 (0.8109)  time: 0.2830  data: 0.0003  max mem: 21847
Epoch: [126]  [1200/1251]  eta: 0:00:14  lr: 0.002725  min_lr: 0.002725  loss: 3.0940 (3.1665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8396 (0.8236)  time: 0.2803  data: 0.0004  max mem: 21847
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 3.3938 (3.1638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7747 (0.8214)  time: 0.2354  data: 0.0007  max mem: 21847
Epoch: [126] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 3.3938 (3.1636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7747 (0.8214)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7007 (0.7007)  acc1: 83.6000 (83.6000)  acc5: 97.2000 (97.2000)  time: 5.7371  data: 5.5868  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8830 (0.8686)  acc1: 82.8000 (81.8545)  acc5: 97.2000 (96.4727)  time: 0.6934  data: 0.5596  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0417 (1.0341)  acc1: 76.0000 (78.6095)  acc5: 94.4000 (94.4000)  time: 0.1802  data: 0.0504  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1398 (1.0467)  acc1: 76.0000 (78.2400)  acc5: 93.6000 (94.3520)  time: 0.1786  data: 0.0503  max mem: 21847
Test: Total time: 0:00:09 (0.3976 s / it)
* Acc@1 78.492 Acc@5 94.612 loss 1.033
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.49%
Epoch: [127]  [   0/1251]  eta: 1:08:07  lr: 0.002724  min_lr: 0.002724  loss: 2.2103 (2.2103)  weight_decay: 0.0500 (0.0500)  time: 3.2671  data: 2.9601  max mem: 21847
Epoch: [127]  [ 200/1251]  eta: 0:05:02  lr: 0.002721  min_lr: 0.002721  loss: 2.8305 (3.1057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7695 (0.7784)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [127]  [ 400/1251]  eta: 0:03:59  lr: 0.002717  min_lr: 0.002717  loss: 3.2051 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6571 (0.8097)  time: 0.2718  data: 0.0005  max mem: 21847
Epoch: [127]  [ 600/1251]  eta: 0:03:01  lr: 0.002714  min_lr: 0.002714  loss: 3.4398 (3.1377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7353 (0.8096)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [127]  [ 800/1251]  eta: 0:02:05  lr: 0.002711  min_lr: 0.002711  loss: 3.4311 (3.1340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6815 (0.8073)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [127]  [1000/1251]  eta: 0:01:09  lr: 0.002707  min_lr: 0.002707  loss: 3.2231 (3.1503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (0.8208)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [127]  [1200/1251]  eta: 0:00:14  lr: 0.002704  min_lr: 0.002704  loss: 3.4329 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0868 (0.8262)  time: 0.2749  data: 0.0005  max mem: 21847
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 2.8035 (3.1476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.8277)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [127] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 2.8035 (3.1615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.8277)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6225 (0.6225)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.7835  data: 5.6131  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8606 (0.8225)  acc1: 81.6000 (82.0364)  acc5: 96.8000 (96.5818)  time: 0.7085  data: 0.5743  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0436 (0.9831)  acc1: 76.4000 (78.3048)  acc5: 94.4000 (94.5714)  time: 0.1808  data: 0.0517  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0549 (0.9906)  acc1: 76.0000 (77.9200)  acc5: 94.0000 (94.5600)  time: 0.1807  data: 0.0524  max mem: 21847
Test: Total time: 0:00:10 (0.4006 s / it)
* Acc@1 78.344 Acc@5 94.586 loss 0.981
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.49%
Epoch: [128]  [   0/1251]  eta: 1:06:51  lr: 0.002703  min_lr: 0.002703  loss: 2.1368 (2.1368)  weight_decay: 0.0500 (0.0500)  time: 3.2070  data: 2.8251  max mem: 21847
Epoch: [128]  [ 200/1251]  eta: 0:05:06  lr: 0.002700  min_lr: 0.002700  loss: 3.5256 (3.2069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7674 (0.7241)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [128]  [ 400/1251]  eta: 0:04:00  lr: 0.002696  min_lr: 0.002696  loss: 3.3164 (3.1527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6689 (0.7313)  time: 0.2712  data: 0.0003  max mem: 21847
Epoch: [128]  [ 600/1251]  eta: 0:03:01  lr: 0.002693  min_lr: 0.002693  loss: 2.9978 (3.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6791 (0.7442)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [128]  [ 800/1251]  eta: 0:02:05  lr: 0.002690  min_lr: 0.002690  loss: 3.2924 (3.1625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8416 (nan)  time: 0.2820  data: 0.0003  max mem: 21847
Epoch: [128]  [1000/1251]  eta: 0:01:09  lr: 0.002686  min_lr: 0.002686  loss: 3.0529 (3.1539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6812 (nan)  time: 0.2725  data: 0.0003  max mem: 21847
Epoch: [128]  [1200/1251]  eta: 0:00:14  lr: 0.002683  min_lr: 0.002683  loss: 3.4716 (3.1572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (nan)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 2.8892 (3.1533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9233 (nan)  time: 0.2274  data: 0.0006  max mem: 21847
Epoch: [128] Total time: 0:05:45 (0.2763 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 2.8892 (3.1595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9233 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6432 (0.6432)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.6698  data: 5.5093  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8428 (0.8490)  acc1: 82.8000 (81.7455)  acc5: 96.8000 (96.5455)  time: 0.7544  data: 0.6189  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0196 (1.0095)  acc1: 77.6000 (78.4952)  acc5: 94.0000 (94.6095)  time: 0.2025  data: 0.0705  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1145 (1.0221)  acc1: 77.2000 (77.9360)  acc5: 93.2000 (94.5120)  time: 0.2037  data: 0.0722  max mem: 21847
Test: Total time: 0:00:10 (0.4137 s / it)
* Acc@1 78.230 Acc@5 94.602 loss 1.014
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.49%
Epoch: [129]  [   0/1251]  eta: 1:10:35  lr: 0.002682  min_lr: 0.002682  loss: 2.0575 (2.0575)  weight_decay: 0.0500 (0.0500)  time: 3.3856  data: 2.4634  max mem: 21847
Epoch: [129]  [ 200/1251]  eta: 0:05:05  lr: 0.002679  min_lr: 0.002679  loss: 3.0655 (3.1642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7782 (0.7580)  time: 0.2849  data: 0.0005  max mem: 21847
Epoch: [129]  [ 400/1251]  eta: 0:04:01  lr: 0.002675  min_lr: 0.002675  loss: 2.6269 (3.1639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7379 (0.7495)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [129]  [ 600/1251]  eta: 0:03:02  lr: 0.002672  min_lr: 0.002672  loss: 3.1132 (3.1516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7527 (0.7581)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [129]  [ 800/1251]  eta: 0:02:05  lr: 0.002668  min_lr: 0.002668  loss: 3.3003 (3.1548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6809 (0.7672)  time: 0.2711  data: 0.0005  max mem: 21847
Epoch: [129]  [1000/1251]  eta: 0:01:09  lr: 0.002665  min_lr: 0.002665  loss: 3.5899 (3.1687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.7811)  time: 0.2807  data: 0.0005  max mem: 21847
Epoch: [129]  [1200/1251]  eta: 0:00:14  lr: 0.002662  min_lr: 0.002662  loss: 3.6292 (3.1766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.7859)  time: 0.2788  data: 0.0004  max mem: 21847
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 3.3070 (3.1786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7854)  time: 0.2291  data: 0.0007  max mem: 21847
Epoch: [129] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 3.3070 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7854)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7308 (0.7308)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.6063  data: 5.4556  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9218 (0.9250)  acc1: 82.4000 (81.3091)  acc5: 96.4000 (96.2909)  time: 0.7447  data: 0.6103  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1114 (1.0699)  acc1: 76.0000 (78.0952)  acc5: 94.0000 (94.3810)  time: 0.2060  data: 0.0759  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1812 (1.0794)  acc1: 75.2000 (77.7120)  acc5: 93.6000 (94.3200)  time: 0.2041  data: 0.0758  max mem: 21847
Test: Total time: 0:00:10 (0.4137 s / it)
* Acc@1 78.258 Acc@5 94.558 loss 1.070
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.49%
Epoch: [130]  [   0/1251]  eta: 1:08:33  lr: 0.002661  min_lr: 0.002661  loss: 3.1821 (3.1821)  weight_decay: 0.0500 (0.0500)  time: 3.2885  data: 2.4723  max mem: 21847
Epoch: [130]  [ 200/1251]  eta: 0:05:06  lr: 0.002657  min_lr: 0.002657  loss: 3.2315 (3.0347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7152 (0.9381)  time: 0.2845  data: 0.0004  max mem: 21847
Epoch: [130]  [ 400/1251]  eta: 0:03:59  lr: 0.002654  min_lr: 0.002654  loss: 3.6519 (3.0832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8928)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [130]  [ 600/1251]  eta: 0:03:02  lr: 0.002651  min_lr: 0.002651  loss: 2.9531 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7407 (0.8472)  time: 0.2830  data: 0.0004  max mem: 21847
Epoch: [130]  [ 800/1251]  eta: 0:02:05  lr: 0.002647  min_lr: 0.002647  loss: 2.8786 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6066 (0.8063)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [130]  [1000/1251]  eta: 0:01:09  lr: 0.002644  min_lr: 0.002644  loss: 2.9321 (3.1180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.8192)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [130]  [1200/1251]  eta: 0:00:14  lr: 0.002640  min_lr: 0.002640  loss: 3.5525 (3.1307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8217 (0.8261)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 3.5777 (3.1318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.8255)  time: 0.2277  data: 0.0007  max mem: 21847
Epoch: [130] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 3.5777 (3.1464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.8255)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6838 (0.6838)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.3906  data: 5.2309  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8970 (0.8700)  acc1: 82.4000 (82.4727)  acc5: 96.8000 (96.5091)  time: 0.7420  data: 0.6087  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0750 (1.0370)  acc1: 77.2000 (78.7619)  acc5: 93.6000 (94.3238)  time: 0.2142  data: 0.0852  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1444 (1.0481)  acc1: 76.8000 (78.4480)  acc5: 94.0000 (94.4000)  time: 0.2134  data: 0.0851  max mem: 21847
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 78.432 Acc@5 94.588 loss 1.032
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.49%
Epoch: [131]  [   0/1251]  eta: 1:06:09  lr: 0.002640  min_lr: 0.002640  loss: 3.7460 (3.7460)  weight_decay: 0.0500 (0.0500)  time: 3.1731  data: 2.7248  max mem: 21847
Epoch: [131]  [ 200/1251]  eta: 0:05:05  lr: 0.002636  min_lr: 0.002636  loss: 3.0766 (3.1354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6123 (0.7667)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [131]  [ 400/1251]  eta: 0:04:00  lr: 0.002633  min_lr: 0.002633  loss: 2.8139 (3.1747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8696 (0.9152)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [131]  [ 600/1251]  eta: 0:03:01  lr: 0.002629  min_lr: 0.002629  loss: 3.2153 (3.1790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6090 (0.8562)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [131]  [ 800/1251]  eta: 0:02:05  lr: 0.002626  min_lr: 0.002626  loss: 2.8942 (3.1626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6337 (0.8091)  time: 0.2721  data: 0.0005  max mem: 21847
Epoch: [131]  [1000/1251]  eta: 0:01:09  lr: 0.002623  min_lr: 0.002623  loss: 2.9144 (3.1797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6308 (0.7944)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [131]  [1200/1251]  eta: 0:00:14  lr: 0.002619  min_lr: 0.002619  loss: 3.3106 (3.1740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6797 (0.7864)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 3.4206 (3.1776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.7813)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [131] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 3.4206 (3.1642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.7813)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6778 (0.6778)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.7964  data: 5.6452  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9138 (0.8983)  acc1: 80.4000 (81.8545)  acc5: 96.8000 (96.3636)  time: 0.7479  data: 0.6151  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0837 (1.0401)  acc1: 76.8000 (78.6476)  acc5: 94.0000 (94.6857)  time: 0.1907  data: 0.0612  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1179 (1.0483)  acc1: 76.4000 (78.3360)  acc5: 94.0000 (94.6720)  time: 0.1898  data: 0.0611  max mem: 21847
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 78.454 Acc@5 94.694 loss 1.040
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.49%
Epoch: [132]  [   0/1251]  eta: 1:11:02  lr: 0.002618  min_lr: 0.002618  loss: 2.3464 (2.3464)  weight_decay: 0.0500 (0.0500)  time: 3.4077  data: 3.0518  max mem: 21847
Epoch: [132]  [ 200/1251]  eta: 0:05:03  lr: 0.002615  min_lr: 0.002615  loss: 3.2641 (3.2326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7314 (0.7608)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [132]  [ 400/1251]  eta: 0:04:00  lr: 0.002612  min_lr: 0.002612  loss: 3.3212 (3.1815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6798 (0.7805)  time: 0.2790  data: 0.0004  max mem: 21847
Epoch: [132]  [ 600/1251]  eta: 0:03:02  lr: 0.002608  min_lr: 0.002608  loss: 3.4675 (3.1772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7834)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [132]  [ 800/1251]  eta: 0:02:05  lr: 0.002605  min_lr: 0.002605  loss: 3.1278 (3.1827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6288 (0.7633)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [132]  [1000/1251]  eta: 0:01:09  lr: 0.002601  min_lr: 0.002601  loss: 3.4156 (3.1780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8464 (0.7845)  time: 0.2712  data: 0.0003  max mem: 21847
Epoch: [132]  [1200/1251]  eta: 0:00:14  lr: 0.002598  min_lr: 0.002598  loss: 3.0305 (3.1794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7712 (0.7886)  time: 0.2754  data: 0.0004  max mem: 21847
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.5601 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (0.7839)  time: 0.2339  data: 0.0005  max mem: 21847
Epoch: [132] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.5601 (3.1519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (0.7839)
Test:  [ 0/25]  eta: 0:02:02  loss: 0.6700 (0.6700)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 4.8985  data: 4.7464  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8459 (0.8454)  acc1: 82.0000 (82.0727)  acc5: 97.2000 (96.7636)  time: 0.7121  data: 0.5775  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0060 (1.0131)  acc1: 76.8000 (78.6667)  acc5: 94.4000 (94.6667)  time: 0.2365  data: 0.1064  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1380 (1.0224)  acc1: 76.4000 (78.3520)  acc5: 93.6000 (94.6720)  time: 0.2239  data: 0.0957  max mem: 21847
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 78.570 Acc@5 94.734 loss 1.017
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.57%
Epoch: [133]  [   0/1251]  eta: 1:08:19  lr: 0.002597  min_lr: 0.002597  loss: 3.9052 (3.9052)  weight_decay: 0.0500 (0.0500)  time: 3.2771  data: 2.9476  max mem: 21847
Epoch: [133]  [ 200/1251]  eta: 0:05:02  lr: 0.002594  min_lr: 0.002594  loss: 3.1558 (3.1169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7380 (0.7639)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [133]  [ 400/1251]  eta: 0:03:58  lr: 0.002590  min_lr: 0.002590  loss: 2.4056 (3.0956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7414 (0.8060)  time: 0.2701  data: 0.0004  max mem: 21847
Epoch: [133]  [ 600/1251]  eta: 0:03:01  lr: 0.002587  min_lr: 0.002587  loss: 2.8336 (3.0968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8738 (0.8222)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [133]  [ 800/1251]  eta: 0:02:05  lr: 0.002583  min_lr: 0.002583  loss: 3.4745 (3.1083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7345 (0.8107)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [133]  [1000/1251]  eta: 0:01:09  lr: 0.002580  min_lr: 0.002580  loss: 3.5595 (3.1224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7874 (0.8113)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [133]  [1200/1251]  eta: 0:00:14  lr: 0.002576  min_lr: 0.002576  loss: 3.1497 (3.1224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8401 (0.8069)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 3.5521 (3.1219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9950 (0.8193)  time: 0.2336  data: 0.0007  max mem: 21847
Epoch: [133] Total time: 0:05:45 (0.2763 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 3.5521 (3.1421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9950 (0.8193)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7081 (0.7081)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.4609  data: 5.2889  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8624 (0.8465)  acc1: 82.0000 (81.8909)  acc5: 97.2000 (96.8364)  time: 0.7503  data: 0.6145  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0413 (1.0283)  acc1: 75.6000 (78.0381)  acc5: 93.6000 (94.5524)  time: 0.2250  data: 0.0951  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0671 (1.0353)  acc1: 75.2000 (77.6640)  acc5: 92.8000 (94.5120)  time: 0.2240  data: 0.0950  max mem: 21847
Test: Total time: 0:00:10 (0.4222 s / it)
* Acc@1 78.422 Acc@5 94.638 loss 1.021
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.57%
Epoch: [134]  [   0/1251]  eta: 1:07:49  lr: 0.002576  min_lr: 0.002576  loss: 2.3616 (2.3616)  weight_decay: 0.0500 (0.0500)  time: 3.2530  data: 2.7165  max mem: 21847
Epoch: [134]  [ 200/1251]  eta: 0:05:06  lr: 0.002572  min_lr: 0.002572  loss: 3.5011 (3.1452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6355 (0.8819)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [134]  [ 400/1251]  eta: 0:04:00  lr: 0.002569  min_lr: 0.002569  loss: 3.5628 (3.1704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7586 (0.8246)  time: 0.2736  data: 0.0005  max mem: 21847
Epoch: [134]  [ 600/1251]  eta: 0:03:02  lr: 0.002565  min_lr: 0.002565  loss: 3.2846 (3.1406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6328 (0.8404)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [134]  [ 800/1251]  eta: 0:02:05  lr: 0.002562  min_lr: 0.002562  loss: 3.2475 (3.1651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (0.8350)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [134]  [1000/1251]  eta: 0:01:09  lr: 0.002558  min_lr: 0.002558  loss: 3.4500 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6329 (0.8120)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [134]  [1200/1251]  eta: 0:00:14  lr: 0.002555  min_lr: 0.002555  loss: 3.2505 (3.1713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7583 (0.8164)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 2.8063 (3.1685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7359 (0.8115)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [134] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 2.8063 (3.1538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7359 (0.8115)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6181 (0.6181)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.6985  data: 5.5326  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8228 (0.7921)  acc1: 84.0000 (82.3273)  acc5: 97.2000 (96.6182)  time: 0.7422  data: 0.6055  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9583 (0.9439)  acc1: 75.6000 (78.8191)  acc5: 93.6000 (94.7048)  time: 0.2091  data: 0.0786  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0396 (0.9574)  acc1: 76.4000 (78.6080)  acc5: 93.6000 (94.7200)  time: 0.2068  data: 0.0785  max mem: 21847
Test: Total time: 0:00:10 (0.4188 s / it)
* Acc@1 78.770 Acc@5 94.784 loss 0.951
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.77%
Epoch: [135]  [   0/1251]  eta: 1:05:21  lr: 0.002554  min_lr: 0.002554  loss: 2.0553 (2.0553)  weight_decay: 0.0500 (0.0500)  time: 3.1347  data: 2.8234  max mem: 21847
Epoch: [135]  [ 200/1251]  eta: 0:05:03  lr: 0.002551  min_lr: 0.002551  loss: 3.1676 (3.2137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6921 (0.7581)  time: 0.2722  data: 0.0003  max mem: 21847
Epoch: [135]  [ 400/1251]  eta: 0:03:59  lr: 0.002547  min_lr: 0.002547  loss: 3.2229 (3.1569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.7909)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [135]  [ 600/1251]  eta: 0:03:01  lr: 0.002544  min_lr: 0.002544  loss: 3.1926 (3.1572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6318 (0.7537)  time: 0.2859  data: 0.0003  max mem: 21847
Epoch: [135]  [ 800/1251]  eta: 0:02:05  lr: 0.002540  min_lr: 0.002540  loss: 3.4017 (3.1563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7300 (0.7762)  time: 0.2803  data: 0.0004  max mem: 21847
Epoch: [135]  [1000/1251]  eta: 0:01:09  lr: 0.002537  min_lr: 0.002537  loss: 2.9261 (3.1472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6131 (0.7563)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [135]  [1200/1251]  eta: 0:00:14  lr: 0.002533  min_lr: 0.002533  loss: 3.0661 (3.1426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7375 (0.7603)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 3.3607 (3.1398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.7584)  time: 0.2279  data: 0.0005  max mem: 21847
Epoch: [135] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 3.3607 (3.1449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.7584)
Test:  [ 0/25]  eta: 0:01:38  loss: 0.6616 (0.6616)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.9478  data: 3.7963  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8398 (0.8635)  acc1: 83.6000 (82.8727)  acc5: 97.2000 (96.6909)  time: 0.6565  data: 0.5191  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0369 (1.0272)  acc1: 76.8000 (79.2191)  acc5: 94.0000 (94.6857)  time: 0.2614  data: 0.1290  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1364 (1.0381)  acc1: 76.4000 (78.8800)  acc5: 93.6000 (94.6560)  time: 0.2081  data: 0.0799  max mem: 21847
Test: Total time: 0:00:09 (0.3981 s / it)
* Acc@1 78.606 Acc@5 94.698 loss 1.041
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.77%
Epoch: [136]  [   0/1251]  eta: 1:02:33  lr: 0.002532  min_lr: 0.002532  loss: 3.2548 (3.2548)  weight_decay: 0.0500 (0.0500)  time: 3.0001  data: 2.5106  max mem: 21847
Epoch: [136]  [ 200/1251]  eta: 0:05:04  lr: 0.002529  min_lr: 0.002529  loss: 3.2827 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.8261)  time: 0.2791  data: 0.0005  max mem: 21847
Epoch: [136]  [ 400/1251]  eta: 0:04:00  lr: 0.002526  min_lr: 0.002526  loss: 2.9923 (3.1313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7326 (0.8544)  time: 0.2742  data: 0.0003  max mem: 21847
Epoch: [136]  [ 600/1251]  eta: 0:03:02  lr: 0.002522  min_lr: 0.002522  loss: 3.2978 (3.1231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6974 (0.8706)  time: 0.2724  data: 0.0003  max mem: 21847
Epoch: [136]  [ 800/1251]  eta: 0:02:05  lr: 0.002519  min_lr: 0.002519  loss: 3.2214 (3.1388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7379 (0.8579)  time: 0.2820  data: 0.0003  max mem: 21847
Epoch: [136]  [1000/1251]  eta: 0:01:09  lr: 0.002515  min_lr: 0.002515  loss: 2.6842 (3.1350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7475 (0.8569)  time: 0.2810  data: 0.0004  max mem: 21847
Epoch: [136]  [1200/1251]  eta: 0:00:14  lr: 0.002512  min_lr: 0.002512  loss: 3.3544 (3.1400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7238 (0.8408)  time: 0.2794  data: 0.0004  max mem: 21847
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 3.0079 (3.1405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7836 (0.8402)  time: 0.2276  data: 0.0007  max mem: 21847
Epoch: [136] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 3.0079 (3.1358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7836 (0.8402)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6448 (0.6448)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 5.4471  data: 5.2689  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7863 (0.8255)  acc1: 84.4000 (82.9455)  acc5: 96.8000 (96.8000)  time: 0.7031  data: 0.5673  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0251 (0.9886)  acc1: 78.0000 (79.0286)  acc5: 94.8000 (95.0286)  time: 0.2028  data: 0.0731  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0849 (0.9955)  acc1: 76.0000 (78.7680)  acc5: 94.0000 (94.9760)  time: 0.2025  data: 0.0730  max mem: 21847
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 78.722 Acc@5 94.784 loss 0.988
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.77%
Epoch: [137]  [   0/1251]  eta: 1:10:50  lr: 0.002511  min_lr: 0.002511  loss: 2.3865 (2.3865)  weight_decay: 0.0500 (0.0500)  time: 3.3973  data: 2.5521  max mem: 21847
Epoch: [137]  [ 200/1251]  eta: 0:05:03  lr: 0.002507  min_lr: 0.002507  loss: 2.7864 (3.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7927 (0.8224)  time: 0.2825  data: 0.0004  max mem: 21847
Epoch: [137]  [ 400/1251]  eta: 0:03:59  lr: 0.002504  min_lr: 0.002504  loss: 2.6968 (3.1079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8000 (0.8274)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [137]  [ 600/1251]  eta: 0:03:02  lr: 0.002500  min_lr: 0.002500  loss: 2.7676 (3.0788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6540 (0.8129)  time: 0.2839  data: 0.0005  max mem: 21847
Epoch: [137]  [ 800/1251]  eta: 0:02:05  lr: 0.002497  min_lr: 0.002497  loss: 3.1570 (3.0783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7144 (0.8332)  time: 0.2814  data: 0.0005  max mem: 21847
Epoch: [137]  [1000/1251]  eta: 0:01:09  lr: 0.002493  min_lr: 0.002493  loss: 3.3032 (3.0841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.8366)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [137]  [1200/1251]  eta: 0:00:14  lr: 0.002490  min_lr: 0.002490  loss: 3.0021 (3.0986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7219 (nan)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 2.8373 (3.0970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7821 (nan)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [137] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 2.8373 (3.1299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7821 (nan)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6020 (0.6020)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.8882  data: 5.7381  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8167 (0.7977)  acc1: 83.6000 (82.8364)  acc5: 96.4000 (96.6546)  time: 0.7681  data: 0.6356  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9322 (0.9539)  acc1: 77.2000 (79.2571)  acc5: 94.8000 (94.6286)  time: 0.2047  data: 0.0756  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0864 (0.9603)  acc1: 77.2000 (79.0880)  acc5: 93.2000 (94.5920)  time: 0.2038  data: 0.0756  max mem: 21847
Test: Total time: 0:00:10 (0.4233 s / it)
* Acc@1 78.862 Acc@5 94.776 loss 0.952
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.86%
Epoch: [138]  [   0/1251]  eta: 1:03:40  lr: 0.002489  min_lr: 0.002489  loss: 2.1297 (2.1297)  weight_decay: 0.0500 (0.0500)  time: 3.0538  data: 2.7698  max mem: 21847
Epoch: [138]  [ 200/1251]  eta: 0:05:05  lr: 0.002486  min_lr: 0.002486  loss: 3.0047 (3.0827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5821 (0.7182)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [138]  [ 400/1251]  eta: 0:04:00  lr: 0.002482  min_lr: 0.002482  loss: 3.3189 (3.0686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9542 (0.7919)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [138]  [ 600/1251]  eta: 0:03:01  lr: 0.002479  min_lr: 0.002479  loss: 3.1818 (3.0899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8782 (0.8188)  time: 0.2797  data: 0.0004  max mem: 21847
Epoch: [138]  [ 800/1251]  eta: 0:02:05  lr: 0.002475  min_lr: 0.002475  loss: 3.0266 (3.1120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8185 (0.8170)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [138]  [1000/1251]  eta: 0:01:09  lr: 0.002472  min_lr: 0.002472  loss: 3.4472 (3.1298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.8131)  time: 0.2714  data: 0.0003  max mem: 21847
Epoch: [138]  [1200/1251]  eta: 0:00:14  lr: 0.002468  min_lr: 0.002468  loss: 3.4181 (3.1253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7325 (0.8145)  time: 0.2728  data: 0.0003  max mem: 21847
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 3.4213 (3.1310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0555 (0.8286)  time: 0.2277  data: 0.0005  max mem: 21847
Epoch: [138] Total time: 0:05:45 (0.2763 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 3.4213 (3.1285)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0555 (0.8286)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7869 (0.7869)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.8106  data: 5.6571  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9603 (0.9744)  acc1: 83.2000 (81.9273)  acc5: 97.2000 (96.9091)  time: 0.7698  data: 0.6360  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1417 (1.1396)  acc1: 76.0000 (78.8381)  acc5: 95.2000 (94.7810)  time: 0.2117  data: 0.0808  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2672 (1.1554)  acc1: 76.0000 (78.3200)  acc5: 93.2000 (94.5760)  time: 0.2103  data: 0.0807  max mem: 21847
Test: Total time: 0:00:10 (0.4252 s / it)
* Acc@1 78.590 Acc@5 94.586 loss 1.143
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.86%
Epoch: [139]  [   0/1251]  eta: 1:07:44  lr: 0.002467  min_lr: 0.002467  loss: 3.5804 (3.5804)  weight_decay: 0.0500 (0.0500)  time: 3.2489  data: 1.6165  max mem: 21847
Epoch: [139]  [ 200/1251]  eta: 0:05:04  lr: 0.002464  min_lr: 0.002464  loss: 3.1490 (3.0502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6462 (0.7485)  time: 0.2806  data: 0.0003  max mem: 21847
Epoch: [139]  [ 400/1251]  eta: 0:04:00  lr: 0.002460  min_lr: 0.002460  loss: 3.2634 (3.0981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8403 (0.8045)  time: 0.2801  data: 0.0003  max mem: 21847
Epoch: [139]  [ 600/1251]  eta: 0:03:02  lr: 0.002457  min_lr: 0.002457  loss: 3.6001 (3.1067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7646 (0.7983)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [139]  [ 800/1251]  eta: 0:02:05  lr: 0.002453  min_lr: 0.002453  loss: 3.3549 (3.1013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (0.8062)  time: 0.2742  data: 0.0005  max mem: 21847
Epoch: [139]  [1000/1251]  eta: 0:01:09  lr: 0.002450  min_lr: 0.002450  loss: 3.1411 (3.1001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7188 (0.8098)  time: 0.2756  data: 0.0005  max mem: 21847
Epoch: [139]  [1200/1251]  eta: 0:00:14  lr: 0.002446  min_lr: 0.002446  loss: 2.8848 (3.0957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7990 (0.8276)  time: 0.2753  data: 0.0005  max mem: 21847
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 3.3152 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.8281)  time: 0.2334  data: 0.0007  max mem: 21847
Epoch: [139] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 3.3152 (3.1224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.8281)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6817 (0.6817)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.6137  data: 5.4430  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9137 (0.8636)  acc1: 82.0000 (82.5455)  acc5: 96.8000 (96.8364)  time: 0.7387  data: 0.6031  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0330 (1.0137)  acc1: 76.0000 (78.8571)  acc5: 94.8000 (94.7048)  time: 0.2000  data: 0.0692  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0924 (1.0244)  acc1: 76.0000 (78.5760)  acc5: 93.6000 (94.6240)  time: 0.1995  data: 0.0691  max mem: 21847
Test: Total time: 0:00:10 (0.4082 s / it)
* Acc@1 78.704 Acc@5 94.760 loss 1.013
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.86%
Epoch: [140]  [   0/1251]  eta: 1:02:51  lr: 0.002445  min_lr: 0.002445  loss: 3.7992 (3.7992)  weight_decay: 0.0500 (0.0500)  time: 3.0150  data: 2.5862  max mem: 21847
Epoch: [140]  [ 200/1251]  eta: 0:05:06  lr: 0.002442  min_lr: 0.002442  loss: 2.7038 (3.1639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7498 (0.7307)  time: 0.2707  data: 0.0005  max mem: 21847
Epoch: [140]  [ 400/1251]  eta: 0:04:00  lr: 0.002438  min_lr: 0.002438  loss: 3.0746 (3.1440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.7740)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [140]  [ 600/1251]  eta: 0:03:02  lr: 0.002435  min_lr: 0.002435  loss: 2.7114 (3.1263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7200 (0.7863)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [140]  [ 800/1251]  eta: 0:02:05  lr: 0.002431  min_lr: 0.002431  loss: 3.2418 (3.1191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (0.8068)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [140]  [1000/1251]  eta: 0:01:09  lr: 0.002428  min_lr: 0.002428  loss: 3.5386 (3.1191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (0.8237)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [140]  [1200/1251]  eta: 0:00:14  lr: 0.002424  min_lr: 0.002424  loss: 3.2337 (3.1196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6454 (0.8183)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 2.8369 (3.1198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.8224)  time: 0.2295  data: 0.0007  max mem: 21847
Epoch: [140] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 2.8369 (3.1242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.8224)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6263 (0.6263)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.6236  data: 5.4679  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8241 (0.8001)  acc1: 83.2000 (82.5455)  acc5: 96.8000 (96.8364)  time: 0.7428  data: 0.6075  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9869 (0.9535)  acc1: 77.6000 (79.2952)  acc5: 94.4000 (94.8000)  time: 0.2046  data: 0.0717  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0897 (0.9663)  acc1: 77.2000 (78.7040)  acc5: 93.6000 (94.7520)  time: 0.2049  data: 0.0716  max mem: 21847
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 78.724 Acc@5 94.854 loss 0.965
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.86%
Epoch: [141]  [   0/1251]  eta: 1:07:51  lr: 0.002424  min_lr: 0.002424  loss: 3.4959 (3.4959)  weight_decay: 0.0500 (0.0500)  time: 3.2545  data: 2.4239  max mem: 21847
Epoch: [141]  [ 200/1251]  eta: 0:05:07  lr: 0.002420  min_lr: 0.002420  loss: 3.4779 (3.0892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (0.8402)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [141]  [ 400/1251]  eta: 0:04:01  lr: 0.002417  min_lr: 0.002417  loss: 2.8344 (3.1075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6447 (0.8101)  time: 0.2859  data: 0.0005  max mem: 21847
Epoch: [141]  [ 600/1251]  eta: 0:03:02  lr: 0.002413  min_lr: 0.002413  loss: 2.9936 (3.0741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6998 (0.7859)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [141]  [ 800/1251]  eta: 0:02:06  lr: 0.002409  min_lr: 0.002409  loss: 2.9553 (3.0772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8710 (0.8137)  time: 0.2750  data: 0.0005  max mem: 21847
Epoch: [141]  [1000/1251]  eta: 0:01:10  lr: 0.002406  min_lr: 0.002406  loss: 3.1820 (3.0867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6471 (0.8104)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [141]  [1200/1251]  eta: 0:00:14  lr: 0.002402  min_lr: 0.002402  loss: 3.6206 (3.0909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.8189)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 3.3546 (3.0943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7192 (0.8169)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [141] Total time: 0:05:47 (0.2780 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 3.3546 (3.1202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7192 (0.8169)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6675 (0.6675)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 5.5899  data: 5.4165  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8671 (0.8641)  acc1: 83.6000 (82.5455)  acc5: 96.4000 (96.4727)  time: 0.6999  data: 0.5631  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1076 (1.0196)  acc1: 77.2000 (78.9143)  acc5: 94.4000 (94.6667)  time: 0.1856  data: 0.0553  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1076 (1.0354)  acc1: 74.4000 (78.1440)  acc5: 94.4000 (94.6400)  time: 0.1978  data: 0.0694  max mem: 21847
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 78.674 Acc@5 94.714 loss 1.024
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.86%
Epoch: [142]  [   0/1251]  eta: 1:10:13  lr: 0.002402  min_lr: 0.002402  loss: 3.0101 (3.0101)  weight_decay: 0.0500 (0.0500)  time: 3.3685  data: 2.9741  max mem: 21847
Epoch: [142]  [ 200/1251]  eta: 0:05:05  lr: 0.002398  min_lr: 0.002398  loss: 2.8705 (3.1085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (0.8230)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [142]  [ 400/1251]  eta: 0:04:00  lr: 0.002395  min_lr: 0.002395  loss: 3.3587 (3.0778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6829 (0.8168)  time: 0.2811  data: 0.0004  max mem: 21847
Epoch: [142]  [ 600/1251]  eta: 0:03:02  lr: 0.002391  min_lr: 0.002391  loss: 3.1594 (3.1013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7629 (0.8020)  time: 0.2718  data: 0.0005  max mem: 21847
Epoch: [142]  [ 800/1251]  eta: 0:02:05  lr: 0.002387  min_lr: 0.002387  loss: 2.9274 (3.1252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8627 (0.8156)  time: 0.2796  data: 0.0004  max mem: 21847
Epoch: [142]  [1000/1251]  eta: 0:01:09  lr: 0.002384  min_lr: 0.002384  loss: 3.0267 (3.1166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6374 (0.8048)  time: 0.2853  data: 0.0004  max mem: 21847
Epoch: [142]  [1200/1251]  eta: 0:00:14  lr: 0.002380  min_lr: 0.002380  loss: 3.1649 (3.1131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7748 (0.8103)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 2.9651 (3.1068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8046 (0.8109)  time: 0.2358  data: 0.0006  max mem: 21847
Epoch: [142] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 2.9651 (3.1136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8046 (0.8109)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6474 (0.6474)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 5.7991  data: 5.6279  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8204 (0.8161)  acc1: 83.2000 (81.9636)  acc5: 96.4000 (96.6909)  time: 0.7687  data: 0.6320  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0073 (0.9730)  acc1: 76.8000 (78.3810)  acc5: 95.2000 (94.9143)  time: 0.2072  data: 0.0767  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0921 (0.9869)  acc1: 76.0000 (78.0640)  acc5: 93.2000 (94.6560)  time: 0.2050  data: 0.0766  max mem: 21847
Test: Total time: 0:00:10 (0.4220 s / it)
* Acc@1 78.830 Acc@5 94.808 loss 0.975
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.86%
Epoch: [143]  [   0/1251]  eta: 1:06:40  lr: 0.002380  min_lr: 0.002380  loss: 2.5374 (2.5374)  weight_decay: 0.0500 (0.0500)  time: 3.1980  data: 1.8750  max mem: 21847
Epoch: [143]  [ 200/1251]  eta: 0:05:03  lr: 0.002376  min_lr: 0.002376  loss: 2.9392 (2.9869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9007 (0.9594)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [143]  [ 400/1251]  eta: 0:03:59  lr: 0.002373  min_lr: 0.002373  loss: 2.5439 (3.0616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6785 (0.8661)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [143]  [ 600/1251]  eta: 0:03:02  lr: 0.002369  min_lr: 0.002369  loss: 3.0787 (3.0579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8404 (0.8662)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [143]  [ 800/1251]  eta: 0:02:05  lr: 0.002365  min_lr: 0.002365  loss: 3.3380 (3.0908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.8580)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [143]  [1000/1251]  eta: 0:01:09  lr: 0.002362  min_lr: 0.002362  loss: 2.8950 (3.0993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.8447)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [143]  [1200/1251]  eta: 0:00:14  lr: 0.002358  min_lr: 0.002358  loss: 3.4024 (3.0977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6318 (0.8471)  time: 0.2758  data: 0.0004  max mem: 21847
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 2.7108 (3.0931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (0.8403)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [143] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 2.7108 (3.1007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (0.8403)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6665 (0.6665)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.3314  data: 5.1601  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8954 (0.8285)  acc1: 82.8000 (82.5091)  acc5: 96.8000 (97.1273)  time: 0.6218  data: 0.4860  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0049 (0.9762)  acc1: 77.2000 (78.9905)  acc5: 94.0000 (95.2191)  time: 0.1746  data: 0.0444  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0248 (0.9812)  acc1: 75.6000 (78.6080)  acc5: 94.0000 (95.1040)  time: 0.1931  data: 0.0638  max mem: 21847
Test: Total time: 0:00:09 (0.3921 s / it)
* Acc@1 79.106 Acc@5 94.996 loss 0.970
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.11%
Epoch: [144]  [   0/1251]  eta: 0:59:37  lr: 0.002358  min_lr: 0.002358  loss: 3.7453 (3.7453)  weight_decay: 0.0500 (0.0500)  time: 2.8601  data: 2.5309  max mem: 21847
Epoch: [144]  [ 200/1251]  eta: 0:05:03  lr: 0.002354  min_lr: 0.002354  loss: 2.8210 (3.0577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7211 (0.7090)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [144]  [ 400/1251]  eta: 0:03:59  lr: 0.002350  min_lr: 0.002350  loss: 3.2722 (3.0756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.7562)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [144]  [ 600/1251]  eta: 0:03:01  lr: 0.002347  min_lr: 0.002347  loss: 3.0340 (3.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7134 (0.7969)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [144]  [ 800/1251]  eta: 0:02:05  lr: 0.002343  min_lr: 0.002343  loss: 3.2535 (3.0874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7967 (0.8155)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [144]  [1000/1251]  eta: 0:01:09  lr: 0.002340  min_lr: 0.002340  loss: 3.0264 (3.0966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7820 (0.8235)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [144]  [1200/1251]  eta: 0:00:14  lr: 0.002336  min_lr: 0.002336  loss: 3.3380 (3.0942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8137 (0.8296)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.2572 (3.0892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6955 (0.8265)  time: 0.2285  data: 0.0005  max mem: 21847
Epoch: [144] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.2572 (3.1116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6955 (0.8265)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6631 (0.6631)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 5.4533  data: 5.2954  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8918 (0.8585)  acc1: 83.6000 (82.6182)  acc5: 96.4000 (96.9818)  time: 0.6960  data: 0.5599  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0175 (1.0107)  acc1: 76.8000 (79.3143)  acc5: 94.8000 (94.8762)  time: 0.1938  data: 0.0629  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1454 (1.0206)  acc1: 76.4000 (78.8000)  acc5: 94.0000 (94.8000)  time: 0.1993  data: 0.0700  max mem: 21847
Test: Total time: 0:00:10 (0.4025 s / it)
* Acc@1 79.162 Acc@5 94.778 loss 1.006
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.16%
Epoch: [145]  [   0/1251]  eta: 1:09:53  lr: 0.002335  min_lr: 0.002335  loss: 2.8840 (2.8840)  weight_decay: 0.0500 (0.0500)  time: 3.3520  data: 3.0695  max mem: 21847
Epoch: [145]  [ 200/1251]  eta: 0:05:03  lr: 0.002332  min_lr: 0.002332  loss: 2.8489 (3.0439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8189 (0.7939)  time: 0.2803  data: 0.0004  max mem: 21847
Epoch: [145]  [ 400/1251]  eta: 0:03:59  lr: 0.002328  min_lr: 0.002328  loss: 3.1641 (3.0566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6134 (0.7979)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [145]  [ 600/1251]  eta: 0:03:02  lr: 0.002325  min_lr: 0.002325  loss: 2.9748 (3.0872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7934 (0.8024)  time: 0.2895  data: 0.0004  max mem: 21847
Epoch: [145]  [ 800/1251]  eta: 0:02:05  lr: 0.002321  min_lr: 0.002321  loss: 2.8024 (3.0868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6580 (0.8005)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [145]  [1000/1251]  eta: 0:01:09  lr: 0.002318  min_lr: 0.002318  loss: 2.6089 (3.0933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8139 (0.8310)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [145]  [1200/1251]  eta: 0:00:14  lr: 0.002314  min_lr: 0.002314  loss: 2.9752 (3.1034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8248 (0.8316)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 3.4853 (3.1064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.8322)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [145] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 3.4853 (3.0982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.8322)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7218 (0.7218)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.7129  data: 5.5412  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8863 (0.8939)  acc1: 82.8000 (82.1455)  acc5: 96.8000 (96.8364)  time: 0.7654  data: 0.6296  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0663 (1.0583)  acc1: 76.8000 (78.8571)  acc5: 94.8000 (94.6095)  time: 0.2076  data: 0.0778  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2048 (1.0755)  acc1: 76.0000 (78.3360)  acc5: 93.6000 (94.4320)  time: 0.2068  data: 0.0777  max mem: 21847
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 78.934 Acc@5 94.742 loss 1.065
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.16%
Epoch: [146]  [   0/1251]  eta: 1:07:35  lr: 0.002313  min_lr: 0.002313  loss: 3.0142 (3.0142)  weight_decay: 0.0500 (0.0500)  time: 3.2417  data: 2.0079  max mem: 21847
Epoch: [146]  [ 200/1251]  eta: 0:05:04  lr: 0.002310  min_lr: 0.002310  loss: 2.8005 (3.1035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7581 (0.7700)  time: 0.2715  data: 0.0007  max mem: 21847
Epoch: [146]  [ 400/1251]  eta: 0:03:59  lr: 0.002306  min_lr: 0.002306  loss: 3.1521 (3.1081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (0.8088)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [146]  [ 600/1251]  eta: 0:03:01  lr: 0.002303  min_lr: 0.002303  loss: 3.5440 (3.1369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.8159)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [146]  [ 800/1251]  eta: 0:02:05  lr: 0.002299  min_lr: 0.002299  loss: 3.0781 (3.1318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7676 (0.8158)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [146]  [1000/1251]  eta: 0:01:09  lr: 0.002296  min_lr: 0.002296  loss: 3.2437 (3.1132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (0.8117)  time: 0.2771  data: 0.0004  max mem: 21847
Epoch: [146]  [1200/1251]  eta: 0:00:14  lr: 0.002292  min_lr: 0.002292  loss: 3.4105 (3.1277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6757 (nan)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 2.9515 (3.1242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6757 (nan)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [146] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 2.9515 (3.0968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6757 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6418 (0.6418)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.5230  data: 5.3420  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8639 (0.8169)  acc1: 83.2000 (82.9091)  acc5: 96.8000 (96.8727)  time: 0.7509  data: 0.6130  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9855 (0.9700)  acc1: 77.2000 (78.9714)  acc5: 95.2000 (94.9143)  time: 0.2129  data: 0.0811  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0490 (0.9782)  acc1: 76.0000 (78.7200)  acc5: 93.6000 (94.8800)  time: 0.2106  data: 0.0810  max mem: 21847
Test: Total time: 0:00:10 (0.4155 s / it)
* Acc@1 78.878 Acc@5 94.906 loss 0.965
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.16%
Epoch: [147]  [   0/1251]  eta: 1:10:38  lr: 0.002291  min_lr: 0.002291  loss: 2.0618 (2.0618)  weight_decay: 0.0500 (0.0500)  time: 3.3879  data: 1.7316  max mem: 21847
Epoch: [147]  [ 200/1251]  eta: 0:05:08  lr: 0.002288  min_lr: 0.002288  loss: 3.3123 (3.1350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.7917)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [147]  [ 400/1251]  eta: 0:04:02  lr: 0.002284  min_lr: 0.002284  loss: 3.2304 (3.1334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0067 (0.8334)  time: 0.2863  data: 0.0006  max mem: 21847
Epoch: [147]  [ 600/1251]  eta: 0:03:03  lr: 0.002280  min_lr: 0.002280  loss: 3.0420 (3.1239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7467 (0.8632)  time: 0.2731  data: 0.0005  max mem: 21847
Epoch: [147]  [ 800/1251]  eta: 0:02:05  lr: 0.002277  min_lr: 0.002277  loss: 2.8127 (3.1137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (0.8565)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [147]  [1000/1251]  eta: 0:01:09  lr: 0.002273  min_lr: 0.002273  loss: 3.2859 (3.1106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7951 (0.8581)  time: 0.2771  data: 0.0005  max mem: 21847
Epoch: [147]  [1200/1251]  eta: 0:00:14  lr: 0.002270  min_lr: 0.002270  loss: 2.6891 (3.0984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8022 (0.8539)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 3.1589 (3.1036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (0.8486)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [147] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 3.1589 (3.1006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (0.8486)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6721 (0.6721)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 5.6849  data: 5.5373  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8798 (0.8563)  acc1: 81.6000 (82.3273)  acc5: 97.2000 (96.7636)  time: 0.6851  data: 0.5515  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0218 (1.0050)  acc1: 78.0000 (78.9524)  acc5: 94.8000 (94.9143)  time: 0.1755  data: 0.0437  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1299 (1.0192)  acc1: 76.4000 (78.4960)  acc5: 94.0000 (94.9440)  time: 0.1823  data: 0.0520  max mem: 21847
Test: Total time: 0:00:09 (0.3981 s / it)
* Acc@1 78.926 Acc@5 94.900 loss 1.017
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.16%
Epoch: [148]  [   0/1251]  eta: 1:09:22  lr: 0.002269  min_lr: 0.002269  loss: 2.7636 (2.7636)  weight_decay: 0.0500 (0.0500)  time: 3.3275  data: 2.3559  max mem: 21847
Epoch: [148]  [ 200/1251]  eta: 0:05:06  lr: 0.002265  min_lr: 0.002265  loss: 3.4475 (3.1096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8011 (0.8975)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [148]  [ 400/1251]  eta: 0:04:01  lr: 0.002262  min_lr: 0.002262  loss: 3.0549 (3.0939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7263 (0.8297)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [148]  [ 600/1251]  eta: 0:03:02  lr: 0.002258  min_lr: 0.002258  loss: 3.2816 (3.1064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6820 (0.8160)  time: 0.2726  data: 0.0005  max mem: 21847
Epoch: [148]  [ 800/1251]  eta: 0:02:05  lr: 0.002255  min_lr: 0.002255  loss: 3.3436 (3.1014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7701 (0.8229)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [148]  [1000/1251]  eta: 0:01:09  lr: 0.002251  min_lr: 0.002251  loss: 2.8378 (3.0971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7350 (0.8224)  time: 0.2743  data: 0.0005  max mem: 21847
Epoch: [148]  [1200/1251]  eta: 0:00:14  lr: 0.002248  min_lr: 0.002248  loss: 3.2460 (3.1016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8061 (0.8352)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 3.4917 (3.1077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.8392)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [148] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 3.4917 (3.0892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.8392)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7856 (0.7856)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.7771  data: 5.6228  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9572 (0.9570)  acc1: 82.4000 (81.9273)  acc5: 96.4000 (96.5091)  time: 0.7510  data: 0.6142  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.1378 (1.1143)  acc1: 75.2000 (78.4571)  acc5: 94.8000 (94.5143)  time: 0.2090  data: 0.0771  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.2302 (1.1256)  acc1: 75.2000 (78.3040)  acc5: 94.4000 (94.4800)  time: 0.2067  data: 0.0770  max mem: 21847
Test: Total time: 0:00:10 (0.4221 s / it)
* Acc@1 78.734 Acc@5 94.818 loss 1.116
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 79.16%
Epoch: [149]  [   0/1251]  eta: 1:09:11  lr: 0.002247  min_lr: 0.002247  loss: 3.6980 (3.6980)  weight_decay: 0.0500 (0.0500)  time: 3.3185  data: 2.9554  max mem: 21847
Epoch: [149]  [ 200/1251]  eta: 0:05:04  lr: 0.002243  min_lr: 0.002243  loss: 3.2779 (3.0854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7191 (0.7286)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [149]  [ 400/1251]  eta: 0:04:00  lr: 0.002240  min_lr: 0.002240  loss: 3.0484 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7165 (0.8136)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [149]  [ 600/1251]  eta: 0:03:02  lr: 0.002236  min_lr: 0.002236  loss: 3.1324 (3.1271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.8160)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [149]  [ 800/1251]  eta: 0:02:05  lr: 0.002232  min_lr: 0.002232  loss: 3.1876 (3.0967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.8093)  time: 0.2704  data: 0.0005  max mem: 21847
Epoch: [149]  [1000/1251]  eta: 0:01:09  lr: 0.002229  min_lr: 0.002229  loss: 3.2858 (3.0986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8201 (0.8307)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [149]  [1200/1251]  eta: 0:00:14  lr: 0.002225  min_lr: 0.002225  loss: 3.1354 (3.0958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7632 (0.8419)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 2.7205 (3.0923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.8448)  time: 0.2279  data: 0.0005  max mem: 21847
Epoch: [149] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 2.7205 (3.0842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.8448)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6485 (0.6485)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.6353  data: 5.4603  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8306 (0.8209)  acc1: 84.4000 (83.4545)  acc5: 96.8000 (97.0182)  time: 0.7183  data: 0.5806  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9888 (0.9720)  acc1: 79.2000 (79.7714)  acc5: 94.4000 (95.0286)  time: 0.1915  data: 0.0592  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0765 (0.9823)  acc1: 78.8000 (79.3120)  acc5: 94.0000 (95.0080)  time: 0.2006  data: 0.0701  max mem: 21847
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 79.340 Acc@5 95.076 loss 0.975
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.34%
Epoch: [150]  [   0/1251]  eta: 1:01:54  lr: 0.002224  min_lr: 0.002224  loss: 3.7035 (3.7035)  weight_decay: 0.0500 (0.0500)  time: 2.9693  data: 2.5986  max mem: 21847
Epoch: [150]  [ 200/1251]  eta: 0:05:01  lr: 0.002221  min_lr: 0.002221  loss: 2.7751 (3.0702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8574 (0.8685)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [150]  [ 400/1251]  eta: 0:03:59  lr: 0.002217  min_lr: 0.002217  loss: 3.2027 (3.0880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6825 (0.8497)  time: 0.2711  data: 0.0003  max mem: 21847
Epoch: [150]  [ 600/1251]  eta: 0:03:01  lr: 0.002214  min_lr: 0.002214  loss: 3.3147 (3.0861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8139 (0.8319)  time: 0.2764  data: 0.0003  max mem: 21847
Epoch: [150]  [ 800/1251]  eta: 0:02:05  lr: 0.002210  min_lr: 0.002210  loss: 3.0449 (3.0803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.8124)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [150]  [1000/1251]  eta: 0:01:09  lr: 0.002207  min_lr: 0.002207  loss: 3.4071 (3.0978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7689 (0.8287)  time: 0.2750  data: 0.0005  max mem: 21847
Epoch: [150]  [1200/1251]  eta: 0:00:14  lr: 0.002203  min_lr: 0.002203  loss: 3.2712 (3.0974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7223 (0.8258)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 3.1771 (3.0973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6800 (0.8196)  time: 0.2316  data: 0.0006  max mem: 21847
Epoch: [150] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 3.1771 (3.0890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6800 (0.8196)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7007 (0.7007)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.4946  data: 5.3502  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8951 (0.8561)  acc1: 82.8000 (82.9818)  acc5: 96.4000 (96.8000)  time: 0.7337  data: 0.5993  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0223 (1.0001)  acc1: 76.8000 (79.5619)  acc5: 94.8000 (95.2191)  time: 0.2087  data: 0.0783  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1083 (1.0153)  acc1: 76.4000 (78.8800)  acc5: 94.4000 (95.1200)  time: 0.2083  data: 0.0782  max mem: 21847
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 79.112 Acc@5 94.962 loss 1.007
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.34%
Epoch: [151]  [   0/1251]  eta: 1:07:09  lr: 0.002202  min_lr: 0.002202  loss: 1.9952 (1.9952)  weight_decay: 0.0500 (0.0500)  time: 3.2213  data: 1.6401  max mem: 21847
Epoch: [151]  [ 200/1251]  eta: 0:05:05  lr: 0.002198  min_lr: 0.002198  loss: 3.0701 (3.0870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7905 (0.8420)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [151]  [ 400/1251]  eta: 0:04:00  lr: 0.002195  min_lr: 0.002195  loss: 3.1074 (3.1154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.8122)  time: 0.2717  data: 0.0003  max mem: 21847
Epoch: [151]  [ 600/1251]  eta: 0:03:02  lr: 0.002191  min_lr: 0.002191  loss: 2.8025 (3.0884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7007 (0.8054)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [151]  [ 800/1251]  eta: 0:02:05  lr: 0.002188  min_lr: 0.002188  loss: 3.2987 (3.0851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7337 (0.8092)  time: 0.2826  data: 0.0005  max mem: 21847
Epoch: [151]  [1000/1251]  eta: 0:01:09  lr: 0.002184  min_lr: 0.002184  loss: 3.3262 (3.0878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8721 (0.8071)  time: 0.2730  data: 0.0003  max mem: 21847
Epoch: [151]  [1200/1251]  eta: 0:00:14  lr: 0.002181  min_lr: 0.002181  loss: 2.7584 (3.0911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8055 (0.8416)  time: 0.2785  data: 0.0003  max mem: 21847
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 3.2461 (3.0889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8620 (0.8462)  time: 0.2337  data: 0.0006  max mem: 21847
Epoch: [151] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 3.2461 (3.0835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8620 (0.8462)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6593 (0.6593)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.5780  data: 5.4302  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.9289 (0.8550)  acc1: 82.0000 (82.8727)  acc5: 97.2000 (96.6182)  time: 0.7351  data: 0.6019  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9740 (1.0011)  acc1: 77.6000 (79.4476)  acc5: 94.4000 (94.9143)  time: 0.2053  data: 0.0723  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0697 (1.0083)  acc1: 76.8000 (79.0400)  acc5: 94.0000 (94.8640)  time: 0.2274  data: 0.0958  max mem: 21847
Test: Total time: 0:00:10 (0.4299 s / it)
* Acc@1 79.412 Acc@5 95.012 loss 0.996
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.41%
Epoch: [152]  [   0/1251]  eta: 0:54:05  lr: 0.002180  min_lr: 0.002180  loss: 3.4960 (3.4960)  weight_decay: 0.0500 (0.0500)  time: 2.5939  data: 2.2226  max mem: 21847
Epoch: [152]  [ 200/1251]  eta: 0:05:01  lr: 0.002176  min_lr: 0.002176  loss: 3.4470 (3.1162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.7893)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [152]  [ 400/1251]  eta: 0:03:59  lr: 0.002173  min_lr: 0.002173  loss: 2.6784 (3.0675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7505 (0.8385)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [152]  [ 600/1251]  eta: 0:03:02  lr: 0.002169  min_lr: 0.002169  loss: 3.0157 (3.0573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.8483)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [152]  [ 800/1251]  eta: 0:02:05  lr: 0.002165  min_lr: 0.002165  loss: 3.3185 (3.0562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.8489)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [152]  [1000/1251]  eta: 0:01:09  lr: 0.002162  min_lr: 0.002162  loss: 2.8947 (3.0531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7020 (0.8322)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [152]  [1200/1251]  eta: 0:00:14  lr: 0.002158  min_lr: 0.002158  loss: 3.3473 (3.0628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7731 (0.8276)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 3.5297 (3.0636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8338 (0.8333)  time: 0.2281  data: 0.0005  max mem: 21847
Epoch: [152] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 3.5297 (3.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8338 (0.8333)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7080 (0.7080)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 5.5783  data: 5.4041  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9098 (0.8871)  acc1: 84.4000 (82.6545)  acc5: 96.8000 (96.6909)  time: 0.7209  data: 0.5852  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0139 (1.0233)  acc1: 77.2000 (79.3905)  acc5: 94.8000 (94.9524)  time: 0.2066  data: 0.0770  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0894 (1.0318)  acc1: 77.2000 (79.1520)  acc5: 94.4000 (94.8640)  time: 0.2040  data: 0.0756  max mem: 21847
Test: Total time: 0:00:10 (0.4122 s / it)
* Acc@1 79.288 Acc@5 95.010 loss 1.025
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.41%
Epoch: [153]  [   0/1251]  eta: 1:12:36  lr: 0.002157  min_lr: 0.002157  loss: 3.7478 (3.7478)  weight_decay: 0.0500 (0.0500)  time: 3.4823  data: 3.0847  max mem: 21847
Epoch: [153]  [ 200/1251]  eta: 0:05:05  lr: 0.002154  min_lr: 0.002154  loss: 3.3003 (3.0415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7850 (0.8938)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [153]  [ 400/1251]  eta: 0:04:00  lr: 0.002150  min_lr: 0.002150  loss: 3.4537 (3.0556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7917 (0.8994)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [153]  [ 600/1251]  eta: 0:03:02  lr: 0.002147  min_lr: 0.002147  loss: 2.9270 (3.0796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.8464)  time: 0.2800  data: 0.0004  max mem: 21847
Epoch: [153]  [ 800/1251]  eta: 0:02:05  lr: 0.002143  min_lr: 0.002143  loss: 2.6424 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8296 (0.8404)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [153]  [1000/1251]  eta: 0:01:09  lr: 0.002139  min_lr: 0.002139  loss: 2.9699 (3.0741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7154 (0.8342)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [153]  [1200/1251]  eta: 0:00:14  lr: 0.002136  min_lr: 0.002136  loss: 3.3994 (3.0619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6239 (nan)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 3.3923 (3.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5858 (nan)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [153] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 3.3923 (3.0753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5858 (nan)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6767 (0.6767)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.8039  data: 5.6534  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8791 (0.8427)  acc1: 81.6000 (83.2000)  acc5: 97.2000 (96.9091)  time: 0.7152  data: 0.5825  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9450 (0.9887)  acc1: 78.0000 (79.8857)  acc5: 94.4000 (94.9524)  time: 0.1883  data: 0.0592  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0948 (1.0007)  acc1: 76.8000 (79.4720)  acc5: 94.4000 (94.9440)  time: 0.2165  data: 0.0883  max mem: 21847
Test: Total time: 0:00:10 (0.4299 s / it)
* Acc@1 79.442 Acc@5 95.032 loss 0.997
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.44%
Epoch: [154]  [   0/1251]  eta: 1:01:19  lr: 0.002135  min_lr: 0.002135  loss: 2.0523 (2.0523)  weight_decay: 0.0500 (0.0500)  time: 2.9416  data: 2.5324  max mem: 21847
Epoch: [154]  [ 200/1251]  eta: 0:05:05  lr: 0.002131  min_lr: 0.002131  loss: 2.7777 (3.0185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (0.8900)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [154]  [ 400/1251]  eta: 0:03:59  lr: 0.002128  min_lr: 0.002128  loss: 3.2892 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7679 (0.8705)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [154]  [ 600/1251]  eta: 0:03:01  lr: 0.002124  min_lr: 0.002124  loss: 3.2083 (3.0499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6995 (0.8816)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [154]  [ 800/1251]  eta: 0:02:05  lr: 0.002121  min_lr: 0.002121  loss: 2.7285 (3.0657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7151 (0.8767)  time: 0.2861  data: 0.0004  max mem: 21847
Epoch: [154]  [1000/1251]  eta: 0:01:09  lr: 0.002117  min_lr: 0.002117  loss: 3.3427 (3.0771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8906 (0.8721)  time: 0.2701  data: 0.0004  max mem: 21847
Epoch: [154]  [1200/1251]  eta: 0:00:14  lr: 0.002113  min_lr: 0.002113  loss: 3.1169 (3.0748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6680 (0.8757)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 3.4965 (3.0817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8752)  time: 0.2276  data: 0.0006  max mem: 21847
Epoch: [154] Total time: 0:05:45 (0.2759 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 3.4965 (3.0683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8752)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6369 (0.6369)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.6175  data: 5.4564  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8319 (0.8267)  acc1: 84.0000 (82.2545)  acc5: 97.6000 (97.0909)  time: 0.7199  data: 0.5850  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9560 (0.9824)  acc1: 76.4000 (79.3143)  acc5: 94.4000 (95.2191)  time: 0.1993  data: 0.0695  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1143 (0.9970)  acc1: 76.4000 (78.7040)  acc5: 94.4000 (95.1040)  time: 0.2139  data: 0.0849  max mem: 21847
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 79.182 Acc@5 94.948 loss 0.992
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.44%
Epoch: [155]  [   0/1251]  eta: 1:11:24  lr: 0.002113  min_lr: 0.002113  loss: 3.8125 (3.8125)  weight_decay: 0.0500 (0.0500)  time: 3.4245  data: 2.7080  max mem: 21847
Epoch: [155]  [ 200/1251]  eta: 0:05:07  lr: 0.002109  min_lr: 0.002109  loss: 3.4020 (3.0511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7825 (0.9027)  time: 0.2715  data: 0.0005  max mem: 21847
Epoch: [155]  [ 400/1251]  eta: 0:04:02  lr: 0.002105  min_lr: 0.002105  loss: 3.1750 (3.0526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8257 (0.9208)  time: 0.2850  data: 0.0005  max mem: 21847
Epoch: [155]  [ 600/1251]  eta: 0:03:03  lr: 0.002102  min_lr: 0.002102  loss: 2.8259 (3.0327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6919 (0.9134)  time: 0.2754  data: 0.0004  max mem: 21847
Epoch: [155]  [ 800/1251]  eta: 0:02:06  lr: 0.002098  min_lr: 0.002098  loss: 3.6348 (3.0458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8075 (0.8970)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [155]  [1000/1251]  eta: 0:01:10  lr: 0.002095  min_lr: 0.002095  loss: 3.0511 (3.0337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.8646)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [155]  [1200/1251]  eta: 0:00:14  lr: 0.002091  min_lr: 0.002091  loss: 3.2368 (3.0521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7803 (0.8546)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 2.4956 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7891 (0.8529)  time: 0.2286  data: 0.0007  max mem: 21847
Epoch: [155] Total time: 0:05:48 (0.2784 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 2.4956 (3.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7891 (0.8529)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5859 (0.5859)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.5954  data: 5.4330  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8198 (0.7804)  acc1: 82.8000 (83.3818)  acc5: 97.2000 (97.1273)  time: 0.6479  data: 0.5130  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9474 (0.9243)  acc1: 77.2000 (79.6571)  acc5: 95.2000 (95.3905)  time: 0.1633  data: 0.0333  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0189 (0.9351)  acc1: 76.4000 (79.3440)  acc5: 94.8000 (95.2800)  time: 0.1733  data: 0.0441  max mem: 21847
Test: Total time: 0:00:09 (0.3868 s / it)
* Acc@1 79.548 Acc@5 95.094 loss 0.932
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.55%
Epoch: [156]  [   0/1251]  eta: 1:04:45  lr: 0.002090  min_lr: 0.002090  loss: 2.8813 (2.8813)  weight_decay: 0.0500 (0.0500)  time: 3.1059  data: 2.7525  max mem: 21847
Epoch: [156]  [ 200/1251]  eta: 0:05:01  lr: 0.002087  min_lr: 0.002087  loss: 3.0894 (3.0617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7851 (0.8727)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [156]  [ 400/1251]  eta: 0:04:00  lr: 0.002083  min_lr: 0.002083  loss: 3.1785 (3.0699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7791 (0.8403)  time: 0.2823  data: 0.0005  max mem: 21847
Epoch: [156]  [ 600/1251]  eta: 0:03:02  lr: 0.002079  min_lr: 0.002079  loss: 3.3597 (3.0862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8211 (0.8241)  time: 0.2734  data: 0.0005  max mem: 21847
Epoch: [156]  [ 800/1251]  eta: 0:02:05  lr: 0.002076  min_lr: 0.002076  loss: 3.3534 (3.0915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8155 (0.8265)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [156]  [1000/1251]  eta: 0:01:09  lr: 0.002072  min_lr: 0.002072  loss: 3.1162 (3.1004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6622 (0.8224)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [156]  [1200/1251]  eta: 0:00:14  lr: 0.002069  min_lr: 0.002069  loss: 2.8145 (3.1060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8163)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 2.9066 (3.0974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7472 (0.8141)  time: 0.2284  data: 0.0007  max mem: 21847
Epoch: [156] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 2.9066 (3.0671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7472 (0.8141)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.6257 (0.6257)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 4.6366  data: 4.4802  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8243 (0.8172)  acc1: 82.0000 (82.8000)  acc5: 97.2000 (96.7636)  time: 0.6713  data: 0.5364  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9533 (0.9604)  acc1: 78.8000 (79.6191)  acc5: 94.8000 (94.9714)  time: 0.2200  data: 0.0896  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0085 (0.9718)  acc1: 78.0000 (79.1840)  acc5: 94.0000 (94.9440)  time: 0.2149  data: 0.0862  max mem: 21847
Test: Total time: 0:00:09 (0.3940 s / it)
* Acc@1 79.434 Acc@5 95.136 loss 0.961
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.55%
Epoch: [157]  [   0/1251]  eta: 1:06:16  lr: 0.002068  min_lr: 0.002068  loss: 3.7325 (3.7325)  weight_decay: 0.0500 (0.0500)  time: 3.1786  data: 1.6329  max mem: 21847
Epoch: [157]  [ 200/1251]  eta: 0:05:05  lr: 0.002064  min_lr: 0.002064  loss: 3.1434 (3.0144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (0.8607)  time: 0.2857  data: 0.0008  max mem: 21847
Epoch: [157]  [ 400/1251]  eta: 0:03:59  lr: 0.002061  min_lr: 0.002061  loss: 2.9491 (3.0617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6945 (0.8375)  time: 0.2827  data: 0.0007  max mem: 21847
Epoch: [157]  [ 600/1251]  eta: 0:03:01  lr: 0.002057  min_lr: 0.002057  loss: 3.2173 (3.0691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7241 (0.8522)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [157]  [ 800/1251]  eta: 0:02:05  lr: 0.002053  min_lr: 0.002053  loss: 3.2234 (3.0587)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2776  data: 0.0004  max mem: 21847
Epoch: [157]  [1000/1251]  eta: 0:01:09  lr: 0.002050  min_lr: 0.002050  loss: 3.1529 (3.0593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8446 (nan)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [157]  [1200/1251]  eta: 0:00:14  lr: 0.002046  min_lr: 0.002046  loss: 3.1998 (3.0592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8705 (nan)  time: 0.2765  data: 0.0005  max mem: 21847
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 2.8843 (3.0566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7381 (nan)  time: 0.2404  data: 0.0006  max mem: 21847
Epoch: [157] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 2.8843 (3.0577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7381 (nan)
Test:  [ 0/25]  eta: 0:01:30  loss: 0.6313 (0.6313)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 3.6399  data: 3.4745  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8467 (0.8005)  acc1: 83.2000 (83.2727)  acc5: 97.2000 (96.8000)  time: 0.6081  data: 0.4715  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9942 (0.9531)  acc1: 79.2000 (79.7143)  acc5: 95.2000 (95.2191)  time: 0.2679  data: 0.1362  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0978 (0.9664)  acc1: 77.6000 (79.3120)  acc5: 94.8000 (95.0880)  time: 0.2124  data: 0.0816  max mem: 21847
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 79.452 Acc@5 94.996 loss 0.960
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.55%
Epoch: [158]  [   0/1251]  eta: 1:13:20  lr: 0.002045  min_lr: 0.002045  loss: 3.6285 (3.6285)  weight_decay: 0.0500 (0.0500)  time: 3.5178  data: 3.1309  max mem: 21847
Epoch: [158]  [ 200/1251]  eta: 0:05:06  lr: 0.002042  min_lr: 0.002042  loss: 3.2544 (3.0156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8789)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [158]  [ 400/1251]  eta: 0:04:00  lr: 0.002038  min_lr: 0.002038  loss: 2.8881 (3.0363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7231 (0.8077)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [158]  [ 600/1251]  eta: 0:03:02  lr: 0.002035  min_lr: 0.002035  loss: 3.0071 (3.0306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.7927)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [158]  [ 800/1251]  eta: 0:02:05  lr: 0.002031  min_lr: 0.002031  loss: 2.7402 (3.0508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (0.8145)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [158]  [1000/1251]  eta: 0:01:09  lr: 0.002027  min_lr: 0.002027  loss: 2.8648 (3.0497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8149 (0.8279)  time: 0.2916  data: 0.0004  max mem: 21847
Epoch: [158]  [1200/1251]  eta: 0:00:14  lr: 0.002024  min_lr: 0.002024  loss: 2.7312 (3.0562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7723 (0.8217)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 2.7978 (3.0529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.8158)  time: 0.2277  data: 0.0005  max mem: 21847
Epoch: [158] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 2.7978 (3.0497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.8158)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6121 (0.6121)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.5910  data: 5.4270  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8352 (0.8182)  acc1: 84.8000 (83.9636)  acc5: 97.2000 (97.0545)  time: 0.7587  data: 0.6236  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9790 (0.9741)  acc1: 78.4000 (80.4571)  acc5: 94.8000 (95.2000)  time: 0.2079  data: 0.0780  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0925 (0.9850)  acc1: 78.0000 (80.0480)  acc5: 93.6000 (95.1040)  time: 0.2070  data: 0.0779  max mem: 21847
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 79.582 Acc@5 94.990 loss 0.990
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.58%
Epoch: [159]  [   0/1251]  eta: 0:59:20  lr: 0.002023  min_lr: 0.002023  loss: 2.9944 (2.9944)  weight_decay: 0.0500 (0.0500)  time: 2.8461  data: 2.4668  max mem: 21847
Epoch: [159]  [ 200/1251]  eta: 0:05:02  lr: 0.002019  min_lr: 0.002019  loss: 3.3017 (3.0721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (1.0178)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [159]  [ 400/1251]  eta: 0:04:00  lr: 0.002016  min_lr: 0.002016  loss: 3.4380 (3.0848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9394 (0.9695)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [159]  [ 600/1251]  eta: 0:03:01  lr: 0.002012  min_lr: 0.002012  loss: 3.2090 (3.0743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.9320)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [159]  [ 800/1251]  eta: 0:02:05  lr: 0.002009  min_lr: 0.002009  loss: 3.3179 (3.0661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9268 (0.9157)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [159]  [1000/1251]  eta: 0:01:09  lr: 0.002005  min_lr: 0.002005  loss: 3.1606 (3.0633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6562 (0.8916)  time: 0.2793  data: 0.0004  max mem: 21847
Epoch: [159]  [1200/1251]  eta: 0:00:14  lr: 0.002001  min_lr: 0.002001  loss: 2.8591 (3.0757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8962)  time: 0.2752  data: 0.0004  max mem: 21847
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 2.7723 (3.0689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8915)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [159] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 2.7723 (3.0620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8915)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6231 (0.6231)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5201  data: 5.3748  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7942 (0.7901)  acc1: 84.0000 (83.1273)  acc5: 97.6000 (97.3091)  time: 0.7211  data: 0.5876  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9756 (0.9436)  acc1: 77.2000 (79.8857)  acc5: 95.2000 (95.2952)  time: 0.2029  data: 0.0730  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0656 (0.9535)  acc1: 77.2000 (79.4720)  acc5: 94.4000 (95.1680)  time: 0.2013  data: 0.0729  max mem: 21847
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 79.638 Acc@5 95.118 loss 0.948
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.64%
Epoch: [160]  [   0/1251]  eta: 1:09:03  lr: 0.002001  min_lr: 0.002001  loss: 1.9971 (1.9971)  weight_decay: 0.0500 (0.0500)  time: 3.3119  data: 3.0108  max mem: 21847
Epoch: [160]  [ 200/1251]  eta: 0:05:04  lr: 0.001997  min_lr: 0.001997  loss: 2.4200 (3.0085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8412 (0.9441)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [160]  [ 400/1251]  eta: 0:04:00  lr: 0.001993  min_lr: 0.001993  loss: 2.7587 (2.9617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.8925)  time: 0.2851  data: 0.0004  max mem: 21847
Epoch: [160]  [ 600/1251]  eta: 0:03:02  lr: 0.001990  min_lr: 0.001990  loss: 3.4068 (3.0092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8769 (0.8746)  time: 0.2837  data: 0.0004  max mem: 21847
Epoch: [160]  [ 800/1251]  eta: 0:02:05  lr: 0.001986  min_lr: 0.001986  loss: 2.7825 (3.0335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7257 (0.8472)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [160]  [1000/1251]  eta: 0:01:09  lr: 0.001983  min_lr: 0.001983  loss: 3.4169 (3.0410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6706 (0.8205)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [160]  [1200/1251]  eta: 0:00:14  lr: 0.001979  min_lr: 0.001979  loss: 3.3654 (3.0529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.8427)  time: 0.2819  data: 0.0005  max mem: 21847
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 3.4097 (3.0584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.8470)  time: 0.2280  data: 0.0005  max mem: 21847
Epoch: [160] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 3.4097 (3.0455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.8470)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6804 (0.6804)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.7631  data: 5.5937  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8883 (0.8644)  acc1: 84.0000 (82.7636)  acc5: 96.8000 (96.7273)  time: 0.7223  data: 0.5866  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0472 (1.0086)  acc1: 76.8000 (79.4857)  acc5: 94.8000 (94.9905)  time: 0.1808  data: 0.0510  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0984 (1.0163)  acc1: 76.0000 (79.1680)  acc5: 94.4000 (94.9760)  time: 0.1917  data: 0.0626  max mem: 21847
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 79.458 Acc@5 94.954 loss 1.016
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.64%
Epoch: [161]  [   0/1251]  eta: 1:12:10  lr: 0.001978  min_lr: 0.001978  loss: 3.1866 (3.1866)  weight_decay: 0.0500 (0.0500)  time: 3.4619  data: 2.8645  max mem: 21847
Epoch: [161]  [ 200/1251]  eta: 0:05:06  lr: 0.001974  min_lr: 0.001974  loss: 3.0509 (3.0449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8067 (0.8170)  time: 0.2859  data: 0.0004  max mem: 21847
Epoch: [161]  [ 400/1251]  eta: 0:04:00  lr: 0.001971  min_lr: 0.001971  loss: 2.8104 (3.0428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7527 (0.7922)  time: 0.2762  data: 0.0005  max mem: 21847
Epoch: [161]  [ 600/1251]  eta: 0:03:02  lr: 0.001967  min_lr: 0.001967  loss: 3.3656 (3.0789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8312 (0.8422)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [161]  [ 800/1251]  eta: 0:02:05  lr: 0.001964  min_lr: 0.001964  loss: 3.4105 (3.0805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7776 (0.8295)  time: 0.2786  data: 0.0004  max mem: 21847
Epoch: [161]  [1000/1251]  eta: 0:01:09  lr: 0.001960  min_lr: 0.001960  loss: 3.1543 (3.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7555 (0.8230)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [161]  [1200/1251]  eta: 0:00:14  lr: 0.001956  min_lr: 0.001956  loss: 3.2809 (3.0753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7530 (0.8180)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 3.2362 (3.0770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.8190)  time: 0.2280  data: 0.0010  max mem: 21847
Epoch: [161] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 3.2362 (3.0568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.8190)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6416 (0.6416)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.6984  data: 5.5229  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8494 (0.8499)  acc1: 84.0000 (83.7818)  acc5: 96.8000 (96.8727)  time: 0.7444  data: 0.6099  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0032 (0.9953)  acc1: 78.4000 (80.2286)  acc5: 95.2000 (95.0476)  time: 0.2094  data: 0.0787  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0757 (0.9996)  acc1: 76.4000 (79.8240)  acc5: 95.2000 (95.1040)  time: 0.2088  data: 0.0786  max mem: 21847
Test: Total time: 0:00:10 (0.4192 s / it)
* Acc@1 79.624 Acc@5 95.084 loss 0.999
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.64%
Epoch: [162]  [   0/1251]  eta: 1:01:07  lr: 0.001956  min_lr: 0.001956  loss: 2.3216 (2.3216)  weight_decay: 0.0500 (0.0500)  time: 2.9314  data: 2.2334  max mem: 21847
Epoch: [162]  [ 200/1251]  eta: 0:05:05  lr: 0.001952  min_lr: 0.001952  loss: 3.2444 (3.0699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8576 (0.8795)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [162]  [ 400/1251]  eta: 0:04:00  lr: 0.001948  min_lr: 0.001948  loss: 3.2249 (3.0608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9050 (0.9087)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [162]  [ 600/1251]  eta: 0:03:02  lr: 0.001945  min_lr: 0.001945  loss: 2.5917 (3.0505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7795 (0.8759)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [162]  [ 800/1251]  eta: 0:02:05  lr: 0.001941  min_lr: 0.001941  loss: 3.5079 (3.0502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6567 (0.8574)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [162]  [1000/1251]  eta: 0:01:09  lr: 0.001938  min_lr: 0.001938  loss: 3.1985 (3.0516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9308 (0.8696)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [162]  [1200/1251]  eta: 0:00:14  lr: 0.001934  min_lr: 0.001934  loss: 3.1157 (3.0350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7882 (0.8672)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 2.8699 (3.0363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7626 (0.8620)  time: 0.2299  data: 0.0006  max mem: 21847
Epoch: [162] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 2.8699 (3.0402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7626 (0.8620)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5910 (0.5910)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5578  data: 5.3836  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7821 (0.7921)  acc1: 82.0000 (82.9455)  acc5: 96.8000 (96.9455)  time: 0.7037  data: 0.5675  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9603 (0.9465)  acc1: 77.6000 (79.7143)  acc5: 95.6000 (95.2952)  time: 0.1973  data: 0.0670  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0361 (0.9583)  acc1: 77.2000 (79.2480)  acc5: 94.4000 (95.1680)  time: 0.1951  data: 0.0660  max mem: 21847
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 79.432 Acc@5 95.096 loss 0.954
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.64%
Epoch: [163]  [   0/1251]  eta: 1:04:44  lr: 0.001933  min_lr: 0.001933  loss: 3.3498 (3.3498)  weight_decay: 0.0500 (0.0500)  time: 3.1054  data: 1.8381  max mem: 21847
Epoch: [163]  [ 200/1251]  eta: 0:05:03  lr: 0.001930  min_lr: 0.001930  loss: 2.8217 (3.0982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8080 (0.8445)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [163]  [ 400/1251]  eta: 0:04:00  lr: 0.001926  min_lr: 0.001926  loss: 2.7088 (3.0802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.9046)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [163]  [ 600/1251]  eta: 0:03:02  lr: 0.001922  min_lr: 0.001922  loss: 3.4044 (3.0853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9069 (0.8873)  time: 0.2733  data: 0.0005  max mem: 21847
Epoch: [163]  [ 800/1251]  eta: 0:02:05  lr: 0.001919  min_lr: 0.001919  loss: 2.6975 (3.0800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7271 (0.8878)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [163]  [1000/1251]  eta: 0:01:09  lr: 0.001915  min_lr: 0.001915  loss: 3.4209 (3.0760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8966 (0.8759)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [163]  [1200/1251]  eta: 0:00:14  lr: 0.001912  min_lr: 0.001912  loss: 3.1963 (3.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7440 (0.8836)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 2.8382 (3.0739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7372 (0.8780)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [163] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 2.8382 (3.0532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7372 (0.8780)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6093 (0.6093)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.6329  data: 5.4799  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8603 (0.8138)  acc1: 83.2000 (83.3091)  acc5: 96.8000 (96.9818)  time: 0.7351  data: 0.6008  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9574 (0.9643)  acc1: 78.0000 (79.7905)  acc5: 95.6000 (95.4095)  time: 0.1980  data: 0.0671  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0278 (0.9699)  acc1: 77.2000 (79.6160)  acc5: 94.8000 (95.3440)  time: 0.1969  data: 0.0669  max mem: 21847
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 79.796 Acc@5 95.208 loss 0.969
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [164]  [   0/1251]  eta: 1:02:19  lr: 0.001911  min_lr: 0.001911  loss: 3.6833 (3.6833)  weight_decay: 0.0500 (0.0500)  time: 2.9892  data: 2.6399  max mem: 21847
Epoch: [164]  [ 200/1251]  eta: 0:05:02  lr: 0.001907  min_lr: 0.001907  loss: 3.0967 (3.0958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6755 (0.7963)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [164]  [ 400/1251]  eta: 0:03:59  lr: 0.001904  min_lr: 0.001904  loss: 2.8896 (3.0679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7069 (0.7790)  time: 0.2726  data: 0.0005  max mem: 21847
Epoch: [164]  [ 600/1251]  eta: 0:03:01  lr: 0.001900  min_lr: 0.001900  loss: 3.4010 (3.0516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6663 (0.7932)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [164]  [ 800/1251]  eta: 0:02:05  lr: 0.001896  min_lr: 0.001896  loss: 3.1181 (3.0415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (0.7985)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [164]  [1000/1251]  eta: 0:01:09  lr: 0.001893  min_lr: 0.001893  loss: 3.4750 (3.0410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8833 (0.8264)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [164]  [1200/1251]  eta: 0:00:14  lr: 0.001889  min_lr: 0.001889  loss: 2.8051 (3.0389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8955 (0.8772)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 3.2397 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.8789)  time: 0.2283  data: 0.0005  max mem: 21847
Epoch: [164] Total time: 0:05:45 (0.2765 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 3.2397 (3.0307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.8789)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.6191 (0.6191)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 6.0475  data: 5.8962  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8824 (0.8311)  acc1: 83.6000 (83.4182)  acc5: 97.2000 (97.0182)  time: 0.7611  data: 0.6281  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0398 (0.9742)  acc1: 78.8000 (80.0952)  acc5: 95.2000 (95.5048)  time: 0.1943  data: 0.0631  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0398 (0.9812)  acc1: 78.0000 (79.9360)  acc5: 94.8000 (95.5040)  time: 0.1924  data: 0.0630  max mem: 21847
Test: Total time: 0:00:10 (0.4217 s / it)
* Acc@1 79.678 Acc@5 95.254 loss 0.987
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.80%
Epoch: [165]  [   0/1251]  eta: 1:08:16  lr: 0.001888  min_lr: 0.001888  loss: 3.5332 (3.5332)  weight_decay: 0.0500 (0.0500)  time: 3.2745  data: 2.7166  max mem: 21847
Epoch: [165]  [ 200/1251]  eta: 0:05:05  lr: 0.001885  min_lr: 0.001885  loss: 2.7841 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.8422)  time: 0.2744  data: 0.0005  max mem: 21847
Epoch: [165]  [ 400/1251]  eta: 0:04:01  lr: 0.001881  min_lr: 0.001881  loss: 3.1878 (3.0508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9671 (0.8842)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [165]  [ 600/1251]  eta: 0:03:02  lr: 0.001878  min_lr: 0.001878  loss: 3.0290 (3.0459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6864 (0.8542)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [165]  [ 800/1251]  eta: 0:02:05  lr: 0.001874  min_lr: 0.001874  loss: 2.9883 (3.0344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8868 (0.8699)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [165]  [1000/1251]  eta: 0:01:09  lr: 0.001870  min_lr: 0.001870  loss: 3.0609 (3.0346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8029 (0.8733)  time: 0.2703  data: 0.0004  max mem: 21847
Epoch: [165]  [1200/1251]  eta: 0:00:14  lr: 0.001867  min_lr: 0.001867  loss: 3.2034 (3.0407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.8613)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 3.2574 (3.0455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6610 (0.8549)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [165] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 3.2574 (3.0328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6610 (0.8549)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.6280 (0.6280)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 3.3301  data: 3.1594  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8603 (0.8435)  acc1: 84.0000 (83.3818)  acc5: 97.6000 (97.2727)  time: 0.6329  data: 0.4975  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9783 (0.9950)  acc1: 78.0000 (79.8857)  acc5: 95.2000 (95.5619)  time: 0.2844  data: 0.1520  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0985 (1.0047)  acc1: 77.2000 (79.7120)  acc5: 94.8000 (95.6000)  time: 0.2208  data: 0.0891  max mem: 21847
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 79.804 Acc@5 95.346 loss 1.005
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [166]  [   0/1251]  eta: 1:11:39  lr: 0.001866  min_lr: 0.001866  loss: 3.9596 (3.9596)  weight_decay: 0.0500 (0.0500)  time: 3.4369  data: 3.1571  max mem: 21847
Epoch: [166]  [ 200/1251]  eta: 0:05:03  lr: 0.001862  min_lr: 0.001862  loss: 2.8584 (3.0559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7796 (0.8345)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [166]  [ 400/1251]  eta: 0:03:59  lr: 0.001859  min_lr: 0.001859  loss: 3.0818 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.8848)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [166]  [ 600/1251]  eta: 0:03:02  lr: 0.001855  min_lr: 0.001855  loss: 3.2225 (3.0476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (0.8637)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [166]  [ 800/1251]  eta: 0:02:05  lr: 0.001852  min_lr: 0.001852  loss: 2.9842 (3.0485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8233 (0.8602)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [166]  [1000/1251]  eta: 0:01:09  lr: 0.001848  min_lr: 0.001848  loss: 2.7130 (3.0384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7843 (0.8604)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [166]  [1200/1251]  eta: 0:00:14  lr: 0.001844  min_lr: 0.001844  loss: 3.2860 (3.0320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8524 (0.8559)  time: 0.2841  data: 0.0004  max mem: 21847
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 3.1321 (3.0339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9918 (0.8609)  time: 0.2339  data: 0.0009  max mem: 21847
Epoch: [166] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 3.1321 (3.0236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9918 (0.8609)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6080 (0.6080)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.5779  data: 5.4286  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8294 (0.8188)  acc1: 83.6000 (83.1636)  acc5: 97.6000 (97.4909)  time: 0.7545  data: 0.6181  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9685 (0.9805)  acc1: 78.0000 (79.7143)  acc5: 95.6000 (95.5810)  time: 0.2063  data: 0.0751  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0626 (0.9850)  acc1: 78.0000 (79.4880)  acc5: 94.8000 (95.5520)  time: 0.2027  data: 0.0750  max mem: 21847
Test: Total time: 0:00:10 (0.4122 s / it)
* Acc@1 79.778 Acc@5 95.234 loss 0.985
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [167]  [   0/1251]  eta: 1:06:52  lr: 0.001844  min_lr: 0.001844  loss: 3.5297 (3.5297)  weight_decay: 0.0500 (0.0500)  time: 3.2074  data: 2.8134  max mem: 21847
Epoch: [167]  [ 200/1251]  eta: 0:05:04  lr: 0.001840  min_lr: 0.001840  loss: 3.4381 (3.0107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8660 (0.8476)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [167]  [ 400/1251]  eta: 0:04:00  lr: 0.001836  min_lr: 0.001836  loss: 2.8684 (2.9911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7381 (0.9010)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [167]  [ 600/1251]  eta: 0:03:02  lr: 0.001833  min_lr: 0.001833  loss: 3.0997 (3.0206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (0.8760)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [167]  [ 800/1251]  eta: 0:02:05  lr: 0.001829  min_lr: 0.001829  loss: 3.2212 (3.0132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7480 (0.8646)  time: 0.2712  data: 0.0005  max mem: 21847
Epoch: [167]  [1000/1251]  eta: 0:01:09  lr: 0.001826  min_lr: 0.001826  loss: 3.2887 (3.0135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.8611)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [167]  [1200/1251]  eta: 0:00:14  lr: 0.001822  min_lr: 0.001822  loss: 3.4442 (3.0317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8076 (0.8555)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 2.5892 (3.0267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.8521)  time: 0.2313  data: 0.0007  max mem: 21847
Epoch: [167] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 2.5892 (3.0331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.8521)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6116 (0.6116)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 5.8535  data: 5.7058  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7907 (0.7977)  acc1: 84.4000 (83.4909)  acc5: 97.6000 (97.1273)  time: 0.6993  data: 0.5669  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9681 (0.9445)  acc1: 77.6000 (79.6381)  acc5: 95.6000 (95.3905)  time: 0.1737  data: 0.0445  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0278 (0.9533)  acc1: 77.2000 (79.2800)  acc5: 95.2000 (95.3760)  time: 0.1720  data: 0.0436  max mem: 21847
Test: Total time: 0:00:09 (0.3974 s / it)
* Acc@1 79.860 Acc@5 95.210 loss 0.944
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.86%
Epoch: [168]  [   0/1251]  eta: 1:06:19  lr: 0.001821  min_lr: 0.001821  loss: 3.1743 (3.1743)  weight_decay: 0.0500 (0.0500)  time: 3.1812  data: 2.8924  max mem: 21847
Epoch: [168]  [ 200/1251]  eta: 0:05:01  lr: 0.001818  min_lr: 0.001818  loss: 3.0380 (2.9956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8433 (1.1339)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [168]  [ 400/1251]  eta: 0:03:59  lr: 0.001814  min_lr: 0.001814  loss: 3.3891 (3.0347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8942 (0.9848)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [168]  [ 600/1251]  eta: 0:03:01  lr: 0.001811  min_lr: 0.001811  loss: 3.1171 (3.0474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7124 (0.9746)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [168]  [ 800/1251]  eta: 0:02:05  lr: 0.001807  min_lr: 0.001807  loss: 3.1808 (3.0329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7169 (0.9520)  time: 0.2858  data: 0.0005  max mem: 21847
Epoch: [168]  [1000/1251]  eta: 0:01:09  lr: 0.001803  min_lr: 0.001803  loss: 2.9010 (3.0262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7917 (0.9276)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [168]  [1200/1251]  eta: 0:00:14  lr: 0.001800  min_lr: 0.001800  loss: 3.0999 (3.0339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7472 (0.9205)  time: 0.2883  data: 0.0004  max mem: 21847
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 2.7502 (3.0292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.9189)  time: 0.2285  data: 0.0007  max mem: 21847
Epoch: [168] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 2.7502 (3.0203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.9189)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6027 (0.6027)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.4337  data: 5.2817  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7858 (0.7849)  acc1: 83.6000 (83.7091)  acc5: 97.2000 (97.3455)  time: 0.6686  data: 0.5363  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9717 (0.9421)  acc1: 77.2000 (79.8476)  acc5: 94.8000 (95.3333)  time: 0.1787  data: 0.0498  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0095 (0.9471)  acc1: 77.2000 (79.5520)  acc5: 94.4000 (95.3440)  time: 0.1916  data: 0.0641  max mem: 21847
Test: Total time: 0:00:09 (0.3952 s / it)
* Acc@1 79.764 Acc@5 95.174 loss 0.944
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.86%
Epoch: [169]  [   0/1251]  eta: 1:08:32  lr: 0.001799  min_lr: 0.001799  loss: 3.3227 (3.3227)  weight_decay: 0.0500 (0.0500)  time: 3.2875  data: 2.8283  max mem: 21847
Epoch: [169]  [ 200/1251]  eta: 0:05:04  lr: 0.001795  min_lr: 0.001795  loss: 2.8701 (3.0185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8491 (0.7951)  time: 0.2832  data: 0.0004  max mem: 21847
Epoch: [169]  [ 400/1251]  eta: 0:03:59  lr: 0.001792  min_lr: 0.001792  loss: 2.8753 (3.0020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7886 (0.8513)  time: 0.2702  data: 0.0004  max mem: 21847
Epoch: [169]  [ 600/1251]  eta: 0:03:01  lr: 0.001788  min_lr: 0.001788  loss: 2.5138 (2.9911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7488 (0.8211)  time: 0.2725  data: 0.0003  max mem: 21847
Epoch: [169]  [ 800/1251]  eta: 0:02:05  lr: 0.001785  min_lr: 0.001785  loss: 3.0895 (3.0023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7886 (0.8368)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [169]  [1000/1251]  eta: 0:01:09  lr: 0.001781  min_lr: 0.001781  loss: 3.1302 (3.0189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (0.8434)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [169]  [1200/1251]  eta: 0:00:14  lr: 0.001777  min_lr: 0.001777  loss: 3.2117 (3.0273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8295 (nan)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.2672 (3.0320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8757 (nan)  time: 0.2283  data: 0.0005  max mem: 21847
Epoch: [169] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.2672 (3.0173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8757 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6963 (0.6963)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.5270  data: 5.3681  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8414 (0.8673)  acc1: 82.8000 (82.7273)  acc5: 97.2000 (96.9091)  time: 0.6472  data: 0.5108  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0210 (1.0015)  acc1: 78.0000 (79.6191)  acc5: 94.4000 (95.1048)  time: 0.1682  data: 0.0372  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0403 (1.0060)  acc1: 77.2000 (79.3600)  acc5: 95.2000 (95.2160)  time: 0.1964  data: 0.0671  max mem: 21847
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 79.780 Acc@5 95.200 loss 1.000
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.86%
Epoch: [170]  [   0/1251]  eta: 1:05:14  lr: 0.001777  min_lr: 0.001777  loss: 3.4403 (3.4403)  weight_decay: 0.0500 (0.0500)  time: 3.1294  data: 2.6860  max mem: 21847
Epoch: [170]  [ 200/1251]  eta: 0:05:05  lr: 0.001773  min_lr: 0.001773  loss: 3.2721 (3.0020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6945 (0.7773)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [170]  [ 400/1251]  eta: 0:04:00  lr: 0.001769  min_lr: 0.001769  loss: 2.4367 (2.9957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7956 (0.8092)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [170]  [ 600/1251]  eta: 0:03:02  lr: 0.001766  min_lr: 0.001766  loss: 3.5216 (2.9986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.8235)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [170]  [ 800/1251]  eta: 0:02:05  lr: 0.001762  min_lr: 0.001762  loss: 3.3729 (3.0231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0283 (0.8505)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [170]  [1000/1251]  eta: 0:01:09  lr: 0.001759  min_lr: 0.001759  loss: 3.2954 (3.0204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8770 (0.8526)  time: 0.2710  data: 0.0005  max mem: 21847
Epoch: [170]  [1200/1251]  eta: 0:00:14  lr: 0.001755  min_lr: 0.001755  loss: 2.4170 (3.0189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7715 (0.8600)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 2.7422 (3.0191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.8579)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [170] Total time: 0:05:45 (0.2766 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 2.7422 (3.0127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.8579)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5692 (0.5692)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.5932  data: 5.4409  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7343 (0.7560)  acc1: 84.8000 (83.5636)  acc5: 97.6000 (97.1636)  time: 0.7487  data: 0.6146  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9294 (0.9079)  acc1: 78.8000 (80.1524)  acc5: 95.2000 (95.4095)  time: 0.2303  data: 0.1003  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9867 (0.9166)  acc1: 78.4000 (79.8720)  acc5: 94.8000 (95.2800)  time: 0.2286  data: 0.1002  max mem: 21847
Test: Total time: 0:00:10 (0.4316 s / it)
* Acc@1 79.920 Acc@5 95.286 loss 0.909
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.92%
Epoch: [171]  [   0/1251]  eta: 1:04:30  lr: 0.001754  min_lr: 0.001754  loss: 2.7334 (2.7334)  weight_decay: 0.0500 (0.0500)  time: 3.0940  data: 2.7562  max mem: 21847
Epoch: [171]  [ 200/1251]  eta: 0:05:04  lr: 0.001751  min_lr: 0.001751  loss: 2.5441 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9500 (0.9825)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [171]  [ 400/1251]  eta: 0:04:00  lr: 0.001747  min_lr: 0.001747  loss: 3.1314 (2.9399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8670 (0.9150)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [171]  [ 600/1251]  eta: 0:03:02  lr: 0.001744  min_lr: 0.001744  loss: 3.2277 (2.9739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (0.9351)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [171]  [ 800/1251]  eta: 0:02:05  lr: 0.001740  min_lr: 0.001740  loss: 2.8886 (2.9758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7729 (0.8965)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [171]  [1000/1251]  eta: 0:01:09  lr: 0.001737  min_lr: 0.001737  loss: 2.5449 (2.9798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8571 (0.9066)  time: 0.2836  data: 0.0005  max mem: 21847
Epoch: [171]  [1200/1251]  eta: 0:00:14  lr: 0.001733  min_lr: 0.001733  loss: 3.3800 (2.9875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8976 (0.9067)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 2.7229 (2.9862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.9033)  time: 0.2279  data: 0.0009  max mem: 21847
Epoch: [171] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 2.7229 (3.0112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.9033)
Test:  [ 0/25]  eta: 0:01:25  loss: 0.6161 (0.6161)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 3.4397  data: 3.2866  max mem: 21847
Test:  [10/25]  eta: 0:00:08  loss: 0.8251 (0.8112)  acc1: 84.4000 (83.5273)  acc5: 97.2000 (97.2364)  time: 0.5891  data: 0.4532  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9724 (0.9706)  acc1: 78.8000 (80.2286)  acc5: 94.0000 (95.0095)  time: 0.2722  data: 0.1413  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0459 (0.9799)  acc1: 78.8000 (79.9840)  acc5: 93.6000 (94.9760)  time: 0.2040  data: 0.0757  max mem: 21847
Test: Total time: 0:00:09 (0.3962 s / it)
* Acc@1 80.148 Acc@5 95.216 loss 0.964
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [172]  [   0/1251]  eta: 0:54:23  lr: 0.001732  min_lr: 0.001732  loss: 3.1854 (3.1854)  weight_decay: 0.0500 (0.0500)  time: 2.6087  data: 2.1977  max mem: 21847
Epoch: [172]  [ 200/1251]  eta: 0:05:01  lr: 0.001729  min_lr: 0.001729  loss: 3.1274 (2.9763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.8278)  time: 0.2844  data: 0.0006  max mem: 21847
Epoch: [172]  [ 400/1251]  eta: 0:03:58  lr: 0.001725  min_lr: 0.001725  loss: 3.0565 (3.0003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7877 (0.8679)  time: 0.2812  data: 0.0005  max mem: 21847
Epoch: [172]  [ 600/1251]  eta: 0:03:01  lr: 0.001721  min_lr: 0.001721  loss: 2.9389 (3.0046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (0.8850)  time: 0.2835  data: 0.0003  max mem: 21847
Epoch: [172]  [ 800/1251]  eta: 0:02:05  lr: 0.001718  min_lr: 0.001718  loss: 2.9121 (3.0072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7785 (0.8721)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [172]  [1000/1251]  eta: 0:01:09  lr: 0.001714  min_lr: 0.001714  loss: 3.2037 (3.0079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8652 (0.8812)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [172]  [1200/1251]  eta: 0:00:14  lr: 0.001711  min_lr: 0.001711  loss: 3.2456 (3.0182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7603 (0.8683)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 3.2953 (3.0239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9537 (0.8890)  time: 0.2332  data: 0.0006  max mem: 21847
Epoch: [172] Total time: 0:05:45 (0.2762 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 3.2953 (3.0031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9537 (0.8890)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7038 (0.7038)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.8418  data: 5.6720  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8818 (0.8902)  acc1: 84.4000 (83.7455)  acc5: 97.2000 (96.8364)  time: 0.7592  data: 0.6240  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0331 (1.0483)  acc1: 79.2000 (80.3810)  acc5: 94.4000 (95.2762)  time: 0.2101  data: 0.0803  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1443 (1.0582)  acc1: 78.0000 (80.0800)  acc5: 94.4000 (95.2960)  time: 0.2079  data: 0.0802  max mem: 21847
Test: Total time: 0:00:10 (0.4254 s / it)
* Acc@1 79.942 Acc@5 95.188 loss 1.051
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.15%
Epoch: [173]  [   0/1251]  eta: 1:06:31  lr: 0.001710  min_lr: 0.001710  loss: 2.7015 (2.7015)  weight_decay: 0.0500 (0.0500)  time: 3.1906  data: 2.5652  max mem: 21847
Epoch: [173]  [ 200/1251]  eta: 0:05:04  lr: 0.001706  min_lr: 0.001706  loss: 2.7127 (2.9506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.9804)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [173]  [ 400/1251]  eta: 0:03:59  lr: 0.001703  min_lr: 0.001703  loss: 2.3679 (2.9519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7203 (0.8836)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [173]  [ 600/1251]  eta: 0:03:01  lr: 0.001699  min_lr: 0.001699  loss: 3.2264 (2.9596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2182 (0.9097)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [173]  [ 800/1251]  eta: 0:02:05  lr: 0.001696  min_lr: 0.001696  loss: 3.0304 (2.9747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (0.8994)  time: 0.2771  data: 0.0004  max mem: 21847
Epoch: [173]  [1000/1251]  eta: 0:01:09  lr: 0.001692  min_lr: 0.001692  loss: 3.2817 (2.9809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.8961)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [173]  [1200/1251]  eta: 0:00:14  lr: 0.001689  min_lr: 0.001689  loss: 3.5283 (2.9864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9418 (0.9028)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 3.1545 (2.9905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7595 (0.8997)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [173] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 3.1545 (3.0029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7595 (0.8997)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.6892 (0.6892)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 3.3573  data: 3.1933  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.9301 (0.9032)  acc1: 84.8000 (83.4545)  acc5: 96.8000 (96.9818)  time: 0.6082  data: 0.4731  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0580 (1.0378)  acc1: 78.4000 (80.4000)  acc5: 95.2000 (95.2000)  time: 0.2780  data: 0.1481  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1406 (1.0511)  acc1: 78.4000 (79.9520)  acc5: 95.2000 (95.3600)  time: 0.2166  data: 0.0862  max mem: 21847
Test: Total time: 0:00:09 (0.3989 s / it)
* Acc@1 79.764 Acc@5 95.312 loss 1.046
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 80.15%
Epoch: [174]  [   0/1251]  eta: 1:05:48  lr: 0.001688  min_lr: 0.001688  loss: 2.0785 (2.0785)  weight_decay: 0.0500 (0.0500)  time: 3.1562  data: 2.4452  max mem: 21847
Epoch: [174]  [ 200/1251]  eta: 0:05:05  lr: 0.001684  min_lr: 0.001684  loss: 2.5031 (2.9662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8519 (0.9890)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [174]  [ 400/1251]  eta: 0:03:59  lr: 0.001681  min_lr: 0.001681  loss: 3.1706 (2.9810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8222 (0.9825)  time: 0.2704  data: 0.0004  max mem: 21847
Epoch: [174]  [ 600/1251]  eta: 0:03:02  lr: 0.001677  min_lr: 0.001677  loss: 2.8043 (2.9854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7888 (0.9451)  time: 0.2841  data: 0.0004  max mem: 21847
Epoch: [174]  [ 800/1251]  eta: 0:02:05  lr: 0.001674  min_lr: 0.001674  loss: 2.5490 (2.9775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9040 (0.9364)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [174]  [1000/1251]  eta: 0:01:09  lr: 0.001670  min_lr: 0.001670  loss: 2.6040 (2.9956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0054 (0.9379)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [174]  [1200/1251]  eta: 0:00:14  lr: 0.001666  min_lr: 0.001666  loss: 3.4603 (2.9869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8162 (0.9311)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.7262 (2.9892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8859 (0.9328)  time: 0.2291  data: 0.0007  max mem: 21847
Epoch: [174] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.7262 (3.0096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8859 (0.9328)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6008 (0.6008)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.5194  data: 5.3648  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8172 (0.7871)  acc1: 84.4000 (83.6364)  acc5: 97.6000 (97.2000)  time: 0.7106  data: 0.5769  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9617 (0.9235)  acc1: 78.0000 (80.4952)  acc5: 95.2000 (95.4286)  time: 0.1987  data: 0.0692  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0087 (0.9342)  acc1: 78.0000 (80.0960)  acc5: 94.8000 (95.3920)  time: 0.2132  data: 0.0846  max mem: 21847
Test: Total time: 0:00:10 (0.4164 s / it)
* Acc@1 80.062 Acc@5 95.334 loss 0.940
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [175]  [   0/1251]  eta: 1:10:10  lr: 0.001666  min_lr: 0.001666  loss: 3.8496 (3.8496)  weight_decay: 0.0500 (0.0500)  time: 3.3660  data: 2.6594  max mem: 21847
Epoch: [175]  [ 200/1251]  eta: 0:05:09  lr: 0.001662  min_lr: 0.001662  loss: 3.1633 (3.0395)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2317 (1.1407)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [175]  [ 400/1251]  eta: 0:04:01  lr: 0.001658  min_lr: 0.001658  loss: 2.5122 (2.9703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7932 (0.9980)  time: 0.2868  data: 0.0004  max mem: 21847
Epoch: [175]  [ 600/1251]  eta: 0:03:02  lr: 0.001655  min_lr: 0.001655  loss: 2.6247 (2.9616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7176 (0.9258)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [175]  [ 800/1251]  eta: 0:02:05  lr: 0.001651  min_lr: 0.001651  loss: 2.7068 (2.9820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7679 (0.9388)  time: 0.2757  data: 0.0004  max mem: 21847
Epoch: [175]  [1000/1251]  eta: 0:01:10  lr: 0.001648  min_lr: 0.001648  loss: 3.2573 (2.9970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7287 (0.9039)  time: 0.2861  data: 0.0004  max mem: 21847
Epoch: [175]  [1200/1251]  eta: 0:00:14  lr: 0.001644  min_lr: 0.001644  loss: 3.4340 (3.0140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (0.9107)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 3.1611 (3.0145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8319 (0.9087)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [175] Total time: 0:05:48 (0.2783 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 3.1611 (3.0066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8319 (0.9087)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5785 (0.5785)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.3039  data: 5.1554  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7937 (0.8042)  acc1: 84.0000 (82.8364)  acc5: 96.8000 (96.7636)  time: 0.7060  data: 0.5726  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9905 (0.9449)  acc1: 78.0000 (79.9810)  acc5: 95.2000 (95.1619)  time: 0.2055  data: 0.0758  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0293 (0.9479)  acc1: 77.2000 (79.7440)  acc5: 94.8000 (95.1680)  time: 0.2054  data: 0.0771  max mem: 21847
Test: Total time: 0:00:10 (0.4010 s / it)
* Acc@1 79.900 Acc@5 95.306 loss 0.937
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.15%
Epoch: [176]  [   0/1251]  eta: 1:05:16  lr: 0.001643  min_lr: 0.001643  loss: 2.1814 (2.1814)  weight_decay: 0.0500 (0.0500)  time: 3.1305  data: 2.4190  max mem: 21847
Epoch: [176]  [ 200/1251]  eta: 0:05:03  lr: 0.001640  min_lr: 0.001640  loss: 2.9460 (2.9555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.9123)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [176]  [ 400/1251]  eta: 0:04:00  lr: 0.001636  min_lr: 0.001636  loss: 3.2294 (2.9915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8374 (0.8826)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [176]  [ 600/1251]  eta: 0:03:02  lr: 0.001633  min_lr: 0.001633  loss: 3.2660 (2.9864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8193 (0.9000)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [176]  [ 800/1251]  eta: 0:02:05  lr: 0.001629  min_lr: 0.001629  loss: 3.2888 (2.9994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8417 (0.8948)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [176]  [1000/1251]  eta: 0:01:09  lr: 0.001626  min_lr: 0.001626  loss: 3.3715 (3.0133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.9172)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [176]  [1200/1251]  eta: 0:00:14  lr: 0.001622  min_lr: 0.001622  loss: 3.0093 (3.0112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (0.9040)  time: 0.2771  data: 0.0004  max mem: 21847
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 2.7923 (3.0137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (0.9023)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [176] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 2.7923 (2.9929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (0.9023)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.6241 (0.6241)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 4.5171  data: 4.3627  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8220 (0.8221)  acc1: 84.4000 (83.6000)  acc5: 96.8000 (96.9455)  time: 0.6935  data: 0.5593  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9543 (0.9545)  acc1: 78.0000 (80.5333)  acc5: 94.8000 (95.2191)  time: 0.2352  data: 0.1054  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0374 (0.9601)  acc1: 77.2000 (80.1280)  acc5: 94.8000 (95.3440)  time: 0.2380  data: 0.1093  max mem: 21847
Test: Total time: 0:00:09 (0.3963 s / it)
* Acc@1 80.110 Acc@5 95.334 loss 0.958
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [177]  [   0/1251]  eta: 1:08:40  lr: 0.001621  min_lr: 0.001621  loss: 3.4651 (3.4651)  weight_decay: 0.0500 (0.0500)  time: 3.2936  data: 2.8969  max mem: 21847
Epoch: [177]  [ 200/1251]  eta: 0:05:03  lr: 0.001618  min_lr: 0.001618  loss: 3.0055 (3.0039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8316 (0.8839)  time: 0.2732  data: 0.0005  max mem: 21847
Epoch: [177]  [ 400/1251]  eta: 0:03:59  lr: 0.001614  min_lr: 0.001614  loss: 2.8265 (3.0105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.8716)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [177]  [ 600/1251]  eta: 0:03:01  lr: 0.001611  min_lr: 0.001611  loss: 3.4352 (3.0073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8045 (0.8705)  time: 0.2818  data: 0.0004  max mem: 21847
Epoch: [177]  [ 800/1251]  eta: 0:02:05  lr: 0.001607  min_lr: 0.001607  loss: 2.8052 (3.0024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8877 (0.8965)  time: 0.2864  data: 0.0004  max mem: 21847
Epoch: [177]  [1000/1251]  eta: 0:01:09  lr: 0.001604  min_lr: 0.001604  loss: 2.6928 (2.9979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8635 (0.9002)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [177]  [1200/1251]  eta: 0:00:14  lr: 0.001600  min_lr: 0.001600  loss: 3.0136 (2.9884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (0.8922)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 2.9527 (2.9920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.8918)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [177] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 2.9527 (2.9918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.8918)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6130 (0.6130)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.7055  data: 5.5526  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8368 (0.8085)  acc1: 84.0000 (83.9273)  acc5: 96.8000 (97.2000)  time: 0.7040  data: 0.5686  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9716 (0.9493)  acc1: 78.8000 (80.5143)  acc5: 96.0000 (95.5238)  time: 0.1818  data: 0.0512  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0384 (0.9615)  acc1: 77.6000 (80.0480)  acc5: 94.4000 (95.4880)  time: 0.1916  data: 0.0633  max mem: 21847
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 80.068 Acc@5 95.338 loss 0.958
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [178]  [   0/1251]  eta: 1:04:52  lr: 0.001599  min_lr: 0.001599  loss: 2.5680 (2.5680)  weight_decay: 0.0500 (0.0500)  time: 3.1118  data: 1.8087  max mem: 21847
Epoch: [178]  [ 200/1251]  eta: 0:05:04  lr: 0.001596  min_lr: 0.001596  loss: 2.4141 (2.8264)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1311 (0.9846)  time: 0.2725  data: 0.0008  max mem: 21847
Epoch: [178]  [ 400/1251]  eta: 0:04:01  lr: 0.001592  min_lr: 0.001592  loss: 2.3565 (2.9068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7732 (0.9099)  time: 0.2718  data: 0.0005  max mem: 21847
Epoch: [178]  [ 600/1251]  eta: 0:03:02  lr: 0.001589  min_lr: 0.001589  loss: 3.3335 (2.9123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8575 (0.8891)  time: 0.2708  data: 0.0005  max mem: 21847
Epoch: [178]  [ 800/1251]  eta: 0:02:05  lr: 0.001585  min_lr: 0.001585  loss: 3.0216 (2.9253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8208 (0.8898)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [178]  [1000/1251]  eta: 0:01:09  lr: 0.001582  min_lr: 0.001582  loss: 2.7708 (2.9433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7986 (0.8832)  time: 0.2761  data: 0.0005  max mem: 21847
Epoch: [178]  [1200/1251]  eta: 0:00:14  lr: 0.001578  min_lr: 0.001578  loss: 3.2580 (2.9418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.9054)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 3.2572 (2.9479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7880 (0.9022)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [178] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 3.2572 (2.9803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7880 (0.9022)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6799 (0.6799)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.6907  data: 5.5356  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8835 (0.8652)  acc1: 83.6000 (83.6727)  acc5: 96.8000 (96.9818)  time: 0.7438  data: 0.6109  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0087 (1.0049)  acc1: 78.4000 (80.1905)  acc5: 95.2000 (95.1619)  time: 0.1995  data: 0.0704  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0703 (1.0126)  acc1: 77.6000 (80.0480)  acc5: 94.0000 (95.1680)  time: 0.1987  data: 0.0703  max mem: 21847
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 80.110 Acc@5 95.240 loss 1.012
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.15%
Epoch: [179]  [   0/1251]  eta: 1:10:15  lr: 0.001577  min_lr: 0.001577  loss: 3.7957 (3.7957)  weight_decay: 0.0500 (0.0500)  time: 3.3698  data: 1.6443  max mem: 21847
Epoch: [179]  [ 200/1251]  eta: 0:05:04  lr: 0.001574  min_lr: 0.001574  loss: 3.1906 (2.9256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8515 (0.8758)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [179]  [ 400/1251]  eta: 0:04:00  lr: 0.001570  min_lr: 0.001570  loss: 3.3421 (2.9761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9980 (0.9082)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [179]  [ 600/1251]  eta: 0:03:02  lr: 0.001567  min_lr: 0.001567  loss: 3.0111 (2.9522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8993 (0.9138)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [179]  [ 800/1251]  eta: 0:02:05  lr: 0.001563  min_lr: 0.001563  loss: 3.1976 (2.9518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7733 (0.9286)  time: 0.2720  data: 0.0003  max mem: 21847
Epoch: [179]  [1000/1251]  eta: 0:01:09  lr: 0.001560  min_lr: 0.001560  loss: 2.8740 (2.9630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.9051)  time: 0.2731  data: 0.0005  max mem: 21847
Epoch: [179]  [1200/1251]  eta: 0:00:14  lr: 0.001556  min_lr: 0.001556  loss: 3.3068 (2.9630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6947 (0.8865)  time: 0.2761  data: 0.0004  max mem: 21847
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 3.0278 (2.9670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.8866)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [179] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 3.0278 (2.9816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.8866)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6981 (0.6981)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.7487  data: 5.6041  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9436 (0.8882)  acc1: 84.4000 (83.7455)  acc5: 97.6000 (97.2364)  time: 0.7165  data: 0.5846  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0393 (1.0297)  acc1: 79.2000 (80.4191)  acc5: 95.2000 (95.2952)  time: 0.1951  data: 0.0660  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.1275 (1.0376)  acc1: 78.0000 (80.2240)  acc5: 94.4000 (95.2480)  time: 0.1935  data: 0.0659  max mem: 21847
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 80.238 Acc@5 95.310 loss 1.030
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [180]  [   0/1251]  eta: 1:02:30  lr: 0.001556  min_lr: 0.001556  loss: 1.9199 (1.9199)  weight_decay: 0.0500 (0.0500)  time: 2.9979  data: 2.6549  max mem: 21847
Epoch: [180]  [ 200/1251]  eta: 0:05:01  lr: 0.001552  min_lr: 0.001552  loss: 2.7407 (2.9278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8631 (0.9407)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [180]  [ 400/1251]  eta: 0:03:58  lr: 0.001549  min_lr: 0.001549  loss: 2.3889 (2.9256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0170 (0.9613)  time: 0.2777  data: 0.0004  max mem: 21847
Epoch: [180]  [ 600/1251]  eta: 0:03:01  lr: 0.001545  min_lr: 0.001545  loss: 2.6859 (2.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.9427)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [180]  [ 800/1251]  eta: 0:02:05  lr: 0.001542  min_lr: 0.001542  loss: 2.7535 (2.9388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8951 (0.9393)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [180]  [1000/1251]  eta: 0:01:09  lr: 0.001538  min_lr: 0.001538  loss: 3.3094 (2.9378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9992 (0.9651)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [180]  [1200/1251]  eta: 0:00:14  lr: 0.001535  min_lr: 0.001535  loss: 2.4687 (2.9301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8413 (0.9528)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 3.3694 (2.9346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8237 (0.9519)  time: 0.2276  data: 0.0006  max mem: 21847
Epoch: [180] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 3.3694 (2.9762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8237 (0.9519)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6091 (0.6091)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.3429  data: 5.1734  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8190 (0.8283)  acc1: 84.0000 (83.4545)  acc5: 96.8000 (96.9818)  time: 0.7443  data: 0.6082  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9963 (0.9675)  acc1: 76.8000 (79.9429)  acc5: 95.6000 (95.6191)  time: 0.2237  data: 0.0937  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0832 (0.9762)  acc1: 76.8000 (79.5840)  acc5: 94.8000 (95.6000)  time: 0.2222  data: 0.0934  max mem: 21847
Test: Total time: 0:00:10 (0.4163 s / it)
* Acc@1 80.122 Acc@5 95.392 loss 0.968
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.24%
Epoch: [181]  [   0/1251]  eta: 1:04:20  lr: 0.001534  min_lr: 0.001534  loss: 3.9305 (3.9305)  weight_decay: 0.0500 (0.0500)  time: 3.0859  data: 2.6372  max mem: 21847
Epoch: [181]  [ 200/1251]  eta: 0:05:07  lr: 0.001530  min_lr: 0.001530  loss: 2.6862 (2.9660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8173 (0.8556)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [181]  [ 400/1251]  eta: 0:04:00  lr: 0.001527  min_lr: 0.001527  loss: 2.6421 (2.9580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7175 (0.8285)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [181]  [ 600/1251]  eta: 0:03:02  lr: 0.001523  min_lr: 0.001523  loss: 2.7090 (2.9676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9560 (0.8783)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [181]  [ 800/1251]  eta: 0:02:05  lr: 0.001520  min_lr: 0.001520  loss: 3.0541 (2.9667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8072 (0.8903)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [181]  [1000/1251]  eta: 0:01:09  lr: 0.001516  min_lr: 0.001516  loss: 3.1207 (2.9675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9318 (0.8877)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [181]  [1200/1251]  eta: 0:00:14  lr: 0.001513  min_lr: 0.001513  loss: 3.1715 (2.9745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8315 (0.9109)  time: 0.2796  data: 0.0004  max mem: 21847
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 2.6793 (2.9684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7755 (0.9086)  time: 0.2287  data: 0.0010  max mem: 21847
Epoch: [181] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 2.6793 (2.9731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7755 (0.9086)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5041 (0.5041)  acc1: 91.6000 (91.6000)  acc5: 98.4000 (98.4000)  time: 5.6847  data: 5.5332  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7885 (0.7555)  acc1: 84.4000 (83.8909)  acc5: 97.2000 (97.3455)  time: 0.7505  data: 0.6167  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8952 (0.9060)  acc1: 78.8000 (80.5905)  acc5: 94.8000 (95.1810)  time: 0.2109  data: 0.0810  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9566 (0.9110)  acc1: 76.8000 (80.0960)  acc5: 94.8000 (95.2480)  time: 0.2094  data: 0.0809  max mem: 21847
Test: Total time: 0:00:10 (0.4201 s / it)
* Acc@1 80.266 Acc@5 95.462 loss 0.903
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.27%
Epoch: [182]  [   0/1251]  eta: 1:11:14  lr: 0.001512  min_lr: 0.001512  loss: 2.1328 (2.1328)  weight_decay: 0.0500 (0.0500)  time: 3.4167  data: 3.0983  max mem: 21847
Epoch: [182]  [ 200/1251]  eta: 0:05:04  lr: 0.001508  min_lr: 0.001508  loss: 3.1275 (2.9360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9143 (0.8702)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [182]  [ 400/1251]  eta: 0:04:00  lr: 0.001505  min_lr: 0.001505  loss: 3.2695 (2.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8065 (0.8601)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [182]  [ 600/1251]  eta: 0:03:02  lr: 0.001501  min_lr: 0.001501  loss: 3.0375 (2.9680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7947 (0.8682)  time: 0.2717  data: 0.0003  max mem: 21847
Epoch: [182]  [ 800/1251]  eta: 0:02:05  lr: 0.001498  min_lr: 0.001498  loss: 2.6998 (2.9543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.8942)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [182]  [1000/1251]  eta: 0:01:09  lr: 0.001495  min_lr: 0.001495  loss: 3.2962 (2.9580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7872 (nan)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [182]  [1200/1251]  eta: 0:00:14  lr: 0.001491  min_lr: 0.001491  loss: 3.2429 (2.9624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8671 (nan)  time: 0.2872  data: 0.0004  max mem: 21847
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 3.2288 (2.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9106 (nan)  time: 0.2279  data: 0.0005  max mem: 21847
Epoch: [182] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 3.2288 (2.9560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9106 (nan)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.6426 (0.6426)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 4.3119  data: 4.1585  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8902 (0.8380)  acc1: 84.8000 (83.4909)  acc5: 96.8000 (97.0909)  time: 0.6215  data: 0.4883  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0031 (0.9928)  acc1: 78.0000 (79.8095)  acc5: 95.6000 (95.2571)  time: 0.2377  data: 0.1084  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0826 (0.9995)  acc1: 77.2000 (79.4720)  acc5: 94.8000 (95.2640)  time: 0.2334  data: 0.1048  max mem: 21847
Test: Total time: 0:00:09 (0.3991 s / it)
* Acc@1 80.000 Acc@5 95.324 loss 0.991
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.27%
Epoch: [183]  [   0/1251]  eta: 1:00:44  lr: 0.001490  min_lr: 0.001490  loss: 2.4507 (2.4507)  weight_decay: 0.0500 (0.0500)  time: 2.9134  data: 2.4509  max mem: 21847
Epoch: [183]  [ 200/1251]  eta: 0:05:04  lr: 0.001487  min_lr: 0.001487  loss: 2.8952 (2.9291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8270 (0.8873)  time: 0.2760  data: 0.0005  max mem: 21847
Epoch: [183]  [ 400/1251]  eta: 0:04:00  lr: 0.001483  min_lr: 0.001483  loss: 2.7681 (2.9268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8090 (0.8978)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [183]  [ 600/1251]  eta: 0:03:01  lr: 0.001480  min_lr: 0.001480  loss: 3.4924 (2.9442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8733 (0.8993)  time: 0.2697  data: 0.0004  max mem: 21847
Epoch: [183]  [ 800/1251]  eta: 0:02:05  lr: 0.001476  min_lr: 0.001476  loss: 3.0162 (2.9484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8528 (0.9020)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [183]  [1000/1251]  eta: 0:01:09  lr: 0.001473  min_lr: 0.001473  loss: 2.6708 (2.9471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9917 (0.9166)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [183]  [1200/1251]  eta: 0:00:14  lr: 0.001469  min_lr: 0.001469  loss: 3.2123 (2.9625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.9211)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 3.1796 (2.9657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.9172)  time: 0.2288  data: 0.0005  max mem: 21847
Epoch: [183] Total time: 0:05:45 (0.2762 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 3.1796 (2.9645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.9172)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5764 (0.5764)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5213  data: 5.3560  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7644 (0.8037)  acc1: 82.4000 (83.3091)  acc5: 97.2000 (97.3091)  time: 0.7222  data: 0.5884  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9465 (0.9390)  acc1: 78.0000 (80.2286)  acc5: 95.2000 (95.6191)  time: 0.1965  data: 0.0671  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0304 (0.9516)  acc1: 77.2000 (79.8080)  acc5: 94.8000 (95.5520)  time: 0.1974  data: 0.0687  max mem: 21847
Test: Total time: 0:00:10 (0.4032 s / it)
* Acc@1 80.478 Acc@5 95.450 loss 0.940
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.48%
Epoch: [184]  [   0/1251]  eta: 1:02:20  lr: 0.001469  min_lr: 0.001469  loss: 3.8975 (3.8975)  weight_decay: 0.0500 (0.0500)  time: 2.9899  data: 2.6659  max mem: 21847
Epoch: [184]  [ 200/1251]  eta: 0:05:02  lr: 0.001465  min_lr: 0.001465  loss: 3.2546 (2.9479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8794 (0.8739)  time: 0.2825  data: 0.0004  max mem: 21847
Epoch: [184]  [ 400/1251]  eta: 0:03:59  lr: 0.001462  min_lr: 0.001462  loss: 3.3433 (2.9395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (0.8972)  time: 0.2815  data: 0.0005  max mem: 21847
Epoch: [184]  [ 600/1251]  eta: 0:03:01  lr: 0.001458  min_lr: 0.001458  loss: 3.3959 (2.9484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8881 (0.9134)  time: 0.2746  data: 0.0005  max mem: 21847
Epoch: [184]  [ 800/1251]  eta: 0:02:05  lr: 0.001455  min_lr: 0.001455  loss: 3.2125 (2.9676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (0.9153)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [184]  [1000/1251]  eta: 0:01:09  lr: 0.001451  min_lr: 0.001451  loss: 3.1275 (2.9762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1213 (0.9443)  time: 0.2747  data: 0.0005  max mem: 21847
Epoch: [184]  [1200/1251]  eta: 0:00:14  lr: 0.001448  min_lr: 0.001448  loss: 3.1489 (2.9828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7563 (0.9316)  time: 0.2829  data: 0.0004  max mem: 21847
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 2.9642 (2.9822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8305 (0.9307)  time: 0.2283  data: 0.0008  max mem: 21847
Epoch: [184] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 2.9642 (2.9675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8305 (0.9307)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5930 (0.5930)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.6302  data: 5.4744  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7607 (0.7886)  acc1: 84.4000 (83.3818)  acc5: 97.2000 (97.1273)  time: 0.7625  data: 0.6295  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9491 (0.9321)  acc1: 78.0000 (80.2095)  acc5: 94.8000 (95.7905)  time: 0.2093  data: 0.0794  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0025 (0.9370)  acc1: 78.0000 (80.1600)  acc5: 94.8000 (95.6960)  time: 0.2101  data: 0.0811  max mem: 21847
Test: Total time: 0:00:10 (0.4179 s / it)
* Acc@1 80.178 Acc@5 95.480 loss 0.935
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.48%
Epoch: [185]  [   0/1251]  eta: 1:09:26  lr: 0.001447  min_lr: 0.001447  loss: 3.0658 (3.0658)  weight_decay: 0.0500 (0.0500)  time: 3.3302  data: 2.9808  max mem: 21847
Epoch: [185]  [ 200/1251]  eta: 0:05:04  lr: 0.001444  min_lr: 0.001444  loss: 3.2863 (2.9971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2978 (1.2429)  time: 0.2777  data: 0.0004  max mem: 21847
Epoch: [185]  [ 400/1251]  eta: 0:03:59  lr: 0.001440  min_lr: 0.001440  loss: 2.8730 (2.9741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9498 (1.0544)  time: 0.2713  data: 0.0003  max mem: 21847
Epoch: [185]  [ 600/1251]  eta: 0:03:01  lr: 0.001437  min_lr: 0.001437  loss: 3.2958 (2.9692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7881 (0.9804)  time: 0.2789  data: 0.0004  max mem: 21847
Epoch: [185]  [ 800/1251]  eta: 0:02:05  lr: 0.001433  min_lr: 0.001433  loss: 3.2339 (2.9641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9703 (0.9729)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [185]  [1000/1251]  eta: 0:01:09  lr: 0.001430  min_lr: 0.001430  loss: 3.0792 (2.9728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (0.9681)  time: 0.2806  data: 0.0005  max mem: 21847
Epoch: [185]  [1200/1251]  eta: 0:00:14  lr: 0.001426  min_lr: 0.001426  loss: 2.7912 (2.9668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7907 (0.9695)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 3.0727 (2.9611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.9631)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [185] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 3.0727 (2.9611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.9631)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6415 (0.6415)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.3239  data: 5.1728  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8062 (0.8115)  acc1: 84.4000 (83.8909)  acc5: 97.2000 (97.1636)  time: 0.6875  data: 0.5471  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9658 (0.9497)  acc1: 80.0000 (81.0857)  acc5: 95.6000 (95.4667)  time: 0.2054  data: 0.0721  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0132 (0.9630)  acc1: 78.8000 (80.5920)  acc5: 94.4000 (95.3760)  time: 0.2103  data: 0.0813  max mem: 21847
Test: Total time: 0:00:10 (0.4093 s / it)
* Acc@1 80.402 Acc@5 95.506 loss 0.954
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.48%
Epoch: [186]  [   0/1251]  eta: 1:05:00  lr: 0.001425  min_lr: 0.001425  loss: 2.3500 (2.3500)  weight_decay: 0.0500 (0.0500)  time: 3.1181  data: 2.7265  max mem: 21847
Epoch: [186]  [ 200/1251]  eta: 0:05:04  lr: 0.001422  min_lr: 0.001422  loss: 2.8066 (2.9958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0671 (1.0224)  time: 0.2782  data: 0.0004  max mem: 21847
Epoch: [186]  [ 400/1251]  eta: 0:04:00  lr: 0.001419  min_lr: 0.001419  loss: 3.0188 (2.9735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.9904)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [186]  [ 600/1251]  eta: 0:03:02  lr: 0.001415  min_lr: 0.001415  loss: 2.6684 (2.9534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7837 (1.0049)  time: 0.2739  data: 0.0006  max mem: 21847
Epoch: [186]  [ 800/1251]  eta: 0:02:05  lr: 0.001412  min_lr: 0.001412  loss: 3.0622 (2.9542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8575 (0.9713)  time: 0.2734  data: 0.0005  max mem: 21847
Epoch: [186]  [1000/1251]  eta: 0:01:09  lr: 0.001408  min_lr: 0.001408  loss: 2.6607 (2.9529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9006 (0.9518)  time: 0.2861  data: 0.0003  max mem: 21847
Epoch: [186]  [1200/1251]  eta: 0:00:14  lr: 0.001405  min_lr: 0.001405  loss: 3.3073 (2.9474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7882 (0.9509)  time: 0.2840  data: 0.0004  max mem: 21847
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 2.7845 (2.9411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9032 (0.9515)  time: 0.2336  data: 0.0005  max mem: 21847
Epoch: [186] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 2.7845 (2.9576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9032 (0.9515)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.5932 (0.5932)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.9871  data: 5.8147  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7278 (0.7464)  acc1: 84.4000 (83.2364)  acc5: 97.2000 (97.1636)  time: 0.7032  data: 0.5678  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9195 (0.9070)  acc1: 78.0000 (80.3048)  acc5: 95.6000 (95.2571)  time: 0.1715  data: 0.0419  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0075 (0.9134)  acc1: 77.6000 (79.9360)  acc5: 94.0000 (95.2160)  time: 0.1715  data: 0.0439  max mem: 21847
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 80.412 Acc@5 95.466 loss 0.892
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.48%
Epoch: [187]  [   0/1251]  eta: 1:09:02  lr: 0.001404  min_lr: 0.001404  loss: 2.4852 (2.4852)  weight_decay: 0.0500 (0.0500)  time: 3.3117  data: 2.9434  max mem: 21847
Epoch: [187]  [ 200/1251]  eta: 0:05:02  lr: 0.001401  min_lr: 0.001401  loss: 2.9946 (2.9145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8814 (0.8698)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [187]  [ 400/1251]  eta: 0:03:58  lr: 0.001397  min_lr: 0.001397  loss: 3.2115 (2.9338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8113 (0.8799)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [187]  [ 600/1251]  eta: 0:03:01  lr: 0.001394  min_lr: 0.001394  loss: 2.7786 (2.9440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.9176)  time: 0.2702  data: 0.0003  max mem: 21847
Epoch: [187]  [ 800/1251]  eta: 0:02:05  lr: 0.001390  min_lr: 0.001390  loss: 3.3634 (2.9621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8682 (0.9261)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [187]  [1000/1251]  eta: 0:01:09  lr: 0.001387  min_lr: 0.001387  loss: 2.8096 (2.9589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7697 (0.9150)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [187]  [1200/1251]  eta: 0:00:14  lr: 0.001383  min_lr: 0.001383  loss: 3.4134 (2.9627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8165 (0.9182)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 3.1518 (2.9623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8927 (0.9195)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [187] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 3.1518 (2.9570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8927 (0.9195)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6750 (0.6750)  acc1: 88.8000 (88.8000)  acc5: 96.4000 (96.4000)  time: 5.4989  data: 5.3377  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7984 (0.7971)  acc1: 85.6000 (84.1091)  acc5: 97.2000 (97.0182)  time: 0.7323  data: 0.5974  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9616 (0.9563)  acc1: 78.4000 (80.6476)  acc5: 94.8000 (95.2952)  time: 0.2088  data: 0.0789  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0324 (0.9693)  acc1: 78.4000 (80.1600)  acc5: 94.8000 (95.2960)  time: 0.2080  data: 0.0789  max mem: 21847
Test: Total time: 0:00:10 (0.4105 s / it)
* Acc@1 80.492 Acc@5 95.516 loss 0.953
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.49%
Epoch: [188]  [   0/1251]  eta: 1:02:58  lr: 0.001383  min_lr: 0.001383  loss: 2.1287 (2.1287)  weight_decay: 0.0500 (0.0500)  time: 3.0206  data: 2.6986  max mem: 21847
Epoch: [188]  [ 200/1251]  eta: 0:05:04  lr: 0.001379  min_lr: 0.001379  loss: 3.1403 (3.0034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9252 (0.9188)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [188]  [ 400/1251]  eta: 0:03:59  lr: 0.001376  min_lr: 0.001376  loss: 2.8234 (2.9807)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0204 (0.9397)  time: 0.2705  data: 0.0003  max mem: 21847
Epoch: [188]  [ 600/1251]  eta: 0:03:01  lr: 0.001372  min_lr: 0.001372  loss: 2.6036 (2.9718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8690 (0.9243)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [188]  [ 800/1251]  eta: 0:02:05  lr: 0.001369  min_lr: 0.001369  loss: 3.1772 (2.9831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8986 (0.9208)  time: 0.2806  data: 0.0004  max mem: 21847
Epoch: [188]  [1000/1251]  eta: 0:01:09  lr: 0.001366  min_lr: 0.001366  loss: 3.2446 (2.9773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8865 (0.9242)  time: 0.2749  data: 0.0004  max mem: 21847
Epoch: [188]  [1200/1251]  eta: 0:00:14  lr: 0.001362  min_lr: 0.001362  loss: 2.9976 (2.9647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8061 (0.9129)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 2.7421 (2.9677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8916 (0.9161)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [188] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 2.7421 (2.9442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8916 (0.9161)
Test:  [ 0/25]  eta: 0:01:41  loss: 0.6161 (0.6161)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 4.0441  data: 3.8646  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7365 (0.7670)  acc1: 84.8000 (83.8545)  acc5: 97.6000 (97.2727)  time: 0.6346  data: 0.5001  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9492 (0.9007)  acc1: 78.4000 (80.4191)  acc5: 95.6000 (95.7143)  time: 0.2444  data: 0.1131  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9538 (0.9119)  acc1: 77.2000 (80.0480)  acc5: 94.4000 (95.6320)  time: 0.1973  data: 0.0665  max mem: 21847
Test: Total time: 0:00:09 (0.3976 s / it)
* Acc@1 80.512 Acc@5 95.588 loss 0.903
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.51%
Epoch: [189]  [   0/1251]  eta: 1:05:30  lr: 0.001361  min_lr: 0.001361  loss: 3.4498 (3.4498)  weight_decay: 0.0500 (0.0500)  time: 3.1417  data: 2.8196  max mem: 21847
Epoch: [189]  [ 200/1251]  eta: 0:05:04  lr: 0.001358  min_lr: 0.001358  loss: 2.7839 (2.8479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8652 (0.9369)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [189]  [ 400/1251]  eta: 0:04:00  lr: 0.001355  min_lr: 0.001355  loss: 2.4973 (2.9094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8818 (0.9352)  time: 0.2770  data: 0.0004  max mem: 21847
Epoch: [189]  [ 600/1251]  eta: 0:03:02  lr: 0.001351  min_lr: 0.001351  loss: 3.1139 (2.9269)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2704  data: 0.0004  max mem: 21847
Epoch: [189]  [ 800/1251]  eta: 0:02:05  lr: 0.001348  min_lr: 0.001348  loss: 2.8582 (2.9282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (nan)  time: 0.2827  data: 0.0004  max mem: 21847
Epoch: [189]  [1000/1251]  eta: 0:01:09  lr: 0.001344  min_lr: 0.001344  loss: 2.7461 (2.9269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8582 (nan)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [189]  [1200/1251]  eta: 0:00:14  lr: 0.001341  min_lr: 0.001341  loss: 2.8485 (2.9289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7112 (nan)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 2.3716 (2.9207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8791 (nan)  time: 0.2285  data: 0.0005  max mem: 21847
Epoch: [189] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 2.3716 (2.9451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8791 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5638 (0.5638)  acc1: 90.4000 (90.4000)  acc5: 98.0000 (98.0000)  time: 5.5894  data: 5.4250  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7561 (0.7340)  acc1: 84.8000 (84.3273)  acc5: 97.2000 (97.0546)  time: 0.7126  data: 0.5775  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9102 (0.8823)  acc1: 78.4000 (80.4381)  acc5: 95.2000 (95.5238)  time: 0.2104  data: 0.0805  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9525 (0.8896)  acc1: 78.0000 (80.2240)  acc5: 95.2000 (95.4560)  time: 0.2095  data: 0.0804  max mem: 21847
Test: Total time: 0:00:10 (0.4158 s / it)
* Acc@1 80.474 Acc@5 95.558 loss 0.874
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.51%
Epoch: [190]  [   0/1251]  eta: 1:08:35  lr: 0.001340  min_lr: 0.001340  loss: 3.5498 (3.5498)  weight_decay: 0.0500 (0.0500)  time: 3.2897  data: 2.6939  max mem: 21847
Epoch: [190]  [ 200/1251]  eta: 0:05:08  lr: 0.001337  min_lr: 0.001337  loss: 3.1569 (2.9183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9944 (0.9976)  time: 0.2932  data: 0.0005  max mem: 21847
Epoch: [190]  [ 400/1251]  eta: 0:04:01  lr: 0.001333  min_lr: 0.001333  loss: 2.4427 (2.9075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8904 (0.9760)  time: 0.2725  data: 0.0006  max mem: 21847
Epoch: [190]  [ 600/1251]  eta: 0:03:02  lr: 0.001330  min_lr: 0.001330  loss: 2.8449 (2.9037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9070 (0.9969)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [190]  [ 800/1251]  eta: 0:02:06  lr: 0.001327  min_lr: 0.001327  loss: 3.0421 (2.9011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8641 (0.9618)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [190]  [1000/1251]  eta: 0:01:09  lr: 0.001323  min_lr: 0.001323  loss: 2.9888 (2.9087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8742 (0.9699)  time: 0.2758  data: 0.0004  max mem: 21847
Epoch: [190]  [1200/1251]  eta: 0:00:14  lr: 0.001320  min_lr: 0.001320  loss: 2.6609 (2.9082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7810 (0.9571)  time: 0.2715  data: 0.0003  max mem: 21847
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 2.5888 (2.9029)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (0.9813)  time: 0.2288  data: 0.0006  max mem: 21847
Epoch: [190] Total time: 0:05:47 (0.2780 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 2.5888 (2.9389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (0.9813)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6270 (0.6270)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.6595  data: 5.4893  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8288 (0.8000)  acc1: 83.6000 (83.7455)  acc5: 97.2000 (97.3091)  time: 0.7252  data: 0.5898  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9495 (0.9430)  acc1: 77.2000 (80.6286)  acc5: 95.2000 (95.4095)  time: 0.1984  data: 0.0683  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9801 (0.9500)  acc1: 77.2000 (80.3360)  acc5: 94.0000 (95.2960)  time: 0.2083  data: 0.0784  max mem: 21847
Test: Total time: 0:00:10 (0.4167 s / it)
* Acc@1 80.476 Acc@5 95.524 loss 0.940
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.51%
Epoch: [191]  [   0/1251]  eta: 1:09:10  lr: 0.001319  min_lr: 0.001319  loss: 2.1558 (2.1558)  weight_decay: 0.0500 (0.0500)  time: 3.3175  data: 2.9193  max mem: 21847
Epoch: [191]  [ 200/1251]  eta: 0:05:05  lr: 0.001316  min_lr: 0.001316  loss: 3.0134 (2.8733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8538 (0.9450)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [191]  [ 400/1251]  eta: 0:04:00  lr: 0.001312  min_lr: 0.001312  loss: 3.3433 (2.9142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8490 (0.9012)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [191]  [ 600/1251]  eta: 0:03:01  lr: 0.001309  min_lr: 0.001309  loss: 2.5640 (2.9295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9660 (0.9467)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [191]  [ 800/1251]  eta: 0:02:05  lr: 0.001305  min_lr: 0.001305  loss: 2.8227 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.9382)  time: 0.2838  data: 0.0004  max mem: 21847
Epoch: [191]  [1000/1251]  eta: 0:01:09  lr: 0.001302  min_lr: 0.001302  loss: 2.9475 (2.9303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8911 (0.9346)  time: 0.2804  data: 0.0004  max mem: 21847
Epoch: [191]  [1200/1251]  eta: 0:00:14  lr: 0.001299  min_lr: 0.001299  loss: 2.8015 (2.9239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.9310)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 2.5099 (2.9251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9079 (0.9343)  time: 0.2347  data: 0.0007  max mem: 21847
Epoch: [191] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 2.5099 (2.9296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9079 (0.9343)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5842 (0.5842)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.4893  data: 5.3416  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7459 (0.7501)  acc1: 85.6000 (84.4000)  acc5: 97.6000 (97.0909)  time: 0.7184  data: 0.5847  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9517 (0.8954)  acc1: 79.6000 (80.8762)  acc5: 94.8000 (95.5048)  time: 0.1967  data: 0.0668  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9697 (0.9026)  acc1: 78.8000 (80.6560)  acc5: 94.8000 (95.4880)  time: 0.2233  data: 0.0950  max mem: 21847
Test: Total time: 0:00:10 (0.4235 s / it)
* Acc@1 80.714 Acc@5 95.560 loss 0.899
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.71%
Epoch: [192]  [   0/1251]  eta: 0:59:16  lr: 0.001298  min_lr: 0.001298  loss: 2.2244 (2.2244)  weight_decay: 0.0500 (0.0500)  time: 2.8432  data: 2.4727  max mem: 21847
Epoch: [192]  [ 200/1251]  eta: 0:05:03  lr: 0.001295  min_lr: 0.001295  loss: 3.1345 (2.9033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8556 (0.9264)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [192]  [ 400/1251]  eta: 0:04:00  lr: 0.001291  min_lr: 0.001291  loss: 2.6322 (2.8921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (0.9101)  time: 0.2905  data: 0.0004  max mem: 21847
Epoch: [192]  [ 600/1251]  eta: 0:03:01  lr: 0.001288  min_lr: 0.001288  loss: 3.3070 (2.9179)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [192]  [ 800/1251]  eta: 0:02:05  lr: 0.001284  min_lr: 0.001284  loss: 2.8388 (2.9191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7654 (nan)  time: 0.2830  data: 0.0004  max mem: 21847
Epoch: [192]  [1000/1251]  eta: 0:01:09  lr: 0.001281  min_lr: 0.001281  loss: 3.3671 (2.9272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8422 (nan)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [192]  [1200/1251]  eta: 0:00:14  lr: 0.001278  min_lr: 0.001278  loss: 2.6640 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (nan)  time: 0.2785  data: 0.0004  max mem: 21847
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 3.1126 (2.9300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9559 (nan)  time: 0.2280  data: 0.0005  max mem: 21847
Epoch: [192] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 3.1126 (2.9279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9559 (nan)
Test:  [ 0/25]  eta: 0:01:44  loss: 0.6365 (0.6365)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 4.1691  data: 3.9890  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8294 (0.8120)  acc1: 84.0000 (83.6000)  acc5: 97.6000 (97.5273)  time: 0.7063  data: 0.5711  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9913 (0.9579)  acc1: 78.4000 (80.3238)  acc5: 95.6000 (95.4857)  time: 0.2696  data: 0.1399  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9967 (0.9631)  acc1: 78.4000 (80.1760)  acc5: 94.8000 (95.5360)  time: 0.2142  data: 0.0853  max mem: 21847
Test: Total time: 0:00:10 (0.4067 s / it)
* Acc@1 80.574 Acc@5 95.516 loss 0.946
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.71%
Epoch: [193]  [   0/1251]  eta: 1:11:03  lr: 0.001277  min_lr: 0.001277  loss: 3.0648 (3.0648)  weight_decay: 0.0500 (0.0500)  time: 3.4081  data: 2.9733  max mem: 21847
Epoch: [193]  [ 200/1251]  eta: 0:05:06  lr: 0.001274  min_lr: 0.001274  loss: 2.6381 (2.8729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8664 (1.0114)  time: 0.2932  data: 0.0004  max mem: 21847
Epoch: [193]  [ 400/1251]  eta: 0:04:00  lr: 0.001270  min_lr: 0.001270  loss: 3.2312 (2.8918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0434 (1.0071)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [193]  [ 600/1251]  eta: 0:03:02  lr: 0.001267  min_lr: 0.001267  loss: 3.2914 (2.9019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9628 (0.9731)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [193]  [ 800/1251]  eta: 0:02:05  lr: 0.001264  min_lr: 0.001264  loss: 3.2389 (2.9266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8206 (0.9709)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [193]  [1000/1251]  eta: 0:01:09  lr: 0.001260  min_lr: 0.001260  loss: 3.1330 (2.9220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8019 (0.9498)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [193]  [1200/1251]  eta: 0:00:14  lr: 0.001257  min_lr: 0.001257  loss: 2.8135 (2.9231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7822 (0.9502)  time: 0.2822  data: 0.0003  max mem: 21847
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 2.4571 (2.9228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7288 (0.9476)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [193] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 2.4571 (2.9300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7288 (0.9476)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5876 (0.5876)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.6570  data: 5.5024  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7710 (0.7579)  acc1: 85.6000 (84.5818)  acc5: 98.0000 (97.2727)  time: 0.7256  data: 0.5926  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9273 (0.8899)  acc1: 78.8000 (81.0857)  acc5: 95.2000 (95.6762)  time: 0.1886  data: 0.0595  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9754 (0.8965)  acc1: 78.8000 (80.8640)  acc5: 94.4000 (95.4880)  time: 0.2087  data: 0.0805  max mem: 21847
Test: Total time: 0:00:10 (0.4177 s / it)
* Acc@1 80.828 Acc@5 95.642 loss 0.897
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.83%
Epoch: [194]  [   0/1251]  eta: 1:05:03  lr: 0.001256  min_lr: 0.001256  loss: 3.3937 (3.3937)  weight_decay: 0.0500 (0.0500)  time: 3.1206  data: 2.7505  max mem: 21847
Epoch: [194]  [ 200/1251]  eta: 0:05:05  lr: 0.001253  min_lr: 0.001253  loss: 3.2347 (2.8863)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0875 (1.0945)  time: 0.2830  data: 0.0006  max mem: 21847
Epoch: [194]  [ 400/1251]  eta: 0:04:00  lr: 0.001249  min_lr: 0.001249  loss: 2.8591 (2.9380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8189 (1.0230)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [194]  [ 600/1251]  eta: 0:03:01  lr: 0.001246  min_lr: 0.001246  loss: 3.2371 (2.9314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9204 (1.0214)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [194]  [ 800/1251]  eta: 0:02:05  lr: 0.001243  min_lr: 0.001243  loss: 3.0571 (2.9223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8027 (1.0108)  time: 0.2752  data: 0.0004  max mem: 21847
Epoch: [194]  [1000/1251]  eta: 0:01:09  lr: 0.001239  min_lr: 0.001239  loss: 3.3507 (2.9309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0403 (1.0262)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [194]  [1200/1251]  eta: 0:00:14  lr: 0.001236  min_lr: 0.001236  loss: 2.5399 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9976 (1.0229)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.5255 (2.9271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9327 (1.0194)  time: 0.2277  data: 0.0007  max mem: 21847
Epoch: [194] Total time: 0:05:45 (0.2765 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.5255 (2.9267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9327 (1.0194)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6951 (0.6951)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 5.3140  data: 5.1627  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.9002 (0.8710)  acc1: 84.0000 (84.4727)  acc5: 97.2000 (97.0546)  time: 0.7219  data: 0.5873  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0193 (1.0266)  acc1: 79.6000 (80.9143)  acc5: 95.6000 (95.5048)  time: 0.2129  data: 0.0827  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0734 (1.0299)  acc1: 78.8000 (80.6400)  acc5: 95.2000 (95.5040)  time: 0.1983  data: 0.0700  max mem: 21847
Test: Total time: 0:00:10 (0.4076 s / it)
* Acc@1 80.704 Acc@5 95.496 loss 1.029
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.83%
Epoch: [195]  [   0/1251]  eta: 1:08:41  lr: 0.001235  min_lr: 0.001235  loss: 2.0581 (2.0581)  weight_decay: 0.0500 (0.0500)  time: 3.2943  data: 2.3403  max mem: 21847
Epoch: [195]  [ 200/1251]  eta: 0:05:04  lr: 0.001232  min_lr: 0.001232  loss: 2.9421 (2.8606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8141 (0.9183)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [195]  [ 400/1251]  eta: 0:03:59  lr: 0.001229  min_lr: 0.001229  loss: 3.1689 (2.9019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7831 (0.8937)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [195]  [ 600/1251]  eta: 0:03:02  lr: 0.001225  min_lr: 0.001225  loss: 3.1452 (2.9033)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0075 (0.9253)  time: 0.2846  data: 0.0004  max mem: 21847
Epoch: [195]  [ 800/1251]  eta: 0:02:05  lr: 0.001222  min_lr: 0.001222  loss: 3.2450 (2.9144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8590 (0.9072)  time: 0.2846  data: 0.0004  max mem: 21847
Epoch: [195]  [1000/1251]  eta: 0:01:09  lr: 0.001219  min_lr: 0.001219  loss: 3.0974 (2.9222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0331 (0.9201)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [195]  [1200/1251]  eta: 0:00:14  lr: 0.001215  min_lr: 0.001215  loss: 2.8714 (2.9166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8834 (0.9248)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 3.3381 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9432 (0.9276)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [195] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 3.3381 (2.9115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9432 (0.9276)
Test:  [ 0/25]  eta: 0:01:26  loss: 0.6178 (0.6178)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 3.4571  data: 3.2998  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7564 (0.7760)  acc1: 85.6000 (84.4000)  acc5: 97.6000 (97.5636)  time: 0.6253  data: 0.4894  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9361 (0.9308)  acc1: 78.4000 (81.1238)  acc5: 95.6000 (95.7905)  time: 0.2744  data: 0.1439  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0022 (0.9357)  acc1: 78.8000 (81.0720)  acc5: 94.8000 (95.7440)  time: 0.2350  data: 0.1062  max mem: 21847
Test: Total time: 0:00:09 (0.3979 s / it)
* Acc@1 80.874 Acc@5 95.748 loss 0.936
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.87%
Epoch: [196]  [   0/1251]  eta: 1:08:07  lr: 0.001215  min_lr: 0.001215  loss: 3.4804 (3.4804)  weight_decay: 0.0500 (0.0500)  time: 3.2673  data: 2.9729  max mem: 21847
Epoch: [196]  [ 200/1251]  eta: 0:05:03  lr: 0.001211  min_lr: 0.001211  loss: 2.8905 (2.9643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8761 (0.9351)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [196]  [ 400/1251]  eta: 0:03:59  lr: 0.001208  min_lr: 0.001208  loss: 3.0559 (2.9204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.9730)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [196]  [ 600/1251]  eta: 0:03:01  lr: 0.001205  min_lr: 0.001205  loss: 3.1979 (2.9217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8196 (0.9422)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [196]  [ 800/1251]  eta: 0:02:05  lr: 0.001201  min_lr: 0.001201  loss: 3.1171 (2.9147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9096 (0.9370)  time: 0.2755  data: 0.0004  max mem: 21847
Epoch: [196]  [1000/1251]  eta: 0:01:09  lr: 0.001198  min_lr: 0.001198  loss: 3.0990 (2.9233)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0672 (0.9557)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [196]  [1200/1251]  eta: 0:00:14  lr: 0.001195  min_lr: 0.001195  loss: 2.8660 (2.9199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8703 (0.9612)  time: 0.2785  data: 0.0004  max mem: 21847
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 2.8640 (2.9233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8859 (0.9594)  time: 0.2314  data: 0.0005  max mem: 21847
Epoch: [196] Total time: 0:05:45 (0.2766 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 2.8640 (2.9126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8859 (0.9594)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.5973 (0.5973)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.8806  data: 5.7236  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7568 (0.7451)  acc1: 84.0000 (84.2909)  acc5: 98.0000 (97.6364)  time: 0.7516  data: 0.6186  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9418 (0.8800)  acc1: 79.6000 (81.0667)  acc5: 95.2000 (95.9619)  time: 0.2042  data: 0.0751  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9418 (0.8932)  acc1: 79.2000 (80.7520)  acc5: 94.8000 (95.8240)  time: 0.2033  data: 0.0750  max mem: 21847
Test: Total time: 0:00:10 (0.4221 s / it)
* Acc@1 80.792 Acc@5 95.716 loss 0.887
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.87%
Epoch: [197]  [   0/1251]  eta: 1:09:17  lr: 0.001194  min_lr: 0.001194  loss: 3.4003 (3.4003)  weight_decay: 0.0500 (0.0500)  time: 3.3236  data: 1.6541  max mem: 21847
Epoch: [197]  [ 200/1251]  eta: 0:05:06  lr: 0.001191  min_lr: 0.001191  loss: 3.1741 (2.8371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8685 (0.8944)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [197]  [ 400/1251]  eta: 0:04:00  lr: 0.001187  min_lr: 0.001187  loss: 2.4296 (2.8579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8395 (0.8938)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [197]  [ 600/1251]  eta: 0:03:02  lr: 0.001184  min_lr: 0.001184  loss: 3.1045 (2.8794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8377 (0.8992)  time: 0.2803  data: 0.0004  max mem: 21847
Epoch: [197]  [ 800/1251]  eta: 0:02:06  lr: 0.001181  min_lr: 0.001181  loss: 2.9140 (2.8882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0302 (0.9093)  time: 0.2811  data: 0.0004  max mem: 21847
Epoch: [197]  [1000/1251]  eta: 0:01:09  lr: 0.001178  min_lr: 0.001178  loss: 2.4942 (2.8833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8507 (0.9317)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [197]  [1200/1251]  eta: 0:00:14  lr: 0.001174  min_lr: 0.001174  loss: 3.2230 (2.8833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8799 (0.9308)  time: 0.2703  data: 0.0004  max mem: 21847
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 2.7487 (2.8846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8514 (0.9289)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [197] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 2.7487 (2.9091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8514 (0.9289)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6466 (0.6466)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.4067  data: 5.2488  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8017 (0.7965)  acc1: 84.8000 (84.0000)  acc5: 97.6000 (97.3091)  time: 0.7025  data: 0.5678  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0020 (0.9366)  acc1: 78.0000 (80.5905)  acc5: 95.6000 (95.6571)  time: 0.2045  data: 0.0745  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9853 (0.9421)  acc1: 78.8000 (80.4000)  acc5: 94.8000 (95.6000)  time: 0.2141  data: 0.0847  max mem: 21847
Test: Total time: 0:00:10 (0.4120 s / it)
* Acc@1 80.618 Acc@5 95.582 loss 0.942
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.87%
Epoch: [198]  [   0/1251]  eta: 1:11:29  lr: 0.001174  min_lr: 0.001174  loss: 3.1075 (3.1075)  weight_decay: 0.0500 (0.0500)  time: 3.4286  data: 3.1065  max mem: 21847
Epoch: [198]  [ 200/1251]  eta: 0:05:07  lr: 0.001170  min_lr: 0.001170  loss: 2.9839 (2.8879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8680 (1.0031)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [198]  [ 400/1251]  eta: 0:04:01  lr: 0.001167  min_lr: 0.001167  loss: 3.2478 (2.9218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9857 (1.0533)  time: 0.2714  data: 0.0005  max mem: 21847
Epoch: [198]  [ 600/1251]  eta: 0:03:02  lr: 0.001164  min_lr: 0.001164  loss: 3.0157 (2.9102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9686 (1.0299)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [198]  [ 800/1251]  eta: 0:02:05  lr: 0.001161  min_lr: 0.001161  loss: 3.0103 (2.9247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7943 (1.0053)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [198]  [1000/1251]  eta: 0:01:09  lr: 0.001157  min_lr: 0.001157  loss: 2.8404 (2.9263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8523 (0.9839)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [198]  [1200/1251]  eta: 0:00:14  lr: 0.001154  min_lr: 0.001154  loss: 2.4087 (2.9156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8581 (0.9802)  time: 0.2707  data: 0.0005  max mem: 21847
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 2.7825 (2.9122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8403 (0.9764)  time: 0.2307  data: 0.0010  max mem: 21847
Epoch: [198] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 2.7825 (2.9157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8403 (0.9764)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5684 (0.5684)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.3405  data: 5.1880  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7671 (0.7472)  acc1: 85.2000 (84.6182)  acc5: 97.6000 (97.3091)  time: 0.7296  data: 0.5971  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9343 (0.8880)  acc1: 78.0000 (81.2762)  acc5: 94.8000 (95.3333)  time: 0.2157  data: 0.0868  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9831 (0.8997)  acc1: 78.0000 (80.7520)  acc5: 94.4000 (95.3760)  time: 0.2149  data: 0.0867  max mem: 21847
Test: Total time: 0:00:10 (0.4102 s / it)
* Acc@1 80.836 Acc@5 95.602 loss 0.892
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.87%
Epoch: [199]  [   0/1251]  eta: 1:06:52  lr: 0.001153  min_lr: 0.001153  loss: 3.0534 (3.0534)  weight_decay: 0.0500 (0.0500)  time: 3.2074  data: 2.6268  max mem: 21847
Epoch: [199]  [ 200/1251]  eta: 0:05:05  lr: 0.001150  min_lr: 0.001150  loss: 3.1795 (2.9472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9260 (0.9737)  time: 0.2707  data: 0.0005  max mem: 21847
Epoch: [199]  [ 400/1251]  eta: 0:04:00  lr: 0.001147  min_lr: 0.001147  loss: 2.6241 (2.9120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1785 (1.0541)  time: 0.2782  data: 0.0004  max mem: 21847
Epoch: [199]  [ 600/1251]  eta: 0:03:02  lr: 0.001143  min_lr: 0.001143  loss: 2.6670 (2.9047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7855 (1.0275)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [199]  [ 800/1251]  eta: 0:02:05  lr: 0.001140  min_lr: 0.001140  loss: 3.1407 (2.9079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7950 (1.0207)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [199]  [1000/1251]  eta: 0:01:09  lr: 0.001137  min_lr: 0.001137  loss: 2.7865 (2.8960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9013 (1.0047)  time: 0.2737  data: 0.0005  max mem: 21847
Epoch: [199]  [1200/1251]  eta: 0:00:14  lr: 0.001134  min_lr: 0.001134  loss: 2.7814 (2.9001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (1.0036)  time: 0.2762  data: 0.0004  max mem: 21847
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 2.7031 (2.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9377 (1.0073)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [199] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 2.7031 (2.9011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9377 (1.0073)
Test:  [ 0/25]  eta: 0:01:50  loss: 0.6488 (0.6488)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 4.4122  data: 4.2373  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7564 (0.7688)  acc1: 86.4000 (84.4000)  acc5: 97.6000 (97.4909)  time: 0.6816  data: 0.5419  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9440 (0.9167)  acc1: 78.0000 (80.9714)  acc5: 95.6000 (95.6381)  time: 0.2424  data: 0.1106  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0111 (0.9231)  acc1: 77.6000 (80.5760)  acc5: 94.8000 (95.6960)  time: 0.2042  data: 0.0751  max mem: 21847
Test: Total time: 0:00:10 (0.4082 s / it)
* Acc@1 80.904 Acc@5 95.582 loss 0.919
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.90%
Epoch: [200]  [   0/1251]  eta: 1:05:44  lr: 0.001133  min_lr: 0.001133  loss: 2.8228 (2.8228)  weight_decay: 0.0500 (0.0500)  time: 3.1533  data: 2.8216  max mem: 21847
Epoch: [200]  [ 200/1251]  eta: 0:05:03  lr: 0.001130  min_lr: 0.001130  loss: 2.8369 (2.9093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7686 (0.9071)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [200]  [ 400/1251]  eta: 0:04:00  lr: 0.001126  min_lr: 0.001126  loss: 3.2647 (2.8633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8402 (0.9076)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [200]  [ 600/1251]  eta: 0:03:02  lr: 0.001123  min_lr: 0.001123  loss: 2.9110 (2.8672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8111 (0.9188)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [200]  [ 800/1251]  eta: 0:02:05  lr: 0.001120  min_lr: 0.001120  loss: 3.2066 (2.8565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9207 (0.9154)  time: 0.2816  data: 0.0005  max mem: 21847
Epoch: [200]  [1000/1251]  eta: 0:01:09  lr: 0.001117  min_lr: 0.001117  loss: 3.2364 (2.8725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8944 (0.9568)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [200]  [1200/1251]  eta: 0:00:14  lr: 0.001114  min_lr: 0.001114  loss: 2.5884 (2.8726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3054 (0.9677)  time: 0.2704  data: 0.0004  max mem: 21847
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.7706 (2.8684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0990 (0.9746)  time: 0.2278  data: 0.0005  max mem: 21847
Epoch: [200] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.7706 (2.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0990 (0.9746)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6053 (0.6053)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.3221  data: 5.1572  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7388 (0.7376)  acc1: 85.6000 (83.7818)  acc5: 98.0000 (97.4909)  time: 0.7507  data: 0.6159  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9142 (0.8790)  acc1: 78.0000 (80.5143)  acc5: 95.6000 (95.6952)  time: 0.2398  data: 0.1091  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9663 (0.8843)  acc1: 78.0000 (80.3040)  acc5: 94.8000 (95.6320)  time: 0.2392  data: 0.1090  max mem: 21847
Test: Total time: 0:00:10 (0.4288 s / it)
* Acc@1 80.834 Acc@5 95.674 loss 0.878
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.90%
Epoch: [201]  [   0/1251]  eta: 1:05:52  lr: 0.001113  min_lr: 0.001113  loss: 2.9598 (2.9598)  weight_decay: 0.0500 (0.0500)  time: 3.1598  data: 2.4920  max mem: 21847
Epoch: [201]  [ 200/1251]  eta: 0:05:04  lr: 0.001110  min_lr: 0.001110  loss: 2.7355 (2.8375)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0075 (1.0463)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [201]  [ 400/1251]  eta: 0:03:59  lr: 0.001106  min_lr: 0.001106  loss: 3.0509 (2.8733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (1.0162)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [201]  [ 600/1251]  eta: 0:03:02  lr: 0.001103  min_lr: 0.001103  loss: 2.7237 (2.8746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8489 (1.0107)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [201]  [ 800/1251]  eta: 0:02:05  lr: 0.001100  min_lr: 0.001100  loss: 3.3250 (2.8881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8360 (0.9681)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [201]  [1000/1251]  eta: 0:01:09  lr: 0.001097  min_lr: 0.001097  loss: 2.7581 (2.8810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9217 (0.9776)  time: 0.2873  data: 0.0005  max mem: 21847
Epoch: [201]  [1200/1251]  eta: 0:00:14  lr: 0.001094  min_lr: 0.001094  loss: 2.7531 (2.8777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8427 (0.9673)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 3.2574 (2.8878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (0.9634)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [201] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 3.2574 (2.8970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (0.9634)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6892 (0.6892)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.7967  data: 5.6492  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8745 (0.8679)  acc1: 84.0000 (83.4182)  acc5: 97.6000 (97.3091)  time: 0.7533  data: 0.6199  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0542 (1.0027)  acc1: 78.4000 (80.5524)  acc5: 95.2000 (95.6191)  time: 0.2120  data: 0.0823  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0976 (1.0130)  acc1: 78.0000 (80.2240)  acc5: 94.8000 (95.5040)  time: 0.2106  data: 0.0823  max mem: 21847
Test: Total time: 0:00:10 (0.4250 s / it)
* Acc@1 80.870 Acc@5 95.690 loss 1.002
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.90%
Epoch: [202]  [   0/1251]  eta: 1:14:52  lr: 0.001093  min_lr: 0.001093  loss: 3.2902 (3.2902)  weight_decay: 0.0500 (0.0500)  time: 3.5913  data: 1.8485  max mem: 21847
Epoch: [202]  [ 200/1251]  eta: 0:05:08  lr: 0.001090  min_lr: 0.001090  loss: 2.9822 (2.8844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8760 (0.9281)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [202]  [ 400/1251]  eta: 0:04:01  lr: 0.001086  min_lr: 0.001086  loss: 2.7015 (2.8909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.9160)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [202]  [ 600/1251]  eta: 0:03:03  lr: 0.001083  min_lr: 0.001083  loss: 2.4872 (2.8884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9839 (0.9466)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [202]  [ 800/1251]  eta: 0:02:05  lr: 0.001080  min_lr: 0.001080  loss: 2.7560 (2.8910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9735 (0.9776)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [202]  [1000/1251]  eta: 0:01:09  lr: 0.001077  min_lr: 0.001077  loss: 3.0455 (2.8936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9354 (0.9703)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [202]  [1200/1251]  eta: 0:00:14  lr: 0.001074  min_lr: 0.001074  loss: 3.1264 (2.8870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9184 (0.9565)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 2.8949 (2.8859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0045 (0.9737)  time: 0.2275  data: 0.0005  max mem: 21847
Epoch: [202] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 2.8949 (2.8772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0045 (0.9737)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6251 (0.6251)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.4323  data: 5.2847  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8032 (0.7739)  acc1: 84.0000 (83.5273)  acc5: 97.2000 (97.2727)  time: 0.7396  data: 0.6069  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9068 (0.9191)  acc1: 78.0000 (80.6476)  acc5: 95.6000 (95.8286)  time: 0.2067  data: 0.0774  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9715 (0.9323)  acc1: 78.0000 (80.2720)  acc5: 95.2000 (95.7760)  time: 0.2010  data: 0.0727  max mem: 21847
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 80.926 Acc@5 95.706 loss 0.921
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.93%
Epoch: [203]  [   0/1251]  eta: 1:05:41  lr: 0.001073  min_lr: 0.001073  loss: 2.7764 (2.7764)  weight_decay: 0.0500 (0.0500)  time: 3.1507  data: 2.8464  max mem: 21847
Epoch: [203]  [ 200/1251]  eta: 0:05:05  lr: 0.001070  min_lr: 0.001070  loss: 2.2520 (2.8284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8695 (1.0470)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [203]  [ 400/1251]  eta: 0:04:01  lr: 0.001066  min_lr: 0.001066  loss: 2.4454 (2.8250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9261 (1.0127)  time: 0.2844  data: 0.0003  max mem: 21847
Epoch: [203]  [ 600/1251]  eta: 0:03:02  lr: 0.001063  min_lr: 0.001063  loss: 2.8287 (2.8367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8873 (1.0219)  time: 0.2712  data: 0.0005  max mem: 21847
Epoch: [203]  [ 800/1251]  eta: 0:02:05  lr: 0.001060  min_lr: 0.001060  loss: 2.8623 (2.8344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9377 (1.0144)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [203]  [1000/1251]  eta: 0:01:09  lr: 0.001057  min_lr: 0.001057  loss: 3.0858 (2.8525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (1.0188)  time: 0.2741  data: 0.0005  max mem: 21847
Epoch: [203]  [1200/1251]  eta: 0:00:14  lr: 0.001054  min_lr: 0.001054  loss: 2.5056 (2.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9137 (1.0008)  time: 0.2751  data: 0.0006  max mem: 21847
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 2.9071 (2.8704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8702 (0.9963)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [203] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 2.9071 (2.8786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8702 (0.9963)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6422 (0.6422)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.6896  data: 5.5213  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8213 (0.8159)  acc1: 84.4000 (83.7091)  acc5: 98.0000 (97.7818)  time: 0.7685  data: 0.6330  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9621 (0.9676)  acc1: 78.0000 (80.6476)  acc5: 96.0000 (95.8667)  time: 0.2075  data: 0.0776  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0512 (0.9773)  acc1: 78.0000 (80.4000)  acc5: 95.2000 (95.7120)  time: 0.2066  data: 0.0775  max mem: 21847
Test: Total time: 0:00:10 (0.4174 s / it)
* Acc@1 80.912 Acc@5 95.712 loss 0.964
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.93%
Epoch: [204]  [   0/1251]  eta: 1:08:44  lr: 0.001053  min_lr: 0.001053  loss: 1.7711 (1.7711)  weight_decay: 0.0500 (0.0500)  time: 3.2971  data: 2.5986  max mem: 21847
Epoch: [204]  [ 200/1251]  eta: 0:05:06  lr: 0.001050  min_lr: 0.001050  loss: 3.0795 (2.7930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8485 (1.0071)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [204]  [ 400/1251]  eta: 0:04:00  lr: 0.001047  min_lr: 0.001047  loss: 2.7264 (2.8409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8924 (0.9600)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [204]  [ 600/1251]  eta: 0:03:02  lr: 0.001044  min_lr: 0.001044  loss: 3.3138 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9571 (0.9762)  time: 0.2719  data: 0.0005  max mem: 21847
Epoch: [204]  [ 800/1251]  eta: 0:02:05  lr: 0.001040  min_lr: 0.001040  loss: 3.1326 (2.8372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8693 (0.9993)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [204]  [1000/1251]  eta: 0:01:09  lr: 0.001037  min_lr: 0.001037  loss: 2.9395 (2.8458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (1.0136)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [204]  [1200/1251]  eta: 0:00:14  lr: 0.001034  min_lr: 0.001034  loss: 3.2794 (2.8414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8474 (0.9926)  time: 0.2710  data: 0.0003  max mem: 21847
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 2.8434 (2.8412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9934 (1.0007)  time: 0.2278  data: 0.0005  max mem: 21847
Epoch: [204] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 2.8434 (2.8765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9934 (1.0007)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5728 (0.5728)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 5.7109  data: 5.5557  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7747 (0.7669)  acc1: 85.2000 (84.1091)  acc5: 97.2000 (97.2364)  time: 0.7454  data: 0.6119  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9393 (0.9072)  acc1: 78.0000 (81.0476)  acc5: 95.6000 (95.7714)  time: 0.2025  data: 0.0731  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9855 (0.9162)  acc1: 78.8000 (80.8160)  acc5: 94.8000 (95.6320)  time: 0.2020  data: 0.0738  max mem: 21847
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 81.142 Acc@5 95.706 loss 0.914
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.14%
Epoch: [205]  [   0/1251]  eta: 1:03:24  lr: 0.001033  min_lr: 0.001033  loss: 3.1174 (3.1174)  weight_decay: 0.0500 (0.0500)  time: 3.0409  data: 2.7328  max mem: 21847
Epoch: [205]  [ 200/1251]  eta: 0:05:01  lr: 0.001030  min_lr: 0.001030  loss: 2.4715 (2.8753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9411 (0.9732)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [205]  [ 400/1251]  eta: 0:03:58  lr: 0.001027  min_lr: 0.001027  loss: 3.2006 (2.8938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9204 (0.9631)  time: 0.2819  data: 0.0004  max mem: 21847
Epoch: [205]  [ 600/1251]  eta: 0:03:01  lr: 0.001024  min_lr: 0.001024  loss: 3.3175 (2.8791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0276 (nan)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [205]  [ 800/1251]  eta: 0:02:05  lr: 0.001021  min_lr: 0.001021  loss: 3.2587 (2.8835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8860 (nan)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [205]  [1000/1251]  eta: 0:01:09  lr: 0.001018  min_lr: 0.001018  loss: 3.1408 (2.8840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (nan)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [205]  [1200/1251]  eta: 0:00:14  lr: 0.001014  min_lr: 0.001014  loss: 3.0449 (2.8877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9181 (nan)  time: 0.2766  data: 0.0004  max mem: 21847
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 2.6404 (2.8858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (nan)  time: 0.2285  data: 0.0005  max mem: 21847
Epoch: [205] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 2.6404 (2.8840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6013 (0.6013)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.6811  data: 5.5240  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8177 (0.7716)  acc1: 85.2000 (84.1455)  acc5: 97.6000 (97.3818)  time: 0.7538  data: 0.6212  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9252 (0.9072)  acc1: 79.2000 (81.3714)  acc5: 96.0000 (95.8476)  time: 0.2047  data: 0.0758  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9760 (0.9139)  acc1: 79.2000 (81.0400)  acc5: 94.8000 (95.7120)  time: 0.2038  data: 0.0758  max mem: 21847
Test: Total time: 0:00:10 (0.4149 s / it)
* Acc@1 81.276 Acc@5 95.738 loss 0.910
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.28%
Epoch: [206]  [   0/1251]  eta: 1:00:21  lr: 0.001014  min_lr: 0.001014  loss: 3.6727 (3.6727)  weight_decay: 0.0500 (0.0500)  time: 2.8945  data: 2.5823  max mem: 21847
Epoch: [206]  [ 200/1251]  eta: 0:05:02  lr: 0.001011  min_lr: 0.001011  loss: 3.0194 (2.8706)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2367 (1.1446)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [206]  [ 400/1251]  eta: 0:03:58  lr: 0.001007  min_lr: 0.001007  loss: 3.0588 (2.8878)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [206]  [ 600/1251]  eta: 0:03:00  lr: 0.001004  min_lr: 0.001004  loss: 3.1423 (2.8936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9205 (nan)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [206]  [ 800/1251]  eta: 0:02:05  lr: 0.001001  min_lr: 0.001001  loss: 2.6104 (2.9063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8908 (nan)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [206]  [1000/1251]  eta: 0:01:09  lr: 0.000998  min_lr: 0.000998  loss: 3.1656 (2.8897)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0029 (nan)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [206]  [1200/1251]  eta: 0:00:14  lr: 0.000995  min_lr: 0.000995  loss: 3.0117 (2.8773)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0553 (nan)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 3.3379 (2.8805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (nan)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [206] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 3.3379 (2.8721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6623 (0.6623)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.5761  data: 5.4249  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8150 (0.8222)  acc1: 83.6000 (84.0000)  acc5: 98.0000 (97.4909)  time: 0.7390  data: 0.6054  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9727 (0.9595)  acc1: 78.8000 (80.9905)  acc5: 95.6000 (95.7714)  time: 0.2016  data: 0.0719  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0218 (0.9673)  acc1: 78.4000 (80.7200)  acc5: 94.8000 (95.7280)  time: 0.2117  data: 0.0834  max mem: 21847
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 81.104 Acc@5 95.682 loss 0.959
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.28%
Epoch: [207]  [   0/1251]  eta: 1:15:50  lr: 0.000994  min_lr: 0.000994  loss: 1.8036 (1.8036)  weight_decay: 0.0500 (0.0500)  time: 3.6374  data: 1.8332  max mem: 21847
Epoch: [207]  [ 200/1251]  eta: 0:05:08  lr: 0.000991  min_lr: 0.000991  loss: 3.4387 (2.9609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9023 (1.0008)  time: 0.2949  data: 0.0004  max mem: 21847
Epoch: [207]  [ 400/1251]  eta: 0:04:01  lr: 0.000988  min_lr: 0.000988  loss: 2.9500 (2.9260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8940 (0.9902)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [207]  [ 600/1251]  eta: 0:03:02  lr: 0.000985  min_lr: 0.000985  loss: 2.9071 (2.8881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9442 (1.0164)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [207]  [ 800/1251]  eta: 0:02:05  lr: 0.000982  min_lr: 0.000982  loss: 2.6279 (2.8687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (1.0284)  time: 0.2810  data: 0.0004  max mem: 21847
Epoch: [207]  [1000/1251]  eta: 0:01:09  lr: 0.000979  min_lr: 0.000979  loss: 2.9222 (2.8710)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0109 (1.0407)  time: 0.2792  data: 0.0005  max mem: 21847
Epoch: [207]  [1200/1251]  eta: 0:00:14  lr: 0.000976  min_lr: 0.000976  loss: 3.0215 (2.8677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0506 (1.0437)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 3.1577 (2.8696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9472 (1.0394)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [207] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 3.1577 (2.8674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9472 (1.0394)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6922 (0.6922)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6365  data: 5.4895  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8634 (0.8428)  acc1: 85.6000 (83.9636)  acc5: 97.2000 (96.9455)  time: 0.7490  data: 0.6175  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0087 (0.9785)  acc1: 78.4000 (81.0476)  acc5: 95.6000 (95.7143)  time: 0.2167  data: 0.0880  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0582 (0.9878)  acc1: 78.4000 (80.7520)  acc5: 95.6000 (95.5520)  time: 0.2154  data: 0.0879  max mem: 21847
Test: Total time: 0:00:10 (0.4226 s / it)
* Acc@1 81.162 Acc@5 95.578 loss 0.979
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.28%
Epoch: [208]  [   0/1251]  eta: 1:05:24  lr: 0.000975  min_lr: 0.000975  loss: 3.4472 (3.4472)  weight_decay: 0.0500 (0.0500)  time: 3.1371  data: 2.7397  max mem: 21847
Epoch: [208]  [ 200/1251]  eta: 0:05:04  lr: 0.000972  min_lr: 0.000972  loss: 3.1741 (2.7493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (0.9125)  time: 0.2760  data: 0.0004  max mem: 21847
Epoch: [208]  [ 400/1251]  eta: 0:04:00  lr: 0.000969  min_lr: 0.000969  loss: 2.8841 (2.7949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9104 (0.9403)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [208]  [ 600/1251]  eta: 0:03:02  lr: 0.000966  min_lr: 0.000966  loss: 2.8743 (2.8167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (0.9779)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [208]  [ 800/1251]  eta: 0:02:05  lr: 0.000963  min_lr: 0.000963  loss: 2.8846 (2.8237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8712 (0.9636)  time: 0.2734  data: 0.0003  max mem: 21847
Epoch: [208]  [1000/1251]  eta: 0:01:09  lr: 0.000960  min_lr: 0.000960  loss: 2.6677 (2.8222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8871 (0.9559)  time: 0.2821  data: 0.0005  max mem: 21847
Epoch: [208]  [1200/1251]  eta: 0:00:14  lr: 0.000956  min_lr: 0.000956  loss: 3.1515 (2.8327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9748 (0.9664)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 3.0872 (2.8311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9468 (0.9668)  time: 0.2285  data: 0.0007  max mem: 21847
Epoch: [208] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 3.0872 (2.8545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9468 (0.9668)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5909 (0.5909)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5771  data: 5.4314  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7942 (0.7674)  acc1: 84.4000 (83.7455)  acc5: 97.6000 (97.4182)  time: 0.7444  data: 0.6110  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9064 (0.8992)  acc1: 78.8000 (80.9714)  acc5: 96.4000 (95.8857)  time: 0.2139  data: 0.0840  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9729 (0.9098)  acc1: 78.8000 (80.6880)  acc5: 94.8000 (95.7600)  time: 0.2123  data: 0.0839  max mem: 21847
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 81.116 Acc@5 95.736 loss 0.902
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.28%
Epoch: [209]  [   0/1251]  eta: 1:08:24  lr: 0.000956  min_lr: 0.000956  loss: 3.7272 (3.7272)  weight_decay: 0.0500 (0.0500)  time: 3.2812  data: 2.9004  max mem: 21847
Epoch: [209]  [ 200/1251]  eta: 0:05:06  lr: 0.000953  min_lr: 0.000953  loss: 3.0366 (2.8023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9395 (0.9959)  time: 0.2751  data: 0.0004  max mem: 21847
Epoch: [209]  [ 400/1251]  eta: 0:04:00  lr: 0.000950  min_lr: 0.000950  loss: 2.8554 (2.7981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.9850)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [209]  [ 600/1251]  eta: 0:03:01  lr: 0.000947  min_lr: 0.000947  loss: 2.9144 (2.8232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0385 (0.9945)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [209]  [ 800/1251]  eta: 0:02:05  lr: 0.000944  min_lr: 0.000944  loss: 3.2507 (2.8369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8610 (0.9825)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [209]  [1000/1251]  eta: 0:01:09  lr: 0.000940  min_lr: 0.000940  loss: 3.1363 (2.8402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0282 (1.0023)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [209]  [1200/1251]  eta: 0:00:14  lr: 0.000937  min_lr: 0.000937  loss: 2.7042 (2.8430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9971 (1.0060)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 2.9519 (2.8428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0951 (1.0112)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [209] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 2.9519 (2.8541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0951 (1.0112)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5795 (0.5795)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6694  data: 5.5076  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7506 (0.7408)  acc1: 85.2000 (84.4727)  acc5: 98.0000 (97.6000)  time: 0.7318  data: 0.5982  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9188 (0.8738)  acc1: 78.8000 (81.1619)  acc5: 96.0000 (95.9619)  time: 0.1988  data: 0.0688  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9741 (0.8794)  acc1: 78.4000 (80.9440)  acc5: 95.2000 (95.9360)  time: 0.1981  data: 0.0687  max mem: 21847
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 81.228 Acc@5 95.780 loss 0.880
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.28%
Epoch: [210]  [   0/1251]  eta: 1:04:34  lr: 0.000937  min_lr: 0.000937  loss: 1.9731 (1.9731)  weight_decay: 0.0500 (0.0500)  time: 3.0974  data: 2.5190  max mem: 21847
Epoch: [210]  [ 200/1251]  eta: 0:05:07  lr: 0.000934  min_lr: 0.000934  loss: 2.5627 (2.8617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9817 (1.1864)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [210]  [ 400/1251]  eta: 0:04:00  lr: 0.000931  min_lr: 0.000931  loss: 2.7137 (2.8281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9284 (1.0807)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [210]  [ 600/1251]  eta: 0:03:02  lr: 0.000928  min_lr: 0.000928  loss: 2.4432 (2.8186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9158 (1.0375)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [210]  [ 800/1251]  eta: 0:02:05  lr: 0.000925  min_lr: 0.000925  loss: 2.8639 (2.8292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8685 (1.0199)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [210]  [1000/1251]  eta: 0:01:09  lr: 0.000922  min_lr: 0.000922  loss: 2.4989 (2.8341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8582 (0.9980)  time: 0.2726  data: 0.0005  max mem: 21847
Epoch: [210]  [1200/1251]  eta: 0:00:14  lr: 0.000918  min_lr: 0.000918  loss: 3.1535 (2.8370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9276 (1.0083)  time: 0.2712  data: 0.0005  max mem: 21847
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 2.8120 (2.8402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8608 (1.0048)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [210] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 2.8120 (2.8530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8608 (1.0048)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5929 (0.5929)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.8065  data: 5.6309  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7422 (0.7444)  acc1: 84.8000 (84.1091)  acc5: 97.6000 (97.5636)  time: 0.7359  data: 0.5941  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9048 (0.8831)  acc1: 78.8000 (81.2381)  acc5: 96.0000 (96.0000)  time: 0.1959  data: 0.0628  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0145 (0.8920)  acc1: 78.8000 (80.9280)  acc5: 95.2000 (95.9680)  time: 0.1936  data: 0.0627  max mem: 21847
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 81.328 Acc@5 95.760 loss 0.889
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.33%
Epoch: [211]  [   0/1251]  eta: 1:06:35  lr: 0.000918  min_lr: 0.000918  loss: 3.2902 (3.2902)  weight_decay: 0.0500 (0.0500)  time: 3.1940  data: 2.8214  max mem: 21847
Epoch: [211]  [ 200/1251]  eta: 0:05:07  lr: 0.000915  min_lr: 0.000915  loss: 2.8817 (2.8299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8885 (0.8978)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [211]  [ 400/1251]  eta: 0:04:01  lr: 0.000912  min_lr: 0.000912  loss: 2.9382 (2.8280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (0.9361)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [211]  [ 600/1251]  eta: 0:03:03  lr: 0.000909  min_lr: 0.000909  loss: 2.9817 (2.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9785 (0.9761)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [211]  [ 800/1251]  eta: 0:02:06  lr: 0.000906  min_lr: 0.000906  loss: 2.7312 (2.8298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8783 (1.0003)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [211]  [1000/1251]  eta: 0:01:09  lr: 0.000903  min_lr: 0.000903  loss: 3.0631 (2.8391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9207 (1.0086)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [211]  [1200/1251]  eta: 0:00:14  lr: 0.000900  min_lr: 0.000900  loss: 3.0850 (2.8477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8957 (1.0012)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 2.5843 (2.8435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9076 (1.0059)  time: 0.2288  data: 0.0006  max mem: 21847
Epoch: [211] Total time: 0:05:47 (0.2780 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 2.5843 (2.8556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9076 (1.0059)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5413 (0.5413)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.6838  data: 5.5175  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7581 (0.7491)  acc1: 85.2000 (84.6545)  acc5: 97.2000 (97.0909)  time: 0.7341  data: 0.6003  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9221 (0.8768)  acc1: 80.4000 (81.9238)  acc5: 95.6000 (95.7714)  time: 0.1918  data: 0.0629  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9692 (0.8842)  acc1: 79.2000 (81.6000)  acc5: 95.2000 (95.7120)  time: 0.1910  data: 0.0628  max mem: 21847
Test: Total time: 0:00:10 (0.4046 s / it)
* Acc@1 81.550 Acc@5 95.788 loss 0.881
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.55%
Epoch: [212]  [   0/1251]  eta: 0:59:47  lr: 0.000899  min_lr: 0.000899  loss: 2.5131 (2.5131)  weight_decay: 0.0500 (0.0500)  time: 2.8676  data: 2.4686  max mem: 21847
Epoch: [212]  [ 200/1251]  eta: 0:05:05  lr: 0.000896  min_lr: 0.000896  loss: 2.6130 (2.8425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9467 (0.9974)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [212]  [ 400/1251]  eta: 0:04:00  lr: 0.000893  min_lr: 0.000893  loss: 3.1564 (2.8521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9918 (1.0216)  time: 0.2831  data: 0.0005  max mem: 21847
Epoch: [212]  [ 600/1251]  eta: 0:03:02  lr: 0.000890  min_lr: 0.000890  loss: 3.0424 (2.8519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9959 (1.0245)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [212]  [ 800/1251]  eta: 0:02:05  lr: 0.000887  min_lr: 0.000887  loss: 2.3465 (2.8505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8715 (1.0020)  time: 0.2780  data: 0.0005  max mem: 21847
Epoch: [212]  [1000/1251]  eta: 0:01:09  lr: 0.000884  min_lr: 0.000884  loss: 3.0931 (2.8366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9259 (0.9929)  time: 0.2768  data: 0.0004  max mem: 21847
Epoch: [212]  [1200/1251]  eta: 0:00:14  lr: 0.000881  min_lr: 0.000881  loss: 2.2750 (2.8382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8953 (0.9846)  time: 0.2743  data: 0.0005  max mem: 21847
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 3.0629 (2.8368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8684 (0.9832)  time: 0.2301  data: 0.0005  max mem: 21847
Epoch: [212] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 3.0629 (2.8462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8684 (0.9832)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5916 (0.5916)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.6394  data: 5.4681  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7925 (0.7668)  acc1: 86.0000 (84.3636)  acc5: 97.2000 (97.5273)  time: 0.7267  data: 0.5896  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9104 (0.9178)  acc1: 78.4000 (80.8381)  acc5: 96.4000 (95.8476)  time: 0.1954  data: 0.0648  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0270 (0.9254)  acc1: 78.4000 (80.5440)  acc5: 95.2000 (95.8080)  time: 0.1930  data: 0.0648  max mem: 21847
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 81.138 Acc@5 95.772 loss 0.917
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.55%
Epoch: [213]  [   0/1251]  eta: 1:08:32  lr: 0.000880  min_lr: 0.000880  loss: 2.3749 (2.3749)  weight_decay: 0.0500 (0.0500)  time: 3.2877  data: 2.8960  max mem: 21847
Epoch: [213]  [ 200/1251]  eta: 0:05:05  lr: 0.000877  min_lr: 0.000877  loss: 2.5287 (2.7868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0627 (1.1781)  time: 0.2723  data: 0.0006  max mem: 21847
Epoch: [213]  [ 400/1251]  eta: 0:04:00  lr: 0.000874  min_lr: 0.000874  loss: 2.7247 (2.7843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8513 (1.0618)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [213]  [ 600/1251]  eta: 0:03:02  lr: 0.000871  min_lr: 0.000871  loss: 2.6786 (2.7988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9142 (1.0870)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [213]  [ 800/1251]  eta: 0:02:05  lr: 0.000868  min_lr: 0.000868  loss: 2.6463 (2.8106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0696 (nan)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [213]  [1000/1251]  eta: 0:01:09  lr: 0.000865  min_lr: 0.000865  loss: 3.1249 (2.8321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0636 (nan)  time: 0.2702  data: 0.0005  max mem: 21847
Epoch: [213]  [1200/1251]  eta: 0:00:14  lr: 0.000863  min_lr: 0.000863  loss: 2.8501 (2.8359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8283 (nan)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 2.5889 (2.8314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (nan)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [213] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 2.5889 (2.8480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5478 (0.5478)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.5280  data: 5.3476  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7600 (0.7497)  acc1: 85.2000 (84.4364)  acc5: 97.2000 (97.2727)  time: 0.7397  data: 0.6026  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8856 (0.8948)  acc1: 80.0000 (81.1619)  acc5: 96.0000 (95.7524)  time: 0.2052  data: 0.0751  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9883 (0.9011)  acc1: 78.4000 (80.9280)  acc5: 95.6000 (95.8080)  time: 0.2033  data: 0.0749  max mem: 21847
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 81.404 Acc@5 95.862 loss 0.891
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.55%
Epoch: [214]  [   0/1251]  eta: 1:11:23  lr: 0.000862  min_lr: 0.000862  loss: 3.8190 (3.8190)  weight_decay: 0.0500 (0.0500)  time: 3.4242  data: 2.6832  max mem: 21847
Epoch: [214]  [ 200/1251]  eta: 0:05:05  lr: 0.000859  min_lr: 0.000859  loss: 2.6034 (2.7775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8992 (1.1481)  time: 0.2714  data: 0.0003  max mem: 21847
Epoch: [214]  [ 400/1251]  eta: 0:04:00  lr: 0.000856  min_lr: 0.000856  loss: 2.9923 (2.8079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8352 (1.0981)  time: 0.2823  data: 0.0004  max mem: 21847
Epoch: [214]  [ 600/1251]  eta: 0:03:02  lr: 0.000853  min_lr: 0.000853  loss: 2.7270 (2.8104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9363 (1.0667)  time: 0.2729  data: 0.0003  max mem: 21847
Epoch: [214]  [ 800/1251]  eta: 0:02:05  lr: 0.000850  min_lr: 0.000850  loss: 2.8451 (2.8121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (1.0735)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [214]  [1000/1251]  eta: 0:01:09  lr: 0.000847  min_lr: 0.000847  loss: 2.9024 (2.8223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9503 (1.0695)  time: 0.2739  data: 0.0006  max mem: 21847
Epoch: [214]  [1200/1251]  eta: 0:00:14  lr: 0.000844  min_lr: 0.000844  loss: 2.2448 (2.8246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9089 (1.0650)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 3.0722 (2.8263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9086 (1.0629)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [214] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 3.0722 (2.8287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9086 (1.0629)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6130 (0.6130)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6792  data: 5.5261  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8060 (0.7895)  acc1: 86.4000 (84.3636)  acc5: 97.6000 (97.6000)  time: 0.7426  data: 0.6088  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9272 (0.9185)  acc1: 78.4000 (81.4095)  acc5: 96.4000 (96.1524)  time: 0.1950  data: 0.0654  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0094 (0.9274)  acc1: 78.4000 (81.2160)  acc5: 95.6000 (96.0640)  time: 0.1936  data: 0.0653  max mem: 21847
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 81.412 Acc@5 95.950 loss 0.921
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.55%
Epoch: [215]  [   0/1251]  eta: 1:11:42  lr: 0.000843  min_lr: 0.000843  loss: 2.6762 (2.6762)  weight_decay: 0.0500 (0.0500)  time: 3.4394  data: 1.6161  max mem: 21847
Epoch: [215]  [ 200/1251]  eta: 0:05:04  lr: 0.000841  min_lr: 0.000841  loss: 3.3526 (2.8271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9822 (0.9613)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [215]  [ 400/1251]  eta: 0:03:59  lr: 0.000838  min_lr: 0.000838  loss: 3.2617 (2.8098)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0793 (1.0230)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [215]  [ 600/1251]  eta: 0:03:01  lr: 0.000835  min_lr: 0.000835  loss: 3.0784 (2.8322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9278 (1.0073)  time: 0.2850  data: 0.0004  max mem: 21847
Epoch: [215]  [ 800/1251]  eta: 0:02:05  lr: 0.000832  min_lr: 0.000832  loss: 2.8569 (2.8418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0865 (1.0178)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [215]  [1000/1251]  eta: 0:01:09  lr: 0.000829  min_lr: 0.000829  loss: 2.5712 (2.8455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (1.0146)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [215]  [1200/1251]  eta: 0:00:14  lr: 0.000826  min_lr: 0.000826  loss: 2.8132 (2.8437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0487 (1.0187)  time: 0.2821  data: 0.0005  max mem: 21847
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 3.0326 (2.8459)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (1.0285)  time: 0.2282  data: 0.0011  max mem: 21847
Epoch: [215] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 3.0326 (2.8362)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (1.0285)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5682 (0.5682)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5005  data: 5.3320  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8148 (0.7823)  acc1: 84.8000 (84.7273)  acc5: 97.6000 (97.6000)  time: 0.7347  data: 0.5984  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9449 (0.9306)  acc1: 80.0000 (81.6381)  acc5: 96.0000 (96.0571)  time: 0.2121  data: 0.0818  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0408 (0.9403)  acc1: 80.0000 (81.2480)  acc5: 95.6000 (95.9840)  time: 0.2108  data: 0.0817  max mem: 21847
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 81.502 Acc@5 95.808 loss 0.940
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.55%
Epoch: [216]  [   0/1251]  eta: 1:09:24  lr: 0.000825  min_lr: 0.000825  loss: 3.6522 (3.6522)  weight_decay: 0.0500 (0.0500)  time: 3.3291  data: 2.3459  max mem: 21847
Epoch: [216]  [ 200/1251]  eta: 0:05:05  lr: 0.000822  min_lr: 0.000822  loss: 2.9213 (2.7946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9396 (1.0053)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [216]  [ 400/1251]  eta: 0:03:59  lr: 0.000819  min_lr: 0.000819  loss: 2.9091 (2.7893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8519 (0.9981)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [216]  [ 600/1251]  eta: 0:03:01  lr: 0.000817  min_lr: 0.000817  loss: 2.9733 (2.7955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0743 (1.0308)  time: 0.2808  data: 0.0004  max mem: 21847
Epoch: [216]  [ 800/1251]  eta: 0:02:05  lr: 0.000814  min_lr: 0.000814  loss: 2.4455 (2.8060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9540 (1.0410)  time: 0.2797  data: 0.0004  max mem: 21847
Epoch: [216]  [1000/1251]  eta: 0:01:09  lr: 0.000811  min_lr: 0.000811  loss: 3.0431 (2.8137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8557 (1.0284)  time: 0.2772  data: 0.0004  max mem: 21847
Epoch: [216]  [1200/1251]  eta: 0:00:14  lr: 0.000808  min_lr: 0.000808  loss: 2.3562 (2.8111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9266 (1.0208)  time: 0.2733  data: 0.0005  max mem: 21847
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.6609 (2.8138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9534 (1.0196)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [216] Total time: 0:05:45 (0.2763 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.6609 (2.8301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9534 (1.0196)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5681 (0.5681)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.4868  data: 5.3233  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7804 (0.7638)  acc1: 85.2000 (85.1273)  acc5: 98.0000 (97.5273)  time: 0.7453  data: 0.6077  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9028 (0.9027)  acc1: 79.6000 (81.5429)  acc5: 96.0000 (96.0571)  time: 0.2178  data: 0.0846  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9882 (0.9121)  acc1: 79.6000 (81.0880)  acc5: 95.2000 (96.0480)  time: 0.2151  data: 0.0845  max mem: 21847
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 81.522 Acc@5 95.916 loss 0.908
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.55%
Epoch: [217]  [   0/1251]  eta: 1:02:40  lr: 0.000807  min_lr: 0.000807  loss: 2.8583 (2.8583)  weight_decay: 0.0500 (0.0500)  time: 3.0060  data: 1.5773  max mem: 21847
Epoch: [217]  [ 200/1251]  eta: 0:05:02  lr: 0.000804  min_lr: 0.000804  loss: 2.7945 (2.8148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9657 (1.0114)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [217]  [ 400/1251]  eta: 0:03:59  lr: 0.000801  min_lr: 0.000801  loss: 3.1024 (2.8108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0764 (1.0212)  time: 0.2818  data: 0.0004  max mem: 21847
Epoch: [217]  [ 600/1251]  eta: 0:03:02  lr: 0.000799  min_lr: 0.000799  loss: 2.6811 (2.8210)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2080 (1.0802)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [217]  [ 800/1251]  eta: 0:02:05  lr: 0.000796  min_lr: 0.000796  loss: 2.8877 (2.8225)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2874 (1.1433)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [217]  [1000/1251]  eta: 0:01:09  lr: 0.000793  min_lr: 0.000793  loss: 2.6153 (2.8164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9170 (1.1017)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [217]  [1200/1251]  eta: 0:00:14  lr: 0.000790  min_lr: 0.000790  loss: 2.7308 (2.8188)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1573 (1.0987)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 3.0087 (2.8190)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0874 (1.0982)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [217] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 3.0087 (2.8237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0874 (1.0982)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6569 (0.6569)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5316  data: 5.3699  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8464 (0.8188)  acc1: 84.8000 (84.8364)  acc5: 97.6000 (97.4182)  time: 0.7499  data: 0.6150  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9722 (0.9454)  acc1: 79.6000 (81.6381)  acc5: 96.0000 (96.1524)  time: 0.2189  data: 0.0887  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0293 (0.9530)  acc1: 79.6000 (81.3280)  acc5: 96.0000 (96.1600)  time: 0.2200  data: 0.0886  max mem: 21847
Test: Total time: 0:00:10 (0.4220 s / it)
* Acc@1 81.356 Acc@5 95.832 loss 0.949
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.55%
Epoch: [218]  [   0/1251]  eta: 1:06:12  lr: 0.000789  min_lr: 0.000789  loss: 3.5595 (3.5595)  weight_decay: 0.0500 (0.0500)  time: 3.1758  data: 2.4847  max mem: 21847
Epoch: [218]  [ 200/1251]  eta: 0:05:04  lr: 0.000786  min_lr: 0.000786  loss: 2.9056 (2.8296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9380 (0.9884)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [218]  [ 400/1251]  eta: 0:03:59  lr: 0.000784  min_lr: 0.000784  loss: 3.2213 (2.8510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9637 (0.9874)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [218]  [ 600/1251]  eta: 0:03:02  lr: 0.000781  min_lr: 0.000781  loss: 2.8851 (2.8356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0020 (1.0278)  time: 0.2739  data: 0.0003  max mem: 21847
Epoch: [218]  [ 800/1251]  eta: 0:02:05  lr: 0.000778  min_lr: 0.000778  loss: 2.5676 (2.8382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9840 (1.0173)  time: 0.2742  data: 0.0005  max mem: 21847
Epoch: [218]  [1000/1251]  eta: 0:01:09  lr: 0.000775  min_lr: 0.000775  loss: 2.6176 (2.8377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8756 (1.0104)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [218]  [1200/1251]  eta: 0:00:14  lr: 0.000772  min_lr: 0.000772  loss: 2.9411 (2.8396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1401 (1.0214)  time: 0.2834  data: 0.0005  max mem: 21847
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 3.2298 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.0202)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [218] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 3.2298 (2.8295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.0202)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6144 (0.6144)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.4357  data: 5.2593  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8373 (0.7981)  acc1: 84.4000 (84.4364)  acc5: 97.6000 (97.4909)  time: 0.7504  data: 0.6130  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9616 (0.9242)  acc1: 80.0000 (81.4667)  acc5: 96.4000 (96.0571)  time: 0.2212  data: 0.0901  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0157 (0.9366)  acc1: 78.8000 (81.1040)  acc5: 95.2000 (95.9360)  time: 0.2193  data: 0.0900  max mem: 21847
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 81.338 Acc@5 95.898 loss 0.931
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.55%
Epoch: [219]  [   0/1251]  eta: 1:10:22  lr: 0.000771  min_lr: 0.000771  loss: 2.0249 (2.0249)  weight_decay: 0.0500 (0.0500)  time: 3.3750  data: 2.4840  max mem: 21847
Epoch: [219]  [ 200/1251]  eta: 0:05:05  lr: 0.000769  min_lr: 0.000769  loss: 3.0770 (2.8342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0135 (1.0826)  time: 0.2720  data: 0.0003  max mem: 21847
Epoch: [219]  [ 400/1251]  eta: 0:04:00  lr: 0.000766  min_lr: 0.000766  loss: 2.8114 (2.8405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9063 (1.0819)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [219]  [ 600/1251]  eta: 0:03:02  lr: 0.000763  min_lr: 0.000763  loss: 3.1966 (2.8714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (1.0176)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [219]  [ 800/1251]  eta: 0:02:05  lr: 0.000760  min_lr: 0.000760  loss: 2.7930 (2.8625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1326 (1.0401)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [219]  [1000/1251]  eta: 0:01:09  lr: 0.000757  min_lr: 0.000757  loss: 2.4165 (2.8432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8658 (1.0146)  time: 0.2741  data: 0.0003  max mem: 21847
Epoch: [219]  [1200/1251]  eta: 0:00:14  lr: 0.000755  min_lr: 0.000755  loss: 2.9838 (2.8342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0434 (1.0034)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 3.0468 (2.8345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0187 (1.0114)  time: 0.2278  data: 0.0006  max mem: 21847
Epoch: [219] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 3.0468 (2.8246)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0187 (1.0114)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5654 (0.5654)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6818  data: 5.5331  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7422 (0.7502)  acc1: 85.6000 (84.1455)  acc5: 97.6000 (97.3091)  time: 0.7548  data: 0.6224  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9305 (0.8819)  acc1: 79.6000 (81.6571)  acc5: 96.0000 (96.0191)  time: 0.2167  data: 0.0876  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9674 (0.8910)  acc1: 79.6000 (81.2800)  acc5: 95.2000 (95.9520)  time: 0.2158  data: 0.0876  max mem: 21847
Test: Total time: 0:00:10 (0.4242 s / it)
* Acc@1 81.472 Acc@5 95.806 loss 0.886
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.55%
Epoch: [220]  [   0/1251]  eta: 1:10:56  lr: 0.000754  min_lr: 0.000754  loss: 2.5926 (2.5926)  weight_decay: 0.0500 (0.0500)  time: 3.4027  data: 3.0423  max mem: 21847
Epoch: [220]  [ 200/1251]  eta: 0:05:09  lr: 0.000751  min_lr: 0.000751  loss: 2.7050 (2.8575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0022 (1.0792)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [220]  [ 400/1251]  eta: 0:04:02  lr: 0.000748  min_lr: 0.000748  loss: 2.7417 (2.8374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9286 (1.0711)  time: 0.2836  data: 0.0004  max mem: 21847
Epoch: [220]  [ 600/1251]  eta: 0:03:02  lr: 0.000745  min_lr: 0.000745  loss: 2.9322 (2.8348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0634 (1.0949)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [220]  [ 800/1251]  eta: 0:02:06  lr: 0.000743  min_lr: 0.000743  loss: 3.0306 (2.8393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0164 (1.1017)  time: 0.2790  data: 0.0004  max mem: 21847
Epoch: [220]  [1000/1251]  eta: 0:01:09  lr: 0.000740  min_lr: 0.000740  loss: 2.3959 (2.8258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1763 (1.0951)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [220]  [1200/1251]  eta: 0:00:14  lr: 0.000737  min_lr: 0.000737  loss: 3.3721 (2.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9831 (1.1016)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 3.0987 (2.8273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9046 (1.0998)  time: 0.2277  data: 0.0007  max mem: 21847
Epoch: [220] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 3.0987 (2.8188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9046 (1.0998)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6142 (0.6142)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.6029  data: 5.4553  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8284 (0.8177)  acc1: 84.4000 (84.3273)  acc5: 97.6000 (97.6000)  time: 0.7131  data: 0.5784  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9638 (0.9408)  acc1: 79.6000 (81.5619)  acc5: 96.0000 (95.8286)  time: 0.1877  data: 0.0567  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0049 (0.9529)  acc1: 79.2000 (81.1360)  acc5: 95.2000 (95.7280)  time: 0.2016  data: 0.0721  max mem: 21847
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 81.654 Acc@5 95.882 loss 0.943
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.65%
Epoch: [221]  [   0/1251]  eta: 1:10:34  lr: 0.000736  min_lr: 0.000736  loss: 3.4378 (3.4378)  weight_decay: 0.0500 (0.0500)  time: 3.3850  data: 3.0637  max mem: 21847
Epoch: [221]  [ 200/1251]  eta: 0:05:03  lr: 0.000734  min_lr: 0.000734  loss: 2.2290 (2.7190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8789 (0.9653)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [221]  [ 400/1251]  eta: 0:03:59  lr: 0.000731  min_lr: 0.000731  loss: 3.1620 (2.7921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8616 (0.9637)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [221]  [ 600/1251]  eta: 0:03:01  lr: 0.000728  min_lr: 0.000728  loss: 2.9560 (2.8030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0732 (0.9946)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [221]  [ 800/1251]  eta: 0:02:05  lr: 0.000725  min_lr: 0.000725  loss: 3.1327 (2.8104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9027 (1.0034)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [221]  [1000/1251]  eta: 0:01:09  lr: 0.000722  min_lr: 0.000722  loss: 3.0188 (2.8241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0472 (1.0412)  time: 0.2835  data: 0.0003  max mem: 21847
Epoch: [221]  [1200/1251]  eta: 0:00:14  lr: 0.000720  min_lr: 0.000720  loss: 2.9662 (2.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9017 (1.0314)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.6824 (2.8206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9909 (1.0289)  time: 0.2279  data: 0.0009  max mem: 21847
Epoch: [221] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.6824 (2.8089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9909 (1.0289)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5326 (0.5326)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7396  data: 5.5777  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7677 (0.7348)  acc1: 86.0000 (84.7636)  acc5: 97.6000 (97.4182)  time: 0.7180  data: 0.5853  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8818 (0.8560)  acc1: 80.0000 (82.0952)  acc5: 96.0000 (96.0000)  time: 0.1842  data: 0.0555  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9516 (0.8656)  acc1: 79.2000 (81.5520)  acc5: 95.6000 (95.9840)  time: 0.1910  data: 0.0630  max mem: 21847
Test: Total time: 0:00:10 (0.4064 s / it)
* Acc@1 81.516 Acc@5 95.982 loss 0.861
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.65%
Epoch: [222]  [   0/1251]  eta: 1:09:53  lr: 0.000719  min_lr: 0.000719  loss: 3.7068 (3.7068)  weight_decay: 0.0500 (0.0500)  time: 3.3517  data: 2.6853  max mem: 21847
Epoch: [222]  [ 200/1251]  eta: 0:05:05  lr: 0.000716  min_lr: 0.000716  loss: 2.9057 (2.7939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2324 (1.1191)  time: 0.2811  data: 0.0004  max mem: 21847
Epoch: [222]  [ 400/1251]  eta: 0:04:00  lr: 0.000714  min_lr: 0.000714  loss: 2.2748 (2.7832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8266 (1.0770)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [222]  [ 600/1251]  eta: 0:03:02  lr: 0.000711  min_lr: 0.000711  loss: 3.0716 (2.7979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9539 (1.0398)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [222]  [ 800/1251]  eta: 0:02:05  lr: 0.000708  min_lr: 0.000708  loss: 3.1784 (2.7923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9548 (1.0341)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [222]  [1000/1251]  eta: 0:01:09  lr: 0.000705  min_lr: 0.000705  loss: 2.5672 (2.7978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8733 (1.0415)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [222]  [1200/1251]  eta: 0:00:14  lr: 0.000703  min_lr: 0.000703  loss: 2.8844 (2.7983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0422 (1.0440)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.7621 (2.7932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0698 (1.0472)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [222] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.7621 (2.8122)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0698 (1.0472)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5522 (0.5522)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.8042  data: 5.6512  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8092 (0.7438)  acc1: 83.6000 (84.5091)  acc5: 97.6000 (97.4182)  time: 0.7449  data: 0.6112  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9181 (0.8727)  acc1: 80.0000 (81.4476)  acc5: 96.0000 (96.0762)  time: 0.1974  data: 0.0679  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9422 (0.8852)  acc1: 79.2000 (81.1520)  acc5: 95.6000 (95.9840)  time: 0.2173  data: 0.0890  max mem: 21847
Test: Total time: 0:00:10 (0.4307 s / it)
* Acc@1 81.466 Acc@5 95.964 loss 0.874
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.65%
Epoch: [223]  [   0/1251]  eta: 1:04:33  lr: 0.000702  min_lr: 0.000702  loss: 3.4770 (3.4770)  weight_decay: 0.0500 (0.0500)  time: 3.0963  data: 2.7086  max mem: 21847
Epoch: [223]  [ 200/1251]  eta: 0:05:05  lr: 0.000699  min_lr: 0.000699  loss: 3.0020 (2.7883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9550 (1.1076)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [223]  [ 400/1251]  eta: 0:04:00  lr: 0.000696  min_lr: 0.000696  loss: 3.1868 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2090 (1.1668)  time: 0.2842  data: 0.0004  max mem: 21847
Epoch: [223]  [ 600/1251]  eta: 0:03:02  lr: 0.000694  min_lr: 0.000694  loss: 3.1477 (2.8103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9284 (1.1259)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [223]  [ 800/1251]  eta: 0:02:05  lr: 0.000691  min_lr: 0.000691  loss: 2.9656 (2.8106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0076 (1.1188)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [223]  [1000/1251]  eta: 0:01:09  lr: 0.000688  min_lr: 0.000688  loss: 2.9935 (2.8060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0952 (1.1525)  time: 0.2721  data: 0.0003  max mem: 21847
Epoch: [223]  [1200/1251]  eta: 0:00:14  lr: 0.000686  min_lr: 0.000686  loss: 2.6350 (2.8028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9982 (1.1512)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 3.1729 (2.8022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1009 (1.1553)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [223] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 3.1729 (2.7926)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1009 (1.1553)
Test:  [ 0/25]  eta: 0:01:46  loss: 0.6668 (0.6668)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 4.2522  data: 4.0850  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8892 (0.8304)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.3091)  time: 0.7039  data: 0.5697  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 1.0002 (0.9687)  acc1: 78.8000 (81.2381)  acc5: 95.6000 (95.9048)  time: 0.2631  data: 0.1339  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0553 (0.9772)  acc1: 78.8000 (81.0720)  acc5: 94.8000 (95.7440)  time: 0.2349  data: 0.1067  max mem: 21847
Test: Total time: 0:00:10 (0.4058 s / it)
* Acc@1 81.510 Acc@5 95.874 loss 0.966
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.65%
Epoch: [224]  [   0/1251]  eta: 1:13:30  lr: 0.000685  min_lr: 0.000685  loss: 2.9552 (2.9552)  weight_decay: 0.0500 (0.0500)  time: 3.5253  data: 3.1600  max mem: 21847
Epoch: [224]  [ 200/1251]  eta: 0:05:06  lr: 0.000682  min_lr: 0.000682  loss: 2.9036 (2.8490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9643 (1.0400)  time: 0.2819  data: 0.0004  max mem: 21847
Epoch: [224]  [ 400/1251]  eta: 0:04:00  lr: 0.000680  min_lr: 0.000680  loss: 3.1700 (2.8061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9751 (1.0043)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [224]  [ 600/1251]  eta: 0:03:02  lr: 0.000677  min_lr: 0.000677  loss: 2.2922 (2.7971)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [224]  [ 800/1251]  eta: 0:02:05  lr: 0.000674  min_lr: 0.000674  loss: 2.8011 (2.8031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (nan)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [224]  [1000/1251]  eta: 0:01:09  lr: 0.000671  min_lr: 0.000671  loss: 2.8044 (2.8051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (nan)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [224]  [1200/1251]  eta: 0:00:14  lr: 0.000669  min_lr: 0.000669  loss: 3.0294 (2.8057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8626 (nan)  time: 0.2933  data: 0.0004  max mem: 21847
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.9852 (2.8047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8998 (nan)  time: 0.2286  data: 0.0010  max mem: 21847
Epoch: [224] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.9852 (2.7915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8998 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6483 (0.6483)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 5.5451  data: 5.3898  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8628 (0.8175)  acc1: 86.0000 (84.8000)  acc5: 97.2000 (97.4182)  time: 0.7342  data: 0.5974  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9707 (0.9552)  acc1: 78.8000 (81.5810)  acc5: 96.4000 (95.9810)  time: 0.2079  data: 0.0767  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0046 (0.9622)  acc1: 78.8000 (81.3920)  acc5: 95.2000 (95.9520)  time: 0.2044  data: 0.0766  max mem: 21847
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 81.550 Acc@5 95.932 loss 0.956
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.65%
Epoch: [225]  [   0/1251]  eta: 1:08:27  lr: 0.000668  min_lr: 0.000668  loss: 1.8688 (1.8688)  weight_decay: 0.0500 (0.0500)  time: 3.2834  data: 2.8591  max mem: 21847
Epoch: [225]  [ 200/1251]  eta: 0:05:04  lr: 0.000665  min_lr: 0.000665  loss: 2.5325 (2.8255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0810 (1.0629)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [225]  [ 400/1251]  eta: 0:03:59  lr: 0.000663  min_lr: 0.000663  loss: 3.0590 (2.8191)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0980 (1.1069)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [225]  [ 600/1251]  eta: 0:03:02  lr: 0.000660  min_lr: 0.000660  loss: 2.7319 (2.8038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (1.0825)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [225]  [ 800/1251]  eta: 0:02:05  lr: 0.000657  min_lr: 0.000657  loss: 3.0106 (2.8099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9373 (1.0674)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [225]  [1000/1251]  eta: 0:01:09  lr: 0.000655  min_lr: 0.000655  loss: 2.7374 (2.8041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0632 (1.0651)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [225]  [1200/1251]  eta: 0:00:14  lr: 0.000652  min_lr: 0.000652  loss: 2.4653 (2.8066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0147 (1.0996)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.6758 (2.8103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0967 (1.1073)  time: 0.2372  data: 0.0005  max mem: 21847
Epoch: [225] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.6758 (2.8030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0967 (1.1073)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5606 (0.5606)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.4116  data: 5.2597  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7711 (0.7459)  acc1: 85.6000 (85.1273)  acc5: 97.6000 (97.5273)  time: 0.7330  data: 0.5988  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9011 (0.8837)  acc1: 80.4000 (82.2667)  acc5: 95.6000 (96.0191)  time: 0.2140  data: 0.0841  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9898 (0.8940)  acc1: 80.0000 (81.7280)  acc5: 95.6000 (95.9520)  time: 0.2123  data: 0.0840  max mem: 21847
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 81.930 Acc@5 95.892 loss 0.888
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.93%
Epoch: [226]  [   0/1251]  eta: 1:04:27  lr: 0.000651  min_lr: 0.000651  loss: 2.8965 (2.8965)  weight_decay: 0.0500 (0.0500)  time: 3.0915  data: 2.7890  max mem: 21847
Epoch: [226]  [ 200/1251]  eta: 0:05:03  lr: 0.000649  min_lr: 0.000649  loss: 2.9854 (2.8893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9238 (1.0488)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [226]  [ 400/1251]  eta: 0:03:59  lr: 0.000646  min_lr: 0.000646  loss: 2.6296 (2.7773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9071 (1.0274)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [226]  [ 600/1251]  eta: 0:03:01  lr: 0.000644  min_lr: 0.000644  loss: 2.7597 (2.7603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9208 (1.0478)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [226]  [ 800/1251]  eta: 0:02:05  lr: 0.000641  min_lr: 0.000641  loss: 2.4544 (2.7563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8816 (1.0150)  time: 0.2844  data: 0.0004  max mem: 21847
Epoch: [226]  [1000/1251]  eta: 0:01:09  lr: 0.000638  min_lr: 0.000638  loss: 2.7506 (2.7655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0231)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [226]  [1200/1251]  eta: 0:00:14  lr: 0.000636  min_lr: 0.000636  loss: 3.0461 (2.7695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9480 (1.0237)  time: 0.2840  data: 0.0004  max mem: 21847
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 2.5596 (2.7701)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0492 (1.0280)  time: 0.2275  data: 0.0006  max mem: 21847
Epoch: [226] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 2.5596 (2.7813)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0492 (1.0280)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6019 (0.6019)  acc1: 89.6000 (89.6000)  acc5: 97.6000 (97.6000)  time: 5.6213  data: 5.4588  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8092 (0.7622)  acc1: 84.4000 (84.9091)  acc5: 97.6000 (97.4182)  time: 0.7437  data: 0.6098  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9142 (0.8934)  acc1: 79.6000 (81.5238)  acc5: 96.0000 (95.9238)  time: 0.2055  data: 0.0763  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9797 (0.9017)  acc1: 78.8000 (81.1680)  acc5: 94.8000 (95.8080)  time: 0.2052  data: 0.0762  max mem: 21847
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 81.780 Acc@5 96.006 loss 0.890
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.93%
Epoch: [227]  [   0/1251]  eta: 1:05:42  lr: 0.000635  min_lr: 0.000635  loss: 2.3770 (2.3770)  weight_decay: 0.0500 (0.0500)  time: 3.1515  data: 1.9669  max mem: 21847
Epoch: [227]  [ 200/1251]  eta: 0:05:06  lr: 0.000632  min_lr: 0.000632  loss: 2.5137 (2.8786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9851 (1.0768)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [227]  [ 400/1251]  eta: 0:04:00  lr: 0.000630  min_lr: 0.000630  loss: 3.0113 (2.8331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9532 (1.0435)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [227]  [ 600/1251]  eta: 0:03:01  lr: 0.000627  min_lr: 0.000627  loss: 2.5085 (2.7966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0869 (1.0562)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [227]  [ 800/1251]  eta: 0:02:05  lr: 0.000625  min_lr: 0.000625  loss: 2.9492 (2.7862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (1.0552)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [227]  [1000/1251]  eta: 0:01:09  lr: 0.000622  min_lr: 0.000622  loss: 2.6711 (2.7894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0914 (1.0574)  time: 0.2900  data: 0.0004  max mem: 21847
Epoch: [227]  [1200/1251]  eta: 0:00:14  lr: 0.000619  min_lr: 0.000619  loss: 2.8342 (2.7847)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0945 (1.0733)  time: 0.2730  data: 0.0005  max mem: 21847
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 2.4783 (2.7829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (1.0710)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [227] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 2.4783 (2.7760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (1.0710)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5738 (0.5738)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 5.4689  data: 5.3211  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7742 (0.7298)  acc1: 85.2000 (84.9818)  acc5: 97.6000 (97.5273)  time: 0.7499  data: 0.6157  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8915 (0.8646)  acc1: 80.8000 (82.0000)  acc5: 96.0000 (95.9810)  time: 0.2149  data: 0.0848  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9536 (0.8723)  acc1: 80.4000 (81.6160)  acc5: 95.6000 (96.0640)  time: 0.2126  data: 0.0835  max mem: 21847
Test: Total time: 0:00:10 (0.4146 s / it)
* Acc@1 81.846 Acc@5 96.090 loss 0.868
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.93%
Epoch: [228]  [   0/1251]  eta: 1:05:13  lr: 0.000619  min_lr: 0.000619  loss: 3.4143 (3.4143)  weight_decay: 0.0500 (0.0500)  time: 3.1284  data: 2.6587  max mem: 21847
Epoch: [228]  [ 200/1251]  eta: 0:05:03  lr: 0.000616  min_lr: 0.000616  loss: 2.8622 (2.7693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9224 (0.9567)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [228]  [ 400/1251]  eta: 0:03:59  lr: 0.000614  min_lr: 0.000614  loss: 2.6740 (2.7591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9686 (1.0204)  time: 0.2744  data: 0.0004  max mem: 21847
Epoch: [228]  [ 600/1251]  eta: 0:03:02  lr: 0.000611  min_lr: 0.000611  loss: 2.4808 (2.7569)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0499 (1.0881)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [228]  [ 800/1251]  eta: 0:02:05  lr: 0.000608  min_lr: 0.000608  loss: 3.1072 (2.7679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9740 (1.0747)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [228]  [1000/1251]  eta: 0:01:09  lr: 0.000606  min_lr: 0.000606  loss: 2.8001 (2.7730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8655 (1.0462)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [228]  [1200/1251]  eta: 0:00:14  lr: 0.000603  min_lr: 0.000603  loss: 2.3193 (2.7769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0829 (1.0535)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 2.7130 (2.7778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9698 (1.0521)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [228] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 2.7130 (2.7769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9698 (1.0521)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5693 (0.5693)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 5.6487  data: 5.4816  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7656 (0.7403)  acc1: 85.2000 (84.8000)  acc5: 97.6000 (97.4182)  time: 0.7433  data: 0.6107  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9083 (0.8796)  acc1: 80.4000 (81.8095)  acc5: 96.0000 (95.9238)  time: 0.2100  data: 0.0817  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9378 (0.8862)  acc1: 78.8000 (81.3760)  acc5: 95.2000 (95.9040)  time: 0.2091  data: 0.0816  max mem: 21847
Test: Total time: 0:00:10 (0.4183 s / it)
* Acc@1 81.982 Acc@5 96.080 loss 0.876
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 81.98%
Epoch: [229]  [   0/1251]  eta: 1:02:23  lr: 0.000603  min_lr: 0.000603  loss: 2.9334 (2.9334)  weight_decay: 0.0500 (0.0500)  time: 2.9926  data: 2.6752  max mem: 21847
Epoch: [229]  [ 200/1251]  eta: 0:05:03  lr: 0.000600  min_lr: 0.000600  loss: 2.3369 (2.7134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (1.0726)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [229]  [ 400/1251]  eta: 0:03:59  lr: 0.000597  min_lr: 0.000597  loss: 3.0810 (2.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9849 (1.0780)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [229]  [ 600/1251]  eta: 0:03:01  lr: 0.000595  min_lr: 0.000595  loss: 3.1183 (2.7711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0323 (1.0723)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [229]  [ 800/1251]  eta: 0:02:05  lr: 0.000592  min_lr: 0.000592  loss: 3.0053 (2.7772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1115 (1.1167)  time: 0.2779  data: 0.0005  max mem: 21847
Epoch: [229]  [1000/1251]  eta: 0:01:09  lr: 0.000590  min_lr: 0.000590  loss: 2.8724 (2.7718)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0305 (1.1001)  time: 0.2715  data: 0.0003  max mem: 21847
Epoch: [229]  [1200/1251]  eta: 0:00:14  lr: 0.000587  min_lr: 0.000587  loss: 2.5360 (2.7711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8781 (1.0835)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 3.1734 (2.7778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9621 (1.0877)  time: 0.2285  data: 0.0005  max mem: 21847
Epoch: [229] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 3.1734 (2.7679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9621 (1.0877)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6720 (0.6720)  acc1: 88.8000 (88.8000)  acc5: 97.2000 (97.2000)  time: 5.8822  data: 5.7141  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8605 (0.8267)  acc1: 85.2000 (84.7273)  acc5: 97.2000 (97.2727)  time: 0.7593  data: 0.6252  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9851 (0.9589)  acc1: 80.0000 (81.7714)  acc5: 95.6000 (95.7905)  time: 0.1943  data: 0.0641  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0028 (0.9655)  acc1: 80.0000 (81.4880)  acc5: 95.2000 (95.7120)  time: 0.2139  data: 0.0845  max mem: 21847
Test: Total time: 0:00:10 (0.4307 s / it)
* Acc@1 81.938 Acc@5 95.902 loss 0.956
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.98%
Epoch: [230]  [   0/1251]  eta: 1:06:54  lr: 0.000587  min_lr: 0.000587  loss: 2.2493 (2.2493)  weight_decay: 0.0500 (0.0500)  time: 3.2093  data: 2.0111  max mem: 21847
Epoch: [230]  [ 200/1251]  eta: 0:05:05  lr: 0.000584  min_lr: 0.000584  loss: 2.7767 (2.7758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9381 (1.0307)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [230]  [ 400/1251]  eta: 0:04:00  lr: 0.000582  min_lr: 0.000582  loss: 2.9893 (2.7790)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0767 (1.0650)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [230]  [ 600/1251]  eta: 0:03:02  lr: 0.000579  min_lr: 0.000579  loss: 2.9104 (2.7632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9512 (1.0905)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [230]  [ 800/1251]  eta: 0:02:05  lr: 0.000577  min_lr: 0.000577  loss: 2.4488 (2.7723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (1.0689)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [230]  [1000/1251]  eta: 0:01:09  lr: 0.000574  min_lr: 0.000574  loss: 2.9896 (2.7736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0458 (1.0625)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [230]  [1200/1251]  eta: 0:00:14  lr: 0.000571  min_lr: 0.000571  loss: 2.8417 (2.7666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1423 (1.0787)  time: 0.2739  data: 0.0005  max mem: 21847
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 2.6670 (2.7619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1461 (1.0809)  time: 0.2277  data: 0.0005  max mem: 21847
Epoch: [230] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 2.6670 (2.7704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1461 (1.0809)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5359 (0.5359)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.3155  data: 5.1711  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7479 (0.7250)  acc1: 85.2000 (84.5091)  acc5: 97.6000 (97.2000)  time: 0.7181  data: 0.5747  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9072 (0.8539)  acc1: 78.8000 (81.6191)  acc5: 95.6000 (95.8286)  time: 0.2152  data: 0.0799  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9153 (0.8577)  acc1: 79.2000 (81.4720)  acc5: 95.6000 (95.8400)  time: 0.2130  data: 0.0820  max mem: 21847
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 81.830 Acc@5 95.940 loss 0.845
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.98%
Epoch: [231]  [   0/1251]  eta: 1:10:46  lr: 0.000571  min_lr: 0.000571  loss: 3.4283 (3.4283)  weight_decay: 0.0500 (0.0500)  time: 3.3947  data: 2.9961  max mem: 21847
Epoch: [231]  [ 200/1251]  eta: 0:05:04  lr: 0.000568  min_lr: 0.000568  loss: 2.0725 (2.8126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0303 (1.1076)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [231]  [ 400/1251]  eta: 0:04:00  lr: 0.000566  min_lr: 0.000566  loss: 2.8813 (2.7939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9817 (1.0818)  time: 0.2706  data: 0.0003  max mem: 21847
Epoch: [231]  [ 600/1251]  eta: 0:03:01  lr: 0.000563  min_lr: 0.000563  loss: 2.8693 (2.7812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9738 (1.0743)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [231]  [ 800/1251]  eta: 0:02:05  lr: 0.000561  min_lr: 0.000561  loss: 2.5190 (2.7578)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0200 (1.0722)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [231]  [1000/1251]  eta: 0:01:09  lr: 0.000558  min_lr: 0.000558  loss: 2.4551 (2.7544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9478 (1.0790)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [231]  [1200/1251]  eta: 0:00:14  lr: 0.000556  min_lr: 0.000556  loss: 2.5881 (2.7472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0222 (1.0714)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 3.1806 (2.7519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0222 (1.0713)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [231] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 3.1806 (2.7556)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0222 (1.0713)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6206 (0.6206)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.5176  data: 5.3670  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8455 (0.8077)  acc1: 86.4000 (84.9091)  acc5: 97.6000 (97.4909)  time: 0.7333  data: 0.6003  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9483 (0.9343)  acc1: 80.0000 (81.9429)  acc5: 95.6000 (95.9048)  time: 0.2054  data: 0.0762  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0220 (0.9457)  acc1: 78.8000 (81.4240)  acc5: 95.2000 (95.9200)  time: 0.2043  data: 0.0761  max mem: 21847
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 81.808 Acc@5 96.018 loss 0.937
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.98%
Epoch: [232]  [   0/1251]  eta: 1:03:47  lr: 0.000555  min_lr: 0.000555  loss: 2.5149 (2.5149)  weight_decay: 0.0500 (0.0500)  time: 3.0595  data: 2.5624  max mem: 21847
Epoch: [232]  [ 200/1251]  eta: 0:05:07  lr: 0.000553  min_lr: 0.000553  loss: 3.1115 (2.7952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1177 (1.1679)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [232]  [ 400/1251]  eta: 0:04:00  lr: 0.000550  min_lr: 0.000550  loss: 2.8113 (2.7546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1291 (1.1537)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [232]  [ 600/1251]  eta: 0:03:02  lr: 0.000548  min_lr: 0.000548  loss: 2.9883 (2.7507)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1119 (1.1725)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [232]  [ 800/1251]  eta: 0:02:05  lr: 0.000545  min_lr: 0.000545  loss: 2.6540 (2.7453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9870 (1.1571)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [232]  [1000/1251]  eta: 0:01:09  lr: 0.000543  min_lr: 0.000543  loss: 2.7087 (2.7270)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.1359)  time: 0.2932  data: 0.0004  max mem: 21847
Epoch: [232]  [1200/1251]  eta: 0:00:14  lr: 0.000540  min_lr: 0.000540  loss: 2.8516 (2.7413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0394 (1.1323)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.6246 (2.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9484 (1.1245)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [232] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.6246 (2.7704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9484 (1.1245)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5497 (0.5497)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8737  data: 5.7218  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7260 (0.7161)  acc1: 86.0000 (84.8364)  acc5: 97.2000 (97.2727)  time: 0.7576  data: 0.6233  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9112 (0.8501)  acc1: 78.8000 (81.6000)  acc5: 95.6000 (95.8095)  time: 0.2008  data: 0.0708  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9431 (0.8633)  acc1: 78.8000 (81.2480)  acc5: 95.2000 (95.7280)  time: 0.1990  data: 0.0707  max mem: 21847
Test: Total time: 0:00:10 (0.4193 s / it)
* Acc@1 81.828 Acc@5 95.918 loss 0.848
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.98%
Epoch: [233]  [   0/1251]  eta: 1:11:54  lr: 0.000540  min_lr: 0.000540  loss: 2.1343 (2.1343)  weight_decay: 0.0500 (0.0500)  time: 3.4492  data: 1.7401  max mem: 21847
Epoch: [233]  [ 200/1251]  eta: 0:05:04  lr: 0.000537  min_lr: 0.000537  loss: 2.6007 (2.7881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9791 (0.9898)  time: 0.2741  data: 0.0006  max mem: 21847
Epoch: [233]  [ 400/1251]  eta: 0:04:00  lr: 0.000535  min_lr: 0.000535  loss: 3.0101 (2.7480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2564 (1.0924)  time: 0.2723  data: 0.0006  max mem: 21847
Epoch: [233]  [ 600/1251]  eta: 0:03:02  lr: 0.000533  min_lr: 0.000533  loss: 3.2175 (2.7589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0009 (1.1006)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [233]  [ 800/1251]  eta: 0:02:05  lr: 0.000530  min_lr: 0.000530  loss: 2.5799 (2.7716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9831 (1.0994)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [233]  [1000/1251]  eta: 0:01:09  lr: 0.000528  min_lr: 0.000528  loss: 2.6101 (2.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2638 (nan)  time: 0.2851  data: 0.0004  max mem: 21847
Epoch: [233]  [1200/1251]  eta: 0:00:14  lr: 0.000525  min_lr: 0.000525  loss: 2.7738 (2.7596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (nan)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 2.7854 (2.7569)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1171 (nan)  time: 0.2376  data: 0.0007  max mem: 21847
Epoch: [233] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 2.7854 (2.7571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1171 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5647 (0.5647)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.4637  data: 5.3055  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7622 (0.7253)  acc1: 84.8000 (84.5455)  acc5: 97.2000 (97.3091)  time: 0.7285  data: 0.5943  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8890 (0.8610)  acc1: 80.0000 (81.6000)  acc5: 96.0000 (95.8095)  time: 0.2067  data: 0.0768  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.8936 (0.8669)  acc1: 80.0000 (81.3600)  acc5: 95.6000 (95.7760)  time: 0.2045  data: 0.0755  max mem: 21847
Test: Total time: 0:00:10 (0.4079 s / it)
* Acc@1 81.836 Acc@5 95.972 loss 0.856
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.98%
Epoch: [234]  [   0/1251]  eta: 1:10:03  lr: 0.000525  min_lr: 0.000525  loss: 2.9613 (2.9613)  weight_decay: 0.0500 (0.0500)  time: 3.3602  data: 2.7870  max mem: 21847
Epoch: [234]  [ 200/1251]  eta: 0:05:07  lr: 0.000522  min_lr: 0.000522  loss: 2.7271 (2.7342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1163 (1.1170)  time: 0.2827  data: 0.0004  max mem: 21847
Epoch: [234]  [ 400/1251]  eta: 0:04:00  lr: 0.000520  min_lr: 0.000520  loss: 2.8542 (2.7245)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0110 (1.0680)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [234]  [ 600/1251]  eta: 0:03:02  lr: 0.000517  min_lr: 0.000517  loss: 3.2401 (2.7426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8852 (1.0282)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [234]  [ 800/1251]  eta: 0:02:05  lr: 0.000515  min_lr: 0.000515  loss: 2.8054 (2.7173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9948 (1.0476)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [234]  [1000/1251]  eta: 0:01:09  lr: 0.000513  min_lr: 0.000513  loss: 2.9402 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0293 (1.0525)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [234]  [1200/1251]  eta: 0:00:14  lr: 0.000510  min_lr: 0.000510  loss: 2.9653 (2.7401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0224 (1.0597)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.7966 (2.7406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0891 (1.0625)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [234] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.7966 (2.7456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0891 (1.0625)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6184 (0.6184)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5755  data: 5.4069  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8089 (0.8017)  acc1: 86.8000 (85.5636)  acc5: 97.6000 (97.5636)  time: 0.7328  data: 0.5980  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9944 (0.9448)  acc1: 80.8000 (82.0571)  acc5: 96.4000 (95.8857)  time: 0.2106  data: 0.0813  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0362 (0.9550)  acc1: 78.8000 (81.6800)  acc5: 95.6000 (95.9520)  time: 0.2102  data: 0.0812  max mem: 21847
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 81.752 Acc@5 95.918 loss 0.953
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.98%
Epoch: [235]  [   0/1251]  eta: 1:09:17  lr: 0.000510  min_lr: 0.000510  loss: 1.9105 (1.9105)  weight_decay: 0.0500 (0.0500)  time: 3.3237  data: 2.9215  max mem: 21847
Epoch: [235]  [ 200/1251]  eta: 0:05:05  lr: 0.000507  min_lr: 0.000507  loss: 2.5042 (2.7508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0556 (1.1164)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [235]  [ 400/1251]  eta: 0:04:01  lr: 0.000505  min_lr: 0.000505  loss: 3.0323 (2.7575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0147 (1.0564)  time: 0.2744  data: 0.0004  max mem: 21847
Epoch: [235]  [ 600/1251]  eta: 0:03:02  lr: 0.000502  min_lr: 0.000502  loss: 2.4101 (2.7337)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1712 (1.0711)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [235]  [ 800/1251]  eta: 0:02:05  lr: 0.000500  min_lr: 0.000500  loss: 2.9903 (2.7377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (1.0620)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [235]  [1000/1251]  eta: 0:01:09  lr: 0.000498  min_lr: 0.000498  loss: 2.9414 (2.7322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1041 (1.0679)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [235]  [1200/1251]  eta: 0:00:14  lr: 0.000495  min_lr: 0.000495  loss: 2.7277 (2.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2500 (1.0940)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 2.6083 (2.7380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2529 (1.0969)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [235] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 2.6083 (2.7474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2529 (1.0969)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5346 (0.5346)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.5102  data: 5.3529  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7148 (0.7281)  acc1: 86.0000 (85.2727)  acc5: 98.0000 (97.6727)  time: 0.7042  data: 0.5701  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9026 (0.8682)  acc1: 79.6000 (81.9619)  acc5: 96.4000 (96.1714)  time: 0.1912  data: 0.0615  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9466 (0.8786)  acc1: 80.0000 (81.8240)  acc5: 96.0000 (96.1600)  time: 0.1927  data: 0.0643  max mem: 21847
Test: Total time: 0:00:10 (0.4000 s / it)
* Acc@1 81.858 Acc@5 96.038 loss 0.875
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.98%
Epoch: [236]  [   0/1251]  eta: 1:08:37  lr: 0.000495  min_lr: 0.000495  loss: 1.9530 (1.9530)  weight_decay: 0.0500 (0.0500)  time: 3.2916  data: 2.2008  max mem: 21847
Epoch: [236]  [ 200/1251]  eta: 0:05:04  lr: 0.000492  min_lr: 0.000492  loss: 3.1439 (2.7450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0495 (1.1205)  time: 0.2730  data: 0.0005  max mem: 21847
Epoch: [236]  [ 400/1251]  eta: 0:04:00  lr: 0.000490  min_lr: 0.000490  loss: 2.3456 (2.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1846 (1.1788)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [236]  [ 600/1251]  eta: 0:03:02  lr: 0.000488  min_lr: 0.000488  loss: 2.3435 (2.7107)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2301 (1.1696)  time: 0.2800  data: 0.0004  max mem: 21847
Epoch: [236]  [ 800/1251]  eta: 0:02:05  lr: 0.000485  min_lr: 0.000485  loss: 2.6920 (2.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9260 (1.1444)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [236]  [1000/1251]  eta: 0:01:09  lr: 0.000483  min_lr: 0.000483  loss: 2.5359 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0394 (1.1284)  time: 0.2796  data: 0.0004  max mem: 21847
Epoch: [236]  [1200/1251]  eta: 0:00:14  lr: 0.000481  min_lr: 0.000481  loss: 2.5546 (2.7109)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 3.1437 (2.7112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (nan)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [236] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 3.1437 (2.7366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5770 (0.5770)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.5328  data: 5.3678  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7188 (0.7368)  acc1: 86.4000 (85.2727)  acc5: 97.2000 (97.4909)  time: 0.7655  data: 0.6303  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9134 (0.8733)  acc1: 80.0000 (81.6381)  acc5: 96.4000 (96.2095)  time: 0.2121  data: 0.0822  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9556 (0.8795)  acc1: 80.0000 (81.4880)  acc5: 95.6000 (96.2560)  time: 0.2112  data: 0.0822  max mem: 21847
Test: Total time: 0:00:10 (0.4150 s / it)
* Acc@1 81.870 Acc@5 96.020 loss 0.876
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.98%
Epoch: [237]  [   0/1251]  eta: 1:10:51  lr: 0.000480  min_lr: 0.000480  loss: 1.9777 (1.9777)  weight_decay: 0.0500 (0.0500)  time: 3.3989  data: 3.0158  max mem: 21847
Epoch: [237]  [ 200/1251]  eta: 0:05:06  lr: 0.000478  min_lr: 0.000478  loss: 2.8042 (2.7421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9802 (1.0418)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [237]  [ 400/1251]  eta: 0:04:00  lr: 0.000475  min_lr: 0.000475  loss: 2.6104 (2.6991)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2478 (1.1312)  time: 0.2834  data: 0.0004  max mem: 21847
Epoch: [237]  [ 600/1251]  eta: 0:03:02  lr: 0.000473  min_lr: 0.000473  loss: 3.1702 (2.6983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0474 (1.1090)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [237]  [ 800/1251]  eta: 0:02:05  lr: 0.000471  min_lr: 0.000471  loss: 2.2401 (2.7001)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0329 (1.1099)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [237]  [1000/1251]  eta: 0:01:09  lr: 0.000468  min_lr: 0.000468  loss: 2.5696 (2.7098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9766 (1.0902)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [237]  [1200/1251]  eta: 0:00:14  lr: 0.000466  min_lr: 0.000466  loss: 2.6484 (2.7075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3391 (1.1272)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.6375 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0597 (1.1247)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [237] Total time: 0:05:47 (0.2777 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.6375 (2.7401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0597 (1.1247)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5974 (0.5974)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8761  data: 5.7252  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7520 (0.7433)  acc1: 87.2000 (85.5273)  acc5: 98.0000 (97.7091)  time: 0.7130  data: 0.5793  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9040 (0.8812)  acc1: 79.6000 (82.1143)  acc5: 95.6000 (96.1524)  time: 0.1839  data: 0.0536  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9804 (0.8905)  acc1: 79.6000 (81.7920)  acc5: 95.6000 (96.1440)  time: 0.1928  data: 0.0639  max mem: 21847
Test: Total time: 0:00:10 (0.4142 s / it)
* Acc@1 81.992 Acc@5 95.940 loss 0.886
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 81.99%
Epoch: [238]  [   0/1251]  eta: 1:02:26  lr: 0.000466  min_lr: 0.000466  loss: 2.1998 (2.1998)  weight_decay: 0.0500 (0.0500)  time: 2.9950  data: 2.5798  max mem: 21847
Epoch: [238]  [ 200/1251]  eta: 0:05:02  lr: 0.000463  min_lr: 0.000463  loss: 3.0552 (2.6714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1765 (1.1020)  time: 0.2894  data: 0.0003  max mem: 21847
Epoch: [238]  [ 400/1251]  eta: 0:03:59  lr: 0.000461  min_lr: 0.000461  loss: 2.8737 (2.6980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9530 (1.0783)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [238]  [ 600/1251]  eta: 0:03:01  lr: 0.000459  min_lr: 0.000459  loss: 2.9460 (2.7141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0930 (1.0978)  time: 0.2706  data: 0.0003  max mem: 21847
Epoch: [238]  [ 800/1251]  eta: 0:02:05  lr: 0.000456  min_lr: 0.000456  loss: 2.6624 (2.7303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0457 (1.0987)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [238]  [1000/1251]  eta: 0:01:09  lr: 0.000454  min_lr: 0.000454  loss: 3.2338 (2.7321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0230 (1.1143)  time: 0.2780  data: 0.0004  max mem: 21847
Epoch: [238]  [1200/1251]  eta: 0:00:14  lr: 0.000452  min_lr: 0.000452  loss: 2.8589 (2.7208)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0242 (1.1025)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.5413 (2.7222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0217 (1.0998)  time: 0.2277  data: 0.0005  max mem: 21847
Epoch: [238] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.5413 (2.7387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0217 (1.0998)
Test:  [ 0/25]  eta: 0:01:57  loss: 0.5511 (0.5511)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 4.6978  data: 4.5354  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7525 (0.7316)  acc1: 86.8000 (85.4545)  acc5: 97.6000 (97.5273)  time: 0.6487  data: 0.5053  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8742 (0.8647)  acc1: 80.0000 (82.5333)  acc5: 96.0000 (95.8286)  time: 0.2218  data: 0.0871  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9605 (0.8745)  acc1: 80.0000 (82.3040)  acc5: 94.8000 (95.7920)  time: 0.2124  data: 0.0812  max mem: 21847
Test: Total time: 0:00:09 (0.3904 s / it)
* Acc@1 82.218 Acc@5 96.112 loss 0.871
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.22%
Epoch: [239]  [   0/1251]  eta: 0:51:35  lr: 0.000451  min_lr: 0.000451  loss: 3.1679 (3.1679)  weight_decay: 0.0500 (0.0500)  time: 2.4743  data: 2.1059  max mem: 21847
Epoch: [239]  [ 200/1251]  eta: 0:05:04  lr: 0.000449  min_lr: 0.000449  loss: 3.0483 (2.7280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0463 (1.0768)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [239]  [ 400/1251]  eta: 0:04:00  lr: 0.000447  min_lr: 0.000447  loss: 3.1166 (2.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0786 (1.1385)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [239]  [ 600/1251]  eta: 0:03:02  lr: 0.000445  min_lr: 0.000445  loss: 2.7684 (2.7266)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0336 (1.1255)  time: 0.2821  data: 0.0004  max mem: 21847
Epoch: [239]  [ 800/1251]  eta: 0:02:05  lr: 0.000442  min_lr: 0.000442  loss: 2.0931 (2.7172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0920 (1.1271)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [239]  [1000/1251]  eta: 0:01:09  lr: 0.000440  min_lr: 0.000440  loss: 2.8093 (2.7179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9986 (1.1021)  time: 0.2801  data: 0.0004  max mem: 21847
Epoch: [239]  [1200/1251]  eta: 0:00:14  lr: 0.000438  min_lr: 0.000438  loss: 2.7147 (2.7186)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0520 (1.1247)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.6110 (2.7205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9903 (1.1202)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [239] Total time: 0:05:47 (0.2778 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.6110 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9903 (1.1202)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5764 (0.5764)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.5865  data: 5.4359  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7574 (0.7481)  acc1: 86.0000 (85.0909)  acc5: 98.0000 (97.5273)  time: 0.7657  data: 0.6319  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9287 (0.8821)  acc1: 79.6000 (82.0000)  acc5: 96.4000 (95.9810)  time: 0.2179  data: 0.0881  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9604 (0.8891)  acc1: 78.8000 (81.5840)  acc5: 96.0000 (96.0640)  time: 0.2171  data: 0.0881  max mem: 21847
Test: Total time: 0:00:10 (0.4213 s / it)
* Acc@1 82.170 Acc@5 96.084 loss 0.882
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.22%
Epoch: [240]  [   0/1251]  eta: 1:02:44  lr: 0.000437  min_lr: 0.000437  loss: 2.1172 (2.1172)  weight_decay: 0.0500 (0.0500)  time: 3.0089  data: 2.5486  max mem: 21847
Epoch: [240]  [ 200/1251]  eta: 0:05:04  lr: 0.000435  min_lr: 0.000435  loss: 2.4981 (2.7169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9385 (1.0716)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [240]  [ 400/1251]  eta: 0:03:59  lr: 0.000433  min_lr: 0.000433  loss: 2.4938 (2.7273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0184 (1.0896)  time: 0.2724  data: 0.0003  max mem: 21847
Epoch: [240]  [ 600/1251]  eta: 0:03:01  lr: 0.000431  min_lr: 0.000431  loss: 3.0597 (2.7221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2827 (1.1213)  time: 0.2720  data: 0.0003  max mem: 21847
Epoch: [240]  [ 800/1251]  eta: 0:02:05  lr: 0.000428  min_lr: 0.000428  loss: 2.5314 (2.7041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0371 (1.1096)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [240]  [1000/1251]  eta: 0:01:09  lr: 0.000426  min_lr: 0.000426  loss: 3.0229 (2.7051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0025 (1.1006)  time: 0.2723  data: 0.0003  max mem: 21847
Epoch: [240]  [1200/1251]  eta: 0:00:14  lr: 0.000424  min_lr: 0.000424  loss: 2.7762 (2.7145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0017 (1.0980)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.7436 (2.7158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0104 (1.0966)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [240] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.7436 (2.7274)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0104 (1.0966)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6034 (0.6034)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.7468  data: 5.5758  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8030 (0.7789)  acc1: 86.0000 (85.5273)  acc5: 97.6000 (97.4545)  time: 0.7598  data: 0.6258  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9540 (0.9105)  acc1: 80.4000 (82.2095)  acc5: 95.6000 (96.0381)  time: 0.2113  data: 0.0823  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0072 (0.9195)  acc1: 78.8000 (81.6960)  acc5: 95.2000 (96.0160)  time: 0.2110  data: 0.0822  max mem: 21847
Test: Total time: 0:00:10 (0.4237 s / it)
* Acc@1 82.174 Acc@5 96.070 loss 0.913
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.22%
Epoch: [241]  [   0/1251]  eta: 1:06:38  lr: 0.000423  min_lr: 0.000423  loss: 2.8167 (2.8167)  weight_decay: 0.0500 (0.0500)  time: 3.1961  data: 2.3748  max mem: 21847
Epoch: [241]  [ 200/1251]  eta: 0:05:07  lr: 0.000421  min_lr: 0.000421  loss: 2.6939 (2.6871)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1067 (1.0737)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [241]  [ 400/1251]  eta: 0:04:00  lr: 0.000419  min_lr: 0.000419  loss: 2.3619 (2.6997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0136 (1.0921)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [241]  [ 600/1251]  eta: 0:03:02  lr: 0.000417  min_lr: 0.000417  loss: 3.0165 (2.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0580 (1.1085)  time: 0.2813  data: 0.0004  max mem: 21847
Epoch: [241]  [ 800/1251]  eta: 0:02:05  lr: 0.000415  min_lr: 0.000415  loss: 3.0451 (2.7098)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0982 (1.1053)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [241]  [1000/1251]  eta: 0:01:09  lr: 0.000412  min_lr: 0.000412  loss: 2.2766 (2.7166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0286 (1.1097)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [241]  [1200/1251]  eta: 0:00:14  lr: 0.000410  min_lr: 0.000410  loss: 2.8814 (2.7193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1830 (1.1229)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 2.7318 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1830 (1.1255)  time: 0.2318  data: 0.0006  max mem: 21847
Epoch: [241] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 2.7318 (2.7271)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1830 (1.1255)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5914 (0.5914)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.6032  data: 5.4297  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7638 (0.7384)  acc1: 86.0000 (84.8727)  acc5: 97.6000 (97.2364)  time: 0.6831  data: 0.5465  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9097 (0.8663)  acc1: 79.2000 (82.1143)  acc5: 95.6000 (96.0762)  time: 0.1806  data: 0.0503  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9440 (0.8723)  acc1: 79.6000 (81.8880)  acc5: 95.6000 (96.0800)  time: 0.1786  data: 0.0503  max mem: 21847
Test: Total time: 0:00:09 (0.3947 s / it)
* Acc@1 82.156 Acc@5 96.082 loss 0.866
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.22%
Epoch: [242]  [   0/1251]  eta: 1:10:05  lr: 0.000410  min_lr: 0.000410  loss: 3.5471 (3.5471)  weight_decay: 0.0500 (0.0500)  time: 3.3613  data: 2.4443  max mem: 21847
Epoch: [242]  [ 200/1251]  eta: 0:05:07  lr: 0.000407  min_lr: 0.000407  loss: 3.0463 (2.6932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0003 (1.1402)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [242]  [ 400/1251]  eta: 0:04:01  lr: 0.000405  min_lr: 0.000405  loss: 2.6776 (2.6922)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0233 (1.1520)  time: 0.2746  data: 0.0005  max mem: 21847
Epoch: [242]  [ 600/1251]  eta: 0:03:02  lr: 0.000403  min_lr: 0.000403  loss: 3.0670 (2.7034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9806 (1.1203)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [242]  [ 800/1251]  eta: 0:02:05  lr: 0.000401  min_lr: 0.000401  loss: 2.8418 (2.7136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1039 (1.1151)  time: 0.2732  data: 0.0004  max mem: 21847
Epoch: [242]  [1000/1251]  eta: 0:01:09  lr: 0.000399  min_lr: 0.000399  loss: 2.4873 (2.6993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0472 (1.1071)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [242]  [1200/1251]  eta: 0:00:14  lr: 0.000397  min_lr: 0.000397  loss: 2.7904 (2.7195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9839 (1.0958)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.7969 (2.7166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1641 (1.1054)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [242] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.7969 (2.7137)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1641 (1.1054)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5861 (0.5861)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.7414  data: 5.5911  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7546 (0.7266)  acc1: 86.0000 (85.4546)  acc5: 97.6000 (97.4182)  time: 0.7641  data: 0.6311  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8821 (0.8560)  acc1: 80.8000 (82.4952)  acc5: 95.6000 (95.9619)  time: 0.2149  data: 0.0849  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9210 (0.8629)  acc1: 80.0000 (82.0640)  acc5: 95.2000 (95.9840)  time: 0.2139  data: 0.0848  max mem: 21847
Test: Total time: 0:00:10 (0.4257 s / it)
* Acc@1 82.288 Acc@5 96.104 loss 0.857
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.29%
Epoch: [243]  [   0/1251]  eta: 1:04:21  lr: 0.000396  min_lr: 0.000396  loss: 2.8958 (2.8958)  weight_decay: 0.0500 (0.0500)  time: 3.0871  data: 2.7568  max mem: 21847
Epoch: [243]  [ 200/1251]  eta: 0:05:03  lr: 0.000394  min_lr: 0.000394  loss: 2.7510 (2.6832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.1852)  time: 0.2749  data: 0.0004  max mem: 21847
Epoch: [243]  [ 400/1251]  eta: 0:04:00  lr: 0.000392  min_lr: 0.000392  loss: 2.8408 (2.6511)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2773 (1.2118)  time: 0.2839  data: 0.0004  max mem: 21847
Epoch: [243]  [ 600/1251]  eta: 0:03:02  lr: 0.000390  min_lr: 0.000390  loss: 2.9994 (2.6769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0304 (1.1731)  time: 0.2734  data: 0.0003  max mem: 21847
Epoch: [243]  [ 800/1251]  eta: 0:02:05  lr: 0.000388  min_lr: 0.000388  loss: 2.7143 (2.6828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0382 (1.1503)  time: 0.2826  data: 0.0004  max mem: 21847
Epoch: [243]  [1000/1251]  eta: 0:01:09  lr: 0.000385  min_lr: 0.000385  loss: 2.7771 (2.6792)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4434 (1.1762)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [243]  [1200/1251]  eta: 0:00:14  lr: 0.000383  min_lr: 0.000383  loss: 2.8783 (2.6912)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0966 (1.1785)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.6316 (2.6900)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0610 (1.1768)  time: 0.2283  data: 0.0006  max mem: 21847
Epoch: [243] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.6316 (2.7008)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0610 (1.1768)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5979 (0.5979)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.7949  data: 5.6473  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7406 (0.7482)  acc1: 85.2000 (84.9091)  acc5: 97.6000 (97.4182)  time: 0.7173  data: 0.5821  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8840 (0.8799)  acc1: 80.0000 (81.9619)  acc5: 96.0000 (95.9619)  time: 0.1789  data: 0.0466  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9720 (0.8892)  acc1: 79.2000 (81.6160)  acc5: 95.2000 (95.9360)  time: 0.1782  data: 0.0465  max mem: 21847
Test: Total time: 0:00:09 (0.3985 s / it)
* Acc@1 82.136 Acc@5 96.072 loss 0.881
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.29%
Epoch: [244]  [   0/1251]  eta: 1:08:15  lr: 0.000383  min_lr: 0.000383  loss: 1.6219 (1.6219)  weight_decay: 0.0500 (0.0500)  time: 3.2735  data: 2.0338  max mem: 21847
Epoch: [244]  [ 200/1251]  eta: 0:05:05  lr: 0.000381  min_lr: 0.000381  loss: 3.0019 (2.7457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0721 (1.1180)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [244]  [ 400/1251]  eta: 0:04:00  lr: 0.000379  min_lr: 0.000379  loss: 2.2417 (2.7262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0242 (1.1372)  time: 0.2713  data: 0.0007  max mem: 21847
Epoch: [244]  [ 600/1251]  eta: 0:03:02  lr: 0.000377  min_lr: 0.000377  loss: 2.8555 (2.7288)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1128 (1.1492)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [244]  [ 800/1251]  eta: 0:02:05  lr: 0.000374  min_lr: 0.000374  loss: 2.5471 (2.7220)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1455 (1.1458)  time: 0.2802  data: 0.0006  max mem: 21847
Epoch: [244]  [1000/1251]  eta: 0:01:09  lr: 0.000372  min_lr: 0.000372  loss: 2.9747 (2.7287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9705 (1.1301)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [244]  [1200/1251]  eta: 0:00:14  lr: 0.000370  min_lr: 0.000370  loss: 2.7552 (2.7232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1470 (1.1496)  time: 0.2845  data: 0.0005  max mem: 21847
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.7186 (2.7203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0690 (1.1466)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [244] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.7186 (2.7070)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0690 (1.1466)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6202 (0.6202)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.7713  data: 5.6155  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7759 (0.7531)  acc1: 86.0000 (85.4182)  acc5: 97.6000 (97.6000)  time: 0.7459  data: 0.6120  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9086 (0.8887)  acc1: 80.4000 (82.1333)  acc5: 96.0000 (96.1524)  time: 0.1990  data: 0.0695  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9949 (0.8984)  acc1: 79.2000 (81.7120)  acc5: 95.6000 (96.1280)  time: 0.1984  data: 0.0694  max mem: 21847
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 82.260 Acc@5 96.108 loss 0.892
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.29%
Epoch: [245]  [   0/1251]  eta: 1:07:02  lr: 0.000370  min_lr: 0.000370  loss: 1.9168 (1.9168)  weight_decay: 0.0500 (0.0500)  time: 3.2158  data: 2.5854  max mem: 21847
Epoch: [245]  [ 200/1251]  eta: 0:05:02  lr: 0.000368  min_lr: 0.000368  loss: 2.9547 (2.7101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0088 (1.0419)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [245]  [ 400/1251]  eta: 0:03:59  lr: 0.000366  min_lr: 0.000366  loss: 2.8738 (2.6740)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0664 (1.1275)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [245]  [ 600/1251]  eta: 0:03:01  lr: 0.000364  min_lr: 0.000364  loss: 2.6309 (2.7024)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1056 (1.1255)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [245]  [ 800/1251]  eta: 0:02:05  lr: 0.000362  min_lr: 0.000362  loss: 2.8667 (2.6808)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1740 (1.1427)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [245]  [1000/1251]  eta: 0:01:09  lr: 0.000359  min_lr: 0.000359  loss: 2.6653 (2.7052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0575 (1.1302)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [245]  [1200/1251]  eta: 0:00:14  lr: 0.000357  min_lr: 0.000357  loss: 2.1530 (2.7002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (1.1236)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 3.0235 (2.7029)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0007 (1.1216)  time: 0.2345  data: 0.0007  max mem: 21847
Epoch: [245] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 3.0235 (2.6988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0007 (1.1216)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5677 (0.5677)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.7426  data: 5.5851  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7720 (0.7408)  acc1: 86.0000 (85.2727)  acc5: 97.2000 (97.4909)  time: 0.7271  data: 0.5954  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8764 (0.8733)  acc1: 80.8000 (82.0191)  acc5: 96.0000 (96.0571)  time: 0.1895  data: 0.0612  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9571 (0.8808)  acc1: 78.8000 (81.7600)  acc5: 95.6000 (96.0800)  time: 0.1894  data: 0.0612  max mem: 21847
Test: Total time: 0:00:10 (0.4052 s / it)
* Acc@1 82.184 Acc@5 96.092 loss 0.877
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.29%
Epoch: [246]  [   0/1251]  eta: 1:08:02  lr: 0.000357  min_lr: 0.000357  loss: 2.6047 (2.6047)  weight_decay: 0.0500 (0.0500)  time: 3.2635  data: 2.8265  max mem: 21847
Epoch: [246]  [ 200/1251]  eta: 0:05:04  lr: 0.000355  min_lr: 0.000355  loss: 2.5715 (2.7498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1542 (1.0779)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [246]  [ 400/1251]  eta: 0:04:00  lr: 0.000353  min_lr: 0.000353  loss: 3.0715 (2.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0443 (1.1325)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [246]  [ 600/1251]  eta: 0:03:02  lr: 0.000351  min_lr: 0.000351  loss: 2.9665 (2.7504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9953 (1.1260)  time: 0.2823  data: 0.0004  max mem: 21847
Epoch: [246]  [ 800/1251]  eta: 0:02:05  lr: 0.000349  min_lr: 0.000349  loss: 2.5463 (2.7200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1920 (1.1504)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [246]  [1000/1251]  eta: 0:01:09  lr: 0.000347  min_lr: 0.000347  loss: 2.5763 (2.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9895 (1.1442)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [246]  [1200/1251]  eta: 0:00:14  lr: 0.000345  min_lr: 0.000345  loss: 2.5195 (2.7126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1103 (1.1371)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 3.0215 (2.7158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0893 (1.1418)  time: 0.2341  data: 0.0007  max mem: 21847
Epoch: [246] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 3.0215 (2.7080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0893 (1.1418)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5994 (0.5994)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 5.4979  data: 5.3287  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7941 (0.7637)  acc1: 86.8000 (85.7091)  acc5: 97.6000 (97.6000)  time: 0.7263  data: 0.5903  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9345 (0.9023)  acc1: 80.4000 (82.6667)  acc5: 96.4000 (96.1143)  time: 0.2040  data: 0.0732  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0075 (0.9107)  acc1: 79.6000 (82.3360)  acc5: 95.2000 (96.0960)  time: 0.2021  data: 0.0730  max mem: 21847
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 82.450 Acc@5 96.166 loss 0.906
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.45%
Epoch: [247]  [   0/1251]  eta: 1:04:35  lr: 0.000344  min_lr: 0.000344  loss: 3.2040 (3.2040)  weight_decay: 0.0500 (0.0500)  time: 3.0983  data: 2.7619  max mem: 21847
Epoch: [247]  [ 200/1251]  eta: 0:05:02  lr: 0.000342  min_lr: 0.000342  loss: 2.5922 (2.6741)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2364 (1.3389)  time: 0.2804  data: 0.0005  max mem: 21847
Epoch: [247]  [ 400/1251]  eta: 0:03:59  lr: 0.000340  min_lr: 0.000340  loss: 2.3971 (2.6722)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0879 (1.2659)  time: 0.2727  data: 0.0003  max mem: 21847
Epoch: [247]  [ 600/1251]  eta: 0:03:01  lr: 0.000338  min_lr: 0.000338  loss: 2.8640 (2.6564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0813 (1.2171)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [247]  [ 800/1251]  eta: 0:02:05  lr: 0.000336  min_lr: 0.000336  loss: 2.6298 (2.6757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2109 (1.2015)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [247]  [1000/1251]  eta: 0:01:09  lr: 0.000334  min_lr: 0.000334  loss: 2.9421 (2.6740)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0115 (1.1766)  time: 0.2768  data: 0.0004  max mem: 21847
Epoch: [247]  [1200/1251]  eta: 0:00:14  lr: 0.000332  min_lr: 0.000332  loss: 2.9479 (2.6753)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1944 (1.1894)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.5039 (2.6736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3653 (1.1952)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [247] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.5039 (2.6922)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3653 (1.1952)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5401 (0.5401)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.7422  data: 5.5904  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7224 (0.7060)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.5636)  time: 0.7201  data: 0.5867  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8652 (0.8416)  acc1: 81.6000 (82.6286)  acc5: 96.0000 (96.0952)  time: 0.1936  data: 0.0614  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9308 (0.8498)  acc1: 79.6000 (82.1440)  acc5: 95.2000 (96.0480)  time: 0.1927  data: 0.0613  max mem: 21847
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 82.370 Acc@5 96.148 loss 0.847
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.45%
Epoch: [248]  [   0/1251]  eta: 1:05:52  lr: 0.000332  min_lr: 0.000332  loss: 3.5433 (3.5433)  weight_decay: 0.0500 (0.0500)  time: 3.1593  data: 1.6339  max mem: 21847
Epoch: [248]  [ 200/1251]  eta: 0:05:03  lr: 0.000330  min_lr: 0.000330  loss: 2.9719 (2.6763)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2147 (1.2091)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [248]  [ 400/1251]  eta: 0:03:59  lr: 0.000328  min_lr: 0.000328  loss: 2.6520 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (1.1804)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [248]  [ 600/1251]  eta: 0:03:01  lr: 0.000326  min_lr: 0.000326  loss: 2.7495 (2.6680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2522 (1.1831)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [248]  [ 800/1251]  eta: 0:02:05  lr: 0.000324  min_lr: 0.000324  loss: 2.5090 (2.6593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (1.1687)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [248]  [1000/1251]  eta: 0:01:09  lr: 0.000322  min_lr: 0.000322  loss: 2.6211 (2.6596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0723 (1.1531)  time: 0.2705  data: 0.0004  max mem: 21847
Epoch: [248]  [1200/1251]  eta: 0:00:14  lr: 0.000320  min_lr: 0.000320  loss: 2.6407 (2.6730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2601 (1.1594)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 3.0473 (2.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0900 (1.1585)  time: 0.2276  data: 0.0006  max mem: 21847
Epoch: [248] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 3.0473 (2.7010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0900 (1.1585)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6257 (0.6257)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8114  data: 5.6388  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7897 (0.7854)  acc1: 85.6000 (85.4545)  acc5: 97.6000 (97.4909)  time: 0.7278  data: 0.5924  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9487 (0.9168)  acc1: 80.0000 (82.4762)  acc5: 95.6000 (96.0952)  time: 0.1937  data: 0.0634  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0148 (0.9238)  acc1: 80.0000 (82.2720)  acc5: 95.2000 (96.0480)  time: 0.1917  data: 0.0633  max mem: 21847
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 82.408 Acc@5 96.172 loss 0.918
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.45%
Epoch: [249]  [   0/1251]  eta: 1:13:10  lr: 0.000320  min_lr: 0.000320  loss: 2.8227 (2.8227)  weight_decay: 0.0500 (0.0500)  time: 3.5096  data: 1.5865  max mem: 21847
Epoch: [249]  [ 200/1251]  eta: 0:05:07  lr: 0.000318  min_lr: 0.000318  loss: 2.9308 (2.6908)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1137 (1.1391)  time: 0.2747  data: 0.0005  max mem: 21847
Epoch: [249]  [ 400/1251]  eta: 0:04:01  lr: 0.000316  min_lr: 0.000316  loss: 2.4372 (2.6861)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1664 (1.1833)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [249]  [ 600/1251]  eta: 0:03:02  lr: 0.000314  min_lr: 0.000314  loss: 2.9382 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1000 (1.1955)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [249]  [ 800/1251]  eta: 0:02:05  lr: 0.000312  min_lr: 0.000312  loss: 2.8587 (2.6918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4139 (1.2243)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [249]  [1000/1251]  eta: 0:01:09  lr: 0.000310  min_lr: 0.000310  loss: 2.8178 (2.6928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1108 (1.2118)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [249]  [1200/1251]  eta: 0:00:14  lr: 0.000308  min_lr: 0.000308  loss: 2.4286 (2.6843)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.2100)  time: 0.2744  data: 0.0004  max mem: 21847
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.8946 (2.6875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.2099)  time: 0.2275  data: 0.0006  max mem: 21847
Epoch: [249] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.8946 (2.6881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.2099)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5744 (0.5744)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.6540  data: 5.4956  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7504 (0.7244)  acc1: 86.8000 (85.7455)  acc5: 97.6000 (97.6727)  time: 0.7142  data: 0.5807  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8979 (0.8617)  acc1: 81.2000 (82.5905)  acc5: 96.4000 (96.0952)  time: 0.1976  data: 0.0680  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9538 (0.8712)  acc1: 79.2000 (82.2560)  acc5: 95.2000 (96.1120)  time: 0.2063  data: 0.0775  max mem: 21847
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 82.410 Acc@5 96.204 loss 0.870
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.45%
Epoch: [250]  [   0/1251]  eta: 1:04:57  lr: 0.000307  min_lr: 0.000307  loss: 2.6791 (2.6791)  weight_decay: 0.0500 (0.0500)  time: 3.1158  data: 2.4099  max mem: 21847
Epoch: [250]  [ 200/1251]  eta: 0:05:05  lr: 0.000306  min_lr: 0.000306  loss: 2.7515 (2.7388)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2748  data: 0.0004  max mem: 21847
Epoch: [250]  [ 400/1251]  eta: 0:04:01  lr: 0.000304  min_lr: 0.000304  loss: 2.8109 (2.7124)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1151 (nan)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [250]  [ 600/1251]  eta: 0:03:02  lr: 0.000302  min_lr: 0.000302  loss: 2.7085 (2.7073)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0845 (nan)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [250]  [ 800/1251]  eta: 0:02:05  lr: 0.000300  min_lr: 0.000300  loss: 2.8115 (2.7097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1523 (nan)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [250]  [1000/1251]  eta: 0:01:09  lr: 0.000298  min_lr: 0.000298  loss: 2.4370 (2.7172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9586 (nan)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [250]  [1200/1251]  eta: 0:00:14  lr: 0.000296  min_lr: 0.000296  loss: 2.8964 (2.7026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0568 (nan)  time: 0.2845  data: 0.0004  max mem: 21847
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.2498 (2.7028)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0822 (nan)  time: 0.2282  data: 0.0006  max mem: 21847
Epoch: [250] Total time: 0:05:47 (0.2780 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.2498 (2.6838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0822 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5717 (0.5717)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.6741  data: 5.5234  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7295 (0.7148)  acc1: 86.4000 (85.4182)  acc5: 97.6000 (97.7455)  time: 0.6821  data: 0.5497  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8999 (0.8510)  acc1: 80.4000 (82.5524)  acc5: 96.4000 (96.0952)  time: 0.1785  data: 0.0495  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9024 (0.8566)  acc1: 80.4000 (82.3040)  acc5: 95.2000 (96.1600)  time: 0.2005  data: 0.0729  max mem: 21847
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 82.532 Acc@5 96.194 loss 0.850
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [251]  [   0/1251]  eta: 1:07:33  lr: 0.000296  min_lr: 0.000296  loss: 2.8440 (2.8440)  weight_decay: 0.0500 (0.0500)  time: 3.2404  data: 2.9467  max mem: 21847
Epoch: [251]  [ 200/1251]  eta: 0:05:01  lr: 0.000294  min_lr: 0.000294  loss: 2.8606 (2.6323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1601 (1.3275)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [251]  [ 400/1251]  eta: 0:03:59  lr: 0.000292  min_lr: 0.000292  loss: 2.9304 (2.6497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0617 (1.2397)  time: 0.2801  data: 0.0004  max mem: 21847
Epoch: [251]  [ 600/1251]  eta: 0:03:01  lr: 0.000290  min_lr: 0.000290  loss: 2.8030 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1139 (1.2267)  time: 0.2721  data: 0.0005  max mem: 21847
Epoch: [251]  [ 800/1251]  eta: 0:02:05  lr: 0.000288  min_lr: 0.000288  loss: 2.6290 (2.6489)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0908 (1.2201)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [251]  [1000/1251]  eta: 0:01:09  lr: 0.000286  min_lr: 0.000286  loss: 2.7019 (2.6483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0621 (1.1920)  time: 0.2714  data: 0.0005  max mem: 21847
Epoch: [251]  [1200/1251]  eta: 0:00:14  lr: 0.000284  min_lr: 0.000284  loss: 3.0733 (2.6588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0564 (1.1829)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 3.0050 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0375 (1.1784)  time: 0.2281  data: 0.0005  max mem: 21847
Epoch: [251] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 3.0050 (2.6825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0375 (1.1784)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6755 (0.6755)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.6061  data: 5.4404  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.8124 (0.8155)  acc1: 86.8000 (85.6000)  acc5: 97.6000 (97.6364)  time: 0.7327  data: 0.5965  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9689 (0.9443)  acc1: 81.2000 (82.5333)  acc5: 96.4000 (96.1905)  time: 0.2058  data: 0.0755  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0211 (0.9511)  acc1: 79.6000 (82.1760)  acc5: 95.6000 (96.1920)  time: 0.2037  data: 0.0754  max mem: 21847
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 82.462 Acc@5 96.160 loss 0.946
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [252]  [   0/1251]  eta: 1:05:27  lr: 0.000284  min_lr: 0.000284  loss: 3.2123 (3.2123)  weight_decay: 0.0500 (0.0500)  time: 3.1396  data: 1.6700  max mem: 21847
Epoch: [252]  [ 200/1251]  eta: 0:05:06  lr: 0.000282  min_lr: 0.000282  loss: 2.3048 (2.6453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0836 (1.2459)  time: 0.2764  data: 0.0005  max mem: 21847
Epoch: [252]  [ 400/1251]  eta: 0:04:01  lr: 0.000280  min_lr: 0.000280  loss: 2.9153 (2.6644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0824 (1.2170)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [252]  [ 600/1251]  eta: 0:03:02  lr: 0.000279  min_lr: 0.000279  loss: 2.9033 (2.6732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0103 (1.2091)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [252]  [ 800/1251]  eta: 0:02:05  lr: 0.000277  min_lr: 0.000277  loss: 2.7293 (2.6833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0852 (1.2121)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [252]  [1000/1251]  eta: 0:01:09  lr: 0.000275  min_lr: 0.000275  loss: 2.7961 (2.6736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9864 (1.2166)  time: 0.2732  data: 0.0005  max mem: 21847
Epoch: [252]  [1200/1251]  eta: 0:00:14  lr: 0.000273  min_lr: 0.000273  loss: 2.6758 (2.6717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0481 (1.2011)  time: 0.2773  data: 0.0003  max mem: 21847
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.7843 (2.6677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0051 (1.1977)  time: 0.2280  data: 0.0006  max mem: 21847
Epoch: [252] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.7843 (2.6740)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0051 (1.1977)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5657 (0.5657)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8236  data: 5.6532  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7288 (0.7201)  acc1: 86.8000 (85.5636)  acc5: 97.2000 (97.4182)  time: 0.7373  data: 0.6022  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8885 (0.8545)  acc1: 80.8000 (82.5333)  acc5: 96.0000 (95.9429)  time: 0.1878  data: 0.0583  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9418 (0.8621)  acc1: 80.0000 (82.2400)  acc5: 94.8000 (95.9840)  time: 0.1857  data: 0.0582  max mem: 21847
Test: Total time: 0:00:10 (0.4067 s / it)
* Acc@1 82.358 Acc@5 96.170 loss 0.857
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.53%
Epoch: [253]  [   0/1251]  eta: 1:07:41  lr: 0.000273  min_lr: 0.000273  loss: 3.2412 (3.2412)  weight_decay: 0.0500 (0.0500)  time: 3.2466  data: 2.8249  max mem: 21847
Epoch: [253]  [ 200/1251]  eta: 0:05:05  lr: 0.000271  min_lr: 0.000271  loss: 2.9161 (2.6787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.1401)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [253]  [ 400/1251]  eta: 0:04:01  lr: 0.000269  min_lr: 0.000269  loss: 3.0429 (2.6860)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1254 (nan)  time: 0.2760  data: 0.0005  max mem: 21847
Epoch: [253]  [ 600/1251]  eta: 0:03:02  lr: 0.000267  min_lr: 0.000267  loss: 2.9307 (2.7177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0841 (nan)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [253]  [ 800/1251]  eta: 0:02:06  lr: 0.000265  min_lr: 0.000265  loss: 2.5355 (2.7223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3213 (nan)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [253]  [1000/1251]  eta: 0:01:09  lr: 0.000264  min_lr: 0.000264  loss: 2.5415 (2.7108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (nan)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [253]  [1200/1251]  eta: 0:00:14  lr: 0.000262  min_lr: 0.000262  loss: 2.7295 (2.7064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0986 (nan)  time: 0.2741  data: 0.0005  max mem: 21847
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.8215 (2.7057)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0471 (nan)  time: 0.2428  data: 0.0006  max mem: 21847
Epoch: [253] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.8215 (2.6838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0471 (nan)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6052 (0.6052)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.9414  data: 5.7861  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7867 (0.7540)  acc1: 85.6000 (85.4182)  acc5: 97.6000 (97.6000)  time: 0.7056  data: 0.5728  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9089 (0.8786)  acc1: 80.4000 (82.4762)  acc5: 96.0000 (96.2286)  time: 0.1868  data: 0.0548  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9320 (0.8866)  acc1: 80.0000 (82.0960)  acc5: 95.6000 (96.2080)  time: 0.1859  data: 0.0547  max mem: 21847
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 82.322 Acc@5 96.192 loss 0.882
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.53%
Epoch: [254]  [   0/1251]  eta: 1:04:55  lr: 0.000261  min_lr: 0.000261  loss: 2.3647 (2.3647)  weight_decay: 0.0500 (0.0500)  time: 3.1141  data: 2.7596  max mem: 21847
Epoch: [254]  [ 200/1251]  eta: 0:05:07  lr: 0.000260  min_lr: 0.000260  loss: 2.8712 (2.6787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0798 (1.3750)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [254]  [ 400/1251]  eta: 0:04:00  lr: 0.000258  min_lr: 0.000258  loss: 2.8236 (2.6537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0040 (1.2592)  time: 0.2730  data: 0.0005  max mem: 21847
Epoch: [254]  [ 600/1251]  eta: 0:03:02  lr: 0.000256  min_lr: 0.000256  loss: 2.1721 (2.6571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2211 (1.2231)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [254]  [ 800/1251]  eta: 0:02:05  lr: 0.000254  min_lr: 0.000254  loss: 2.8135 (2.6583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1619 (1.2006)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [254]  [1000/1251]  eta: 0:01:09  lr: 0.000253  min_lr: 0.000253  loss: 2.8732 (2.6798)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0105 (1.1868)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [254]  [1200/1251]  eta: 0:00:14  lr: 0.000251  min_lr: 0.000251  loss: 2.4527 (2.6787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1795 (1.1731)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.7704 (2.6829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2150 (1.1814)  time: 0.2276  data: 0.0005  max mem: 21847
Epoch: [254] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.7704 (2.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2150 (1.1814)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6093 (0.6093)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6292  data: 5.4700  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7746 (0.7617)  acc1: 86.8000 (85.7091)  acc5: 97.6000 (97.5636)  time: 0.7265  data: 0.5918  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9215 (0.8942)  acc1: 80.4000 (82.7048)  acc5: 96.0000 (96.0571)  time: 0.1969  data: 0.0671  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9949 (0.9038)  acc1: 79.6000 (82.3680)  acc5: 95.2000 (96.0000)  time: 0.1976  data: 0.0686  max mem: 21847
Test: Total time: 0:00:10 (0.4079 s / it)
* Acc@1 82.528 Acc@5 96.152 loss 0.901
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [255]  [   0/1251]  eta: 1:10:52  lr: 0.000250  min_lr: 0.000250  loss: 1.7440 (1.7440)  weight_decay: 0.0500 (0.0500)  time: 3.3994  data: 2.4480  max mem: 21847
Epoch: [255]  [ 200/1251]  eta: 0:05:08  lr: 0.000249  min_lr: 0.000249  loss: 2.8058 (2.6395)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2272 (1.2584)  time: 0.2812  data: 0.0004  max mem: 21847
Epoch: [255]  [ 400/1251]  eta: 0:04:01  lr: 0.000247  min_lr: 0.000247  loss: 3.1163 (2.6453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9744 (1.2168)  time: 0.2732  data: 0.0005  max mem: 21847
Epoch: [255]  [ 600/1251]  eta: 0:03:02  lr: 0.000245  min_lr: 0.000245  loss: 2.5881 (2.6581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2420 (1.2114)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [255]  [ 800/1251]  eta: 0:02:05  lr: 0.000244  min_lr: 0.000244  loss: 2.8258 (2.6539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0311 (1.1999)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [255]  [1000/1251]  eta: 0:01:09  lr: 0.000242  min_lr: 0.000242  loss: 2.7104 (2.6703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1246 (1.1948)  time: 0.2776  data: 0.0004  max mem: 21847
Epoch: [255]  [1200/1251]  eta: 0:00:14  lr: 0.000240  min_lr: 0.000240  loss: 2.9217 (2.6654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2624 (1.1890)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 3.0048 (2.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1877 (1.1880)  time: 0.2286  data: 0.0006  max mem: 21847
Epoch: [255] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 3.0048 (2.6683)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1877 (1.1880)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5856 (0.5856)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.4451  data: 5.2918  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7677 (0.7422)  acc1: 87.2000 (85.8182)  acc5: 97.6000 (97.7455)  time: 0.7262  data: 0.5911  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9257 (0.8792)  acc1: 80.4000 (82.4762)  acc5: 96.0000 (96.0952)  time: 0.2009  data: 0.0705  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9822 (0.8877)  acc1: 80.0000 (82.1920)  acc5: 95.6000 (96.1280)  time: 0.1987  data: 0.0696  max mem: 21847
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 82.494 Acc@5 96.208 loss 0.888
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [256]  [   0/1251]  eta: 1:10:42  lr: 0.000240  min_lr: 0.000240  loss: 3.0955 (3.0955)  weight_decay: 0.0500 (0.0500)  time: 3.3915  data: 2.1068  max mem: 21847
Epoch: [256]  [ 200/1251]  eta: 0:05:03  lr: 0.000238  min_lr: 0.000238  loss: 2.8802 (2.6752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1339 (1.2350)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [256]  [ 400/1251]  eta: 0:03:59  lr: 0.000236  min_lr: 0.000236  loss: 3.0508 (2.6453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.2710)  time: 0.2845  data: 0.0004  max mem: 21847
Epoch: [256]  [ 600/1251]  eta: 0:03:02  lr: 0.000235  min_lr: 0.000235  loss: 2.5052 (2.6734)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1545 (1.2939)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [256]  [ 800/1251]  eta: 0:02:05  lr: 0.000233  min_lr: 0.000233  loss: 2.5394 (2.6699)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1307 (1.2647)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [256]  [1000/1251]  eta: 0:01:09  lr: 0.000231  min_lr: 0.000231  loss: 2.7852 (2.6696)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2014 (1.2739)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [256]  [1200/1251]  eta: 0:00:14  lr: 0.000230  min_lr: 0.000230  loss: 2.8396 (2.6728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2109 (1.2572)  time: 0.2712  data: 0.0005  max mem: 21847
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.9949 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2792 (1.2771)  time: 0.2355  data: 0.0007  max mem: 21847
Epoch: [256] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.9949 (2.6658)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2792 (1.2771)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6303 (0.6303)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.6591  data: 5.5173  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8106 (0.7875)  acc1: 86.0000 (86.1091)  acc5: 97.6000 (97.6000)  time: 0.7452  data: 0.6098  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9596 (0.9190)  acc1: 80.4000 (82.8191)  acc5: 96.4000 (96.1143)  time: 0.2016  data: 0.0663  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9981 (0.9247)  acc1: 80.0000 (82.5120)  acc5: 95.2000 (96.1760)  time: 0.1993  data: 0.0663  max mem: 21847
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 82.520 Acc@5 96.196 loss 0.925
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [257]  [   0/1251]  eta: 1:09:17  lr: 0.000229  min_lr: 0.000229  loss: 3.3008 (3.3008)  weight_decay: 0.0500 (0.0500)  time: 3.3229  data: 2.3585  max mem: 21847
Epoch: [257]  [ 200/1251]  eta: 0:05:04  lr: 0.000228  min_lr: 0.000228  loss: 2.8256 (2.6952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1744 (1.2105)  time: 0.2716  data: 0.0003  max mem: 21847
Epoch: [257]  [ 400/1251]  eta: 0:03:59  lr: 0.000226  min_lr: 0.000226  loss: 2.9374 (2.6489)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2277 (1.2639)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [257]  [ 600/1251]  eta: 0:03:01  lr: 0.000224  min_lr: 0.000224  loss: 2.9109 (2.6446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3640 (1.2505)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [257]  [ 800/1251]  eta: 0:02:05  lr: 0.000223  min_lr: 0.000223  loss: 2.8742 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0361 (1.2226)  time: 0.2744  data: 0.0005  max mem: 21847
Epoch: [257]  [1000/1251]  eta: 0:01:09  lr: 0.000221  min_lr: 0.000221  loss: 2.7867 (2.6666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2235 (1.2140)  time: 0.2753  data: 0.0004  max mem: 21847
Epoch: [257]  [1200/1251]  eta: 0:00:14  lr: 0.000219  min_lr: 0.000219  loss: 2.3752 (2.6643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1667 (1.2057)  time: 0.2753  data: 0.0004  max mem: 21847
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.9784 (2.6652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (1.1985)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [257] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.9784 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (1.1985)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5794 (0.5794)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.7624  data: 5.5886  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7643 (0.7422)  acc1: 86.8000 (86.2182)  acc5: 97.6000 (97.5636)  time: 0.6938  data: 0.5597  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9181 (0.8774)  acc1: 80.8000 (82.7810)  acc5: 96.4000 (96.1333)  time: 0.1799  data: 0.0511  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9566 (0.8835)  acc1: 79.6000 (82.4960)  acc5: 95.6000 (96.2080)  time: 0.1997  data: 0.0717  max mem: 21847
Test: Total time: 0:00:10 (0.4147 s / it)
* Acc@1 82.644 Acc@5 96.176 loss 0.884
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.64%
Epoch: [258]  [   0/1251]  eta: 1:00:39  lr: 0.000219  min_lr: 0.000219  loss: 3.2378 (3.2378)  weight_decay: 0.0500 (0.0500)  time: 2.9090  data: 2.5800  max mem: 21847
Epoch: [258]  [ 200/1251]  eta: 0:05:03  lr: 0.000217  min_lr: 0.000217  loss: 2.7956 (2.7023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4685 (1.3141)  time: 0.2810  data: 0.0004  max mem: 21847
Epoch: [258]  [ 400/1251]  eta: 0:03:59  lr: 0.000216  min_lr: 0.000216  loss: 2.4309 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1487 (1.2902)  time: 0.2740  data: 0.0004  max mem: 21847
Epoch: [258]  [ 600/1251]  eta: 0:03:01  lr: 0.000214  min_lr: 0.000214  loss: 2.6525 (2.6876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0285 (1.2275)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [258]  [ 800/1251]  eta: 0:02:05  lr: 0.000212  min_lr: 0.000212  loss: 2.5791 (2.6748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0901 (1.1936)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [258]  [1000/1251]  eta: 0:01:09  lr: 0.000211  min_lr: 0.000211  loss: 2.6694 (2.6925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1463 (1.2164)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [258]  [1200/1251]  eta: 0:00:14  lr: 0.000209  min_lr: 0.000209  loss: 2.7820 (2.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0773 (1.2020)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.5453 (2.6870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1276 (1.2049)  time: 0.2281  data: 0.0005  max mem: 21847
Epoch: [258] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.5453 (2.6571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1276 (1.2049)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5562 (0.5562)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.5481  data: 5.3949  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7531 (0.7202)  acc1: 87.2000 (86.0364)  acc5: 97.6000 (97.6364)  time: 0.6654  data: 0.5305  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8839 (0.8557)  acc1: 80.8000 (82.7238)  acc5: 96.4000 (96.1524)  time: 0.1836  data: 0.0533  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9444 (0.8642)  acc1: 79.6000 (82.3200)  acc5: 95.6000 (96.1120)  time: 0.1876  data: 0.0588  max mem: 21847
Test: Total time: 0:00:09 (0.3988 s / it)
* Acc@1 82.582 Acc@5 96.238 loss 0.858
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.64%
Epoch: [259]  [   0/1251]  eta: 1:07:15  lr: 0.000209  min_lr: 0.000209  loss: 3.1916 (3.1916)  weight_decay: 0.0500 (0.0500)  time: 3.2260  data: 2.2726  max mem: 21847
Epoch: [259]  [ 200/1251]  eta: 0:05:05  lr: 0.000207  min_lr: 0.000207  loss: 2.7217 (2.6712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2008 (1.2622)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [259]  [ 400/1251]  eta: 0:04:01  lr: 0.000206  min_lr: 0.000206  loss: 2.8904 (2.7093)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1244 (1.2321)  time: 0.2821  data: 0.0004  max mem: 21847
Epoch: [259]  [ 600/1251]  eta: 0:03:02  lr: 0.000204  min_lr: 0.000204  loss: 2.3681 (2.6902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0264 (1.1986)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [259]  [ 800/1251]  eta: 0:02:05  lr: 0.000203  min_lr: 0.000203  loss: 2.8497 (2.6832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0264 (1.1802)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [259]  [1000/1251]  eta: 0:01:09  lr: 0.000201  min_lr: 0.000201  loss: 2.7072 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1574 (1.1847)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [259]  [1200/1251]  eta: 0:00:14  lr: 0.000199  min_lr: 0.000199  loss: 2.8500 (2.6624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0938 (1.1906)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.9787 (2.6647)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0917 (1.1892)  time: 0.2280  data: 0.0007  max mem: 21847
Epoch: [259] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.9787 (2.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0917 (1.1892)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5925 (0.5925)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.4815  data: 5.3362  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7554 (0.7459)  acc1: 87.6000 (85.7818)  acc5: 97.6000 (97.6364)  time: 0.6920  data: 0.5586  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9294 (0.8729)  acc1: 80.4000 (82.6667)  acc5: 96.0000 (96.2476)  time: 0.1879  data: 0.0562  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9561 (0.8811)  acc1: 80.0000 (82.3520)  acc5: 95.6000 (96.1440)  time: 0.1978  data: 0.0677  max mem: 21847
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 82.594 Acc@5 96.208 loss 0.884
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.64%
Epoch: [260]  [   0/1251]  eta: 1:08:54  lr: 0.000199  min_lr: 0.000199  loss: 1.8593 (1.8593)  weight_decay: 0.0500 (0.0500)  time: 3.3052  data: 2.2820  max mem: 21847
Epoch: [260]  [ 200/1251]  eta: 0:05:04  lr: 0.000197  min_lr: 0.000197  loss: 2.9776 (2.6276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (1.1796)  time: 0.2757  data: 0.0004  max mem: 21847
Epoch: [260]  [ 400/1251]  eta: 0:03:59  lr: 0.000196  min_lr: 0.000196  loss: 2.5637 (2.6524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1157 (1.2018)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [260]  [ 600/1251]  eta: 0:03:01  lr: 0.000194  min_lr: 0.000194  loss: 2.7702 (2.6518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2240 (1.2024)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [260]  [ 800/1251]  eta: 0:02:05  lr: 0.000193  min_lr: 0.000193  loss: 2.7342 (2.6509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (1.1704)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [260]  [1000/1251]  eta: 0:01:09  lr: 0.000191  min_lr: 0.000191  loss: 2.8235 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0558 (1.1614)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [260]  [1200/1251]  eta: 0:00:14  lr: 0.000190  min_lr: 0.000190  loss: 2.7711 (2.6604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1542 (1.1629)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.8853 (2.6596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0179 (1.1584)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [260] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.8853 (2.6515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0179 (1.1584)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5950 (0.5950)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6956  data: 5.5274  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7594 (0.7533)  acc1: 86.8000 (85.5273)  acc5: 97.6000 (97.5273)  time: 0.7224  data: 0.5884  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9208 (0.8859)  acc1: 79.2000 (82.3048)  acc5: 96.4000 (96.2095)  time: 0.1883  data: 0.0592  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9705 (0.8941)  acc1: 79.2000 (82.0000)  acc5: 95.6000 (96.1440)  time: 0.1887  data: 0.0604  max mem: 21847
Test: Total time: 0:00:10 (0.4038 s / it)
* Acc@1 82.718 Acc@5 96.176 loss 0.890
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.72%
Epoch: [261]  [   0/1251]  eta: 1:12:07  lr: 0.000189  min_lr: 0.000189  loss: 2.3470 (2.3470)  weight_decay: 0.0500 (0.0500)  time: 3.4590  data: 3.1718  max mem: 21847
Epoch: [261]  [ 200/1251]  eta: 0:05:06  lr: 0.000188  min_lr: 0.000188  loss: 3.0798 (2.6462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1763 (1.1664)  time: 0.2841  data: 0.0003  max mem: 21847
Epoch: [261]  [ 400/1251]  eta: 0:04:00  lr: 0.000186  min_lr: 0.000186  loss: 2.8332 (2.6400)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1611 (1.1857)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [261]  [ 600/1251]  eta: 0:03:02  lr: 0.000185  min_lr: 0.000185  loss: 2.8530 (2.6592)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1932 (1.1859)  time: 0.2742  data: 0.0005  max mem: 21847
Epoch: [261]  [ 800/1251]  eta: 0:02:05  lr: 0.000183  min_lr: 0.000183  loss: 2.7155 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2512 (1.2172)  time: 0.2780  data: 0.0004  max mem: 21847
Epoch: [261]  [1000/1251]  eta: 0:01:09  lr: 0.000182  min_lr: 0.000182  loss: 2.6143 (2.6627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0796 (1.2051)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [261]  [1200/1251]  eta: 0:00:14  lr: 0.000180  min_lr: 0.000180  loss: 2.7268 (2.6628)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0601 (1.1959)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.5369 (2.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0755 (1.1964)  time: 0.2281  data: 0.0006  max mem: 21847
Epoch: [261] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.5369 (2.6553)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0755 (1.1964)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5272 (0.5272)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.6610  data: 5.5150  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7185 (0.6955)  acc1: 86.8000 (85.7818)  acc5: 97.6000 (97.4182)  time: 0.7144  data: 0.5804  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8745 (0.8256)  acc1: 80.8000 (82.8000)  acc5: 95.6000 (96.0381)  time: 0.1934  data: 0.0633  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9217 (0.8353)  acc1: 80.0000 (82.4000)  acc5: 95.6000 (96.0320)  time: 0.1915  data: 0.0632  max mem: 21847
Test: Total time: 0:00:10 (0.4049 s / it)
* Acc@1 82.698 Acc@5 96.244 loss 0.831
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.72%
Epoch: [262]  [   0/1251]  eta: 1:05:34  lr: 0.000180  min_lr: 0.000180  loss: 3.0038 (3.0038)  weight_decay: 0.0500 (0.0500)  time: 3.1453  data: 1.7012  max mem: 21847
Epoch: [262]  [ 200/1251]  eta: 0:05:04  lr: 0.000179  min_lr: 0.000179  loss: 2.4111 (2.5526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1715 (1.2117)  time: 0.2712  data: 0.0003  max mem: 21847
Epoch: [262]  [ 400/1251]  eta: 0:04:00  lr: 0.000177  min_lr: 0.000177  loss: 2.9038 (2.6227)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [262]  [ 600/1251]  eta: 0:03:02  lr: 0.000176  min_lr: 0.000176  loss: 3.0371 (2.6209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1339 (nan)  time: 0.2908  data: 0.0004  max mem: 21847
Epoch: [262]  [ 800/1251]  eta: 0:02:05  lr: 0.000174  min_lr: 0.000174  loss: 2.8912 (2.6374)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2856 (nan)  time: 0.2801  data: 0.0004  max mem: 21847
Epoch: [262]  [1000/1251]  eta: 0:01:09  lr: 0.000173  min_lr: 0.000173  loss: 2.6108 (2.6432)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0846 (nan)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [262]  [1200/1251]  eta: 0:00:14  lr: 0.000171  min_lr: 0.000171  loss: 2.0678 (2.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0850 (nan)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.5664 (2.6498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3271 (nan)  time: 0.2296  data: 0.0006  max mem: 21847
Epoch: [262] Total time: 0:05:45 (0.2763 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.5664 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3271 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5631 (0.5631)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.6310  data: 5.4841  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7259 (0.7096)  acc1: 87.2000 (86.0364)  acc5: 97.2000 (97.5273)  time: 0.7378  data: 0.6031  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9063 (0.8376)  acc1: 80.0000 (82.6095)  acc5: 96.4000 (96.0952)  time: 0.1987  data: 0.0664  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9243 (0.8463)  acc1: 79.6000 (82.2720)  acc5: 95.6000 (96.0800)  time: 0.1977  data: 0.0663  max mem: 21847
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 82.642 Acc@5 96.180 loss 0.846
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.72%
Epoch: [263]  [   0/1251]  eta: 1:02:27  lr: 0.000171  min_lr: 0.000171  loss: 3.3124 (3.3124)  weight_decay: 0.0500 (0.0500)  time: 2.9960  data: 1.6749  max mem: 21847
Epoch: [263]  [ 200/1251]  eta: 0:05:03  lr: 0.000169  min_lr: 0.000169  loss: 3.1176 (2.6953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2087 (1.3498)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [263]  [ 400/1251]  eta: 0:04:00  lr: 0.000168  min_lr: 0.000168  loss: 2.3954 (2.7058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3446 (1.3174)  time: 0.2732  data: 0.0003  max mem: 21847
Epoch: [263]  [ 600/1251]  eta: 0:03:02  lr: 0.000167  min_lr: 0.000167  loss: 2.4613 (2.6611)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3089 (1.3195)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [263]  [ 800/1251]  eta: 0:02:05  lr: 0.000165  min_lr: 0.000165  loss: 2.1268 (2.6545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1005 (1.2863)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [263]  [1000/1251]  eta: 0:01:09  lr: 0.000164  min_lr: 0.000164  loss: 2.8508 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0462 (1.2759)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [263]  [1200/1251]  eta: 0:00:14  lr: 0.000162  min_lr: 0.000162  loss: 2.6446 (2.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4513 (1.2820)  time: 0.2711  data: 0.0004  max mem: 21847
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.9952 (2.6455)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3022 (1.2806)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [263] Total time: 0:05:46 (0.2773 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.9952 (2.6520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3022 (1.2806)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5858 (0.5858)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.2306  data: 5.0756  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7694 (0.7463)  acc1: 86.0000 (85.3818)  acc5: 97.2000 (97.4182)  time: 0.7080  data: 0.5727  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9230 (0.8701)  acc1: 80.4000 (82.4381)  acc5: 95.6000 (96.0191)  time: 0.2157  data: 0.0853  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9567 (0.8808)  acc1: 79.6000 (82.0800)  acc5: 95.6000 (95.9680)  time: 0.2070  data: 0.0779  max mem: 21847
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 82.718 Acc@5 96.254 loss 0.876
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.72%
Epoch: [264]  [   0/1251]  eta: 1:06:32  lr: 0.000162  min_lr: 0.000162  loss: 2.4819 (2.4819)  weight_decay: 0.0500 (0.0500)  time: 3.1913  data: 2.9026  max mem: 21847
Epoch: [264]  [ 200/1251]  eta: 0:05:03  lr: 0.000160  min_lr: 0.000160  loss: 2.4063 (2.5713)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3145 (1.3494)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [264]  [ 400/1251]  eta: 0:03:59  lr: 0.000159  min_lr: 0.000159  loss: 2.4083 (2.6050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0710 (1.2700)  time: 0.2818  data: 0.0004  max mem: 21847
Epoch: [264]  [ 600/1251]  eta: 0:03:01  lr: 0.000158  min_lr: 0.000158  loss: 2.7579 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2418 (1.2529)  time: 0.2729  data: 0.0005  max mem: 21847
Epoch: [264]  [ 800/1251]  eta: 0:02:05  lr: 0.000156  min_lr: 0.000156  loss: 2.5998 (2.6151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2331 (1.2374)  time: 0.2855  data: 0.0005  max mem: 21847
Epoch: [264]  [1000/1251]  eta: 0:01:09  lr: 0.000155  min_lr: 0.000155  loss: 1.9633 (2.6192)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0877 (1.2360)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [264]  [1200/1251]  eta: 0:00:14  lr: 0.000154  min_lr: 0.000154  loss: 2.8851 (2.6284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1148 (1.2326)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.3939 (2.6251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0318 (1.2304)  time: 0.2354  data: 0.0006  max mem: 21847
Epoch: [264] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.3939 (2.6502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0318 (1.2304)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5237 (0.5237)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.3743  data: 5.2167  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7096 (0.6826)  acc1: 86.8000 (85.7818)  acc5: 97.6000 (97.4909)  time: 0.6734  data: 0.5261  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8499 (0.8074)  acc1: 80.4000 (82.6667)  acc5: 96.4000 (96.1714)  time: 0.1878  data: 0.0508  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9189 (0.8184)  acc1: 79.6000 (82.3040)  acc5: 95.6000 (96.0800)  time: 0.1843  data: 0.0517  max mem: 21847
Test: Total time: 0:00:09 (0.3894 s / it)
* Acc@1 82.710 Acc@5 96.228 loss 0.814
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.72%
Epoch: [265]  [   0/1251]  eta: 1:07:51  lr: 0.000153  min_lr: 0.000153  loss: 1.7037 (1.7037)  weight_decay: 0.0500 (0.0500)  time: 3.2542  data: 1.6585  max mem: 21847
Epoch: [265]  [ 200/1251]  eta: 0:05:07  lr: 0.000152  min_lr: 0.000152  loss: 2.9027 (2.6265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1279 (1.1562)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [265]  [ 400/1251]  eta: 0:03:59  lr: 0.000150  min_lr: 0.000150  loss: 2.6034 (2.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1490 (1.1495)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [265]  [ 600/1251]  eta: 0:03:01  lr: 0.000149  min_lr: 0.000149  loss: 3.0327 (2.6391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0288 (1.1638)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [265]  [ 800/1251]  eta: 0:02:05  lr: 0.000148  min_lr: 0.000148  loss: 2.6920 (2.6444)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1391 (1.1683)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [265]  [1000/1251]  eta: 0:01:09  lr: 0.000146  min_lr: 0.000146  loss: 2.6912 (2.6375)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0634 (1.1645)  time: 0.2703  data: 0.0004  max mem: 21847
Epoch: [265]  [1200/1251]  eta: 0:00:14  lr: 0.000145  min_lr: 0.000145  loss: 2.9355 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9809 (1.1651)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 3.0652 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1499 (1.1676)  time: 0.2353  data: 0.0007  max mem: 21847
Epoch: [265] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 3.0652 (2.6415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1499 (1.1676)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.6841 (0.6841)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.0203  data: 4.8719  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.8573 (0.8398)  acc1: 86.8000 (85.8545)  acc5: 97.2000 (97.4545)  time: 0.6365  data: 0.5024  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9963 (0.9708)  acc1: 79.6000 (82.5524)  acc5: 96.0000 (96.1333)  time: 0.1953  data: 0.0652  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0467 (0.9790)  acc1: 79.6000 (82.2560)  acc5: 95.2000 (96.0000)  time: 0.1966  data: 0.0683  max mem: 21847
Test: Total time: 0:00:09 (0.3958 s / it)
* Acc@1 82.566 Acc@5 96.156 loss 0.975
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.72%
Epoch: [266]  [   0/1251]  eta: 1:07:48  lr: 0.000145  min_lr: 0.000145  loss: 2.0282 (2.0282)  weight_decay: 0.0500 (0.0500)  time: 3.2525  data: 2.8677  max mem: 21847
Epoch: [266]  [ 200/1251]  eta: 0:05:03  lr: 0.000143  min_lr: 0.000143  loss: 2.0378 (2.6808)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1781 (1.2229)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [266]  [ 400/1251]  eta: 0:04:00  lr: 0.000142  min_lr: 0.000142  loss: 2.7571 (2.6722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9777 (1.1734)  time: 0.2738  data: 0.0004  max mem: 21847
Epoch: [266]  [ 600/1251]  eta: 0:03:02  lr: 0.000141  min_lr: 0.000141  loss: 2.6050 (2.6729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2393 (1.2007)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [266]  [ 800/1251]  eta: 0:02:05  lr: 0.000139  min_lr: 0.000139  loss: 2.8590 (2.6599)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1807 (1.1920)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [266]  [1000/1251]  eta: 0:01:09  lr: 0.000138  min_lr: 0.000138  loss: 2.6186 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1082 (1.1869)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [266]  [1200/1251]  eta: 0:00:14  lr: 0.000137  min_lr: 0.000137  loss: 2.4179 (2.6543)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0585 (1.1766)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.7448 (2.6514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0840 (1.1749)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [266] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.7448 (2.6320)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0840 (1.1749)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5531 (0.5531)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.7540  data: 5.6067  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7484 (0.7193)  acc1: 86.4000 (85.8909)  acc5: 97.6000 (97.5273)  time: 0.7387  data: 0.6046  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8657 (0.8523)  acc1: 80.8000 (82.8952)  acc5: 96.4000 (96.4000)  time: 0.1902  data: 0.0600  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9295 (0.8606)  acc1: 80.4000 (82.4640)  acc5: 96.0000 (96.3200)  time: 0.1887  data: 0.0599  max mem: 21847
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 82.624 Acc@5 96.224 loss 0.858
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.72%
Epoch: [267]  [   0/1251]  eta: 1:07:12  lr: 0.000136  min_lr: 0.000136  loss: 3.4191 (3.4191)  weight_decay: 0.0500 (0.0500)  time: 3.2232  data: 1.6107  max mem: 21847
Epoch: [267]  [ 200/1251]  eta: 0:05:04  lr: 0.000135  min_lr: 0.000135  loss: 2.3331 (2.5934)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0970 (1.1317)  time: 0.2825  data: 0.0004  max mem: 21847
Epoch: [267]  [ 400/1251]  eta: 0:04:00  lr: 0.000134  min_lr: 0.000134  loss: 2.6053 (2.6217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1948 (1.1925)  time: 0.2735  data: 0.0004  max mem: 21847
Epoch: [267]  [ 600/1251]  eta: 0:03:02  lr: 0.000133  min_lr: 0.000133  loss: 2.8899 (2.6480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0627 (1.1722)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [267]  [ 800/1251]  eta: 0:02:05  lr: 0.000131  min_lr: 0.000131  loss: 2.6594 (2.6370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1175 (1.1785)  time: 0.2814  data: 0.0004  max mem: 21847
Epoch: [267]  [1000/1251]  eta: 0:01:09  lr: 0.000130  min_lr: 0.000130  loss: 2.5393 (2.6206)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1345 (1.1750)  time: 0.2761  data: 0.0004  max mem: 21847
Epoch: [267]  [1200/1251]  eta: 0:00:14  lr: 0.000129  min_lr: 0.000129  loss: 2.5169 (2.6067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1816 (1.1792)  time: 0.2800  data: 0.0005  max mem: 21847
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.8987 (2.6134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0756 (1.1747)  time: 0.2282  data: 0.0008  max mem: 21847
Epoch: [267] Total time: 0:05:47 (0.2779 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.8987 (2.6324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0756 (1.1747)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6015 (0.6015)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.6928  data: 5.5431  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7546 (0.7593)  acc1: 86.0000 (85.6364)  acc5: 97.6000 (97.6000)  time: 0.7015  data: 0.5685  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9192 (0.8840)  acc1: 80.8000 (82.6857)  acc5: 96.4000 (96.2286)  time: 0.1891  data: 0.0594  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9511 (0.8921)  acc1: 80.0000 (82.3520)  acc5: 95.2000 (96.1440)  time: 0.1881  data: 0.0593  max mem: 21847
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 82.646 Acc@5 96.226 loss 0.891
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.72%
Epoch: [268]  [   0/1251]  eta: 1:08:08  lr: 0.000128  min_lr: 0.000128  loss: 2.8274 (2.8274)  weight_decay: 0.0500 (0.0500)  time: 3.2686  data: 2.2667  max mem: 21847
Epoch: [268]  [ 200/1251]  eta: 0:05:03  lr: 0.000127  min_lr: 0.000127  loss: 2.7940 (2.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1527 (1.1643)  time: 0.2716  data: 0.0003  max mem: 21847
Epoch: [268]  [ 400/1251]  eta: 0:04:00  lr: 0.000126  min_lr: 0.000126  loss: 2.6880 (2.6120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (1.2026)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [268]  [ 600/1251]  eta: 0:03:02  lr: 0.000125  min_lr: 0.000125  loss: 2.8945 (2.6261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1052 (1.2338)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [268]  [ 800/1251]  eta: 0:02:05  lr: 0.000123  min_lr: 0.000123  loss: 2.9143 (2.6301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2196 (1.2459)  time: 0.2732  data: 0.0005  max mem: 21847
Epoch: [268]  [1000/1251]  eta: 0:01:09  lr: 0.000122  min_lr: 0.000122  loss: 2.3621 (2.6341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0362 (1.2287)  time: 0.2716  data: 0.0005  max mem: 21847
Epoch: [268]  [1200/1251]  eta: 0:00:14  lr: 0.000121  min_lr: 0.000121  loss: 3.0294 (2.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2478 (1.2167)  time: 0.2772  data: 0.0005  max mem: 21847
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.7758 (2.6308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (1.2130)  time: 0.2280  data: 0.0008  max mem: 21847
Epoch: [268] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.7758 (2.6279)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (1.2130)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5239 (0.5239)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.3996  data: 5.2433  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7268 (0.6910)  acc1: 86.4000 (85.8182)  acc5: 97.2000 (97.4182)  time: 0.7260  data: 0.5924  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8343 (0.8162)  acc1: 80.8000 (82.8952)  acc5: 96.4000 (96.1905)  time: 0.2130  data: 0.0831  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.8791 (0.8240)  acc1: 80.4000 (82.5920)  acc5: 95.6000 (96.1760)  time: 0.2125  data: 0.0830  max mem: 21847
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 82.730 Acc@5 96.258 loss 0.821
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.73%
Epoch: [269]  [   0/1251]  eta: 1:03:47  lr: 0.000121  min_lr: 0.000121  loss: 2.0935 (2.0935)  weight_decay: 0.0500 (0.0500)  time: 3.0597  data: 2.7173  max mem: 21847
Epoch: [269]  [ 200/1251]  eta: 0:05:03  lr: 0.000120  min_lr: 0.000120  loss: 2.1934 (2.6860)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0896 (1.2145)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [269]  [ 400/1251]  eta: 0:04:00  lr: 0.000118  min_lr: 0.000118  loss: 2.9078 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0627 (1.1807)  time: 0.2737  data: 0.0004  max mem: 21847
Epoch: [269]  [ 600/1251]  eta: 0:03:01  lr: 0.000117  min_lr: 0.000117  loss: 2.9274 (2.6519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1311 (1.1913)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [269]  [ 800/1251]  eta: 0:02:05  lr: 0.000116  min_lr: 0.000116  loss: 2.6816 (2.6449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0409 (1.1717)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [269]  [1000/1251]  eta: 0:01:09  lr: 0.000115  min_lr: 0.000115  loss: 2.2144 (2.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3162 (1.1923)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [269]  [1200/1251]  eta: 0:00:14  lr: 0.000113  min_lr: 0.000113  loss: 2.7973 (2.6423)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.7250 (2.6411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1375 (nan)  time: 0.2276  data: 0.0005  max mem: 21847
Epoch: [269] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.7250 (2.6306)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1375 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5702 (0.5702)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.6673  data: 5.5173  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7471 (0.7171)  acc1: 86.0000 (85.7818)  acc5: 97.2000 (97.4182)  time: 0.7409  data: 0.6076  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8870 (0.8494)  acc1: 80.8000 (82.6095)  acc5: 96.0000 (96.0381)  time: 0.2059  data: 0.0750  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9374 (0.8595)  acc1: 80.0000 (82.3680)  acc5: 95.6000 (96.0320)  time: 0.2049  data: 0.0750  max mem: 21847
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 82.618 Acc@5 96.246 loss 0.855
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.73%
Epoch: [270]  [   0/1251]  eta: 1:14:23  lr: 0.000113  min_lr: 0.000113  loss: 2.9272 (2.9272)  weight_decay: 0.0500 (0.0500)  time: 3.5680  data: 3.1928  max mem: 21847
Epoch: [270]  [ 200/1251]  eta: 0:05:07  lr: 0.000112  min_lr: 0.000112  loss: 2.2197 (2.6208)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1769 (1.2351)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [270]  [ 400/1251]  eta: 0:04:01  lr: 0.000111  min_lr: 0.000111  loss: 2.7229 (2.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0861 (1.2355)  time: 0.2721  data: 0.0006  max mem: 21847
Epoch: [270]  [ 600/1251]  eta: 0:03:02  lr: 0.000110  min_lr: 0.000110  loss: 3.0191 (2.5995)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1988 (1.2402)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [270]  [ 800/1251]  eta: 0:02:05  lr: 0.000109  min_lr: 0.000109  loss: 2.4976 (2.5929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3301 (1.2370)  time: 0.2731  data: 0.0005  max mem: 21847
Epoch: [270]  [1000/1251]  eta: 0:01:09  lr: 0.000107  min_lr: 0.000107  loss: 2.9072 (2.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1737 (1.2451)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [270]  [1200/1251]  eta: 0:00:14  lr: 0.000106  min_lr: 0.000106  loss: 2.5171 (2.5997)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2767  data: 0.0005  max mem: 21847
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.6059 (2.5972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1119 (nan)  time: 0.2283  data: 0.0007  max mem: 21847
Epoch: [270] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.6059 (2.6168)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1119 (nan)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5554 (0.5554)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7957  data: 5.6218  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7547 (0.7351)  acc1: 86.0000 (85.8546)  acc5: 97.6000 (97.6000)  time: 0.7070  data: 0.5725  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8923 (0.8656)  acc1: 80.8000 (82.7048)  acc5: 96.0000 (96.2095)  time: 0.1768  data: 0.0472  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9199 (0.8723)  acc1: 80.8000 (82.5280)  acc5: 95.6000 (96.1600)  time: 0.1761  data: 0.0471  max mem: 21847
Test: Total time: 0:00:09 (0.3985 s / it)
* Acc@1 82.750 Acc@5 96.248 loss 0.872
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.75%
Epoch: [271]  [   0/1251]  eta: 0:57:09  lr: 0.000106  min_lr: 0.000106  loss: 2.7133 (2.7133)  weight_decay: 0.0500 (0.0500)  time: 2.7416  data: 2.3780  max mem: 21847
Epoch: [271]  [ 200/1251]  eta: 0:05:02  lr: 0.000105  min_lr: 0.000105  loss: 2.1699 (2.5585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1820 (1.1784)  time: 0.2708  data: 0.0005  max mem: 21847
Epoch: [271]  [ 400/1251]  eta: 0:03:59  lr: 0.000104  min_lr: 0.000104  loss: 2.7295 (2.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4162 (1.2700)  time: 0.2826  data: 0.0003  max mem: 21847
Epoch: [271]  [ 600/1251]  eta: 0:03:01  lr: 0.000102  min_lr: 0.000102  loss: 2.1060 (2.6011)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1935 (1.2737)  time: 0.2762  data: 0.0004  max mem: 21847
Epoch: [271]  [ 800/1251]  eta: 0:02:05  lr: 0.000101  min_lr: 0.000101  loss: 2.7955 (2.5957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1940 (1.2557)  time: 0.2795  data: 0.0004  max mem: 21847
Epoch: [271]  [1000/1251]  eta: 0:01:09  lr: 0.000100  min_lr: 0.000100  loss: 2.9779 (2.6034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1403 (1.2432)  time: 0.2740  data: 0.0003  max mem: 21847
Epoch: [271]  [1200/1251]  eta: 0:00:14  lr: 0.000099  min_lr: 0.000099  loss: 2.7823 (2.6059)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1321 (1.2352)  time: 0.2711  data: 0.0003  max mem: 21847
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.6851 (2.6067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1694 (1.2349)  time: 0.2311  data: 0.0005  max mem: 21847
Epoch: [271] Total time: 0:05:45 (0.2763 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.6851 (2.6201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1694 (1.2349)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5386 (0.5386)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.4538  data: 5.3030  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7234 (0.7047)  acc1: 86.8000 (85.9273)  acc5: 97.2000 (97.3455)  time: 0.7244  data: 0.5909  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8556 (0.8358)  acc1: 81.2000 (82.8952)  acc5: 96.4000 (96.1714)  time: 0.2078  data: 0.0778  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9239 (0.8461)  acc1: 80.4000 (82.6400)  acc5: 95.6000 (96.1120)  time: 0.2070  data: 0.0777  max mem: 21847
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 82.828 Acc@5 96.262 loss 0.847
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.83%
Epoch: [272]  [   0/1251]  eta: 1:06:12  lr: 0.000099  min_lr: 0.000099  loss: 1.8506 (1.8506)  weight_decay: 0.0500 (0.0500)  time: 3.1752  data: 2.8690  max mem: 21847
Epoch: [272]  [ 200/1251]  eta: 0:05:04  lr: 0.000098  min_lr: 0.000098  loss: 2.2871 (2.6217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.1961)  time: 0.2704  data: 0.0003  max mem: 21847
Epoch: [272]  [ 400/1251]  eta: 0:03:59  lr: 0.000097  min_lr: 0.000097  loss: 2.6600 (2.6234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2955 (1.2028)  time: 0.2729  data: 0.0004  max mem: 21847
Epoch: [272]  [ 600/1251]  eta: 0:03:02  lr: 0.000096  min_lr: 0.000096  loss: 2.4695 (2.6223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1248 (1.2176)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [272]  [ 800/1251]  eta: 0:02:05  lr: 0.000094  min_lr: 0.000094  loss: 2.5804 (2.6102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1941 (1.2371)  time: 0.2805  data: 0.0004  max mem: 21847
Epoch: [272]  [1000/1251]  eta: 0:01:09  lr: 0.000093  min_lr: 0.000093  loss: 2.8333 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1657 (1.2330)  time: 0.2711  data: 0.0005  max mem: 21847
Epoch: [272]  [1200/1251]  eta: 0:00:14  lr: 0.000092  min_lr: 0.000092  loss: 2.6195 (2.6173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0923 (1.2157)  time: 0.2762  data: 0.0004  max mem: 21847
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.8015 (2.6193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1294 (1.2171)  time: 0.2277  data: 0.0008  max mem: 21847
Epoch: [272] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.8015 (2.6197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1294 (1.2171)
Test:  [ 0/25]  eta: 0:01:54  loss: 0.5520 (0.5520)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 4.5807  data: 4.4323  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7455 (0.7145)  acc1: 86.8000 (85.6000)  acc5: 97.6000 (97.4546)  time: 0.6623  data: 0.5278  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8652 (0.8419)  acc1: 80.4000 (82.5143)  acc5: 96.0000 (96.1524)  time: 0.2307  data: 0.1005  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.8904 (0.8497)  acc1: 80.0000 (82.3200)  acc5: 96.0000 (96.0960)  time: 0.1871  data: 0.0596  max mem: 21847
Test: Total time: 0:00:09 (0.3955 s / it)
* Acc@1 82.792 Acc@5 96.260 loss 0.849
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.83%
Epoch: [273]  [   0/1251]  eta: 1:08:50  lr: 0.000092  min_lr: 0.000092  loss: 2.8812 (2.8812)  weight_decay: 0.0500 (0.0500)  time: 3.3021  data: 2.9763  max mem: 21847
Epoch: [273]  [ 200/1251]  eta: 0:05:04  lr: 0.000091  min_lr: 0.000091  loss: 2.7557 (2.6986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2537 (1.2076)  time: 0.2715  data: 0.0004  max mem: 21847
Epoch: [273]  [ 400/1251]  eta: 0:03:59  lr: 0.000090  min_lr: 0.000090  loss: 2.7879 (2.6782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1234 (1.2437)  time: 0.2726  data: 0.0004  max mem: 21847
Epoch: [273]  [ 600/1251]  eta: 0:03:01  lr: 0.000089  min_lr: 0.000089  loss: 2.3170 (2.6451)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2270 (1.2528)  time: 0.2712  data: 0.0003  max mem: 21847
Epoch: [273]  [ 800/1251]  eta: 0:02:05  lr: 0.000088  min_lr: 0.000088  loss: 2.8378 (2.6480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1419 (1.2456)  time: 0.2738  data: 0.0006  max mem: 21847
Epoch: [273]  [1000/1251]  eta: 0:01:09  lr: 0.000087  min_lr: 0.000087  loss: 2.7510 (2.6437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0911 (1.2335)  time: 0.2722  data: 0.0005  max mem: 21847
Epoch: [273]  [1200/1251]  eta: 0:00:14  lr: 0.000086  min_lr: 0.000086  loss: 2.7764 (2.6381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1219 (1.2322)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.7310 (2.6404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1830 (1.2350)  time: 0.2285  data: 0.0007  max mem: 21847
Epoch: [273] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.7310 (2.6173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1830 (1.2350)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5692 (0.5692)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.8732  data: 5.7218  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7611 (0.7354)  acc1: 87.2000 (85.8182)  acc5: 97.6000 (97.4182)  time: 0.7670  data: 0.6339  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8872 (0.8636)  acc1: 80.8000 (82.9143)  acc5: 96.0000 (96.2476)  time: 0.2146  data: 0.0852  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9233 (0.8717)  acc1: 80.4000 (82.6400)  acc5: 95.6000 (96.1920)  time: 0.2135  data: 0.0851  max mem: 21847
Test: Total time: 0:00:10 (0.4303 s / it)
* Acc@1 82.836 Acc@5 96.296 loss 0.870
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.84%
Epoch: [274]  [   0/1251]  eta: 1:03:21  lr: 0.000085  min_lr: 0.000085  loss: 3.6125 (3.6125)  weight_decay: 0.0500 (0.0500)  time: 3.0385  data: 2.7150  max mem: 21847
Epoch: [274]  [ 200/1251]  eta: 0:05:06  lr: 0.000084  min_lr: 0.000084  loss: 2.5774 (2.6023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1891 (1.2394)  time: 0.2810  data: 0.0004  max mem: 21847
Epoch: [274]  [ 400/1251]  eta: 0:04:00  lr: 0.000083  min_lr: 0.000083  loss: 2.8627 (2.6248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1703 (1.2834)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [274]  [ 600/1251]  eta: 0:03:02  lr: 0.000082  min_lr: 0.000082  loss: 2.3801 (2.6236)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1016 (1.2436)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [274]  [ 800/1251]  eta: 0:02:05  lr: 0.000081  min_lr: 0.000081  loss: 2.8263 (2.6161)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0951 (1.2339)  time: 0.2791  data: 0.0004  max mem: 21847
Epoch: [274]  [1000/1251]  eta: 0:01:09  lr: 0.000080  min_lr: 0.000080  loss: 2.8006 (2.6145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2360 (1.2277)  time: 0.2828  data: 0.0004  max mem: 21847
Epoch: [274]  [1200/1251]  eta: 0:00:14  lr: 0.000079  min_lr: 0.000079  loss: 2.4909 (2.6168)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1207 (1.2207)  time: 0.2743  data: 0.0004  max mem: 21847
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.9187 (2.6183)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1243 (1.2215)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [274] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.9187 (2.6063)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1243 (1.2215)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5727 (0.5727)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6976  data: 5.5500  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7714 (0.7455)  acc1: 86.4000 (85.7818)  acc5: 97.6000 (97.4546)  time: 0.7183  data: 0.5825  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9008 (0.8749)  acc1: 81.2000 (82.8191)  acc5: 96.0000 (96.2286)  time: 0.1870  data: 0.0560  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9509 (0.8836)  acc1: 80.0000 (82.4000)  acc5: 95.6000 (96.2560)  time: 0.1889  data: 0.0570  max mem: 21847
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 82.854 Acc@5 96.258 loss 0.881
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.85%
Epoch: [275]  [   0/1251]  eta: 0:51:25  lr: 0.000079  min_lr: 0.000079  loss: 1.6764 (1.6764)  weight_decay: 0.0500 (0.0500)  time: 2.4661  data: 2.1163  max mem: 21847
Epoch: [275]  [ 200/1251]  eta: 0:05:00  lr: 0.000078  min_lr: 0.000078  loss: 3.0269 (2.6001)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1469 (1.2253)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [275]  [ 400/1251]  eta: 0:03:59  lr: 0.000077  min_lr: 0.000077  loss: 2.8982 (2.6282)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0907 (1.1909)  time: 0.2819  data: 0.0005  max mem: 21847
Epoch: [275]  [ 600/1251]  eta: 0:03:01  lr: 0.000076  min_lr: 0.000076  loss: 2.9276 (2.6354)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1461 (1.1895)  time: 0.2757  data: 0.0004  max mem: 21847
Epoch: [275]  [ 800/1251]  eta: 0:02:04  lr: 0.000075  min_lr: 0.000075  loss: 2.8180 (2.6189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2318 (1.1838)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [275]  [1000/1251]  eta: 0:01:09  lr: 0.000074  min_lr: 0.000074  loss: 2.1042 (2.6137)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2684 (1.1971)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [275]  [1200/1251]  eta: 0:00:14  lr: 0.000073  min_lr: 0.000073  loss: 2.4606 (2.6191)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0706 (1.1870)  time: 0.2742  data: 0.0004  max mem: 21847
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.6271 (2.6151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.1830)  time: 0.2335  data: 0.0005  max mem: 21847
Epoch: [275] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.6271 (2.6212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.1830)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5516 (0.5516)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.2752  data: 5.1021  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7494 (0.7213)  acc1: 86.0000 (85.4909)  acc5: 97.6000 (97.4909)  time: 0.7080  data: 0.5705  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8798 (0.8490)  acc1: 80.8000 (82.6667)  acc5: 96.0000 (96.2857)  time: 0.2064  data: 0.0751  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9185 (0.8578)  acc1: 80.4000 (82.4160)  acc5: 95.6000 (96.2880)  time: 0.2109  data: 0.0820  max mem: 21847
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 82.914 Acc@5 96.270 loss 0.857
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [276]  [   0/1251]  eta: 1:06:45  lr: 0.000073  min_lr: 0.000073  loss: 2.9767 (2.9767)  weight_decay: 0.0500 (0.0500)  time: 3.2022  data: 2.8804  max mem: 21847
Epoch: [276]  [ 200/1251]  eta: 0:05:00  lr: 0.000072  min_lr: 0.000072  loss: 2.8544 (2.5349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2363 (1.2266)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [276]  [ 400/1251]  eta: 0:03:58  lr: 0.000071  min_lr: 0.000071  loss: 3.0053 (2.5572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1228 (1.2227)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [276]  [ 600/1251]  eta: 0:03:01  lr: 0.000070  min_lr: 0.000070  loss: 2.8634 (2.5676)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2606 (1.2339)  time: 0.2806  data: 0.0004  max mem: 21847
Epoch: [276]  [ 800/1251]  eta: 0:02:05  lr: 0.000069  min_lr: 0.000069  loss: 2.7569 (2.5685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3135 (1.2510)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [276]  [1000/1251]  eta: 0:01:09  lr: 0.000068  min_lr: 0.000068  loss: 2.9610 (2.5834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2735 (1.2622)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [276]  [1200/1251]  eta: 0:00:14  lr: 0.000067  min_lr: 0.000067  loss: 2.7165 (2.5910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1898 (1.2665)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.6480 (2.5911)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2716 (1.2681)  time: 0.2335  data: 0.0008  max mem: 21847
Epoch: [276] Total time: 0:05:45 (0.2765 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.6480 (2.5985)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2716 (1.2681)
Test:  [ 0/25]  eta: 0:01:24  loss: 0.5434 (0.5434)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 3.3915  data: 3.2407  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7326 (0.7106)  acc1: 86.8000 (85.5636)  acc5: 97.6000 (97.6000)  time: 0.6321  data: 0.4959  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8629 (0.8352)  acc1: 81.2000 (82.8571)  acc5: 96.4000 (96.3619)  time: 0.3059  data: 0.1748  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9005 (0.8431)  acc1: 80.0000 (82.5280)  acc5: 95.6000 (96.2880)  time: 0.2367  data: 0.1084  max mem: 21847
Test: Total time: 0:00:10 (0.4268 s / it)
* Acc@1 82.904 Acc@5 96.300 loss 0.843
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [277]  [   0/1251]  eta: 1:11:16  lr: 0.000067  min_lr: 0.000067  loss: 2.9052 (2.9052)  weight_decay: 0.0500 (0.0500)  time: 3.4189  data: 2.9056  max mem: 21847
Epoch: [277]  [ 200/1251]  eta: 0:05:05  lr: 0.000066  min_lr: 0.000066  loss: 2.9416 (2.5986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2915 (1.2611)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [277]  [ 400/1251]  eta: 0:03:59  lr: 0.000065  min_lr: 0.000065  loss: 2.9676 (2.6092)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1748 (1.2445)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [277]  [ 600/1251]  eta: 0:03:01  lr: 0.000064  min_lr: 0.000064  loss: 2.6546 (2.6125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1232 (1.2455)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [277]  [ 800/1251]  eta: 0:02:05  lr: 0.000064  min_lr: 0.000064  loss: 2.3453 (2.6047)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2111 (1.2401)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [277]  [1000/1251]  eta: 0:01:09  lr: 0.000063  min_lr: 0.000063  loss: 2.1791 (2.5904)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1826 (1.2381)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [277]  [1200/1251]  eta: 0:00:14  lr: 0.000062  min_lr: 0.000062  loss: 2.8835 (2.5908)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1001 (1.2256)  time: 0.2793  data: 0.0004  max mem: 21847
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.8906 (2.5928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0575 (1.2229)  time: 0.2288  data: 0.0007  max mem: 21847
Epoch: [277] Total time: 0:05:46 (0.2766 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.8906 (2.6129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0575 (1.2229)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5823 (0.5823)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.3637  data: 5.1843  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7671 (0.7475)  acc1: 87.2000 (85.6000)  acc5: 97.2000 (97.4909)  time: 0.7390  data: 0.6014  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9177 (0.8732)  acc1: 80.8000 (82.7238)  acc5: 96.0000 (96.1905)  time: 0.2165  data: 0.0861  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9298 (0.8803)  acc1: 79.6000 (82.3680)  acc5: 95.6000 (96.1760)  time: 0.2143  data: 0.0860  max mem: 21847
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 82.874 Acc@5 96.278 loss 0.879
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [278]  [   0/1251]  eta: 1:11:28  lr: 0.000062  min_lr: 0.000062  loss: 3.2049 (3.2049)  weight_decay: 0.0500 (0.0500)  time: 3.4285  data: 2.3865  max mem: 21847
Epoch: [278]  [ 200/1251]  eta: 0:05:06  lr: 0.000061  min_lr: 0.000061  loss: 2.8989 (2.6099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2059 (1.2785)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [278]  [ 400/1251]  eta: 0:04:01  lr: 0.000060  min_lr: 0.000060  loss: 2.8063 (2.6242)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2625 (1.2175)  time: 0.2818  data: 0.0005  max mem: 21847
Epoch: [278]  [ 600/1251]  eta: 0:03:02  lr: 0.000059  min_lr: 0.000059  loss: 3.0331 (2.6209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1902 (1.2123)  time: 0.2747  data: 0.0004  max mem: 21847
Epoch: [278]  [ 800/1251]  eta: 0:02:05  lr: 0.000058  min_lr: 0.000058  loss: 2.5498 (2.6251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1238 (1.1839)  time: 0.2708  data: 0.0004  max mem: 21847
Epoch: [278]  [1000/1251]  eta: 0:01:09  lr: 0.000057  min_lr: 0.000057  loss: 2.9640 (2.6094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1351 (1.1884)  time: 0.2715  data: 0.0005  max mem: 21847
Epoch: [278]  [1200/1251]  eta: 0:00:14  lr: 0.000056  min_lr: 0.000056  loss: 2.8235 (2.6120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2209 (1.1979)  time: 0.2707  data: 0.0005  max mem: 21847
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.5874 (2.6122)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1189 (1.1937)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [278] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.5874 (2.6009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1189 (1.1937)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5600 (0.5600)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5203  data: 5.3728  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7334 (0.7096)  acc1: 86.8000 (85.5636)  acc5: 97.6000 (97.4909)  time: 0.7514  data: 0.6188  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8718 (0.8365)  acc1: 80.8000 (82.6476)  acc5: 96.4000 (96.2476)  time: 0.2191  data: 0.0899  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.8952 (0.8446)  acc1: 80.4000 (82.4000)  acc5: 95.6000 (96.1920)  time: 0.2180  data: 0.0899  max mem: 21847
Test: Total time: 0:00:10 (0.4201 s / it)
* Acc@1 82.846 Acc@5 96.278 loss 0.842
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.91%
Epoch: [279]  [   0/1251]  eta: 1:09:45  lr: 0.000056  min_lr: 0.000056  loss: 2.3030 (2.3030)  weight_decay: 0.0500 (0.0500)  time: 3.3454  data: 2.9588  max mem: 21847
Epoch: [279]  [ 200/1251]  eta: 0:05:04  lr: 0.000055  min_lr: 0.000055  loss: 2.4914 (2.5909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1098 (1.1946)  time: 0.2785  data: 0.0004  max mem: 21847
Epoch: [279]  [ 400/1251]  eta: 0:04:00  lr: 0.000055  min_lr: 0.000055  loss: 2.5971 (2.5919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2793 (1.2139)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [279]  [ 600/1251]  eta: 0:03:02  lr: 0.000054  min_lr: 0.000054  loss: 2.3254 (2.5793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1108 (1.2157)  time: 0.2916  data: 0.0005  max mem: 21847
Epoch: [279]  [ 800/1251]  eta: 0:02:05  lr: 0.000053  min_lr: 0.000053  loss: 2.7635 (2.5932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2523 (1.2192)  time: 0.2820  data: 0.0004  max mem: 21847
Epoch: [279]  [1000/1251]  eta: 0:01:09  lr: 0.000052  min_lr: 0.000052  loss: 2.4778 (2.5975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1392 (1.2225)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [279]  [1200/1251]  eta: 0:00:14  lr: 0.000051  min_lr: 0.000051  loss: 2.7586 (2.6005)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2345 (1.2206)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.7349 (2.6011)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2034 (1.2262)  time: 0.2284  data: 0.0006  max mem: 21847
Epoch: [279] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.7349 (2.6038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2034 (1.2262)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5576 (0.5576)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.7518  data: 5.5960  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7520 (0.7199)  acc1: 86.8000 (86.0364)  acc5: 97.6000 (97.6364)  time: 0.6661  data: 0.5333  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8850 (0.8471)  acc1: 80.0000 (82.7810)  acc5: 96.4000 (96.2476)  time: 0.1645  data: 0.0354  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9109 (0.8555)  acc1: 80.0000 (82.5280)  acc5: 96.0000 (96.2400)  time: 0.1815  data: 0.0531  max mem: 21847
Test: Total time: 0:00:10 (0.4013 s / it)
* Acc@1 82.862 Acc@5 96.270 loss 0.854
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [280]  [   0/1251]  eta: 1:02:25  lr: 0.000051  min_lr: 0.000051  loss: 2.6082 (2.6082)  weight_decay: 0.0500 (0.0500)  time: 2.9939  data: 2.5304  max mem: 21847
Epoch: [280]  [ 200/1251]  eta: 0:05:02  lr: 0.000050  min_lr: 0.000050  loss: 2.8884 (2.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1353 (1.1751)  time: 0.2736  data: 0.0004  max mem: 21847
Epoch: [280]  [ 400/1251]  eta: 0:03:58  lr: 0.000050  min_lr: 0.000050  loss: 2.8787 (2.6400)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2717 (1.1985)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [280]  [ 600/1251]  eta: 0:03:01  lr: 0.000049  min_lr: 0.000049  loss: 2.0208 (2.6133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2110 (1.2220)  time: 0.2697  data: 0.0004  max mem: 21847
Epoch: [280]  [ 800/1251]  eta: 0:02:05  lr: 0.000048  min_lr: 0.000048  loss: 2.7250 (2.6103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1119 (1.2216)  time: 0.2760  data: 0.0004  max mem: 21847
Epoch: [280]  [1000/1251]  eta: 0:01:09  lr: 0.000047  min_lr: 0.000047  loss: 2.3982 (2.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0319 (1.2030)  time: 0.2787  data: 0.0004  max mem: 21847
Epoch: [280]  [1200/1251]  eta: 0:00:14  lr: 0.000046  min_lr: 0.000046  loss: 2.6572 (2.6218)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1781 (1.2061)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.7587 (2.6216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.2066)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [280] Total time: 0:05:45 (0.2764 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.7587 (2.6102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.2066)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5581 (0.5581)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.4593  data: 5.3093  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7421 (0.7185)  acc1: 86.8000 (86.0727)  acc5: 97.2000 (97.5273)  time: 0.7392  data: 0.6055  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8880 (0.8460)  acc1: 81.2000 (82.9905)  acc5: 96.0000 (96.1905)  time: 0.2115  data: 0.0786  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9048 (0.8539)  acc1: 80.4000 (82.6400)  acc5: 96.0000 (96.1760)  time: 0.2106  data: 0.0785  max mem: 21847
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 82.866 Acc@5 96.254 loss 0.853
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [281]  [   0/1251]  eta: 1:10:52  lr: 0.000046  min_lr: 0.000046  loss: 2.6022 (2.6022)  weight_decay: 0.0500 (0.0500)  time: 3.3991  data: 3.0194  max mem: 21847
Epoch: [281]  [ 200/1251]  eta: 0:05:06  lr: 0.000046  min_lr: 0.000046  loss: 2.8573 (2.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0490 (1.1587)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [281]  [ 400/1251]  eta: 0:03:59  lr: 0.000045  min_lr: 0.000045  loss: 2.9450 (2.6511)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2669 (1.1958)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [281]  [ 600/1251]  eta: 0:03:01  lr: 0.000044  min_lr: 0.000044  loss: 2.5327 (2.6279)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0715 (1.2005)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [281]  [ 800/1251]  eta: 0:02:05  lr: 0.000043  min_lr: 0.000043  loss: 2.7695 (2.6268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0783 (1.2013)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [281]  [1000/1251]  eta: 0:01:09  lr: 0.000043  min_lr: 0.000043  loss: 2.4151 (2.6223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0689 (1.1990)  time: 0.2722  data: 0.0004  max mem: 21847
Epoch: [281]  [1200/1251]  eta: 0:00:14  lr: 0.000042  min_lr: 0.000042  loss: 2.7160 (2.6120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0686 (1.1983)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.8108 (2.6117)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0962 (1.1976)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [281] Total time: 0:05:46 (0.2768 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.8108 (2.6094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0962 (1.1976)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5761 (0.5761)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.3795  data: 5.2338  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7719 (0.7395)  acc1: 86.8000 (86.0364)  acc5: 97.6000 (97.5273)  time: 0.7310  data: 0.5974  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9039 (0.8616)  acc1: 80.8000 (82.9333)  acc5: 96.4000 (96.3238)  time: 0.2147  data: 0.0834  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9330 (0.8702)  acc1: 80.0000 (82.5600)  acc5: 95.6000 (96.2560)  time: 0.2129  data: 0.0824  max mem: 21847
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 82.854 Acc@5 96.270 loss 0.870
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [282]  [   0/1251]  eta: 1:03:50  lr: 0.000042  min_lr: 0.000042  loss: 3.1405 (3.1405)  weight_decay: 0.0500 (0.0500)  time: 3.0620  data: 2.5751  max mem: 21847
Epoch: [282]  [ 200/1251]  eta: 0:05:05  lr: 0.000041  min_lr: 0.000041  loss: 2.9449 (2.5726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0732 (1.1561)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [282]  [ 400/1251]  eta: 0:04:00  lr: 0.000040  min_lr: 0.000040  loss: 2.5474 (2.5844)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0965 (1.2313)  time: 0.2723  data: 0.0004  max mem: 21847
Epoch: [282]  [ 600/1251]  eta: 0:03:02  lr: 0.000040  min_lr: 0.000040  loss: 2.8659 (2.5928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1842 (1.2588)  time: 0.2728  data: 0.0004  max mem: 21847
Epoch: [282]  [ 800/1251]  eta: 0:02:05  lr: 0.000039  min_lr: 0.000039  loss: 2.9390 (2.5983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1646 (1.2515)  time: 0.2846  data: 0.0004  max mem: 21847
Epoch: [282]  [1000/1251]  eta: 0:01:09  lr: 0.000038  min_lr: 0.000038  loss: 2.6172 (2.5948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1282 (1.2464)  time: 0.2782  data: 0.0004  max mem: 21847
Epoch: [282]  [1200/1251]  eta: 0:00:14  lr: 0.000037  min_lr: 0.000037  loss: 2.6394 (2.5917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9978 (1.2354)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.6620 (2.5916)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (1.2369)  time: 0.2282  data: 0.0005  max mem: 21847
Epoch: [282] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.6620 (2.6038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (1.2369)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.5921 (0.5921)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 4.4561  data: 4.2801  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7768 (0.7444)  acc1: 86.4000 (85.9636)  acc5: 97.6000 (97.6000)  time: 0.6910  data: 0.5535  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9066 (0.8716)  acc1: 80.8000 (82.7048)  acc5: 96.4000 (96.2095)  time: 0.2504  data: 0.1198  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9398 (0.8794)  acc1: 80.8000 (82.4640)  acc5: 95.6000 (96.1600)  time: 0.2160  data: 0.0877  max mem: 21847
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 82.836 Acc@5 96.280 loss 0.877
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.91%
Epoch: [283]  [   0/1251]  eta: 1:10:20  lr: 0.000037  min_lr: 0.000037  loss: 3.0394 (3.0394)  weight_decay: 0.0500 (0.0500)  time: 3.3737  data: 2.8220  max mem: 21847
Epoch: [283]  [ 200/1251]  eta: 0:05:06  lr: 0.000037  min_lr: 0.000037  loss: 2.5828 (2.5877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1465 (1.1958)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [283]  [ 400/1251]  eta: 0:04:00  lr: 0.000036  min_lr: 0.000036  loss: 2.7866 (2.6003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1939 (1.2325)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [283]  [ 600/1251]  eta: 0:03:02  lr: 0.000035  min_lr: 0.000035  loss: 2.2244 (2.5964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1146 (1.2429)  time: 0.2703  data: 0.0004  max mem: 21847
Epoch: [283]  [ 800/1251]  eta: 0:02:05  lr: 0.000035  min_lr: 0.000035  loss: 2.4196 (2.6006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1193 (1.2425)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [283]  [1000/1251]  eta: 0:01:09  lr: 0.000034  min_lr: 0.000034  loss: 2.4801 (2.5889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0643 (1.2247)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [283]  [1200/1251]  eta: 0:00:14  lr: 0.000033  min_lr: 0.000033  loss: 2.7262 (2.5841)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0317 (1.2217)  time: 0.2715  data: 0.0005  max mem: 21847
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.8908 (2.5899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2285 (1.2256)  time: 0.2281  data: 0.0007  max mem: 21847
Epoch: [283] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.8908 (2.6020)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2285 (1.2256)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6012 (0.6012)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.8045  data: 5.6354  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7860 (0.7566)  acc1: 86.0000 (85.9636)  acc5: 97.6000 (97.5636)  time: 0.7086  data: 0.5736  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9183 (0.8853)  acc1: 80.8000 (82.8571)  acc5: 96.0000 (96.2286)  time: 0.1799  data: 0.0504  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9597 (0.8938)  acc1: 80.4000 (82.5280)  acc5: 95.6000 (96.1920)  time: 0.1833  data: 0.0544  max mem: 21847
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 82.856 Acc@5 96.274 loss 0.893
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [284]  [   0/1251]  eta: 1:03:21  lr: 0.000033  min_lr: 0.000033  loss: 3.3626 (3.3626)  weight_decay: 0.0500 (0.0500)  time: 3.0390  data: 1.6073  max mem: 21847
Epoch: [284]  [ 200/1251]  eta: 0:05:06  lr: 0.000032  min_lr: 0.000032  loss: 2.4767 (2.5690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1122 (1.1567)  time: 0.2845  data: 0.0004  max mem: 21847
Epoch: [284]  [ 400/1251]  eta: 0:04:00  lr: 0.000032  min_lr: 0.000032  loss: 2.9551 (2.6229)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2661 (1.2728)  time: 0.2719  data: 0.0004  max mem: 21847
Epoch: [284]  [ 600/1251]  eta: 0:03:02  lr: 0.000031  min_lr: 0.000031  loss: 2.7720 (2.6139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1085 (1.2450)  time: 0.2702  data: 0.0004  max mem: 21847
Epoch: [284]  [ 800/1251]  eta: 0:02:05  lr: 0.000031  min_lr: 0.000031  loss: 2.4873 (2.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1941 (1.2495)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [284]  [1000/1251]  eta: 0:01:09  lr: 0.000030  min_lr: 0.000030  loss: 2.9059 (2.6163)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2782 (1.2511)  time: 0.2741  data: 0.0008  max mem: 21847
Epoch: [284]  [1200/1251]  eta: 0:00:14  lr: 0.000029  min_lr: 0.000029  loss: 2.9663 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1451 (1.2365)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.3764 (2.6152)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0564 (1.2304)  time: 0.2284  data: 0.0006  max mem: 21847
Epoch: [284] Total time: 0:05:46 (0.2770 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.3764 (2.6068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0564 (1.2304)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5530 (0.5530)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.3581  data: 5.1899  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7445 (0.7065)  acc1: 86.0000 (85.6364)  acc5: 97.6000 (97.5636)  time: 0.7616  data: 0.6246  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8648 (0.8302)  acc1: 80.8000 (82.7238)  acc5: 96.4000 (96.2095)  time: 0.2284  data: 0.0972  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.8954 (0.8381)  acc1: 80.4000 (82.4480)  acc5: 95.6000 (96.1600)  time: 0.2261  data: 0.0971  max mem: 21847
Test: Total time: 0:00:10 (0.4215 s / it)
* Acc@1 82.874 Acc@5 96.288 loss 0.836
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [285]  [   0/1251]  eta: 1:09:32  lr: 0.000029  min_lr: 0.000029  loss: 2.3462 (2.3462)  weight_decay: 0.0500 (0.0500)  time: 3.3353  data: 2.4907  max mem: 21847
Epoch: [285]  [ 200/1251]  eta: 0:05:05  lr: 0.000029  min_lr: 0.000029  loss: 2.4491 (2.5643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2337 (1.2437)  time: 0.2727  data: 0.0004  max mem: 21847
Epoch: [285]  [ 400/1251]  eta: 0:04:00  lr: 0.000028  min_lr: 0.000028  loss: 2.5608 (2.5797)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1738 (nan)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [285]  [ 600/1251]  eta: 0:03:02  lr: 0.000027  min_lr: 0.000027  loss: 2.6018 (2.5630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0633 (nan)  time: 0.2723  data: 0.0005  max mem: 21847
Epoch: [285]  [ 800/1251]  eta: 0:02:05  lr: 0.000027  min_lr: 0.000027  loss: 2.1953 (2.5706)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1626 (nan)  time: 0.2750  data: 0.0004  max mem: 21847
Epoch: [285]  [1000/1251]  eta: 0:01:09  lr: 0.000026  min_lr: 0.000026  loss: 2.4447 (2.5729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (nan)  time: 0.2720  data: 0.0005  max mem: 21847
Epoch: [285]  [1200/1251]  eta: 0:00:14  lr: 0.000026  min_lr: 0.000026  loss: 2.5842 (2.5902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1817 (nan)  time: 0.2706  data: 0.0004  max mem: 21847
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.6102 (2.5898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0870 (nan)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [285] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.6102 (2.5981)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0870 (nan)
Test:  [ 0/25]  eta: 0:01:39  loss: 0.5592 (0.5592)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 3.9632  data: 3.7810  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7450 (0.7144)  acc1: 86.4000 (85.8545)  acc5: 97.6000 (97.6364)  time: 0.6220  data: 0.4859  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8763 (0.8387)  acc1: 80.8000 (82.9714)  acc5: 96.4000 (96.3429)  time: 0.2451  data: 0.1156  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9028 (0.8471)  acc1: 80.4000 (82.6080)  acc5: 95.6000 (96.3040)  time: 0.2024  data: 0.0730  max mem: 21847
Test: Total time: 0:00:09 (0.3938 s / it)
* Acc@1 82.864 Acc@5 96.298 loss 0.847
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [286]  [   0/1251]  eta: 1:05:47  lr: 0.000026  min_lr: 0.000026  loss: 2.9051 (2.9051)  weight_decay: 0.0500 (0.0500)  time: 3.1558  data: 2.3105  max mem: 21847
Epoch: [286]  [ 200/1251]  eta: 0:05:05  lr: 0.000025  min_lr: 0.000025  loss: 2.7349 (2.6000)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1664 (1.2647)  time: 0.2815  data: 0.0004  max mem: 21847
Epoch: [286]  [ 400/1251]  eta: 0:04:00  lr: 0.000025  min_lr: 0.000025  loss: 2.5550 (2.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0806 (1.2332)  time: 0.2909  data: 0.0004  max mem: 21847
Epoch: [286]  [ 600/1251]  eta: 0:03:02  lr: 0.000024  min_lr: 0.000024  loss: 2.4909 (2.5883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0842 (1.2207)  time: 0.2725  data: 0.0004  max mem: 21847
Epoch: [286]  [ 800/1251]  eta: 0:02:05  lr: 0.000023  min_lr: 0.000023  loss: 2.6207 (2.6038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0262 (1.2062)  time: 0.2734  data: 0.0004  max mem: 21847
Epoch: [286]  [1000/1251]  eta: 0:01:09  lr: 0.000023  min_lr: 0.000023  loss: 2.7239 (2.5961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1183 (1.1960)  time: 0.2804  data: 0.0004  max mem: 21847
Epoch: [286]  [1200/1251]  eta: 0:00:14  lr: 0.000022  min_lr: 0.000022  loss: 2.1332 (2.5939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2019 (1.2010)  time: 0.2717  data: 0.0005  max mem: 21847
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.7953 (2.5943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1638 (1.1969)  time: 0.2285  data: 0.0007  max mem: 21847
Epoch: [286] Total time: 0:05:47 (0.2774 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.7953 (2.5979)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1638 (1.1969)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5892 (0.5892)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.5901  data: 5.4201  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7685 (0.7429)  acc1: 86.0000 (85.8546)  acc5: 97.6000 (97.4546)  time: 0.7449  data: 0.6104  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9123 (0.8684)  acc1: 81.2000 (82.7238)  acc5: 96.0000 (96.2286)  time: 0.2001  data: 0.0705  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9337 (0.8771)  acc1: 80.0000 (82.4000)  acc5: 95.6000 (96.2080)  time: 0.2043  data: 0.0755  max mem: 21847
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 82.858 Acc@5 96.278 loss 0.877
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [287]  [   0/1251]  eta: 1:03:37  lr: 0.000022  min_lr: 0.000022  loss: 2.9388 (2.9388)  weight_decay: 0.0500 (0.0500)  time: 3.0513  data: 2.5922  max mem: 21847
Epoch: [287]  [ 200/1251]  eta: 0:05:03  lr: 0.000022  min_lr: 0.000022  loss: 2.6222 (2.5966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1865 (1.1542)  time: 0.2764  data: 0.0004  max mem: 21847
Epoch: [287]  [ 400/1251]  eta: 0:04:00  lr: 0.000021  min_lr: 0.000021  loss: 2.8195 (2.6148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1537 (1.2122)  time: 0.2761  data: 0.0005  max mem: 21847
Epoch: [287]  [ 600/1251]  eta: 0:03:02  lr: 0.000021  min_lr: 0.000021  loss: 2.5485 (2.6042)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2004 (1.2236)  time: 0.2734  data: 0.0005  max mem: 21847
Epoch: [287]  [ 800/1251]  eta: 0:02:05  lr: 0.000020  min_lr: 0.000020  loss: 2.8161 (2.5966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1464 (1.2175)  time: 0.2733  data: 0.0005  max mem: 21847
Epoch: [287]  [1000/1251]  eta: 0:01:09  lr: 0.000020  min_lr: 0.000020  loss: 2.3047 (2.5960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1536 (1.2199)  time: 0.2727  data: 0.0005  max mem: 21847
Epoch: [287]  [1200/1251]  eta: 0:00:14  lr: 0.000019  min_lr: 0.000019  loss: 2.9614 (2.6019)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0919 (1.2114)  time: 0.2802  data: 0.0005  max mem: 21847
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.4704 (2.5986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0419 (1.2112)  time: 0.2279  data: 0.0006  max mem: 21847
Epoch: [287] Total time: 0:05:47 (0.2775 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.4704 (2.5938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0419 (1.2112)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5484 (0.5484)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.4438  data: 5.2889  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7438 (0.7068)  acc1: 86.4000 (85.7455)  acc5: 97.6000 (97.5636)  time: 0.7450  data: 0.6106  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8725 (0.8296)  acc1: 81.2000 (82.8381)  acc5: 96.4000 (96.2667)  time: 0.2083  data: 0.0784  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9012 (0.8382)  acc1: 80.8000 (82.5120)  acc5: 96.0000 (96.2560)  time: 0.2074  data: 0.0783  max mem: 21847
Test: Total time: 0:00:10 (0.4082 s / it)
* Acc@1 82.860 Acc@5 96.302 loss 0.837
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [288]  [   0/1251]  eta: 1:07:23  lr: 0.000019  min_lr: 0.000019  loss: 1.6638 (1.6638)  weight_decay: 0.0500 (0.0500)  time: 3.2319  data: 1.7600  max mem: 21847
Epoch: [288]  [ 200/1251]  eta: 0:05:05  lr: 0.000019  min_lr: 0.000019  loss: 2.7751 (2.5703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1463 (1.1722)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [288]  [ 400/1251]  eta: 0:04:01  lr: 0.000018  min_lr: 0.000018  loss: 2.4626 (2.5497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0446 (1.1841)  time: 0.2726  data: 0.0005  max mem: 21847
Epoch: [288]  [ 600/1251]  eta: 0:03:02  lr: 0.000018  min_lr: 0.000018  loss: 2.9042 (2.5618)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1154 (1.1705)  time: 0.2730  data: 0.0004  max mem: 21847
Epoch: [288]  [ 800/1251]  eta: 0:02:05  lr: 0.000017  min_lr: 0.000017  loss: 2.6841 (2.5629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1909 (1.2001)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [288]  [1000/1251]  eta: 0:01:09  lr: 0.000017  min_lr: 0.000017  loss: 2.9130 (2.5717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0945 (1.2124)  time: 0.2798  data: 0.0004  max mem: 21847
Epoch: [288]  [1200/1251]  eta: 0:00:14  lr: 0.000016  min_lr: 0.000016  loss: 2.4079 (2.5745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1487 (1.2124)  time: 0.2716  data: 0.0004  max mem: 21847
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.8661 (2.5813)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2112 (1.2121)  time: 0.2332  data: 0.0006  max mem: 21847
Epoch: [288] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.8661 (2.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2112 (1.2121)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5674 (0.5674)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6506  data: 5.4883  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7594 (0.7291)  acc1: 86.8000 (85.8545)  acc5: 97.6000 (97.5636)  time: 0.7476  data: 0.6126  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8907 (0.8540)  acc1: 81.2000 (82.9143)  acc5: 96.4000 (96.3238)  time: 0.2165  data: 0.0867  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9301 (0.8626)  acc1: 80.8000 (82.6720)  acc5: 95.6000 (96.2720)  time: 0.2157  data: 0.0866  max mem: 21847
Test: Total time: 0:00:10 (0.4231 s / it)
* Acc@1 82.866 Acc@5 96.298 loss 0.862
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [289]  [   0/1251]  eta: 1:07:00  lr: 0.000016  min_lr: 0.000016  loss: 2.9988 (2.9988)  weight_decay: 0.0500 (0.0500)  time: 3.2141  data: 2.1322  max mem: 21847
Epoch: [289]  [ 200/1251]  eta: 0:05:03  lr: 0.000016  min_lr: 0.000016  loss: 2.5721 (2.6045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1643 (1.2096)  time: 0.2821  data: 0.0005  max mem: 21847
Epoch: [289]  [ 400/1251]  eta: 0:03:59  lr: 0.000015  min_lr: 0.000015  loss: 2.9865 (2.6010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0459 (1.1996)  time: 0.2712  data: 0.0004  max mem: 21847
Epoch: [289]  [ 600/1251]  eta: 0:03:02  lr: 0.000015  min_lr: 0.000015  loss: 2.9140 (2.6178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0813 (1.1944)  time: 0.2737  data: 0.0005  max mem: 21847
Epoch: [289]  [ 800/1251]  eta: 0:02:05  lr: 0.000014  min_lr: 0.000014  loss: 2.7854 (2.6252)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1400 (1.1922)  time: 0.2739  data: 0.0005  max mem: 21847
Epoch: [289]  [1000/1251]  eta: 0:01:09  lr: 0.000014  min_lr: 0.000014  loss: 2.4465 (2.6209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1367 (1.1887)  time: 0.2718  data: 0.0005  max mem: 21847
Epoch: [289]  [1200/1251]  eta: 0:00:14  lr: 0.000014  min_lr: 0.000014  loss: 2.5067 (2.6169)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0912 (1.1772)  time: 0.2817  data: 0.0004  max mem: 21847
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 3.0840 (2.6170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (1.1761)  time: 0.2278  data: 0.0007  max mem: 21847
Epoch: [289] Total time: 0:05:46 (0.2769 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 3.0840 (2.5854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (1.1761)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5680 (0.5680)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.5734  data: 5.3942  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7600 (0.7260)  acc1: 86.4000 (85.9273)  acc5: 97.6000 (97.6364)  time: 0.7317  data: 0.5945  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8900 (0.8521)  acc1: 81.6000 (82.9143)  acc5: 96.0000 (96.3619)  time: 0.1950  data: 0.0649  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9224 (0.8611)  acc1: 80.4000 (82.6080)  acc5: 95.6000 (96.2880)  time: 0.1931  data: 0.0647  max mem: 21847
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 82.892 Acc@5 96.300 loss 0.860
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [290]  [   0/1251]  eta: 1:09:17  lr: 0.000014  min_lr: 0.000014  loss: 1.8880 (1.8880)  weight_decay: 0.0500 (0.0500)  time: 3.3234  data: 2.6143  max mem: 21847
Epoch: [290]  [ 200/1251]  eta: 0:05:05  lr: 0.000013  min_lr: 0.000013  loss: 2.8365 (2.6159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1673 (1.1394)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [290]  [ 400/1251]  eta: 0:04:00  lr: 0.000013  min_lr: 0.000013  loss: 2.4771 (2.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1683 (1.1659)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [290]  [ 600/1251]  eta: 0:03:02  lr: 0.000012  min_lr: 0.000012  loss: 2.4988 (2.6140)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0598 (1.1894)  time: 0.2735  data: 0.0005  max mem: 21847
Epoch: [290]  [ 800/1251]  eta: 0:02:05  lr: 0.000012  min_lr: 0.000012  loss: 2.7199 (2.6011)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1164 (1.1833)  time: 0.2721  data: 0.0005  max mem: 21847
Epoch: [290]  [1000/1251]  eta: 0:01:09  lr: 0.000012  min_lr: 0.000012  loss: 2.8213 (2.5918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1542 (1.1838)  time: 0.2717  data: 0.0004  max mem: 21847
Epoch: [290]  [1200/1251]  eta: 0:00:14  lr: 0.000011  min_lr: 0.000011  loss: 2.5779 (2.5969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1146 (1.1859)  time: 0.2713  data: 0.0003  max mem: 21847
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.4111 (2.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1542 (1.1938)  time: 0.2277  data: 0.0006  max mem: 21847
Epoch: [290] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.4111 (2.6016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1542 (1.1938)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5439 (0.5439)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6041  data: 5.4519  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7383 (0.7079)  acc1: 86.4000 (85.8909)  acc5: 97.6000 (97.6364)  time: 0.6999  data: 0.5663  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8695 (0.8340)  acc1: 81.2000 (82.9333)  acc5: 96.4000 (96.2857)  time: 0.1890  data: 0.0594  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.8989 (0.8429)  acc1: 80.0000 (82.6400)  acc5: 95.6000 (96.2080)  time: 0.2127  data: 0.0843  max mem: 21847
Test: Total time: 0:00:10 (0.4190 s / it)
* Acc@1 82.912 Acc@5 96.286 loss 0.843
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [291]  [   0/1251]  eta: 1:03:52  lr: 0.000011  min_lr: 0.000011  loss: 2.4599 (2.4599)  weight_decay: 0.0500 (0.0500)  time: 3.0636  data: 2.2705  max mem: 21847
Epoch: [291]  [ 200/1251]  eta: 0:05:05  lr: 0.000011  min_lr: 0.000011  loss: 2.6743 (2.5383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0983 (1.1336)  time: 0.2772  data: 0.0005  max mem: 21847
Epoch: [291]  [ 400/1251]  eta: 0:04:00  lr: 0.000010  min_lr: 0.000010  loss: 2.7326 (2.5731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1251 (1.1854)  time: 0.2822  data: 0.0004  max mem: 21847
Epoch: [291]  [ 600/1251]  eta: 0:03:02  lr: 0.000010  min_lr: 0.000010  loss: 2.8776 (2.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0921 (1.1927)  time: 0.2714  data: 0.0004  max mem: 21847
Epoch: [291]  [ 800/1251]  eta: 0:02:05  lr: 0.000010  min_lr: 0.000010  loss: 2.4016 (2.5765)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1210 (1.2012)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [291]  [1000/1251]  eta: 0:01:09  lr: 0.000009  min_lr: 0.000009  loss: 2.6840 (2.5663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0394 (1.1966)  time: 0.2749  data: 0.0004  max mem: 21847
Epoch: [291]  [1200/1251]  eta: 0:00:14  lr: 0.000009  min_lr: 0.000009  loss: 2.8090 (2.5776)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2064 (1.2027)  time: 0.2707  data: 0.0004  max mem: 21847
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.6375 (2.5766)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1363 (1.1980)  time: 0.2282  data: 0.0007  max mem: 21847
Epoch: [291] Total time: 0:05:46 (0.2774 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.6375 (2.5879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1363 (1.1980)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5755 (0.5755)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6717  data: 5.5243  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7694 (0.7342)  acc1: 86.4000 (85.7818)  acc5: 97.6000 (97.4182)  time: 0.7272  data: 0.5926  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.8906 (0.8588)  acc1: 81.2000 (82.7619)  acc5: 96.0000 (96.2286)  time: 0.1963  data: 0.0659  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9303 (0.8677)  acc1: 80.4000 (82.4640)  acc5: 95.6000 (96.1920)  time: 0.2031  data: 0.0749  max mem: 21847
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 82.836 Acc@5 96.294 loss 0.867
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.91%
Epoch: [292]  [   0/1251]  eta: 1:08:02  lr: 0.000009  min_lr: 0.000009  loss: 3.4025 (3.4025)  weight_decay: 0.0500 (0.0500)  time: 3.2632  data: 1.7532  max mem: 21847
Epoch: [292]  [ 200/1251]  eta: 0:05:08  lr: 0.000009  min_lr: 0.000009  loss: 2.7735 (2.6216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1182 (1.1621)  time: 0.2745  data: 0.0004  max mem: 21847
Epoch: [292]  [ 400/1251]  eta: 0:04:01  lr: 0.000008  min_lr: 0.000008  loss: 2.6099 (2.6148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1970 (1.2036)  time: 0.2841  data: 0.0004  max mem: 21847
Epoch: [292]  [ 600/1251]  eta: 0:03:02  lr: 0.000008  min_lr: 0.000008  loss: 2.8653 (2.5962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0576 (1.1980)  time: 0.2720  data: 0.0003  max mem: 21847
Epoch: [292]  [ 800/1251]  eta: 0:02:05  lr: 0.000008  min_lr: 0.000008  loss: 2.8158 (2.6050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0745 (1.1908)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [292]  [1000/1251]  eta: 0:01:09  lr: 0.000008  min_lr: 0.000008  loss: 2.2932 (2.6105)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1284 (1.1807)  time: 0.2836  data: 0.0004  max mem: 21847
Epoch: [292]  [1200/1251]  eta: 0:00:14  lr: 0.000007  min_lr: 0.000007  loss: 2.8629 (2.6107)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1367 (1.1849)  time: 0.2725  data: 0.0005  max mem: 21847
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.9544 (2.6105)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1538 (1.1863)  time: 0.2279  data: 0.0007  max mem: 21847
Epoch: [292] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.9544 (2.5975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1538 (1.1863)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5924 (0.5924)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.8176  data: 5.6570  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7840 (0.7529)  acc1: 86.4000 (86.0364)  acc5: 97.2000 (97.5273)  time: 0.7279  data: 0.5952  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9135 (0.8794)  acc1: 81.6000 (82.9333)  acc5: 96.0000 (96.2095)  time: 0.1890  data: 0.0603  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9476 (0.8876)  acc1: 80.4000 (82.6400)  acc5: 95.6000 (96.1760)  time: 0.1957  data: 0.0677  max mem: 21847
Test: Total time: 0:00:10 (0.4136 s / it)
* Acc@1 82.916 Acc@5 96.318 loss 0.887
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.92%
Epoch: [293]  [   0/1251]  eta: 1:06:56  lr: 0.000007  min_lr: 0.000007  loss: 2.2028 (2.2028)  weight_decay: 0.0500 (0.0500)  time: 3.2108  data: 2.8631  max mem: 21847
Epoch: [293]  [ 200/1251]  eta: 0:05:03  lr: 0.000007  min_lr: 0.000007  loss: 2.6485 (2.6355)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0236 (1.1272)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [293]  [ 400/1251]  eta: 0:03:59  lr: 0.000007  min_lr: 0.000007  loss: 2.9338 (2.6053)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1577 (1.1630)  time: 0.2749  data: 0.0004  max mem: 21847
Epoch: [293]  [ 600/1251]  eta: 0:03:01  lr: 0.000006  min_lr: 0.000006  loss: 2.8231 (2.5843)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0261 (1.1766)  time: 0.2716  data: 0.0003  max mem: 21847
Epoch: [293]  [ 800/1251]  eta: 0:02:05  lr: 0.000006  min_lr: 0.000006  loss: 2.8576 (2.5855)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0455 (1.1784)  time: 0.2797  data: 0.0005  max mem: 21847
Epoch: [293]  [1000/1251]  eta: 0:01:09  lr: 0.000006  min_lr: 0.000006  loss: 2.4217 (2.5854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0828 (1.1698)  time: 0.2804  data: 0.0006  max mem: 21847
Epoch: [293]  [1200/1251]  eta: 0:00:14  lr: 0.000006  min_lr: 0.000006  loss: 2.8449 (2.5881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0762 (1.1758)  time: 0.2731  data: 0.0005  max mem: 21847
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.8779 (2.5924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2055 (1.1770)  time: 0.2279  data: 0.0009  max mem: 21847
Epoch: [293] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.8779 (2.5875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2055 (1.1770)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5979 (0.5979)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.4989  data: 5.3527  max mem: 21847
Test:  [10/25]  eta: 0:00:10  loss: 0.7870 (0.7570)  acc1: 86.0000 (85.9273)  acc5: 97.6000 (97.6000)  time: 0.7242  data: 0.5917  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9089 (0.8837)  acc1: 82.0000 (82.8762)  acc5: 96.0000 (96.2857)  time: 0.1966  data: 0.0669  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9585 (0.8931)  acc1: 80.4000 (82.5600)  acc5: 95.6000 (96.2240)  time: 0.1946  data: 0.0658  max mem: 21847
Test: Total time: 0:00:10 (0.4013 s / it)
* Acc@1 82.884 Acc@5 96.312 loss 0.893
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.92%
Epoch: [294]  [   0/1251]  eta: 1:06:28  lr: 0.000006  min_lr: 0.000006  loss: 2.6339 (2.6339)  weight_decay: 0.0500 (0.0500)  time: 3.1885  data: 2.7365  max mem: 21847
Epoch: [294]  [ 200/1251]  eta: 0:05:06  lr: 0.000005  min_lr: 0.000005  loss: 2.8916 (2.5919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1915 (1.1960)  time: 0.2754  data: 0.0004  max mem: 21847
Epoch: [294]  [ 400/1251]  eta: 0:04:00  lr: 0.000005  min_lr: 0.000005  loss: 2.6039 (2.5680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0892 (1.1663)  time: 0.2701  data: 0.0004  max mem: 21847
Epoch: [294]  [ 600/1251]  eta: 0:03:02  lr: 0.000005  min_lr: 0.000005  loss: 2.0961 (2.5550)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1459 (1.1740)  time: 0.2741  data: 0.0004  max mem: 21847
Epoch: [294]  [ 800/1251]  eta: 0:02:05  lr: 0.000005  min_lr: 0.000005  loss: 2.5458 (2.5574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0480 (1.1729)  time: 0.2730  data: 0.0005  max mem: 21847
Epoch: [294]  [1000/1251]  eta: 0:01:09  lr: 0.000004  min_lr: 0.000004  loss: 2.3769 (2.5638)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1993 (1.1710)  time: 0.2710  data: 0.0004  max mem: 21847
Epoch: [294]  [1200/1251]  eta: 0:00:14  lr: 0.000004  min_lr: 0.000004  loss: 2.8596 (2.5730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0519 (1.1691)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.6994 (2.5715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0790 (1.1665)  time: 0.2277  data: 0.0007  max mem: 21847
Epoch: [294] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.6994 (2.5824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0790 (1.1665)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.5892 (0.5892)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 4.4579  data: 4.2933  max mem: 21847
Test:  [10/25]  eta: 0:00:09  loss: 0.7861 (0.7496)  acc1: 86.8000 (85.9273)  acc5: 97.6000 (97.6000)  time: 0.6117  data: 0.4773  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9084 (0.8741)  acc1: 81.6000 (83.1048)  acc5: 96.4000 (96.3238)  time: 0.2163  data: 0.0868  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9483 (0.8830)  acc1: 80.4000 (82.7520)  acc5: 95.6000 (96.2880)  time: 0.1900  data: 0.0599  max mem: 21847
Test: Total time: 0:00:09 (0.3899 s / it)
* Acc@1 82.932 Acc@5 96.302 loss 0.883
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.93%
Epoch: [295]  [   0/1251]  eta: 1:13:14  lr: 0.000004  min_lr: 0.000004  loss: 1.7067 (1.7067)  weight_decay: 0.0500 (0.0500)  time: 3.5132  data: 3.2281  max mem: 21847
Epoch: [295]  [ 200/1251]  eta: 0:05:07  lr: 0.000004  min_lr: 0.000004  loss: 2.2193 (2.5877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1391 (1.1485)  time: 0.2864  data: 0.0005  max mem: 21847
Epoch: [295]  [ 400/1251]  eta: 0:04:01  lr: 0.000004  min_lr: 0.000004  loss: 2.8801 (2.5998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0759 (1.1576)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [295]  [ 600/1251]  eta: 0:03:02  lr: 0.000004  min_lr: 0.000004  loss: 2.1615 (2.6064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1827 (1.1752)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [295]  [ 800/1251]  eta: 0:02:05  lr: 0.000003  min_lr: 0.000003  loss: 1.9010 (2.5982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1974 (1.1785)  time: 0.2787  data: 0.0005  max mem: 21847
Epoch: [295]  [1000/1251]  eta: 0:01:09  lr: 0.000003  min_lr: 0.000003  loss: 2.1049 (2.5919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1317 (1.1750)  time: 0.2778  data: 0.0005  max mem: 21847
Epoch: [295]  [1200/1251]  eta: 0:00:14  lr: 0.000003  min_lr: 0.000003  loss: 2.4774 (2.5944)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0554 (1.1783)  time: 0.2739  data: 0.0004  max mem: 21847
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.8361 (2.5924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0532 (1.1764)  time: 0.2276  data: 0.0005  max mem: 21847
Epoch: [295] Total time: 0:05:46 (0.2772 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.8361 (2.5956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0532 (1.1764)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6284 (0.6284)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.3919  data: 5.2444  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8232 (0.7888)  acc1: 86.8000 (86.0364)  acc5: 97.6000 (97.5273)  time: 0.7398  data: 0.6068  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9387 (0.9166)  acc1: 81.6000 (82.9524)  acc5: 96.0000 (96.2095)  time: 0.2343  data: 0.1048  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9904 (0.9252)  acc1: 80.4000 (82.6560)  acc5: 95.2000 (96.1600)  time: 0.2338  data: 0.1048  max mem: 21847
Test: Total time: 0:00:10 (0.4269 s / it)
* Acc@1 82.856 Acc@5 96.334 loss 0.925
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.93%
Epoch: [296]  [   0/1251]  eta: 1:07:03  lr: 0.000003  min_lr: 0.000003  loss: 2.7315 (2.7315)  weight_decay: 0.0500 (0.0500)  time: 3.2161  data: 2.7988  max mem: 21847
Epoch: [296]  [ 200/1251]  eta: 0:05:04  lr: 0.000003  min_lr: 0.000003  loss: 2.8395 (2.5974)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2724  data: 0.0005  max mem: 21847
Epoch: [296]  [ 400/1251]  eta: 0:04:00  lr: 0.000003  min_lr: 0.000003  loss: 2.8101 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1000 (nan)  time: 0.2733  data: 0.0005  max mem: 21847
Epoch: [296]  [ 600/1251]  eta: 0:03:02  lr: 0.000003  min_lr: 0.000003  loss: 2.7732 (2.6074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1419 (nan)  time: 0.2792  data: 0.0005  max mem: 21847
Epoch: [296]  [ 800/1251]  eta: 0:02:05  lr: 0.000002  min_lr: 0.000002  loss: 2.7083 (2.6052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1417 (nan)  time: 0.2815  data: 0.0005  max mem: 21847
Epoch: [296]  [1000/1251]  eta: 0:01:09  lr: 0.000002  min_lr: 0.000002  loss: 2.9116 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2018 (nan)  time: 0.2828  data: 0.0005  max mem: 21847
Epoch: [296]  [1200/1251]  eta: 0:00:14  lr: 0.000002  min_lr: 0.000002  loss: 2.7331 (2.6200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0164 (nan)  time: 0.2728  data: 0.0005  max mem: 21847
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.8755 (2.6172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1552 (nan)  time: 0.2284  data: 0.0007  max mem: 21847
Epoch: [296] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.8755 (2.5983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1552 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5804 (0.5804)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6519  data: 5.4997  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7723 (0.7408)  acc1: 86.4000 (86.0364)  acc5: 97.2000 (97.4546)  time: 0.7671  data: 0.6345  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9025 (0.8669)  acc1: 81.6000 (83.0476)  acc5: 96.0000 (96.2286)  time: 0.2266  data: 0.0975  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9405 (0.8758)  acc1: 80.4000 (82.7360)  acc5: 95.6000 (96.1920)  time: 0.2255  data: 0.0974  max mem: 21847
Test: Total time: 0:00:10 (0.4310 s / it)
* Acc@1 82.852 Acc@5 96.308 loss 0.876
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.93%
Epoch: [297]  [   0/1251]  eta: 1:07:54  lr: 0.000002  min_lr: 0.000002  loss: 3.3444 (3.3444)  weight_decay: 0.0500 (0.0500)  time: 3.2571  data: 2.4147  max mem: 21847
Epoch: [297]  [ 200/1251]  eta: 0:05:03  lr: 0.000002  min_lr: 0.000002  loss: 2.5580 (2.6162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0899 (1.1938)  time: 0.2701  data: 0.0004  max mem: 21847
Epoch: [297]  [ 400/1251]  eta: 0:03:58  lr: 0.000002  min_lr: 0.000002  loss: 2.5010 (2.5638)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1603 (1.1772)  time: 0.2713  data: 0.0004  max mem: 21847
Epoch: [297]  [ 600/1251]  eta: 0:03:01  lr: 0.000002  min_lr: 0.000002  loss: 2.8812 (2.5802)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1243 (1.1655)  time: 0.2709  data: 0.0004  max mem: 21847
Epoch: [297]  [ 800/1251]  eta: 0:02:05  lr: 0.000002  min_lr: 0.000002  loss: 2.7083 (2.5807)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.1610)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [297]  [1000/1251]  eta: 0:01:09  lr: 0.000002  min_lr: 0.000002  loss: 2.5734 (2.5850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1111 (1.1685)  time: 0.2731  data: 0.0004  max mem: 21847
Epoch: [297]  [1200/1251]  eta: 0:00:14  lr: 0.000002  min_lr: 0.000002  loss: 2.8770 (2.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0724 (1.1644)  time: 0.2721  data: 0.0004  max mem: 21847
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.8188 (2.5872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1015 (1.1650)  time: 0.2284  data: 0.0005  max mem: 21847
Epoch: [297] Total time: 0:05:46 (0.2767 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.8188 (2.5909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1015 (1.1650)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5980 (0.5980)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5145  data: 5.3574  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7876 (0.7560)  acc1: 86.4000 (85.8545)  acc5: 97.6000 (97.4909)  time: 0.7364  data: 0.6019  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9096 (0.8806)  acc1: 82.0000 (82.8952)  acc5: 96.0000 (96.2476)  time: 0.2023  data: 0.0721  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9494 (0.8889)  acc1: 80.4000 (82.5760)  acc5: 95.6000 (96.2080)  time: 0.2018  data: 0.0720  max mem: 21847
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 82.828 Acc@5 96.306 loss 0.889
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.93%
Epoch: [298]  [   0/1251]  eta: 1:09:29  lr: 0.000002  min_lr: 0.000002  loss: 3.1981 (3.1981)  weight_decay: 0.0500 (0.0500)  time: 3.3330  data: 2.5428  max mem: 21847
Epoch: [298]  [ 200/1251]  eta: 0:05:05  lr: 0.000001  min_lr: 0.000001  loss: 2.3204 (2.5471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2015 (1.2183)  time: 0.2746  data: 0.0004  max mem: 21847
Epoch: [298]  [ 400/1251]  eta: 0:04:00  lr: 0.000001  min_lr: 0.000001  loss: 2.7061 (2.5384)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (1.1795)  time: 0.2724  data: 0.0004  max mem: 21847
Epoch: [298]  [ 600/1251]  eta: 0:03:02  lr: 0.000001  min_lr: 0.000001  loss: 2.9576 (2.5823)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0947 (1.2817)  time: 0.2831  data: 0.0004  max mem: 21847
Epoch: [298]  [ 800/1251]  eta: 0:02:05  lr: 0.000001  min_lr: 0.000001  loss: 2.5236 (2.5986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1237 (1.2650)  time: 0.2733  data: 0.0004  max mem: 21847
Epoch: [298]  [1000/1251]  eta: 0:01:09  lr: 0.000001  min_lr: 0.000001  loss: 2.6371 (2.5962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.2407)  time: 0.2929  data: 0.0004  max mem: 21847
Epoch: [298]  [1200/1251]  eta: 0:00:14  lr: 0.000001  min_lr: 0.000001  loss: 2.5905 (2.5895)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0803 (1.2242)  time: 0.2718  data: 0.0004  max mem: 21847
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 3.0616 (2.5982)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2283  data: 0.0007  max mem: 21847
Epoch: [298] Total time: 0:05:46 (0.2771 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 3.0616 (2.5858)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6432 (0.6432)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.4583  data: 5.2942  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.8338 (0.8003)  acc1: 87.2000 (86.0000)  acc5: 97.6000 (97.6000)  time: 0.7551  data: 0.6203  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9468 (0.9324)  acc1: 81.2000 (82.9524)  acc5: 96.4000 (96.3238)  time: 0.2147  data: 0.0850  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 1.0190 (0.9417)  acc1: 80.4000 (82.6400)  acc5: 95.6000 (96.2720)  time: 0.2140  data: 0.0849  max mem: 21847
Test: Total time: 0:00:10 (0.4136 s / it)
* Acc@1 82.836 Acc@5 96.312 loss 0.942
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.93%
Epoch: [299]  [   0/1251]  eta: 1:09:25  lr: 0.000001  min_lr: 0.000001  loss: 2.8920 (2.8920)  weight_decay: 0.0500 (0.0500)  time: 3.3297  data: 2.9614  max mem: 21847
Epoch: [299]  [ 200/1251]  eta: 0:05:07  lr: 0.000001  min_lr: 0.000001  loss: 2.5763 (2.5671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1071 (1.1938)  time: 0.2839  data: 0.0004  max mem: 21847
Epoch: [299]  [ 400/1251]  eta: 0:04:00  lr: 0.000001  min_lr: 0.000001  loss: 2.5211 (2.5640)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0571 (1.1629)  time: 0.2746  data: 0.0005  max mem: 21847
Epoch: [299]  [ 600/1251]  eta: 0:03:02  lr: 0.000001  min_lr: 0.000001  loss: 2.4652 (2.5531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0894 (1.1658)  time: 0.2720  data: 0.0004  max mem: 21847
Epoch: [299]  [ 800/1251]  eta: 0:02:05  lr: 0.000001  min_lr: 0.000001  loss: 2.9468 (2.5743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1871 (1.1808)  time: 0.2766  data: 0.0003  max mem: 21847
Epoch: [299]  [1000/1251]  eta: 0:01:09  lr: 0.000001  min_lr: 0.000001  loss: 2.4242 (2.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1085 (1.1813)  time: 0.2778  data: 0.0004  max mem: 21847
Epoch: [299]  [1200/1251]  eta: 0:00:14  lr: 0.000001  min_lr: 0.000001  loss: 2.7992 (2.5915)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1152 (1.1726)  time: 0.2729  data: 0.0003  max mem: 21847
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.6795 (2.5920)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1229 (1.1727)  time: 0.2353  data: 0.0005  max mem: 21847
Epoch: [299] Total time: 0:05:47 (0.2776 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.6795 (2.5879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1229 (1.1727)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.6008 (0.6008)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.0208  data: 5.8475  max mem: 21847
Test:  [10/25]  eta: 0:00:11  loss: 0.7830 (0.7560)  acc1: 86.0000 (85.6727)  acc5: 97.6000 (97.4909)  time: 0.7423  data: 0.6103  max mem: 21847
Test:  [20/25]  eta: 0:00:02  loss: 0.9135 (0.8830)  acc1: 81.6000 (82.6857)  acc5: 96.0000 (96.1905)  time: 0.1862  data: 0.0582  max mem: 21847
Test:  [24/25]  eta: 0:00:00  loss: 0.9563 (0.8919)  acc1: 79.6000 (82.3520)  acc5: 95.6000 (96.1760)  time: 0.1860  data: 0.0582  max mem: 21847
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 82.756 Acc@5 96.292 loss 0.891
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.93%
Training time 1 day, 5:49:12
