| distributed init (rank 0): env://, gpu 0
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 5): env://, gpu 5
Namespace(batch_size=128, epochs=300, update_freq=4, model='base', drop_path=0, input_size=288, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.0035, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_base_288_14.5G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(288, 288), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=329, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(288, 288))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f81604e7350>
Mixup is activated!
Model = SFCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (layer1): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(40, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=40, bias=False)
          (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(40, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): Identity()
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.005)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.016)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.022)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.027)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.032)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.038)
    )
  )
  (layer2): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=80, bias=False)
          (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(80, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.043)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.048)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.054)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.059)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.065)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.070)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.075)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.081)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.086)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.092)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.097)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.102)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.108)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.113)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.118)
    )
  )
  (layer3): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=160, bias=False)
          (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(160, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.124)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.129)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.135)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.140)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.145)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.151)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.156)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.162)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.167)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.172)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.178)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.183)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.188)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.194)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.199)
    )
    (15): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.205)
    )
    (16): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.210)
    )
    (17): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.215)
    )
    (18): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.221)
    )
    (19): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.226)
    )
    (20): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.232)
    )
    (21): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.237)
    )
    (22): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.242)
    )
    (23): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.248)
    )
    (24): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.253)
    )
    (25): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.258)
    )
    (26): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.264)
    )
    (27): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.269)
    )
    (28): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.275)
    )
    (29): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.280)
    )
    (30): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.285)
    )
    (31): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.291)
    )
    (32): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.296)
    )
    (33): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.302)
    )
    (34): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.307)
    )
  )
  (layer4): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=320, bias=False)
          (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.312)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.318)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.323)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.328)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.334)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.339)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.345)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.350)
    )
  )
  (head): ConvX(
    (conv): Conv2d(640, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 49050136
LR = 0.00350000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.le.conv.weight",
      "layer1.0.conv_in.conv.weight",
      "layer1.0.conv.conv.weight",
      "layer1.0.conv_out.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.le.conv.weight",
      "layer1.1.conv_in.conv.weight",
      "layer1.1.conv.conv.weight",
      "layer1.1.conv_out.conv.weight",
      "layer1.2.le.conv.weight",
      "layer1.2.conv_in.conv.weight",
      "layer1.2.conv.conv.weight",
      "layer1.2.conv_out.conv.weight",
      "layer1.3.le.conv.weight",
      "layer1.3.conv_in.conv.weight",
      "layer1.3.conv.conv.weight",
      "layer1.3.conv_out.conv.weight",
      "layer1.4.le.conv.weight",
      "layer1.4.conv_in.conv.weight",
      "layer1.4.conv.conv.weight",
      "layer1.4.conv_out.conv.weight",
      "layer1.5.le.conv.weight",
      "layer1.5.conv_in.conv.weight",
      "layer1.5.conv.conv.weight",
      "layer1.5.conv_out.conv.weight",
      "layer1.6.le.conv.weight",
      "layer1.6.conv_in.conv.weight",
      "layer1.6.conv.conv.weight",
      "layer1.6.conv_out.conv.weight",
      "layer1.7.le.conv.weight",
      "layer1.7.conv_in.conv.weight",
      "layer1.7.conv.conv.weight",
      "layer1.7.conv_out.conv.weight",
      "layer2.0.le.conv.weight",
      "layer2.0.conv_in.conv.weight",
      "layer2.0.conv.conv.weight",
      "layer2.0.conv_out.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.le.conv.weight",
      "layer2.1.conv_in.conv.weight",
      "layer2.1.conv.conv.weight",
      "layer2.1.conv_out.conv.weight",
      "layer2.2.le.conv.weight",
      "layer2.2.conv_in.conv.weight",
      "layer2.2.conv.conv.weight",
      "layer2.2.conv_out.conv.weight",
      "layer2.3.le.conv.weight",
      "layer2.3.conv_in.conv.weight",
      "layer2.3.conv.conv.weight",
      "layer2.3.conv_out.conv.weight",
      "layer2.4.le.conv.weight",
      "layer2.4.conv_in.conv.weight",
      "layer2.4.conv.conv.weight",
      "layer2.4.conv_out.conv.weight",
      "layer2.5.le.conv.weight",
      "layer2.5.conv_in.conv.weight",
      "layer2.5.conv.conv.weight",
      "layer2.5.conv_out.conv.weight",
      "layer2.6.le.conv.weight",
      "layer2.6.conv_in.conv.weight",
      "layer2.6.conv.conv.weight",
      "layer2.6.conv_out.conv.weight",
      "layer2.7.le.conv.weight",
      "layer2.7.conv_in.conv.weight",
      "layer2.7.conv.conv.weight",
      "layer2.7.conv_out.conv.weight",
      "layer2.8.le.conv.weight",
      "layer2.8.conv_in.conv.weight",
      "layer2.8.conv.conv.weight",
      "layer2.8.conv_out.conv.weight",
      "layer2.9.le.conv.weight",
      "layer2.9.conv_in.conv.weight",
      "layer2.9.conv.conv.weight",
      "layer2.9.conv_out.conv.weight",
      "layer2.10.le.conv.weight",
      "layer2.10.conv_in.conv.weight",
      "layer2.10.conv.conv.weight",
      "layer2.10.conv_out.conv.weight",
      "layer2.11.le.conv.weight",
      "layer2.11.conv_in.conv.weight",
      "layer2.11.conv.conv.weight",
      "layer2.11.conv_out.conv.weight",
      "layer2.12.le.conv.weight",
      "layer2.12.conv_in.conv.weight",
      "layer2.12.conv.conv.weight",
      "layer2.12.conv_out.conv.weight",
      "layer2.13.le.conv.weight",
      "layer2.13.conv_in.conv.weight",
      "layer2.13.conv.conv.weight",
      "layer2.13.conv_out.conv.weight",
      "layer2.14.le.conv.weight",
      "layer2.14.conv_in.conv.weight",
      "layer2.14.conv.conv.weight",
      "layer2.14.conv_out.conv.weight",
      "layer3.0.le.conv.weight",
      "layer3.0.conv_in.conv.weight",
      "layer3.0.conv.conv.weight",
      "layer3.0.conv_out.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.le.conv.weight",
      "layer3.1.conv_in.conv.weight",
      "layer3.1.conv.conv.weight",
      "layer3.1.conv_out.conv.weight",
      "layer3.2.le.conv.weight",
      "layer3.2.conv_in.conv.weight",
      "layer3.2.conv.conv.weight",
      "layer3.2.conv_out.conv.weight",
      "layer3.3.le.conv.weight",
      "layer3.3.conv_in.conv.weight",
      "layer3.3.conv.conv.weight",
      "layer3.3.conv_out.conv.weight",
      "layer3.4.le.conv.weight",
      "layer3.4.conv_in.conv.weight",
      "layer3.4.conv.conv.weight",
      "layer3.4.conv_out.conv.weight",
      "layer3.5.le.conv.weight",
      "layer3.5.conv_in.conv.weight",
      "layer3.5.conv.conv.weight",
      "layer3.5.conv_out.conv.weight",
      "layer3.6.le.conv.weight",
      "layer3.6.conv_in.conv.weight",
      "layer3.6.conv.conv.weight",
      "layer3.6.conv_out.conv.weight",
      "layer3.7.le.conv.weight",
      "layer3.7.conv_in.conv.weight",
      "layer3.7.conv.conv.weight",
      "layer3.7.conv_out.conv.weight",
      "layer3.8.le.conv.weight",
      "layer3.8.conv_in.conv.weight",
      "layer3.8.conv.conv.weight",
      "layer3.8.conv_out.conv.weight",
      "layer3.9.le.conv.weight",
      "layer3.9.conv_in.conv.weight",
      "layer3.9.conv.conv.weight",
      "layer3.9.conv_out.conv.weight",
      "layer3.10.le.conv.weight",
      "layer3.10.conv_in.conv.weight",
      "layer3.10.conv.conv.weight",
      "layer3.10.conv_out.conv.weight",
      "layer3.11.le.conv.weight",
      "layer3.11.conv_in.conv.weight",
      "layer3.11.conv.conv.weight",
      "layer3.11.conv_out.conv.weight",
      "layer3.12.le.conv.weight",
      "layer3.12.conv_in.conv.weight",
      "layer3.12.conv.conv.weight",
      "layer3.12.conv_out.conv.weight",
      "layer3.13.le.conv.weight",
      "layer3.13.conv_in.conv.weight",
      "layer3.13.conv.conv.weight",
      "layer3.13.conv_out.conv.weight",
      "layer3.14.le.conv.weight",
      "layer3.14.conv_in.conv.weight",
      "layer3.14.conv.conv.weight",
      "layer3.14.conv_out.conv.weight",
      "layer3.15.le.conv.weight",
      "layer3.15.conv_in.conv.weight",
      "layer3.15.conv.conv.weight",
      "layer3.15.conv_out.conv.weight",
      "layer3.16.le.conv.weight",
      "layer3.16.conv_in.conv.weight",
      "layer3.16.conv.conv.weight",
      "layer3.16.conv_out.conv.weight",
      "layer3.17.le.conv.weight",
      "layer3.17.conv_in.conv.weight",
      "layer3.17.conv.conv.weight",
      "layer3.17.conv_out.conv.weight",
      "layer3.18.le.conv.weight",
      "layer3.18.conv_in.conv.weight",
      "layer3.18.conv.conv.weight",
      "layer3.18.conv_out.conv.weight",
      "layer3.19.le.conv.weight",
      "layer3.19.conv_in.conv.weight",
      "layer3.19.conv.conv.weight",
      "layer3.19.conv_out.conv.weight",
      "layer3.20.le.conv.weight",
      "layer3.20.conv_in.conv.weight",
      "layer3.20.conv.conv.weight",
      "layer3.20.conv_out.conv.weight",
      "layer3.21.le.conv.weight",
      "layer3.21.conv_in.conv.weight",
      "layer3.21.conv.conv.weight",
      "layer3.21.conv_out.conv.weight",
      "layer3.22.le.conv.weight",
      "layer3.22.conv_in.conv.weight",
      "layer3.22.conv.conv.weight",
      "layer3.22.conv_out.conv.weight",
      "layer3.23.le.conv.weight",
      "layer3.23.conv_in.conv.weight",
      "layer3.23.conv.conv.weight",
      "layer3.23.conv_out.conv.weight",
      "layer3.24.le.conv.weight",
      "layer3.24.conv_in.conv.weight",
      "layer3.24.conv.conv.weight",
      "layer3.24.conv_out.conv.weight",
      "layer3.25.le.conv.weight",
      "layer3.25.conv_in.conv.weight",
      "layer3.25.conv.conv.weight",
      "layer3.25.conv_out.conv.weight",
      "layer3.26.le.conv.weight",
      "layer3.26.conv_in.conv.weight",
      "layer3.26.conv.conv.weight",
      "layer3.26.conv_out.conv.weight",
      "layer3.27.le.conv.weight",
      "layer3.27.conv_in.conv.weight",
      "layer3.27.conv.conv.weight",
      "layer3.27.conv_out.conv.weight",
      "layer3.28.le.conv.weight",
      "layer3.28.conv_in.conv.weight",
      "layer3.28.conv.conv.weight",
      "layer3.28.conv_out.conv.weight",
      "layer3.29.le.conv.weight",
      "layer3.29.conv_in.conv.weight",
      "layer3.29.conv.conv.weight",
      "layer3.29.conv_out.conv.weight",
      "layer3.30.le.conv.weight",
      "layer3.30.conv_in.conv.weight",
      "layer3.30.conv.conv.weight",
      "layer3.30.conv_out.conv.weight",
      "layer3.31.le.conv.weight",
      "layer3.31.conv_in.conv.weight",
      "layer3.31.conv.conv.weight",
      "layer3.31.conv_out.conv.weight",
      "layer3.32.le.conv.weight",
      "layer3.32.conv_in.conv.weight",
      "layer3.32.conv.conv.weight",
      "layer3.32.conv_out.conv.weight",
      "layer3.33.le.conv.weight",
      "layer3.33.conv_in.conv.weight",
      "layer3.33.conv.conv.weight",
      "layer3.33.conv_out.conv.weight",
      "layer3.34.le.conv.weight",
      "layer3.34.conv_in.conv.weight",
      "layer3.34.conv.conv.weight",
      "layer3.34.conv_out.conv.weight",
      "layer4.0.le.conv.weight",
      "layer4.0.conv_in.conv.weight",
      "layer4.0.conv.conv.weight",
      "layer4.0.conv_out.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.le.conv.weight",
      "layer4.1.conv_in.conv.weight",
      "layer4.1.conv.conv.weight",
      "layer4.1.conv_out.conv.weight",
      "layer4.2.le.conv.weight",
      "layer4.2.conv_in.conv.weight",
      "layer4.2.conv.conv.weight",
      "layer4.2.conv_out.conv.weight",
      "layer4.3.le.conv.weight",
      "layer4.3.conv_in.conv.weight",
      "layer4.3.conv.conv.weight",
      "layer4.3.conv_out.conv.weight",
      "layer4.4.le.conv.weight",
      "layer4.4.conv_in.conv.weight",
      "layer4.4.conv.conv.weight",
      "layer4.4.conv_out.conv.weight",
      "layer4.5.le.conv.weight",
      "layer4.5.conv_in.conv.weight",
      "layer4.5.conv.conv.weight",
      "layer4.5.conv_out.conv.weight",
      "layer4.6.le.conv.weight",
      "layer4.6.conv_in.conv.weight",
      "layer4.6.conv.conv.weight",
      "layer4.6.conv_out.conv.weight",
      "layer4.7.le.conv.weight",
      "layer4.7.conv_in.conv.weight",
      "layer4.7.conv.conv.weight",
      "layer4.7.conv_out.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.ln.weight",
      "layer1.0.ln.bias",
      "layer1.0.le.norm.weight",
      "layer1.0.le.norm.bias",
      "layer1.0.conv_in.norm.weight",
      "layer1.0.conv_in.norm.bias",
      "layer1.0.conv.norm.weight",
      "layer1.0.conv.norm.bias",
      "layer1.0.conv_out.norm.weight",
      "layer1.0.conv_out.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.le.norm.weight",
      "layer1.1.le.norm.bias",
      "layer1.1.conv_in.norm.weight",
      "layer1.1.conv_in.norm.bias",
      "layer1.1.conv.norm.weight",
      "layer1.1.conv.norm.bias",
      "layer1.1.conv_out.norm.weight",
      "layer1.1.conv_out.norm.bias",
      "layer1.2.le.norm.weight",
      "layer1.2.le.norm.bias",
      "layer1.2.conv_in.norm.weight",
      "layer1.2.conv_in.norm.bias",
      "layer1.2.conv.norm.weight",
      "layer1.2.conv.norm.bias",
      "layer1.2.conv_out.norm.weight",
      "layer1.2.conv_out.norm.bias",
      "layer1.3.le.norm.weight",
      "layer1.3.le.norm.bias",
      "layer1.3.conv_in.norm.weight",
      "layer1.3.conv_in.norm.bias",
      "layer1.3.conv.norm.weight",
      "layer1.3.conv.norm.bias",
      "layer1.3.conv_out.norm.weight",
      "layer1.3.conv_out.norm.bias",
      "layer1.4.le.norm.weight",
      "layer1.4.le.norm.bias",
      "layer1.4.conv_in.norm.weight",
      "layer1.4.conv_in.norm.bias",
      "layer1.4.conv.norm.weight",
      "layer1.4.conv.norm.bias",
      "layer1.4.conv_out.norm.weight",
      "layer1.4.conv_out.norm.bias",
      "layer1.5.le.norm.weight",
      "layer1.5.le.norm.bias",
      "layer1.5.conv_in.norm.weight",
      "layer1.5.conv_in.norm.bias",
      "layer1.5.conv.norm.weight",
      "layer1.5.conv.norm.bias",
      "layer1.5.conv_out.norm.weight",
      "layer1.5.conv_out.norm.bias",
      "layer1.6.le.norm.weight",
      "layer1.6.le.norm.bias",
      "layer1.6.conv_in.norm.weight",
      "layer1.6.conv_in.norm.bias",
      "layer1.6.conv.norm.weight",
      "layer1.6.conv.norm.bias",
      "layer1.6.conv_out.norm.weight",
      "layer1.6.conv_out.norm.bias",
      "layer1.7.le.norm.weight",
      "layer1.7.le.norm.bias",
      "layer1.7.conv_in.norm.weight",
      "layer1.7.conv_in.norm.bias",
      "layer1.7.conv.norm.weight",
      "layer1.7.conv.norm.bias",
      "layer1.7.conv_out.norm.weight",
      "layer1.7.conv_out.norm.bias",
      "layer2.0.ln.weight",
      "layer2.0.ln.bias",
      "layer2.0.le.norm.weight",
      "layer2.0.le.norm.bias",
      "layer2.0.conv_in.norm.weight",
      "layer2.0.conv_in.norm.bias",
      "layer2.0.conv.norm.weight",
      "layer2.0.conv.norm.bias",
      "layer2.0.conv_out.norm.weight",
      "layer2.0.conv_out.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.le.norm.weight",
      "layer2.1.le.norm.bias",
      "layer2.1.conv_in.norm.weight",
      "layer2.1.conv_in.norm.bias",
      "layer2.1.conv.norm.weight",
      "layer2.1.conv.norm.bias",
      "layer2.1.conv_out.norm.weight",
      "layer2.1.conv_out.norm.bias",
      "layer2.2.le.norm.weight",
      "layer2.2.le.norm.bias",
      "layer2.2.conv_in.norm.weight",
      "layer2.2.conv_in.norm.bias",
      "layer2.2.conv.norm.weight",
      "layer2.2.conv.norm.bias",
      "layer2.2.conv_out.norm.weight",
      "layer2.2.conv_out.norm.bias",
      "layer2.3.le.norm.weight",
      "layer2.3.le.norm.bias",
      "layer2.3.conv_in.norm.weight",
      "layer2.3.conv_in.norm.bias",
      "layer2.3.conv.norm.weight",
      "layer2.3.conv.norm.bias",
      "layer2.3.conv_out.norm.weight",
      "layer2.3.conv_out.norm.bias",
      "layer2.4.le.norm.weight",
      "layer2.4.le.norm.bias",
      "layer2.4.conv_in.norm.weight",
      "layer2.4.conv_in.norm.bias",
      "layer2.4.conv.norm.weight",
      "layer2.4.conv.norm.bias",
      "layer2.4.conv_out.norm.weight",
      "layer2.4.conv_out.norm.bias",
      "layer2.5.le.norm.weight",
      "layer2.5.le.norm.bias",
      "layer2.5.conv_in.norm.weight",
      "layer2.5.conv_in.norm.bias",
      "layer2.5.conv.norm.weight",
      "layer2.5.conv.norm.bias",
      "layer2.5.conv_out.norm.weight",
      "layer2.5.conv_out.norm.bias",
      "layer2.6.le.norm.weight",
      "layer2.6.le.norm.bias",
      "layer2.6.conv_in.norm.weight",
      "layer2.6.conv_in.norm.bias",
      "layer2.6.conv.norm.weight",
      "layer2.6.conv.norm.bias",
      "layer2.6.conv_out.norm.weight",
      "layer2.6.conv_out.norm.bias",
      "layer2.7.le.norm.weight",
      "layer2.7.le.norm.bias",
      "layer2.7.conv_in.norm.weight",
      "layer2.7.conv_in.norm.bias",
      "layer2.7.conv.norm.weight",
      "layer2.7.conv.norm.bias",
      "layer2.7.conv_out.norm.weight",
      "layer2.7.conv_out.norm.bias",
      "layer2.8.le.norm.weight",
      "layer2.8.le.norm.bias",
      "layer2.8.conv_in.norm.weight",
      "layer2.8.conv_in.norm.bias",
      "layer2.8.conv.norm.weight",
      "layer2.8.conv.norm.bias",
      "layer2.8.conv_out.norm.weight",
      "layer2.8.conv_out.norm.bias",
      "layer2.9.le.norm.weight",
      "layer2.9.le.norm.bias",
      "layer2.9.conv_in.norm.weight",
      "layer2.9.conv_in.norm.bias",
      "layer2.9.conv.norm.weight",
      "layer2.9.conv.norm.bias",
      "layer2.9.conv_out.norm.weight",
      "layer2.9.conv_out.norm.bias",
      "layer2.10.le.norm.weight",
      "layer2.10.le.norm.bias",
      "layer2.10.conv_in.norm.weight",
      "layer2.10.conv_in.norm.bias",
      "layer2.10.conv.norm.weight",
      "layer2.10.conv.norm.bias",
      "layer2.10.conv_out.norm.weight",
      "layer2.10.conv_out.norm.bias",
      "layer2.11.le.norm.weight",
      "layer2.11.le.norm.bias",
      "layer2.11.conv_in.norm.weight",
      "layer2.11.conv_in.norm.bias",
      "layer2.11.conv.norm.weight",
      "layer2.11.conv.norm.bias",
      "layer2.11.conv_out.norm.weight",
      "layer2.11.conv_out.norm.bias",
      "layer2.12.le.norm.weight",
      "layer2.12.le.norm.bias",
      "layer2.12.conv_in.norm.weight",
      "layer2.12.conv_in.norm.bias",
      "layer2.12.conv.norm.weight",
      "layer2.12.conv.norm.bias",
      "layer2.12.conv_out.norm.weight",
      "layer2.12.conv_out.norm.bias",
      "layer2.13.le.norm.weight",
      "layer2.13.le.norm.bias",
      "layer2.13.conv_in.norm.weight",
      "layer2.13.conv_in.norm.bias",
      "layer2.13.conv.norm.weight",
      "layer2.13.conv.norm.bias",
      "layer2.13.conv_out.norm.weight",
      "layer2.13.conv_out.norm.bias",
      "layer2.14.le.norm.weight",
      "layer2.14.le.norm.bias",
      "layer2.14.conv_in.norm.weight",
      "layer2.14.conv_in.norm.bias",
      "layer2.14.conv.norm.weight",
      "layer2.14.conv.norm.bias",
      "layer2.14.conv_out.norm.weight",
      "layer2.14.conv_out.norm.bias",
      "layer3.0.ln.weight",
      "layer3.0.ln.bias",
      "layer3.0.le.norm.weight",
      "layer3.0.le.norm.bias",
      "layer3.0.conv_in.norm.weight",
      "layer3.0.conv_in.norm.bias",
      "layer3.0.conv.norm.weight",
      "layer3.0.conv.norm.bias",
      "layer3.0.conv_out.norm.weight",
      "layer3.0.conv_out.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.le.norm.weight",
      "layer3.1.le.norm.bias",
      "layer3.1.conv_in.norm.weight",
      "layer3.1.conv_in.norm.bias",
      "layer3.1.conv.norm.weight",
      "layer3.1.conv.norm.bias",
      "layer3.1.conv_out.norm.weight",
      "layer3.1.conv_out.norm.bias",
      "layer3.2.le.norm.weight",
      "layer3.2.le.norm.bias",
      "layer3.2.conv_in.norm.weight",
      "layer3.2.conv_in.norm.bias",
      "layer3.2.conv.norm.weight",
      "layer3.2.conv.norm.bias",
      "layer3.2.conv_out.norm.weight",
      "layer3.2.conv_out.norm.bias",
      "layer3.3.le.norm.weight",
      "layer3.3.le.norm.bias",
      "layer3.3.conv_in.norm.weight",
      "layer3.3.conv_in.norm.bias",
      "layer3.3.conv.norm.weight",
      "layer3.3.conv.norm.bias",
      "layer3.3.conv_out.norm.weight",
      "layer3.3.conv_out.norm.bias",
      "layer3.4.le.norm.weight",
      "layer3.4.le.norm.bias",
      "layer3.4.conv_in.norm.weight",
      "layer3.4.conv_in.norm.bias",
      "layer3.4.conv.norm.weight",
      "layer3.4.conv.norm.bias",
      "layer3.4.conv_out.norm.weight",
      "layer3.4.conv_out.norm.bias",
      "layer3.5.le.norm.weight",
      "layer3.5.le.norm.bias",
      "layer3.5.conv_in.norm.weight",
      "layer3.5.conv_in.norm.bias",
      "layer3.5.conv.norm.weight",
      "layer3.5.conv.norm.bias",
      "layer3.5.conv_out.norm.weight",
      "layer3.5.conv_out.norm.bias",
      "layer3.6.le.norm.weight",
      "layer3.6.le.norm.bias",
      "layer3.6.conv_in.norm.weight",
      "layer3.6.conv_in.norm.bias",
      "layer3.6.conv.norm.weight",
      "layer3.6.conv.norm.bias",
      "layer3.6.conv_out.norm.weight",
      "layer3.6.conv_out.norm.bias",
      "layer3.7.le.norm.weight",
      "layer3.7.le.norm.bias",
      "layer3.7.conv_in.norm.weight",
      "layer3.7.conv_in.norm.bias",
      "layer3.7.conv.norm.weight",
      "layer3.7.conv.norm.bias",
      "layer3.7.conv_out.norm.weight",
      "layer3.7.conv_out.norm.bias",
      "layer3.8.le.norm.weight",
      "layer3.8.le.norm.bias",
      "layer3.8.conv_in.norm.weight",
      "layer3.8.conv_in.norm.bias",
      "layer3.8.conv.norm.weight",
      "layer3.8.conv.norm.bias",
      "layer3.8.conv_out.norm.weight",
      "layer3.8.conv_out.norm.bias",
      "layer3.9.le.norm.weight",
      "layer3.9.le.norm.bias",
      "layer3.9.conv_in.norm.weight",
      "layer3.9.conv_in.norm.bias",
      "layer3.9.conv.norm.weight",
      "layer3.9.conv.norm.bias",
      "layer3.9.conv_out.norm.weight",
      "layer3.9.conv_out.norm.bias",
      "layer3.10.le.norm.weight",
      "layer3.10.le.norm.bias",
      "layer3.10.conv_in.norm.weight",
      "layer3.10.conv_in.norm.bias",
      "layer3.10.conv.norm.weight",
      "layer3.10.conv.norm.bias",
      "layer3.10.conv_out.norm.weight",
      "layer3.10.conv_out.norm.bias",
      "layer3.11.le.norm.weight",
      "layer3.11.le.norm.bias",
      "layer3.11.conv_in.norm.weight",
      "layer3.11.conv_in.norm.bias",
      "layer3.11.conv.norm.weight",
      "layer3.11.conv.norm.bias",
      "layer3.11.conv_out.norm.weight",
      "layer3.11.conv_out.norm.bias",
      "layer3.12.le.norm.weight",
      "layer3.12.le.norm.bias",
      "layer3.12.conv_in.norm.weight",
      "layer3.12.conv_in.norm.bias",
      "layer3.12.conv.norm.weight",
      "layer3.12.conv.norm.bias",
      "layer3.12.conv_out.norm.weight",
      "layer3.12.conv_out.norm.bias",
      "layer3.13.le.norm.weight",
      "layer3.13.le.norm.bias",
      "layer3.13.conv_in.norm.weight",
      "layer3.13.conv_in.norm.bias",
      "layer3.13.conv.norm.weight",
      "layer3.13.conv.norm.bias",
      "layer3.13.conv_out.norm.weight",
      "layer3.13.conv_out.norm.bias",
      "layer3.14.le.norm.weight",
      "layer3.14.le.norm.bias",
      "layer3.14.conv_in.norm.weight",
      "layer3.14.conv_in.norm.bias",
      "layer3.14.conv.norm.weight",
      "layer3.14.conv.norm.bias",
      "layer3.14.conv_out.norm.weight",
      "layer3.14.conv_out.norm.bias",
      "layer3.15.le.norm.weight",
      "layer3.15.le.norm.bias",
      "layer3.15.conv_in.norm.weight",
      "layer3.15.conv_in.norm.bias",
      "layer3.15.conv.norm.weight",
      "layer3.15.conv.norm.bias",
      "layer3.15.conv_out.norm.weight",
      "layer3.15.conv_out.norm.bias",
      "layer3.16.le.norm.weight",
      "layer3.16.le.norm.bias",
      "layer3.16.conv_in.norm.weight",
      "layer3.16.conv_in.norm.bias",
      "layer3.16.conv.norm.weight",
      "layer3.16.conv.norm.bias",
      "layer3.16.conv_out.norm.weight",
      "layer3.16.conv_out.norm.bias",
      "layer3.17.le.norm.weight",
      "layer3.17.le.norm.bias",
      "layer3.17.conv_in.norm.weight",
      "layer3.17.conv_in.norm.bias",
      "layer3.17.conv.norm.weight",
      "layer3.17.conv.norm.bias",
      "layer3.17.conv_out.norm.weight",
      "layer3.17.conv_out.norm.bias",
      "layer3.18.le.norm.weight",
      "layer3.18.le.norm.bias",
      "layer3.18.conv_in.norm.weight",
      "layer3.18.conv_in.norm.bias",
      "layer3.18.conv.norm.weight",
      "layer3.18.conv.norm.bias",
      "layer3.18.conv_out.norm.weight",
      "layer3.18.conv_out.norm.bias",
      "layer3.19.le.norm.weight",
      "layer3.19.le.norm.bias",
      "layer3.19.conv_in.norm.weight",
      "layer3.19.conv_in.norm.bias",
      "layer3.19.conv.norm.weight",
      "layer3.19.conv.norm.bias",
      "layer3.19.conv_out.norm.weight",
      "layer3.19.conv_out.norm.bias",
      "layer3.20.le.norm.weight",
      "layer3.20.le.norm.bias",
      "layer3.20.conv_in.norm.weight",
      "layer3.20.conv_in.norm.bias",
      "layer3.20.conv.norm.weight",
      "layer3.20.conv.norm.bias",
      "layer3.20.conv_out.norm.weight",
      "layer3.20.conv_out.norm.bias",
      "layer3.21.le.norm.weight",
      "layer3.21.le.norm.bias",
      "layer3.21.conv_in.norm.weight",
      "layer3.21.conv_in.norm.bias",
      "layer3.21.conv.norm.weight",
      "layer3.21.conv.norm.bias",
      "layer3.21.conv_out.norm.weight",
      "layer3.21.conv_out.norm.bias",
      "layer3.22.le.norm.weight",
      "layer3.22.le.norm.bias",
      "layer3.22.conv_in.norm.weight",
      "layer3.22.conv_in.norm.bias",
      "layer3.22.conv.norm.weight",
      "layer3.22.conv.norm.bias",
      "layer3.22.conv_out.norm.weight",
      "layer3.22.conv_out.norm.bias",
      "layer3.23.le.norm.weight",
      "layer3.23.le.norm.bias",
      "layer3.23.conv_in.norm.weight",
      "layer3.23.conv_in.norm.bias",
      "layer3.23.conv.norm.weight",
      "layer3.23.conv.norm.bias",
      "layer3.23.conv_out.norm.weight",
      "layer3.23.conv_out.norm.bias",
      "layer3.24.le.norm.weight",
      "layer3.24.le.norm.bias",
      "layer3.24.conv_in.norm.weight",
      "layer3.24.conv_in.norm.bias",
      "layer3.24.conv.norm.weight",
      "layer3.24.conv.norm.bias",
      "layer3.24.conv_out.norm.weight",
      "layer3.24.conv_out.norm.bias",
      "layer3.25.le.norm.weight",
      "layer3.25.le.norm.bias",
      "layer3.25.conv_in.norm.weight",
      "layer3.25.conv_in.norm.bias",
      "layer3.25.conv.norm.weight",
      "layer3.25.conv.norm.bias",
      "layer3.25.conv_out.norm.weight",
      "layer3.25.conv_out.norm.bias",
      "layer3.26.le.norm.weight",
      "layer3.26.le.norm.bias",
      "layer3.26.conv_in.norm.weight",
      "layer3.26.conv_in.norm.bias",
      "layer3.26.conv.norm.weight",
      "layer3.26.conv.norm.bias",
      "layer3.26.conv_out.norm.weight",
      "layer3.26.conv_out.norm.bias",
      "layer3.27.le.norm.weight",
      "layer3.27.le.norm.bias",
      "layer3.27.conv_in.norm.weight",
      "layer3.27.conv_in.norm.bias",
      "layer3.27.conv.norm.weight",
      "layer3.27.conv.norm.bias",
      "layer3.27.conv_out.norm.weight",
      "layer3.27.conv_out.norm.bias",
      "layer3.28.le.norm.weight",
      "layer3.28.le.norm.bias",
      "layer3.28.conv_in.norm.weight",
      "layer3.28.conv_in.norm.bias",
      "layer3.28.conv.norm.weight",
      "layer3.28.conv.norm.bias",
      "layer3.28.conv_out.norm.weight",
      "layer3.28.conv_out.norm.bias",
      "layer3.29.le.norm.weight",
      "layer3.29.le.norm.bias",
      "layer3.29.conv_in.norm.weight",
      "layer3.29.conv_in.norm.bias",
      "layer3.29.conv.norm.weight",
      "layer3.29.conv.norm.bias",
      "layer3.29.conv_out.norm.weight",
      "layer3.29.conv_out.norm.bias",
      "layer3.30.le.norm.weight",
      "layer3.30.le.norm.bias",
      "layer3.30.conv_in.norm.weight",
      "layer3.30.conv_in.norm.bias",
      "layer3.30.conv.norm.weight",
      "layer3.30.conv.norm.bias",
      "layer3.30.conv_out.norm.weight",
      "layer3.30.conv_out.norm.bias",
      "layer3.31.le.norm.weight",
      "layer3.31.le.norm.bias",
      "layer3.31.conv_in.norm.weight",
      "layer3.31.conv_in.norm.bias",
      "layer3.31.conv.norm.weight",
      "layer3.31.conv.norm.bias",
      "layer3.31.conv_out.norm.weight",
      "layer3.31.conv_out.norm.bias",
      "layer3.32.le.norm.weight",
      "layer3.32.le.norm.bias",
      "layer3.32.conv_in.norm.weight",
      "layer3.32.conv_in.norm.bias",
      "layer3.32.conv.norm.weight",
      "layer3.32.conv.norm.bias",
      "layer3.32.conv_out.norm.weight",
      "layer3.32.conv_out.norm.bias",
      "layer3.33.le.norm.weight",
      "layer3.33.le.norm.bias",
      "layer3.33.conv_in.norm.weight",
      "layer3.33.conv_in.norm.bias",
      "layer3.33.conv.norm.weight",
      "layer3.33.conv.norm.bias",
      "layer3.33.conv_out.norm.weight",
      "layer3.33.conv_out.norm.bias",
      "layer3.34.le.norm.weight",
      "layer3.34.le.norm.bias",
      "layer3.34.conv_in.norm.weight",
      "layer3.34.conv_in.norm.bias",
      "layer3.34.conv.norm.weight",
      "layer3.34.conv.norm.bias",
      "layer3.34.conv_out.norm.weight",
      "layer3.34.conv_out.norm.bias",
      "layer4.0.ln.weight",
      "layer4.0.ln.bias",
      "layer4.0.le.norm.weight",
      "layer4.0.le.norm.bias",
      "layer4.0.conv_in.norm.weight",
      "layer4.0.conv_in.norm.bias",
      "layer4.0.conv.norm.weight",
      "layer4.0.conv.norm.bias",
      "layer4.0.conv_out.norm.weight",
      "layer4.0.conv_out.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.le.norm.weight",
      "layer4.1.le.norm.bias",
      "layer4.1.conv_in.norm.weight",
      "layer4.1.conv_in.norm.bias",
      "layer4.1.conv.norm.weight",
      "layer4.1.conv.norm.bias",
      "layer4.1.conv_out.norm.weight",
      "layer4.1.conv_out.norm.bias",
      "layer4.2.le.norm.weight",
      "layer4.2.le.norm.bias",
      "layer4.2.conv_in.norm.weight",
      "layer4.2.conv_in.norm.bias",
      "layer4.2.conv.norm.weight",
      "layer4.2.conv.norm.bias",
      "layer4.2.conv_out.norm.weight",
      "layer4.2.conv_out.norm.bias",
      "layer4.3.le.norm.weight",
      "layer4.3.le.norm.bias",
      "layer4.3.conv_in.norm.weight",
      "layer4.3.conv_in.norm.bias",
      "layer4.3.conv.norm.weight",
      "layer4.3.conv.norm.bias",
      "layer4.3.conv_out.norm.weight",
      "layer4.3.conv_out.norm.bias",
      "layer4.4.le.norm.weight",
      "layer4.4.le.norm.bias",
      "layer4.4.conv_in.norm.weight",
      "layer4.4.conv_in.norm.bias",
      "layer4.4.conv.norm.weight",
      "layer4.4.conv.norm.bias",
      "layer4.4.conv_out.norm.weight",
      "layer4.4.conv_out.norm.bias",
      "layer4.5.le.norm.weight",
      "layer4.5.le.norm.bias",
      "layer4.5.conv_in.norm.weight",
      "layer4.5.conv_in.norm.bias",
      "layer4.5.conv.norm.weight",
      "layer4.5.conv.norm.bias",
      "layer4.5.conv_out.norm.weight",
      "layer4.5.conv_out.norm.bias",
      "layer4.6.le.norm.weight",
      "layer4.6.le.norm.bias",
      "layer4.6.conv_in.norm.weight",
      "layer4.6.conv_in.norm.bias",
      "layer4.6.conv.norm.weight",
      "layer4.6.conv.norm.bias",
      "layer4.6.conv_out.norm.weight",
      "layer4.6.conv_out.norm.bias",
      "layer4.7.le.norm.weight",
      "layer4.7.le.norm.bias",
      "layer4.7.conv_in.norm.weight",
      "layer4.7.conv_in.norm.bias",
      "layer4.7.conv.norm.weight",
      "layer4.7.conv.norm.bias",
      "layer4.7.conv_out.norm.weight",
      "layer4.7.conv_out.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 7:39:46  lr: 0.000000  min_lr: 0.000000  loss: 6.9635 (6.9635)  weight_decay: 0.0500 (0.0500)  time: 22.0515  data: 3.9423  max mem: 69511
Epoch: [0]  [ 200/1251]  eta: 0:15:42  lr: 0.000028  min_lr: 0.000028  loss: 6.8987 (6.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4452 (6.8946)  time: 0.7908  data: 0.0005  max mem: 69511
Epoch: [0]  [ 400/1251]  eta: 0:11:59  lr: 0.000056  min_lr: 0.000056  loss: 6.8314 (6.8874)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2521 (4.8636)  time: 0.8000  data: 0.0006  max mem: 69511
Epoch: [0]  [ 600/1251]  eta: 0:09:01  lr: 0.000084  min_lr: 0.000084  loss: 6.7457 (6.8413)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.5042 (4.6685)  time: 0.7928  data: 0.0005  max mem: 69511
Epoch: [0]  [ 800/1251]  eta: 0:06:10  lr: 0.000112  min_lr: 0.000112  loss: 6.6285 (6.7989)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.8904 (inf)  time: 0.7932  data: 0.0005  max mem: 69511
Epoch: [0]  [1000/1251]  eta: 0:03:25  lr: 0.000140  min_lr: 0.000140  loss: 6.6468 (6.7614)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.9225 (inf)  time: 0.8588  data: 0.0006  max mem: 69511
Epoch: [0]  [1200/1251]  eta: 0:00:41  lr: 0.000168  min_lr: 0.000168  loss: 6.5276 (6.7243)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7954 (inf)  time: 0.7935  data: 0.0005  max mem: 69511
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000174  min_lr: 0.000174  loss: 6.4798 (6.7168)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7954 (inf)  time: 0.6738  data: 0.0005  max mem: 69511
Epoch: [0] Total time: 0:17:03 (0.8179 s / it)
Averaged stats: lr: 0.000174  min_lr: 0.000174  loss: 6.4798 (6.7158)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7954 (inf)
Test:  [ 0/25]  eta: 0:05:55  loss: 5.8127 (5.8127)  acc1: 4.8000 (4.8000)  acc5: 14.0000 (14.0000)  time: 14.2055  data: 8.2144  max mem: 69511
Test:  [10/25]  eta: 0:00:24  loss: 5.7842 (5.7810)  acc1: 3.6000 (3.3455)  acc5: 13.2000 (11.8909)  time: 1.6515  data: 0.7470  max mem: 69511
Test:  [20/25]  eta: 0:00:05  loss: 5.7727 (5.7748)  acc1: 3.6000 (3.5048)  acc5: 11.6000 (12.5905)  time: 0.3961  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 5.7639 (5.7171)  acc1: 3.6000 (4.2080)  acc5: 12.4000 (13.9040)  time: 0.3961  data: 0.0001  max mem: 69511
Test: Total time: 0:00:23 (0.9526 s / it)
* Acc@1 4.064 Acc@5 13.292 loss 5.726
Accuracy of the model on the 50000 test images: 4.1%
Max accuracy: 4.06%
Epoch: [1]  [   0/1251]  eta: 1:11:53  lr: 0.000175  min_lr: 0.000175  loss: 6.5221 (6.5221)  weight_decay: 0.0500 (0.0500)  time: 3.4481  data: 2.6356  max mem: 69511
Epoch: [1]  [ 200/1251]  eta: 0:13:59  lr: 0.000203  min_lr: 0.000203  loss: 6.5059 (6.4924)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5182 (6.7090)  time: 0.7867  data: 0.0003  max mem: 69511
Epoch: [1]  [ 400/1251]  eta: 0:11:16  lr: 0.000231  min_lr: 0.000231  loss: 6.4865 (6.4637)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.7938 (6.2789)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [1]  [ 600/1251]  eta: 0:08:35  lr: 0.000259  min_lr: 0.000259  loss: 6.4648 (6.4454)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3344 (6.1847)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [1]  [ 800/1251]  eta: 0:05:56  lr: 0.000287  min_lr: 0.000287  loss: 6.3666 (6.4186)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.1333 (6.0694)  time: 0.7991  data: 0.0004  max mem: 69511
Epoch: [1]  [1000/1251]  eta: 0:03:18  lr: 0.000315  min_lr: 0.000315  loss: 6.1782 (6.3925)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.8656 (6.0477)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [1]  [1200/1251]  eta: 0:00:40  lr: 0.000343  min_lr: 0.000343  loss: 6.1194 (6.3721)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.8009 (5.9370)  time: 0.7869  data: 0.0006  max mem: 69511
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000349  min_lr: 0.000349  loss: 6.2043 (6.3647)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6473 (5.9222)  time: 0.6686  data: 0.0006  max mem: 69511
Epoch: [1] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.000349  min_lr: 0.000349  loss: 6.2043 (6.3556)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6473 (5.9222)
Test:  [ 0/25]  eta: 0:03:08  loss: 5.0422 (5.0422)  acc1: 7.6000 (7.6000)  acc5: 23.2000 (23.2000)  time: 7.5571  data: 7.1241  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 5.0422 (4.9999)  acc1: 10.0000 (9.4545)  acc5: 25.2000 (25.1636)  time: 1.0481  data: 0.6479  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 5.0531 (5.0529)  acc1: 8.8000 (9.4095)  acc5: 24.4000 (24.0000)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 5.0482 (5.0020)  acc1: 10.0000 (10.0000)  acc5: 24.8000 (25.1360)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6887 s / it)
* Acc@1 9.704 Acc@5 25.392 loss 5.007
Accuracy of the model on the 50000 test images: 9.7%
Max accuracy: 9.70%
Epoch: [2]  [   0/1251]  eta: 1:27:02  lr: 0.000350  min_lr: 0.000350  loss: 6.3657 (6.3657)  weight_decay: 0.0500 (0.0500)  time: 4.1744  data: 3.3907  max mem: 69511
Epoch: [2]  [ 200/1251]  eta: 0:14:07  lr: 0.000378  min_lr: 0.000378  loss: 6.0077 (6.2035)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3527 (5.6800)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [2]  [ 400/1251]  eta: 0:11:18  lr: 0.000406  min_lr: 0.000406  loss: 5.9761 (6.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7239 (5.5341)  time: 0.7872  data: 0.0007  max mem: 69511
Epoch: [2]  [ 600/1251]  eta: 0:08:36  lr: 0.000434  min_lr: 0.000434  loss: 6.1749 (6.1322)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3156 (5.5054)  time: 0.7872  data: 0.0007  max mem: 69511
Epoch: [2]  [ 800/1251]  eta: 0:05:57  lr: 0.000462  min_lr: 0.000462  loss: 6.0775 (6.1186)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3595 (5.3258)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [2]  [1000/1251]  eta: 0:03:18  lr: 0.000490  min_lr: 0.000490  loss: 6.2172 (6.1008)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.4212 (5.3332)  time: 0.7838  data: 0.0006  max mem: 69511
Epoch: [2]  [1200/1251]  eta: 0:00:40  lr: 0.000518  min_lr: 0.000518  loss: 6.0238 (6.0811)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.0063 (5.3495)  time: 0.7892  data: 0.0006  max mem: 69511
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 5.8020 (6.0719)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3823 (5.3476)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [2] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 5.8020 (6.0719)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3823 (5.3476)
Test:  [ 0/25]  eta: 0:03:08  loss: 3.9609 (3.9609)  acc1: 23.6000 (23.6000)  acc5: 50.8000 (50.8000)  time: 7.5353  data: 7.0936  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 4.0961 (4.1349)  acc1: 20.0000 (19.6000)  acc5: 41.2000 (43.9273)  time: 1.0457  data: 0.6452  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 4.4933 (4.3330)  acc1: 17.6000 (18.3238)  acc5: 37.2000 (39.6381)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 4.4473 (4.2886)  acc1: 18.0000 (19.3440)  acc5: 38.8000 (40.5280)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6885 s / it)
* Acc@1 19.164 Acc@5 41.046 loss 4.279
Accuracy of the model on the 50000 test images: 19.2%
Max accuracy: 19.16%
Epoch: [3]  [   0/1251]  eta: 1:30:24  lr: 0.000525  min_lr: 0.000525  loss: 6.2532 (6.2532)  weight_decay: 0.0500 (0.0500)  time: 4.3364  data: 3.5577  max mem: 69511
Epoch: [3]  [ 200/1251]  eta: 0:14:03  lr: 0.000553  min_lr: 0.000553  loss: 6.0358 (5.9620)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.6884 (5.1923)  time: 0.7827  data: 0.0004  max mem: 69511
Epoch: [3]  [ 400/1251]  eta: 0:11:14  lr: 0.000581  min_lr: 0.000581  loss: 5.9005 (5.8966)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1451 (4.9154)  time: 0.7834  data: 0.0004  max mem: 69511
Epoch: [3]  [ 600/1251]  eta: 0:08:35  lr: 0.000609  min_lr: 0.000609  loss: 5.8610 (5.8773)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3662 (4.9896)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [3]  [ 800/1251]  eta: 0:05:56  lr: 0.000637  min_lr: 0.000637  loss: 5.8452 (5.8696)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2733 (4.8864)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [3]  [1000/1251]  eta: 0:03:18  lr: 0.000665  min_lr: 0.000665  loss: 5.7668 (5.8450)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9171 (4.7986)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [3]  [1200/1251]  eta: 0:00:40  lr: 0.000693  min_lr: 0.000693  loss: 5.4754 (5.8133)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3062 (4.7400)  time: 0.7873  data: 0.0005  max mem: 69511
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000700  min_lr: 0.000700  loss: 5.8870 (5.8099)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8735 (4.7065)  time: 0.6684  data: 0.0006  max mem: 69511
Epoch: [3] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.000700  min_lr: 0.000700  loss: 5.8870 (5.8192)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8735 (4.7065)
Test:  [ 0/25]  eta: 0:03:11  loss: 3.3110 (3.3110)  acc1: 36.0000 (36.0000)  acc5: 60.4000 (60.4000)  time: 7.6597  data: 7.2244  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 3.3110 (3.3885)  acc1: 30.8000 (29.5636)  acc5: 60.0000 (57.2727)  time: 1.0575  data: 0.6571  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 3.7542 (3.6598)  acc1: 23.2000 (26.2857)  acc5: 49.6000 (52.0762)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 3.8490 (3.6437)  acc1: 24.0000 (26.9280)  acc5: 48.8000 (52.2240)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6929 s / it)
* Acc@1 27.504 Acc@5 52.306 loss 3.630
Accuracy of the model on the 50000 test images: 27.5%
Max accuracy: 27.50%
Epoch: [4]  [   0/1251]  eta: 1:32:52  lr: 0.000700  min_lr: 0.000700  loss: 6.0634 (6.0634)  weight_decay: 0.0500 (0.0500)  time: 4.4542  data: 3.6745  max mem: 69511
Epoch: [4]  [ 200/1251]  eta: 0:14:05  lr: 0.000728  min_lr: 0.000728  loss: 5.4177 (5.6459)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9634 (4.2725)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [4]  [ 400/1251]  eta: 0:11:19  lr: 0.000756  min_lr: 0.000756  loss: 5.5132 (5.6336)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2407 (4.3204)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [4]  [ 600/1251]  eta: 0:08:37  lr: 0.000784  min_lr: 0.000784  loss: 5.4824 (5.6268)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0761 (4.2007)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [4]  [ 800/1251]  eta: 0:05:57  lr: 0.000812  min_lr: 0.000812  loss: 5.4317 (5.5920)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0481 (4.2474)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [4]  [1000/1251]  eta: 0:03:18  lr: 0.000840  min_lr: 0.000840  loss: 5.2965 (5.5809)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2958 (4.2063)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [4]  [1200/1251]  eta: 0:00:40  lr: 0.000868  min_lr: 0.000868  loss: 5.5500 (5.5712)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3365 (4.1644)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.000875  min_lr: 0.000875  loss: 5.6917 (5.5698)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5381 (4.1428)  time: 0.6677  data: 0.0005  max mem: 69511
Epoch: [4] Total time: 0:16:28 (0.7898 s / it)
Averaged stats: lr: 0.000875  min_lr: 0.000875  loss: 5.6917 (5.5759)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5381 (4.1428)
Test:  [ 0/25]  eta: 0:03:02  loss: 2.9099 (2.9099)  acc1: 41.2000 (41.2000)  acc5: 67.6000 (67.6000)  time: 7.3008  data: 6.8593  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 2.9099 (2.9447)  acc1: 41.2000 (38.4364)  acc5: 67.6000 (67.0545)  time: 1.0251  data: 0.6239  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 3.3174 (3.2501)  acc1: 32.8000 (34.7238)  acc5: 56.0000 (60.3619)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 3.4712 (3.2374)  acc1: 31.6000 (34.9280)  acc5: 54.8000 (60.3840)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6777 s / it)
* Acc@1 34.936 Acc@5 60.626 loss 3.231
Accuracy of the model on the 50000 test images: 34.9%
Max accuracy: 34.94%
Epoch: [5]  [   0/1251]  eta: 1:16:04  lr: 0.000875  min_lr: 0.000875  loss: 5.4802 (5.4802)  weight_decay: 0.0500 (0.0500)  time: 3.6485  data: 2.8672  max mem: 69511
Epoch: [5]  [ 200/1251]  eta: 0:14:03  lr: 0.000903  min_lr: 0.000903  loss: 5.0620 (5.4079)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3016 (3.7643)  time: 0.7912  data: 0.0004  max mem: 69511
Epoch: [5]  [ 400/1251]  eta: 0:11:16  lr: 0.000931  min_lr: 0.000931  loss: 5.4194 (5.4164)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3488 (3.7670)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [5]  [ 600/1251]  eta: 0:08:35  lr: 0.000959  min_lr: 0.000959  loss: 5.6638 (5.4005)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4946 (3.7832)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [5]  [ 800/1251]  eta: 0:05:56  lr: 0.000987  min_lr: 0.000987  loss: 5.6033 (5.3830)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4518 (3.7169)  time: 0.7836  data: 0.0005  max mem: 69511
Epoch: [5]  [1000/1251]  eta: 0:03:18  lr: 0.001015  min_lr: 0.001015  loss: 5.4970 (5.3751)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3154 (3.6706)  time: 0.7840  data: 0.0004  max mem: 69511
Epoch: [5]  [1200/1251]  eta: 0:00:40  lr: 0.001043  min_lr: 0.001043  loss: 5.4454 (5.3586)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3064 (3.6360)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001050  min_lr: 0.001050  loss: 5.5274 (5.3549)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9884 (3.6052)  time: 0.6674  data: 0.0007  max mem: 69511
Epoch: [5] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.001050  min_lr: 0.001050  loss: 5.5274 (5.3782)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9884 (3.6052)
Test:  [ 0/25]  eta: 0:03:10  loss: 2.4577 (2.4577)  acc1: 52.4000 (52.4000)  acc5: 75.2000 (75.2000)  time: 7.6287  data: 7.1957  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 2.3846 (2.4955)  acc1: 48.4000 (45.6727)  acc5: 76.8000 (74.2909)  time: 1.0524  data: 0.6544  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 3.0638 (2.8983)  acc1: 36.0000 (40.4952)  acc5: 62.8000 (66.4000)  time: 0.3947  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 3.2087 (2.8848)  acc1: 36.8000 (40.9920)  acc5: 60.8000 (66.5600)  time: 0.3946  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6922 s / it)
* Acc@1 40.982 Acc@5 67.026 loss 2.873
Accuracy of the model on the 50000 test images: 41.0%
Max accuracy: 40.98%
Epoch: [6]  [   0/1251]  eta: 1:27:49  lr: 0.001050  min_lr: 0.001050  loss: 5.6787 (5.6787)  weight_decay: 0.0500 (0.0500)  time: 4.2119  data: 3.4257  max mem: 69511
Epoch: [6]  [ 200/1251]  eta: 0:14:05  lr: 0.001078  min_lr: 0.001078  loss: 5.2070 (5.2220)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7684 (3.1504)  time: 0.7841  data: 0.0005  max mem: 69511
Epoch: [6]  [ 400/1251]  eta: 0:11:15  lr: 0.001106  min_lr: 0.001106  loss: 5.1766 (5.2093)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3216 (3.2394)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [6]  [ 600/1251]  eta: 0:08:35  lr: 0.001134  min_lr: 0.001134  loss: 5.0802 (5.1956)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0038 (3.2289)  time: 0.8065  data: 0.0006  max mem: 69511
Epoch: [6]  [ 800/1251]  eta: 0:05:56  lr: 0.001162  min_lr: 0.001162  loss: 4.9693 (5.2068)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7595 (3.1771)  time: 0.7828  data: 0.0006  max mem: 69511
Epoch: [6]  [1000/1251]  eta: 0:03:18  lr: 0.001190  min_lr: 0.001190  loss: 5.2010 (5.1934)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8744 (3.1519)  time: 0.7875  data: 0.0006  max mem: 69511
Epoch: [6]  [1200/1251]  eta: 0:00:40  lr: 0.001218  min_lr: 0.001218  loss: 5.0280 (5.1831)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9426 (3.1574)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001225  min_lr: 0.001225  loss: 5.4275 (5.1824)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8809 (3.1578)  time: 0.6681  data: 0.0006  max mem: 69511
Epoch: [6] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.001225  min_lr: 0.001225  loss: 5.4275 (5.1852)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8809 (3.1578)
Test:  [ 0/25]  eta: 0:03:01  loss: 2.3058 (2.3058)  acc1: 54.4000 (54.4000)  acc5: 80.4000 (80.4000)  time: 7.2451  data: 6.8278  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 2.2926 (2.3744)  acc1: 54.4000 (51.3455)  acc5: 80.4000 (77.2727)  time: 1.0200  data: 0.6210  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 2.7640 (2.7112)  acc1: 39.6000 (45.3143)  acc5: 67.6000 (70.7619)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 3.0232 (2.7122)  acc1: 39.2000 (45.1360)  acc5: 64.8000 (70.8000)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6762 s / it)
* Acc@1 45.544 Acc@5 71.338 loss 2.695
Accuracy of the model on the 50000 test images: 45.5%
Max accuracy: 45.54%
Epoch: [7]  [   0/1251]  eta: 1:29:58  lr: 0.001225  min_lr: 0.001225  loss: 5.3722 (5.3722)  weight_decay: 0.0500 (0.0500)  time: 4.3157  data: 3.5179  max mem: 69511
Epoch: [7]  [ 200/1251]  eta: 0:14:03  lr: 0.001253  min_lr: 0.001253  loss: 4.8729 (5.0322)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7102 (2.9281)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [7]  [ 400/1251]  eta: 0:11:16  lr: 0.001281  min_lr: 0.001281  loss: 5.3075 (5.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5564 (2.7783)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [7]  [ 600/1251]  eta: 0:08:36  lr: 0.001309  min_lr: 0.001309  loss: 5.2969 (5.0620)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3473 (2.7629)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [7]  [ 800/1251]  eta: 0:05:57  lr: 0.001337  min_lr: 0.001337  loss: 5.2419 (5.0525)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4385 (2.7221)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [7]  [1000/1251]  eta: 0:03:18  lr: 0.001365  min_lr: 0.001365  loss: 4.9585 (5.0564)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4911 (2.7027)  time: 0.8044  data: 0.0005  max mem: 69511
Epoch: [7]  [1200/1251]  eta: 0:00:40  lr: 0.001393  min_lr: 0.001393  loss: 4.8952 (5.0443)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5192 (2.6721)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 4.7749 (5.0371)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8827 (2.6776)  time: 0.6682  data: 0.0006  max mem: 69511
Epoch: [7] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 4.7749 (5.0488)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8827 (2.6776)
Test:  [ 0/25]  eta: 0:03:08  loss: 1.9702 (1.9702)  acc1: 62.0000 (62.0000)  acc5: 81.2000 (81.2000)  time: 7.5219  data: 7.0916  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.9702 (2.0786)  acc1: 60.8000 (56.3636)  acc5: 82.4000 (81.2727)  time: 1.0451  data: 0.6450  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 2.5533 (2.4341)  acc1: 45.2000 (49.9810)  acc5: 71.2000 (74.9143)  time: 0.3974  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.7194 (2.4453)  acc1: 44.4000 (49.7120)  acc5: 70.0000 (74.8000)  time: 0.3974  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6875 s / it)
* Acc@1 50.018 Acc@5 75.104 loss 2.424
Accuracy of the model on the 50000 test images: 50.0%
Max accuracy: 50.02%
Epoch: [8]  [   0/1251]  eta: 1:25:34  lr: 0.001400  min_lr: 0.001400  loss: 5.4915 (5.4915)  weight_decay: 0.0500 (0.0500)  time: 4.1040  data: 3.3228  max mem: 69511
Epoch: [8]  [ 200/1251]  eta: 0:14:03  lr: 0.001428  min_lr: 0.001428  loss: 5.1099 (5.0044)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4347 (2.3384)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [8]  [ 400/1251]  eta: 0:11:18  lr: 0.001456  min_lr: 0.001456  loss: 5.2564 (4.9695)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3127 (2.2892)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [8]  [ 600/1251]  eta: 0:08:36  lr: 0.001484  min_lr: 0.001484  loss: 4.8646 (4.9419)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6051 (2.4003)  time: 0.7884  data: 0.0004  max mem: 69511
Epoch: [8]  [ 800/1251]  eta: 0:05:57  lr: 0.001512  min_lr: 0.001512  loss: 5.0211 (4.9338)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0393 (2.3945)  time: 0.7954  data: 0.0004  max mem: 69511
Epoch: [8]  [1000/1251]  eta: 0:03:18  lr: 0.001540  min_lr: 0.001540  loss: 5.1242 (4.9339)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3105 (2.3839)  time: 0.7875  data: 0.0005  max mem: 69511
Epoch: [8]  [1200/1251]  eta: 0:00:40  lr: 0.001569  min_lr: 0.001569  loss: 4.8362 (4.9303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9662 (2.3645)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001575  min_lr: 0.001575  loss: 4.9382 (4.9274)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2628 (2.3664)  time: 0.6732  data: 0.0005  max mem: 69511
Epoch: [8] Total time: 0:16:28 (0.7899 s / it)
Averaged stats: lr: 0.001575  min_lr: 0.001575  loss: 4.9382 (4.9268)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2628 (2.3664)
Test:  [ 0/25]  eta: 0:03:14  loss: 1.6971 (1.6971)  acc1: 63.2000 (63.2000)  acc5: 87.6000 (87.6000)  time: 7.7618  data: 7.3328  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.7998 (1.9160)  acc1: 60.0000 (59.0909)  acc5: 86.0000 (84.6909)  time: 1.0670  data: 0.6669  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 2.4283 (2.2727)  acc1: 51.6000 (52.9905)  acc5: 74.4000 (77.9619)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.4501 (2.2680)  acc1: 50.0000 (53.1520)  acc5: 72.8000 (77.7600)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6983 s / it)
* Acc@1 53.790 Acc@5 78.192 loss 2.239
Accuracy of the model on the 50000 test images: 53.8%
Max accuracy: 53.79%
Epoch: [9]  [   0/1251]  eta: 1:22:45  lr: 0.001575  min_lr: 0.001575  loss: 4.4269 (4.4269)  weight_decay: 0.0500 (0.0500)  time: 3.9693  data: 3.1698  max mem: 69511
Epoch: [9]  [ 200/1251]  eta: 0:14:08  lr: 0.001603  min_lr: 0.001603  loss: 4.6816 (4.8138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9855 (2.2364)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [9]  [ 400/1251]  eta: 0:11:18  lr: 0.001631  min_lr: 0.001631  loss: 5.1646 (4.8346)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0883 (2.2087)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [9]  [ 600/1251]  eta: 0:08:36  lr: 0.001659  min_lr: 0.001659  loss: 5.2009 (4.8425)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0912 (2.1957)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [9]  [ 800/1251]  eta: 0:05:57  lr: 0.001687  min_lr: 0.001687  loss: 4.5206 (4.8367)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1701 (2.2038)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [9]  [1000/1251]  eta: 0:03:18  lr: 0.001715  min_lr: 0.001715  loss: 4.7023 (4.8252)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0909 (2.1769)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [9]  [1200/1251]  eta: 0:00:40  lr: 0.001744  min_lr: 0.001744  loss: 5.0641 (4.8087)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9504 (2.1503)  time: 0.7841  data: 0.0005  max mem: 69511
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.001750  min_lr: 0.001750  loss: 4.9245 (4.8097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9381 (2.1432)  time: 0.6710  data: 0.0006  max mem: 69511
Epoch: [9] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.001750  min_lr: 0.001750  loss: 4.9245 (4.8077)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9381 (2.1432)
Test:  [ 0/25]  eta: 0:03:05  loss: 1.5804 (1.5804)  acc1: 68.8000 (68.8000)  acc5: 87.6000 (87.6000)  time: 7.4259  data: 6.9829  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.5860 (1.7469)  acc1: 66.4000 (62.2909)  acc5: 87.2000 (86.1818)  time: 1.0355  data: 0.6351  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 2.1576 (2.1286)  acc1: 50.8000 (55.2952)  acc5: 79.2000 (79.7714)  time: 0.3964  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.3478 (2.1390)  acc1: 50.8000 (55.2960)  acc5: 76.8000 (79.8720)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6835 s / it)
* Acc@1 55.676 Acc@5 79.812 loss 2.126
Accuracy of the model on the 50000 test images: 55.7%
Max accuracy: 55.68%
Epoch: [10]  [   0/1251]  eta: 1:28:31  lr: 0.001750  min_lr: 0.001750  loss: 4.8611 (4.8611)  weight_decay: 0.0500 (0.0500)  time: 4.2458  data: 3.4522  max mem: 69511
Epoch: [10]  [ 200/1251]  eta: 0:14:05  lr: 0.001778  min_lr: 0.001778  loss: 4.8224 (4.7858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8542 (2.0153)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [10]  [ 400/1251]  eta: 0:11:17  lr: 0.001806  min_lr: 0.001806  loss: 4.7738 (4.7722)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1022 (1.9907)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [10]  [ 600/1251]  eta: 0:08:37  lr: 0.001834  min_lr: 0.001834  loss: 4.8946 (4.7649)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6879 (1.9521)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [10]  [ 800/1251]  eta: 0:05:57  lr: 0.001862  min_lr: 0.001862  loss: 4.4190 (4.7312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7224 (1.9639)  time: 0.7873  data: 0.0003  max mem: 69511
Epoch: [10]  [1000/1251]  eta: 0:03:18  lr: 0.001891  min_lr: 0.001891  loss: 4.5286 (4.7186)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7393 (1.9527)  time: 0.7840  data: 0.0004  max mem: 69511
Epoch: [10]  [1200/1251]  eta: 0:00:40  lr: 0.001919  min_lr: 0.001919  loss: 4.0254 (4.6987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6818 (1.9474)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.001925  min_lr: 0.001925  loss: 4.6910 (4.6951)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6793 (1.9389)  time: 0.6667  data: 0.0005  max mem: 69511
Epoch: [10] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.001925  min_lr: 0.001925  loss: 4.6910 (4.7201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6793 (1.9389)
Test:  [ 0/25]  eta: 0:03:20  loss: 1.5393 (1.5393)  acc1: 72.8000 (72.8000)  acc5: 88.0000 (88.0000)  time: 8.0110  data: 7.5920  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.5856 (1.7048)  acc1: 67.2000 (64.1455)  acc5: 88.0000 (88.0727)  time: 1.0888  data: 0.6904  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 2.1436 (2.0416)  acc1: 52.8000 (57.4667)  acc5: 78.8000 (81.7714)  time: 0.3966  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.2785 (2.0513)  acc1: 53.2000 (57.5040)  acc5: 76.8000 (81.5520)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7069 s / it)
* Acc@1 57.682 Acc@5 81.494 loss 2.040
Accuracy of the model on the 50000 test images: 57.7%
Max accuracy: 57.68%
Epoch: [11]  [   0/1251]  eta: 1:25:43  lr: 0.001925  min_lr: 0.001925  loss: 5.2656 (5.2656)  weight_decay: 0.0500 (0.0500)  time: 4.1112  data: 3.3262  max mem: 69511
Epoch: [11]  [ 200/1251]  eta: 0:14:02  lr: 0.001953  min_lr: 0.001953  loss: 4.9891 (4.6591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8702 (1.8267)  time: 0.7828  data: 0.0005  max mem: 69511
Epoch: [11]  [ 400/1251]  eta: 0:11:15  lr: 0.001981  min_lr: 0.001981  loss: 4.7530 (4.6738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7736 (1.8315)  time: 0.8000  data: 0.0005  max mem: 69511
Epoch: [11]  [ 600/1251]  eta: 0:08:35  lr: 0.002009  min_lr: 0.002009  loss: 4.7769 (4.6461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7653 (1.7808)  time: 0.7875  data: 0.0005  max mem: 69511
Epoch: [11]  [ 800/1251]  eta: 0:05:56  lr: 0.002038  min_lr: 0.002038  loss: 4.9918 (4.6444)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6050 (1.7748)  time: 0.7832  data: 0.0004  max mem: 69511
Epoch: [11]  [1000/1251]  eta: 0:03:18  lr: 0.002066  min_lr: 0.002066  loss: 4.6874 (4.6466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5364 (1.7600)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [11]  [1200/1251]  eta: 0:00:40  lr: 0.002094  min_lr: 0.002094  loss: 4.9916 (4.6404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7476 (1.7603)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002100  min_lr: 0.002100  loss: 4.8149 (4.6401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7476 (1.7595)  time: 0.6680  data: 0.0005  max mem: 69511
Epoch: [11] Total time: 0:16:24 (0.7873 s / it)
Averaged stats: lr: 0.002100  min_lr: 0.002100  loss: 4.8149 (4.6226)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7476 (1.7595)
Test:  [ 0/25]  eta: 0:03:09  loss: 1.5123 (1.5123)  acc1: 72.0000 (72.0000)  acc5: 88.8000 (88.8000)  time: 7.5967  data: 7.1633  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.5123 (1.6309)  acc1: 70.0000 (66.7273)  acc5: 90.4000 (88.5818)  time: 1.0520  data: 0.6516  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 2.0540 (1.9584)  acc1: 56.0000 (60.3619)  acc5: 80.8000 (83.1429)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.1224 (1.9709)  acc1: 56.0000 (60.2880)  acc5: 78.8000 (82.8160)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6923 s / it)
* Acc@1 59.900 Acc@5 82.874 loss 1.959
Accuracy of the model on the 50000 test images: 59.9%
Max accuracy: 59.90%
Epoch: [12]  [   0/1251]  eta: 1:17:01  lr: 0.002100  min_lr: 0.002100  loss: 3.8941 (3.8941)  weight_decay: 0.0500 (0.0500)  time: 3.6943  data: 2.8998  max mem: 69511
Epoch: [12]  [ 200/1251]  eta: 0:14:01  lr: 0.002128  min_lr: 0.002128  loss: 4.4243 (4.6013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4793 (1.5272)  time: 0.7908  data: 0.0008  max mem: 69511
Epoch: [12]  [ 400/1251]  eta: 0:11:16  lr: 0.002156  min_lr: 0.002156  loss: 4.0696 (4.5510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7358 (1.5811)  time: 0.7862  data: 0.0008  max mem: 69511
Epoch: [12]  [ 600/1251]  eta: 0:08:35  lr: 0.002184  min_lr: 0.002184  loss: 4.6413 (4.5565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3196 (1.5401)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [12]  [ 800/1251]  eta: 0:05:56  lr: 0.002213  min_lr: 0.002213  loss: 4.6833 (4.5687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7008 (1.5546)  time: 0.7997  data: 0.0006  max mem: 69511
Epoch: [12]  [1000/1251]  eta: 0:03:18  lr: 0.002241  min_lr: 0.002241  loss: 4.3386 (4.5654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6114 (1.5404)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [12]  [1200/1251]  eta: 0:00:40  lr: 0.002269  min_lr: 0.002269  loss: 5.1255 (4.5642)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5562 (1.5398)  time: 0.7862  data: 0.0006  max mem: 69511
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002275  min_lr: 0.002275  loss: 4.8382 (4.5657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5871 (1.5389)  time: 0.6681  data: 0.0006  max mem: 69511
Epoch: [12] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.002275  min_lr: 0.002275  loss: 4.8382 (4.5552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5871 (1.5389)
Test:  [ 0/25]  eta: 0:03:05  loss: 1.5305 (1.5305)  acc1: 72.0000 (72.0000)  acc5: 91.2000 (91.2000)  time: 7.4212  data: 6.9925  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.5664 (1.6294)  acc1: 72.0000 (67.1636)  acc5: 91.2000 (89.4909)  time: 1.0354  data: 0.6360  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.9509 (1.9692)  acc1: 55.6000 (60.3619)  acc5: 80.4000 (83.8095)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.2173 (1.9867)  acc1: 53.6000 (60.1600)  acc5: 79.2000 (83.5520)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6849 s / it)
* Acc@1 60.640 Acc@5 83.582 loss 1.972
Accuracy of the model on the 50000 test images: 60.6%
Max accuracy: 60.64%
Epoch: [13]  [   0/1251]  eta: 1:11:31  lr: 0.002275  min_lr: 0.002275  loss: 5.1026 (5.1026)  weight_decay: 0.0500 (0.0500)  time: 3.4307  data: 2.6403  max mem: 69511
Epoch: [13]  [ 200/1251]  eta: 0:14:04  lr: 0.002303  min_lr: 0.002303  loss: 4.7617 (4.5414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4437 (1.4659)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [13]  [ 400/1251]  eta: 0:11:16  lr: 0.002331  min_lr: 0.002331  loss: 4.7389 (4.5165)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3390 (1.4493)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [13]  [ 600/1251]  eta: 0:08:35  lr: 0.002360  min_lr: 0.002360  loss: 4.8368 (4.4983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3722 (1.4482)  time: 0.7913  data: 0.0005  max mem: 69511
Epoch: [13]  [ 800/1251]  eta: 0:05:57  lr: 0.002388  min_lr: 0.002388  loss: 4.6501 (4.4947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3826 (1.4241)  time: 0.7886  data: 0.0005  max mem: 69511
Epoch: [13]  [1000/1251]  eta: 0:03:18  lr: 0.002416  min_lr: 0.002416  loss: 4.6584 (4.4808)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2801 (1.4055)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [13]  [1200/1251]  eta: 0:00:40  lr: 0.002444  min_lr: 0.002444  loss: 4.1463 (4.4743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3642 (1.3990)  time: 0.7990  data: 0.0005  max mem: 69511
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002450  min_lr: 0.002450  loss: 4.6368 (4.4763)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3764 (1.4018)  time: 0.6726  data: 0.0006  max mem: 69511
Epoch: [13] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.002450  min_lr: 0.002450  loss: 4.6368 (4.4839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3764 (1.4018)
Test:  [ 0/25]  eta: 0:03:16  loss: 1.3854 (1.3854)  acc1: 73.6000 (73.6000)  acc5: 92.0000 (92.0000)  time: 7.8457  data: 7.4140  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.3854 (1.5269)  acc1: 73.6000 (69.8182)  acc5: 92.0000 (90.1091)  time: 1.0745  data: 0.6743  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.9759 (1.8538)  acc1: 57.6000 (62.6095)  acc5: 82.0000 (84.6857)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 2.0526 (1.8710)  acc1: 57.6000 (62.4160)  acc5: 80.4000 (84.3680)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7007 s / it)
* Acc@1 62.542 Acc@5 84.708 loss 1.861
Accuracy of the model on the 50000 test images: 62.5%
Max accuracy: 62.54%
Epoch: [14]  [   0/1251]  eta: 1:28:23  lr: 0.002450  min_lr: 0.002450  loss: 5.1695 (5.1695)  weight_decay: 0.0500 (0.0500)  time: 4.2396  data: 3.4435  max mem: 69511
Epoch: [14]  [ 200/1251]  eta: 0:14:01  lr: 0.002478  min_lr: 0.002478  loss: 4.8692 (4.4390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4637 (1.3510)  time: 0.7834  data: 0.0005  max mem: 69511
Epoch: [14]  [ 400/1251]  eta: 0:11:14  lr: 0.002506  min_lr: 0.002506  loss: 4.2781 (4.4040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4613 (1.3532)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [14]  [ 600/1251]  eta: 0:08:35  lr: 0.002535  min_lr: 0.002535  loss: 4.8206 (4.3889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2866 (1.3091)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [14]  [ 800/1251]  eta: 0:05:56  lr: 0.002563  min_lr: 0.002563  loss: 4.6836 (4.3852)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3359 (1.2840)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [14]  [1000/1251]  eta: 0:03:18  lr: 0.002591  min_lr: 0.002591  loss: 4.5121 (4.4006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3052 (1.2814)  time: 0.7935  data: 0.0005  max mem: 69511
Epoch: [14]  [1200/1251]  eta: 0:00:40  lr: 0.002619  min_lr: 0.002619  loss: 4.5193 (4.3998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0456 (1.2657)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.002625  min_lr: 0.002625  loss: 4.3370 (4.3979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (1.2602)  time: 0.6681  data: 0.0005  max mem: 69511
Epoch: [14] Total time: 0:16:26 (0.7882 s / it)
Averaged stats: lr: 0.002625  min_lr: 0.002625  loss: 4.3370 (4.4387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (1.2602)
Test:  [ 0/25]  eta: 0:02:27  loss: 1.2899 (1.2899)  acc1: 77.2000 (77.2000)  acc5: 91.2000 (91.2000)  time: 5.8826  data: 5.4238  max mem: 69511
Test:  [10/25]  eta: 0:00:13  loss: 1.3651 (1.4292)  acc1: 69.6000 (69.6364)  acc5: 91.2000 (90.4000)  time: 0.9078  data: 0.5052  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.7626 (1.7316)  acc1: 60.0000 (64.1143)  acc5: 82.4000 (85.8095)  time: 0.4035  data: 0.0067  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.9918 (1.7465)  acc1: 59.2000 (63.6800)  acc5: 81.6000 (85.6480)  time: 0.3969  data: 0.0002  max mem: 69511
Test: Total time: 0:00:15 (0.6287 s / it)
* Acc@1 63.364 Acc@5 85.616 loss 1.741
Accuracy of the model on the 50000 test images: 63.4%
Max accuracy: 63.36%
Epoch: [15]  [   0/1251]  eta: 1:34:30  lr: 0.002625  min_lr: 0.002625  loss: 4.5041 (4.5041)  weight_decay: 0.0500 (0.0500)  time: 4.5324  data: 3.7430  max mem: 69511
Epoch: [15]  [ 200/1251]  eta: 0:14:03  lr: 0.002653  min_lr: 0.002653  loss: 4.6919 (4.4394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1901 (1.1987)  time: 0.7906  data: 0.0004  max mem: 69511
Epoch: [15]  [ 400/1251]  eta: 0:11:16  lr: 0.002682  min_lr: 0.002682  loss: 4.4771 (4.4020)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2066 (1.1734)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [15]  [ 600/1251]  eta: 0:08:35  lr: 0.002710  min_lr: 0.002710  loss: 4.1669 (4.3975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0162 (1.1644)  time: 0.7853  data: 0.0006  max mem: 69511
Epoch: [15]  [ 800/1251]  eta: 0:05:56  lr: 0.002738  min_lr: 0.002738  loss: 4.6112 (4.4011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9814 (1.1590)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [15]  [1000/1251]  eta: 0:03:18  lr: 0.002766  min_lr: 0.002766  loss: 4.6331 (4.3948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9740 (1.1473)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [15]  [1200/1251]  eta: 0:00:40  lr: 0.002794  min_lr: 0.002794  loss: 4.4320 (4.3883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0187 (1.1347)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.4532 (4.3826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0327 (1.1321)  time: 0.6674  data: 0.0004  max mem: 69511
Epoch: [15] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.4532 (4.3839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0327 (1.1321)
Test:  [ 0/25]  eta: 0:03:08  loss: 1.1828 (1.1828)  acc1: 77.2000 (77.2000)  acc5: 94.0000 (94.0000)  time: 7.5348  data: 7.1164  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.3097 (1.3817)  acc1: 70.4000 (70.1455)  acc5: 92.4000 (90.9091)  time: 1.0461  data: 0.6472  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.7605 (1.6692)  acc1: 58.0000 (64.3429)  acc5: 83.6000 (86.4191)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.8561 (1.6803)  acc1: 58.0000 (64.2080)  acc5: 82.8000 (86.2240)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6880 s / it)
* Acc@1 64.376 Acc@5 86.144 loss 1.685
Accuracy of the model on the 50000 test images: 64.4%
Max accuracy: 64.38%
Epoch: [16]  [   0/1251]  eta: 1:28:02  lr: 0.002800  min_lr: 0.002800  loss: 4.4837 (4.4837)  weight_decay: 0.0500 (0.0500)  time: 4.2227  data: 3.4265  max mem: 69511
Epoch: [16]  [ 200/1251]  eta: 0:14:07  lr: 0.002828  min_lr: 0.002828  loss: 4.2099 (4.3866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0740 (1.1278)  time: 0.8109  data: 0.0005  max mem: 69511
Epoch: [16]  [ 400/1251]  eta: 0:11:18  lr: 0.002857  min_lr: 0.002857  loss: 4.3330 (4.3830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0121 (1.1011)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [16]  [ 600/1251]  eta: 0:08:36  lr: 0.002885  min_lr: 0.002885  loss: 4.6365 (4.3915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9748 (1.0865)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [16]  [ 800/1251]  eta: 0:05:57  lr: 0.002913  min_lr: 0.002913  loss: 4.2015 (4.3756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9836 (1.0714)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [16]  [1000/1251]  eta: 0:03:18  lr: 0.002941  min_lr: 0.002941  loss: 4.3909 (4.3649)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0151 (1.0629)  time: 0.7886  data: 0.0006  max mem: 69511
Epoch: [16]  [1200/1251]  eta: 0:00:40  lr: 0.002969  min_lr: 0.002969  loss: 4.5624 (4.3520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9559 (1.0557)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.002975  min_lr: 0.002975  loss: 4.3924 (4.3518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0294 (1.0582)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [16] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.002975  min_lr: 0.002975  loss: 4.3924 (4.3426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0294 (1.0582)
Test:  [ 0/25]  eta: 0:02:57  loss: 1.2369 (1.2369)  acc1: 78.0000 (78.0000)  acc5: 93.2000 (93.2000)  time: 7.1116  data: 6.6708  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.3185 (1.4528)  acc1: 75.6000 (71.0182)  acc5: 92.8000 (91.4909)  time: 1.0082  data: 0.6068  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.8594 (1.7384)  acc1: 60.4000 (65.1810)  acc5: 84.4000 (86.7810)  time: 0.3974  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.8870 (1.7514)  acc1: 60.8000 (65.0080)  acc5: 84.4000 (86.6080)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6724 s / it)
* Acc@1 64.512 Acc@5 86.472 loss 1.753
Accuracy of the model on the 50000 test images: 64.5%
Max accuracy: 64.51%
Epoch: [17]  [   0/1251]  eta: 1:27:22  lr: 0.002975  min_lr: 0.002975  loss: 4.5834 (4.5834)  weight_decay: 0.0500 (0.0500)  time: 4.1906  data: 3.4072  max mem: 69511
Epoch: [17]  [ 200/1251]  eta: 0:14:09  lr: 0.003004  min_lr: 0.003004  loss: 4.4182 (4.3029)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0857 (1.0205)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [17]  [ 400/1251]  eta: 0:11:18  lr: 0.003032  min_lr: 0.003032  loss: 4.7423 (4.3065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.9683)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [17]  [ 600/1251]  eta: 0:08:36  lr: 0.003060  min_lr: 0.003060  loss: 4.3546 (4.2844)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0181 (0.9851)  time: 0.7943  data: 0.0004  max mem: 69511
Epoch: [17]  [ 800/1251]  eta: 0:05:57  lr: 0.003088  min_lr: 0.003088  loss: 3.7499 (4.2773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8663 (0.9637)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [17]  [1000/1251]  eta: 0:03:18  lr: 0.003116  min_lr: 0.003116  loss: 4.3501 (4.2598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8859 (0.9671)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [17]  [1200/1251]  eta: 0:00:40  lr: 0.003144  min_lr: 0.003144  loss: 4.3303 (4.2668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (0.9621)  time: 0.7892  data: 0.0004  max mem: 69511
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003150  min_lr: 0.003150  loss: 4.3301 (4.2631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7942 (0.9557)  time: 0.6688  data: 0.0005  max mem: 69511
Epoch: [17] Total time: 0:16:27 (0.7897 s / it)
Averaged stats: lr: 0.003150  min_lr: 0.003150  loss: 4.3301 (4.2876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7942 (0.9557)
Test:  [ 0/25]  eta: 0:03:04  loss: 1.1825 (1.1825)  acc1: 79.2000 (79.2000)  acc5: 92.8000 (92.8000)  time: 7.3949  data: 6.9539  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.2710 (1.3673)  acc1: 74.0000 (71.4909)  acc5: 93.2000 (91.7455)  time: 1.0334  data: 0.6325  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.7591 (1.6444)  acc1: 61.6000 (66.0381)  acc5: 85.6000 (87.5429)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.8579 (1.6599)  acc1: 61.6000 (65.5840)  acc5: 83.6000 (87.1840)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6844 s / it)
* Acc@1 65.984 Acc@5 87.324 loss 1.656
Accuracy of the model on the 50000 test images: 66.0%
Max accuracy: 65.98%
Epoch: [18]  [   0/1251]  eta: 1:17:23  lr: 0.003151  min_lr: 0.003151  loss: 4.5723 (4.5723)  weight_decay: 0.0500 (0.0500)  time: 3.7115  data: 2.9297  max mem: 69511
Epoch: [18]  [ 200/1251]  eta: 0:14:01  lr: 0.003179  min_lr: 0.003179  loss: 4.5533 (4.2486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9112 (0.9335)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [18]  [ 400/1251]  eta: 0:11:16  lr: 0.003207  min_lr: 0.003207  loss: 4.5024 (4.2181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (0.9387)  time: 0.7924  data: 0.0004  max mem: 69511
Epoch: [18]  [ 600/1251]  eta: 0:08:36  lr: 0.003235  min_lr: 0.003235  loss: 4.1371 (4.2304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8776 (0.9094)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [18]  [ 800/1251]  eta: 0:05:57  lr: 0.003263  min_lr: 0.003263  loss: 4.4770 (4.2513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8563 (0.8840)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [18]  [1000/1251]  eta: 0:03:18  lr: 0.003291  min_lr: 0.003291  loss: 4.6286 (4.2526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8461 (0.8888)  time: 0.8055  data: 0.0005  max mem: 69511
Epoch: [18]  [1200/1251]  eta: 0:00:40  lr: 0.003319  min_lr: 0.003319  loss: 4.3170 (4.2682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8210 (0.8755)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003325  min_lr: 0.003325  loss: 4.0191 (4.2631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9286 (0.8776)  time: 0.6652  data: 0.0005  max mem: 69511
Epoch: [18] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.003325  min_lr: 0.003325  loss: 4.0191 (4.2580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9286 (0.8776)
Test:  [ 0/25]  eta: 0:03:15  loss: 1.1218 (1.1218)  acc1: 76.0000 (76.0000)  acc5: 93.6000 (93.6000)  time: 7.8276  data: 7.4004  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.2076 (1.3207)  acc1: 76.0000 (72.6909)  acc5: 93.6000 (92.5455)  time: 1.0702  data: 0.6730  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.7213 (1.6223)  acc1: 63.2000 (66.6857)  acc5: 87.2000 (88.0000)  time: 0.3946  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.8500 (1.6355)  acc1: 63.2000 (66.4480)  acc5: 84.4000 (87.8560)  time: 0.3947  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7028 s / it)
* Acc@1 65.950 Acc@5 87.192 loss 1.644
Accuracy of the model on the 50000 test images: 66.0%
Max accuracy: 65.98%
Epoch: [19]  [   0/1251]  eta: 1:34:54  lr: 0.003326  min_lr: 0.003326  loss: 3.5051 (3.5051)  weight_decay: 0.0500 (0.0500)  time: 4.5517  data: 2.1852  max mem: 69511
Epoch: [19]  [ 200/1251]  eta: 0:14:05  lr: 0.003354  min_lr: 0.003354  loss: 4.5088 (4.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9060 (0.8959)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [19]  [ 400/1251]  eta: 0:11:18  lr: 0.003382  min_lr: 0.003382  loss: 4.2894 (4.1986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.8517)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [19]  [ 600/1251]  eta: 0:08:37  lr: 0.003410  min_lr: 0.003410  loss: 4.6405 (4.2239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7741 (0.8324)  time: 0.7869  data: 0.0006  max mem: 69511
Epoch: [19]  [ 800/1251]  eta: 0:05:57  lr: 0.003438  min_lr: 0.003438  loss: 4.1772 (4.2152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7703 (0.8219)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [19]  [1000/1251]  eta: 0:03:18  lr: 0.003466  min_lr: 0.003466  loss: 4.0939 (4.2053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6502 (0.8073)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [19]  [1200/1251]  eta: 0:00:40  lr: 0.003494  min_lr: 0.003494  loss: 4.2260 (4.2052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6399 (0.7922)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.003500  min_lr: 0.003500  loss: 4.1372 (4.2043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.7873)  time: 0.6681  data: 0.0007  max mem: 69511
Epoch: [19] Total time: 0:16:27 (0.7898 s / it)
Averaged stats: lr: 0.003500  min_lr: 0.003500  loss: 4.1372 (4.2026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.7873)
Test:  [ 0/25]  eta: 0:03:10  loss: 1.0826 (1.0826)  acc1: 78.0000 (78.0000)  acc5: 94.0000 (94.0000)  time: 7.6054  data: 7.1956  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1699 (1.2851)  acc1: 76.0000 (72.5455)  acc5: 94.0000 (92.4364)  time: 1.0530  data: 0.6545  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6884 (1.5503)  acc1: 64.0000 (67.2381)  acc5: 88.4000 (88.4000)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.7018 (1.5643)  acc1: 62.8000 (66.8640)  acc5: 86.4000 (88.2720)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6932 s / it)
* Acc@1 67.118 Acc@5 88.102 loss 1.561
Accuracy of the model on the 50000 test images: 67.1%
Max accuracy: 67.12%
Epoch: [20]  [   0/1251]  eta: 1:33:26  lr: 0.003500  min_lr: 0.003500  loss: 3.4260 (3.4260)  weight_decay: 0.0500 (0.0500)  time: 4.4818  data: 3.6920  max mem: 69511
Epoch: [20]  [ 200/1251]  eta: 0:14:09  lr: 0.003500  min_lr: 0.003500  loss: 4.1513 (4.2109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7582 (0.7560)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [20]  [ 400/1251]  eta: 0:11:18  lr: 0.003500  min_lr: 0.003500  loss: 3.9011 (4.1663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7325 (0.7492)  time: 0.7939  data: 0.0004  max mem: 69511
Epoch: [20]  [ 600/1251]  eta: 0:08:36  lr: 0.003500  min_lr: 0.003500  loss: 4.5525 (4.1855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6447 (0.7359)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [20]  [ 800/1251]  eta: 0:05:57  lr: 0.003500  min_lr: 0.003500  loss: 4.2678 (4.1892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7437 (0.7453)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [20]  [1000/1251]  eta: 0:03:18  lr: 0.003500  min_lr: 0.003500  loss: 4.1434 (4.1697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (0.7483)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [20]  [1200/1251]  eta: 0:00:40  lr: 0.003500  min_lr: 0.003500  loss: 4.1208 (4.1745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.7388)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.003500  min_lr: 0.003500  loss: 3.4786 (4.1656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7209 (0.7394)  time: 0.6691  data: 0.0005  max mem: 69511
Epoch: [20] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.003500  min_lr: 0.003500  loss: 3.4786 (4.1680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7209 (0.7394)
Test:  [ 0/25]  eta: 0:03:07  loss: 1.0904 (1.0904)  acc1: 80.0000 (80.0000)  acc5: 93.6000 (93.6000)  time: 7.4837  data: 7.0430  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1860 (1.2828)  acc1: 74.4000 (73.4545)  acc5: 93.6000 (92.9091)  time: 1.0413  data: 0.6406  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6615 (1.5395)  acc1: 64.0000 (67.7714)  acc5: 86.4000 (88.9333)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6791 (1.5525)  acc1: 64.0000 (67.4080)  acc5: 85.2000 (88.6240)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6870 s / it)
* Acc@1 67.288 Acc@5 88.438 loss 1.551
Accuracy of the model on the 50000 test images: 67.3%
Max accuracy: 67.29%
Epoch: [21]  [   0/1251]  eta: 1:24:06  lr: 0.003500  min_lr: 0.003500  loss: 4.6497 (4.6497)  weight_decay: 0.0500 (0.0500)  time: 4.0340  data: 3.2369  max mem: 69511
Epoch: [21]  [ 200/1251]  eta: 0:14:04  lr: 0.003500  min_lr: 0.003500  loss: 4.2033 (4.1315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (0.7341)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [21]  [ 400/1251]  eta: 0:11:16  lr: 0.003500  min_lr: 0.003500  loss: 3.6743 (4.1488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6706 (0.7100)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [21]  [ 600/1251]  eta: 0:08:36  lr: 0.003500  min_lr: 0.003500  loss: 4.4734 (4.1472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6763 (0.6968)  time: 0.7914  data: 0.0004  max mem: 69511
Epoch: [21]  [ 800/1251]  eta: 0:05:56  lr: 0.003500  min_lr: 0.003500  loss: 4.2513 (4.1390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.6935)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [21]  [1000/1251]  eta: 0:03:18  lr: 0.003500  min_lr: 0.003500  loss: 3.7017 (4.1222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6312 (0.6978)  time: 0.7843  data: 0.0005  max mem: 69511
Epoch: [21]  [1200/1251]  eta: 0:00:40  lr: 0.003500  min_lr: 0.003500  loss: 4.0901 (4.1270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7311 (0.7015)  time: 0.7820  data: 0.0006  max mem: 69511
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003500  min_lr: 0.003500  loss: 4.1265 (4.1301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.7014)  time: 0.6649  data: 0.0007  max mem: 69511
Epoch: [21] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.003500  min_lr: 0.003500  loss: 4.1265 (4.1300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.7014)
Test:  [ 0/25]  eta: 0:03:05  loss: 1.1028 (1.1028)  acc1: 80.4000 (80.4000)  acc5: 95.6000 (95.6000)  time: 7.4313  data: 7.0235  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1761 (1.2876)  acc1: 74.4000 (73.4182)  acc5: 94.0000 (93.1273)  time: 1.0363  data: 0.6388  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6208 (1.5612)  acc1: 65.2000 (68.5143)  acc5: 88.0000 (88.7429)  time: 0.3957  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.7180 (1.5728)  acc1: 64.0000 (68.0320)  acc5: 85.6000 (88.5280)  time: 0.3947  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6914 s / it)
* Acc@1 67.698 Acc@5 88.688 loss 1.575
Accuracy of the model on the 50000 test images: 67.7%
Max accuracy: 67.70%
Epoch: [22]  [   0/1251]  eta: 1:33:19  lr: 0.003500  min_lr: 0.003500  loss: 4.4823 (4.4823)  weight_decay: 0.0500 (0.0500)  time: 4.4761  data: 3.6826  max mem: 69511
Epoch: [22]  [ 200/1251]  eta: 0:14:03  lr: 0.003499  min_lr: 0.003499  loss: 4.2748 (4.1408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6592 (0.6714)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [22]  [ 400/1251]  eta: 0:11:17  lr: 0.003499  min_lr: 0.003499  loss: 4.2882 (4.1039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (0.6867)  time: 0.7961  data: 0.0004  max mem: 69511
Epoch: [22]  [ 600/1251]  eta: 0:08:36  lr: 0.003499  min_lr: 0.003499  loss: 4.3756 (4.0894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6026 (0.6820)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [22]  [ 800/1251]  eta: 0:05:57  lr: 0.003499  min_lr: 0.003499  loss: 4.2077 (4.0779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7312 (0.6841)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [22]  [1000/1251]  eta: 0:03:18  lr: 0.003499  min_lr: 0.003499  loss: 4.0116 (4.0734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5778 (0.6755)  time: 0.7878  data: 0.0004  max mem: 69511
Epoch: [22]  [1200/1251]  eta: 0:00:40  lr: 0.003499  min_lr: 0.003499  loss: 4.2113 (4.0708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.6823)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003499  min_lr: 0.003499  loss: 3.9897 (4.0675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.6812)  time: 0.6683  data: 0.0006  max mem: 69511
Epoch: [22] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003499  min_lr: 0.003499  loss: 3.9897 (4.0769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.6812)
Test:  [ 0/25]  eta: 0:03:02  loss: 1.1936 (1.1936)  acc1: 80.0000 (80.0000)  acc5: 94.0000 (94.0000)  time: 7.2810  data: 6.8674  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1936 (1.2514)  acc1: 75.6000 (74.6909)  acc5: 94.0000 (92.9818)  time: 1.0232  data: 0.6246  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6029 (1.5208)  acc1: 65.6000 (69.0476)  acc5: 87.2000 (89.0857)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.7310 (1.5327)  acc1: 64.4000 (68.5760)  acc5: 86.0000 (88.9440)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6771 s / it)
* Acc@1 68.384 Acc@5 89.018 loss 1.527
Accuracy of the model on the 50000 test images: 68.4%
Max accuracy: 68.38%
Epoch: [23]  [   0/1251]  eta: 1:24:32  lr: 0.003499  min_lr: 0.003499  loss: 4.4158 (4.4158)  weight_decay: 0.0500 (0.0500)  time: 4.0547  data: 3.2624  max mem: 69511
Epoch: [23]  [ 200/1251]  eta: 0:14:03  lr: 0.003499  min_lr: 0.003499  loss: 3.6532 (4.0220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6623 (0.6718)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [23]  [ 400/1251]  eta: 0:11:18  lr: 0.003499  min_lr: 0.003499  loss: 4.2334 (4.0614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6070 (0.6629)  time: 0.7864  data: 0.0006  max mem: 69511
Epoch: [23]  [ 600/1251]  eta: 0:08:36  lr: 0.003499  min_lr: 0.003499  loss: 4.4099 (4.0767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6175 (0.6456)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [23]  [ 800/1251]  eta: 0:05:57  lr: 0.003499  min_lr: 0.003499  loss: 4.2156 (4.0681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7143 (0.6521)  time: 0.7929  data: 0.0004  max mem: 69511
Epoch: [23]  [1000/1251]  eta: 0:03:18  lr: 0.003498  min_lr: 0.003498  loss: 4.3392 (4.0630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5699 (0.6433)  time: 0.7881  data: 0.0005  max mem: 69511
Epoch: [23]  [1200/1251]  eta: 0:00:40  lr: 0.003498  min_lr: 0.003498  loss: 3.7548 (4.0604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5295 (0.6359)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003498  min_lr: 0.003498  loss: 3.9046 (4.0601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5330 (0.6339)  time: 0.6680  data: 0.0006  max mem: 69511
Epoch: [23] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.003498  min_lr: 0.003498  loss: 3.9046 (4.0380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5330 (0.6339)
Test:  [ 0/25]  eta: 0:03:08  loss: 1.0942 (1.0942)  acc1: 79.2000 (79.2000)  acc5: 95.2000 (95.2000)  time: 7.5288  data: 7.1067  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.2217 (1.3174)  acc1: 74.8000 (74.3636)  acc5: 94.0000 (93.0182)  time: 1.0453  data: 0.6463  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6436 (1.5573)  acc1: 65.6000 (68.8381)  acc5: 88.4000 (89.3333)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6379 (1.5589)  acc1: 65.6000 (68.4960)  acc5: 86.4000 (89.1360)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6880 s / it)
* Acc@1 68.440 Acc@5 88.896 loss 1.562
Accuracy of the model on the 50000 test images: 68.4%
Max accuracy: 68.44%
Epoch: [24]  [   0/1251]  eta: 1:23:59  lr: 0.003498  min_lr: 0.003498  loss: 4.5011 (4.5011)  weight_decay: 0.0500 (0.0500)  time: 4.0286  data: 3.2499  max mem: 69511
Epoch: [24]  [ 200/1251]  eta: 0:14:07  lr: 0.003498  min_lr: 0.003498  loss: 3.7464 (3.9940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5994 (0.6122)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [24]  [ 400/1251]  eta: 0:11:18  lr: 0.003498  min_lr: 0.003498  loss: 4.2736 (4.0169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6185 (0.6093)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [24]  [ 600/1251]  eta: 0:08:36  lr: 0.003498  min_lr: 0.003498  loss: 4.0994 (4.0252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (0.6200)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [24]  [ 800/1251]  eta: 0:05:57  lr: 0.003498  min_lr: 0.003498  loss: 4.1455 (4.0267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6204 (0.6214)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [24]  [1000/1251]  eta: 0:03:18  lr: 0.003497  min_lr: 0.003497  loss: 4.2316 (4.0053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6589 (0.6316)  time: 0.7832  data: 0.0005  max mem: 69511
Epoch: [24]  [1200/1251]  eta: 0:00:40  lr: 0.003497  min_lr: 0.003497  loss: 3.9373 (4.0009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5670 (0.6255)  time: 0.7969  data: 0.0005  max mem: 69511
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003497  min_lr: 0.003497  loss: 4.0180 (3.9987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5901 (0.6238)  time: 0.6675  data: 0.0006  max mem: 69511
Epoch: [24] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.003497  min_lr: 0.003497  loss: 4.0180 (4.0070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5901 (0.6238)
Test:  [ 0/25]  eta: 0:03:04  loss: 1.1426 (1.1426)  acc1: 80.0000 (80.0000)  acc5: 94.8000 (94.8000)  time: 7.3701  data: 6.9559  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.2163 (1.2803)  acc1: 75.2000 (74.6545)  acc5: 94.4000 (93.2727)  time: 1.0308  data: 0.6326  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5587 (1.5316)  acc1: 67.2000 (69.4286)  acc5: 89.6000 (89.1619)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.7050 (1.5485)  acc1: 64.4000 (68.9760)  acc5: 86.4000 (89.0240)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6826 s / it)
* Acc@1 68.750 Acc@5 89.304 loss 1.539
Accuracy of the model on the 50000 test images: 68.8%
Max accuracy: 68.75%
Epoch: [25]  [   0/1251]  eta: 1:24:21  lr: 0.003497  min_lr: 0.003497  loss: 4.3195 (4.3195)  weight_decay: 0.0500 (0.0500)  time: 4.0463  data: 3.2597  max mem: 69511
Epoch: [25]  [ 200/1251]  eta: 0:14:02  lr: 0.003497  min_lr: 0.003497  loss: 3.8684 (4.0407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5685 (0.5862)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [25]  [ 400/1251]  eta: 0:11:16  lr: 0.003497  min_lr: 0.003497  loss: 4.1697 (4.0355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6179 (0.6007)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [25]  [ 600/1251]  eta: 0:08:36  lr: 0.003497  min_lr: 0.003497  loss: 3.5904 (4.0044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5411 (0.5858)  time: 0.7830  data: 0.0006  max mem: 69511
Epoch: [25]  [ 800/1251]  eta: 0:05:56  lr: 0.003496  min_lr: 0.003496  loss: 4.0991 (4.0032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5337 (0.5764)  time: 0.7842  data: 0.0005  max mem: 69511
Epoch: [25]  [1000/1251]  eta: 0:03:18  lr: 0.003496  min_lr: 0.003496  loss: 4.1273 (3.9979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6026 (0.5765)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [25]  [1200/1251]  eta: 0:00:40  lr: 0.003496  min_lr: 0.003496  loss: 3.8859 (3.9875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5003 (0.5783)  time: 0.7852  data: 0.0007  max mem: 69511
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003496  min_lr: 0.003496  loss: 4.0131 (3.9867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5441 (0.5779)  time: 0.6668  data: 0.0006  max mem: 69511
Epoch: [25] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.003496  min_lr: 0.003496  loss: 4.0131 (3.9756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5441 (0.5779)
Test:  [ 0/25]  eta: 0:03:00  loss: 1.1760 (1.1760)  acc1: 79.2000 (79.2000)  acc5: 95.6000 (95.6000)  time: 7.2382  data: 6.8059  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.2240 (1.2901)  acc1: 77.2000 (74.1818)  acc5: 94.0000 (93.5273)  time: 1.0165  data: 0.6190  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5989 (1.5364)  acc1: 65.2000 (68.8762)  acc5: 87.2000 (89.6571)  time: 0.3942  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6536 (1.5521)  acc1: 66.0000 (68.8320)  acc5: 86.4000 (89.4240)  time: 0.3942  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6794 s / it)
* Acc@1 69.042 Acc@5 89.406 loss 1.545
Accuracy of the model on the 50000 test images: 69.0%
Max accuracy: 69.04%
Epoch: [26]  [   0/1251]  eta: 1:23:53  lr: 0.003496  min_lr: 0.003496  loss: 3.1319 (3.1319)  weight_decay: 0.0500 (0.0500)  time: 4.0239  data: 3.2373  max mem: 69511
Epoch: [26]  [ 200/1251]  eta: 0:14:00  lr: 0.003496  min_lr: 0.003496  loss: 4.1082 (3.9257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5604 (0.5623)  time: 0.7916  data: 0.0006  max mem: 69511
Epoch: [26]  [ 400/1251]  eta: 0:11:16  lr: 0.003496  min_lr: 0.003496  loss: 3.8896 (3.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5313 (0.5618)  time: 0.7947  data: 0.0004  max mem: 69511
Epoch: [26]  [ 600/1251]  eta: 0:08:35  lr: 0.003495  min_lr: 0.003495  loss: 4.1109 (3.9218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5869 (0.5601)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [26]  [ 800/1251]  eta: 0:05:56  lr: 0.003495  min_lr: 0.003495  loss: 3.6232 (3.9294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5406 (0.5708)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [26]  [1000/1251]  eta: 0:03:18  lr: 0.003495  min_lr: 0.003495  loss: 4.0563 (3.9454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5305 (0.5720)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [26]  [1200/1251]  eta: 0:00:40  lr: 0.003495  min_lr: 0.003495  loss: 4.0023 (3.9499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4768 (0.5676)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003495  min_lr: 0.003495  loss: 4.1104 (3.9524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5320 (0.5685)  time: 0.6681  data: 0.0005  max mem: 69511
Epoch: [26] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.003495  min_lr: 0.003495  loss: 4.1104 (3.9543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5320 (0.5685)
Test:  [ 0/25]  eta: 0:03:16  loss: 1.0628 (1.0628)  acc1: 82.0000 (82.0000)  acc5: 95.2000 (95.2000)  time: 7.8551  data: 7.4374  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.2474 (1.2684)  acc1: 75.6000 (74.7273)  acc5: 94.4000 (94.0000)  time: 1.0752  data: 0.6764  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5921 (1.5491)  acc1: 64.8000 (69.0095)  acc5: 87.6000 (89.3905)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6292 (1.5534)  acc1: 64.8000 (68.9760)  acc5: 87.6000 (89.3920)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7002 s / it)
* Acc@1 69.600 Acc@5 89.630 loss 1.544
Accuracy of the model on the 50000 test images: 69.6%
Max accuracy: 69.60%
Epoch: [27]  [   0/1251]  eta: 1:26:42  lr: 0.003495  min_lr: 0.003495  loss: 3.0893 (3.0893)  weight_decay: 0.0500 (0.0500)  time: 4.1591  data: 3.3762  max mem: 69511
Epoch: [27]  [ 200/1251]  eta: 0:14:06  lr: 0.003494  min_lr: 0.003494  loss: 3.8630 (3.9399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5958 (0.5761)  time: 0.7988  data: 0.0004  max mem: 69511
Epoch: [27]  [ 400/1251]  eta: 0:11:16  lr: 0.003494  min_lr: 0.003494  loss: 4.0707 (3.9111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5050 (0.5765)  time: 0.7842  data: 0.0005  max mem: 69511
Epoch: [27]  [ 600/1251]  eta: 0:08:35  lr: 0.003494  min_lr: 0.003494  loss: 4.2110 (3.9410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5673 (0.5733)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [27]  [ 800/1251]  eta: 0:05:57  lr: 0.003494  min_lr: 0.003494  loss: 4.1039 (3.9515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5068 (0.5592)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [27]  [1000/1251]  eta: 0:03:18  lr: 0.003493  min_lr: 0.003493  loss: 3.8106 (3.9418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5869 (0.5602)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [27]  [1200/1251]  eta: 0:00:40  lr: 0.003493  min_lr: 0.003493  loss: 4.1630 (3.9396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5139 (0.5553)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003493  min_lr: 0.003493  loss: 4.1068 (3.9404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5139 (0.5550)  time: 0.6717  data: 0.0006  max mem: 69511
Epoch: [27] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.003493  min_lr: 0.003493  loss: 4.1068 (3.9291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5139 (0.5550)
Test:  [ 0/25]  eta: 0:03:14  loss: 1.1539 (1.1539)  acc1: 77.2000 (77.2000)  acc5: 96.0000 (96.0000)  time: 7.7983  data: 7.3640  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.2921 (1.3228)  acc1: 76.0000 (74.5091)  acc5: 95.2000 (94.0727)  time: 1.0701  data: 0.6697  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6383 (1.5664)  acc1: 65.6000 (69.4286)  acc5: 87.6000 (89.9048)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.7521 (1.5671)  acc1: 65.2000 (69.2320)  acc5: 86.8000 (89.8560)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7003 s / it)
* Acc@1 69.704 Acc@5 89.878 loss 1.561
Accuracy of the model on the 50000 test images: 69.7%
Max accuracy: 69.70%
Epoch: [28]  [   0/1251]  eta: 1:25:11  lr: 0.003493  min_lr: 0.003493  loss: 3.5247 (3.5247)  weight_decay: 0.0500 (0.0500)  time: 4.0860  data: 3.2955  max mem: 69511
Epoch: [28]  [ 200/1251]  eta: 0:14:04  lr: 0.003493  min_lr: 0.003493  loss: 3.3426 (3.9477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5151 (0.5288)  time: 0.7932  data: 0.0006  max mem: 69511
Epoch: [28]  [ 400/1251]  eta: 0:11:15  lr: 0.003492  min_lr: 0.003492  loss: 3.7861 (3.9280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4930 (0.5273)  time: 0.7848  data: 0.0006  max mem: 69511
Epoch: [28]  [ 600/1251]  eta: 0:08:35  lr: 0.003492  min_lr: 0.003492  loss: 3.8788 (3.9135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4465 (0.5238)  time: 0.7889  data: 0.0006  max mem: 69511
Epoch: [28]  [ 800/1251]  eta: 0:05:56  lr: 0.003492  min_lr: 0.003492  loss: 4.0860 (3.9091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4578 (0.5246)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [28]  [1000/1251]  eta: 0:03:18  lr: 0.003491  min_lr: 0.003491  loss: 4.1418 (3.9141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5089 (0.5249)  time: 0.7874  data: 0.0008  max mem: 69511
Epoch: [28]  [1200/1251]  eta: 0:00:40  lr: 0.003491  min_lr: 0.003491  loss: 3.6420 (3.9117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4917 (0.5186)  time: 0.7985  data: 0.0006  max mem: 69511
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 3.7196 (3.9101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5087 (0.5198)  time: 0.6680  data: 0.0007  max mem: 69511
Epoch: [28] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 3.7196 (3.8981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5087 (0.5198)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.9799 (0.9799)  acc1: 82.8000 (82.8000)  acc5: 95.2000 (95.2000)  time: 6.5390  data: 6.0985  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 1.1183 (1.1455)  acc1: 77.2000 (76.4364)  acc5: 94.4000 (94.2909)  time: 0.9806  data: 0.5800  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4425 (1.3849)  acc1: 66.4000 (70.7810)  acc5: 90.0000 (90.6857)  time: 0.4150  data: 0.0141  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.5286 (1.3903)  acc1: 66.4000 (70.7040)  acc5: 88.4000 (90.6400)  time: 0.4030  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6651 s / it)
* Acc@1 71.254 Acc@5 90.740 loss 1.384
Accuracy of the model on the 50000 test images: 71.3%
Max accuracy: 71.25%
Epoch: [29]  [   0/1251]  eta: 1:22:40  lr: 0.003491  min_lr: 0.003491  loss: 4.3150 (4.3150)  weight_decay: 0.0500 (0.0500)  time: 3.9651  data: 3.1850  max mem: 69511
Epoch: [29]  [ 200/1251]  eta: 0:14:02  lr: 0.003491  min_lr: 0.003491  loss: 3.9512 (3.8756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4906 (0.5596)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [29]  [ 400/1251]  eta: 0:11:15  lr: 0.003490  min_lr: 0.003490  loss: 3.8653 (3.8828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4822 (0.5275)  time: 0.7827  data: 0.0007  max mem: 69511
Epoch: [29]  [ 600/1251]  eta: 0:08:35  lr: 0.003490  min_lr: 0.003490  loss: 4.0589 (3.8970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5042 (0.5231)  time: 0.7856  data: 0.0007  max mem: 69511
Epoch: [29]  [ 800/1251]  eta: 0:05:56  lr: 0.003490  min_lr: 0.003490  loss: 4.0598 (3.8877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5031 (0.5228)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [29]  [1000/1251]  eta: 0:03:18  lr: 0.003489  min_lr: 0.003489  loss: 3.9723 (3.8817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5084 (0.5228)  time: 0.7902  data: 0.0005  max mem: 69511
Epoch: [29]  [1200/1251]  eta: 0:00:40  lr: 0.003489  min_lr: 0.003489  loss: 3.9402 (3.8773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4898 (0.5193)  time: 0.7836  data: 0.0007  max mem: 69511
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003489  min_lr: 0.003489  loss: 3.7577 (3.8698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4898 (0.5191)  time: 0.6657  data: 0.0005  max mem: 69511
Epoch: [29] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.003489  min_lr: 0.003489  loss: 3.7577 (3.8809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4898 (0.5191)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.9373 (0.9373)  acc1: 83.2000 (83.2000)  acc5: 95.6000 (95.6000)  time: 6.2787  data: 5.8488  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1611 (1.1519)  acc1: 77.2000 (76.0000)  acc5: 95.6000 (94.6546)  time: 1.0136  data: 0.6168  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4921 (1.4111)  acc1: 68.4000 (71.2762)  acc5: 88.4000 (90.6476)  time: 0.4405  data: 0.0468  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6053 (1.4339)  acc1: 68.8000 (71.0080)  acc5: 87.6000 (90.2240)  time: 0.3939  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6713 s / it)
* Acc@1 70.852 Acc@5 90.456 loss 1.426
Accuracy of the model on the 50000 test images: 70.9%
Max accuracy: 71.25%
Epoch: [30]  [   0/1251]  eta: 1:29:18  lr: 0.003489  min_lr: 0.003489  loss: 2.5470 (2.5470)  weight_decay: 0.0500 (0.0500)  time: 4.2834  data: 3.1025  max mem: 69511
Epoch: [30]  [ 200/1251]  eta: 0:14:03  lr: 0.003489  min_lr: 0.003489  loss: 4.0323 (3.8819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4488 (0.5376)  time: 0.7865  data: 0.0008  max mem: 69511
Epoch: [30]  [ 400/1251]  eta: 0:11:18  lr: 0.003488  min_lr: 0.003488  loss: 4.1253 (3.8894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5058 (0.5374)  time: 0.7835  data: 0.0005  max mem: 69511
Epoch: [30]  [ 600/1251]  eta: 0:08:35  lr: 0.003488  min_lr: 0.003488  loss: 3.9462 (3.8686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4736 (0.5449)  time: 0.7848  data: 0.0009  max mem: 69511
Epoch: [30]  [ 800/1251]  eta: 0:05:56  lr: 0.003488  min_lr: 0.003488  loss: 3.8911 (3.8560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4564 (0.5266)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [30]  [1000/1251]  eta: 0:03:18  lr: 0.003487  min_lr: 0.003487  loss: 3.3646 (3.8508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4824 (0.5273)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [30]  [1200/1251]  eta: 0:00:40  lr: 0.003487  min_lr: 0.003487  loss: 4.0395 (3.8416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4516 (0.5228)  time: 0.7872  data: 0.0009  max mem: 69511
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003487  min_lr: 0.003487  loss: 3.7508 (3.8413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4785 (0.5228)  time: 0.6666  data: 0.0007  max mem: 69511
Epoch: [30] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.003487  min_lr: 0.003487  loss: 3.7508 (3.8520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4785 (0.5228)
Test:  [ 0/25]  eta: 0:03:05  loss: 1.0183 (1.0183)  acc1: 83.2000 (83.2000)  acc5: 95.2000 (95.2000)  time: 7.4092  data: 6.9810  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1368 (1.1985)  acc1: 80.4000 (76.9455)  acc5: 95.2000 (94.9455)  time: 1.0323  data: 0.6349  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5395 (1.4463)  acc1: 68.4000 (71.7143)  acc5: 90.4000 (91.2000)  time: 0.3945  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6260 (1.4557)  acc1: 66.8000 (71.3440)  acc5: 88.4000 (90.9920)  time: 0.3945  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6872 s / it)
* Acc@1 71.438 Acc@5 90.888 loss 1.453
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.44%
Epoch: [31]  [   0/1251]  eta: 1:29:58  lr: 0.003487  min_lr: 0.003487  loss: 4.0478 (4.0478)  weight_decay: 0.0500 (0.0500)  time: 4.3155  data: 3.5276  max mem: 69511
Epoch: [31]  [ 200/1251]  eta: 0:14:07  lr: 0.003486  min_lr: 0.003486  loss: 4.2319 (3.7711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5158 (inf)  time: 0.8024  data: 0.0004  max mem: 69511
Epoch: [31]  [ 400/1251]  eta: 0:11:17  lr: 0.003486  min_lr: 0.003486  loss: 3.6125 (3.8108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5228 (inf)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [31]  [ 600/1251]  eta: 0:08:36  lr: 0.003486  min_lr: 0.003486  loss: 3.9034 (3.8242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4838 (inf)  time: 0.7864  data: 0.0006  max mem: 69511
Epoch: [31]  [ 800/1251]  eta: 0:05:57  lr: 0.003485  min_lr: 0.003485  loss: 3.9148 (3.8329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5095 (inf)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [31]  [1000/1251]  eta: 0:03:18  lr: 0.003485  min_lr: 0.003485  loss: 3.9726 (3.8361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4382 (inf)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [31]  [1200/1251]  eta: 0:00:40  lr: 0.003484  min_lr: 0.003484  loss: 4.1306 (3.8400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4343 (inf)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003484  min_lr: 0.003484  loss: 3.9329 (3.8388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4412 (inf)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [31] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.003484  min_lr: 0.003484  loss: 3.9329 (3.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4412 (inf)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.9565 (0.9565)  acc1: 83.6000 (83.6000)  acc5: 94.4000 (94.4000)  time: 7.8192  data: 7.4013  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.1591 (1.1899)  acc1: 76.4000 (77.1273)  acc5: 96.4000 (94.9818)  time: 1.0717  data: 0.6731  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4498 (1.4468)  acc1: 70.4000 (71.9429)  acc5: 90.0000 (91.2191)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6740 (1.4552)  acc1: 67.2000 (71.6320)  acc5: 87.6000 (90.9920)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7022 s / it)
* Acc@1 71.836 Acc@5 91.140 loss 1.448
Accuracy of the model on the 50000 test images: 71.8%
Max accuracy: 71.84%
Epoch: [32]  [   0/1251]  eta: 1:27:10  lr: 0.003484  min_lr: 0.003484  loss: 3.5615 (3.5615)  weight_decay: 0.0500 (0.0500)  time: 4.1811  data: 3.3908  max mem: 69511
Epoch: [32]  [ 200/1251]  eta: 0:14:06  lr: 0.003484  min_lr: 0.003484  loss: 3.6603 (3.7587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4789 (0.4989)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [32]  [ 400/1251]  eta: 0:11:17  lr: 0.003483  min_lr: 0.003483  loss: 3.8405 (3.7587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4936 (0.5083)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [32]  [ 600/1251]  eta: 0:08:36  lr: 0.003483  min_lr: 0.003483  loss: 4.1083 (3.7734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5886 (0.5134)  time: 0.7888  data: 0.0004  max mem: 69511
Epoch: [32]  [ 800/1251]  eta: 0:05:57  lr: 0.003482  min_lr: 0.003482  loss: 3.5089 (3.7741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4565 (0.5137)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [32]  [1000/1251]  eta: 0:03:18  lr: 0.003482  min_lr: 0.003482  loss: 3.8956 (3.7924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5173 (0.5170)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [32]  [1200/1251]  eta: 0:00:40  lr: 0.003482  min_lr: 0.003482  loss: 4.0757 (3.8018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4956 (0.5205)  time: 0.7874  data: 0.0005  max mem: 69511
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003481  min_lr: 0.003481  loss: 4.0222 (3.8048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4717 (0.5214)  time: 0.6690  data: 0.0008  max mem: 69511
Epoch: [32] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.003481  min_lr: 0.003481  loss: 4.0222 (3.8023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4717 (0.5214)
Test:  [ 0/25]  eta: 0:03:22  loss: 1.1002 (1.1002)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 8.0825  data: 7.6720  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.2475 (1.2582)  acc1: 76.4000 (76.7636)  acc5: 94.8000 (94.6546)  time: 1.0966  data: 0.6977  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5142 (1.4964)  acc1: 69.2000 (72.0191)  acc5: 92.0000 (91.3714)  time: 0.3978  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6056 (1.5115)  acc1: 69.2000 (71.5520)  acc5: 89.2000 (91.2160)  time: 0.3975  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7122 s / it)
* Acc@1 71.680 Acc@5 91.224 loss 1.501
Accuracy of the model on the 50000 test images: 71.7%
Max accuracy: 71.84%
Epoch: [33]  [   0/1251]  eta: 1:31:20  lr: 0.003481  min_lr: 0.003481  loss: 3.1834 (3.1834)  weight_decay: 0.0500 (0.0500)  time: 4.3808  data: 2.1463  max mem: 69511
Epoch: [33]  [ 200/1251]  eta: 0:14:04  lr: 0.003481  min_lr: 0.003481  loss: 3.9629 (3.7831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4910 (0.4971)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [33]  [ 400/1251]  eta: 0:11:17  lr: 0.003480  min_lr: 0.003480  loss: 4.0302 (3.7673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4921 (0.5227)  time: 0.7980  data: 0.0004  max mem: 69511
Epoch: [33]  [ 600/1251]  eta: 0:08:36  lr: 0.003480  min_lr: 0.003480  loss: 3.9134 (3.7845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4989 (0.5173)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [33]  [ 800/1251]  eta: 0:05:57  lr: 0.003480  min_lr: 0.003480  loss: 3.9464 (3.7864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4842 (0.5166)  time: 0.7877  data: 0.0005  max mem: 69511
Epoch: [33]  [1000/1251]  eta: 0:03:18  lr: 0.003479  min_lr: 0.003479  loss: 3.7545 (3.7937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5197 (0.5199)  time: 0.7910  data: 0.0004  max mem: 69511
Epoch: [33]  [1200/1251]  eta: 0:00:40  lr: 0.003479  min_lr: 0.003479  loss: 3.9425 (3.7942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4531 (0.5150)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003478  min_lr: 0.003478  loss: 3.6990 (3.7933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4626 (0.5133)  time: 0.6682  data: 0.0005  max mem: 69511
Epoch: [33] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.003478  min_lr: 0.003478  loss: 3.6990 (3.7901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4626 (0.5133)
Test:  [ 0/25]  eta: 0:03:06  loss: 1.0312 (1.0312)  acc1: 84.0000 (84.0000)  acc5: 94.8000 (94.8000)  time: 7.4538  data: 7.0263  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0916 (1.1936)  acc1: 76.0000 (77.3455)  acc5: 96.0000 (95.0182)  time: 1.0388  data: 0.6391  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5308 (1.4371)  acc1: 68.4000 (72.5905)  acc5: 90.0000 (91.6000)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.5852 (1.4455)  acc1: 68.4000 (72.4960)  acc5: 90.0000 (91.5360)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6850 s / it)
* Acc@1 72.526 Acc@5 91.408 loss 1.438
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.53%
Epoch: [34]  [   0/1251]  eta: 1:18:49  lr: 0.003478  min_lr: 0.003478  loss: 4.4261 (4.4261)  weight_decay: 0.0500 (0.0500)  time: 3.7803  data: 2.9984  max mem: 69511
Epoch: [34]  [ 200/1251]  eta: 0:14:01  lr: 0.003478  min_lr: 0.003478  loss: 3.9699 (3.7804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5445 (0.5146)  time: 0.7866  data: 0.0006  max mem: 69511
Epoch: [34]  [ 400/1251]  eta: 0:11:17  lr: 0.003477  min_lr: 0.003477  loss: 3.8371 (3.7820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5089 (0.5362)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [34]  [ 600/1251]  eta: 0:08:36  lr: 0.003477  min_lr: 0.003477  loss: 3.9253 (3.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5107 (0.5310)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [34]  [ 800/1251]  eta: 0:05:57  lr: 0.003476  min_lr: 0.003476  loss: 3.3561 (3.7663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4990 (0.5343)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [34]  [1000/1251]  eta: 0:03:18  lr: 0.003476  min_lr: 0.003476  loss: 4.0521 (3.7641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4694 (0.5295)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [34]  [1200/1251]  eta: 0:00:40  lr: 0.003475  min_lr: 0.003475  loss: 3.6188 (3.7568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4518 (0.5228)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003475  min_lr: 0.003475  loss: 3.6043 (3.7519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4927 (0.5221)  time: 0.6655  data: 0.0006  max mem: 69511
Epoch: [34] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.003475  min_lr: 0.003475  loss: 3.6043 (3.7775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4927 (0.5221)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.9089 (0.9089)  acc1: 83.2000 (83.2000)  acc5: 95.6000 (95.6000)  time: 7.5593  data: 7.1453  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0912 (1.1580)  acc1: 78.4000 (77.2727)  acc5: 95.6000 (95.0546)  time: 1.0457  data: 0.6499  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4387 (1.3815)  acc1: 68.8000 (72.4571)  acc5: 90.4000 (91.6762)  time: 0.3944  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.5225 (1.3889)  acc1: 68.8000 (72.3040)  acc5: 90.0000 (91.4560)  time: 0.3944  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6934 s / it)
* Acc@1 72.548 Acc@5 91.678 loss 1.384
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.55%
Epoch: [35]  [   0/1251]  eta: 1:09:08  lr: 0.003475  min_lr: 0.003475  loss: 3.8333 (3.8333)  weight_decay: 0.0500 (0.0500)  time: 3.3158  data: 2.5345  max mem: 69511
Epoch: [35]  [ 200/1251]  eta: 0:14:02  lr: 0.003475  min_lr: 0.003475  loss: 4.0069 (3.7704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5410 (0.5454)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [35]  [ 400/1251]  eta: 0:11:15  lr: 0.003474  min_lr: 0.003474  loss: 3.5016 (3.7431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5587 (0.5507)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [35]  [ 600/1251]  eta: 0:08:34  lr: 0.003474  min_lr: 0.003474  loss: 3.7576 (3.7214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4518 (0.5327)  time: 0.7826  data: 0.0004  max mem: 69511
Epoch: [35]  [ 800/1251]  eta: 0:05:56  lr: 0.003473  min_lr: 0.003473  loss: 3.7414 (3.7253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4913 (0.5269)  time: 0.7833  data: 0.0005  max mem: 69511
Epoch: [35]  [1000/1251]  eta: 0:03:17  lr: 0.003473  min_lr: 0.003473  loss: 4.0406 (3.7328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5653 (0.5290)  time: 0.7825  data: 0.0005  max mem: 69511
Epoch: [35]  [1200/1251]  eta: 0:00:40  lr: 0.003472  min_lr: 0.003472  loss: 3.7181 (3.7392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4511 (0.5247)  time: 0.7935  data: 0.0004  max mem: 69511
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003472  min_lr: 0.003472  loss: 3.4902 (3.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4557 (0.5232)  time: 0.6690  data: 0.0005  max mem: 69511
Epoch: [35] Total time: 0:16:24 (0.7867 s / it)
Averaged stats: lr: 0.003472  min_lr: 0.003472  loss: 3.4902 (3.7587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4557 (0.5232)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.9190 (0.9190)  acc1: 80.8000 (80.8000)  acc5: 96.4000 (96.4000)  time: 7.5202  data: 7.0943  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0853 (1.0984)  acc1: 76.8000 (77.1273)  acc5: 96.0000 (95.1273)  time: 1.0447  data: 0.6452  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3725 (1.3161)  acc1: 70.0000 (73.0667)  acc5: 91.2000 (92.1333)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4774 (1.3274)  acc1: 70.0000 (73.0080)  acc5: 89.6000 (91.9840)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6884 s / it)
* Acc@1 73.020 Acc@5 91.898 loss 1.325
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.02%
Epoch: [36]  [   0/1251]  eta: 1:20:51  lr: 0.003472  min_lr: 0.003472  loss: 4.0858 (4.0858)  weight_decay: 0.0500 (0.0500)  time: 3.8779  data: 3.0824  max mem: 69511
Epoch: [36]  [ 200/1251]  eta: 0:14:03  lr: 0.003471  min_lr: 0.003471  loss: 3.6848 (3.7791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5761 (0.5479)  time: 0.7980  data: 0.0005  max mem: 69511
Epoch: [36]  [ 400/1251]  eta: 0:11:16  lr: 0.003471  min_lr: 0.003471  loss: 3.8209 (3.7741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5129 (0.5479)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [36]  [ 600/1251]  eta: 0:08:36  lr: 0.003470  min_lr: 0.003470  loss: 3.8527 (3.7707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4678 (0.5408)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [36]  [ 800/1251]  eta: 0:05:57  lr: 0.003470  min_lr: 0.003470  loss: 3.9352 (3.7729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5363 (0.5340)  time: 0.7865  data: 0.0007  max mem: 69511
Epoch: [36]  [1000/1251]  eta: 0:03:18  lr: 0.003469  min_lr: 0.003469  loss: 3.6714 (3.7617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4942 (0.5321)  time: 0.7876  data: 0.0005  max mem: 69511
Epoch: [36]  [1200/1251]  eta: 0:00:40  lr: 0.003468  min_lr: 0.003468  loss: 3.8465 (3.7557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5505 (0.5316)  time: 0.7829  data: 0.0005  max mem: 69511
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003468  min_lr: 0.003468  loss: 3.9015 (3.7525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5505 (0.5321)  time: 0.6660  data: 0.0006  max mem: 69511
Epoch: [36] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.003468  min_lr: 0.003468  loss: 3.9015 (3.7368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5505 (0.5321)
Test:  [ 0/25]  eta: 0:03:02  loss: 0.9793 (0.9793)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 7.2890  data: 6.8580  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1810 (1.1653)  acc1: 80.0000 (78.5455)  acc5: 95.2000 (94.7273)  time: 1.0221  data: 0.6238  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4136 (1.3709)  acc1: 70.0000 (73.7524)  acc5: 90.4000 (91.6571)  time: 0.3953  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4119 (1.3735)  acc1: 70.0000 (73.5520)  acc5: 90.4000 (91.6480)  time: 0.3954  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6821 s / it)
* Acc@1 73.464 Acc@5 92.032 loss 1.369
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.46%
Epoch: [37]  [   0/1251]  eta: 1:24:36  lr: 0.003468  min_lr: 0.003468  loss: 4.3903 (4.3903)  weight_decay: 0.0500 (0.0500)  time: 4.0582  data: 3.2684  max mem: 69511
Epoch: [37]  [ 200/1251]  eta: 0:14:01  lr: 0.003468  min_lr: 0.003468  loss: 3.7837 (3.7358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5403 (0.5087)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [37]  [ 400/1251]  eta: 0:11:17  lr: 0.003467  min_lr: 0.003467  loss: 3.7273 (3.7738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5052 (0.5206)  time: 0.7940  data: 0.0004  max mem: 69511
Epoch: [37]  [ 600/1251]  eta: 0:08:36  lr: 0.003466  min_lr: 0.003466  loss: 3.7999 (3.7536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5113 (0.5148)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [37]  [ 800/1251]  eta: 0:05:56  lr: 0.003466  min_lr: 0.003466  loss: 4.0074 (3.7545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6086 (0.5230)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [37]  [1000/1251]  eta: 0:03:18  lr: 0.003465  min_lr: 0.003465  loss: 3.5909 (3.7467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4802 (0.5163)  time: 0.7933  data: 0.0005  max mem: 69511
Epoch: [37]  [1200/1251]  eta: 0:00:40  lr: 0.003465  min_lr: 0.003465  loss: 3.9260 (3.7400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4674 (0.5100)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003464  min_lr: 0.003464  loss: 3.6343 (3.7415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4988 (0.5101)  time: 0.6689  data: 0.0005  max mem: 69511
Epoch: [37] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003464  min_lr: 0.003464  loss: 3.6343 (3.7295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4988 (0.5101)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8077 (0.8077)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.3548  data: 4.9381  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 1.0433 (1.0689)  acc1: 78.4000 (78.0000)  acc5: 96.0000 (95.0182)  time: 0.9766  data: 0.5781  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3001 (1.3016)  acc1: 70.0000 (73.1619)  acc5: 90.8000 (92.0000)  time: 0.4678  data: 0.0711  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3499 (1.3087)  acc1: 70.4000 (72.9280)  acc5: 90.8000 (91.9040)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6571 s / it)
* Acc@1 73.600 Acc@5 92.222 loss 1.294
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.60%
Epoch: [38]  [   0/1251]  eta: 1:29:38  lr: 0.003464  min_lr: 0.003464  loss: 3.2432 (3.2432)  weight_decay: 0.0500 (0.0500)  time: 4.2996  data: 3.5063  max mem: 69511
Epoch: [38]  [ 200/1251]  eta: 0:14:05  lr: 0.003464  min_lr: 0.003464  loss: 3.9101 (3.7400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5179 (0.5774)  time: 0.7907  data: 0.0005  max mem: 69511
Epoch: [38]  [ 400/1251]  eta: 0:11:16  lr: 0.003463  min_lr: 0.003463  loss: 3.8897 (3.7314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4483 (0.5287)  time: 0.7822  data: 0.0005  max mem: 69511
Epoch: [38]  [ 600/1251]  eta: 0:08:35  lr: 0.003463  min_lr: 0.003463  loss: 3.9527 (3.7629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5065 (0.5296)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [38]  [ 800/1251]  eta: 0:05:56  lr: 0.003462  min_lr: 0.003462  loss: 3.4895 (3.7529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5189 (0.5281)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [38]  [1000/1251]  eta: 0:03:18  lr: 0.003461  min_lr: 0.003461  loss: 3.6382 (3.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5187 (0.5315)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [38]  [1200/1251]  eta: 0:00:40  lr: 0.003461  min_lr: 0.003461  loss: 3.8555 (3.7300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5128 (0.5335)  time: 0.7872  data: 0.0005  max mem: 69511
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003460  min_lr: 0.003460  loss: 3.9863 (3.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5128 (0.5336)  time: 0.6686  data: 0.0005  max mem: 69511
Epoch: [38] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.003460  min_lr: 0.003460  loss: 3.9863 (3.7120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5128 (0.5336)
Test:  [ 0/25]  eta: 0:03:03  loss: 1.0399 (1.0399)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 7.3260  data: 6.9047  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.2050 (1.2614)  acc1: 77.2000 (78.2909)  acc5: 95.2000 (94.9091)  time: 1.0272  data: 0.6280  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.6032 (1.4425)  acc1: 70.4000 (73.6000)  acc5: 90.0000 (91.7524)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.6032 (1.4415)  acc1: 70.4000 (73.3920)  acc5: 89.2000 (91.7120)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6805 s / it)
* Acc@1 73.674 Acc@5 92.088 loss 1.426
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 73.67%
Epoch: [39]  [   0/1251]  eta: 1:29:51  lr: 0.003460  min_lr: 0.003460  loss: 3.4971 (3.4971)  weight_decay: 0.0500 (0.0500)  time: 4.3100  data: 3.5134  max mem: 69511
Epoch: [39]  [ 200/1251]  eta: 0:14:06  lr: 0.003460  min_lr: 0.003460  loss: 3.8305 (3.7120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4431 (0.5263)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [39]  [ 400/1251]  eta: 0:11:16  lr: 0.003459  min_lr: 0.003459  loss: 3.9800 (3.7176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4847 (0.5184)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [39]  [ 600/1251]  eta: 0:08:36  lr: 0.003458  min_lr: 0.003458  loss: 3.6253 (3.7236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5799 (0.5206)  time: 0.7920  data: 0.0005  max mem: 69511
Epoch: [39]  [ 800/1251]  eta: 0:05:56  lr: 0.003458  min_lr: 0.003458  loss: 3.6470 (3.7136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4812 (0.5116)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [39]  [1000/1251]  eta: 0:03:18  lr: 0.003457  min_lr: 0.003457  loss: 3.4820 (3.7145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5429 (0.5165)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [39]  [1200/1251]  eta: 0:00:40  lr: 0.003456  min_lr: 0.003456  loss: 3.8527 (3.7088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4845 (0.5133)  time: 0.7891  data: 0.0004  max mem: 69511
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003456  min_lr: 0.003456  loss: 3.5408 (3.7089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5374 (0.5171)  time: 0.6682  data: 0.0006  max mem: 69511
Epoch: [39] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.003456  min_lr: 0.003456  loss: 3.5408 (3.6991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5374 (0.5171)
Test:  [ 0/25]  eta: 0:03:17  loss: 0.8287 (0.8287)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 7.8993  data: 7.4715  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9503 (1.0178)  acc1: 80.0000 (78.1818)  acc5: 96.8000 (95.5636)  time: 1.0793  data: 0.6795  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3183 (1.2662)  acc1: 70.4000 (73.8286)  acc5: 91.2000 (92.0381)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3730 (1.2761)  acc1: 70.4000 (73.6960)  acc5: 90.0000 (91.9680)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7051 s / it)
* Acc@1 73.962 Acc@5 92.380 loss 1.265
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 73.96%
Epoch: [40]  [   0/1251]  eta: 1:17:09  lr: 0.003456  min_lr: 0.003456  loss: 2.8434 (2.8434)  weight_decay: 0.0500 (0.0500)  time: 3.7003  data: 2.9212  max mem: 69511
Epoch: [40]  [ 200/1251]  eta: 0:13:59  lr: 0.003455  min_lr: 0.003455  loss: 3.9237 (3.6958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5455 (0.5167)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [40]  [ 400/1251]  eta: 0:11:13  lr: 0.003455  min_lr: 0.003455  loss: 3.6293 (3.6852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4936 (0.5149)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [40]  [ 600/1251]  eta: 0:08:35  lr: 0.003454  min_lr: 0.003454  loss: 3.7463 (3.6733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5639 (0.5269)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [40]  [ 800/1251]  eta: 0:05:56  lr: 0.003453  min_lr: 0.003453  loss: 3.9549 (3.6869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4678 (0.5252)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [40]  [1000/1251]  eta: 0:03:17  lr: 0.003453  min_lr: 0.003453  loss: 3.7848 (3.6835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5328 (0.5329)  time: 0.7846  data: 0.0005  max mem: 69511
Epoch: [40]  [1200/1251]  eta: 0:00:40  lr: 0.003452  min_lr: 0.003452  loss: 3.8790 (3.6942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5132 (0.5325)  time: 0.7822  data: 0.0005  max mem: 69511
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003452  min_lr: 0.003452  loss: 3.7405 (3.6936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4725 (0.5296)  time: 0.6645  data: 0.0006  max mem: 69511
Epoch: [40] Total time: 0:16:23 (0.7865 s / it)
Averaged stats: lr: 0.003452  min_lr: 0.003452  loss: 3.7405 (3.6898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4725 (0.5296)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.9237 (0.9237)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 7.7002  data: 7.2923  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1252 (1.1109)  acc1: 80.4000 (78.7636)  acc5: 96.4000 (95.8182)  time: 1.0603  data: 0.6632  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3871 (1.3256)  acc1: 72.4000 (74.3048)  acc5: 92.0000 (92.5905)  time: 0.3952  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4730 (1.3383)  acc1: 70.4000 (73.7760)  acc5: 90.4000 (92.4160)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6999 s / it)
* Acc@1 74.362 Acc@5 92.554 loss 1.328
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.36%
Epoch: [41]  [   0/1251]  eta: 1:33:52  lr: 0.003452  min_lr: 0.003452  loss: 3.6930 (3.6930)  weight_decay: 0.0500 (0.0500)  time: 4.5022  data: 3.7195  max mem: 69511
Epoch: [41]  [ 200/1251]  eta: 0:14:03  lr: 0.003451  min_lr: 0.003451  loss: 3.6357 (3.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4792 (0.5000)  time: 0.7855  data: 0.0006  max mem: 69511
Epoch: [41]  [ 400/1251]  eta: 0:11:18  lr: 0.003450  min_lr: 0.003450  loss: 3.8200 (3.6565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4947 (0.5154)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [41]  [ 600/1251]  eta: 0:08:36  lr: 0.003449  min_lr: 0.003449  loss: 3.6072 (3.6480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5339 (0.5242)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [41]  [ 800/1251]  eta: 0:05:57  lr: 0.003449  min_lr: 0.003449  loss: 3.7762 (3.6558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4492 (0.5209)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [41]  [1000/1251]  eta: 0:03:18  lr: 0.003448  min_lr: 0.003448  loss: 3.6638 (3.6544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4898 (0.5187)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [41]  [1200/1251]  eta: 0:00:40  lr: 0.003447  min_lr: 0.003447  loss: 3.8666 (3.6570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5385 (0.5219)  time: 0.7838  data: 0.0005  max mem: 69511
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003447  min_lr: 0.003447  loss: 3.8241 (3.6583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5996 (0.5243)  time: 0.6664  data: 0.0006  max mem: 69511
Epoch: [41] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.003447  min_lr: 0.003447  loss: 3.8241 (3.6663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5996 (0.5243)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.9475 (0.9475)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 7.8395  data: 7.4218  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0782 (1.1252)  acc1: 80.4000 (78.8000)  acc5: 96.0000 (95.7455)  time: 1.0727  data: 0.6750  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4277 (1.3338)  acc1: 69.2000 (73.6571)  acc5: 91.6000 (92.6476)  time: 0.3961  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.5118 (1.3556)  acc1: 68.4000 (73.2480)  acc5: 89.6000 (92.3840)  time: 0.3960  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6995 s / it)
* Acc@1 74.044 Acc@5 92.412 loss 1.351
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.36%
Epoch: [42]  [   0/1251]  eta: 1:19:24  lr: 0.003447  min_lr: 0.003447  loss: 3.7509 (3.7509)  weight_decay: 0.0500 (0.0500)  time: 3.8086  data: 2.1299  max mem: 69511
Epoch: [42]  [ 200/1251]  eta: 0:14:07  lr: 0.003446  min_lr: 0.003446  loss: 3.6716 (3.6887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5353 (0.5273)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [42]  [ 400/1251]  eta: 0:11:18  lr: 0.003445  min_lr: 0.003445  loss: 3.5652 (3.6669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4468 (0.5067)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [42]  [ 600/1251]  eta: 0:08:36  lr: 0.003445  min_lr: 0.003445  loss: 3.3852 (3.6603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5744 (0.5268)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [42]  [ 800/1251]  eta: 0:05:57  lr: 0.003444  min_lr: 0.003444  loss: 3.4642 (3.6683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5131 (0.5210)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [42]  [1000/1251]  eta: 0:03:18  lr: 0.003443  min_lr: 0.003443  loss: 3.7123 (3.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5190 (0.5224)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [42]  [1200/1251]  eta: 0:00:40  lr: 0.003442  min_lr: 0.003442  loss: 3.7197 (3.6732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4957 (0.5285)  time: 0.7952  data: 0.0005  max mem: 69511
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003442  min_lr: 0.003442  loss: 4.0455 (3.6790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4854 (0.5271)  time: 0.6688  data: 0.0006  max mem: 69511
Epoch: [42] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.003442  min_lr: 0.003442  loss: 4.0455 (3.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4854 (0.5271)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.9130 (0.9130)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 6.0351  data: 5.6085  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1353 (1.1441)  acc1: 79.6000 (78.8727)  acc5: 96.8000 (95.4182)  time: 1.0032  data: 0.6038  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.5292 (1.3902)  acc1: 71.2000 (74.0571)  acc5: 90.8000 (92.3429)  time: 0.4485  data: 0.0517  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.5402 (1.4038)  acc1: 70.4000 (73.6960)  acc5: 90.8000 (92.2080)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6688 s / it)
* Acc@1 74.300 Acc@5 92.570 loss 1.390
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.36%
Epoch: [43]  [   0/1251]  eta: 1:42:37  lr: 0.003442  min_lr: 0.003442  loss: 3.8164 (3.8164)  weight_decay: 0.0500 (0.0500)  time: 4.9221  data: 3.5606  max mem: 69511
Epoch: [43]  [ 200/1251]  eta: 0:14:11  lr: 0.003441  min_lr: 0.003441  loss: 3.6684 (3.6135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5705 (0.5362)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [43]  [ 400/1251]  eta: 0:11:19  lr: 0.003440  min_lr: 0.003440  loss: 3.8828 (3.6515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5578 (0.5409)  time: 0.7939  data: 0.0004  max mem: 69511
Epoch: [43]  [ 600/1251]  eta: 0:08:36  lr: 0.003440  min_lr: 0.003440  loss: 3.9218 (3.6451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6113 (0.5465)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [43]  [ 800/1251]  eta: 0:05:57  lr: 0.003439  min_lr: 0.003439  loss: 3.8843 (3.6395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4625 (0.5335)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [43]  [1000/1251]  eta: 0:03:18  lr: 0.003438  min_lr: 0.003438  loss: 3.7309 (3.6268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4815 (0.5303)  time: 0.7877  data: 0.0004  max mem: 69511
Epoch: [43]  [1200/1251]  eta: 0:00:40  lr: 0.003437  min_lr: 0.003437  loss: 3.9407 (3.6322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4963 (inf)  time: 0.7922  data: 0.0004  max mem: 69511
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003437  min_lr: 0.003437  loss: 3.4704 (3.6301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5912 (inf)  time: 0.6688  data: 0.0006  max mem: 69511
Epoch: [43] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.003437  min_lr: 0.003437  loss: 3.4704 (3.6435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5912 (inf)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.8488 (0.8488)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 7.9205  data: 7.4941  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0147 (1.0105)  acc1: 81.2000 (79.6364)  acc5: 96.4000 (95.9636)  time: 1.0811  data: 0.6816  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3274 (1.2466)  acc1: 70.8000 (74.5333)  acc5: 91.2000 (92.7238)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4122 (1.2600)  acc1: 71.2000 (74.2400)  acc5: 90.8000 (92.5600)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7039 s / it)
* Acc@1 74.638 Acc@5 92.712 loss 1.245
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.64%
Epoch: [44]  [   0/1251]  eta: 1:28:31  lr: 0.003437  min_lr: 0.003437  loss: 3.7844 (3.7844)  weight_decay: 0.0500 (0.0500)  time: 4.2457  data: 3.4654  max mem: 69511
Epoch: [44]  [ 200/1251]  eta: 0:14:02  lr: 0.003436  min_lr: 0.003436  loss: 3.9223 (3.6501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4434 (0.5037)  time: 0.7826  data: 0.0004  max mem: 69511
Epoch: [44]  [ 400/1251]  eta: 0:11:15  lr: 0.003435  min_lr: 0.003435  loss: 3.7679 (3.6581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5505 (0.5188)  time: 0.7854  data: 0.0006  max mem: 69511
Epoch: [44]  [ 600/1251]  eta: 0:08:35  lr: 0.003434  min_lr: 0.003434  loss: 3.9389 (3.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5194 (0.5265)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [44]  [ 800/1251]  eta: 0:05:56  lr: 0.003434  min_lr: 0.003434  loss: 3.9358 (3.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4930 (0.5205)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [44]  [1000/1251]  eta: 0:03:18  lr: 0.003433  min_lr: 0.003433  loss: 3.8994 (3.6501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5396 (0.5274)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [44]  [1200/1251]  eta: 0:00:40  lr: 0.003432  min_lr: 0.003432  loss: 3.6583 (3.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5345 (0.5276)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003432  min_lr: 0.003432  loss: 3.5512 (3.6441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5003 (0.5277)  time: 0.6689  data: 0.0005  max mem: 69511
Epoch: [44] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.003432  min_lr: 0.003432  loss: 3.5512 (3.6379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5003 (0.5277)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.8642 (0.8642)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.8512  data: 5.4050  max mem: 69511
Test:  [10/25]  eta: 0:00:13  loss: 1.0584 (1.0673)  acc1: 81.6000 (79.8545)  acc5: 96.0000 (95.4182)  time: 0.8938  data: 0.4918  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3344 (1.2806)  acc1: 72.8000 (75.3714)  acc5: 92.0000 (92.7238)  time: 0.3976  data: 0.0003  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3825 (1.2811)  acc1: 72.0000 (74.9920)  acc5: 91.6000 (92.7040)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:15 (0.6221 s / it)
* Acc@1 75.104 Acc@5 93.034 loss 1.266
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.10%
Epoch: [45]  [   0/1251]  eta: 1:21:08  lr: 0.003432  min_lr: 0.003432  loss: 4.2354 (4.2354)  weight_decay: 0.0500 (0.0500)  time: 3.8920  data: 3.0939  max mem: 69511
Epoch: [45]  [ 200/1251]  eta: 0:14:02  lr: 0.003431  min_lr: 0.003431  loss: 3.6863 (3.6314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5009 (0.5419)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [45]  [ 400/1251]  eta: 0:11:17  lr: 0.003430  min_lr: 0.003430  loss: 3.8949 (3.6244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5747 (0.5394)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [45]  [ 600/1251]  eta: 0:08:35  lr: 0.003429  min_lr: 0.003429  loss: 3.7508 (3.6190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4810 (0.5434)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [45]  [ 800/1251]  eta: 0:05:56  lr: 0.003428  min_lr: 0.003428  loss: 3.7661 (3.6312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4887 (0.5417)  time: 0.7895  data: 0.0004  max mem: 69511
Epoch: [45]  [1000/1251]  eta: 0:03:18  lr: 0.003427  min_lr: 0.003427  loss: 3.6501 (3.6387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5442 (0.5407)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [45]  [1200/1251]  eta: 0:00:40  lr: 0.003426  min_lr: 0.003426  loss: 3.8858 (3.6382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5509 (0.5377)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003426  min_lr: 0.003426  loss: 3.7392 (3.6343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4897 (0.5358)  time: 0.6676  data: 0.0006  max mem: 69511
Epoch: [45] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.003426  min_lr: 0.003426  loss: 3.7392 (3.6304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4897 (0.5358)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.8605 (0.8605)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 7.6858  data: 7.2716  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0609 (1.0472)  acc1: 81.2000 (79.4182)  acc5: 96.4000 (95.4546)  time: 1.0598  data: 0.6613  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3023 (1.2572)  acc1: 71.6000 (74.7619)  acc5: 92.8000 (92.7429)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3844 (1.2615)  acc1: 71.2000 (74.3520)  acc5: 91.2000 (92.6880)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6937 s / it)
* Acc@1 75.290 Acc@5 93.036 loss 1.246
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.29%
Epoch: [46]  [   0/1251]  eta: 1:15:48  lr: 0.003426  min_lr: 0.003426  loss: 4.0663 (4.0663)  weight_decay: 0.0500 (0.0500)  time: 3.6355  data: 2.8428  max mem: 69511
Epoch: [46]  [ 200/1251]  eta: 0:14:05  lr: 0.003425  min_lr: 0.003425  loss: 3.5866 (3.6524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5465 (0.5411)  time: 0.7950  data: 0.0004  max mem: 69511
Epoch: [46]  [ 400/1251]  eta: 0:11:17  lr: 0.003424  min_lr: 0.003424  loss: 3.4677 (3.6345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5238 (0.5463)  time: 0.7878  data: 0.0005  max mem: 69511
Epoch: [46]  [ 600/1251]  eta: 0:08:36  lr: 0.003423  min_lr: 0.003423  loss: 3.7630 (3.6331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5091 (0.5449)  time: 0.7877  data: 0.0004  max mem: 69511
Epoch: [46]  [ 800/1251]  eta: 0:05:57  lr: 0.003422  min_lr: 0.003422  loss: 3.6746 (3.6347)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7878  data: 0.0005  max mem: 69511
Epoch: [46]  [1000/1251]  eta: 0:03:18  lr: 0.003421  min_lr: 0.003421  loss: 3.8229 (3.6387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5426 (nan)  time: 0.7875  data: 0.0005  max mem: 69511
Epoch: [46]  [1200/1251]  eta: 0:00:40  lr: 0.003421  min_lr: 0.003421  loss: 3.4420 (3.6408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4611 (nan)  time: 0.7837  data: 0.0004  max mem: 69511
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003420  min_lr: 0.003420  loss: 3.8187 (3.6444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4814 (nan)  time: 0.6648  data: 0.0005  max mem: 69511
Epoch: [46] Total time: 0:16:28 (0.7899 s / it)
Averaged stats: lr: 0.003420  min_lr: 0.003420  loss: 3.8187 (3.6214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4814 (nan)
Test:  [ 0/25]  eta: 0:03:01  loss: 1.0003 (1.0003)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 7.2582  data: 6.8221  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1482 (1.1748)  acc1: 79.2000 (79.9273)  acc5: 96.0000 (95.8909)  time: 1.0190  data: 0.6205  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3562 (1.4048)  acc1: 71.6000 (74.6857)  acc5: 92.0000 (92.8571)  time: 0.3950  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4756 (1.4158)  acc1: 69.6000 (74.2880)  acc5: 90.8000 (92.7680)  time: 0.3950  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6815 s / it)
* Acc@1 74.824 Acc@5 92.918 loss 1.401
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 75.29%
Epoch: [47]  [   0/1251]  eta: 1:39:00  lr: 0.003420  min_lr: 0.003420  loss: 4.1317 (4.1317)  weight_decay: 0.0500 (0.0500)  time: 4.7485  data: 3.2602  max mem: 69511
Epoch: [47]  [ 200/1251]  eta: 0:14:05  lr: 0.003419  min_lr: 0.003419  loss: 3.8443 (3.6467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5184 (0.5437)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [47]  [ 400/1251]  eta: 0:11:17  lr: 0.003418  min_lr: 0.003418  loss: 3.7758 (3.6228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4914 (0.5470)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [47]  [ 600/1251]  eta: 0:08:37  lr: 0.003417  min_lr: 0.003417  loss: 3.8171 (3.5981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5266 (0.5434)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [47]  [ 800/1251]  eta: 0:05:57  lr: 0.003417  min_lr: 0.003417  loss: 3.3941 (3.5970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5061 (0.5399)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [47]  [1000/1251]  eta: 0:03:18  lr: 0.003416  min_lr: 0.003416  loss: 3.6645 (3.6021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5265 (0.5407)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [47]  [1200/1251]  eta: 0:00:40  lr: 0.003415  min_lr: 0.003415  loss: 3.7789 (3.6057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5323 (0.5427)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.5110 (3.6044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5147 (0.5402)  time: 0.6657  data: 0.0005  max mem: 69511
Epoch: [47] Total time: 0:16:27 (0.7898 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.5110 (3.5996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5147 (0.5402)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.9287 (0.9287)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 7.6535  data: 7.2379  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9983 (1.0786)  acc1: 80.8000 (79.8182)  acc5: 96.8000 (95.8182)  time: 1.0541  data: 0.6582  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3102 (1.2861)  acc1: 72.8000 (75.6571)  acc5: 91.6000 (92.8762)  time: 0.3943  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3865 (1.2907)  acc1: 72.8000 (75.3440)  acc5: 91.6000 (92.8800)  time: 0.3944  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6964 s / it)
* Acc@1 75.396 Acc@5 93.080 loss 1.278
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.40%
Epoch: [48]  [   0/1251]  eta: 1:21:33  lr: 0.003414  min_lr: 0.003414  loss: 3.1590 (3.1590)  weight_decay: 0.0500 (0.0500)  time: 3.9117  data: 3.1202  max mem: 69511
Epoch: [48]  [ 200/1251]  eta: 0:14:01  lr: 0.003413  min_lr: 0.003413  loss: 3.7303 (3.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5168 (0.5418)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [48]  [ 400/1251]  eta: 0:11:17  lr: 0.003412  min_lr: 0.003412  loss: 3.6748 (3.5583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5334 (0.5316)  time: 0.7948  data: 0.0005  max mem: 69511
Epoch: [48]  [ 600/1251]  eta: 0:08:36  lr: 0.003411  min_lr: 0.003411  loss: 3.9105 (3.5994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5542 (0.5407)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [48]  [ 800/1251]  eta: 0:05:57  lr: 0.003410  min_lr: 0.003410  loss: 3.6410 (3.5943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5664 (0.5511)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [48]  [1000/1251]  eta: 0:03:18  lr: 0.003409  min_lr: 0.003409  loss: 3.7381 (3.5950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5499 (0.5505)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [48]  [1200/1251]  eta: 0:00:40  lr: 0.003408  min_lr: 0.003408  loss: 3.7206 (3.5927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6067 (0.5513)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003408  min_lr: 0.003408  loss: 3.4995 (3.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5449 (0.5509)  time: 0.6680  data: 0.0006  max mem: 69511
Epoch: [48] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.003408  min_lr: 0.003408  loss: 3.4995 (3.5953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5449 (0.5509)
Test:  [ 0/25]  eta: 0:03:19  loss: 0.8149 (0.8149)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 7.9796  data: 7.5615  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9172 (0.9973)  acc1: 81.6000 (80.1455)  acc5: 96.8000 (95.9636)  time: 1.0865  data: 0.6877  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2542 (1.2186)  acc1: 72.4000 (75.7333)  acc5: 92.0000 (93.2191)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3680 (1.2263)  acc1: 72.4000 (75.1040)  acc5: 91.6000 (92.9920)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7055 s / it)
* Acc@1 75.722 Acc@5 93.372 loss 1.213
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.72%
Epoch: [49]  [   0/1251]  eta: 1:17:52  lr: 0.003408  min_lr: 0.003408  loss: 2.9217 (2.9217)  weight_decay: 0.0500 (0.0500)  time: 3.7351  data: 2.9514  max mem: 69511
Epoch: [49]  [ 200/1251]  eta: 0:14:03  lr: 0.003407  min_lr: 0.003407  loss: 3.6348 (3.6543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5442 (0.5636)  time: 0.7977  data: 0.0004  max mem: 69511
Epoch: [49]  [ 400/1251]  eta: 0:11:18  lr: 0.003406  min_lr: 0.003406  loss: 3.7798 (3.6262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5186 (0.5445)  time: 0.7875  data: 0.0005  max mem: 69511
Epoch: [49]  [ 600/1251]  eta: 0:08:36  lr: 0.003405  min_lr: 0.003405  loss: 3.8113 (3.6366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5021 (0.5452)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [49]  [ 800/1251]  eta: 0:05:57  lr: 0.003404  min_lr: 0.003404  loss: 3.5834 (3.6243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5289 (0.5418)  time: 0.7945  data: 0.0005  max mem: 69511
Epoch: [49]  [1000/1251]  eta: 0:03:18  lr: 0.003403  min_lr: 0.003403  loss: 3.8806 (3.6332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5156 (0.5445)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [49]  [1200/1251]  eta: 0:00:40  lr: 0.003402  min_lr: 0.003402  loss: 3.5595 (3.6214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5131 (0.5409)  time: 0.7866  data: 0.0006  max mem: 69511
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003402  min_lr: 0.003402  loss: 3.7663 (3.6211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5210 (0.5415)  time: 0.6680  data: 0.0006  max mem: 69511
Epoch: [49] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.003402  min_lr: 0.003402  loss: 3.7663 (3.5887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5210 (0.5415)
Test:  [ 0/25]  eta: 0:03:17  loss: 0.8772 (0.8772)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 7.9062  data: 7.4751  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0182 (1.0540)  acc1: 80.4000 (79.8545)  acc5: 97.2000 (96.2545)  time: 1.0799  data: 0.6798  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2930 (1.2702)  acc1: 73.2000 (75.9238)  acc5: 92.4000 (93.5619)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4364 (1.2841)  acc1: 72.4000 (75.4560)  acc5: 91.6000 (93.3920)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7037 s / it)
* Acc@1 75.536 Acc@5 93.376 loss 1.279
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.72%
Epoch: [50]  [   0/1251]  eta: 1:39:35  lr: 0.003402  min_lr: 0.003402  loss: 3.8034 (3.8034)  weight_decay: 0.0500 (0.0500)  time: 4.7768  data: 3.4921  max mem: 69511
Epoch: [50]  [ 200/1251]  eta: 0:14:11  lr: 0.003401  min_lr: 0.003401  loss: 3.7274 (3.5162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4627 (0.5110)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [50]  [ 400/1251]  eta: 0:11:19  lr: 0.003400  min_lr: 0.003400  loss: 3.6381 (3.5314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5176 (0.5137)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [50]  [ 600/1251]  eta: 0:08:37  lr: 0.003399  min_lr: 0.003399  loss: 3.7394 (3.5348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5067 (0.5255)  time: 0.7943  data: 0.0005  max mem: 69511
Epoch: [50]  [ 800/1251]  eta: 0:05:58  lr: 0.003398  min_lr: 0.003398  loss: 3.4326 (3.5480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5139 (0.5316)  time: 0.7879  data: 0.0005  max mem: 69511
Epoch: [50]  [1000/1251]  eta: 0:03:18  lr: 0.003397  min_lr: 0.003397  loss: 3.9307 (3.5504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4824 (0.5291)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [50]  [1200/1251]  eta: 0:00:40  lr: 0.003395  min_lr: 0.003395  loss: 3.5253 (3.5517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5158 (0.5363)  time: 0.7982  data: 0.0004  max mem: 69511
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003395  min_lr: 0.003395  loss: 3.7574 (3.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5119 (0.5360)  time: 0.6682  data: 0.0005  max mem: 69511
Epoch: [50] Total time: 0:16:29 (0.7909 s / it)
Averaged stats: lr: 0.003395  min_lr: 0.003395  loss: 3.7574 (3.5734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5119 (0.5360)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.8208 (0.8208)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 7.8156  data: 7.3977  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0773 (1.0634)  acc1: 81.2000 (79.8909)  acc5: 96.4000 (96.3273)  time: 1.0717  data: 0.6728  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.3107 (1.2517)  acc1: 72.4000 (75.3714)  acc5: 92.8000 (93.7143)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3967 (1.2615)  acc1: 73.6000 (75.2160)  acc5: 92.4000 (93.4240)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6998 s / it)
* Acc@1 75.762 Acc@5 93.440 loss 1.246
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.76%
Epoch: [51]  [   0/1251]  eta: 1:27:53  lr: 0.003395  min_lr: 0.003395  loss: 3.7149 (3.7149)  weight_decay: 0.0500 (0.0500)  time: 4.2157  data: 3.4335  max mem: 69511
Epoch: [51]  [ 200/1251]  eta: 0:14:04  lr: 0.003394  min_lr: 0.003394  loss: 3.8382 (3.5122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5488 (0.5590)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [51]  [ 400/1251]  eta: 0:11:17  lr: 0.003393  min_lr: 0.003393  loss: 3.7751 (3.5734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5690 (0.5635)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [51]  [ 600/1251]  eta: 0:08:37  lr: 0.003392  min_lr: 0.003392  loss: 3.6852 (3.5606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5446 (0.5630)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [51]  [ 800/1251]  eta: 0:05:57  lr: 0.003391  min_lr: 0.003391  loss: 3.6615 (3.5621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5545 (0.5599)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [51]  [1000/1251]  eta: 0:03:18  lr: 0.003390  min_lr: 0.003390  loss: 3.6029 (3.5609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5158 (0.5541)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [51]  [1200/1251]  eta: 0:00:40  lr: 0.003389  min_lr: 0.003389  loss: 3.8278 (3.5604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5252 (0.5594)  time: 0.7838  data: 0.0005  max mem: 69511
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003388  min_lr: 0.003388  loss: 3.6914 (3.5611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4966 (0.5575)  time: 0.6654  data: 0.0006  max mem: 69511
Epoch: [51] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003388  min_lr: 0.003388  loss: 3.6914 (3.5637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4966 (0.5575)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.9700 (0.9700)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 6.8091  data: 6.3727  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0335 (1.0311)  acc1: 79.2000 (80.4000)  acc5: 96.4000 (96.0364)  time: 1.0322  data: 0.6343  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2514 (1.2199)  acc1: 72.4000 (75.7143)  acc5: 92.8000 (93.7143)  time: 0.4244  data: 0.0303  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3892 (1.2393)  acc1: 70.8000 (75.3440)  acc5: 92.0000 (93.4080)  time: 0.3943  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6802 s / it)
* Acc@1 76.060 Acc@5 93.548 loss 1.230
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.06%
Epoch: [52]  [   0/1251]  eta: 1:23:00  lr: 0.003388  min_lr: 0.003388  loss: 3.4099 (3.4099)  weight_decay: 0.0500 (0.0500)  time: 3.9815  data: 3.1823  max mem: 69511
Epoch: [52]  [ 200/1251]  eta: 0:14:01  lr: 0.003387  min_lr: 0.003387  loss: 3.8520 (3.5626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5393 (0.5276)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [52]  [ 400/1251]  eta: 0:11:17  lr: 0.003386  min_lr: 0.003386  loss: 3.6215 (3.5827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5138 (0.5455)  time: 0.7825  data: 0.0004  max mem: 69511
Epoch: [52]  [ 600/1251]  eta: 0:08:35  lr: 0.003385  min_lr: 0.003385  loss: 3.8548 (3.5932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4640 (0.5366)  time: 0.7843  data: 0.0004  max mem: 69511
Epoch: [52]  [ 800/1251]  eta: 0:05:56  lr: 0.003384  min_lr: 0.003384  loss: 3.6658 (3.5841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5561 (0.5510)  time: 0.7875  data: 0.0006  max mem: 69511
Epoch: [52]  [1000/1251]  eta: 0:03:18  lr: 0.003383  min_lr: 0.003383  loss: 3.6478 (3.5627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5770 (0.5548)  time: 0.7875  data: 0.0006  max mem: 69511
Epoch: [52]  [1200/1251]  eta: 0:00:40  lr: 0.003382  min_lr: 0.003382  loss: 3.3896 (3.5582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5476 (0.5575)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003381  min_lr: 0.003381  loss: 3.5507 (3.5552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5228 (0.5556)  time: 0.6680  data: 0.0005  max mem: 69511
Epoch: [52] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.003381  min_lr: 0.003381  loss: 3.5507 (3.5469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5228 (0.5556)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.8351 (0.8351)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 7.7967  data: 7.3598  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9987 (1.0148)  acc1: 79.2000 (80.8727)  acc5: 96.8000 (96.1455)  time: 1.0698  data: 0.6693  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2582 (1.2185)  acc1: 74.4000 (76.1333)  acc5: 92.8000 (93.4857)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3324 (1.2278)  acc1: 73.2000 (75.6640)  acc5: 92.0000 (93.2960)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6986 s / it)
* Acc@1 76.092 Acc@5 93.476 loss 1.212
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.09%
Epoch: [53]  [   0/1251]  eta: 1:14:14  lr: 0.003381  min_lr: 0.003381  loss: 2.9773 (2.9773)  weight_decay: 0.0500 (0.0500)  time: 3.5611  data: 2.7725  max mem: 69511
Epoch: [53]  [ 200/1251]  eta: 0:14:07  lr: 0.003380  min_lr: 0.003380  loss: 3.6632 (3.5509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5715 (0.5634)  time: 0.7954  data: 0.0004  max mem: 69511
Epoch: [53]  [ 400/1251]  eta: 0:11:18  lr: 0.003379  min_lr: 0.003379  loss: 3.7063 (3.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5805 (0.5663)  time: 0.7881  data: 0.0004  max mem: 69511
Epoch: [53]  [ 600/1251]  eta: 0:08:37  lr: 0.003378  min_lr: 0.003378  loss: 3.2862 (3.5474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4766 (0.5625)  time: 0.7878  data: 0.0004  max mem: 69511
Epoch: [53]  [ 800/1251]  eta: 0:05:57  lr: 0.003377  min_lr: 0.003377  loss: 3.7356 (3.5452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5603 (0.5602)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [53]  [1000/1251]  eta: 0:03:18  lr: 0.003376  min_lr: 0.003376  loss: 3.8094 (3.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5008 (0.5539)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [53]  [1200/1251]  eta: 0:00:40  lr: 0.003375  min_lr: 0.003375  loss: 3.7879 (3.5451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5620 (0.5520)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003374  min_lr: 0.003374  loss: 3.8670 (3.5482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4922 (0.5514)  time: 0.6733  data: 0.0005  max mem: 69511
Epoch: [53] Total time: 0:16:28 (0.7901 s / it)
Averaged stats: lr: 0.003374  min_lr: 0.003374  loss: 3.8670 (3.5475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4922 (0.5514)
Test:  [ 0/25]  eta: 0:02:58  loss: 1.0848 (1.0848)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 7.1527  data: 6.7232  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.1648 (1.1915)  acc1: 80.8000 (79.9636)  acc5: 95.6000 (95.7455)  time: 1.0111  data: 0.6115  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.4438 (1.4020)  acc1: 72.8000 (76.0000)  acc5: 93.2000 (93.1619)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.5568 (1.4108)  acc1: 72.0000 (75.5200)  acc5: 90.4000 (92.9920)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6751 s / it)
* Acc@1 75.746 Acc@5 93.318 loss 1.394
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 76.09%
Epoch: [54]  [   0/1251]  eta: 1:38:40  lr: 0.003374  min_lr: 0.003374  loss: 3.6847 (3.6847)  weight_decay: 0.0500 (0.0500)  time: 4.7327  data: 2.9775  max mem: 69511
Epoch: [54]  [ 200/1251]  eta: 0:14:09  lr: 0.003373  min_lr: 0.003373  loss: 3.6531 (3.5192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6151 (0.5890)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [54]  [ 400/1251]  eta: 0:11:18  lr: 0.003372  min_lr: 0.003372  loss: 3.6806 (3.5382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5122 (0.5547)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [54]  [ 600/1251]  eta: 0:08:37  lr: 0.003371  min_lr: 0.003371  loss: 3.7138 (3.5371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5461 (0.5427)  time: 0.7945  data: 0.0006  max mem: 69511
Epoch: [54]  [ 800/1251]  eta: 0:05:58  lr: 0.003370  min_lr: 0.003370  loss: 3.6409 (3.5497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4819 (0.5450)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [54]  [1000/1251]  eta: 0:03:18  lr: 0.003368  min_lr: 0.003368  loss: 3.6606 (3.5524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5115 (nan)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [54]  [1200/1251]  eta: 0:00:40  lr: 0.003367  min_lr: 0.003367  loss: 3.6617 (3.5447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5715 (nan)  time: 0.7841  data: 0.0005  max mem: 69511
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003367  min_lr: 0.003367  loss: 3.8332 (3.5443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5904 (nan)  time: 0.6652  data: 0.0007  max mem: 69511
Epoch: [54] Total time: 0:16:28 (0.7905 s / it)
Averaged stats: lr: 0.003367  min_lr: 0.003367  loss: 3.8332 (3.5328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5904 (nan)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.9417 (0.9417)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 7.7984  data: 7.3613  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0958 (1.1024)  acc1: 79.6000 (80.1818)  acc5: 96.8000 (95.9636)  time: 1.0671  data: 0.6695  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2831 (1.2850)  acc1: 72.4000 (75.8667)  acc5: 92.4000 (93.2000)  time: 0.3941  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3913 (1.2879)  acc1: 72.0000 (75.7280)  acc5: 91.2000 (93.1680)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7009 s / it)
* Acc@1 76.038 Acc@5 93.618 loss 1.274
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.09%
Epoch: [55]  [   0/1251]  eta: 1:47:14  lr: 0.003367  min_lr: 0.003367  loss: 3.9099 (3.9099)  weight_decay: 0.0500 (0.0500)  time: 5.1439  data: 4.3639  max mem: 69511
Epoch: [55]  [ 200/1251]  eta: 0:14:06  lr: 0.003366  min_lr: 0.003366  loss: 3.6918 (3.5360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5422 (0.5626)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [55]  [ 400/1251]  eta: 0:11:18  lr: 0.003364  min_lr: 0.003364  loss: 3.6396 (3.5379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5113 (0.5509)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [55]  [ 600/1251]  eta: 0:08:37  lr: 0.003363  min_lr: 0.003363  loss: 3.7655 (3.5616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5016 (0.5384)  time: 0.7882  data: 0.0004  max mem: 69511
Epoch: [55]  [ 800/1251]  eta: 0:05:57  lr: 0.003362  min_lr: 0.003362  loss: 3.8412 (3.5686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5771 (0.5412)  time: 0.7872  data: 0.0005  max mem: 69511
Epoch: [55]  [1000/1251]  eta: 0:03:18  lr: 0.003361  min_lr: 0.003361  loss: 3.5769 (3.5666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5776 (0.5438)  time: 0.7926  data: 0.0004  max mem: 69511
Epoch: [55]  [1200/1251]  eta: 0:00:40  lr: 0.003360  min_lr: 0.003360  loss: 3.6251 (3.5548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5322 (0.5440)  time: 0.7876  data: 0.0003  max mem: 69511
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003359  min_lr: 0.003359  loss: 3.5637 (3.5523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5127 (0.5424)  time: 0.6685  data: 0.0005  max mem: 69511
Epoch: [55] Total time: 0:16:28 (0.7906 s / it)
Averaged stats: lr: 0.003359  min_lr: 0.003359  loss: 3.5637 (3.5343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5127 (0.5424)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.8376 (0.8376)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 7.7767  data: 7.3592  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9833 (1.0073)  acc1: 79.6000 (80.7273)  acc5: 96.4000 (96.2545)  time: 1.0679  data: 0.6693  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1640 (1.1857)  acc1: 74.8000 (76.8571)  acc5: 93.2000 (93.8286)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3013 (1.2001)  acc1: 74.0000 (76.4160)  acc5: 92.0000 (93.6960)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6979 s / it)
* Acc@1 76.698 Acc@5 93.806 loss 1.195
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.70%
Epoch: [56]  [   0/1251]  eta: 1:25:29  lr: 0.003359  min_lr: 0.003359  loss: 4.2100 (4.2100)  weight_decay: 0.0500 (0.0500)  time: 4.1003  data: 3.3238  max mem: 69511
Epoch: [56]  [ 200/1251]  eta: 0:14:03  lr: 0.003358  min_lr: 0.003358  loss: 3.4049 (3.4843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6260 (0.5521)  time: 0.7915  data: 0.0006  max mem: 69511
Epoch: [56]  [ 400/1251]  eta: 0:11:18  lr: 0.003357  min_lr: 0.003357  loss: 3.4870 (3.5162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6034 (0.5657)  time: 0.7941  data: 0.0004  max mem: 69511
Epoch: [56]  [ 600/1251]  eta: 0:08:36  lr: 0.003355  min_lr: 0.003355  loss: 3.6579 (3.5384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5092 (0.5496)  time: 0.7879  data: 0.0004  max mem: 69511
Epoch: [56]  [ 800/1251]  eta: 0:05:57  lr: 0.003354  min_lr: 0.003354  loss: 3.7343 (3.5321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5379 (0.5497)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [56]  [1000/1251]  eta: 0:03:18  lr: 0.003353  min_lr: 0.003353  loss: 3.5001 (3.5420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5129 (0.5432)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [56]  [1200/1251]  eta: 0:00:40  lr: 0.003352  min_lr: 0.003352  loss: 3.7655 (3.5451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5604 (0.5455)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003351  min_lr: 0.003351  loss: 3.8709 (3.5468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5324 (0.5444)  time: 0.6667  data: 0.0005  max mem: 69511
Epoch: [56] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.003351  min_lr: 0.003351  loss: 3.8709 (3.5198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5324 (0.5444)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.8983 (0.8983)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 7.5053  data: 7.0824  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0217 (1.0473)  acc1: 81.2000 (81.0182)  acc5: 96.4000 (96.1818)  time: 1.0406  data: 0.6441  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2509 (1.2440)  acc1: 74.0000 (76.3429)  acc5: 92.0000 (93.6571)  time: 0.3941  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3288 (1.2486)  acc1: 74.0000 (76.1440)  acc5: 92.0000 (93.5200)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6892 s / it)
* Acc@1 76.454 Acc@5 93.668 loss 1.241
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.70%
Epoch: [57]  [   0/1251]  eta: 1:39:31  lr: 0.003351  min_lr: 0.003351  loss: 3.6341 (3.6341)  weight_decay: 0.0500 (0.0500)  time: 4.7734  data: 2.3948  max mem: 69511
Epoch: [57]  [ 200/1251]  eta: 0:14:07  lr: 0.003350  min_lr: 0.003350  loss: 3.4194 (3.4567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5053 (0.5381)  time: 0.7838  data: 0.0004  max mem: 69511
Epoch: [57]  [ 400/1251]  eta: 0:11:17  lr: 0.003349  min_lr: 0.003349  loss: 3.6425 (3.4758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5650 (0.5460)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [57]  [ 600/1251]  eta: 0:08:36  lr: 0.003348  min_lr: 0.003348  loss: 3.2560 (3.4956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5314 (0.5458)  time: 0.7929  data: 0.0004  max mem: 69511
Epoch: [57]  [ 800/1251]  eta: 0:05:57  lr: 0.003346  min_lr: 0.003346  loss: 3.5902 (3.5032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5556 (0.5445)  time: 0.7921  data: 0.0005  max mem: 69511
Epoch: [57]  [1000/1251]  eta: 0:03:18  lr: 0.003345  min_lr: 0.003345  loss: 3.6544 (3.5096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5134 (0.5435)  time: 0.7872  data: 0.0005  max mem: 69511
Epoch: [57]  [1200/1251]  eta: 0:00:40  lr: 0.003344  min_lr: 0.003344  loss: 3.4218 (3.5066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5235 (0.5380)  time: 0.7966  data: 0.0004  max mem: 69511
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003343  min_lr: 0.003343  loss: 3.5799 (3.5078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5235 (0.5397)  time: 0.6696  data: 0.0007  max mem: 69511
Epoch: [57] Total time: 0:16:28 (0.7899 s / it)
Averaged stats: lr: 0.003343  min_lr: 0.003343  loss: 3.5799 (3.5114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5235 (0.5397)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.8233 (0.8233)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 7.8060  data: 7.3762  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0294 (1.0179)  acc1: 82.0000 (80.9455)  acc5: 98.0000 (96.9091)  time: 1.0708  data: 0.6709  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2857 (1.2186)  acc1: 73.6000 (76.3619)  acc5: 92.4000 (94.0191)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3244 (1.2337)  acc1: 73.6000 (75.9200)  acc5: 92.4000 (93.8080)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6999 s / it)
* Acc@1 76.642 Acc@5 93.860 loss 1.220
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.70%
Epoch: [58]  [   0/1251]  eta: 1:40:58  lr: 0.003343  min_lr: 0.003343  loss: 3.4746 (3.4746)  weight_decay: 0.0500 (0.0500)  time: 4.8430  data: 2.2624  max mem: 69511
Epoch: [58]  [ 200/1251]  eta: 0:14:06  lr: 0.003342  min_lr: 0.003342  loss: 3.4501 (3.4604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5877 (nan)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [58]  [ 400/1251]  eta: 0:11:16  lr: 0.003341  min_lr: 0.003341  loss: 3.6376 (3.5216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5036 (nan)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [58]  [ 600/1251]  eta: 0:08:36  lr: 0.003339  min_lr: 0.003339  loss: 3.6466 (3.5222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4432 (nan)  time: 0.7929  data: 0.0004  max mem: 69511
Epoch: [58]  [ 800/1251]  eta: 0:05:57  lr: 0.003338  min_lr: 0.003338  loss: 3.6292 (3.5226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5011 (nan)  time: 0.7879  data: 0.0004  max mem: 69511
Epoch: [58]  [1000/1251]  eta: 0:03:18  lr: 0.003337  min_lr: 0.003337  loss: 3.3598 (3.5129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5240 (nan)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [58]  [1200/1251]  eta: 0:00:40  lr: 0.003335  min_lr: 0.003335  loss: 3.6800 (3.5204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5040 (nan)  time: 0.7943  data: 0.0004  max mem: 69511
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003335  min_lr: 0.003335  loss: 3.5728 (3.5196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5354 (nan)  time: 0.6688  data: 0.0005  max mem: 69511
Epoch: [58] Total time: 0:16:27 (0.7897 s / it)
Averaged stats: lr: 0.003335  min_lr: 0.003335  loss: 3.5728 (3.5052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5354 (nan)
Test:  [ 0/25]  eta: 0:03:10  loss: 0.8292 (0.8292)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 7.6190  data: 7.1985  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0120 (1.0171)  acc1: 81.6000 (80.7273)  acc5: 97.6000 (96.2546)  time: 1.0536  data: 0.6547  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1926 (1.1892)  acc1: 72.4000 (76.1714)  acc5: 92.4000 (93.7333)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3052 (1.2037)  acc1: 72.4000 (75.5680)  acc5: 92.4000 (93.5200)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6916 s / it)
* Acc@1 76.326 Acc@5 93.722 loss 1.189
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.70%
Epoch: [59]  [   0/1251]  eta: 1:31:37  lr: 0.003335  min_lr: 0.003335  loss: 3.9849 (3.9849)  weight_decay: 0.0500 (0.0500)  time: 4.3943  data: 3.0195  max mem: 69511
Epoch: [59]  [ 200/1251]  eta: 0:14:04  lr: 0.003334  min_lr: 0.003334  loss: 3.6238 (3.4979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5413 (0.5709)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [59]  [ 400/1251]  eta: 0:11:18  lr: 0.003332  min_lr: 0.003332  loss: 3.7187 (3.5051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5136 (0.5595)  time: 0.7951  data: 0.0004  max mem: 69511
Epoch: [59]  [ 600/1251]  eta: 0:08:36  lr: 0.003331  min_lr: 0.003331  loss: 3.7120 (3.4992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5059 (0.5542)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [59]  [ 800/1251]  eta: 0:05:57  lr: 0.003330  min_lr: 0.003330  loss: 3.4177 (3.5025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5068 (0.5514)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [59]  [1000/1251]  eta: 0:03:18  lr: 0.003328  min_lr: 0.003328  loss: 3.5776 (3.4976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5410 (0.5493)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [59]  [1200/1251]  eta: 0:00:40  lr: 0.003327  min_lr: 0.003327  loss: 3.5156 (3.5003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5055 (0.5527)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003327  min_lr: 0.003327  loss: 3.5790 (3.4973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5048 (0.5523)  time: 0.6682  data: 0.0005  max mem: 69511
Epoch: [59] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.003327  min_lr: 0.003327  loss: 3.5790 (3.5039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5048 (0.5523)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.8653 (0.8653)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 7.4617  data: 7.0446  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0041 (1.0283)  acc1: 80.0000 (81.4182)  acc5: 96.4000 (96.2546)  time: 1.0394  data: 0.6407  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2048 (1.2029)  acc1: 75.6000 (77.0476)  acc5: 93.2000 (93.7524)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3018 (1.2172)  acc1: 74.4000 (76.4800)  acc5: 92.0000 (93.5680)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6853 s / it)
* Acc@1 76.878 Acc@5 93.810 loss 1.206
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.88%
Epoch: [60]  [   0/1251]  eta: 1:38:12  lr: 0.003327  min_lr: 0.003327  loss: 3.8355 (3.8355)  weight_decay: 0.0500 (0.0500)  time: 4.7105  data: 3.9193  max mem: 69511
Epoch: [60]  [ 200/1251]  eta: 0:14:07  lr: 0.003325  min_lr: 0.003325  loss: 3.3343 (3.4860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6025 (0.5752)  time: 0.7917  data: 0.0005  max mem: 69511
Epoch: [60]  [ 400/1251]  eta: 0:11:17  lr: 0.003324  min_lr: 0.003324  loss: 3.7842 (3.4895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5487 (0.5756)  time: 0.7823  data: 0.0004  max mem: 69511
Epoch: [60]  [ 600/1251]  eta: 0:08:35  lr: 0.003323  min_lr: 0.003323  loss: 3.5800 (3.4989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5626 (0.5664)  time: 0.7820  data: 0.0006  max mem: 69511
Epoch: [60]  [ 800/1251]  eta: 0:05:56  lr: 0.003321  min_lr: 0.003321  loss: 3.5388 (3.4974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4921 (0.5569)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [60]  [1000/1251]  eta: 0:03:18  lr: 0.003320  min_lr: 0.003320  loss: 3.7095 (3.5055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.5607)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [60]  [1200/1251]  eta: 0:00:40  lr: 0.003318  min_lr: 0.003318  loss: 3.6487 (3.5153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5541 (0.5560)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003318  min_lr: 0.003318  loss: 3.6626 (3.5193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5343 (0.5568)  time: 0.6707  data: 0.0006  max mem: 69511
Epoch: [60] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.003318  min_lr: 0.003318  loss: 3.6626 (3.4962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5343 (0.5568)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.9230 (0.9230)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 7.3812  data: 6.9536  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0158 (1.0510)  acc1: 82.8000 (81.6364)  acc5: 97.2000 (96.2909)  time: 1.0297  data: 0.6324  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2940 (1.2578)  acc1: 75.2000 (77.2571)  acc5: 93.2000 (94.0381)  time: 0.3947  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4121 (1.2670)  acc1: 74.4000 (76.9440)  acc5: 93.2000 (93.8720)  time: 0.3948  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6869 s / it)
* Acc@1 77.038 Acc@5 94.080 loss 1.255
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.04%
Epoch: [61]  [   0/1251]  eta: 1:17:07  lr: 0.003318  min_lr: 0.003318  loss: 3.6124 (3.6124)  weight_decay: 0.0500 (0.0500)  time: 3.6992  data: 2.9185  max mem: 69511
Epoch: [61]  [ 200/1251]  eta: 0:14:04  lr: 0.003317  min_lr: 0.003317  loss: 3.4809 (3.5304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4958 (0.5413)  time: 0.7937  data: 0.0004  max mem: 69511
Epoch: [61]  [ 400/1251]  eta: 0:11:16  lr: 0.003315  min_lr: 0.003315  loss: 3.6651 (3.4845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5706 (0.5607)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [61]  [ 600/1251]  eta: 0:08:36  lr: 0.003314  min_lr: 0.003314  loss: 3.6258 (3.4676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5002 (0.5417)  time: 0.7917  data: 0.0006  max mem: 69511
Epoch: [61]  [ 800/1251]  eta: 0:05:57  lr: 0.003313  min_lr: 0.003313  loss: 3.6137 (3.4659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5000 (0.5388)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [61]  [1000/1251]  eta: 0:03:18  lr: 0.003311  min_lr: 0.003311  loss: 3.4955 (3.4761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5428 (0.5454)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [61]  [1200/1251]  eta: 0:00:40  lr: 0.003310  min_lr: 0.003310  loss: 3.7233 (3.4830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5054 (0.5425)  time: 0.7884  data: 0.0004  max mem: 69511
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003309  min_lr: 0.003309  loss: 3.7530 (3.4851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5348 (0.5443)  time: 0.6682  data: 0.0004  max mem: 69511
Epoch: [61] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.003309  min_lr: 0.003309  loss: 3.7530 (3.4895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5348 (0.5443)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.8583 (0.8583)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 7.4350  data: 6.9825  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9875 (0.9955)  acc1: 82.4000 (81.3455)  acc5: 96.4000 (96.5091)  time: 1.0372  data: 0.6351  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2316 (1.2017)  acc1: 73.6000 (76.7429)  acc5: 93.6000 (94.1333)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3031 (1.2130)  acc1: 73.6000 (76.4640)  acc5: 92.0000 (93.7600)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6853 s / it)
* Acc@1 76.978 Acc@5 93.974 loss 1.201
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.04%
Epoch: [62]  [   0/1251]  eta: 1:42:52  lr: 0.003309  min_lr: 0.003309  loss: 3.5897 (3.5897)  weight_decay: 0.0500 (0.0500)  time: 4.9338  data: 4.1368  max mem: 69511
Epoch: [62]  [ 200/1251]  eta: 0:14:09  lr: 0.003308  min_lr: 0.003308  loss: 3.6600 (3.4870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5453 (0.5418)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [62]  [ 400/1251]  eta: 0:11:19  lr: 0.003306  min_lr: 0.003306  loss: 3.5017 (3.5203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5538 (0.5532)  time: 0.7979  data: 0.0004  max mem: 69511
Epoch: [62]  [ 600/1251]  eta: 0:08:38  lr: 0.003305  min_lr: 0.003305  loss: 3.7053 (3.5204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5555 (0.5577)  time: 0.7980  data: 0.0004  max mem: 69511
Epoch: [62]  [ 800/1251]  eta: 0:05:57  lr: 0.003304  min_lr: 0.003304  loss: 3.1464 (3.5005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5625 (0.5541)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [62]  [1000/1251]  eta: 0:03:18  lr: 0.003302  min_lr: 0.003302  loss: 3.6403 (3.4945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5403 (0.5525)  time: 0.7944  data: 0.0004  max mem: 69511
Epoch: [62]  [1200/1251]  eta: 0:00:40  lr: 0.003301  min_lr: 0.003301  loss: 3.4900 (3.4938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5678 (0.5566)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003300  min_lr: 0.003300  loss: 3.7195 (3.4955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5341 (0.5564)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [62] Total time: 0:16:29 (0.7906 s / it)
Averaged stats: lr: 0.003300  min_lr: 0.003300  loss: 3.7195 (3.4809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5341 (0.5564)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.8889 (0.8889)  acc1: 84.4000 (84.4000)  acc5: 98.8000 (98.8000)  time: 7.9283  data: 7.5001  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0812 (1.0647)  acc1: 80.0000 (80.6545)  acc5: 97.2000 (96.8727)  time: 1.0819  data: 0.6821  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2876 (1.2606)  acc1: 75.2000 (76.9905)  acc5: 93.6000 (94.4191)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.4088 (1.2652)  acc1: 74.8000 (76.8000)  acc5: 92.0000 (94.1440)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7030 s / it)
* Acc@1 77.074 Acc@5 94.032 loss 1.251
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.07%
Epoch: [63]  [   0/1251]  eta: 1:24:29  lr: 0.003300  min_lr: 0.003300  loss: 3.2153 (3.2153)  weight_decay: 0.0500 (0.0500)  time: 4.0523  data: 3.2567  max mem: 69511
Epoch: [63]  [ 200/1251]  eta: 0:14:02  lr: 0.003299  min_lr: 0.003299  loss: 3.6922 (3.4930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5745 (0.5734)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [63]  [ 400/1251]  eta: 0:11:17  lr: 0.003297  min_lr: 0.003297  loss: 3.5823 (3.4816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5571 (0.5735)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [63]  [ 600/1251]  eta: 0:08:36  lr: 0.003296  min_lr: 0.003296  loss: 3.1669 (3.4782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6047 (0.5858)  time: 0.7876  data: 0.0005  max mem: 69511
Epoch: [63]  [ 800/1251]  eta: 0:05:57  lr: 0.003294  min_lr: 0.003294  loss: 3.6158 (3.4724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5698 (0.5784)  time: 0.7895  data: 0.0005  max mem: 69511
Epoch: [63]  [1000/1251]  eta: 0:03:18  lr: 0.003293  min_lr: 0.003293  loss: 3.4750 (3.4709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4960 (0.5678)  time: 0.7920  data: 0.0004  max mem: 69511
Epoch: [63]  [1200/1251]  eta: 0:00:40  lr: 0.003291  min_lr: 0.003291  loss: 3.5407 (3.4674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5112 (0.5628)  time: 0.7881  data: 0.0004  max mem: 69511
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003291  min_lr: 0.003291  loss: 3.6215 (3.4701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5447 (0.5621)  time: 0.6694  data: 0.0005  max mem: 69511
Epoch: [63] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.003291  min_lr: 0.003291  loss: 3.6215 (3.4788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5447 (0.5621)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7966 (0.7966)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 7.5171  data: 7.0917  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0125 (1.0109)  acc1: 82.0000 (81.0182)  acc5: 97.2000 (96.6545)  time: 1.0446  data: 0.6450  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2019 (1.2100)  acc1: 74.4000 (76.6286)  acc5: 93.2000 (94.0952)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3433 (1.2181)  acc1: 72.8000 (76.3840)  acc5: 92.8000 (93.9360)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6879 s / it)
* Acc@1 76.950 Acc@5 93.788 loss 1.207
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.07%
Epoch: [64]  [   0/1251]  eta: 1:42:17  lr: 0.003291  min_lr: 0.003291  loss: 3.5763 (3.5763)  weight_decay: 0.0500 (0.0500)  time: 4.9058  data: 3.3610  max mem: 69511
Epoch: [64]  [ 200/1251]  eta: 0:14:12  lr: 0.003290  min_lr: 0.003290  loss: 3.2634 (3.4896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4917 (0.5361)  time: 0.7940  data: 0.0005  max mem: 69511
Epoch: [64]  [ 400/1251]  eta: 0:11:20  lr: 0.003288  min_lr: 0.003288  loss: 3.3348 (3.4916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4963 (0.5499)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [64]  [ 600/1251]  eta: 0:08:38  lr: 0.003287  min_lr: 0.003287  loss: 3.7565 (3.4931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5481 (0.5482)  time: 0.7896  data: 0.0005  max mem: 69511
Epoch: [64]  [ 800/1251]  eta: 0:05:58  lr: 0.003285  min_lr: 0.003285  loss: 3.5132 (3.4979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4922 (0.5482)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [64]  [1000/1251]  eta: 0:03:19  lr: 0.003284  min_lr: 0.003284  loss: 3.3090 (3.4829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5498 (0.5448)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [64]  [1200/1251]  eta: 0:00:40  lr: 0.003282  min_lr: 0.003282  loss: 3.6529 (3.4719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5421 (0.5456)  time: 0.7928  data: 0.0004  max mem: 69511
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.6313 (3.4710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5964 (0.5481)  time: 0.6724  data: 0.0005  max mem: 69511
Epoch: [64] Total time: 0:16:29 (0.7908 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.6313 (3.4533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5964 (0.5481)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.8580 (0.8580)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 7.5622  data: 7.1345  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9175 (0.9888)  acc1: 82.0000 (81.4909)  acc5: 97.2000 (96.6546)  time: 1.0486  data: 0.6488  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2305 (1.1514)  acc1: 76.0000 (77.1238)  acc5: 92.8000 (94.2286)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2626 (1.1646)  acc1: 74.8000 (76.6880)  acc5: 92.0000 (94.0640)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6886 s / it)
* Acc@1 77.392 Acc@5 94.222 loss 1.157
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.39%
Epoch: [65]  [   0/1251]  eta: 1:34:19  lr: 0.003282  min_lr: 0.003282  loss: 3.1049 (3.1049)  weight_decay: 0.0500 (0.0500)  time: 4.5237  data: 3.7276  max mem: 69511
Epoch: [65]  [ 200/1251]  eta: 0:14:07  lr: 0.003280  min_lr: 0.003280  loss: 3.4986 (3.3820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5348 (0.5639)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [65]  [ 400/1251]  eta: 0:11:17  lr: 0.003279  min_lr: 0.003279  loss: 3.2666 (3.4341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4891 (0.5591)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [65]  [ 600/1251]  eta: 0:08:36  lr: 0.003277  min_lr: 0.003277  loss: 3.3829 (3.4259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5685 (0.5571)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [65]  [ 800/1251]  eta: 0:05:57  lr: 0.003276  min_lr: 0.003276  loss: 3.7613 (3.4395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5373 (0.5540)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [65]  [1000/1251]  eta: 0:03:18  lr: 0.003274  min_lr: 0.003274  loss: 3.3642 (3.4365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5144 (0.5513)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [65]  [1200/1251]  eta: 0:00:40  lr: 0.003272  min_lr: 0.003272  loss: 3.5115 (3.4455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6094 (0.5561)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003272  min_lr: 0.003272  loss: 3.4673 (3.4423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6094 (0.5577)  time: 0.6697  data: 0.0005  max mem: 69511
Epoch: [65] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.003272  min_lr: 0.003272  loss: 3.4673 (3.4625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6094 (0.5577)
Test:  [ 0/25]  eta: 0:02:49  loss: 0.8045 (0.8045)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 6.7671  data: 6.3418  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9968 (1.0025)  acc1: 82.4000 (81.2000)  acc5: 97.2000 (96.3273)  time: 0.9766  data: 0.5768  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2378 (1.2062)  acc1: 74.4000 (77.1619)  acc5: 92.4000 (93.8476)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3490 (1.2259)  acc1: 74.8000 (76.7360)  acc5: 92.0000 (93.5680)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6579 s / it)
* Acc@1 76.896 Acc@5 93.862 loss 1.224
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.39%
Epoch: [66]  [   0/1251]  eta: 1:36:35  lr: 0.003272  min_lr: 0.003272  loss: 2.6884 (2.6884)  weight_decay: 0.0500 (0.0500)  time: 4.6326  data: 2.9458  max mem: 69511
Epoch: [66]  [ 200/1251]  eta: 0:14:06  lr: 0.003271  min_lr: 0.003271  loss: 3.2990 (3.3725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5787 (0.5573)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [66]  [ 400/1251]  eta: 0:11:18  lr: 0.003269  min_lr: 0.003269  loss: 3.1376 (3.4427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5378 (0.5494)  time: 0.7952  data: 0.0004  max mem: 69511
Epoch: [66]  [ 600/1251]  eta: 0:08:37  lr: 0.003267  min_lr: 0.003267  loss: 3.6080 (3.4440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4934 (0.5528)  time: 0.7944  data: 0.0006  max mem: 69511
Epoch: [66]  [ 800/1251]  eta: 0:05:57  lr: 0.003266  min_lr: 0.003266  loss: 3.5135 (3.4446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5810 (0.5543)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [66]  [1000/1251]  eta: 0:03:18  lr: 0.003264  min_lr: 0.003264  loss: 3.4340 (3.4477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5570 (0.5538)  time: 0.7953  data: 0.0004  max mem: 69511
Epoch: [66]  [1200/1251]  eta: 0:00:40  lr: 0.003263  min_lr: 0.003263  loss: 3.5035 (3.4575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5517 (0.5552)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003262  min_lr: 0.003262  loss: 3.7043 (3.4593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5012 (0.5537)  time: 0.6685  data: 0.0007  max mem: 69511
Epoch: [66] Total time: 0:16:28 (0.7902 s / it)
Averaged stats: lr: 0.003262  min_lr: 0.003262  loss: 3.7043 (3.4567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5012 (0.5537)
Test:  [ 0/25]  eta: 0:03:02  loss: 0.8009 (0.8009)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 7.3111  data: 6.8949  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9372 (1.0097)  acc1: 82.8000 (81.7818)  acc5: 97.2000 (96.8000)  time: 1.0259  data: 0.6271  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1932 (1.1659)  acc1: 76.0000 (77.9619)  acc5: 94.4000 (94.6476)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2025 (1.1738)  acc1: 76.0000 (77.8080)  acc5: 93.6000 (94.3840)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6812 s / it)
* Acc@1 77.830 Acc@5 94.472 loss 1.169
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.83%
Epoch: [67]  [   0/1251]  eta: 1:17:52  lr: 0.003262  min_lr: 0.003262  loss: 3.5765 (3.5765)  weight_decay: 0.0500 (0.0500)  time: 3.7352  data: 2.9353  max mem: 69511
Epoch: [67]  [ 200/1251]  eta: 0:14:01  lr: 0.003261  min_lr: 0.003261  loss: 3.6256 (3.4217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5001 (0.5421)  time: 0.7909  data: 0.0003  max mem: 69511
Epoch: [67]  [ 400/1251]  eta: 0:11:17  lr: 0.003259  min_lr: 0.003259  loss: 3.6059 (3.4147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5854 (0.5550)  time: 0.7960  data: 0.0004  max mem: 69511
Epoch: [67]  [ 600/1251]  eta: 0:08:36  lr: 0.003258  min_lr: 0.003258  loss: 3.3327 (3.4192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5668 (0.5551)  time: 0.7878  data: 0.0005  max mem: 69511
Epoch: [67]  [ 800/1251]  eta: 0:05:57  lr: 0.003256  min_lr: 0.003256  loss: 3.6848 (3.4326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5040 (0.5597)  time: 0.7929  data: 0.0004  max mem: 69511
Epoch: [67]  [1000/1251]  eta: 0:03:18  lr: 0.003254  min_lr: 0.003254  loss: 3.5574 (3.4408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5063 (0.5524)  time: 0.7882  data: 0.0006  max mem: 69511
Epoch: [67]  [1200/1251]  eta: 0:00:40  lr: 0.003253  min_lr: 0.003253  loss: 3.2955 (3.4258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5497 (0.5556)  time: 0.7837  data: 0.0004  max mem: 69511
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003252  min_lr: 0.003252  loss: 3.5866 (3.4247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5244 (0.5554)  time: 0.6655  data: 0.0005  max mem: 69511
Epoch: [67] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.003252  min_lr: 0.003252  loss: 3.5866 (3.4468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5244 (0.5554)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.7140 (0.7140)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 7.4304  data: 7.0151  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8603 (0.8977)  acc1: 80.0000 (81.6000)  acc5: 96.4000 (96.3273)  time: 1.0346  data: 0.6380  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1141 (1.1023)  acc1: 76.0000 (77.8286)  acc5: 93.2000 (94.2286)  time: 0.3951  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2464 (1.1086)  acc1: 75.2000 (77.3920)  acc5: 93.2000 (94.2400)  time: 0.3951  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6887 s / it)
* Acc@1 77.450 Acc@5 94.326 loss 1.102
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.83%
Epoch: [68]  [   0/1251]  eta: 1:37:59  lr: 0.003252  min_lr: 0.003252  loss: 3.8063 (3.8063)  weight_decay: 0.0500 (0.0500)  time: 4.7001  data: 3.9024  max mem: 69511
Epoch: [68]  [ 200/1251]  eta: 0:14:09  lr: 0.003251  min_lr: 0.003251  loss: 2.9741 (3.4361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5515 (0.5648)  time: 0.7902  data: 0.0004  max mem: 69511
Epoch: [68]  [ 400/1251]  eta: 0:11:18  lr: 0.003249  min_lr: 0.003249  loss: 3.3986 (3.4388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5430 (0.5697)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [68]  [ 600/1251]  eta: 0:08:36  lr: 0.003247  min_lr: 0.003247  loss: 3.5546 (3.4300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5182 (0.5623)  time: 0.7939  data: 0.0005  max mem: 69511
Epoch: [68]  [ 800/1251]  eta: 0:05:57  lr: 0.003246  min_lr: 0.003246  loss: 3.1006 (3.4327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5100 (0.5580)  time: 0.7896  data: 0.0004  max mem: 69511
Epoch: [68]  [1000/1251]  eta: 0:03:18  lr: 0.003244  min_lr: 0.003244  loss: 3.3774 (3.4234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5456 (0.5666)  time: 0.7882  data: 0.0004  max mem: 69511
Epoch: [68]  [1200/1251]  eta: 0:00:40  lr: 0.003243  min_lr: 0.003243  loss: 3.5438 (3.4309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5382 (0.5657)  time: 0.7944  data: 0.0006  max mem: 69511
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003242  min_lr: 0.003242  loss: 3.7879 (3.4305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5287 (0.5644)  time: 0.6731  data: 0.0006  max mem: 69511
Epoch: [68] Total time: 0:16:28 (0.7904 s / it)
Averaged stats: lr: 0.003242  min_lr: 0.003242  loss: 3.7879 (3.4366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5287 (0.5644)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.9294 (0.9294)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 7.7179  data: 7.3028  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9564 (1.0078)  acc1: 81.6000 (81.0182)  acc5: 97.2000 (96.6909)  time: 1.0627  data: 0.6641  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2073 (1.1708)  acc1: 74.8000 (77.3333)  acc5: 92.8000 (94.1905)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2430 (1.1821)  acc1: 74.8000 (76.8800)  acc5: 92.8000 (94.1440)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6961 s / it)
* Acc@1 77.570 Acc@5 94.350 loss 1.172
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.83%
Epoch: [69]  [   0/1251]  eta: 1:43:33  lr: 0.003242  min_lr: 0.003242  loss: 3.6845 (3.6845)  weight_decay: 0.0500 (0.0500)  time: 4.9672  data: 3.0274  max mem: 69511
Epoch: [69]  [ 200/1251]  eta: 0:14:10  lr: 0.003241  min_lr: 0.003241  loss: 3.3546 (3.3970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5701 (0.5732)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [69]  [ 400/1251]  eta: 0:11:19  lr: 0.003239  min_lr: 0.003239  loss: 3.5681 (3.3945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5957 (0.5741)  time: 0.7959  data: 0.0007  max mem: 69511
Epoch: [69]  [ 600/1251]  eta: 0:08:38  lr: 0.003237  min_lr: 0.003237  loss: 3.4496 (3.4290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5887 (0.5673)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [69]  [ 800/1251]  eta: 0:05:57  lr: 0.003236  min_lr: 0.003236  loss: 3.8074 (3.4322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5127 (0.5634)  time: 0.7830  data: 0.0005  max mem: 69511
Epoch: [69]  [1000/1251]  eta: 0:03:18  lr: 0.003234  min_lr: 0.003234  loss: 3.5354 (3.4347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5349 (0.5572)  time: 0.7906  data: 0.0005  max mem: 69511
Epoch: [69]  [1200/1251]  eta: 0:00:40  lr: 0.003232  min_lr: 0.003232  loss: 3.5069 (3.4403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5255 (0.5568)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003232  min_lr: 0.003232  loss: 3.5870 (3.4398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5460 (0.5578)  time: 0.6640  data: 0.0006  max mem: 69511
Epoch: [69] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003232  min_lr: 0.003232  loss: 3.5870 (3.4448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5460 (0.5578)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.7320 (0.7320)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 7.8665  data: 7.4326  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9176 (0.9309)  acc1: 83.2000 (82.1091)  acc5: 96.8000 (96.5818)  time: 1.0730  data: 0.6759  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1088 (1.1272)  acc1: 74.8000 (77.9810)  acc5: 93.6000 (94.1905)  time: 0.3937  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2072 (1.1304)  acc1: 75.6000 (77.6160)  acc5: 92.8000 (94.2720)  time: 0.3937  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7033 s / it)
* Acc@1 77.596 Acc@5 94.320 loss 1.127
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.83%
Epoch: [70]  [   0/1251]  eta: 1:32:14  lr: 0.003232  min_lr: 0.003232  loss: 3.3918 (3.3918)  weight_decay: 0.0500 (0.0500)  time: 4.4243  data: 3.1344  max mem: 69511
Epoch: [70]  [ 200/1251]  eta: 0:14:03  lr: 0.003230  min_lr: 0.003230  loss: 3.4827 (3.3644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6047 (0.6095)  time: 0.7850  data: 0.0006  max mem: 69511
Epoch: [70]  [ 400/1251]  eta: 0:11:16  lr: 0.003228  min_lr: 0.003228  loss: 2.9951 (3.3805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5605 (0.5965)  time: 0.7911  data: 0.0005  max mem: 69511
Epoch: [70]  [ 600/1251]  eta: 0:08:36  lr: 0.003227  min_lr: 0.003227  loss: 3.4348 (3.4067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5906 (0.5851)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [70]  [ 800/1251]  eta: 0:05:57  lr: 0.003225  min_lr: 0.003225  loss: 3.2491 (3.4018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5621 (0.5854)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [70]  [1000/1251]  eta: 0:03:18  lr: 0.003223  min_lr: 0.003223  loss: 3.5771 (3.4107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (0.5798)  time: 0.7912  data: 0.0004  max mem: 69511
Epoch: [70]  [1200/1251]  eta: 0:00:40  lr: 0.003222  min_lr: 0.003222  loss: 3.5288 (3.4148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5513 (0.5769)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003221  min_lr: 0.003221  loss: 3.2040 (3.4130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5297 (0.5752)  time: 0.6683  data: 0.0005  max mem: 69511
Epoch: [70] Total time: 0:16:27 (0.7890 s / it)
Averaged stats: lr: 0.003221  min_lr: 0.003221  loss: 3.2040 (3.4308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5297 (0.5752)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7331 (0.7331)  acc1: 85.2000 (85.2000)  acc5: 98.8000 (98.8000)  time: 7.7073  data: 7.2900  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8909 (0.8806)  acc1: 84.4000 (82.1091)  acc5: 96.8000 (97.0182)  time: 1.0617  data: 0.6630  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0457 (1.0697)  acc1: 76.8000 (78.0571)  acc5: 94.4000 (94.6476)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1545 (1.0704)  acc1: 75.2000 (77.8560)  acc5: 92.8000 (94.4320)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6940 s / it)
* Acc@1 77.734 Acc@5 94.436 loss 1.068
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.83%
Epoch: [71]  [   0/1251]  eta: 1:41:52  lr: 0.003221  min_lr: 0.003221  loss: 3.3330 (3.3330)  weight_decay: 0.0500 (0.0500)  time: 4.8859  data: 2.9473  max mem: 69511
Epoch: [71]  [ 200/1251]  eta: 0:14:08  lr: 0.003220  min_lr: 0.003220  loss: 3.7000 (3.3775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4902 (nan)  time: 0.7861  data: 0.0006  max mem: 69511
Epoch: [71]  [ 400/1251]  eta: 0:11:20  lr: 0.003218  min_lr: 0.003218  loss: 3.6688 (3.4261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6183 (nan)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [71]  [ 600/1251]  eta: 0:08:37  lr: 0.003216  min_lr: 0.003216  loss: 3.5486 (3.4059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5326 (nan)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [71]  [ 800/1251]  eta: 0:05:58  lr: 0.003214  min_lr: 0.003214  loss: 3.4118 (3.3994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5233 (nan)  time: 0.8002  data: 0.0005  max mem: 69511
Epoch: [71]  [1000/1251]  eta: 0:03:18  lr: 0.003213  min_lr: 0.003213  loss: 3.6894 (3.4175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5209 (nan)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [71]  [1200/1251]  eta: 0:00:40  lr: 0.003211  min_lr: 0.003211  loss: 3.5152 (3.4199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4892 (nan)  time: 0.7878  data: 0.0006  max mem: 69511
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003211  min_lr: 0.003211  loss: 3.5939 (3.4179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5041 (nan)  time: 0.6690  data: 0.0006  max mem: 69511
Epoch: [71] Total time: 0:16:29 (0.7906 s / it)
Averaged stats: lr: 0.003211  min_lr: 0.003211  loss: 3.5939 (3.4242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5041 (nan)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.8221 (0.8221)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 7.8186  data: 7.4061  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9155 (0.9615)  acc1: 83.6000 (83.0909)  acc5: 97.2000 (96.7273)  time: 1.0721  data: 0.6735  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1718 (1.1434)  acc1: 77.2000 (78.8000)  acc5: 94.0000 (94.6667)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2565 (1.1544)  acc1: 76.0000 (78.3200)  acc5: 93.2000 (94.5120)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6995 s / it)
* Acc@1 77.690 Acc@5 94.512 loss 1.151
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.83%
Epoch: [72]  [   0/1251]  eta: 1:46:23  lr: 0.003211  min_lr: 0.003211  loss: 3.8265 (3.8265)  weight_decay: 0.0500 (0.0500)  time: 5.1024  data: 4.1552  max mem: 69511
Epoch: [72]  [ 200/1251]  eta: 0:14:11  lr: 0.003209  min_lr: 0.003209  loss: 3.6269 (3.4523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5636 (0.5452)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [72]  [ 400/1251]  eta: 0:11:18  lr: 0.003207  min_lr: 0.003207  loss: 3.4453 (3.4321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5161 (0.5356)  time: 0.7839  data: 0.0005  max mem: 69511
Epoch: [72]  [ 600/1251]  eta: 0:08:36  lr: 0.003205  min_lr: 0.003205  loss: 3.6825 (3.4360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6047 (0.5566)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [72]  [ 800/1251]  eta: 0:05:57  lr: 0.003204  min_lr: 0.003204  loss: 3.7612 (3.4503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5415 (0.5589)  time: 0.7909  data: 0.0005  max mem: 69511
Epoch: [72]  [1000/1251]  eta: 0:03:18  lr: 0.003202  min_lr: 0.003202  loss: 3.5452 (3.4434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5556 (0.5589)  time: 0.7848  data: 0.0004  max mem: 69511
Epoch: [72]  [1200/1251]  eta: 0:00:40  lr: 0.003200  min_lr: 0.003200  loss: 3.3711 (3.4293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5407 (0.5597)  time: 0.7914  data: 0.0004  max mem: 69511
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 3.4767 (3.4237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5576 (0.5592)  time: 0.6679  data: 0.0005  max mem: 69511
Epoch: [72] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 3.4767 (3.4183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5576 (0.5592)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.8148 (0.8148)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 7.1924  data: 6.7705  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9489 (1.0000)  acc1: 83.6000 (81.6727)  acc5: 96.8000 (96.8000)  time: 1.0153  data: 0.6158  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1583 (1.1905)  acc1: 75.2000 (77.7333)  acc5: 93.6000 (94.1714)  time: 0.3974  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3491 (1.2003)  acc1: 75.2000 (77.3920)  acc5: 92.8000 (94.1280)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6756 s / it)
* Acc@1 77.892 Acc@5 94.458 loss 1.186
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.89%
Epoch: [73]  [   0/1251]  eta: 1:36:03  lr: 0.003200  min_lr: 0.003200  loss: 3.2027 (3.2027)  weight_decay: 0.0500 (0.0500)  time: 4.6074  data: 3.8228  max mem: 69511
Epoch: [73]  [ 200/1251]  eta: 0:14:07  lr: 0.003198  min_lr: 0.003198  loss: 3.4903 (3.4372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6041 (0.5900)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [73]  [ 400/1251]  eta: 0:11:16  lr: 0.003196  min_lr: 0.003196  loss: 3.4971 (3.4231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5388 (0.5622)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [73]  [ 600/1251]  eta: 0:08:36  lr: 0.003194  min_lr: 0.003194  loss: 3.6029 (3.4036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5323 (0.5543)  time: 0.7852  data: 0.0006  max mem: 69511
Epoch: [73]  [ 800/1251]  eta: 0:05:56  lr: 0.003193  min_lr: 0.003193  loss: 3.5267 (3.4068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5841 (0.5554)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [73]  [1000/1251]  eta: 0:03:18  lr: 0.003191  min_lr: 0.003191  loss: 3.5320 (3.4183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5519 (0.5605)  time: 0.7901  data: 0.0005  max mem: 69511
Epoch: [73]  [1200/1251]  eta: 0:00:40  lr: 0.003189  min_lr: 0.003189  loss: 3.6954 (3.4220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4627 (0.5592)  time: 0.7925  data: 0.0005  max mem: 69511
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003189  min_lr: 0.003189  loss: 3.7465 (3.4209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4907 (0.5570)  time: 0.6667  data: 0.0007  max mem: 69511
Epoch: [73] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.003189  min_lr: 0.003189  loss: 3.7465 (3.4154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4907 (0.5570)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.9189 (0.9189)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 7.6822  data: 7.2338  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0814 (1.1022)  acc1: 83.6000 (81.6000)  acc5: 96.8000 (96.3636)  time: 1.0589  data: 0.6579  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.2895 (1.2620)  acc1: 76.0000 (77.9048)  acc5: 93.2000 (94.3619)  time: 0.3966  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3511 (1.2781)  acc1: 76.0000 (77.3600)  acc5: 92.8000 (94.2240)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6945 s / it)
* Acc@1 77.658 Acc@5 94.348 loss 1.272
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.89%
Epoch: [74]  [   0/1251]  eta: 1:39:45  lr: 0.003189  min_lr: 0.003189  loss: 3.5307 (3.5307)  weight_decay: 0.0500 (0.0500)  time: 4.7846  data: 3.9986  max mem: 69511
Epoch: [74]  [ 200/1251]  eta: 0:14:06  lr: 0.003187  min_lr: 0.003187  loss: 3.5480 (3.4261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5333 (0.5713)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [74]  [ 400/1251]  eta: 0:11:19  lr: 0.003185  min_lr: 0.003185  loss: 3.5263 (3.4254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4959 (0.5522)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [74]  [ 600/1251]  eta: 0:08:37  lr: 0.003183  min_lr: 0.003183  loss: 3.2980 (3.4204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5165 (0.5518)  time: 0.7881  data: 0.0006  max mem: 69511
Epoch: [74]  [ 800/1251]  eta: 0:05:57  lr: 0.003181  min_lr: 0.003181  loss: 3.5570 (3.4118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5921 (0.5658)  time: 0.7892  data: 0.0005  max mem: 69511
Epoch: [74]  [1000/1251]  eta: 0:03:18  lr: 0.003180  min_lr: 0.003180  loss: 3.2793 (3.4071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5294 (0.5588)  time: 0.7842  data: 0.0005  max mem: 69511
Epoch: [74]  [1200/1251]  eta: 0:00:40  lr: 0.003178  min_lr: 0.003178  loss: 3.5208 (3.4197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4988 (0.5574)  time: 0.7849  data: 0.0006  max mem: 69511
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003177  min_lr: 0.003177  loss: 3.4126 (3.4206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5591 (0.5588)  time: 0.6673  data: 0.0007  max mem: 69511
Epoch: [74] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.003177  min_lr: 0.003177  loss: 3.4126 (3.4111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5591 (0.5588)
Test:  [ 0/25]  eta: 0:02:56  loss: 0.8078 (0.8078)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 7.0558  data: 6.6040  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8941 (0.9243)  acc1: 83.6000 (82.1455)  acc5: 96.8000 (96.9091)  time: 1.0019  data: 0.6006  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1040 (1.1083)  acc1: 76.0000 (77.9619)  acc5: 94.0000 (94.7238)  time: 0.3964  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2145 (1.1116)  acc1: 76.4000 (77.8720)  acc5: 93.2000 (94.6080)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6713 s / it)
* Acc@1 78.116 Acc@5 94.604 loss 1.101
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.12%
Epoch: [75]  [   0/1251]  eta: 1:30:51  lr: 0.003177  min_lr: 0.003177  loss: 3.6862 (3.6862)  weight_decay: 0.0500 (0.0500)  time: 4.3580  data: 3.5726  max mem: 69511
Epoch: [75]  [ 200/1251]  eta: 0:14:06  lr: 0.003176  min_lr: 0.003176  loss: 3.4562 (3.3834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5343 (0.5756)  time: 0.8018  data: 0.0005  max mem: 69511
Epoch: [75]  [ 400/1251]  eta: 0:11:18  lr: 0.003174  min_lr: 0.003174  loss: 3.2485 (3.3877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5413 (0.5570)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [75]  [ 600/1251]  eta: 0:08:36  lr: 0.003172  min_lr: 0.003172  loss: 3.6126 (3.3783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.5596)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [75]  [ 800/1251]  eta: 0:05:57  lr: 0.003170  min_lr: 0.003170  loss: 3.4315 (3.3864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5323 (0.5528)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [75]  [1000/1251]  eta: 0:03:18  lr: 0.003168  min_lr: 0.003168  loss: 3.5010 (3.3883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5628 (0.5596)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [75]  [1200/1251]  eta: 0:00:40  lr: 0.003166  min_lr: 0.003166  loss: 3.6062 (3.3954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5491 (0.5559)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003166  min_lr: 0.003166  loss: 3.6137 (3.3937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5093 (0.5544)  time: 0.6648  data: 0.0005  max mem: 69511
Epoch: [75] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.003166  min_lr: 0.003166  loss: 3.6137 (3.3984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5093 (0.5544)
Test:  [ 0/25]  eta: 0:03:10  loss: 0.7497 (0.7497)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 7.6077  data: 7.1718  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9091 (0.9176)  acc1: 82.0000 (82.2909)  acc5: 97.6000 (96.9818)  time: 1.0497  data: 0.6523  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0810 (1.0709)  acc1: 76.8000 (78.4381)  acc5: 94.8000 (94.9714)  time: 0.3939  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1664 (1.0879)  acc1: 74.8000 (78.0160)  acc5: 93.6000 (94.7360)  time: 0.3940  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6935 s / it)
* Acc@1 78.424 Acc@5 94.712 loss 1.083
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.42%
Epoch: [76]  [   0/1251]  eta: 1:25:00  lr: 0.003166  min_lr: 0.003166  loss: 3.1130 (3.1130)  weight_decay: 0.0500 (0.0500)  time: 4.0774  data: 3.2973  max mem: 69511
Epoch: [76]  [ 200/1251]  eta: 0:14:03  lr: 0.003164  min_lr: 0.003164  loss: 3.5610 (3.3785)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5463 (0.5648)  time: 0.7823  data: 0.0005  max mem: 69511
Epoch: [76]  [ 400/1251]  eta: 0:11:15  lr: 0.003162  min_lr: 0.003162  loss: 3.3534 (3.3710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5899 (0.5771)  time: 0.7855  data: 0.0006  max mem: 69511
Epoch: [76]  [ 600/1251]  eta: 0:08:35  lr: 0.003160  min_lr: 0.003160  loss: 3.4931 (3.3890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5850 (0.5839)  time: 0.7942  data: 0.0004  max mem: 69511
Epoch: [76]  [ 800/1251]  eta: 0:05:56  lr: 0.003158  min_lr: 0.003158  loss: 3.4764 (3.3883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5573 (0.5861)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [76]  [1000/1251]  eta: 0:03:18  lr: 0.003157  min_lr: 0.003157  loss: 3.3537 (3.3895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5354 (0.5814)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [76]  [1200/1251]  eta: 0:00:40  lr: 0.003155  min_lr: 0.003155  loss: 3.2736 (3.3907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5003 (0.5803)  time: 0.7947  data: 0.0004  max mem: 69511
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003154  min_lr: 0.003154  loss: 3.4669 (3.3947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5219 (0.5786)  time: 0.6689  data: 0.0006  max mem: 69511
Epoch: [76] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.003154  min_lr: 0.003154  loss: 3.4669 (3.3872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5219 (0.5786)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7709 (0.7709)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.6127  data: 5.1451  max mem: 69511
Test:  [10/25]  eta: 0:00:13  loss: 0.8704 (0.9347)  acc1: 84.4000 (81.8545)  acc5: 96.8000 (96.5818)  time: 0.8718  data: 0.4681  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1131 (1.1230)  acc1: 74.4000 (77.7524)  acc5: 93.6000 (94.5524)  time: 0.3975  data: 0.0003  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2353 (1.1367)  acc1: 74.4000 (77.3440)  acc5: 92.8000 (94.3520)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:15 (0.6113 s / it)
* Acc@1 77.936 Acc@5 94.486 loss 1.122
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 78.42%
Epoch: [77]  [   0/1251]  eta: 1:21:47  lr: 0.003154  min_lr: 0.003154  loss: 3.6572 (3.6572)  weight_decay: 0.0500 (0.0500)  time: 3.9228  data: 2.2807  max mem: 69511
Epoch: [77]  [ 200/1251]  eta: 0:14:03  lr: 0.003152  min_lr: 0.003152  loss: 3.4055 (3.3650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5751 (0.5789)  time: 0.7868  data: 0.0006  max mem: 69511
Epoch: [77]  [ 400/1251]  eta: 0:11:17  lr: 0.003150  min_lr: 0.003150  loss: 3.5280 (3.3755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5390 (0.5701)  time: 0.7879  data: 0.0006  max mem: 69511
Epoch: [77]  [ 600/1251]  eta: 0:08:37  lr: 0.003149  min_lr: 0.003149  loss: 3.3775 (3.3802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5397 (0.5625)  time: 0.7924  data: 0.0005  max mem: 69511
Epoch: [77]  [ 800/1251]  eta: 0:05:57  lr: 0.003147  min_lr: 0.003147  loss: 3.3690 (3.3894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5889 (0.5650)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [77]  [1000/1251]  eta: 0:03:18  lr: 0.003145  min_lr: 0.003145  loss: 3.1224 (3.3954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5843 (0.5775)  time: 0.7899  data: 0.0005  max mem: 69511
Epoch: [77]  [1200/1251]  eta: 0:00:40  lr: 0.003143  min_lr: 0.003143  loss: 3.6682 (3.3975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5608 (0.5744)  time: 0.7876  data: 0.0005  max mem: 69511
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003142  min_lr: 0.003142  loss: 3.6111 (3.3989)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6685  data: 0.0005  max mem: 69511
Epoch: [77] Total time: 0:16:28 (0.7900 s / it)
Averaged stats: lr: 0.003142  min_lr: 0.003142  loss: 3.6111 (3.3893)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.7686 (0.7686)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 6.4433  data: 5.9768  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8925 (0.9227)  acc1: 84.0000 (83.4909)  acc5: 97.6000 (97.3091)  time: 0.9476  data: 0.5439  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1174 (1.1006)  acc1: 76.8000 (78.8762)  acc5: 94.8000 (95.2762)  time: 0.3976  data: 0.0004  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2167 (1.1151)  acc1: 76.4000 (78.5280)  acc5: 94.4000 (95.0880)  time: 0.3973  data: 0.0002  max mem: 69511
Test: Total time: 0:00:16 (0.6463 s / it)
* Acc@1 78.368 Acc@5 94.728 loss 1.117
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.42%
Epoch: [78]  [   0/1251]  eta: 1:34:22  lr: 0.003142  min_lr: 0.003142  loss: 3.8639 (3.8639)  weight_decay: 0.0500 (0.0500)  time: 4.5262  data: 3.1191  max mem: 69511
Epoch: [78]  [ 200/1251]  eta: 0:14:04  lr: 0.003141  min_lr: 0.003141  loss: 3.2250 (3.3693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5184 (0.5660)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [78]  [ 400/1251]  eta: 0:11:18  lr: 0.003139  min_lr: 0.003139  loss: 3.5305 (3.3763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5410 (0.5574)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [78]  [ 600/1251]  eta: 0:08:36  lr: 0.003137  min_lr: 0.003137  loss: 3.4061 (3.3796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5353 (0.5534)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [78]  [ 800/1251]  eta: 0:05:57  lr: 0.003135  min_lr: 0.003135  loss: 3.5583 (3.3747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5075 (0.5468)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [78]  [1000/1251]  eta: 0:03:18  lr: 0.003133  min_lr: 0.003133  loss: 3.3451 (3.3735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6164 (0.5504)  time: 0.7920  data: 0.0005  max mem: 69511
Epoch: [78]  [1200/1251]  eta: 0:00:40  lr: 0.003131  min_lr: 0.003131  loss: 3.6210 (3.3691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4957 (0.5470)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003131  min_lr: 0.003131  loss: 3.4379 (3.3666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.5513)  time: 0.6686  data: 0.0005  max mem: 69511
Epoch: [78] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003131  min_lr: 0.003131  loss: 3.4379 (3.3733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.5513)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6937 (0.6937)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.8104  data: 5.3831  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8934 (0.9047)  acc1: 83.6000 (82.2909)  acc5: 97.2000 (96.8000)  time: 0.9713  data: 0.5715  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1604 (1.1004)  acc1: 75.6000 (78.4381)  acc5: 94.0000 (94.6667)  time: 0.4423  data: 0.0452  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2610 (1.1114)  acc1: 75.6000 (78.0320)  acc5: 93.6000 (94.6400)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6548 s / it)
* Acc@1 77.966 Acc@5 94.466 loss 1.103
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.42%
Epoch: [79]  [   0/1251]  eta: 1:28:17  lr: 0.003130  min_lr: 0.003130  loss: 3.2022 (3.2022)  weight_decay: 0.0500 (0.0500)  time: 4.2346  data: 1.9561  max mem: 69511
Epoch: [79]  [ 200/1251]  eta: 0:14:07  lr: 0.003129  min_lr: 0.003129  loss: 3.4380 (3.3141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5527 (0.6013)  time: 0.7967  data: 0.0004  max mem: 69511
Epoch: [79]  [ 400/1251]  eta: 0:11:17  lr: 0.003127  min_lr: 0.003127  loss: 3.4151 (3.3533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5088 (0.5763)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [79]  [ 600/1251]  eta: 0:08:35  lr: 0.003125  min_lr: 0.003125  loss: 3.2432 (3.3832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5243 (0.5842)  time: 0.7837  data: 0.0007  max mem: 69511
Epoch: [79]  [ 800/1251]  eta: 0:05:56  lr: 0.003123  min_lr: 0.003123  loss: 3.4138 (3.3875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5663 (0.5788)  time: 0.7897  data: 0.0005  max mem: 69511
Epoch: [79]  [1000/1251]  eta: 0:03:18  lr: 0.003121  min_lr: 0.003121  loss: 3.0005 (3.3900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5972 (0.5802)  time: 0.7838  data: 0.0008  max mem: 69511
Epoch: [79]  [1200/1251]  eta: 0:00:40  lr: 0.003119  min_lr: 0.003119  loss: 3.6102 (3.3843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (0.5831)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003118  min_lr: 0.003118  loss: 3.3782 (3.3821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5402 (0.5821)  time: 0.6679  data: 0.0006  max mem: 69511
Epoch: [79] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.003118  min_lr: 0.003118  loss: 3.3782 (3.3869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5402 (0.5821)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7208 (0.7208)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 7.5701  data: 7.1283  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9250 (0.9472)  acc1: 84.4000 (82.9818)  acc5: 97.2000 (96.7636)  time: 1.0494  data: 0.6483  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1511 (1.1008)  acc1: 76.0000 (78.7429)  acc5: 94.0000 (94.7238)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2145 (1.1099)  acc1: 75.2000 (78.3840)  acc5: 93.2000 (94.5760)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6899 s / it)
* Acc@1 78.412 Acc@5 94.660 loss 1.114
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.42%
Epoch: [80]  [   0/1251]  eta: 1:31:31  lr: 0.003118  min_lr: 0.003118  loss: 4.0515 (4.0515)  weight_decay: 0.0500 (0.0500)  time: 4.3894  data: 3.4718  max mem: 69511
Epoch: [80]  [ 200/1251]  eta: 0:14:09  lr: 0.003116  min_lr: 0.003116  loss: 3.3228 (3.3245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5336 (0.5411)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [80]  [ 400/1251]  eta: 0:11:18  lr: 0.003114  min_lr: 0.003114  loss: 3.6094 (3.3560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5591 (0.5643)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [80]  [ 600/1251]  eta: 0:08:37  lr: 0.003112  min_lr: 0.003112  loss: 3.4683 (3.3517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5388 (0.5687)  time: 0.7878  data: 0.0004  max mem: 69511
Epoch: [80]  [ 800/1251]  eta: 0:05:57  lr: 0.003110  min_lr: 0.003110  loss: 3.0433 (3.3581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5261 (0.5638)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [80]  [1000/1251]  eta: 0:03:18  lr: 0.003108  min_lr: 0.003108  loss: 3.3289 (3.3705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6115 (0.5723)  time: 0.7823  data: 0.0004  max mem: 69511
Epoch: [80]  [1200/1251]  eta: 0:00:40  lr: 0.003106  min_lr: 0.003106  loss: 3.4365 (3.3733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5649 (nan)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003106  min_lr: 0.003106  loss: 3.1259 (3.3689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5074 (nan)  time: 0.6728  data: 0.0006  max mem: 69511
Epoch: [80] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.003106  min_lr: 0.003106  loss: 3.1259 (3.3776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5074 (nan)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.6620 (0.6620)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 7.7437  data: 7.3212  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8186 (0.8560)  acc1: 83.2000 (82.8727)  acc5: 97.6000 (96.9091)  time: 1.0644  data: 0.6659  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0363 (1.0203)  acc1: 76.8000 (79.1619)  acc5: 94.4000 (95.2571)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1442 (1.0362)  acc1: 76.8000 (78.7520)  acc5: 94.0000 (95.0560)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6956 s / it)
* Acc@1 78.630 Acc@5 94.870 loss 1.037
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.63%
Epoch: [81]  [   0/1251]  eta: 1:39:32  lr: 0.003106  min_lr: 0.003106  loss: 3.5849 (3.5849)  weight_decay: 0.0500 (0.0500)  time: 4.7738  data: 3.9913  max mem: 69511
Epoch: [81]  [ 200/1251]  eta: 0:14:06  lr: 0.003104  min_lr: 0.003104  loss: 2.9372 (3.3363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5924 (0.5655)  time: 0.7860  data: 0.0007  max mem: 69511
Epoch: [81]  [ 400/1251]  eta: 0:11:17  lr: 0.003102  min_lr: 0.003102  loss: 3.4867 (3.3314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5567 (0.5694)  time: 0.7857  data: 0.0010  max mem: 69511
Epoch: [81]  [ 600/1251]  eta: 0:08:36  lr: 0.003100  min_lr: 0.003100  loss: 3.3999 (3.3421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5439 (0.5709)  time: 0.7828  data: 0.0007  max mem: 69511
Epoch: [81]  [ 800/1251]  eta: 0:05:56  lr: 0.003098  min_lr: 0.003098  loss: 3.4407 (3.3432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5073 (0.5706)  time: 0.7840  data: 0.0009  max mem: 69511
Epoch: [81]  [1000/1251]  eta: 0:03:18  lr: 0.003096  min_lr: 0.003096  loss: 3.3120 (3.3421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5964 (0.5787)  time: 0.7827  data: 0.0010  max mem: 69511
Epoch: [81]  [1200/1251]  eta: 0:00:40  lr: 0.003094  min_lr: 0.003094  loss: 3.6563 (3.3460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5870 (0.5766)  time: 0.7825  data: 0.0007  max mem: 69511
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003094  min_lr: 0.003094  loss: 3.4769 (3.3462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5601 (0.5754)  time: 0.6646  data: 0.0005  max mem: 69511
Epoch: [81] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.003094  min_lr: 0.003094  loss: 3.4769 (3.3646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5601 (0.5754)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.7368 (0.7368)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 7.3987  data: 6.9669  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8849 (0.8893)  acc1: 82.0000 (82.8364)  acc5: 97.2000 (96.9818)  time: 1.0310  data: 0.6336  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0389 (1.0575)  acc1: 77.2000 (79.0095)  acc5: 94.0000 (94.8762)  time: 0.3941  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1506 (1.0682)  acc1: 75.6000 (78.7520)  acc5: 93.2000 (94.7840)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6851 s / it)
* Acc@1 78.606 Acc@5 94.710 loss 1.072
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.63%
Epoch: [82]  [   0/1251]  eta: 1:41:48  lr: 0.003093  min_lr: 0.003093  loss: 1.8829 (1.8829)  weight_decay: 0.0500 (0.0500)  time: 4.8827  data: 4.0851  max mem: 69511
Epoch: [82]  [ 200/1251]  eta: 0:14:05  lr: 0.003091  min_lr: 0.003091  loss: 3.6532 (3.4586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6234 (0.5864)  time: 0.7924  data: 0.0006  max mem: 69511
Epoch: [82]  [ 400/1251]  eta: 0:11:16  lr: 0.003089  min_lr: 0.003089  loss: 3.5088 (3.4200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5278 (0.5649)  time: 0.7903  data: 0.0007  max mem: 69511
Epoch: [82]  [ 600/1251]  eta: 0:08:35  lr: 0.003087  min_lr: 0.003087  loss: 3.4253 (3.4122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.5583)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [82]  [ 800/1251]  eta: 0:05:56  lr: 0.003085  min_lr: 0.003085  loss: 3.4995 (3.4002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5251 (0.5577)  time: 0.7817  data: 0.0005  max mem: 69511
Epoch: [82]  [1000/1251]  eta: 0:03:18  lr: 0.003083  min_lr: 0.003083  loss: 3.6050 (3.3810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5780 (0.5618)  time: 0.7831  data: 0.0006  max mem: 69511
Epoch: [82]  [1200/1251]  eta: 0:00:40  lr: 0.003081  min_lr: 0.003081  loss: 3.4456 (3.3783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5331 (0.5633)  time: 0.7853  data: 0.0007  max mem: 69511
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003081  min_lr: 0.003081  loss: 3.4589 (3.3819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5404 (0.5646)  time: 0.6669  data: 0.0006  max mem: 69511
Epoch: [82] Total time: 0:16:24 (0.7870 s / it)
Averaged stats: lr: 0.003081  min_lr: 0.003081  loss: 3.4589 (3.3640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5404 (0.5646)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.7964 (0.7964)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 7.6407  data: 7.2185  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9817 (0.9756)  acc1: 82.0000 (82.8364)  acc5: 97.2000 (96.8364)  time: 1.0552  data: 0.6565  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1381 (1.1537)  acc1: 75.2000 (78.7238)  acc5: 94.8000 (94.7238)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3509 (1.1662)  acc1: 74.8000 (78.4320)  acc5: 93.2000 (94.6400)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6905 s / it)
* Acc@1 78.270 Acc@5 94.672 loss 1.164
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.63%
Epoch: [83]  [   0/1251]  eta: 1:45:49  lr: 0.003081  min_lr: 0.003081  loss: 3.0382 (3.0382)  weight_decay: 0.0500 (0.0500)  time: 5.0757  data: 2.1033  max mem: 69511
Epoch: [83]  [ 200/1251]  eta: 0:14:11  lr: 0.003079  min_lr: 0.003079  loss: 3.1640 (3.3705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5996 (0.5709)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [83]  [ 400/1251]  eta: 0:11:19  lr: 0.003077  min_lr: 0.003077  loss: 3.3787 (3.3526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5840 (0.5796)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [83]  [ 600/1251]  eta: 0:08:36  lr: 0.003075  min_lr: 0.003075  loss: 3.7089 (3.3464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5458 (0.5801)  time: 0.7892  data: 0.0005  max mem: 69511
Epoch: [83]  [ 800/1251]  eta: 0:05:57  lr: 0.003073  min_lr: 0.003073  loss: 3.3787 (3.3544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6147 (0.5913)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [83]  [1000/1251]  eta: 0:03:18  lr: 0.003071  min_lr: 0.003071  loss: 3.5722 (3.3566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5521 (0.5913)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [83]  [1200/1251]  eta: 0:00:40  lr: 0.003068  min_lr: 0.003068  loss: 3.5338 (3.3592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5360 (0.5842)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003068  min_lr: 0.003068  loss: 3.3860 (3.3612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5469 (0.5853)  time: 0.6741  data: 0.0007  max mem: 69511
Epoch: [83] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003068  min_lr: 0.003068  loss: 3.3860 (3.3643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5469 (0.5853)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.8103 (0.8103)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 7.1999  data: 6.7675  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9560 (0.9835)  acc1: 84.0000 (82.2909)  acc5: 97.6000 (97.0909)  time: 1.0156  data: 0.6155  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1063 (1.1447)  acc1: 76.8000 (78.6476)  acc5: 94.0000 (94.7429)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2227 (1.1564)  acc1: 76.4000 (78.2720)  acc5: 93.2000 (94.5760)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6744 s / it)
* Acc@1 78.298 Acc@5 94.660 loss 1.157
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.63%
Epoch: [84]  [   0/1251]  eta: 1:39:02  lr: 0.003068  min_lr: 0.003068  loss: 4.0116 (4.0116)  weight_decay: 0.0500 (0.0500)  time: 4.7505  data: 3.6593  max mem: 69511
Epoch: [84]  [ 200/1251]  eta: 0:14:10  lr: 0.003066  min_lr: 0.003066  loss: 3.3781 (3.3406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5595 (0.5857)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [84]  [ 400/1251]  eta: 0:11:19  lr: 0.003064  min_lr: 0.003064  loss: 3.4243 (3.3461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5415 (0.5722)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [84]  [ 600/1251]  eta: 0:08:37  lr: 0.003062  min_lr: 0.003062  loss: 3.3177 (3.3356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5276 (0.5630)  time: 0.7839  data: 0.0004  max mem: 69511
Epoch: [84]  [ 800/1251]  eta: 0:05:57  lr: 0.003060  min_lr: 0.003060  loss: 3.2618 (3.3309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5482 (0.5694)  time: 0.7841  data: 0.0004  max mem: 69511
Epoch: [84]  [1000/1251]  eta: 0:03:18  lr: 0.003058  min_lr: 0.003058  loss: 3.1053 (3.3307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5193 (0.5680)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [84]  [1200/1251]  eta: 0:00:40  lr: 0.003056  min_lr: 0.003056  loss: 3.7171 (3.3355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6457 (0.5767)  time: 0.7863  data: 0.0007  max mem: 69511
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003055  min_lr: 0.003055  loss: 3.3378 (3.3337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6262 (0.5784)  time: 0.6676  data: 0.0006  max mem: 69511
Epoch: [84] Total time: 0:16:27 (0.7897 s / it)
Averaged stats: lr: 0.003055  min_lr: 0.003055  loss: 3.3378 (3.3483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6262 (0.5784)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.7353 (0.7353)  acc1: 85.2000 (85.2000)  acc5: 98.0000 (98.0000)  time: 7.2724  data: 6.8311  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8534 (0.8898)  acc1: 84.0000 (82.6909)  acc5: 97.6000 (97.1636)  time: 1.0221  data: 0.6213  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0793 (1.0673)  acc1: 76.0000 (78.7048)  acc5: 94.0000 (94.9905)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2029 (1.0787)  acc1: 75.2000 (78.1120)  acc5: 93.6000 (94.7680)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6774 s / it)
* Acc@1 78.808 Acc@5 94.840 loss 1.067
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.81%
Epoch: [85]  [   0/1251]  eta: 1:29:19  lr: 0.003055  min_lr: 0.003055  loss: 2.8569 (2.8569)  weight_decay: 0.0500 (0.0500)  time: 4.2842  data: 3.5030  max mem: 69511
Epoch: [85]  [ 200/1251]  eta: 0:14:03  lr: 0.003053  min_lr: 0.003053  loss: 3.6307 (3.3063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.5441)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [85]  [ 400/1251]  eta: 0:11:17  lr: 0.003051  min_lr: 0.003051  loss: 3.4355 (3.3324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5582 (0.5636)  time: 0.8001  data: 0.0004  max mem: 69511
Epoch: [85]  [ 600/1251]  eta: 0:08:36  lr: 0.003049  min_lr: 0.003049  loss: 3.6603 (3.3432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5335 (0.5642)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [85]  [ 800/1251]  eta: 0:05:57  lr: 0.003047  min_lr: 0.003047  loss: 3.3594 (3.3495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5711 (0.5606)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [85]  [1000/1251]  eta: 0:03:18  lr: 0.003044  min_lr: 0.003044  loss: 3.0975 (3.3471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5623 (0.5675)  time: 0.7916  data: 0.0006  max mem: 69511
Epoch: [85]  [1200/1251]  eta: 0:00:40  lr: 0.003042  min_lr: 0.003042  loss: 2.9731 (3.3390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4976 (0.5668)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003042  min_lr: 0.003042  loss: 3.4236 (3.3430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5005 (0.5654)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [85] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.003042  min_lr: 0.003042  loss: 3.4236 (3.3466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5005 (0.5654)
Test:  [ 0/25]  eta: 0:03:10  loss: 0.7460 (0.7460)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 7.6324  data: 7.2047  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8336 (0.9086)  acc1: 84.4000 (82.4727)  acc5: 97.2000 (96.9091)  time: 1.0547  data: 0.6552  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0859 (1.0775)  acc1: 76.0000 (78.5714)  acc5: 94.0000 (94.9905)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2073 (1.0847)  acc1: 76.0000 (78.3040)  acc5: 93.6000 (94.9280)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6913 s / it)
* Acc@1 78.806 Acc@5 94.840 loss 1.079
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.81%
Epoch: [86]  [   0/1251]  eta: 1:23:25  lr: 0.003042  min_lr: 0.003042  loss: 2.3531 (2.3531)  weight_decay: 0.0500 (0.0500)  time: 4.0013  data: 2.2730  max mem: 69511
Epoch: [86]  [ 200/1251]  eta: 0:14:06  lr: 0.003040  min_lr: 0.003040  loss: 3.4515 (3.3163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5460 (0.5852)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [86]  [ 400/1251]  eta: 0:11:18  lr: 0.003038  min_lr: 0.003038  loss: 3.2399 (3.3288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6084 (0.5762)  time: 0.7823  data: 0.0005  max mem: 69511
Epoch: [86]  [ 600/1251]  eta: 0:08:36  lr: 0.003035  min_lr: 0.003035  loss: 3.2784 (3.3395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5808 (0.5807)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [86]  [ 800/1251]  eta: 0:05:56  lr: 0.003033  min_lr: 0.003033  loss: 3.4128 (3.3460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5923 (0.5859)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [86]  [1000/1251]  eta: 0:03:18  lr: 0.003031  min_lr: 0.003031  loss: 3.2501 (3.3426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5585 (0.5771)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [86]  [1200/1251]  eta: 0:00:40  lr: 0.003029  min_lr: 0.003029  loss: 3.4333 (3.3480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5775 (0.5798)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003029  min_lr: 0.003029  loss: 3.5091 (3.3483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5738 (0.5797)  time: 0.6713  data: 0.0007  max mem: 69511
Epoch: [86] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.003029  min_lr: 0.003029  loss: 3.5091 (3.3535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5738 (0.5797)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.7154 (0.7154)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 6.2123  data: 5.7703  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8908 (0.9352)  acc1: 83.2000 (82.5818)  acc5: 97.6000 (97.1273)  time: 1.0131  data: 0.6127  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1982 (1.1190)  acc1: 76.8000 (78.3619)  acc5: 94.0000 (94.9714)  time: 0.4448  data: 0.0485  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2317 (1.1321)  acc1: 75.2000 (77.8720)  acc5: 93.2000 (94.7040)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6724 s / it)
* Acc@1 78.606 Acc@5 94.770 loss 1.118
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.81%
Epoch: [87]  [   0/1251]  eta: 1:33:27  lr: 0.003029  min_lr: 0.003029  loss: 3.0172 (3.0172)  weight_decay: 0.0500 (0.0500)  time: 4.4823  data: 3.1752  max mem: 69511
Epoch: [87]  [ 200/1251]  eta: 0:14:09  lr: 0.003026  min_lr: 0.003026  loss: 3.3694 (3.3524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5814 (0.6012)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [87]  [ 400/1251]  eta: 0:11:19  lr: 0.003024  min_lr: 0.003024  loss: 3.3229 (3.3405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5930 (0.6020)  time: 0.7868  data: 0.0006  max mem: 69511
Epoch: [87]  [ 600/1251]  eta: 0:08:37  lr: 0.003022  min_lr: 0.003022  loss: 3.5155 (3.3128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5047 (0.5829)  time: 0.7870  data: 0.0008  max mem: 69511
Epoch: [87]  [ 800/1251]  eta: 0:05:57  lr: 0.003020  min_lr: 0.003020  loss: 3.5254 (3.3219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5199 (0.5812)  time: 0.7872  data: 0.0005  max mem: 69511
Epoch: [87]  [1000/1251]  eta: 0:03:18  lr: 0.003018  min_lr: 0.003018  loss: 3.1610 (3.3268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5515 (0.5831)  time: 0.7955  data: 0.0004  max mem: 69511
Epoch: [87]  [1200/1251]  eta: 0:00:40  lr: 0.003016  min_lr: 0.003016  loss: 3.5807 (3.3389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5937 (0.5824)  time: 0.7971  data: 0.0006  max mem: 69511
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003015  min_lr: 0.003015  loss: 3.3704 (3.3379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5720 (0.5830)  time: 0.6720  data: 0.0007  max mem: 69511
Epoch: [87] Total time: 0:16:29 (0.7907 s / it)
Averaged stats: lr: 0.003015  min_lr: 0.003015  loss: 3.3704 (3.3415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5720 (0.5830)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7990 (0.7990)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 7.4876  data: 7.0632  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9207 (0.9559)  acc1: 83.2000 (82.2545)  acc5: 97.2000 (96.8364)  time: 1.0416  data: 0.6424  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1044 (1.1072)  acc1: 75.2000 (78.5524)  acc5: 94.4000 (94.7619)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2242 (1.1134)  acc1: 75.2000 (78.3520)  acc5: 94.0000 (94.6080)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6852 s / it)
* Acc@1 78.722 Acc@5 94.858 loss 1.099
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.81%
Epoch: [88]  [   0/1251]  eta: 1:34:15  lr: 0.003015  min_lr: 0.003015  loss: 4.1493 (4.1493)  weight_decay: 0.0500 (0.0500)  time: 4.5212  data: 3.7335  max mem: 69511
Epoch: [88]  [ 200/1251]  eta: 0:14:06  lr: 0.003013  min_lr: 0.003013  loss: 3.2874 (3.2806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5679 (0.5649)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [88]  [ 400/1251]  eta: 0:11:17  lr: 0.003011  min_lr: 0.003011  loss: 3.4389 (3.2998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5928 (0.5720)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [88]  [ 600/1251]  eta: 0:08:37  lr: 0.003008  min_lr: 0.003008  loss: 3.3569 (3.3264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5393 (0.5723)  time: 0.7897  data: 0.0005  max mem: 69511
Epoch: [88]  [ 800/1251]  eta: 0:05:57  lr: 0.003006  min_lr: 0.003006  loss: 3.5611 (3.3202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5432 (0.5741)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [88]  [1000/1251]  eta: 0:03:18  lr: 0.003004  min_lr: 0.003004  loss: 3.5237 (3.3319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5762 (0.5750)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [88]  [1200/1251]  eta: 0:00:40  lr: 0.003002  min_lr: 0.003002  loss: 3.4577 (3.3186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5248 (0.5766)  time: 0.7820  data: 0.0004  max mem: 69511
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003001  min_lr: 0.003001  loss: 3.3517 (3.3186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5060 (0.5750)  time: 0.6642  data: 0.0005  max mem: 69511
Epoch: [88] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.003001  min_lr: 0.003001  loss: 3.3517 (3.3288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5060 (0.5750)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.7186 (0.7186)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 7.8268  data: 7.4171  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8374 (0.8494)  acc1: 84.0000 (83.0909)  acc5: 97.2000 (96.9091)  time: 1.0707  data: 0.6746  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0224 (1.0172)  acc1: 76.8000 (79.3714)  acc5: 94.0000 (95.0286)  time: 0.3945  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1424 (1.0346)  acc1: 76.4000 (78.8640)  acc5: 93.6000 (94.8000)  time: 0.3940  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7038 s / it)
* Acc@1 78.926 Acc@5 94.978 loss 1.020
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.93%
Epoch: [89]  [   0/1251]  eta: 1:29:35  lr: 0.003001  min_lr: 0.003001  loss: 2.9682 (2.9682)  weight_decay: 0.0500 (0.0500)  time: 4.2974  data: 3.5189  max mem: 69511
Epoch: [89]  [ 200/1251]  eta: 0:14:02  lr: 0.002999  min_lr: 0.002999  loss: 3.3368 (3.3410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5242 (0.5643)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [89]  [ 400/1251]  eta: 0:11:16  lr: 0.002997  min_lr: 0.002997  loss: 3.5302 (3.3589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5560 (0.6043)  time: 0.7904  data: 0.0005  max mem: 69511
Epoch: [89]  [ 600/1251]  eta: 0:08:36  lr: 0.002995  min_lr: 0.002995  loss: 3.4561 (3.3551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5711 (0.5944)  time: 0.7867  data: 0.0006  max mem: 69511
Epoch: [89]  [ 800/1251]  eta: 0:05:56  lr: 0.002993  min_lr: 0.002993  loss: 3.4704 (3.3380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5104 (0.5888)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [89]  [1000/1251]  eta: 0:03:18  lr: 0.002990  min_lr: 0.002990  loss: 3.6002 (3.3324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6254 (0.5898)  time: 0.7882  data: 0.0004  max mem: 69511
Epoch: [89]  [1200/1251]  eta: 0:00:40  lr: 0.002988  min_lr: 0.002988  loss: 3.4548 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5081 (0.5804)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.002988  min_lr: 0.002988  loss: 3.3781 (3.3318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5547 (0.5812)  time: 0.6685  data: 0.0006  max mem: 69511
Epoch: [89] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.002988  min_lr: 0.002988  loss: 3.3781 (3.3260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5547 (0.5812)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.8616 (0.8616)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 7.3581  data: 6.9192  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9424 (0.9747)  acc1: 82.8000 (82.9455)  acc5: 97.2000 (97.0546)  time: 1.0303  data: 0.6294  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1331 (1.1234)  acc1: 77.2000 (78.7048)  acc5: 94.4000 (95.1619)  time: 0.3974  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1729 (1.1266)  acc1: 75.6000 (78.1920)  acc5: 94.0000 (95.0720)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6818 s / it)
* Acc@1 79.054 Acc@5 95.022 loss 1.106
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.05%
Epoch: [90]  [   0/1251]  eta: 1:28:46  lr: 0.002988  min_lr: 0.002988  loss: 4.1323 (4.1323)  weight_decay: 0.0500 (0.0500)  time: 4.2581  data: 3.4791  max mem: 69511
Epoch: [90]  [ 200/1251]  eta: 0:14:07  lr: 0.002985  min_lr: 0.002985  loss: 3.5234 (3.3063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5472 (0.5787)  time: 0.8157  data: 0.0004  max mem: 69511
Epoch: [90]  [ 400/1251]  eta: 0:11:18  lr: 0.002983  min_lr: 0.002983  loss: 3.3071 (3.2904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5912 (0.5716)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [90]  [ 600/1251]  eta: 0:08:36  lr: 0.002981  min_lr: 0.002981  loss: 3.1661 (3.3033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5359 (0.5710)  time: 0.7836  data: 0.0005  max mem: 69511
Epoch: [90]  [ 800/1251]  eta: 0:05:56  lr: 0.002979  min_lr: 0.002979  loss: 3.5466 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5937 (0.5741)  time: 0.7849  data: 0.0005  max mem: 69511
Epoch: [90]  [1000/1251]  eta: 0:03:18  lr: 0.002976  min_lr: 0.002976  loss: 3.6540 (3.3042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6300 (0.5768)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [90]  [1200/1251]  eta: 0:00:40  lr: 0.002974  min_lr: 0.002974  loss: 3.4506 (3.3091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5119 (0.5728)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.002974  min_lr: 0.002974  loss: 3.5109 (3.3112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5700 (0.5739)  time: 0.6672  data: 0.0006  max mem: 69511
Epoch: [90] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.002974  min_lr: 0.002974  loss: 3.5109 (3.3254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5700 (0.5739)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.8985 (0.8985)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 7.4966  data: 7.0623  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9960 (1.0047)  acc1: 82.8000 (82.4000)  acc5: 96.8000 (96.7273)  time: 1.0423  data: 0.6423  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1294 (1.1672)  acc1: 77.2000 (78.5905)  acc5: 94.4000 (94.9143)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3014 (1.1801)  acc1: 76.4000 (78.2240)  acc5: 93.2000 (94.7520)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6883 s / it)
* Acc@1 78.770 Acc@5 94.874 loss 1.163
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 79.05%
Epoch: [91]  [   0/1251]  eta: 1:27:03  lr: 0.002974  min_lr: 0.002974  loss: 3.3774 (3.3774)  weight_decay: 0.0500 (0.0500)  time: 4.1755  data: 2.2325  max mem: 69511
Epoch: [91]  [ 200/1251]  eta: 0:14:07  lr: 0.002971  min_lr: 0.002971  loss: 3.3268 (3.3312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5422 (0.5990)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [91]  [ 400/1251]  eta: 0:11:17  lr: 0.002969  min_lr: 0.002969  loss: 3.4882 (3.3256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5742 (0.5976)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [91]  [ 600/1251]  eta: 0:08:36  lr: 0.002967  min_lr: 0.002967  loss: 3.4463 (3.3180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5519 (0.5850)  time: 0.7991  data: 0.0005  max mem: 69511
Epoch: [91]  [ 800/1251]  eta: 0:05:57  lr: 0.002965  min_lr: 0.002965  loss: 3.3368 (3.3317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5520 (0.5841)  time: 0.7862  data: 0.0006  max mem: 69511
Epoch: [91]  [1000/1251]  eta: 0:03:18  lr: 0.002962  min_lr: 0.002962  loss: 3.2816 (3.3451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5136 (0.5805)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [91]  [1200/1251]  eta: 0:00:40  lr: 0.002960  min_lr: 0.002960  loss: 3.2611 (3.3386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5474 (0.5828)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.002960  min_lr: 0.002960  loss: 3.3964 (3.3429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5509 (0.5829)  time: 0.6704  data: 0.0005  max mem: 69511
Epoch: [91] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.002960  min_lr: 0.002960  loss: 3.3964 (3.3247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5509 (0.5829)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.8014 (0.8014)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 7.8426  data: 7.3985  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9592 (0.9898)  acc1: 82.4000 (82.7273)  acc5: 97.2000 (96.9455)  time: 1.0741  data: 0.6729  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1814 (1.1720)  acc1: 76.0000 (79.2191)  acc5: 94.0000 (94.9143)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3616 (1.1859)  acc1: 75.2000 (78.7040)  acc5: 94.0000 (94.8800)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7017 s / it)
* Acc@1 79.036 Acc@5 94.942 loss 1.177
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.05%
Epoch: [92]  [   0/1251]  eta: 1:37:36  lr: 0.002960  min_lr: 0.002960  loss: 3.5980 (3.5980)  weight_decay: 0.0500 (0.0500)  time: 4.6815  data: 3.6204  max mem: 69511
Epoch: [92]  [ 200/1251]  eta: 0:14:06  lr: 0.002957  min_lr: 0.002957  loss: 3.4856 (3.3637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5545 (0.5631)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [92]  [ 400/1251]  eta: 0:11:17  lr: 0.002955  min_lr: 0.002955  loss: 3.4733 (3.3253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5554 (0.5854)  time: 0.7946  data: 0.0004  max mem: 69511
Epoch: [92]  [ 600/1251]  eta: 0:08:36  lr: 0.002953  min_lr: 0.002953  loss: 3.0220 (3.3264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5478 (0.5818)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [92]  [ 800/1251]  eta: 0:05:57  lr: 0.002950  min_lr: 0.002950  loss: 3.4214 (3.3383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.5854)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [92]  [1000/1251]  eta: 0:03:18  lr: 0.002948  min_lr: 0.002948  loss: 3.5009 (3.3392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6250 (0.5878)  time: 0.7835  data: 0.0004  max mem: 69511
Epoch: [92]  [1200/1251]  eta: 0:00:40  lr: 0.002946  min_lr: 0.002946  loss: 3.6055 (3.3324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5634 (0.5899)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.002945  min_lr: 0.002945  loss: 3.6128 (3.3361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5926 (0.5925)  time: 0.6667  data: 0.0007  max mem: 69511
Epoch: [92] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.002945  min_lr: 0.002945  loss: 3.6128 (3.3242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5926 (0.5925)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.8825 (0.8825)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 7.7307  data: 7.2964  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9560 (0.9977)  acc1: 84.0000 (82.2909)  acc5: 97.2000 (97.0546)  time: 1.0634  data: 0.6636  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1991 (1.1820)  acc1: 75.2000 (78.4191)  acc5: 94.8000 (95.0286)  time: 0.3966  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3100 (1.1942)  acc1: 75.2000 (77.9680)  acc5: 93.6000 (94.9920)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6954 s / it)
* Acc@1 78.744 Acc@5 94.954 loss 1.189
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 79.05%
Epoch: [93]  [   0/1251]  eta: 1:39:41  lr: 0.002945  min_lr: 0.002945  loss: 3.9142 (3.9142)  weight_decay: 0.0500 (0.0500)  time: 4.7815  data: 3.4927  max mem: 69511
Epoch: [93]  [ 200/1251]  eta: 0:14:03  lr: 0.002943  min_lr: 0.002943  loss: 3.5001 (3.2124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5402 (0.6086)  time: 0.7819  data: 0.0004  max mem: 69511
Epoch: [93]  [ 400/1251]  eta: 0:11:15  lr: 0.002941  min_lr: 0.002941  loss: 3.4813 (3.2552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5676 (0.5933)  time: 0.7951  data: 0.0005  max mem: 69511
Epoch: [93]  [ 600/1251]  eta: 0:08:35  lr: 0.002938  min_lr: 0.002938  loss: 3.0575 (3.2686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5798 (0.5988)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [93]  [ 800/1251]  eta: 0:05:56  lr: 0.002936  min_lr: 0.002936  loss: 3.4228 (3.2732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5678 (0.5963)  time: 0.7939  data: 0.0004  max mem: 69511
Epoch: [93]  [1000/1251]  eta: 0:03:18  lr: 0.002934  min_lr: 0.002934  loss: 3.3003 (3.2896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6291 (0.5964)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [93]  [1200/1251]  eta: 0:00:40  lr: 0.002931  min_lr: 0.002931  loss: 3.2265 (3.2919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6054 (0.5984)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.002931  min_lr: 0.002931  loss: 3.3961 (3.2948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5776 (0.5979)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [93] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.002931  min_lr: 0.002931  loss: 3.3961 (3.3161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5776 (0.5979)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.7890 (0.7890)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 7.7480  data: 7.3187  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9227 (0.9438)  acc1: 84.0000 (83.1273)  acc5: 97.6000 (97.2727)  time: 1.0653  data: 0.6656  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1582 (1.1270)  acc1: 76.4000 (79.3143)  acc5: 94.0000 (95.0667)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2219 (1.1438)  acc1: 76.0000 (78.9440)  acc5: 93.6000 (94.9120)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6967 s / it)
* Acc@1 79.074 Acc@5 95.060 loss 1.132
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.07%
Epoch: [94]  [   0/1251]  eta: 1:23:59  lr: 0.002931  min_lr: 0.002931  loss: 3.7196 (3.7196)  weight_decay: 0.0500 (0.0500)  time: 4.0284  data: 3.2456  max mem: 69511
Epoch: [94]  [ 200/1251]  eta: 0:14:05  lr: 0.002929  min_lr: 0.002929  loss: 3.3090 (3.2776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6003 (0.6085)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [94]  [ 400/1251]  eta: 0:11:17  lr: 0.002926  min_lr: 0.002926  loss: 3.5369 (3.3104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5657 (0.5932)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [94]  [ 600/1251]  eta: 0:08:36  lr: 0.002924  min_lr: 0.002924  loss: 3.1238 (3.3022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6385 (nan)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [94]  [ 800/1251]  eta: 0:05:57  lr: 0.002922  min_lr: 0.002922  loss: 3.2989 (3.2852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5393 (nan)  time: 0.7934  data: 0.0004  max mem: 69511
Epoch: [94]  [1000/1251]  eta: 0:03:18  lr: 0.002919  min_lr: 0.002919  loss: 3.3287 (3.3036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6127 (nan)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [94]  [1200/1251]  eta: 0:00:40  lr: 0.002917  min_lr: 0.002917  loss: 3.4507 (3.2989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5403 (nan)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.002916  min_lr: 0.002916  loss: 3.3283 (3.2998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5749 (nan)  time: 0.6681  data: 0.0005  max mem: 69511
Epoch: [94] Total time: 0:16:27 (0.7890 s / it)
Averaged stats: lr: 0.002916  min_lr: 0.002916  loss: 3.3283 (3.3050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5749 (nan)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.6467 (0.6467)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 7.3352  data: 6.9170  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8839 (0.8553)  acc1: 83.2000 (83.2000)  acc5: 98.0000 (97.3091)  time: 1.0277  data: 0.6291  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0670 (1.0476)  acc1: 78.4000 (79.4476)  acc5: 94.4000 (94.8381)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2149 (1.0607)  acc1: 77.2000 (79.0560)  acc5: 93.6000 (94.7680)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6797 s / it)
* Acc@1 79.352 Acc@5 95.000 loss 1.048
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.35%
Epoch: [95]  [   0/1251]  eta: 1:23:07  lr: 0.002916  min_lr: 0.002916  loss: 2.7224 (2.7224)  weight_decay: 0.0500 (0.0500)  time: 3.9872  data: 3.1971  max mem: 69511
Epoch: [95]  [ 200/1251]  eta: 0:14:05  lr: 0.002914  min_lr: 0.002914  loss: 3.5283 (3.3690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5346 (0.5963)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [95]  [ 400/1251]  eta: 0:11:16  lr: 0.002912  min_lr: 0.002912  loss: 3.0883 (3.3424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5557 (0.5996)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [95]  [ 600/1251]  eta: 0:08:36  lr: 0.002909  min_lr: 0.002909  loss: 3.6842 (3.3565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5360 (0.5849)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [95]  [ 800/1251]  eta: 0:05:57  lr: 0.002907  min_lr: 0.002907  loss: 3.4000 (3.3368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (0.5982)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [95]  [1000/1251]  eta: 0:03:18  lr: 0.002904  min_lr: 0.002904  loss: 3.3371 (3.3389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5914 (0.5962)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [95]  [1200/1251]  eta: 0:00:40  lr: 0.002902  min_lr: 0.002902  loss: 3.3034 (3.3325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5832 (0.5972)  time: 0.7918  data: 0.0007  max mem: 69511
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.002902  min_lr: 0.002902  loss: 3.4600 (3.3283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.5979)  time: 0.6685  data: 0.0006  max mem: 69511
Epoch: [95] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.002902  min_lr: 0.002902  loss: 3.4600 (3.3014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.5979)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.8028 (0.8028)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 7.4989  data: 7.0670  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9478 (0.9585)  acc1: 84.0000 (83.0909)  acc5: 96.8000 (96.7636)  time: 1.0430  data: 0.6427  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0965 (1.1035)  acc1: 78.4000 (79.2191)  acc5: 94.0000 (95.0667)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2113 (1.1116)  acc1: 78.4000 (78.7680)  acc5: 94.0000 (94.9440)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6872 s / it)
* Acc@1 79.118 Acc@5 95.142 loss 1.107
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.35%
Epoch: [96]  [   0/1251]  eta: 1:42:45  lr: 0.002902  min_lr: 0.002902  loss: 3.4253 (3.4253)  weight_decay: 0.0500 (0.0500)  time: 4.9287  data: 3.7390  max mem: 69511
Epoch: [96]  [ 200/1251]  eta: 0:14:06  lr: 0.002899  min_lr: 0.002899  loss: 3.5013 (3.2847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6347 (0.5992)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [96]  [ 400/1251]  eta: 0:11:17  lr: 0.002897  min_lr: 0.002897  loss: 3.3537 (3.2555)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [96]  [ 600/1251]  eta: 0:08:37  lr: 0.002894  min_lr: 0.002894  loss: 3.4149 (3.2749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6123 (nan)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [96]  [ 800/1251]  eta: 0:05:57  lr: 0.002892  min_lr: 0.002892  loss: 3.4583 (3.2947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5799 (nan)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [96]  [1000/1251]  eta: 0:03:18  lr: 0.002890  min_lr: 0.002890  loss: 3.4379 (3.2956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5198 (nan)  time: 0.7886  data: 0.0004  max mem: 69511
Epoch: [96]  [1200/1251]  eta: 0:00:40  lr: 0.002887  min_lr: 0.002887  loss: 3.1380 (3.3032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5975 (nan)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.002887  min_lr: 0.002887  loss: 3.3516 (3.3050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5326 (nan)  time: 0.6647  data: 0.0006  max mem: 69511
Epoch: [96] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.002887  min_lr: 0.002887  loss: 3.3516 (3.2998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5326 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7148 (0.7148)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5001  data: 5.0723  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9069 (0.9291)  acc1: 84.0000 (82.0727)  acc5: 97.2000 (96.8727)  time: 0.9775  data: 0.5815  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0791 (1.1026)  acc1: 76.8000 (78.9333)  acc5: 94.4000 (94.8952)  time: 0.4592  data: 0.0662  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1642 (1.1126)  acc1: 77.6000 (78.9280)  acc5: 93.6000 (94.8160)  time: 0.3932  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6551 s / it)
* Acc@1 79.384 Acc@5 95.158 loss 1.098
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.38%
Epoch: [97]  [   0/1251]  eta: 1:29:09  lr: 0.002887  min_lr: 0.002887  loss: 3.0083 (3.0083)  weight_decay: 0.0500 (0.0500)  time: 4.2762  data: 3.4853  max mem: 69511
Epoch: [97]  [ 200/1251]  eta: 0:14:03  lr: 0.002884  min_lr: 0.002884  loss: 3.4049 (3.2835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6162 (0.5995)  time: 0.7963  data: 0.0005  max mem: 69511
Epoch: [97]  [ 400/1251]  eta: 0:11:17  lr: 0.002882  min_lr: 0.002882  loss: 3.4322 (3.2908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6008 (0.5983)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [97]  [ 600/1251]  eta: 0:08:36  lr: 0.002880  min_lr: 0.002880  loss: 3.4324 (3.2871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5519 (0.5879)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [97]  [ 800/1251]  eta: 0:05:57  lr: 0.002877  min_lr: 0.002877  loss: 3.2985 (3.2925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6136 (0.5921)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [97]  [1000/1251]  eta: 0:03:18  lr: 0.002875  min_lr: 0.002875  loss: 3.6274 (3.2963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7095 (0.5958)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [97]  [1200/1251]  eta: 0:00:40  lr: 0.002872  min_lr: 0.002872  loss: 3.2527 (3.2997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5844 (0.6000)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.002872  min_lr: 0.002872  loss: 3.2952 (3.2978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5765 (0.5994)  time: 0.6677  data: 0.0006  max mem: 69511
Epoch: [97] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.002872  min_lr: 0.002872  loss: 3.2952 (3.3010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5765 (0.5994)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7008 (0.7008)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 7.5135  data: 7.0946  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9034 (0.8766)  acc1: 84.0000 (83.0909)  acc5: 97.2000 (96.8727)  time: 1.0438  data: 0.6452  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0325 (1.0500)  acc1: 76.4000 (79.0667)  acc5: 95.2000 (95.2191)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1417 (1.0564)  acc1: 76.4000 (78.8480)  acc5: 94.4000 (95.1520)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6887 s / it)
* Acc@1 79.364 Acc@5 95.096 loss 1.043
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.38%
Epoch: [98]  [   0/1251]  eta: 1:47:57  lr: 0.002872  min_lr: 0.002872  loss: 3.5605 (3.5605)  weight_decay: 0.0500 (0.0500)  time: 5.1778  data: 4.3662  max mem: 69511
Epoch: [98]  [ 200/1251]  eta: 0:14:08  lr: 0.002869  min_lr: 0.002869  loss: 3.4877 (3.2723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5730 (0.5657)  time: 0.7821  data: 0.0005  max mem: 69511
Epoch: [98]  [ 400/1251]  eta: 0:11:18  lr: 0.002867  min_lr: 0.002867  loss: 3.3250 (3.2602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6002 (0.5852)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [98]  [ 600/1251]  eta: 0:08:36  lr: 0.002864  min_lr: 0.002864  loss: 3.4368 (3.2791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5665 (0.5934)  time: 0.7878  data: 0.0006  max mem: 69511
Epoch: [98]  [ 800/1251]  eta: 0:05:57  lr: 0.002862  min_lr: 0.002862  loss: 3.4885 (3.2912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6033 (0.5891)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [98]  [1000/1251]  eta: 0:03:18  lr: 0.002860  min_lr: 0.002860  loss: 3.3486 (3.3037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5669 (0.5899)  time: 0.7852  data: 0.0007  max mem: 69511
Epoch: [98]  [1200/1251]  eta: 0:00:40  lr: 0.002857  min_lr: 0.002857  loss: 3.5407 (3.3109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6647 (0.5992)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.002857  min_lr: 0.002857  loss: 3.4796 (3.3146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6225 (0.5991)  time: 0.6682  data: 0.0006  max mem: 69511
Epoch: [98] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.002857  min_lr: 0.002857  loss: 3.4796 (3.2988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6225 (0.5991)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.7949 (0.7949)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 7.5358  data: 7.1146  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9615 (0.9528)  acc1: 82.4000 (82.9091)  acc5: 97.6000 (97.0182)  time: 1.0460  data: 0.6471  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1003 (1.1101)  acc1: 78.4000 (79.3143)  acc5: 94.0000 (95.1238)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1308 (1.1130)  acc1: 76.0000 (78.9120)  acc5: 93.6000 (94.9600)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6884 s / it)
* Acc@1 79.544 Acc@5 95.158 loss 1.100
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.54%
Epoch: [99]  [   0/1251]  eta: 1:34:12  lr: 0.002857  min_lr: 0.002857  loss: 2.6743 (2.6743)  weight_decay: 0.0500 (0.0500)  time: 4.5183  data: 3.7372  max mem: 69511
Epoch: [99]  [ 200/1251]  eta: 0:14:08  lr: 0.002854  min_lr: 0.002854  loss: 3.4373 (3.2763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6093 (0.5962)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [99]  [ 400/1251]  eta: 0:11:17  lr: 0.002852  min_lr: 0.002852  loss: 3.2251 (3.2852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5289 (0.5942)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [99]  [ 600/1251]  eta: 0:08:36  lr: 0.002849  min_lr: 0.002849  loss: 3.3103 (3.2981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6149 (0.5994)  time: 0.7887  data: 0.0004  max mem: 69511
Epoch: [99]  [ 800/1251]  eta: 0:05:57  lr: 0.002847  min_lr: 0.002847  loss: 3.1136 (3.2868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6286 (0.5953)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [99]  [1000/1251]  eta: 0:03:18  lr: 0.002844  min_lr: 0.002844  loss: 3.4647 (3.2900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6092 (0.5973)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [99]  [1200/1251]  eta: 0:00:40  lr: 0.002842  min_lr: 0.002842  loss: 3.5868 (3.2869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5215 (0.5999)  time: 0.7949  data: 0.0005  max mem: 69511
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.002841  min_lr: 0.002841  loss: 3.2958 (3.2852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5050 (0.5969)  time: 0.6673  data: 0.0005  max mem: 69511
Epoch: [99] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.002841  min_lr: 0.002841  loss: 3.2958 (3.2887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5050 (0.5969)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7560 (0.7560)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 7.6957  data: 7.2592  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8405 (0.8832)  acc1: 85.6000 (83.4182)  acc5: 97.6000 (97.2364)  time: 1.0606  data: 0.6602  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0691 (1.0412)  acc1: 77.6000 (79.4095)  acc5: 94.8000 (95.2000)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1356 (1.0508)  acc1: 76.4000 (79.0880)  acc5: 94.4000 (95.2000)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6948 s / it)
* Acc@1 79.538 Acc@5 95.378 loss 1.034
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.54%
Epoch: [100]  [   0/1251]  eta: 1:37:33  lr: 0.002841  min_lr: 0.002841  loss: 4.0082 (4.0082)  weight_decay: 0.0500 (0.0500)  time: 4.6788  data: 3.2823  max mem: 69511
Epoch: [100]  [ 200/1251]  eta: 0:14:05  lr: 0.002839  min_lr: 0.002839  loss: 3.0890 (3.1946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5661 (0.6047)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [100]  [ 400/1251]  eta: 0:11:18  lr: 0.002836  min_lr: 0.002836  loss: 3.5344 (3.2061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6084 (0.6073)  time: 0.7943  data: 0.0005  max mem: 69511
Epoch: [100]  [ 600/1251]  eta: 0:08:37  lr: 0.002834  min_lr: 0.002834  loss: 3.3948 (3.2238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6398 (0.6017)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [100]  [ 800/1251]  eta: 0:05:57  lr: 0.002831  min_lr: 0.002831  loss: 3.4076 (3.2410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.5992)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [100]  [1000/1251]  eta: 0:03:18  lr: 0.002829  min_lr: 0.002829  loss: 2.9483 (3.2531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5648 (0.5942)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [100]  [1200/1251]  eta: 0:00:40  lr: 0.002826  min_lr: 0.002826  loss: 3.5328 (3.2593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6160 (0.5956)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.002826  min_lr: 0.002826  loss: 3.0903 (3.2577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6041 (0.5948)  time: 0.6681  data: 0.0005  max mem: 69511
Epoch: [100] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.002826  min_lr: 0.002826  loss: 3.0903 (3.2786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6041 (0.5948)
Test:  [ 0/25]  eta: 0:02:51  loss: 0.7538 (0.7538)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 6.8768  data: 6.4413  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9552 (0.9308)  acc1: 82.0000 (82.6909)  acc5: 96.8000 (96.9091)  time: 0.9865  data: 0.5860  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0815 (1.0718)  acc1: 76.8000 (79.2571)  acc5: 95.2000 (94.8952)  time: 0.3973  data: 0.0003  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1789 (1.0788)  acc1: 76.4000 (78.8160)  acc5: 92.8000 (94.8320)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6619 s / it)
* Acc@1 79.394 Acc@5 95.076 loss 1.064
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.54%
Epoch: [101]  [   0/1251]  eta: 1:29:16  lr: 0.002826  min_lr: 0.002826  loss: 3.7522 (3.7522)  weight_decay: 0.0500 (0.0500)  time: 4.2814  data: 3.4593  max mem: 69511
Epoch: [101]  [ 200/1251]  eta: 0:14:05  lr: 0.002823  min_lr: 0.002823  loss: 3.5093 (3.2604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6048 (0.6342)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [101]  [ 400/1251]  eta: 0:11:18  lr: 0.002821  min_lr: 0.002821  loss: 3.1879 (3.2470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5766 (0.6235)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [101]  [ 600/1251]  eta: 0:08:36  lr: 0.002818  min_lr: 0.002818  loss: 2.8465 (3.2337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5430 (0.6106)  time: 0.7831  data: 0.0006  max mem: 69511
Epoch: [101]  [ 800/1251]  eta: 0:05:56  lr: 0.002816  min_lr: 0.002816  loss: 3.6173 (3.2451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6080 (0.6122)  time: 0.7919  data: 0.0006  max mem: 69511
Epoch: [101]  [1000/1251]  eta: 0:03:18  lr: 0.002813  min_lr: 0.002813  loss: 3.4867 (3.2469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5709 (0.6106)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [101]  [1200/1251]  eta: 0:00:40  lr: 0.002811  min_lr: 0.002811  loss: 3.4049 (3.2641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6357 (0.6067)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.002810  min_lr: 0.002810  loss: 3.4354 (3.2682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6151 (0.6062)  time: 0.6665  data: 0.0007  max mem: 69511
Epoch: [101] Total time: 0:16:26 (0.7882 s / it)
Averaged stats: lr: 0.002810  min_lr: 0.002810  loss: 3.4354 (3.2790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6151 (0.6062)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7388 (0.7388)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 5.5504  data: 5.1078  max mem: 69511
Test:  [10/25]  eta: 0:00:13  loss: 0.9484 (0.9145)  acc1: 84.0000 (83.1636)  acc5: 96.8000 (96.7273)  time: 0.9235  data: 0.5261  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0834 (1.1049)  acc1: 77.6000 (78.8191)  acc5: 95.2000 (94.9143)  time: 0.4268  data: 0.0340  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2079 (1.1164)  acc1: 76.4000 (78.5280)  acc5: 94.0000 (94.8640)  time: 0.3930  data: 0.0002  max mem: 69511
Test: Total time: 0:00:15 (0.6314 s / it)
* Acc@1 79.474 Acc@5 95.092 loss 1.100
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.54%
Epoch: [102]  [   0/1251]  eta: 1:38:21  lr: 0.002810  min_lr: 0.002810  loss: 3.1162 (3.1162)  weight_decay: 0.0500 (0.0500)  time: 4.7170  data: 3.9262  max mem: 69511
Epoch: [102]  [ 200/1251]  eta: 0:14:06  lr: 0.002808  min_lr: 0.002808  loss: 3.3230 (3.2376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5540 (0.5720)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [102]  [ 400/1251]  eta: 0:11:15  lr: 0.002805  min_lr: 0.002805  loss: 3.4640 (3.2555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5418 (0.5850)  time: 0.7817  data: 0.0004  max mem: 69511
Epoch: [102]  [ 600/1251]  eta: 0:08:34  lr: 0.002803  min_lr: 0.002803  loss: 3.2644 (3.2568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6171 (0.5947)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [102]  [ 800/1251]  eta: 0:05:56  lr: 0.002800  min_lr: 0.002800  loss: 3.4358 (3.2489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5714 (0.5922)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [102]  [1000/1251]  eta: 0:03:18  lr: 0.002798  min_lr: 0.002798  loss: 3.5779 (3.2666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6107 (0.6002)  time: 0.7882  data: 0.0004  max mem: 69511
Epoch: [102]  [1200/1251]  eta: 0:00:40  lr: 0.002795  min_lr: 0.002795  loss: 3.1611 (3.2580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5651 (0.5992)  time: 0.7915  data: 0.0004  max mem: 69511
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.002795  min_lr: 0.002795  loss: 3.5503 (3.2634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6085 (0.6011)  time: 0.6677  data: 0.0005  max mem: 69511
Epoch: [102] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.002795  min_lr: 0.002795  loss: 3.5503 (3.2769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6085 (0.6011)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.8486 (0.8486)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 7.9287  data: 7.5113  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0689 (1.0238)  acc1: 82.8000 (83.2727)  acc5: 97.2000 (96.9455)  time: 1.0818  data: 0.6831  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1535 (1.2020)  acc1: 77.6000 (79.2381)  acc5: 94.4000 (94.8381)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.3404 (1.2122)  acc1: 76.8000 (78.8800)  acc5: 93.2000 (94.6720)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7039 s / it)
* Acc@1 79.406 Acc@5 95.060 loss 1.198
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.54%
Epoch: [103]  [   0/1251]  eta: 1:37:30  lr: 0.002795  min_lr: 0.002795  loss: 3.5798 (3.5798)  weight_decay: 0.0500 (0.0500)  time: 4.6767  data: 3.6813  max mem: 69511
Epoch: [103]  [ 200/1251]  eta: 0:14:08  lr: 0.002792  min_lr: 0.002792  loss: 3.3648 (3.3175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.6092)  time: 0.7906  data: 0.0005  max mem: 69511
Epoch: [103]  [ 400/1251]  eta: 0:11:17  lr: 0.002790  min_lr: 0.002790  loss: 3.4809 (3.3252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5952 (0.6237)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [103]  [ 600/1251]  eta: 0:08:36  lr: 0.002787  min_lr: 0.002787  loss: 3.3764 (3.3276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5732 (0.6170)  time: 0.7922  data: 0.0005  max mem: 69511
Epoch: [103]  [ 800/1251]  eta: 0:05:56  lr: 0.002785  min_lr: 0.002785  loss: 3.4160 (3.3220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5937 (0.6125)  time: 0.7858  data: 0.0006  max mem: 69511
Epoch: [103]  [1000/1251]  eta: 0:03:18  lr: 0.002782  min_lr: 0.002782  loss: 3.3288 (3.3236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5477 (0.6128)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [103]  [1200/1251]  eta: 0:00:40  lr: 0.002779  min_lr: 0.002779  loss: 3.3315 (3.3381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5923 (0.6154)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.002779  min_lr: 0.002779  loss: 3.5828 (3.3391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5754 (0.6148)  time: 0.6677  data: 0.0007  max mem: 69511
Epoch: [103] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.002779  min_lr: 0.002779  loss: 3.5828 (3.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5754 (0.6148)
Test:  [ 0/25]  eta: 0:03:21  loss: 0.7657 (0.7657)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 8.0506  data: 7.6120  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9009 (0.8985)  acc1: 82.4000 (83.6364)  acc5: 97.6000 (97.4545)  time: 1.0929  data: 0.6923  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0365 (1.0596)  acc1: 78.8000 (79.9238)  acc5: 94.8000 (95.6381)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1431 (1.0745)  acc1: 78.0000 (79.4720)  acc5: 94.4000 (95.3920)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7086 s / it)
* Acc@1 79.800 Acc@5 95.356 loss 1.066
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [104]  [   0/1251]  eta: 1:24:07  lr: 0.002779  min_lr: 0.002779  loss: 3.9107 (3.9107)  weight_decay: 0.0500 (0.0500)  time: 4.0351  data: 3.2487  max mem: 69511
Epoch: [104]  [ 200/1251]  eta: 0:14:02  lr: 0.002776  min_lr: 0.002776  loss: 3.0321 (3.2512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5676 (0.5974)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [104]  [ 400/1251]  eta: 0:11:17  lr: 0.002774  min_lr: 0.002774  loss: 3.3611 (3.2640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5926 (0.6024)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [104]  [ 600/1251]  eta: 0:08:36  lr: 0.002771  min_lr: 0.002771  loss: 3.3819 (3.2763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5596 (0.6003)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [104]  [ 800/1251]  eta: 0:05:57  lr: 0.002769  min_lr: 0.002769  loss: 3.4717 (3.2830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6039 (0.5984)  time: 0.7826  data: 0.0005  max mem: 69511
Epoch: [104]  [1000/1251]  eta: 0:03:18  lr: 0.002766  min_lr: 0.002766  loss: 3.1132 (3.2828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5998 (0.6000)  time: 0.7818  data: 0.0005  max mem: 69511
Epoch: [104]  [1200/1251]  eta: 0:00:40  lr: 0.002764  min_lr: 0.002764  loss: 3.3953 (3.2850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6115 (0.6042)  time: 0.7835  data: 0.0004  max mem: 69511
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.002763  min_lr: 0.002763  loss: 3.2899 (3.2847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5928 (0.6048)  time: 0.6667  data: 0.0006  max mem: 69511
Epoch: [104] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.002763  min_lr: 0.002763  loss: 3.2899 (3.2619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5928 (0.6048)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.7064 (0.7064)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 7.0830  data: 6.6531  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8720 (0.8743)  acc1: 84.8000 (83.0909)  acc5: 97.6000 (97.1273)  time: 1.0045  data: 0.6052  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0572 (1.0530)  acc1: 78.4000 (79.3524)  acc5: 95.2000 (95.2762)  time: 0.3965  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1855 (1.0667)  acc1: 78.4000 (79.1680)  acc5: 94.4000 (95.2160)  time: 0.3965  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6697 s / it)
* Acc@1 79.630 Acc@5 95.304 loss 1.055
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.80%
Epoch: [105]  [   0/1251]  eta: 1:39:58  lr: 0.002763  min_lr: 0.002763  loss: 3.6295 (3.6295)  weight_decay: 0.0500 (0.0500)  time: 4.7951  data: 3.9938  max mem: 69511
Epoch: [105]  [ 200/1251]  eta: 0:14:10  lr: 0.002760  min_lr: 0.002760  loss: 3.3648 (3.1601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5793 (nan)  time: 0.8124  data: 0.0004  max mem: 69511
Epoch: [105]  [ 400/1251]  eta: 0:11:18  lr: 0.002758  min_lr: 0.002758  loss: 3.3258 (3.2316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5250 (nan)  time: 0.7822  data: 0.0005  max mem: 69511
Epoch: [105]  [ 600/1251]  eta: 0:08:36  lr: 0.002755  min_lr: 0.002755  loss: 3.4133 (3.2299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6296 (nan)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [105]  [ 800/1251]  eta: 0:05:57  lr: 0.002753  min_lr: 0.002753  loss: 3.3057 (3.2458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6287 (nan)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [105]  [1000/1251]  eta: 0:03:18  lr: 0.002750  min_lr: 0.002750  loss: 3.3904 (3.2491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5693 (nan)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [105]  [1200/1251]  eta: 0:00:40  lr: 0.002747  min_lr: 0.002747  loss: 3.0127 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5884 (nan)  time: 0.7877  data: 0.0004  max mem: 69511
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.002747  min_lr: 0.002747  loss: 3.4744 (3.2459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6131 (nan)  time: 0.6683  data: 0.0007  max mem: 69511
Epoch: [105] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.002747  min_lr: 0.002747  loss: 3.4744 (3.2587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6131 (nan)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.6986 (0.6986)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 7.7659  data: 7.3412  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8720 (0.8637)  acc1: 84.0000 (83.5273)  acc5: 97.2000 (96.9818)  time: 1.0671  data: 0.6676  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0285 (1.0334)  acc1: 78.4000 (79.6191)  acc5: 94.0000 (94.8762)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1456 (1.0420)  acc1: 78.4000 (79.1680)  acc5: 93.6000 (94.8960)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6982 s / it)
* Acc@1 79.592 Acc@5 95.174 loss 1.024
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.80%
Epoch: [106]  [   0/1251]  eta: 1:43:50  lr: 0.002747  min_lr: 0.002747  loss: 3.8068 (3.8068)  weight_decay: 0.0500 (0.0500)  time: 4.9805  data: 2.8933  max mem: 69511
Epoch: [106]  [ 200/1251]  eta: 0:14:12  lr: 0.002744  min_lr: 0.002744  loss: 3.3482 (3.2617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5671 (0.5694)  time: 0.8026  data: 0.0005  max mem: 69511
Epoch: [106]  [ 400/1251]  eta: 0:11:19  lr: 0.002742  min_lr: 0.002742  loss: 3.2964 (3.2713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5837 (0.6038)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [106]  [ 600/1251]  eta: 0:08:37  lr: 0.002739  min_lr: 0.002739  loss: 3.2906 (3.2638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5790 (0.6098)  time: 0.7919  data: 0.0005  max mem: 69511
Epoch: [106]  [ 800/1251]  eta: 0:05:57  lr: 0.002736  min_lr: 0.002736  loss: 3.3580 (3.2726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6814 (0.6217)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [106]  [1000/1251]  eta: 0:03:18  lr: 0.002734  min_lr: 0.002734  loss: 3.4694 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6901 (0.6212)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [106]  [1200/1251]  eta: 0:00:40  lr: 0.002731  min_lr: 0.002731  loss: 3.1392 (3.2745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7138 (0.6267)  time: 0.7847  data: 0.0006  max mem: 69511
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.002731  min_lr: 0.002731  loss: 3.4585 (3.2738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6599 (0.6276)  time: 0.6669  data: 0.0007  max mem: 69511
Epoch: [106] Total time: 0:16:27 (0.7890 s / it)
Averaged stats: lr: 0.002731  min_lr: 0.002731  loss: 3.4585 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6599 (0.6276)
Test:  [ 0/25]  eta: 0:02:53  loss: 0.7870 (0.7870)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.9570  data: 6.5208  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9469 (0.9580)  acc1: 83.6000 (83.3455)  acc5: 97.2000 (97.1636)  time: 0.9931  data: 0.5931  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1608 (1.0984)  acc1: 77.2000 (80.0952)  acc5: 94.4000 (95.2952)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1728 (1.1071)  acc1: 76.8000 (79.6160)  acc5: 93.6000 (95.1360)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6651 s / it)
* Acc@1 79.852 Acc@5 95.344 loss 1.097
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.85%
Epoch: [107]  [   0/1251]  eta: 1:27:11  lr: 0.002731  min_lr: 0.002731  loss: 3.2674 (3.2674)  weight_decay: 0.0500 (0.0500)  time: 4.1816  data: 3.3930  max mem: 69511
Epoch: [107]  [ 200/1251]  eta: 0:14:04  lr: 0.002728  min_lr: 0.002728  loss: 3.2303 (3.2859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5772 (0.5921)  time: 0.7853  data: 0.0006  max mem: 69511
Epoch: [107]  [ 400/1251]  eta: 0:11:16  lr: 0.002725  min_lr: 0.002725  loss: 3.4187 (3.2828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5704 (0.5918)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [107]  [ 600/1251]  eta: 0:08:36  lr: 0.002723  min_lr: 0.002723  loss: 3.2533 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5625 (0.5918)  time: 0.7910  data: 0.0006  max mem: 69511
Epoch: [107]  [ 800/1251]  eta: 0:05:56  lr: 0.002720  min_lr: 0.002720  loss: 3.4233 (3.2614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5631 (0.5880)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [107]  [1000/1251]  eta: 0:03:18  lr: 0.002718  min_lr: 0.002718  loss: 3.4492 (3.2672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.5975)  time: 0.7948  data: 0.0005  max mem: 69511
Epoch: [107]  [1200/1251]  eta: 0:00:40  lr: 0.002715  min_lr: 0.002715  loss: 3.1910 (3.2591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5539 (0.5936)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.002714  min_lr: 0.002714  loss: 3.4310 (3.2601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5900 (0.5958)  time: 0.6670  data: 0.0006  max mem: 69511
Epoch: [107] Total time: 0:16:24 (0.7872 s / it)
Averaged stats: lr: 0.002714  min_lr: 0.002714  loss: 3.4310 (3.2645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5900 (0.5958)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.7004 (0.7004)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 7.7286  data: 7.3093  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8990 (0.9078)  acc1: 83.2000 (82.8727)  acc5: 97.6000 (97.3455)  time: 1.0634  data: 0.6647  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0606 (1.0591)  acc1: 78.0000 (79.6191)  acc5: 94.4000 (95.3143)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1286 (1.0685)  acc1: 76.4000 (79.1840)  acc5: 94.8000 (95.3600)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6959 s / it)
* Acc@1 79.910 Acc@5 95.344 loss 1.056
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.91%
Epoch: [108]  [   0/1251]  eta: 1:20:21  lr: 0.002714  min_lr: 0.002714  loss: 3.7028 (3.7028)  weight_decay: 0.0500 (0.0500)  time: 3.8542  data: 3.0683  max mem: 69511
Epoch: [108]  [ 200/1251]  eta: 0:14:01  lr: 0.002712  min_lr: 0.002712  loss: 3.3834 (3.2734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5874 (0.6209)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [108]  [ 400/1251]  eta: 0:11:16  lr: 0.002709  min_lr: 0.002709  loss: 3.2376 (3.2652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5787 (0.6323)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [108]  [ 600/1251]  eta: 0:08:36  lr: 0.002706  min_lr: 0.002706  loss: 3.3221 (3.2663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (0.6381)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [108]  [ 800/1251]  eta: 0:05:56  lr: 0.002704  min_lr: 0.002704  loss: 3.5055 (3.2545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6024 (0.6351)  time: 0.7861  data: 0.0007  max mem: 69511
Epoch: [108]  [1000/1251]  eta: 0:03:18  lr: 0.002701  min_lr: 0.002701  loss: 2.6757 (3.2528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5418 (0.6275)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [108]  [1200/1251]  eta: 0:00:40  lr: 0.002698  min_lr: 0.002698  loss: 3.2018 (3.2531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6111 (0.6267)  time: 0.7871  data: 0.0006  max mem: 69511
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.002698  min_lr: 0.002698  loss: 3.2079 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5701 (0.6257)  time: 0.6687  data: 0.0005  max mem: 69511
Epoch: [108] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.002698  min_lr: 0.002698  loss: 3.2079 (3.2536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5701 (0.6257)
Test:  [ 0/25]  eta: 0:03:19  loss: 0.7300 (0.7300)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 7.9877  data: 7.5739  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8547 (0.8707)  acc1: 82.4000 (82.9455)  acc5: 97.2000 (97.0182)  time: 1.0876  data: 0.6888  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0265 (1.0286)  acc1: 77.6000 (79.1238)  acc5: 95.2000 (95.2952)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1287 (1.0416)  acc1: 76.8000 (78.6560)  acc5: 94.4000 (95.2640)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7063 s / it)
* Acc@1 79.930 Acc@5 95.416 loss 1.020
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [109]  [   0/1251]  eta: 1:30:47  lr: 0.002698  min_lr: 0.002698  loss: 3.4809 (3.4809)  weight_decay: 0.0500 (0.0500)  time: 4.3544  data: 3.5714  max mem: 69511
Epoch: [109]  [ 200/1251]  eta: 0:14:06  lr: 0.002695  min_lr: 0.002695  loss: 3.3298 (3.0940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5996 (0.6385)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [109]  [ 400/1251]  eta: 0:11:18  lr: 0.002693  min_lr: 0.002693  loss: 3.2372 (3.1690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5721 (0.6268)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [109]  [ 600/1251]  eta: 0:08:36  lr: 0.002690  min_lr: 0.002690  loss: 3.4778 (3.1900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5561 (0.6279)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [109]  [ 800/1251]  eta: 0:05:57  lr: 0.002687  min_lr: 0.002687  loss: 3.3626 (3.2080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5788 (0.6178)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [109]  [1000/1251]  eta: 0:03:18  lr: 0.002685  min_lr: 0.002685  loss: 3.1096 (3.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.6217)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [109]  [1200/1251]  eta: 0:00:40  lr: 0.002682  min_lr: 0.002682  loss: 3.2909 (3.2133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5876 (0.6173)  time: 0.7821  data: 0.0005  max mem: 69511
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.002681  min_lr: 0.002681  loss: 3.2239 (3.2136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6170 (0.6200)  time: 0.6646  data: 0.0006  max mem: 69511
Epoch: [109] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.002681  min_lr: 0.002681  loss: 3.2239 (3.2368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6170 (0.6200)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.6557 (0.6557)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 7.4169  data: 7.0005  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8457 (0.8417)  acc1: 84.0000 (83.5273)  acc5: 97.6000 (97.4182)  time: 1.0327  data: 0.6367  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9689 (1.0090)  acc1: 78.8000 (80.2286)  acc5: 95.2000 (95.4095)  time: 0.3944  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1228 (1.0178)  acc1: 77.6000 (79.8080)  acc5: 94.4000 (95.2640)  time: 0.3944  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6864 s / it)
* Acc@1 79.930 Acc@5 95.366 loss 1.003
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [110]  [   0/1251]  eta: 1:46:06  lr: 0.002681  min_lr: 0.002681  loss: 3.1624 (3.1624)  weight_decay: 0.0500 (0.0500)  time: 5.0889  data: 4.2943  max mem: 69511
Epoch: [110]  [ 200/1251]  eta: 0:14:10  lr: 0.002679  min_lr: 0.002679  loss: 3.5947 (3.2443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5711 (0.6179)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [110]  [ 400/1251]  eta: 0:11:18  lr: 0.002676  min_lr: 0.002676  loss: 3.2336 (3.2430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5782 (0.6210)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [110]  [ 600/1251]  eta: 0:08:37  lr: 0.002673  min_lr: 0.002673  loss: 3.1050 (3.2423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5908 (0.6200)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [110]  [ 800/1251]  eta: 0:05:57  lr: 0.002671  min_lr: 0.002671  loss: 3.0790 (3.2355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6381 (0.6250)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [110]  [1000/1251]  eta: 0:03:18  lr: 0.002668  min_lr: 0.002668  loss: 3.2567 (3.2325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5545 (0.6216)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [110]  [1200/1251]  eta: 0:00:40  lr: 0.002665  min_lr: 0.002665  loss: 3.2604 (3.2330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5983 (0.6237)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.002665  min_lr: 0.002665  loss: 3.5119 (3.2342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5842 (0.6226)  time: 0.6666  data: 0.0005  max mem: 69511
Epoch: [110] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.002665  min_lr: 0.002665  loss: 3.5119 (3.2395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5842 (0.6226)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.7483 (0.7483)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 7.1858  data: 6.7641  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9110 (0.9381)  acc1: 82.8000 (82.5818)  acc5: 97.2000 (97.0546)  time: 1.0112  data: 0.6152  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0626 (1.0770)  acc1: 77.2000 (79.2571)  acc5: 95.2000 (95.2381)  time: 0.3937  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1864 (1.0853)  acc1: 76.4000 (78.9440)  acc5: 94.0000 (95.2000)  time: 0.3936  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6756 s / it)
* Acc@1 79.770 Acc@5 95.256 loss 1.073
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.93%
Epoch: [111]  [   0/1251]  eta: 1:37:30  lr: 0.002665  min_lr: 0.002665  loss: 3.7640 (3.7640)  weight_decay: 0.0500 (0.0500)  time: 4.6765  data: 3.8861  max mem: 69511
Epoch: [111]  [ 200/1251]  eta: 0:14:04  lr: 0.002662  min_lr: 0.002662  loss: 3.4824 (3.2647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5659 (0.5965)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [111]  [ 400/1251]  eta: 0:11:16  lr: 0.002659  min_lr: 0.002659  loss: 3.3201 (3.2306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5842 (0.6058)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [111]  [ 600/1251]  eta: 0:08:36  lr: 0.002657  min_lr: 0.002657  loss: 3.3431 (3.2263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (0.6166)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [111]  [ 800/1251]  eta: 0:05:57  lr: 0.002654  min_lr: 0.002654  loss: 3.3686 (3.2291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6715 (0.6179)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [111]  [1000/1251]  eta: 0:03:18  lr: 0.002651  min_lr: 0.002651  loss: 2.9332 (3.2304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5432 (0.6121)  time: 0.7936  data: 0.0004  max mem: 69511
Epoch: [111]  [1200/1251]  eta: 0:00:40  lr: 0.002648  min_lr: 0.002648  loss: 3.4975 (3.2317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6398 (0.6192)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.002648  min_lr: 0.002648  loss: 3.4194 (3.2318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6049 (0.6187)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [111] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.002648  min_lr: 0.002648  loss: 3.4194 (3.2352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6049 (0.6187)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.8489 (0.8489)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 7.7409  data: 7.3121  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 1.0387 (1.0370)  acc1: 83.6000 (83.3455)  acc5: 96.8000 (97.2000)  time: 1.0647  data: 0.6650  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1751 (1.1635)  acc1: 77.6000 (79.9048)  acc5: 95.2000 (95.6191)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2431 (1.1771)  acc1: 76.8000 (79.3760)  acc5: 94.8000 (95.2320)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6968 s / it)
* Acc@1 79.908 Acc@5 95.380 loss 1.166
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [112]  [   0/1251]  eta: 1:33:35  lr: 0.002648  min_lr: 0.002648  loss: 2.7893 (2.7893)  weight_decay: 0.0500 (0.0500)  time: 4.4888  data: 3.0443  max mem: 69511
Epoch: [112]  [ 200/1251]  eta: 0:14:06  lr: 0.002645  min_lr: 0.002645  loss: 3.2498 (3.2200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5556 (0.6261)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [112]  [ 400/1251]  eta: 0:11:19  lr: 0.002642  min_lr: 0.002642  loss: 3.2758 (3.2570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5941 (0.6246)  time: 0.7957  data: 0.0005  max mem: 69511
Epoch: [112]  [ 600/1251]  eta: 0:08:37  lr: 0.002640  min_lr: 0.002640  loss: 3.1282 (3.2477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5777 (0.6127)  time: 0.7833  data: 0.0006  max mem: 69511
Epoch: [112]  [ 800/1251]  eta: 0:05:57  lr: 0.002637  min_lr: 0.002637  loss: 3.4849 (3.2531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5540 (0.6125)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [112]  [1000/1251]  eta: 0:03:18  lr: 0.002634  min_lr: 0.002634  loss: 3.3243 (3.2609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6622 (0.6174)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [112]  [1200/1251]  eta: 0:00:40  lr: 0.002632  min_lr: 0.002632  loss: 3.5019 (3.2568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5958 (nan)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.002631  min_lr: 0.002631  loss: 3.4883 (3.2578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (nan)  time: 0.6678  data: 0.0007  max mem: 69511
Epoch: [112] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.002631  min_lr: 0.002631  loss: 3.4883 (3.2395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (nan)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7560 (0.7560)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 7.7117  data: 7.2872  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9169 (0.9485)  acc1: 84.4000 (84.2182)  acc5: 98.0000 (97.6364)  time: 1.0620  data: 0.6627  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1156 (1.1018)  acc1: 77.2000 (80.3048)  acc5: 95.2000 (95.6381)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1782 (1.1087)  acc1: 77.6000 (79.9200)  acc5: 94.8000 (95.6480)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6955 s / it)
* Acc@1 80.098 Acc@5 95.348 loss 1.102
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.10%
Epoch: [113]  [   0/1251]  eta: 1:31:03  lr: 0.002631  min_lr: 0.002631  loss: 3.0805 (3.0805)  weight_decay: 0.0500 (0.0500)  time: 4.3672  data: 3.5683  max mem: 69511
Epoch: [113]  [ 200/1251]  eta: 0:14:06  lr: 0.002628  min_lr: 0.002628  loss: 3.1780 (3.2237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6001 (0.6062)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [113]  [ 400/1251]  eta: 0:11:18  lr: 0.002625  min_lr: 0.002625  loss: 2.8718 (3.2034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5332 (0.6143)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [113]  [ 600/1251]  eta: 0:08:36  lr: 0.002623  min_lr: 0.002623  loss: 3.3209 (3.2136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6230 (0.6176)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [113]  [ 800/1251]  eta: 0:05:57  lr: 0.002620  min_lr: 0.002620  loss: 3.2604 (3.1998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6103 (0.6184)  time: 0.7993  data: 0.0006  max mem: 69511
Epoch: [113]  [1000/1251]  eta: 0:03:18  lr: 0.002617  min_lr: 0.002617  loss: 3.4667 (3.1957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6017 (0.6181)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [113]  [1200/1251]  eta: 0:00:40  lr: 0.002615  min_lr: 0.002615  loss: 3.1336 (3.2123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6266 (0.6163)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002614  min_lr: 0.002614  loss: 3.3420 (3.2153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6228 (0.6149)  time: 0.6707  data: 0.0006  max mem: 69511
Epoch: [113] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.002614  min_lr: 0.002614  loss: 3.3420 (3.2292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6228 (0.6149)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.7734 (0.7734)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 7.3632  data: 6.9337  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9144 (0.9221)  acc1: 84.0000 (83.7091)  acc5: 97.6000 (97.1273)  time: 1.0305  data: 0.6307  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0555 (1.0533)  acc1: 79.2000 (80.4381)  acc5: 94.8000 (95.2571)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0984 (1.0617)  acc1: 78.8000 (80.1760)  acc5: 94.4000 (95.2160)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6809 s / it)
* Acc@1 80.294 Acc@5 95.424 loss 1.049
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.29%
Epoch: [114]  [   0/1251]  eta: 1:23:24  lr: 0.002614  min_lr: 0.002614  loss: 3.8841 (3.8841)  weight_decay: 0.0500 (0.0500)  time: 4.0003  data: 3.2157  max mem: 69511
Epoch: [114]  [ 200/1251]  eta: 0:14:04  lr: 0.002611  min_lr: 0.002611  loss: 3.1696 (3.2209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5964 (0.6327)  time: 0.7834  data: 0.0004  max mem: 69511
Epoch: [114]  [ 400/1251]  eta: 0:11:15  lr: 0.002608  min_lr: 0.002608  loss: 3.4272 (3.2086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5646 (0.6195)  time: 0.7842  data: 0.0004  max mem: 69511
Epoch: [114]  [ 600/1251]  eta: 0:08:35  lr: 0.002606  min_lr: 0.002606  loss: 3.4395 (3.2126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5687 (0.6126)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [114]  [ 800/1251]  eta: 0:05:56  lr: 0.002603  min_lr: 0.002603  loss: 3.5546 (3.2228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6173 (0.6176)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [114]  [1000/1251]  eta: 0:03:18  lr: 0.002600  min_lr: 0.002600  loss: 3.2779 (3.2168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5372 (0.6173)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [114]  [1200/1251]  eta: 0:00:40  lr: 0.002597  min_lr: 0.002597  loss: 3.1824 (3.2239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6472 (0.6220)  time: 0.7917  data: 0.0005  max mem: 69511
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.2858 (3.2268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6017 (0.6209)  time: 0.6684  data: 0.0005  max mem: 69511
Epoch: [114] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.2858 (3.2231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6017 (0.6209)
Test:  [ 0/25]  eta: 0:02:55  loss: 0.7438 (0.7438)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 7.0163  data: 6.5764  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9375 (0.9207)  acc1: 83.2000 (83.6364)  acc5: 97.2000 (97.1636)  time: 0.9988  data: 0.5981  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0933 (1.0859)  acc1: 77.2000 (80.0191)  acc5: 94.8000 (95.3333)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1560 (1.0984)  acc1: 77.6000 (79.6960)  acc5: 94.4000 (95.1200)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6672 s / it)
* Acc@1 80.118 Acc@5 95.500 loss 1.085
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.29%
Epoch: [115]  [   0/1251]  eta: 1:32:10  lr: 0.002597  min_lr: 0.002597  loss: 3.5265 (3.5265)  weight_decay: 0.0500 (0.0500)  time: 4.4210  data: 3.4643  max mem: 69511
Epoch: [115]  [ 200/1251]  eta: 0:14:03  lr: 0.002594  min_lr: 0.002594  loss: 3.1644 (3.1276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5386 (0.5860)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [115]  [ 400/1251]  eta: 0:11:17  lr: 0.002591  min_lr: 0.002591  loss: 3.1527 (3.1577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6526 (0.6170)  time: 0.7934  data: 0.0006  max mem: 69511
Epoch: [115]  [ 600/1251]  eta: 0:08:36  lr: 0.002588  min_lr: 0.002588  loss: 3.0335 (3.1879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5724 (0.6102)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [115]  [ 800/1251]  eta: 0:05:56  lr: 0.002586  min_lr: 0.002586  loss: 3.2840 (3.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6543 (0.6172)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [115]  [1000/1251]  eta: 0:03:18  lr: 0.002583  min_lr: 0.002583  loss: 3.2875 (3.2044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5698 (0.6175)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [115]  [1200/1251]  eta: 0:00:40  lr: 0.002580  min_lr: 0.002580  loss: 3.3564 (3.2056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6092 (0.6149)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002580  min_lr: 0.002580  loss: 3.1988 (3.2050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5803 (0.6125)  time: 0.6673  data: 0.0006  max mem: 69511
Epoch: [115] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.002580  min_lr: 0.002580  loss: 3.1988 (3.2166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5803 (0.6125)
Test:  [ 0/25]  eta: 0:02:55  loss: 0.7039 (0.7039)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 7.0333  data: 6.5966  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9600 (0.9280)  acc1: 83.6000 (83.4909)  acc5: 97.6000 (97.3091)  time: 1.0003  data: 0.6000  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0513 (1.0742)  acc1: 76.4000 (79.8667)  acc5: 94.4000 (95.3333)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1782 (1.0852)  acc1: 76.4000 (79.4080)  acc5: 93.6000 (95.0400)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6676 s / it)
* Acc@1 79.894 Acc@5 95.466 loss 1.069
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.29%
Epoch: [116]  [   0/1251]  eta: 1:26:40  lr: 0.002580  min_lr: 0.002580  loss: 2.2377 (2.2377)  weight_decay: 0.0500 (0.0500)  time: 4.1573  data: 2.2168  max mem: 69511
Epoch: [116]  [ 200/1251]  eta: 0:14:03  lr: 0.002577  min_lr: 0.002577  loss: 3.4321 (3.2299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5189 (0.6018)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [116]  [ 400/1251]  eta: 0:11:17  lr: 0.002574  min_lr: 0.002574  loss: 2.9975 (3.1846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5586 (0.6086)  time: 0.7949  data: 0.0005  max mem: 69511
Epoch: [116]  [ 600/1251]  eta: 0:08:36  lr: 0.002571  min_lr: 0.002571  loss: 3.3137 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5972 (0.6032)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [116]  [ 800/1251]  eta: 0:05:57  lr: 0.002568  min_lr: 0.002568  loss: 3.2376 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6220 (0.6106)  time: 0.7971  data: 0.0005  max mem: 69511
Epoch: [116]  [1000/1251]  eta: 0:03:18  lr: 0.002566  min_lr: 0.002566  loss: 3.3458 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6728 (0.6153)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [116]  [1200/1251]  eta: 0:00:40  lr: 0.002563  min_lr: 0.002563  loss: 3.4457 (3.2107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5454 (0.6132)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002562  min_lr: 0.002562  loss: 3.3519 (3.2081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5811 (0.6120)  time: 0.6736  data: 0.0006  max mem: 69511
Epoch: [116] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.002562  min_lr: 0.002562  loss: 3.3519 (3.2201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5811 (0.6120)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.7482 (0.7482)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 7.5234  data: 7.0878  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9403 (0.9135)  acc1: 82.4000 (83.1273)  acc5: 97.2000 (97.3091)  time: 1.0449  data: 0.6446  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0536 (1.0542)  acc1: 77.2000 (79.8286)  acc5: 95.2000 (95.5048)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1441 (1.0535)  acc1: 77.2000 (79.5040)  acc5: 94.4000 (95.3600)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6878 s / it)
* Acc@1 80.276 Acc@5 95.558 loss 1.042
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.29%
Epoch: [117]  [   0/1251]  eta: 1:37:11  lr: 0.002562  min_lr: 0.002562  loss: 3.2674 (3.2674)  weight_decay: 0.0500 (0.0500)  time: 4.6616  data: 2.1972  max mem: 69511
Epoch: [117]  [ 200/1251]  eta: 0:14:07  lr: 0.002559  min_lr: 0.002559  loss: 3.3233 (3.1788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6229 (0.6229)  time: 0.7899  data: 0.0004  max mem: 69511
Epoch: [117]  [ 400/1251]  eta: 0:11:16  lr: 0.002557  min_lr: 0.002557  loss: 3.4530 (3.2116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6061 (0.6409)  time: 0.7826  data: 0.0004  max mem: 69511
Epoch: [117]  [ 600/1251]  eta: 0:08:35  lr: 0.002554  min_lr: 0.002554  loss: 3.0191 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5956 (0.6457)  time: 0.7906  data: 0.0005  max mem: 69511
Epoch: [117]  [ 800/1251]  eta: 0:05:56  lr: 0.002551  min_lr: 0.002551  loss: 3.1723 (3.2074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5810 (0.6355)  time: 0.7927  data: 0.0006  max mem: 69511
Epoch: [117]  [1000/1251]  eta: 0:03:18  lr: 0.002548  min_lr: 0.002548  loss: 3.3091 (3.1969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6224 (0.6332)  time: 0.7882  data: 0.0005  max mem: 69511
Epoch: [117]  [1200/1251]  eta: 0:00:40  lr: 0.002545  min_lr: 0.002545  loss: 3.3615 (3.2048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5828 (0.6348)  time: 0.8027  data: 0.0005  max mem: 69511
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002545  min_lr: 0.002545  loss: 3.2197 (3.2039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5409 (0.6322)  time: 0.6638  data: 0.0007  max mem: 69511
Epoch: [117] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.002545  min_lr: 0.002545  loss: 3.2197 (3.2130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5409 (0.6322)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.7349 (0.7349)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 7.8375  data: 7.4121  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8862 (0.8786)  acc1: 84.4000 (83.6364)  acc5: 97.6000 (97.4545)  time: 1.0708  data: 0.6741  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0172 (1.0343)  acc1: 78.8000 (80.4000)  acc5: 95.2000 (95.7524)  time: 0.3940  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1132 (1.0440)  acc1: 78.0000 (80.0160)  acc5: 94.0000 (95.5040)  time: 0.3940  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7016 s / it)
* Acc@1 80.154 Acc@5 95.622 loss 1.029
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.29%
Epoch: [118]  [   0/1251]  eta: 1:32:42  lr: 0.002545  min_lr: 0.002545  loss: 3.5541 (3.5541)  weight_decay: 0.0500 (0.0500)  time: 4.4463  data: 2.2129  max mem: 69511
Epoch: [118]  [ 200/1251]  eta: 0:14:06  lr: 0.002542  min_lr: 0.002542  loss: 3.4227 (3.2085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5963 (0.6580)  time: 0.7953  data: 0.0004  max mem: 69511
Epoch: [118]  [ 400/1251]  eta: 0:11:16  lr: 0.002539  min_lr: 0.002539  loss: 3.3474 (3.2027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5954 (0.6266)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [118]  [ 600/1251]  eta: 0:08:35  lr: 0.002536  min_lr: 0.002536  loss: 3.4421 (3.1923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7106 (0.6422)  time: 0.7839  data: 0.0005  max mem: 69511
Epoch: [118]  [ 800/1251]  eta: 0:05:56  lr: 0.002534  min_lr: 0.002534  loss: 3.0968 (3.2031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6362 (0.6366)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [118]  [1000/1251]  eta: 0:03:18  lr: 0.002531  min_lr: 0.002531  loss: 3.2312 (3.1951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6171 (0.6318)  time: 0.7880  data: 0.0004  max mem: 69511
Epoch: [118]  [1200/1251]  eta: 0:00:40  lr: 0.002528  min_lr: 0.002528  loss: 3.3016 (3.2032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5955 (0.6322)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002527  min_lr: 0.002527  loss: 3.3572 (3.2017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6301 (0.6328)  time: 0.6650  data: 0.0005  max mem: 69511
Epoch: [118] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.002527  min_lr: 0.002527  loss: 3.3572 (3.2128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6301 (0.6328)
Test:  [ 0/25]  eta: 0:03:24  loss: 0.7369 (0.7369)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 8.1995  data: 7.7750  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9639 (0.9253)  acc1: 84.8000 (83.7455)  acc5: 98.0000 (97.4909)  time: 1.1033  data: 0.7071  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0636 (1.0771)  acc1: 78.0000 (80.0381)  acc5: 94.8000 (95.7524)  time: 0.3936  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1781 (1.0830)  acc1: 76.8000 (79.5520)  acc5: 94.8000 (95.6320)  time: 0.3935  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7165 s / it)
* Acc@1 80.292 Acc@5 95.468 loss 1.068
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.29%
Epoch: [119]  [   0/1251]  eta: 1:33:33  lr: 0.002527  min_lr: 0.002527  loss: 3.7206 (3.7206)  weight_decay: 0.0500 (0.0500)  time: 4.4873  data: 3.7025  max mem: 69511
Epoch: [119]  [ 200/1251]  eta: 0:14:02  lr: 0.002524  min_lr: 0.002524  loss: 3.3811 (3.2156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5581 (0.6071)  time: 0.7850  data: 0.0006  max mem: 69511
Epoch: [119]  [ 400/1251]  eta: 0:11:16  lr: 0.002522  min_lr: 0.002522  loss: 3.3768 (3.2192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6718 (0.6279)  time: 0.7862  data: 0.0006  max mem: 69511
Epoch: [119]  [ 600/1251]  eta: 0:08:36  lr: 0.002519  min_lr: 0.002519  loss: 3.2596 (3.2255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6546 (0.6306)  time: 0.7825  data: 0.0006  max mem: 69511
Epoch: [119]  [ 800/1251]  eta: 0:05:56  lr: 0.002516  min_lr: 0.002516  loss: 3.2534 (3.2150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5637 (0.6211)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [119]  [1000/1251]  eta: 0:03:18  lr: 0.002513  min_lr: 0.002513  loss: 3.3612 (3.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6433 (0.6242)  time: 0.7854  data: 0.0007  max mem: 69511
Epoch: [119]  [1200/1251]  eta: 0:00:40  lr: 0.002510  min_lr: 0.002510  loss: 3.4272 (3.2088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6657 (0.6328)  time: 0.7823  data: 0.0006  max mem: 69511
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002510  min_lr: 0.002510  loss: 3.2384 (3.2045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6644 (0.6345)  time: 0.6646  data: 0.0005  max mem: 69511
Epoch: [119] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.002510  min_lr: 0.002510  loss: 3.2384 (3.2066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6644 (0.6345)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.7612 (0.7612)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 6.5405  data: 6.1163  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9140 (0.9306)  acc1: 84.4000 (83.2000)  acc5: 98.0000 (97.6364)  time: 0.9530  data: 0.5564  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0832 (1.0885)  acc1: 78.4000 (80.2476)  acc5: 94.8000 (95.6381)  time: 0.3943  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1249 (1.0977)  acc1: 78.4000 (79.9040)  acc5: 94.4000 (95.4720)  time: 0.3943  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6531 s / it)
* Acc@1 80.318 Acc@5 95.546 loss 1.083
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.32%
Epoch: [120]  [   0/1251]  eta: 1:32:58  lr: 0.002510  min_lr: 0.002510  loss: 2.4644 (2.4644)  weight_decay: 0.0500 (0.0500)  time: 4.4592  data: 3.6763  max mem: 69511
Epoch: [120]  [ 200/1251]  eta: 0:14:06  lr: 0.002507  min_lr: 0.002507  loss: 3.0903 (3.1293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5837 (0.6377)  time: 0.7955  data: 0.0004  max mem: 69511
Epoch: [120]  [ 400/1251]  eta: 0:11:17  lr: 0.002504  min_lr: 0.002504  loss: 3.2408 (3.1698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5691 (0.6302)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [120]  [ 600/1251]  eta: 0:08:36  lr: 0.002501  min_lr: 0.002501  loss: 3.1214 (3.1630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6415 (0.6314)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [120]  [ 800/1251]  eta: 0:05:57  lr: 0.002498  min_lr: 0.002498  loss: 3.2869 (3.1593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6388 (0.6381)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [120]  [1000/1251]  eta: 0:03:18  lr: 0.002495  min_lr: 0.002495  loss: 2.7987 (3.1688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6500 (0.6401)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [120]  [1200/1251]  eta: 0:00:40  lr: 0.002493  min_lr: 0.002493  loss: 3.1954 (3.1751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6479 (0.6425)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002492  min_lr: 0.002492  loss: 3.2280 (3.1737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6301 (0.6436)  time: 0.6656  data: 0.0005  max mem: 69511
Epoch: [120] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.002492  min_lr: 0.002492  loss: 3.2280 (3.2007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6301 (0.6436)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.7150 (0.7150)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 7.1014  data: 6.6755  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8202 (0.8600)  acc1: 83.2000 (84.1455)  acc5: 97.6000 (97.3091)  time: 1.0038  data: 0.6072  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0306 (1.0225)  acc1: 79.2000 (80.2286)  acc5: 95.6000 (95.7143)  time: 0.3939  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1004 (1.0331)  acc1: 78.0000 (79.7600)  acc5: 94.8000 (95.5520)  time: 0.3939  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6713 s / it)
* Acc@1 80.404 Acc@5 95.532 loss 1.022
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.40%
Epoch: [121]  [   0/1251]  eta: 1:34:43  lr: 0.002492  min_lr: 0.002492  loss: 3.0903 (3.0903)  weight_decay: 0.0500 (0.0500)  time: 4.5434  data: 3.7559  max mem: 69511
Epoch: [121]  [ 200/1251]  eta: 0:14:05  lr: 0.002489  min_lr: 0.002489  loss: 3.3059 (3.1550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6092 (0.6199)  time: 0.7832  data: 0.0004  max mem: 69511
Epoch: [121]  [ 400/1251]  eta: 0:11:16  lr: 0.002486  min_lr: 0.002486  loss: 3.3111 (3.1674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5701 (0.6301)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [121]  [ 600/1251]  eta: 0:08:36  lr: 0.002483  min_lr: 0.002483  loss: 3.0587 (3.1717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5785 (0.6223)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [121]  [ 800/1251]  eta: 0:05:57  lr: 0.002480  min_lr: 0.002480  loss: 3.3693 (3.1661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6364 (0.6265)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [121]  [1000/1251]  eta: 0:03:18  lr: 0.002478  min_lr: 0.002478  loss: 3.3487 (3.1709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5686 (0.6336)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [121]  [1200/1251]  eta: 0:00:40  lr: 0.002475  min_lr: 0.002475  loss: 3.3449 (3.1777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6034 (0.6279)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002474  min_lr: 0.002474  loss: 3.3951 (3.1807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6034 (0.6256)  time: 0.6683  data: 0.0005  max mem: 69511
Epoch: [121] Total time: 0:16:27 (0.7890 s / it)
Averaged stats: lr: 0.002474  min_lr: 0.002474  loss: 3.3951 (3.1949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6034 (0.6256)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7851 (0.7851)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 7.5793  data: 7.1505  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9285 (0.9383)  acc1: 84.0000 (84.0364)  acc5: 97.6000 (97.5273)  time: 1.0502  data: 0.6503  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0863 (1.0789)  acc1: 79.2000 (80.5714)  acc5: 96.0000 (95.8476)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1289 (1.0838)  acc1: 78.4000 (80.3200)  acc5: 95.2000 (95.7280)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6916 s / it)
* Acc@1 80.516 Acc@5 95.626 loss 1.079
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.52%
Epoch: [122]  [   0/1251]  eta: 1:20:47  lr: 0.002474  min_lr: 0.002474  loss: 3.8276 (3.8276)  weight_decay: 0.0500 (0.0500)  time: 3.8750  data: 3.0757  max mem: 69511
Epoch: [122]  [ 200/1251]  eta: 0:14:05  lr: 0.002471  min_lr: 0.002471  loss: 3.4153 (3.2069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5593 (0.6632)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [122]  [ 400/1251]  eta: 0:11:16  lr: 0.002468  min_lr: 0.002468  loss: 3.4406 (3.2069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6702 (0.6556)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [122]  [ 600/1251]  eta: 0:08:36  lr: 0.002465  min_lr: 0.002465  loss: 3.2661 (3.2113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.6404)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [122]  [ 800/1251]  eta: 0:05:57  lr: 0.002463  min_lr: 0.002463  loss: 3.0493 (3.2084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6430 (0.6405)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [122]  [1000/1251]  eta: 0:03:18  lr: 0.002460  min_lr: 0.002460  loss: 3.2668 (3.2097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6371 (0.6329)  time: 0.7891  data: 0.0004  max mem: 69511
Epoch: [122]  [1200/1251]  eta: 0:00:40  lr: 0.002457  min_lr: 0.002457  loss: 3.3278 (3.2117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6176 (0.6304)  time: 0.7834  data: 0.0006  max mem: 69511
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002456  min_lr: 0.002456  loss: 3.3126 (3.2125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6057 (0.6295)  time: 0.6661  data: 0.0005  max mem: 69511
Epoch: [122] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.002456  min_lr: 0.002456  loss: 3.3126 (3.2040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6057 (0.6295)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.7451 (0.7451)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 7.7418  data: 7.3187  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8814 (0.8715)  acc1: 84.8000 (83.8909)  acc5: 97.2000 (97.4545)  time: 1.0634  data: 0.6656  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0249 (1.0137)  acc1: 78.8000 (80.8952)  acc5: 95.2000 (95.7905)  time: 0.3955  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1256 (1.0278)  acc1: 77.6000 (80.1760)  acc5: 94.4000 (95.5680)  time: 0.3956  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6983 s / it)
* Acc@1 80.790 Acc@5 95.680 loss 1.019
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.79%
Epoch: [123]  [   0/1251]  eta: 1:25:41  lr: 0.002456  min_lr: 0.002456  loss: 3.4131 (3.4131)  weight_decay: 0.0500 (0.0500)  time: 4.1097  data: 3.3319  max mem: 69511
Epoch: [123]  [ 200/1251]  eta: 0:14:01  lr: 0.002453  min_lr: 0.002453  loss: 3.1456 (3.1600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6232 (0.6063)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [123]  [ 400/1251]  eta: 0:11:16  lr: 0.002450  min_lr: 0.002450  loss: 3.2056 (3.1720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6267 (0.6247)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [123]  [ 600/1251]  eta: 0:08:36  lr: 0.002447  min_lr: 0.002447  loss: 3.3818 (3.1696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6128 (0.6212)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [123]  [ 800/1251]  eta: 0:05:56  lr: 0.002445  min_lr: 0.002445  loss: 3.4574 (3.1695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (0.6217)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [123]  [1000/1251]  eta: 0:03:18  lr: 0.002442  min_lr: 0.002442  loss: 3.1887 (3.1801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (0.6250)  time: 0.7932  data: 0.0004  max mem: 69511
Epoch: [123]  [1200/1251]  eta: 0:00:40  lr: 0.002439  min_lr: 0.002439  loss: 3.4166 (3.1845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5626 (0.6267)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002438  min_lr: 0.002438  loss: 3.3282 (3.1878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6629 (0.6286)  time: 0.6684  data: 0.0006  max mem: 69511
Epoch: [123] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.002438  min_lr: 0.002438  loss: 3.3282 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6629 (0.6286)
Test:  [ 0/25]  eta: 0:02:32  loss: 0.7213 (0.7213)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.0840  data: 5.6243  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8908 (0.8723)  acc1: 83.2000 (83.5636)  acc5: 97.2000 (97.1636)  time: 1.0284  data: 0.6257  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9970 (1.0286)  acc1: 79.2000 (80.4191)  acc5: 95.6000 (95.6571)  time: 0.4599  data: 0.0630  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1348 (1.0424)  acc1: 78.4000 (79.8240)  acc5: 94.8000 (95.6000)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6799 s / it)
* Acc@1 80.570 Acc@5 95.650 loss 1.026
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.79%
Epoch: [124]  [   0/1251]  eta: 1:33:07  lr: 0.002438  min_lr: 0.002438  loss: 3.5212 (3.5212)  weight_decay: 0.0500 (0.0500)  time: 4.4665  data: 3.6513  max mem: 69511
Epoch: [124]  [ 200/1251]  eta: 0:14:07  lr: 0.002435  min_lr: 0.002435  loss: 3.3188 (3.2085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6146 (0.6316)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [124]  [ 400/1251]  eta: 0:11:18  lr: 0.002432  min_lr: 0.002432  loss: 2.8283 (3.1833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6015 (0.6250)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [124]  [ 600/1251]  eta: 0:08:36  lr: 0.002429  min_lr: 0.002429  loss: 3.2860 (3.1749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6476 (0.6345)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [124]  [ 800/1251]  eta: 0:05:57  lr: 0.002427  min_lr: 0.002427  loss: 3.4215 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6091 (0.6270)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [124]  [1000/1251]  eta: 0:03:18  lr: 0.002424  min_lr: 0.002424  loss: 3.4675 (3.1873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6943 (0.6351)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [124]  [1200/1251]  eta: 0:00:40  lr: 0.002421  min_lr: 0.002421  loss: 3.3516 (3.1930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.6460)  time: 0.7872  data: 0.0003  max mem: 69511
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002420  min_lr: 0.002420  loss: 3.2486 (3.1909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (0.6465)  time: 0.6691  data: 0.0005  max mem: 69511
Epoch: [124] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.002420  min_lr: 0.002420  loss: 3.2486 (3.1848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6558 (0.6465)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.7532 (0.7532)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 7.6623  data: 7.2237  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8921 (0.9174)  acc1: 83.2000 (83.1636)  acc5: 97.2000 (96.9455)  time: 1.0546  data: 0.6570  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1230 (1.0881)  acc1: 77.6000 (80.0000)  acc5: 95.2000 (95.5238)  time: 0.3937  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2211 (1.0970)  acc1: 77.6000 (79.6640)  acc5: 94.8000 (95.4240)  time: 0.3937  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6961 s / it)
* Acc@1 80.306 Acc@5 95.534 loss 1.088
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.79%
Epoch: [125]  [   0/1251]  eta: 1:45:13  lr: 0.002420  min_lr: 0.002420  loss: 3.4651 (3.4651)  weight_decay: 0.0500 (0.0500)  time: 5.0467  data: 3.4478  max mem: 69511
Epoch: [125]  [ 200/1251]  eta: 0:14:09  lr: 0.002417  min_lr: 0.002417  loss: 3.2303 (3.1794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6614 (0.6529)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [125]  [ 400/1251]  eta: 0:11:17  lr: 0.002414  min_lr: 0.002414  loss: 3.4085 (3.1593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6334 (0.6377)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [125]  [ 600/1251]  eta: 0:08:36  lr: 0.002411  min_lr: 0.002411  loss: 3.4036 (3.1878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.6299)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [125]  [ 800/1251]  eta: 0:05:57  lr: 0.002408  min_lr: 0.002408  loss: 3.2821 (3.1835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5880 (0.6439)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [125]  [1000/1251]  eta: 0:03:18  lr: 0.002405  min_lr: 0.002405  loss: 3.2547 (3.1821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6766 (0.6396)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [125]  [1200/1251]  eta: 0:00:40  lr: 0.002403  min_lr: 0.002403  loss: 3.3408 (3.1917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6695 (0.6424)  time: 0.7872  data: 0.0006  max mem: 69511
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 3.3085 (3.1896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6231 (0.6412)  time: 0.6686  data: 0.0007  max mem: 69511
Epoch: [125] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 3.3085 (3.1818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6231 (0.6412)
Test:  [ 0/25]  eta: 0:03:19  loss: 0.7431 (0.7431)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.9693  data: 7.5483  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9081 (0.9212)  acc1: 84.4000 (84.4000)  acc5: 98.0000 (97.6364)  time: 1.0854  data: 0.6865  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1030 (1.0732)  acc1: 78.0000 (80.8000)  acc5: 95.6000 (96.0762)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1364 (1.0805)  acc1: 78.0000 (80.1760)  acc5: 94.8000 (95.9200)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7066 s / it)
* Acc@1 80.712 Acc@5 95.772 loss 1.077
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.79%
Epoch: [126]  [   0/1251]  eta: 1:40:15  lr: 0.002402  min_lr: 0.002402  loss: 3.6335 (3.6335)  weight_decay: 0.0500 (0.0500)  time: 4.8084  data: 3.4143  max mem: 69511
Epoch: [126]  [ 200/1251]  eta: 0:14:07  lr: 0.002399  min_lr: 0.002399  loss: 3.4629 (3.2091)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [126]  [ 400/1251]  eta: 0:11:16  lr: 0.002396  min_lr: 0.002396  loss: 3.3354 (3.2041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (nan)  time: 0.7935  data: 0.0004  max mem: 69511
Epoch: [126]  [ 600/1251]  eta: 0:08:35  lr: 0.002393  min_lr: 0.002393  loss: 3.1378 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5731 (nan)  time: 0.7848  data: 0.0004  max mem: 69511
Epoch: [126]  [ 800/1251]  eta: 0:05:56  lr: 0.002390  min_lr: 0.002390  loss: 3.0079 (3.1976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6155 (nan)  time: 0.7851  data: 0.0004  max mem: 69511
Epoch: [126]  [1000/1251]  eta: 0:03:18  lr: 0.002387  min_lr: 0.002387  loss: 3.4638 (3.2015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6456 (nan)  time: 0.7908  data: 0.0004  max mem: 69511
Epoch: [126]  [1200/1251]  eta: 0:00:40  lr: 0.002384  min_lr: 0.002384  loss: 3.2899 (3.1952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5774 (nan)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002384  min_lr: 0.002384  loss: 3.1703 (3.1958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5757 (nan)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [126] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.002384  min_lr: 0.002384  loss: 3.1703 (3.1741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5757 (nan)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.7566 (0.7566)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 7.4081  data: 6.9747  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9287 (0.9178)  acc1: 85.2000 (84.1455)  acc5: 97.2000 (97.1636)  time: 1.0344  data: 0.6343  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0672 (1.0671)  acc1: 79.6000 (80.6476)  acc5: 95.6000 (95.8286)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1847 (1.0835)  acc1: 78.0000 (80.0320)  acc5: 94.8000 (95.5680)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6838 s / it)
* Acc@1 80.810 Acc@5 95.764 loss 1.066
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.81%
Epoch: [127]  [   0/1251]  eta: 1:36:07  lr: 0.002384  min_lr: 0.002384  loss: 3.8045 (3.8045)  weight_decay: 0.0500 (0.0500)  time: 4.6099  data: 3.8252  max mem: 69511
Epoch: [127]  [ 200/1251]  eta: 0:14:06  lr: 0.002381  min_lr: 0.002381  loss: 2.9651 (3.1533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5697 (0.6204)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [127]  [ 400/1251]  eta: 0:11:18  lr: 0.002378  min_lr: 0.002378  loss: 3.0792 (3.1573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5749 (0.6225)  time: 0.7968  data: 0.0004  max mem: 69511
Epoch: [127]  [ 600/1251]  eta: 0:08:37  lr: 0.002375  min_lr: 0.002375  loss: 3.4735 (3.1507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6820 (0.6419)  time: 0.7946  data: 0.0005  max mem: 69511
Epoch: [127]  [ 800/1251]  eta: 0:05:57  lr: 0.002372  min_lr: 0.002372  loss: 3.4455 (3.1502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5904 (0.6404)  time: 0.7909  data: 0.0005  max mem: 69511
Epoch: [127]  [1000/1251]  eta: 0:03:18  lr: 0.002369  min_lr: 0.002369  loss: 3.2283 (3.1491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6190 (0.6361)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [127]  [1200/1251]  eta: 0:00:40  lr: 0.002366  min_lr: 0.002366  loss: 3.3923 (3.1498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6045 (0.6321)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002365  min_lr: 0.002365  loss: 3.3229 (3.1527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6045 (0.6305)  time: 0.6687  data: 0.0006  max mem: 69511
Epoch: [127] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.002365  min_lr: 0.002365  loss: 3.3229 (3.1716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6045 (0.6305)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.7839 (0.7839)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 7.6772  data: 7.2614  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8816 (0.9235)  acc1: 84.8000 (84.4364)  acc5: 97.2000 (97.3818)  time: 1.0593  data: 0.6604  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0759 (1.0789)  acc1: 78.0000 (80.6286)  acc5: 95.2000 (95.6000)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1922 (1.0890)  acc1: 76.8000 (80.2080)  acc5: 94.4000 (95.4880)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6945 s / it)
* Acc@1 80.598 Acc@5 95.658 loss 1.081
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.81%
Epoch: [128]  [   0/1251]  eta: 1:33:58  lr: 0.002365  min_lr: 0.002365  loss: 2.8238 (2.8238)  weight_decay: 0.0500 (0.0500)  time: 4.5073  data: 3.1482  max mem: 69511
Epoch: [128]  [ 200/1251]  eta: 0:14:07  lr: 0.002362  min_lr: 0.002362  loss: 3.3085 (3.2202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6578 (0.6205)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [128]  [ 400/1251]  eta: 0:11:19  lr: 0.002359  min_lr: 0.002359  loss: 3.5251 (3.1990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6200 (0.6365)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [128]  [ 600/1251]  eta: 0:08:36  lr: 0.002356  min_lr: 0.002356  loss: 3.2392 (3.1851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6080 (0.6309)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [128]  [ 800/1251]  eta: 0:05:57  lr: 0.002353  min_lr: 0.002353  loss: 3.2675 (3.1778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6128 (0.6352)  time: 0.7910  data: 0.0004  max mem: 69511
Epoch: [128]  [1000/1251]  eta: 0:03:18  lr: 0.002350  min_lr: 0.002350  loss: 3.3163 (3.1797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6393 (0.6413)  time: 0.7844  data: 0.0005  max mem: 69511
Epoch: [128]  [1200/1251]  eta: 0:00:40  lr: 0.002348  min_lr: 0.002348  loss: 3.1540 (3.1767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5677 (0.6394)  time: 0.7884  data: 0.0004  max mem: 69511
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002347  min_lr: 0.002347  loss: 3.0371 (3.1773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5765 (0.6380)  time: 0.6685  data: 0.0006  max mem: 69511
Epoch: [128] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.002347  min_lr: 0.002347  loss: 3.0371 (3.1745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5765 (0.6380)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.6143 (0.6143)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 7.9387  data: 7.5258  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8124 (0.8445)  acc1: 84.0000 (83.4909)  acc5: 97.2000 (97.2000)  time: 1.0827  data: 0.6844  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9845 (0.9880)  acc1: 79.2000 (80.7429)  acc5: 95.2000 (95.5619)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0936 (1.0012)  acc1: 78.8000 (80.3200)  acc5: 94.8000 (95.5200)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7048 s / it)
* Acc@1 80.768 Acc@5 95.768 loss 0.985
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.81%
Epoch: [129]  [   0/1251]  eta: 1:31:16  lr: 0.002347  min_lr: 0.002347  loss: 2.7746 (2.7746)  weight_decay: 0.0500 (0.0500)  time: 4.3778  data: 2.2990  max mem: 69511
Epoch: [129]  [ 200/1251]  eta: 0:14:07  lr: 0.002344  min_lr: 0.002344  loss: 3.0176 (3.2161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5990 (0.6087)  time: 0.7853  data: 0.0007  max mem: 69511
Epoch: [129]  [ 400/1251]  eta: 0:11:17  lr: 0.002341  min_lr: 0.002341  loss: 3.2989 (3.2054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6095 (0.6341)  time: 0.7857  data: 0.0008  max mem: 69511
Epoch: [129]  [ 600/1251]  eta: 0:08:36  lr: 0.002338  min_lr: 0.002338  loss: 3.5184 (3.1983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6233 (0.6440)  time: 0.7825  data: 0.0007  max mem: 69511
Epoch: [129]  [ 800/1251]  eta: 0:05:56  lr: 0.002335  min_lr: 0.002335  loss: 3.1336 (3.1952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6253 (0.6332)  time: 0.7844  data: 0.0008  max mem: 69511
Epoch: [129]  [1000/1251]  eta: 0:03:18  lr: 0.002332  min_lr: 0.002332  loss: 3.3372 (3.2039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6225 (0.6383)  time: 0.7859  data: 0.0009  max mem: 69511
Epoch: [129]  [1200/1251]  eta: 0:00:40  lr: 0.002329  min_lr: 0.002329  loss: 3.3967 (3.1916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6268 (0.6395)  time: 0.7931  data: 0.0007  max mem: 69511
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002328  min_lr: 0.002328  loss: 3.5538 (3.1958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (0.6396)  time: 0.6735  data: 0.0005  max mem: 69511
Epoch: [129] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.002328  min_lr: 0.002328  loss: 3.5538 (3.1615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (0.6396)
Test:  [ 0/25]  eta: 0:03:00  loss: 0.8274 (0.8274)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 7.2242  data: 6.8034  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9452 (0.9558)  acc1: 85.2000 (84.1091)  acc5: 97.6000 (97.2727)  time: 1.0146  data: 0.6188  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0795 (1.1143)  acc1: 79.6000 (80.6476)  acc5: 94.8000 (95.4476)  time: 0.3938  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1713 (1.1253)  acc1: 78.8000 (80.1760)  acc5: 94.8000 (95.3920)  time: 0.3939  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6781 s / it)
* Acc@1 80.402 Acc@5 95.538 loss 1.118
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.81%
Epoch: [130]  [   0/1251]  eta: 1:41:33  lr: 0.002328  min_lr: 0.002328  loss: 2.7791 (2.7791)  weight_decay: 0.0500 (0.0500)  time: 4.8707  data: 3.3633  max mem: 69511
Epoch: [130]  [ 200/1251]  eta: 0:14:04  lr: 0.002325  min_lr: 0.002325  loss: 3.4679 (3.1357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6043 (0.6373)  time: 0.7846  data: 0.0008  max mem: 69511
Epoch: [130]  [ 400/1251]  eta: 0:11:16  lr: 0.002322  min_lr: 0.002322  loss: 3.1333 (3.1490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6307 (0.6527)  time: 0.7922  data: 0.0011  max mem: 69511
Epoch: [130]  [ 600/1251]  eta: 0:08:35  lr: 0.002319  min_lr: 0.002319  loss: 3.0509 (3.1470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5704 (0.6381)  time: 0.7852  data: 0.0007  max mem: 69511
Epoch: [130]  [ 800/1251]  eta: 0:05:56  lr: 0.002316  min_lr: 0.002316  loss: 3.1050 (3.1564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5823 (0.6346)  time: 0.7858  data: 0.0007  max mem: 69511
Epoch: [130]  [1000/1251]  eta: 0:03:18  lr: 0.002313  min_lr: 0.002313  loss: 3.1276 (3.1607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (0.6366)  time: 0.7861  data: 0.0008  max mem: 69511
Epoch: [130]  [1200/1251]  eta: 0:00:40  lr: 0.002310  min_lr: 0.002310  loss: 3.3919 (3.1633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6133 (0.6342)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002310  min_lr: 0.002310  loss: 3.3331 (3.1620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5909 (0.6360)  time: 0.6685  data: 0.0006  max mem: 69511
Epoch: [130] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.002310  min_lr: 0.002310  loss: 3.3331 (3.1577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5909 (0.6360)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.7450 (0.7450)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 7.7616  data: 7.3349  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9077 (0.9293)  acc1: 85.6000 (84.5455)  acc5: 97.6000 (97.3091)  time: 1.0666  data: 0.6671  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0867 (1.0936)  acc1: 79.2000 (81.2952)  acc5: 95.6000 (95.6000)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2530 (1.1130)  acc1: 77.6000 (80.6560)  acc5: 94.8000 (95.3120)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6984 s / it)
* Acc@1 80.682 Acc@5 95.588 loss 1.101
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.81%
Epoch: [131]  [   0/1251]  eta: 1:40:36  lr: 0.002310  min_lr: 0.002310  loss: 2.9629 (2.9629)  weight_decay: 0.0500 (0.0500)  time: 4.8252  data: 3.2173  max mem: 69511
Epoch: [131]  [ 200/1251]  eta: 0:14:06  lr: 0.002307  min_lr: 0.002307  loss: 3.3115 (3.1759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6581 (0.6954)  time: 0.7821  data: 0.0005  max mem: 69511
Epoch: [131]  [ 400/1251]  eta: 0:11:18  lr: 0.002304  min_lr: 0.002304  loss: 3.3338 (3.1373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6304 (0.6680)  time: 0.8014  data: 0.0004  max mem: 69511
Epoch: [131]  [ 600/1251]  eta: 0:08:36  lr: 0.002301  min_lr: 0.002301  loss: 3.1983 (3.1404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6330 (0.6573)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [131]  [ 800/1251]  eta: 0:05:57  lr: 0.002298  min_lr: 0.002298  loss: 3.3114 (3.1370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5706 (0.6541)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [131]  [1000/1251]  eta: 0:03:18  lr: 0.002295  min_lr: 0.002295  loss: 3.1461 (3.1434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6258 (0.6570)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [131]  [1200/1251]  eta: 0:00:40  lr: 0.002292  min_lr: 0.002292  loss: 3.2189 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6533 (0.6540)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 3.0454 (3.1379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6155 (0.6523)  time: 0.6688  data: 0.0005  max mem: 69511
Epoch: [131] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 3.0454 (3.1570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6155 (0.6523)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7316 (0.7316)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 7.5874  data: 7.1624  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9567 (0.9098)  acc1: 85.6000 (84.8364)  acc5: 97.6000 (97.2364)  time: 1.0509  data: 0.6514  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0305 (1.0479)  acc1: 79.6000 (81.4667)  acc5: 95.6000 (95.6762)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1413 (1.0560)  acc1: 79.2000 (80.9120)  acc5: 94.0000 (95.5200)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6904 s / it)
* Acc@1 80.754 Acc@5 95.766 loss 1.047
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.81%
Epoch: [132]  [   0/1251]  eta: 1:40:13  lr: 0.002291  min_lr: 0.002291  loss: 3.2674 (3.2674)  weight_decay: 0.0500 (0.0500)  time: 4.8067  data: 2.4352  max mem: 69511
Epoch: [132]  [ 200/1251]  eta: 0:14:09  lr: 0.002288  min_lr: 0.002288  loss: 3.3977 (3.1429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5635 (0.6192)  time: 0.7936  data: 0.0005  max mem: 69511
Epoch: [132]  [ 400/1251]  eta: 0:11:19  lr: 0.002285  min_lr: 0.002285  loss: 3.2410 (3.1315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6113 (0.6209)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [132]  [ 600/1251]  eta: 0:08:37  lr: 0.002282  min_lr: 0.002282  loss: 3.2197 (3.1323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6335 (0.6271)  time: 0.7918  data: 0.0004  max mem: 69511
Epoch: [132]  [ 800/1251]  eta: 0:05:57  lr: 0.002279  min_lr: 0.002279  loss: 3.4405 (3.1370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6293 (0.6401)  time: 0.7881  data: 0.0004  max mem: 69511
Epoch: [132]  [1000/1251]  eta: 0:03:18  lr: 0.002276  min_lr: 0.002276  loss: 3.3345 (3.1440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5940 (0.6396)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [132]  [1200/1251]  eta: 0:00:40  lr: 0.002273  min_lr: 0.002273  loss: 3.1257 (3.1438)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7929  data: 0.0004  max mem: 69511
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002272  min_lr: 0.002272  loss: 3.2295 (3.1495)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6680  data: 0.0005  max mem: 69511
Epoch: [132] Total time: 0:16:28 (0.7898 s / it)
Averaged stats: lr: 0.002272  min_lr: 0.002272  loss: 3.2295 (3.1598)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.7504 (0.7504)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 7.9508  data: 7.5393  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8790 (0.9080)  acc1: 85.2000 (83.7091)  acc5: 97.6000 (97.3091)  time: 1.0844  data: 0.6856  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0681 (1.0624)  acc1: 78.8000 (80.6095)  acc5: 95.2000 (95.6952)  time: 0.3975  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1485 (1.0718)  acc1: 77.6000 (80.3200)  acc5: 95.2000 (95.5680)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7042 s / it)
* Acc@1 80.562 Acc@5 95.648 loss 1.060
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.81%
Epoch: [133]  [   0/1251]  eta: 1:32:44  lr: 0.002272  min_lr: 0.002272  loss: 2.6544 (2.6544)  weight_decay: 0.0500 (0.0500)  time: 4.4477  data: 2.2613  max mem: 69511
Epoch: [133]  [ 200/1251]  eta: 0:14:07  lr: 0.002269  min_lr: 0.002269  loss: 3.2715 (3.1259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5913 (0.6476)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [133]  [ 400/1251]  eta: 0:11:17  lr: 0.002266  min_lr: 0.002266  loss: 3.0405 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6677 (0.6553)  time: 0.7930  data: 0.0004  max mem: 69511
Epoch: [133]  [ 600/1251]  eta: 0:08:36  lr: 0.002263  min_lr: 0.002263  loss: 3.1279 (3.1214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6288 (0.6418)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [133]  [ 800/1251]  eta: 0:05:57  lr: 0.002260  min_lr: 0.002260  loss: 3.3818 (3.1239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5933 (0.6433)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [133]  [1000/1251]  eta: 0:03:18  lr: 0.002257  min_lr: 0.002257  loss: 3.4066 (3.1305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6382 (0.6452)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [133]  [1200/1251]  eta: 0:00:40  lr: 0.002254  min_lr: 0.002254  loss: 3.2879 (3.1300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6035 (0.6443)  time: 0.7926  data: 0.0005  max mem: 69511
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002254  min_lr: 0.002254  loss: 3.3098 (3.1307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6174 (0.6449)  time: 0.6668  data: 0.0005  max mem: 69511
Epoch: [133] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.002254  min_lr: 0.002254  loss: 3.3098 (3.1457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6174 (0.6449)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6534 (0.6534)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.4704  data: 7.0487  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8534 (0.8297)  acc1: 84.0000 (84.6546)  acc5: 97.6000 (97.4545)  time: 1.0400  data: 0.6411  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0042 (0.9917)  acc1: 80.0000 (81.3905)  acc5: 95.2000 (95.9048)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0818 (1.0012)  acc1: 79.2000 (80.9120)  acc5: 95.2000 (95.8400)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6852 s / it)
* Acc@1 81.040 Acc@5 95.854 loss 1.003
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.04%
Epoch: [134]  [   0/1251]  eta: 1:31:58  lr: 0.002254  min_lr: 0.002254  loss: 3.4568 (3.4568)  weight_decay: 0.0500 (0.0500)  time: 4.4110  data: 3.6310  max mem: 69511
Epoch: [134]  [ 200/1251]  eta: 0:14:05  lr: 0.002251  min_lr: 0.002251  loss: 3.3735 (3.1465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7208 (0.6647)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [134]  [ 400/1251]  eta: 0:11:18  lr: 0.002248  min_lr: 0.002248  loss: 3.2173 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6598 (0.6755)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [134]  [ 600/1251]  eta: 0:08:36  lr: 0.002245  min_lr: 0.002245  loss: 3.3215 (3.1644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.6729)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [134]  [ 800/1251]  eta: 0:05:57  lr: 0.002242  min_lr: 0.002242  loss: 3.3777 (3.1569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6160 (0.6629)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [134]  [1000/1251]  eta: 0:03:18  lr: 0.002239  min_lr: 0.002239  loss: 3.3565 (3.1556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6974 (0.6633)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [134]  [1200/1251]  eta: 0:00:40  lr: 0.002236  min_lr: 0.002236  loss: 3.0146 (3.1574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6108 (0.6596)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002235  min_lr: 0.002235  loss: 3.2212 (3.1592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6074 (0.6585)  time: 0.6660  data: 0.0006  max mem: 69511
Epoch: [134] Total time: 0:16:27 (0.7890 s / it)
Averaged stats: lr: 0.002235  min_lr: 0.002235  loss: 3.2212 (3.1537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6074 (0.6585)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.7238 (0.7238)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.4045  data: 6.9679  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8958 (0.8748)  acc1: 86.0000 (85.2000)  acc5: 97.6000 (97.3455)  time: 1.0329  data: 0.6338  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0274 (1.0127)  acc1: 79.2000 (81.7524)  acc5: 96.0000 (95.9238)  time: 0.3957  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0993 (1.0292)  acc1: 78.8000 (81.2160)  acc5: 94.4000 (95.7760)  time: 0.3956  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6876 s / it)
* Acc@1 80.998 Acc@5 95.834 loss 1.023
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.04%
Epoch: [135]  [   0/1251]  eta: 1:34:59  lr: 0.002235  min_lr: 0.002235  loss: 2.6698 (2.6698)  weight_decay: 0.0500 (0.0500)  time: 4.5559  data: 3.3834  max mem: 69511
Epoch: [135]  [ 200/1251]  eta: 0:14:09  lr: 0.002232  min_lr: 0.002232  loss: 3.0448 (3.1192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6264 (0.6534)  time: 0.7987  data: 0.0005  max mem: 69511
Epoch: [135]  [ 400/1251]  eta: 0:11:19  lr: 0.002229  min_lr: 0.002229  loss: 2.9755 (3.1557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6578 (0.6712)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [135]  [ 600/1251]  eta: 0:08:37  lr: 0.002226  min_lr: 0.002226  loss: 3.0524 (3.1482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5892 (0.6497)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [135]  [ 800/1251]  eta: 0:05:58  lr: 0.002223  min_lr: 0.002223  loss: 3.1143 (3.1493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6613 (0.6525)  time: 0.7877  data: 0.0005  max mem: 69511
Epoch: [135]  [1000/1251]  eta: 0:03:18  lr: 0.002220  min_lr: 0.002220  loss: 3.1663 (3.1522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.6567)  time: 0.7867  data: 0.0006  max mem: 69511
Epoch: [135]  [1200/1251]  eta: 0:00:40  lr: 0.002217  min_lr: 0.002217  loss: 3.3110 (3.1429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6975 (0.6580)  time: 0.7878  data: 0.0005  max mem: 69511
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002216  min_lr: 0.002216  loss: 3.2740 (3.1446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6434 (0.6580)  time: 0.6653  data: 0.0007  max mem: 69511
Epoch: [135] Total time: 0:16:28 (0.7904 s / it)
Averaged stats: lr: 0.002216  min_lr: 0.002216  loss: 3.2740 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6434 (0.6580)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.8320 (0.8320)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 7.6424  data: 7.2251  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9766 (0.9401)  acc1: 83.6000 (84.4727)  acc5: 97.2000 (97.3455)  time: 1.0526  data: 0.6571  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0448 (1.0801)  acc1: 79.2000 (80.8952)  acc5: 95.6000 (96.0952)  time: 0.3936  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1303 (1.0875)  acc1: 79.6000 (80.6560)  acc5: 95.2000 (95.9360)  time: 0.3936  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6960 s / it)
* Acc@1 80.810 Acc@5 95.734 loss 1.095
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.04%
Epoch: [136]  [   0/1251]  eta: 1:37:05  lr: 0.002216  min_lr: 0.002216  loss: 3.0713 (3.0713)  weight_decay: 0.0500 (0.0500)  time: 4.6568  data: 3.1920  max mem: 69511
Epoch: [136]  [ 200/1251]  eta: 0:14:06  lr: 0.002213  min_lr: 0.002213  loss: 3.0633 (3.1756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6641 (0.6801)  time: 0.7841  data: 0.0004  max mem: 69511
Epoch: [136]  [ 400/1251]  eta: 0:11:17  lr: 0.002210  min_lr: 0.002210  loss: 3.1080 (3.1400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.6751)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [136]  [ 600/1251]  eta: 0:08:36  lr: 0.002207  min_lr: 0.002207  loss: 3.1834 (3.1662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5845 (0.6560)  time: 0.7829  data: 0.0004  max mem: 69511
Epoch: [136]  [ 800/1251]  eta: 0:05:56  lr: 0.002204  min_lr: 0.002204  loss: 3.1672 (3.1437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.6558)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [136]  [1000/1251]  eta: 0:03:18  lr: 0.002201  min_lr: 0.002201  loss: 3.3138 (3.1311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.6503)  time: 0.7823  data: 0.0004  max mem: 69511
Epoch: [136]  [1200/1251]  eta: 0:00:40  lr: 0.002198  min_lr: 0.002198  loss: 2.9673 (3.1249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6969 (0.6607)  time: 0.7907  data: 0.0004  max mem: 69511
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002197  min_lr: 0.002197  loss: 3.2488 (3.1280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7414 (0.6630)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [136] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.002197  min_lr: 0.002197  loss: 3.2488 (3.1337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7414 (0.6630)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.6928 (0.6928)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 7.7085  data: 7.2837  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8239 (0.8326)  acc1: 84.0000 (84.4000)  acc5: 97.6000 (97.8545)  time: 1.0615  data: 0.6624  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9559 (0.9843)  acc1: 79.6000 (81.0857)  acc5: 96.4000 (96.2476)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0826 (0.9913)  acc1: 78.0000 (80.7200)  acc5: 95.6000 (96.1440)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6970 s / it)
* Acc@1 80.980 Acc@5 95.868 loss 0.986
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.04%
Epoch: [137]  [   0/1251]  eta: 1:45:36  lr: 0.002197  min_lr: 0.002197  loss: 3.6520 (3.6520)  weight_decay: 0.0500 (0.0500)  time: 5.0652  data: 3.8416  max mem: 69511
Epoch: [137]  [ 200/1251]  eta: 0:14:10  lr: 0.002194  min_lr: 0.002194  loss: 2.9498 (3.0562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6100 (0.6540)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [137]  [ 400/1251]  eta: 0:11:19  lr: 0.002191  min_lr: 0.002191  loss: 3.3898 (3.0801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6412 (0.6641)  time: 0.7955  data: 0.0004  max mem: 69511
Epoch: [137]  [ 600/1251]  eta: 0:08:37  lr: 0.002188  min_lr: 0.002188  loss: 3.2660 (3.0894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6220 (0.6527)  time: 0.7974  data: 0.0004  max mem: 69511
Epoch: [137]  [ 800/1251]  eta: 0:05:57  lr: 0.002185  min_lr: 0.002185  loss: 3.2735 (3.1038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6053 (0.6517)  time: 0.7829  data: 0.0004  max mem: 69511
Epoch: [137]  [1000/1251]  eta: 0:03:18  lr: 0.002182  min_lr: 0.002182  loss: 3.1711 (3.1130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7009 (0.6552)  time: 0.7924  data: 0.0005  max mem: 69511
Epoch: [137]  [1200/1251]  eta: 0:00:40  lr: 0.002179  min_lr: 0.002179  loss: 3.0148 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6730 (0.6541)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002178  min_lr: 0.002178  loss: 3.2814 (3.1258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6872 (0.6559)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [137] Total time: 0:16:27 (0.7897 s / it)
Averaged stats: lr: 0.002178  min_lr: 0.002178  loss: 3.2814 (3.1248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6872 (0.6559)
Test:  [ 0/25]  eta: 0:03:23  loss: 0.7086 (0.7086)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 8.1306  data: 7.6828  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8802 (0.8835)  acc1: 83.2000 (84.2909)  acc5: 98.0000 (97.6000)  time: 1.0999  data: 0.6987  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0035 (1.0425)  acc1: 78.8000 (80.7429)  acc5: 95.2000 (95.6381)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1549 (1.0489)  acc1: 78.4000 (80.3680)  acc5: 94.8000 (95.6320)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7147 s / it)
* Acc@1 81.004 Acc@5 95.792 loss 1.033
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.04%
Epoch: [138]  [   0/1251]  eta: 1:41:37  lr: 0.002178  min_lr: 0.002178  loss: 2.6177 (2.6177)  weight_decay: 0.0500 (0.0500)  time: 4.8743  data: 3.1808  max mem: 69511
Epoch: [138]  [ 200/1251]  eta: 0:14:06  lr: 0.002175  min_lr: 0.002175  loss: 3.1652 (3.1132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6430 (0.6703)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [138]  [ 400/1251]  eta: 0:11:18  lr: 0.002172  min_lr: 0.002172  loss: 3.2731 (3.1221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6179 (0.6581)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [138]  [ 600/1251]  eta: 0:08:36  lr: 0.002169  min_lr: 0.002169  loss: 3.4149 (3.1354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6326 (0.6539)  time: 0.7824  data: 0.0004  max mem: 69511
Epoch: [138]  [ 800/1251]  eta: 0:05:56  lr: 0.002166  min_lr: 0.002166  loss: 3.2242 (3.1243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6163 (0.6545)  time: 0.7819  data: 0.0004  max mem: 69511
Epoch: [138]  [1000/1251]  eta: 0:03:18  lr: 0.002163  min_lr: 0.002163  loss: 3.2784 (3.1266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6654 (0.6563)  time: 0.7846  data: 0.0003  max mem: 69511
Epoch: [138]  [1200/1251]  eta: 0:00:40  lr: 0.002160  min_lr: 0.002160  loss: 3.0423 (3.1186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6229 (0.6565)  time: 0.7819  data: 0.0004  max mem: 69511
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002159  min_lr: 0.002159  loss: 3.1224 (3.1190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5965 (0.6573)  time: 0.6644  data: 0.0005  max mem: 69511
Epoch: [138] Total time: 0:16:24 (0.7872 s / it)
Averaged stats: lr: 0.002159  min_lr: 0.002159  loss: 3.1224 (3.1306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5965 (0.6573)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7018 (0.7018)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 7.7084  data: 7.2825  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8556 (0.8670)  acc1: 84.4000 (84.2545)  acc5: 97.6000 (97.6364)  time: 1.0594  data: 0.6624  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0439 (1.0141)  acc1: 78.8000 (80.6476)  acc5: 96.0000 (96.0762)  time: 0.3947  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1231 (1.0226)  acc1: 77.6000 (80.2240)  acc5: 95.6000 (95.9520)  time: 0.3947  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6958 s / it)
* Acc@1 81.154 Acc@5 95.874 loss 1.000
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.15%
Epoch: [139]  [   0/1251]  eta: 1:29:54  lr: 0.002159  min_lr: 0.002159  loss: 3.5557 (3.5557)  weight_decay: 0.0500 (0.0500)  time: 4.3125  data: 3.5188  max mem: 69511
Epoch: [139]  [ 200/1251]  eta: 0:14:06  lr: 0.002156  min_lr: 0.002156  loss: 3.2189 (3.0953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5974 (0.6150)  time: 0.7940  data: 0.0004  max mem: 69511
Epoch: [139]  [ 400/1251]  eta: 0:11:18  lr: 0.002153  min_lr: 0.002153  loss: 3.1954 (3.1272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6717 (0.6384)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [139]  [ 600/1251]  eta: 0:08:36  lr: 0.002150  min_lr: 0.002150  loss: 3.4658 (3.1400)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [139]  [ 800/1251]  eta: 0:05:57  lr: 0.002147  min_lr: 0.002147  loss: 3.2669 (3.1188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5902 (nan)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [139]  [1000/1251]  eta: 0:03:18  lr: 0.002144  min_lr: 0.002144  loss: 2.8904 (3.1208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (nan)  time: 0.7875  data: 0.0005  max mem: 69511
Epoch: [139]  [1200/1251]  eta: 0:00:40  lr: 0.002141  min_lr: 0.002141  loss: 3.0690 (3.1218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6247 (nan)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002140  min_lr: 0.002140  loss: 3.2144 (3.1217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6432 (nan)  time: 0.6775  data: 0.0004  max mem: 69511
Epoch: [139] Total time: 0:16:28 (0.7899 s / it)
Averaged stats: lr: 0.002140  min_lr: 0.002140  loss: 3.2144 (3.1267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6432 (nan)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.7016 (0.7016)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 7.9387  data: 7.5236  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8961 (0.8908)  acc1: 84.0000 (84.6909)  acc5: 98.0000 (97.5636)  time: 1.0827  data: 0.6842  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0208 (1.0611)  acc1: 78.8000 (80.8952)  acc5: 95.2000 (95.5810)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1798 (1.0639)  acc1: 77.2000 (80.6560)  acc5: 94.8000 (95.6000)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7083 s / it)
* Acc@1 81.032 Acc@5 95.778 loss 1.048
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.15%
Epoch: [140]  [   0/1251]  eta: 1:33:19  lr: 0.002140  min_lr: 0.002140  loss: 2.3673 (2.3673)  weight_decay: 0.0500 (0.0500)  time: 4.4763  data: 2.2885  max mem: 69511
Epoch: [140]  [ 200/1251]  eta: 0:14:07  lr: 0.002137  min_lr: 0.002137  loss: 3.3943 (3.1078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.6604)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [140]  [ 400/1251]  eta: 0:11:18  lr: 0.002134  min_lr: 0.002134  loss: 3.2109 (3.1226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6164 (0.6493)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [140]  [ 600/1251]  eta: 0:08:36  lr: 0.002131  min_lr: 0.002131  loss: 3.2429 (3.1361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6644 (0.6615)  time: 0.7910  data: 0.0006  max mem: 69511
Epoch: [140]  [ 800/1251]  eta: 0:05:56  lr: 0.002128  min_lr: 0.002128  loss: 3.2026 (3.1337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6372 (0.6615)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [140]  [1000/1251]  eta: 0:03:18  lr: 0.002124  min_lr: 0.002124  loss: 3.3436 (3.1298)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7826  data: 0.0005  max mem: 69511
Epoch: [140]  [1200/1251]  eta: 0:00:40  lr: 0.002121  min_lr: 0.002121  loss: 3.0521 (3.1214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6499 (nan)  time: 0.7831  data: 0.0006  max mem: 69511
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002121  min_lr: 0.002121  loss: 3.0904 (3.1179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6582 (nan)  time: 0.6662  data: 0.0006  max mem: 69511
Epoch: [140] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.002121  min_lr: 0.002121  loss: 3.0904 (3.1256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6582 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6488 (0.6488)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.5353  data: 5.0763  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8466 (0.8222)  acc1: 84.8000 (84.5455)  acc5: 97.2000 (97.2364)  time: 0.9414  data: 0.5328  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9459 (0.9596)  acc1: 78.4000 (80.9524)  acc5: 96.4000 (95.9238)  time: 0.4387  data: 0.0393  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0808 (0.9691)  acc1: 78.4000 (80.7680)  acc5: 94.8000 (95.8240)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6402 s / it)
* Acc@1 81.100 Acc@5 95.878 loss 0.954
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.15%
Epoch: [141]  [   0/1251]  eta: 1:44:53  lr: 0.002121  min_lr: 0.002121  loss: 3.1778 (3.1778)  weight_decay: 0.0500 (0.0500)  time: 5.0307  data: 3.8309  max mem: 69511
Epoch: [141]  [ 200/1251]  eta: 0:14:09  lr: 0.002118  min_lr: 0.002118  loss: 3.2934 (3.1207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5690 (0.6474)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [141]  [ 400/1251]  eta: 0:11:18  lr: 0.002114  min_lr: 0.002114  loss: 3.2221 (3.1284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6412 (0.6428)  time: 0.7902  data: 0.0005  max mem: 69511
Epoch: [141]  [ 600/1251]  eta: 0:08:37  lr: 0.002111  min_lr: 0.002111  loss: 3.1646 (3.1253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6242 (0.6489)  time: 0.7923  data: 0.0006  max mem: 69511
Epoch: [141]  [ 800/1251]  eta: 0:05:57  lr: 0.002108  min_lr: 0.002108  loss: 3.0293 (3.1210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7272 (0.6608)  time: 0.7861  data: 0.0006  max mem: 69511
Epoch: [141]  [1000/1251]  eta: 0:03:18  lr: 0.002105  min_lr: 0.002105  loss: 2.9137 (3.1183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6351 (0.6507)  time: 0.7893  data: 0.0005  max mem: 69511
Epoch: [141]  [1200/1251]  eta: 0:00:40  lr: 0.002102  min_lr: 0.002102  loss: 3.2524 (3.1206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.6511)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002101  min_lr: 0.002101  loss: 3.2954 (3.1225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.6532)  time: 0.6724  data: 0.0006  max mem: 69511
Epoch: [141] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.002101  min_lr: 0.002101  loss: 3.2954 (3.1207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.6532)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.7031 (0.7031)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 7.7744  data: 7.3472  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8472 (0.8633)  acc1: 85.6000 (84.9091)  acc5: 97.6000 (97.4909)  time: 1.0680  data: 0.6682  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9746 (1.0298)  acc1: 80.0000 (81.3333)  acc5: 95.6000 (95.8476)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1268 (1.0386)  acc1: 78.8000 (80.7360)  acc5: 95.6000 (95.7600)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6987 s / it)
* Acc@1 81.130 Acc@5 95.818 loss 1.031
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.15%
Epoch: [142]  [   0/1251]  eta: 1:44:24  lr: 0.002101  min_lr: 0.002101  loss: 2.9725 (2.9725)  weight_decay: 0.0500 (0.0500)  time: 5.0073  data: 4.2220  max mem: 69511
Epoch: [142]  [ 200/1251]  eta: 0:14:07  lr: 0.002098  min_lr: 0.002098  loss: 3.0023 (3.0786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.6774)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [142]  [ 400/1251]  eta: 0:11:18  lr: 0.002095  min_lr: 0.002095  loss: 2.6612 (3.0791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6263 (0.6610)  time: 0.7935  data: 0.0004  max mem: 69511
Epoch: [142]  [ 600/1251]  eta: 0:08:37  lr: 0.002092  min_lr: 0.002092  loss: 3.2066 (3.0969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6533 (0.6686)  time: 0.7864  data: 0.0006  max mem: 69511
Epoch: [142]  [ 800/1251]  eta: 0:05:57  lr: 0.002089  min_lr: 0.002089  loss: 2.8411 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6593 (0.6711)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [142]  [1000/1251]  eta: 0:03:18  lr: 0.002086  min_lr: 0.002086  loss: 2.9386 (3.0895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5853 (0.6683)  time: 0.7931  data: 0.0006  max mem: 69511
Epoch: [142]  [1200/1251]  eta: 0:00:40  lr: 0.002083  min_lr: 0.002083  loss: 2.9352 (3.0943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.6735)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002082  min_lr: 0.002082  loss: 2.7362 (3.0912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.6728)  time: 0.6673  data: 0.0005  max mem: 69511
Epoch: [142] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.002082  min_lr: 0.002082  loss: 2.7362 (3.1147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.6728)
Test:  [ 0/25]  eta: 0:02:55  loss: 0.6089 (0.6089)  acc1: 89.2000 (89.2000)  acc5: 99.6000 (99.6000)  time: 7.0282  data: 6.5815  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7694 (0.7817)  acc1: 84.4000 (84.7636)  acc5: 97.6000 (97.6000)  time: 1.0001  data: 0.5986  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9823 (0.9194)  acc1: 80.4000 (81.7714)  acc5: 95.2000 (96.0191)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0297 (0.9256)  acc1: 80.0000 (81.4080)  acc5: 95.2000 (95.9360)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6681 s / it)
* Acc@1 81.308 Acc@5 95.972 loss 0.917
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.31%
Epoch: [143]  [   0/1251]  eta: 1:28:35  lr: 0.002082  min_lr: 0.002082  loss: 2.3684 (2.3684)  weight_decay: 0.0500 (0.0500)  time: 4.2488  data: 3.4607  max mem: 69511
Epoch: [143]  [ 200/1251]  eta: 0:14:07  lr: 0.002079  min_lr: 0.002079  loss: 3.2666 (3.0712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6250 (0.6431)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [143]  [ 400/1251]  eta: 0:11:18  lr: 0.002076  min_lr: 0.002076  loss: 3.2990 (3.0759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7073 (0.6738)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [143]  [ 600/1251]  eta: 0:08:36  lr: 0.002073  min_lr: 0.002073  loss: 3.0049 (3.0551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6647 (0.6805)  time: 0.7891  data: 0.0005  max mem: 69511
Epoch: [143]  [ 800/1251]  eta: 0:05:57  lr: 0.002070  min_lr: 0.002070  loss: 3.4147 (3.0833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6607 (0.6843)  time: 0.7862  data: 0.0006  max mem: 69511
Epoch: [143]  [1000/1251]  eta: 0:03:18  lr: 0.002067  min_lr: 0.002067  loss: 3.2107 (3.0849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6764 (0.6806)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [143]  [1200/1251]  eta: 0:00:40  lr: 0.002064  min_lr: 0.002064  loss: 3.3122 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (0.6778)  time: 0.7871  data: 0.0006  max mem: 69511
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002063  min_lr: 0.002063  loss: 3.3163 (3.0842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.6793)  time: 0.6757  data: 0.0006  max mem: 69511
Epoch: [143] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.002063  min_lr: 0.002063  loss: 3.3163 (3.1090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6579 (0.6793)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.6994 (0.6994)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.2629  data: 6.8193  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9091 (0.8923)  acc1: 84.8000 (84.8727)  acc5: 97.2000 (97.4909)  time: 1.0215  data: 0.6202  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0131 (1.0266)  acc1: 78.8000 (81.5429)  acc5: 95.2000 (95.8476)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0906 (1.0356)  acc1: 78.4000 (80.9280)  acc5: 95.2000 (95.8240)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6784 s / it)
* Acc@1 81.114 Acc@5 95.870 loss 1.025
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.31%
Epoch: [144]  [   0/1251]  eta: 1:37:47  lr: 0.002063  min_lr: 0.002063  loss: 2.5016 (2.5016)  weight_decay: 0.0500 (0.0500)  time: 4.6899  data: 3.2385  max mem: 69511
Epoch: [144]  [ 200/1251]  eta: 0:14:09  lr: 0.002060  min_lr: 0.002060  loss: 3.2290 (3.1109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.7115)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [144]  [ 400/1251]  eta: 0:11:18  lr: 0.002057  min_lr: 0.002057  loss: 3.3063 (3.1021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5994 (0.6840)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [144]  [ 600/1251]  eta: 0:08:37  lr: 0.002054  min_lr: 0.002054  loss: 3.1125 (3.0959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5987 (0.6699)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [144]  [ 800/1251]  eta: 0:05:57  lr: 0.002050  min_lr: 0.002050  loss: 3.3297 (3.1089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6042 (0.6672)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [144]  [1000/1251]  eta: 0:03:18  lr: 0.002047  min_lr: 0.002047  loss: 3.1832 (3.1051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6534 (0.6699)  time: 0.7915  data: 0.0004  max mem: 69511
Epoch: [144]  [1200/1251]  eta: 0:00:40  lr: 0.002044  min_lr: 0.002044  loss: 3.2350 (3.1059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6756 (0.6720)  time: 0.7884  data: 0.0004  max mem: 69511
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002044  min_lr: 0.002044  loss: 3.2154 (3.1091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.6701)  time: 0.6694  data: 0.0007  max mem: 69511
Epoch: [144] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.002044  min_lr: 0.002044  loss: 3.2154 (3.1087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.6701)
Test:  [ 0/25]  eta: 0:03:20  loss: 0.6766 (0.6766)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 8.0097  data: 7.5876  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8938 (0.8846)  acc1: 85.2000 (84.8727)  acc5: 98.0000 (97.6727)  time: 1.0893  data: 0.6901  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0605 (1.0481)  acc1: 79.2000 (81.2381)  acc5: 96.0000 (95.9048)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1479 (1.0634)  acc1: 78.0000 (80.6240)  acc5: 95.6000 (95.9040)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7067 s / it)
* Acc@1 81.068 Acc@5 95.922 loss 1.054
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.31%
Epoch: [145]  [   0/1251]  eta: 1:37:46  lr: 0.002044  min_lr: 0.002044  loss: 3.4437 (3.4437)  weight_decay: 0.0500 (0.0500)  time: 4.6891  data: 2.1900  max mem: 69511
Epoch: [145]  [ 200/1251]  eta: 0:14:06  lr: 0.002040  min_lr: 0.002040  loss: 2.8982 (3.0586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5922 (0.6588)  time: 0.7861  data: 0.0007  max mem: 69511
Epoch: [145]  [ 400/1251]  eta: 0:11:18  lr: 0.002037  min_lr: 0.002037  loss: 2.8235 (3.0572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.6718)  time: 0.7905  data: 0.0007  max mem: 69511
Epoch: [145]  [ 600/1251]  eta: 0:08:37  lr: 0.002034  min_lr: 0.002034  loss: 3.2157 (3.0576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6174 (0.6605)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [145]  [ 800/1251]  eta: 0:05:57  lr: 0.002031  min_lr: 0.002031  loss: 3.0614 (3.0709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6825 (0.6561)  time: 0.7912  data: 0.0005  max mem: 69511
Epoch: [145]  [1000/1251]  eta: 0:03:18  lr: 0.002028  min_lr: 0.002028  loss: 3.1854 (3.0869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.6622)  time: 0.7869  data: 0.0006  max mem: 69511
Epoch: [145]  [1200/1251]  eta: 0:00:40  lr: 0.002025  min_lr: 0.002025  loss: 3.0617 (3.0872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6855 (0.6670)  time: 0.7879  data: 0.0005  max mem: 69511
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002024  min_lr: 0.002024  loss: 2.9809 (3.0882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.6716)  time: 0.6691  data: 0.0005  max mem: 69511
Epoch: [145] Total time: 0:16:28 (0.7901 s / it)
Averaged stats: lr: 0.002024  min_lr: 0.002024  loss: 2.9809 (3.1047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.6716)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.7019 (0.7019)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 7.7686  data: 7.3391  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9398 (0.8902)  acc1: 86.0000 (84.1818)  acc5: 97.6000 (97.7091)  time: 1.0675  data: 0.6675  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0087 (1.0425)  acc1: 79.6000 (81.2191)  acc5: 95.6000 (96.1333)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1273 (1.0549)  acc1: 79.6000 (80.9280)  acc5: 95.2000 (95.9520)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6982 s / it)
* Acc@1 81.276 Acc@5 95.950 loss 1.044
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.31%
Epoch: [146]  [   0/1251]  eta: 1:37:45  lr: 0.002024  min_lr: 0.002024  loss: 3.4893 (3.4893)  weight_decay: 0.0500 (0.0500)  time: 4.6887  data: 3.5277  max mem: 69511
Epoch: [146]  [ 200/1251]  eta: 0:14:05  lr: 0.002021  min_lr: 0.002021  loss: 3.1910 (3.0845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.6400)  time: 0.7815  data: 0.0005  max mem: 69511
Epoch: [146]  [ 400/1251]  eta: 0:11:16  lr: 0.002018  min_lr: 0.002018  loss: 2.8903 (3.0778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.6509)  time: 0.7846  data: 0.0005  max mem: 69511
Epoch: [146]  [ 600/1251]  eta: 0:08:35  lr: 0.002015  min_lr: 0.002015  loss: 3.2545 (3.0746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6781 (0.6702)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [146]  [ 800/1251]  eta: 0:05:56  lr: 0.002012  min_lr: 0.002012  loss: 2.8226 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.6843)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [146]  [1000/1251]  eta: 0:03:18  lr: 0.002009  min_lr: 0.002009  loss: 3.2365 (3.0749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6099 (0.6787)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [146]  [1200/1251]  eta: 0:00:40  lr: 0.002006  min_lr: 0.002006  loss: 3.2139 (3.0858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6711 (0.6799)  time: 0.7874  data: 0.0005  max mem: 69511
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002005  min_lr: 0.002005  loss: 3.0095 (3.0860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (0.6788)  time: 0.6685  data: 0.0006  max mem: 69511
Epoch: [146] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.002005  min_lr: 0.002005  loss: 3.0095 (3.0971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (0.6788)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6479 (0.6479)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 7.5033  data: 7.0884  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8471 (0.8267)  acc1: 85.6000 (84.4727)  acc5: 97.2000 (97.5273)  time: 1.0431  data: 0.6447  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9334 (0.9706)  acc1: 80.4000 (81.5810)  acc5: 95.6000 (95.9810)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0954 (0.9811)  acc1: 79.6000 (81.2800)  acc5: 95.2000 (95.8240)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6865 s / it)
* Acc@1 81.456 Acc@5 96.038 loss 0.973
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.46%
Epoch: [147]  [   0/1251]  eta: 1:33:40  lr: 0.002005  min_lr: 0.002005  loss: 3.2335 (3.2335)  weight_decay: 0.0500 (0.0500)  time: 4.4925  data: 3.6977  max mem: 69511
Epoch: [147]  [ 200/1251]  eta: 0:14:07  lr: 0.002002  min_lr: 0.002002  loss: 3.1716 (3.0544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7271 (0.6986)  time: 0.7900  data: 0.0005  max mem: 69511
Epoch: [147]  [ 400/1251]  eta: 0:11:18  lr: 0.001999  min_lr: 0.001999  loss: 3.0943 (3.0581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6530 (0.6834)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [147]  [ 600/1251]  eta: 0:08:35  lr: 0.001995  min_lr: 0.001995  loss: 3.1246 (3.0523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6590 (0.6883)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [147]  [ 800/1251]  eta: 0:05:56  lr: 0.001992  min_lr: 0.001992  loss: 3.1988 (3.0730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6571 (0.6957)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [147]  [1000/1251]  eta: 0:03:18  lr: 0.001989  min_lr: 0.001989  loss: 3.1085 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6024 (0.6812)  time: 0.7826  data: 0.0004  max mem: 69511
Epoch: [147]  [1200/1251]  eta: 0:00:40  lr: 0.001986  min_lr: 0.001986  loss: 3.1503 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6172 (0.6783)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.001985  min_lr: 0.001985  loss: 2.9785 (3.0773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6172 (0.6766)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [147] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.001985  min_lr: 0.001985  loss: 2.9785 (3.0986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6172 (0.6766)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.6324 (0.6324)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 7.2772  data: 6.8407  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8638 (0.8265)  acc1: 84.0000 (84.3273)  acc5: 98.0000 (97.8182)  time: 1.0222  data: 0.6221  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9885 (0.9765)  acc1: 79.2000 (81.4286)  acc5: 96.0000 (96.2476)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0730 (0.9896)  acc1: 79.2000 (81.0400)  acc5: 96.0000 (96.1280)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6774 s / it)
* Acc@1 81.400 Acc@5 96.078 loss 0.976
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.46%
Epoch: [148]  [   0/1251]  eta: 1:41:44  lr: 0.001985  min_lr: 0.001985  loss: 2.7082 (2.7082)  weight_decay: 0.0500 (0.0500)  time: 4.8798  data: 3.1845  max mem: 69511
Epoch: [148]  [ 200/1251]  eta: 0:14:08  lr: 0.001982  min_lr: 0.001982  loss: 3.2610 (3.0899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6637 (0.7216)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [148]  [ 400/1251]  eta: 0:11:18  lr: 0.001979  min_lr: 0.001979  loss: 3.1711 (3.0748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (0.6908)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [148]  [ 600/1251]  eta: 0:08:36  lr: 0.001976  min_lr: 0.001976  loss: 3.0575 (3.0814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6575 (0.6908)  time: 0.7889  data: 0.0005  max mem: 69511
Epoch: [148]  [ 800/1251]  eta: 0:05:56  lr: 0.001973  min_lr: 0.001973  loss: 3.2100 (3.0643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6363 (0.6843)  time: 0.7849  data: 0.0006  max mem: 69511
Epoch: [148]  [1000/1251]  eta: 0:03:18  lr: 0.001970  min_lr: 0.001970  loss: 3.1422 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6533 (0.6897)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [148]  [1200/1251]  eta: 0:00:40  lr: 0.001967  min_lr: 0.001967  loss: 3.2440 (3.0797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7005 (0.6885)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.001966  min_lr: 0.001966  loss: 3.2103 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7009 (0.6896)  time: 0.6677  data: 0.0005  max mem: 69511
Epoch: [148] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.001966  min_lr: 0.001966  loss: 3.2103 (3.0931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7009 (0.6896)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.7082 (0.7082)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 7.3266  data: 6.8979  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9216 (0.8822)  acc1: 84.0000 (84.6182)  acc5: 97.6000 (97.4546)  time: 1.0268  data: 0.6273  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0199 (1.0171)  acc1: 79.6000 (81.5810)  acc5: 96.0000 (96.1143)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0928 (1.0284)  acc1: 79.6000 (81.1680)  acc5: 96.0000 (96.0000)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6796 s / it)
* Acc@1 81.534 Acc@5 96.044 loss 1.017
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.53%
Epoch: [149]  [   0/1251]  eta: 1:25:05  lr: 0.001966  min_lr: 0.001966  loss: 3.3376 (3.3376)  weight_decay: 0.0500 (0.0500)  time: 4.0811  data: 3.2928  max mem: 69511
Epoch: [149]  [ 200/1251]  eta: 0:14:02  lr: 0.001963  min_lr: 0.001963  loss: 3.2507 (3.0940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6635 (0.6765)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [149]  [ 400/1251]  eta: 0:11:15  lr: 0.001960  min_lr: 0.001960  loss: 3.2467 (3.0878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7006 (0.6957)  time: 0.7823  data: 0.0005  max mem: 69511
Epoch: [149]  [ 600/1251]  eta: 0:08:35  lr: 0.001957  min_lr: 0.001957  loss: 3.0847 (3.1058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6118 (0.6913)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [149]  [ 800/1251]  eta: 0:05:56  lr: 0.001953  min_lr: 0.001953  loss: 3.0048 (3.0932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7429 (0.6932)  time: 0.7861  data: 0.0006  max mem: 69511
Epoch: [149]  [1000/1251]  eta: 0:03:18  lr: 0.001950  min_lr: 0.001950  loss: 2.9715 (3.0973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6345 (0.6939)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [149]  [1200/1251]  eta: 0:00:40  lr: 0.001947  min_lr: 0.001947  loss: 2.9938 (3.0861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6063 (0.6881)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.001946  min_lr: 0.001946  loss: 2.6391 (3.0829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5513 (0.6854)  time: 0.6669  data: 0.0006  max mem: 69511
Epoch: [149] Total time: 0:16:24 (0.7874 s / it)
Averaged stats: lr: 0.001946  min_lr: 0.001946  loss: 2.6391 (3.0855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5513 (0.6854)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.6413 (0.6413)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.7967  data: 7.3640  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8275 (0.7997)  acc1: 84.8000 (84.5091)  acc5: 97.2000 (97.4909)  time: 1.0696  data: 0.6697  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9472 (0.9227)  acc1: 80.4000 (81.6762)  acc5: 95.6000 (96.1143)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9933 (0.9342)  acc1: 80.4000 (81.2640)  acc5: 95.6000 (96.0320)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7007 s / it)
* Acc@1 81.692 Acc@5 96.020 loss 0.922
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.69%
Epoch: [150]  [   0/1251]  eta: 1:13:25  lr: 0.001946  min_lr: 0.001946  loss: 3.4975 (3.4975)  weight_decay: 0.0500 (0.0500)  time: 3.5218  data: 2.7318  max mem: 69511
Epoch: [150]  [ 200/1251]  eta: 0:14:02  lr: 0.001943  min_lr: 0.001943  loss: 3.1849 (3.1571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6537 (0.6765)  time: 0.7926  data: 0.0004  max mem: 69511
Epoch: [150]  [ 400/1251]  eta: 0:11:15  lr: 0.001940  min_lr: 0.001940  loss: 3.0096 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6098 (0.6771)  time: 0.7845  data: 0.0004  max mem: 69511
Epoch: [150]  [ 600/1251]  eta: 0:08:35  lr: 0.001937  min_lr: 0.001937  loss: 3.3503 (3.0929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6306 (0.6847)  time: 0.7848  data: 0.0004  max mem: 69511
Epoch: [150]  [ 800/1251]  eta: 0:05:56  lr: 0.001934  min_lr: 0.001934  loss: 3.2339 (3.0838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7052 (0.6819)  time: 0.7885  data: 0.0004  max mem: 69511
Epoch: [150]  [1000/1251]  eta: 0:03:18  lr: 0.001931  min_lr: 0.001931  loss: 3.1705 (3.0774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.6795)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [150]  [1200/1251]  eta: 0:00:40  lr: 0.001928  min_lr: 0.001928  loss: 3.0926 (3.0826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.6866)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.001927  min_lr: 0.001927  loss: 3.1253 (3.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7915 (0.6913)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [150] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.001927  min_lr: 0.001927  loss: 3.1253 (3.0874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7915 (0.6913)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.7214 (0.7214)  acc1: 90.4000 (90.4000)  acc5: 98.0000 (98.0000)  time: 7.4041  data: 6.9936  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8955 (0.8645)  acc1: 85.2000 (84.9091)  acc5: 97.6000 (97.4546)  time: 1.0348  data: 0.6361  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0137 (0.9982)  acc1: 80.0000 (81.7905)  acc5: 95.6000 (96.0952)  time: 0.3974  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0678 (1.0023)  acc1: 79.2000 (81.5520)  acc5: 95.2000 (96.0480)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6855 s / it)
* Acc@1 81.660 Acc@5 96.002 loss 0.994
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.69%
Epoch: [151]  [   0/1251]  eta: 1:26:49  lr: 0.001927  min_lr: 0.001927  loss: 3.4685 (3.4685)  weight_decay: 0.0500 (0.0500)  time: 4.1639  data: 3.0004  max mem: 69511
Epoch: [151]  [ 200/1251]  eta: 0:14:05  lr: 0.001924  min_lr: 0.001924  loss: 3.1359 (3.1109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6779 (0.6906)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [151]  [ 400/1251]  eta: 0:11:17  lr: 0.001921  min_lr: 0.001921  loss: 3.0045 (3.0908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6156 (0.6768)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [151]  [ 600/1251]  eta: 0:08:36  lr: 0.001917  min_lr: 0.001917  loss: 3.1282 (3.0946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6788 (0.6868)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [151]  [ 800/1251]  eta: 0:05:57  lr: 0.001914  min_lr: 0.001914  loss: 3.0424 (3.0919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6721 (0.6905)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [151]  [1000/1251]  eta: 0:03:18  lr: 0.001911  min_lr: 0.001911  loss: 3.1733 (3.0870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6234 (0.6901)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [151]  [1200/1251]  eta: 0:00:40  lr: 0.001908  min_lr: 0.001908  loss: 3.2377 (3.0826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6591 (0.6924)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.001907  min_lr: 0.001907  loss: 3.2591 (3.0836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6337 (0.6916)  time: 0.6674  data: 0.0007  max mem: 69511
Epoch: [151] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.001907  min_lr: 0.001907  loss: 3.2591 (3.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6337 (0.6916)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.7706 (0.7706)  acc1: 90.4000 (90.4000)  acc5: 98.0000 (98.0000)  time: 7.4652  data: 7.0359  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9606 (0.9600)  acc1: 84.4000 (84.7273)  acc5: 98.0000 (97.3818)  time: 1.0396  data: 0.6399  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1710 (1.0937)  acc1: 80.4000 (81.4476)  acc5: 96.0000 (96.0571)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1720 (1.0995)  acc1: 79.2000 (80.9760)  acc5: 95.6000 (95.9520)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6879 s / it)
* Acc@1 81.344 Acc@5 95.910 loss 1.089
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.69%
Epoch: [152]  [   0/1251]  eta: 1:38:54  lr: 0.001907  min_lr: 0.001907  loss: 2.6105 (2.6105)  weight_decay: 0.0500 (0.0500)  time: 4.7441  data: 3.1962  max mem: 69511
Epoch: [152]  [ 200/1251]  eta: 0:14:09  lr: 0.001904  min_lr: 0.001904  loss: 2.9837 (3.0571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6714 (nan)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [152]  [ 400/1251]  eta: 0:11:18  lr: 0.001901  min_lr: 0.001901  loss: 3.3619 (3.0782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7392 (nan)  time: 0.7975  data: 0.0004  max mem: 69511
Epoch: [152]  [ 600/1251]  eta: 0:08:37  lr: 0.001898  min_lr: 0.001898  loss: 3.0600 (3.0760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6511 (nan)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [152]  [ 800/1251]  eta: 0:05:57  lr: 0.001895  min_lr: 0.001895  loss: 3.2550 (3.0712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6810 (nan)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [152]  [1000/1251]  eta: 0:03:18  lr: 0.001892  min_lr: 0.001892  loss: 3.1819 (3.0710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6705 (nan)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [152]  [1200/1251]  eta: 0:00:40  lr: 0.001889  min_lr: 0.001889  loss: 3.1328 (3.0585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7246 (nan)  time: 0.7972  data: 0.0004  max mem: 69511
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 3.3230 (3.0610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (nan)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [152] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 3.3230 (3.0700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (nan)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.8506 (0.8506)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 7.6835  data: 7.2527  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9557 (0.9569)  acc1: 85.2000 (84.6182)  acc5: 97.2000 (97.4182)  time: 1.0592  data: 0.6596  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1420 (1.0963)  acc1: 80.0000 (81.4667)  acc5: 96.0000 (96.2476)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2083 (1.1011)  acc1: 79.6000 (81.1840)  acc5: 96.0000 (96.1600)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6936 s / it)
* Acc@1 81.548 Acc@5 96.076 loss 1.091
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.69%
Epoch: [153]  [   0/1251]  eta: 1:42:13  lr: 0.001888  min_lr: 0.001888  loss: 2.2446 (2.2446)  weight_decay: 0.0500 (0.0500)  time: 4.9029  data: 4.1075  max mem: 69511
Epoch: [153]  [ 200/1251]  eta: 0:14:07  lr: 0.001885  min_lr: 0.001885  loss: 3.2361 (3.0424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6519 (0.6598)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [153]  [ 400/1251]  eta: 0:11:19  lr: 0.001881  min_lr: 0.001881  loss: 3.2524 (3.0724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7246 (0.6814)  time: 0.7910  data: 0.0006  max mem: 69511
Epoch: [153]  [ 600/1251]  eta: 0:08:37  lr: 0.001878  min_lr: 0.001878  loss: 3.0602 (3.0674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7451 (0.6939)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [153]  [ 800/1251]  eta: 0:05:57  lr: 0.001875  min_lr: 0.001875  loss: 3.2617 (3.0650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6821 (0.6876)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [153]  [1000/1251]  eta: 0:03:18  lr: 0.001872  min_lr: 0.001872  loss: 3.0107 (3.0618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6800 (0.6907)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [153]  [1200/1251]  eta: 0:00:40  lr: 0.001869  min_lr: 0.001869  loss: 3.3957 (3.0613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6780 (0.6899)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.001868  min_lr: 0.001868  loss: 3.2003 (3.0603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5974 (0.6863)  time: 0.6695  data: 0.0006  max mem: 69511
Epoch: [153] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.001868  min_lr: 0.001868  loss: 3.2003 (3.0694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5974 (0.6863)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.7624 (0.7624)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 7.1989  data: 6.7750  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8524 (0.8716)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.5273)  time: 1.0181  data: 0.6187  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0227 (1.0136)  acc1: 80.8000 (81.6762)  acc5: 96.4000 (96.0571)  time: 0.3984  data: 0.0016  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1168 (1.0175)  acc1: 80.4000 (81.3120)  acc5: 95.2000 (96.0320)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6772 s / it)
* Acc@1 81.614 Acc@5 96.132 loss 1.017
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.69%
Epoch: [154]  [   0/1251]  eta: 1:34:10  lr: 0.001868  min_lr: 0.001868  loss: 2.2297 (2.2297)  weight_decay: 0.0500 (0.0500)  time: 4.5170  data: 3.1337  max mem: 69511
Epoch: [154]  [ 200/1251]  eta: 0:14:10  lr: 0.001865  min_lr: 0.001865  loss: 3.0291 (3.0286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6906 (0.7061)  time: 0.7934  data: 0.0005  max mem: 69511
Epoch: [154]  [ 400/1251]  eta: 0:11:19  lr: 0.001862  min_lr: 0.001862  loss: 3.0317 (3.0538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.6946)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [154]  [ 600/1251]  eta: 0:08:36  lr: 0.001859  min_lr: 0.001859  loss: 3.2082 (3.0576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6440 (0.6900)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [154]  [ 800/1251]  eta: 0:05:57  lr: 0.001856  min_lr: 0.001856  loss: 2.9249 (3.0550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7653 (0.6958)  time: 0.7834  data: 0.0004  max mem: 69511
Epoch: [154]  [1000/1251]  eta: 0:03:18  lr: 0.001852  min_lr: 0.001852  loss: 3.3433 (3.0649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6367 (0.6876)  time: 0.7848  data: 0.0005  max mem: 69511
Epoch: [154]  [1200/1251]  eta: 0:00:40  lr: 0.001849  min_lr: 0.001849  loss: 3.1042 (3.0696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6787 (0.6893)  time: 0.7930  data: 0.0005  max mem: 69511
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.001849  min_lr: 0.001849  loss: 3.0104 (3.0690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6678 (0.6892)  time: 0.6710  data: 0.0006  max mem: 69511
Epoch: [154] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.001849  min_lr: 0.001849  loss: 3.0104 (3.0678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6678 (0.6892)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.6727 (0.6727)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 7.2544  data: 6.8276  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8725 (0.8430)  acc1: 85.2000 (84.7636)  acc5: 97.6000 (97.4909)  time: 1.0205  data: 0.6210  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9936 (1.0115)  acc1: 80.0000 (81.4476)  acc5: 96.4000 (96.1524)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1012 (1.0265)  acc1: 79.6000 (80.9600)  acc5: 95.6000 (96.1120)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6768 s / it)
* Acc@1 81.542 Acc@5 96.050 loss 1.018
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.69%
Epoch: [155]  [   0/1251]  eta: 1:44:06  lr: 0.001849  min_lr: 0.001849  loss: 3.3100 (3.3100)  weight_decay: 0.0500 (0.0500)  time: 4.9931  data: 3.8539  max mem: 69511
Epoch: [155]  [ 200/1251]  eta: 0:14:09  lr: 0.001845  min_lr: 0.001845  loss: 3.2406 (3.0418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6977 (0.6751)  time: 0.8017  data: 0.0004  max mem: 69511
Epoch: [155]  [ 400/1251]  eta: 0:11:18  lr: 0.001842  min_lr: 0.001842  loss: 3.2845 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6806 (0.6683)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [155]  [ 600/1251]  eta: 0:08:37  lr: 0.001839  min_lr: 0.001839  loss: 3.1308 (3.0372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6531 (0.6754)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [155]  [ 800/1251]  eta: 0:05:57  lr: 0.001836  min_lr: 0.001836  loss: 3.1704 (3.0540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6000 (0.6709)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [155]  [1000/1251]  eta: 0:03:18  lr: 0.001833  min_lr: 0.001833  loss: 3.1910 (3.0616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6674 (0.6771)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [155]  [1200/1251]  eta: 0:00:40  lr: 0.001830  min_lr: 0.001830  loss: 2.7732 (3.0635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.6839)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.001829  min_lr: 0.001829  loss: 2.9144 (3.0599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (0.6834)  time: 0.6686  data: 0.0006  max mem: 69511
Epoch: [155] Total time: 0:16:28 (0.7900 s / it)
Averaged stats: lr: 0.001829  min_lr: 0.001829  loss: 2.9144 (3.0550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (0.6834)
Test:  [ 0/25]  eta: 0:02:58  loss: 0.6539 (0.6539)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 7.1493  data: 6.7221  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8228 (0.7989)  acc1: 84.8000 (84.6909)  acc5: 97.6000 (97.4909)  time: 1.0112  data: 0.6114  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9655 (0.9307)  acc1: 79.6000 (81.9048)  acc5: 96.4000 (96.1143)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0161 (0.9363)  acc1: 78.0000 (81.4080)  acc5: 95.6000 (96.0320)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6742 s / it)
* Acc@1 81.784 Acc@5 96.170 loss 0.932
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.78%
Epoch: [156]  [   0/1251]  eta: 1:22:45  lr: 0.001829  min_lr: 0.001829  loss: 2.8597 (2.8597)  weight_decay: 0.0500 (0.0500)  time: 3.9694  data: 3.1683  max mem: 69511
Epoch: [156]  [ 200/1251]  eta: 0:14:03  lr: 0.001826  min_lr: 0.001826  loss: 3.1580 (2.9970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7151 (0.6834)  time: 0.7853  data: 0.0003  max mem: 69511
Epoch: [156]  [ 400/1251]  eta: 0:11:16  lr: 0.001823  min_lr: 0.001823  loss: 3.0400 (3.0016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7190 (0.7099)  time: 0.7919  data: 0.0004  max mem: 69511
Epoch: [156]  [ 600/1251]  eta: 0:08:36  lr: 0.001820  min_lr: 0.001820  loss: 3.2225 (3.0208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6268 (0.7015)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [156]  [ 800/1251]  eta: 0:05:57  lr: 0.001816  min_lr: 0.001816  loss: 3.0518 (3.0234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6487 (0.7052)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [156]  [1000/1251]  eta: 0:03:18  lr: 0.001813  min_lr: 0.001813  loss: 3.0801 (3.0337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.7058)  time: 0.7876  data: 0.0004  max mem: 69511
Epoch: [156]  [1200/1251]  eta: 0:00:40  lr: 0.001810  min_lr: 0.001810  loss: 3.1386 (3.0410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6390 (0.7031)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.001809  min_lr: 0.001809  loss: 3.0387 (3.0360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.7055)  time: 0.6689  data: 0.0004  max mem: 69511
Epoch: [156] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.001809  min_lr: 0.001809  loss: 3.0387 (3.0624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.7055)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.6844 (0.6844)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 7.6708  data: 7.2409  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8548 (0.8322)  acc1: 84.4000 (84.9455)  acc5: 97.6000 (97.7818)  time: 1.0584  data: 0.6585  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9570 (0.9560)  acc1: 80.4000 (81.8476)  acc5: 96.0000 (96.2286)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0312 (0.9620)  acc1: 79.2000 (81.4560)  acc5: 95.6000 (96.1600)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6965 s / it)
* Acc@1 81.796 Acc@5 96.164 loss 0.951
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.80%
Epoch: [157]  [   0/1251]  eta: 1:27:08  lr: 0.001809  min_lr: 0.001809  loss: 3.3862 (3.3862)  weight_decay: 0.0500 (0.0500)  time: 4.1798  data: 3.3913  max mem: 69511
Epoch: [157]  [ 200/1251]  eta: 0:14:04  lr: 0.001806  min_lr: 0.001806  loss: 3.1762 (3.1170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6712 (0.6630)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [157]  [ 400/1251]  eta: 0:11:17  lr: 0.001803  min_lr: 0.001803  loss: 3.2761 (3.1151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.6878)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [157]  [ 600/1251]  eta: 0:08:36  lr: 0.001800  min_lr: 0.001800  loss: 3.1991 (3.1183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7150 (0.7022)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [157]  [ 800/1251]  eta: 0:05:57  lr: 0.001797  min_lr: 0.001797  loss: 2.9950 (3.0989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7049 (0.7068)  time: 0.7918  data: 0.0006  max mem: 69511
Epoch: [157]  [1000/1251]  eta: 0:03:18  lr: 0.001794  min_lr: 0.001794  loss: 3.1502 (3.0899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.7097)  time: 0.7835  data: 0.0004  max mem: 69511
Epoch: [157]  [1200/1251]  eta: 0:00:40  lr: 0.001791  min_lr: 0.001791  loss: 3.2500 (3.0861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7372 (0.7093)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.001790  min_lr: 0.001790  loss: 3.2487 (3.0884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.7117)  time: 0.6663  data: 0.0006  max mem: 69511
Epoch: [157] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.001790  min_lr: 0.001790  loss: 3.2487 (3.0691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.7117)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7806 (0.7806)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 7.5677  data: 7.1452  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9215 (0.9038)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.4182)  time: 1.0460  data: 0.6498  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0042 (1.0569)  acc1: 80.0000 (81.4286)  acc5: 96.0000 (96.0952)  time: 0.3938  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1489 (1.0703)  acc1: 78.8000 (80.9280)  acc5: 95.6000 (95.9840)  time: 0.3937  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6925 s / it)
* Acc@1 81.658 Acc@5 96.028 loss 1.057
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.80%
Epoch: [158]  [   0/1251]  eta: 1:44:03  lr: 0.001790  min_lr: 0.001790  loss: 3.3498 (3.3498)  weight_decay: 0.0500 (0.0500)  time: 4.9906  data: 4.1968  max mem: 69511
Epoch: [158]  [ 200/1251]  eta: 0:14:07  lr: 0.001787  min_lr: 0.001787  loss: 2.7074 (3.0212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.7146)  time: 0.7830  data: 0.0005  max mem: 69511
Epoch: [158]  [ 400/1251]  eta: 0:11:18  lr: 0.001783  min_lr: 0.001783  loss: 3.2682 (3.0545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6648 (0.6987)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [158]  [ 600/1251]  eta: 0:08:36  lr: 0.001780  min_lr: 0.001780  loss: 3.1330 (3.0389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7012 (0.7141)  time: 0.7901  data: 0.0004  max mem: 69511
Epoch: [158]  [ 800/1251]  eta: 0:05:56  lr: 0.001777  min_lr: 0.001777  loss: 2.9071 (3.0417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6805 (0.7142)  time: 0.7840  data: 0.0004  max mem: 69511
Epoch: [158]  [1000/1251]  eta: 0:03:18  lr: 0.001774  min_lr: 0.001774  loss: 3.3651 (3.0460)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [158]  [1200/1251]  eta: 0:00:40  lr: 0.001771  min_lr: 0.001771  loss: 3.2113 (3.0488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7409 (nan)  time: 0.7890  data: 0.0004  max mem: 69511
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.001770  min_lr: 0.001770  loss: 3.2235 (3.0494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7424 (nan)  time: 0.6646  data: 0.0005  max mem: 69511
Epoch: [158] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.001770  min_lr: 0.001770  loss: 3.2235 (3.0511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7424 (nan)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.8052 (0.8052)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 7.3266  data: 6.8945  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9818 (0.9515)  acc1: 84.0000 (84.7273)  acc5: 98.0000 (97.8182)  time: 1.0240  data: 0.6271  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0869 (1.1037)  acc1: 79.6000 (81.4286)  acc5: 96.4000 (96.0762)  time: 0.3936  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.2192 (1.1149)  acc1: 79.2000 (81.0240)  acc5: 94.8000 (95.9040)  time: 0.3935  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6828 s / it)
* Acc@1 81.514 Acc@5 95.958 loss 1.105
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.80%
Epoch: [159]  [   0/1251]  eta: 1:48:16  lr: 0.001770  min_lr: 0.001770  loss: 2.8394 (2.8394)  weight_decay: 0.0500 (0.0500)  time: 5.1932  data: 2.9670  max mem: 69511
Epoch: [159]  [ 200/1251]  eta: 0:14:08  lr: 0.001767  min_lr: 0.001767  loss: 3.2372 (3.0385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6307 (0.6857)  time: 0.7832  data: 0.0005  max mem: 69511
Epoch: [159]  [ 400/1251]  eta: 0:11:17  lr: 0.001764  min_lr: 0.001764  loss: 3.0464 (3.0347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6519 (0.6796)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [159]  [ 600/1251]  eta: 0:08:36  lr: 0.001761  min_lr: 0.001761  loss: 2.7373 (3.0352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6834 (0.6951)  time: 0.7881  data: 0.0006  max mem: 69511
Epoch: [159]  [ 800/1251]  eta: 0:05:56  lr: 0.001758  min_lr: 0.001758  loss: 3.2301 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.7073)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [159]  [1000/1251]  eta: 0:03:18  lr: 0.001754  min_lr: 0.001754  loss: 3.2387 (3.0407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7126 (0.7043)  time: 0.7885  data: 0.0005  max mem: 69511
Epoch: [159]  [1200/1251]  eta: 0:00:40  lr: 0.001751  min_lr: 0.001751  loss: 3.1409 (3.0409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6730 (0.7059)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.001751  min_lr: 0.001751  loss: 3.3220 (3.0431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6853 (0.7077)  time: 0.6679  data: 0.0006  max mem: 69511
Epoch: [159] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.001751  min_lr: 0.001751  loss: 3.3220 (3.0479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6853 (0.7077)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7872 (0.7872)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.5971  data: 7.1789  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9857 (0.9868)  acc1: 85.2000 (85.2364)  acc5: 97.6000 (97.6364)  time: 1.0515  data: 0.6529  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1473 (1.0899)  acc1: 80.4000 (81.9810)  acc5: 96.4000 (96.0762)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1492 (1.0979)  acc1: 79.6000 (81.4080)  acc5: 95.2000 (95.9680)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6901 s / it)
* Acc@1 81.758 Acc@5 96.076 loss 1.087
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.80%
Epoch: [160]  [   0/1251]  eta: 1:30:53  lr: 0.001750  min_lr: 0.001750  loss: 3.2915 (3.2915)  weight_decay: 0.0500 (0.0500)  time: 4.3593  data: 3.1719  max mem: 69511
Epoch: [160]  [ 200/1251]  eta: 0:14:04  lr: 0.001747  min_lr: 0.001747  loss: 3.1234 (3.0186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6437 (0.7053)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [160]  [ 400/1251]  eta: 0:11:17  lr: 0.001744  min_lr: 0.001744  loss: 2.9605 (3.0400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6796 (0.6992)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [160]  [ 600/1251]  eta: 0:08:36  lr: 0.001741  min_lr: 0.001741  loss: 3.0053 (3.0419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.7005)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [160]  [ 800/1251]  eta: 0:05:57  lr: 0.001738  min_lr: 0.001738  loss: 3.0947 (3.0468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7113)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [160]  [1000/1251]  eta: 0:03:18  lr: 0.001735  min_lr: 0.001735  loss: 3.2693 (3.0619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7171 (0.7122)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [160]  [1200/1251]  eta: 0:00:40  lr: 0.001732  min_lr: 0.001732  loss: 3.0419 (3.0580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (0.7079)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001731  min_lr: 0.001731  loss: 3.2652 (3.0573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6024 (0.7056)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [160] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.001731  min_lr: 0.001731  loss: 3.2652 (3.0438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6024 (0.7056)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7648 (0.7648)  acc1: 88.8000 (88.8000)  acc5: 97.6000 (97.6000)  time: 7.5062  data: 7.0723  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9191 (0.8703)  acc1: 84.8000 (84.9818)  acc5: 97.2000 (97.3818)  time: 1.0436  data: 0.6432  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9653 (1.0016)  acc1: 80.0000 (81.8667)  acc5: 96.0000 (96.1143)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1159 (1.0139)  acc1: 78.4000 (81.3440)  acc5: 95.6000 (96.0160)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6888 s / it)
* Acc@1 81.868 Acc@5 96.174 loss 1.003
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.87%
Epoch: [161]  [   0/1251]  eta: 1:28:22  lr: 0.001731  min_lr: 0.001731  loss: 3.2951 (3.2951)  weight_decay: 0.0500 (0.0500)  time: 4.2387  data: 3.4573  max mem: 69511
Epoch: [161]  [ 200/1251]  eta: 0:14:05  lr: 0.001728  min_lr: 0.001728  loss: 3.1023 (2.9318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6852 (0.7027)  time: 0.7927  data: 0.0005  max mem: 69511
Epoch: [161]  [ 400/1251]  eta: 0:11:17  lr: 0.001725  min_lr: 0.001725  loss: 3.2310 (3.0049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.6947)  time: 0.8016  data: 0.0004  max mem: 69511
Epoch: [161]  [ 600/1251]  eta: 0:08:36  lr: 0.001721  min_lr: 0.001721  loss: 3.2019 (3.0435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (0.7041)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [161]  [ 800/1251]  eta: 0:05:57  lr: 0.001718  min_lr: 0.001718  loss: 3.0874 (3.0328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6642 (0.7007)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [161]  [1000/1251]  eta: 0:03:18  lr: 0.001715  min_lr: 0.001715  loss: 3.1928 (3.0176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6770 (0.7042)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [161]  [1200/1251]  eta: 0:00:40  lr: 0.001712  min_lr: 0.001712  loss: 3.1461 (3.0223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.7148)  time: 0.7862  data: 0.0006  max mem: 69511
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001711  min_lr: 0.001711  loss: 3.3295 (3.0232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6335 (0.7131)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [161] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.001711  min_lr: 0.001711  loss: 3.3295 (3.0356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6335 (0.7131)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.7310 (0.7310)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.3725  data: 6.9459  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9708 (0.9203)  acc1: 84.8000 (85.1273)  acc5: 98.0000 (97.5636)  time: 1.0306  data: 0.6317  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0333 (1.0450)  acc1: 81.2000 (82.1143)  acc5: 96.0000 (96.1524)  time: 0.3965  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1104 (1.0549)  acc1: 80.4000 (81.6160)  acc5: 95.6000 (96.1280)  time: 0.3965  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6804 s / it)
* Acc@1 81.956 Acc@5 96.160 loss 1.042
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 81.96%
Epoch: [162]  [   0/1251]  eta: 1:23:47  lr: 0.001711  min_lr: 0.001711  loss: 2.3882 (2.3882)  weight_decay: 0.0500 (0.0500)  time: 4.0191  data: 3.2358  max mem: 69511
Epoch: [162]  [ 200/1251]  eta: 0:14:04  lr: 0.001708  min_lr: 0.001708  loss: 3.3346 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6504 (0.7053)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [162]  [ 400/1251]  eta: 0:11:16  lr: 0.001705  min_lr: 0.001705  loss: 3.0562 (3.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7000 (0.7230)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [162]  [ 600/1251]  eta: 0:08:36  lr: 0.001702  min_lr: 0.001702  loss: 3.1018 (3.0191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (0.7243)  time: 0.7945  data: 0.0004  max mem: 69511
Epoch: [162]  [ 800/1251]  eta: 0:05:57  lr: 0.001699  min_lr: 0.001699  loss: 3.1791 (3.0239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.7254)  time: 0.7947  data: 0.0005  max mem: 69511
Epoch: [162]  [1000/1251]  eta: 0:03:18  lr: 0.001696  min_lr: 0.001696  loss: 3.1090 (3.0309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7661 (0.7214)  time: 0.7828  data: 0.0005  max mem: 69511
Epoch: [162]  [1200/1251]  eta: 0:00:40  lr: 0.001692  min_lr: 0.001692  loss: 3.1507 (3.0361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.7221)  time: 0.7834  data: 0.0006  max mem: 69511
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001692  min_lr: 0.001692  loss: 3.1939 (3.0394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.7225)  time: 0.6665  data: 0.0005  max mem: 69511
Epoch: [162] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.001692  min_lr: 0.001692  loss: 3.1939 (3.0404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.7225)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.6568 (0.6568)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 7.3322  data: 6.9055  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8455 (0.8051)  acc1: 84.4000 (84.9455)  acc5: 97.6000 (97.5636)  time: 1.0266  data: 0.6281  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9432 (0.9697)  acc1: 80.8000 (81.6952)  acc5: 96.0000 (96.2095)  time: 0.3961  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0668 (0.9825)  acc1: 79.6000 (81.1360)  acc5: 96.0000 (96.1920)  time: 0.3963  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6859 s / it)
* Acc@1 81.890 Acc@5 96.226 loss 0.973
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [163]  [   0/1251]  eta: 1:38:52  lr: 0.001692  min_lr: 0.001692  loss: 2.0437 (2.0437)  weight_decay: 0.0500 (0.0500)  time: 4.7423  data: 3.9078  max mem: 69511
Epoch: [163]  [ 200/1251]  eta: 0:14:07  lr: 0.001688  min_lr: 0.001688  loss: 3.1378 (3.0002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6814 (0.7024)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [163]  [ 400/1251]  eta: 0:11:17  lr: 0.001685  min_lr: 0.001685  loss: 3.0649 (3.0077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (0.6980)  time: 0.7905  data: 0.0004  max mem: 69511
Epoch: [163]  [ 600/1251]  eta: 0:08:36  lr: 0.001682  min_lr: 0.001682  loss: 3.2170 (3.0141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6781 (0.7123)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [163]  [ 800/1251]  eta: 0:05:57  lr: 0.001679  min_lr: 0.001679  loss: 3.1224 (3.0040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7172 (0.7116)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [163]  [1000/1251]  eta: 0:03:18  lr: 0.001676  min_lr: 0.001676  loss: 3.1857 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (0.7132)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [163]  [1200/1251]  eta: 0:00:40  lr: 0.001673  min_lr: 0.001673  loss: 2.9873 (3.0055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.7175)  time: 0.7963  data: 0.0004  max mem: 69511
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001672  min_lr: 0.001672  loss: 3.0036 (3.0027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.7177)  time: 0.6676  data: 0.0007  max mem: 69511
Epoch: [163] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.001672  min_lr: 0.001672  loss: 3.0036 (3.0288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.7177)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.7439 (0.7439)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 7.8043  data: 7.3954  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8914 (0.8836)  acc1: 84.4000 (85.0909)  acc5: 97.6000 (97.4182)  time: 1.0715  data: 0.6726  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0107 (1.0015)  acc1: 80.4000 (82.0952)  acc5: 96.0000 (96.1714)  time: 0.3977  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0976 (1.0100)  acc1: 80.0000 (81.6800)  acc5: 95.2000 (96.0640)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6994 s / it)
* Acc@1 81.884 Acc@5 96.132 loss 1.000
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [164]  [   0/1251]  eta: 1:38:32  lr: 0.001672  min_lr: 0.001672  loss: 3.0539 (3.0539)  weight_decay: 0.0500 (0.0500)  time: 4.7264  data: 3.6103  max mem: 69511
Epoch: [164]  [ 200/1251]  eta: 0:14:05  lr: 0.001669  min_lr: 0.001669  loss: 3.0769 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7575 (0.7124)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [164]  [ 400/1251]  eta: 0:11:17  lr: 0.001666  min_lr: 0.001666  loss: 3.0859 (2.9885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.7353)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [164]  [ 600/1251]  eta: 0:08:37  lr: 0.001663  min_lr: 0.001663  loss: 3.1893 (3.0043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7016 (0.7318)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [164]  [ 800/1251]  eta: 0:05:57  lr: 0.001659  min_lr: 0.001659  loss: 3.1577 (3.0010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (0.7345)  time: 0.7848  data: 0.0004  max mem: 69511
Epoch: [164]  [1000/1251]  eta: 0:03:18  lr: 0.001656  min_lr: 0.001656  loss: 2.9503 (3.0077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6703 (0.7394)  time: 0.7833  data: 0.0005  max mem: 69511
Epoch: [164]  [1200/1251]  eta: 0:00:40  lr: 0.001653  min_lr: 0.001653  loss: 3.2297 (3.0177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (0.7415)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001652  min_lr: 0.001652  loss: 3.0454 (3.0175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.7462)  time: 0.6670  data: 0.0006  max mem: 69511
Epoch: [164] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.001652  min_lr: 0.001652  loss: 3.0454 (3.0265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.7462)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.7032 (0.7032)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.9209  data: 7.5115  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9334 (0.8742)  acc1: 85.2000 (84.8000)  acc5: 97.6000 (97.6364)  time: 1.0820  data: 0.6832  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9995 (1.0059)  acc1: 80.8000 (81.5429)  acc5: 96.0000 (96.2476)  time: 0.3975  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0844 (1.0165)  acc1: 80.8000 (81.4080)  acc5: 96.0000 (96.1600)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7047 s / it)
* Acc@1 81.848 Acc@5 96.228 loss 1.007
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.96%
Epoch: [165]  [   0/1251]  eta: 1:30:33  lr: 0.001652  min_lr: 0.001652  loss: 1.9102 (1.9102)  weight_decay: 0.0500 (0.0500)  time: 4.3434  data: 3.5198  max mem: 69511
Epoch: [165]  [ 200/1251]  eta: 0:14:05  lr: 0.001649  min_lr: 0.001649  loss: 3.2597 (2.9777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6163 (0.6941)  time: 0.7937  data: 0.0004  max mem: 69511
Epoch: [165]  [ 400/1251]  eta: 0:11:17  lr: 0.001646  min_lr: 0.001646  loss: 3.1340 (3.0202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6149 (0.7009)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [165]  [ 600/1251]  eta: 0:08:36  lr: 0.001643  min_lr: 0.001643  loss: 3.0284 (3.0450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7096)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [165]  [ 800/1251]  eta: 0:05:57  lr: 0.001640  min_lr: 0.001640  loss: 3.1361 (3.0433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7155 (0.7123)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [165]  [1000/1251]  eta: 0:03:18  lr: 0.001637  min_lr: 0.001637  loss: 3.1841 (3.0373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7077 (0.7116)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [165]  [1200/1251]  eta: 0:00:40  lr: 0.001634  min_lr: 0.001634  loss: 3.0140 (3.0321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.7175)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001633  min_lr: 0.001633  loss: 3.2660 (3.0342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (0.7155)  time: 0.6723  data: 0.0007  max mem: 69511
Epoch: [165] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.001633  min_lr: 0.001633  loss: 3.2660 (3.0242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (0.7155)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7381 (0.7381)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 7.7039  data: 7.2850  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8873 (0.8943)  acc1: 84.0000 (84.4000)  acc5: 98.0000 (97.7091)  time: 1.0615  data: 0.6626  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0373 (1.0196)  acc1: 79.6000 (81.5238)  acc5: 96.4000 (96.3429)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0879 (1.0345)  acc1: 79.6000 (81.1360)  acc5: 95.6000 (96.0960)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6976 s / it)
* Acc@1 81.864 Acc@5 96.240 loss 1.026
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [166]  [   0/1251]  eta: 1:34:09  lr: 0.001633  min_lr: 0.001633  loss: 3.1688 (3.1688)  weight_decay: 0.0500 (0.0500)  time: 4.5161  data: 2.3403  max mem: 69511
Epoch: [166]  [ 200/1251]  eta: 0:14:07  lr: 0.001630  min_lr: 0.001630  loss: 2.8804 (2.9768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7549 (0.7807)  time: 0.7902  data: 0.0004  max mem: 69511
Epoch: [166]  [ 400/1251]  eta: 0:11:17  lr: 0.001627  min_lr: 0.001627  loss: 3.0171 (3.0014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6375 (0.7616)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [166]  [ 600/1251]  eta: 0:08:36  lr: 0.001623  min_lr: 0.001623  loss: 2.9937 (3.0108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6981 (0.7410)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [166]  [ 800/1251]  eta: 0:05:57  lr: 0.001620  min_lr: 0.001620  loss: 2.9822 (3.0134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6625 (0.7358)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [166]  [1000/1251]  eta: 0:03:18  lr: 0.001617  min_lr: 0.001617  loss: 3.1634 (3.0275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6211 (0.7308)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [166]  [1200/1251]  eta: 0:00:40  lr: 0.001614  min_lr: 0.001614  loss: 3.0212 (3.0243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7364 (0.7399)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001613  min_lr: 0.001613  loss: 3.1272 (3.0262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7199 (0.7392)  time: 0.6680  data: 0.0006  max mem: 69511
Epoch: [166] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.001613  min_lr: 0.001613  loss: 3.1272 (3.0233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7199 (0.7392)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7015 (0.7015)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 7.4989  data: 7.0813  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9085 (0.8806)  acc1: 84.8000 (84.9818)  acc5: 97.6000 (97.6727)  time: 1.0428  data: 0.6440  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0229 (1.0388)  acc1: 80.8000 (81.8667)  acc5: 96.0000 (96.2667)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1012 (1.0513)  acc1: 79.6000 (81.5520)  acc5: 95.6000 (96.1280)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6881 s / it)
* Acc@1 81.888 Acc@5 96.216 loss 1.044
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [167]  [   0/1251]  eta: 1:38:25  lr: 0.001613  min_lr: 0.001613  loss: 2.5539 (2.5539)  weight_decay: 0.0500 (0.0500)  time: 4.7208  data: 3.9173  max mem: 69511
Epoch: [167]  [ 200/1251]  eta: 0:14:07  lr: 0.001610  min_lr: 0.001610  loss: 2.8332 (3.0149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.7317)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [167]  [ 400/1251]  eta: 0:11:17  lr: 0.001607  min_lr: 0.001607  loss: 2.7455 (3.0228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7419)  time: 0.7937  data: 0.0004  max mem: 69511
Epoch: [167]  [ 600/1251]  eta: 0:08:36  lr: 0.001604  min_lr: 0.001604  loss: 3.0372 (3.0141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.7389)  time: 0.7924  data: 0.0006  max mem: 69511
Epoch: [167]  [ 800/1251]  eta: 0:05:56  lr: 0.001601  min_lr: 0.001601  loss: 3.0819 (3.0032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7782 (0.7479)  time: 0.7826  data: 0.0004  max mem: 69511
Epoch: [167]  [1000/1251]  eta: 0:03:18  lr: 0.001598  min_lr: 0.001598  loss: 3.0184 (3.0029)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.7889  data: 0.0004  max mem: 69511
Epoch: [167]  [1200/1251]  eta: 0:00:40  lr: 0.001594  min_lr: 0.001594  loss: 3.1571 (3.0033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (nan)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001594  min_lr: 0.001594  loss: 2.7358 (2.9974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6445 (nan)  time: 0.6672  data: 0.0005  max mem: 69511
Epoch: [167] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.001594  min_lr: 0.001594  loss: 2.7358 (3.0097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6445 (nan)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.6756 (0.6756)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 7.9256  data: 7.5138  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8309 (0.7949)  acc1: 84.8000 (85.1273)  acc5: 97.6000 (97.6000)  time: 1.0817  data: 0.6834  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9214 (0.9223)  acc1: 80.0000 (82.2286)  acc5: 96.0000 (96.1524)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0055 (0.9357)  acc1: 79.6000 (81.7440)  acc5: 96.0000 (96.0160)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7038 s / it)
* Acc@1 82.018 Acc@5 96.280 loss 0.926
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.02%
Epoch: [168]  [   0/1251]  eta: 1:23:03  lr: 0.001594  min_lr: 0.001594  loss: 2.8768 (2.8768)  weight_decay: 0.0500 (0.0500)  time: 3.9839  data: 3.1827  max mem: 69511
Epoch: [168]  [ 200/1251]  eta: 0:14:00  lr: 0.001591  min_lr: 0.001591  loss: 3.3432 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7724 (0.7327)  time: 0.7847  data: 0.0010  max mem: 69511
Epoch: [168]  [ 400/1251]  eta: 0:11:14  lr: 0.001587  min_lr: 0.001587  loss: 2.9975 (3.0138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7535 (0.7626)  time: 0.7825  data: 0.0007  max mem: 69511
Epoch: [168]  [ 600/1251]  eta: 0:08:34  lr: 0.001584  min_lr: 0.001584  loss: 3.2870 (3.0176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.7462)  time: 0.7849  data: 0.0010  max mem: 69511
Epoch: [168]  [ 800/1251]  eta: 0:05:56  lr: 0.001581  min_lr: 0.001581  loss: 3.1370 (3.0237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7176 (0.7452)  time: 0.7926  data: 0.0009  max mem: 69511
Epoch: [168]  [1000/1251]  eta: 0:03:18  lr: 0.001578  min_lr: 0.001578  loss: 3.1760 (3.0129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (0.7386)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [168]  [1200/1251]  eta: 0:00:40  lr: 0.001575  min_lr: 0.001575  loss: 2.7356 (3.0159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7626 (0.7397)  time: 0.7859  data: 0.0010  max mem: 69511
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001574  min_lr: 0.001574  loss: 3.2038 (3.0180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6698 (0.7373)  time: 0.6679  data: 0.0006  max mem: 69511
Epoch: [168] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.001574  min_lr: 0.001574  loss: 3.2038 (3.0128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6698 (0.7373)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.7061 (0.7061)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 7.0870  data: 6.6650  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8773 (0.8522)  acc1: 85.6000 (85.4909)  acc5: 98.0000 (97.5636)  time: 1.0053  data: 0.6062  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0088 (0.9953)  acc1: 81.6000 (81.9429)  acc5: 96.0000 (96.3238)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0831 (1.0019)  acc1: 79.6000 (81.4720)  acc5: 95.6000 (96.3200)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6702 s / it)
* Acc@1 81.758 Acc@5 96.290 loss 0.996
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 82.02%
Epoch: [169]  [   0/1251]  eta: 1:38:46  lr: 0.001574  min_lr: 0.001574  loss: 3.2479 (3.2479)  weight_decay: 0.0500 (0.0500)  time: 4.7376  data: 3.9576  max mem: 69511
Epoch: [169]  [ 200/1251]  eta: 0:14:07  lr: 0.001571  min_lr: 0.001571  loss: 3.1044 (3.0229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.7257)  time: 0.7852  data: 0.0008  max mem: 69511
Epoch: [169]  [ 400/1251]  eta: 0:11:18  lr: 0.001568  min_lr: 0.001568  loss: 3.1976 (3.0214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7642 (0.7232)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [169]  [ 600/1251]  eta: 0:08:36  lr: 0.001565  min_lr: 0.001565  loss: 3.1028 (3.0167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7790 (0.7354)  time: 0.7856  data: 0.0007  max mem: 69511
Epoch: [169]  [ 800/1251]  eta: 0:05:57  lr: 0.001562  min_lr: 0.001562  loss: 2.6537 (3.0196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7711 (0.7455)  time: 0.7882  data: 0.0006  max mem: 69511
Epoch: [169]  [1000/1251]  eta: 0:03:18  lr: 0.001558  min_lr: 0.001558  loss: 2.9964 (3.0248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8051 (0.7417)  time: 0.7857  data: 0.0008  max mem: 69511
Epoch: [169]  [1200/1251]  eta: 0:00:40  lr: 0.001555  min_lr: 0.001555  loss: 2.9708 (3.0140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7209 (0.7389)  time: 0.7883  data: 0.0007  max mem: 69511
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001555  min_lr: 0.001555  loss: 3.1884 (3.0174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7386)  time: 0.6675  data: 0.0007  max mem: 69511
Epoch: [169] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.001555  min_lr: 0.001555  loss: 3.1884 (3.0112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7386)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7921 (0.7921)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.5861  data: 5.1775  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9068 (0.9256)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (97.7818)  time: 1.0047  data: 0.6069  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0698 (1.0638)  acc1: 80.0000 (81.4476)  acc5: 96.4000 (96.4571)  time: 0.4772  data: 0.0750  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1377 (1.0705)  acc1: 79.6000 (81.0560)  acc5: 96.0000 (96.3520)  time: 0.4023  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6736 s / it)
* Acc@1 81.916 Acc@5 96.246 loss 1.061
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 82.02%
Epoch: [170]  [   0/1251]  eta: 1:38:19  lr: 0.001555  min_lr: 0.001555  loss: 3.5847 (3.5847)  weight_decay: 0.0500 (0.0500)  time: 4.7156  data: 3.4290  max mem: 69511
Epoch: [170]  [ 200/1251]  eta: 0:14:10  lr: 0.001551  min_lr: 0.001551  loss: 3.0579 (2.9392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7672 (0.7601)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [170]  [ 400/1251]  eta: 0:11:18  lr: 0.001548  min_lr: 0.001548  loss: 3.2316 (2.9534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (0.7724)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [170]  [ 600/1251]  eta: 0:08:37  lr: 0.001545  min_lr: 0.001545  loss: 3.0863 (2.9610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.7589)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [170]  [ 800/1251]  eta: 0:05:57  lr: 0.001542  min_lr: 0.001542  loss: 3.1680 (2.9733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7776 (0.7527)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [170]  [1000/1251]  eta: 0:03:18  lr: 0.001539  min_lr: 0.001539  loss: 3.1560 (2.9875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6879 (0.7508)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [170]  [1200/1251]  eta: 0:00:40  lr: 0.001536  min_lr: 0.001536  loss: 3.1395 (2.9932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8502 (0.7516)  time: 0.7982  data: 0.0005  max mem: 69511
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001535  min_lr: 0.001535  loss: 3.2127 (2.9975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7459 (0.7521)  time: 0.6701  data: 0.0007  max mem: 69511
Epoch: [170] Total time: 0:16:28 (0.7898 s / it)
Averaged stats: lr: 0.001535  min_lr: 0.001535  loss: 3.2127 (3.0025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7459 (0.7521)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.7271 (0.7271)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 6.1711  data: 5.7358  max mem: 69511
Test:  [10/25]  eta: 0:00:13  loss: 0.9346 (0.9023)  acc1: 84.8000 (85.6727)  acc5: 98.0000 (97.5636)  time: 0.9224  data: 0.5219  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0493 (1.0427)  acc1: 80.4000 (82.0381)  acc5: 95.2000 (96.0571)  time: 0.3974  data: 0.0003  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1445 (1.0522)  acc1: 80.0000 (81.7440)  acc5: 95.2000 (96.0800)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:15 (0.6351 s / it)
* Acc@1 82.092 Acc@5 96.226 loss 1.045
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.09%
Epoch: [171]  [   0/1251]  eta: 1:18:32  lr: 0.001535  min_lr: 0.001535  loss: 2.7627 (2.7627)  weight_decay: 0.0500 (0.0500)  time: 3.7669  data: 2.9818  max mem: 69511
Epoch: [171]  [ 200/1251]  eta: 0:14:01  lr: 0.001532  min_lr: 0.001532  loss: 3.1837 (2.9912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.7370)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [171]  [ 400/1251]  eta: 0:11:16  lr: 0.001529  min_lr: 0.001529  loss: 2.9618 (2.9587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8113 (0.7448)  time: 0.7907  data: 0.0004  max mem: 69511
Epoch: [171]  [ 600/1251]  eta: 0:08:36  lr: 0.001526  min_lr: 0.001526  loss: 2.9091 (2.9643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7481)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [171]  [ 800/1251]  eta: 0:05:56  lr: 0.001523  min_lr: 0.001523  loss: 2.9635 (2.9799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.7571)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [171]  [1000/1251]  eta: 0:03:18  lr: 0.001520  min_lr: 0.001520  loss: 3.0770 (2.9877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (0.7603)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [171]  [1200/1251]  eta: 0:00:40  lr: 0.001516  min_lr: 0.001516  loss: 3.0230 (2.9855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7164 (0.7564)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001516  min_lr: 0.001516  loss: 2.9760 (2.9864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7044 (0.7554)  time: 0.6673  data: 0.0005  max mem: 69511
Epoch: [171] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.001516  min_lr: 0.001516  loss: 2.9760 (2.9930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7044 (0.7554)
Test:  [ 0/25]  eta: 0:02:56  loss: 0.5854 (0.5854)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 7.0701  data: 6.6335  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8353 (0.7610)  acc1: 84.8000 (85.1636)  acc5: 98.0000 (97.7455)  time: 1.0037  data: 0.6033  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8625 (0.9076)  acc1: 80.4000 (81.9048)  acc5: 96.8000 (96.4381)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0078 (0.9195)  acc1: 80.0000 (81.4240)  acc5: 95.2000 (96.2880)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6702 s / it)
* Acc@1 82.226 Acc@5 96.294 loss 0.907
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.23%
Epoch: [172]  [   0/1251]  eta: 1:23:28  lr: 0.001516  min_lr: 0.001516  loss: 3.1834 (3.1834)  weight_decay: 0.0500 (0.0500)  time: 4.0033  data: 3.2140  max mem: 69511
Epoch: [172]  [ 200/1251]  eta: 0:14:03  lr: 0.001513  min_lr: 0.001513  loss: 3.1464 (2.9714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7721 (0.7434)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [172]  [ 400/1251]  eta: 0:11:17  lr: 0.001509  min_lr: 0.001509  loss: 3.0308 (2.9684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6924 (0.7528)  time: 0.7962  data: 0.0005  max mem: 69511
Epoch: [172]  [ 600/1251]  eta: 0:08:36  lr: 0.001506  min_lr: 0.001506  loss: 3.1539 (2.9665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7135 (0.7554)  time: 0.7918  data: 0.0004  max mem: 69511
Epoch: [172]  [ 800/1251]  eta: 0:05:56  lr: 0.001503  min_lr: 0.001503  loss: 3.0597 (2.9882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7099 (0.7540)  time: 0.7879  data: 0.0005  max mem: 69511
Epoch: [172]  [1000/1251]  eta: 0:03:18  lr: 0.001500  min_lr: 0.001500  loss: 2.6989 (2.9757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (0.7457)  time: 0.7852  data: 0.0007  max mem: 69511
Epoch: [172]  [1200/1251]  eta: 0:00:40  lr: 0.001497  min_lr: 0.001497  loss: 2.9843 (2.9812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.7482)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001496  min_lr: 0.001496  loss: 3.0588 (2.9804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.7492)  time: 0.6673  data: 0.0006  max mem: 69511
Epoch: [172] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.001496  min_lr: 0.001496  loss: 3.0588 (2.9979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.7492)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6898 (0.6898)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 7.4620  data: 7.0183  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8345 (0.8398)  acc1: 86.8000 (85.6000)  acc5: 98.0000 (97.7455)  time: 1.0391  data: 0.6384  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9598 (0.9655)  acc1: 80.0000 (82.2476)  acc5: 96.0000 (96.3619)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0426 (0.9746)  acc1: 79.6000 (81.7760)  acc5: 95.6000 (96.3520)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6869 s / it)
* Acc@1 82.088 Acc@5 96.250 loss 0.961
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.23%
Epoch: [173]  [   0/1251]  eta: 1:42:00  lr: 0.001496  min_lr: 0.001496  loss: 2.8486 (2.8486)  weight_decay: 0.0500 (0.0500)  time: 4.8923  data: 2.1278  max mem: 69511
Epoch: [173]  [ 200/1251]  eta: 0:14:09  lr: 0.001493  min_lr: 0.001493  loss: 2.9384 (2.9874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6690 (0.7376)  time: 0.8035  data: 0.0004  max mem: 69511
Epoch: [173]  [ 400/1251]  eta: 0:11:19  lr: 0.001490  min_lr: 0.001490  loss: 3.0531 (2.9679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.7265)  time: 0.7853  data: 0.0006  max mem: 69511
Epoch: [173]  [ 600/1251]  eta: 0:08:36  lr: 0.001487  min_lr: 0.001487  loss: 2.9795 (2.9641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.7331)  time: 0.7881  data: 0.0004  max mem: 69511
Epoch: [173]  [ 800/1251]  eta: 0:05:57  lr: 0.001484  min_lr: 0.001484  loss: 3.0214 (2.9730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7898 (0.7402)  time: 0.7888  data: 0.0005  max mem: 69511
Epoch: [173]  [1000/1251]  eta: 0:03:18  lr: 0.001481  min_lr: 0.001481  loss: 2.8513 (2.9743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7796 (0.7358)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [173]  [1200/1251]  eta: 0:00:40  lr: 0.001478  min_lr: 0.001478  loss: 3.1402 (2.9794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7607 (0.7356)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001477  min_lr: 0.001477  loss: 3.0145 (2.9778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7018 (0.7353)  time: 0.6673  data: 0.0006  max mem: 69511
Epoch: [173] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.001477  min_lr: 0.001477  loss: 3.0145 (2.9821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7018 (0.7353)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.6641 (0.6641)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.3870  data: 6.9635  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8533 (0.8461)  acc1: 86.0000 (85.4182)  acc5: 97.6000 (97.5273)  time: 1.0324  data: 0.6333  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0031 (0.9849)  acc1: 80.0000 (82.3048)  acc5: 96.4000 (96.2476)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0959 (0.9960)  acc1: 80.0000 (81.9680)  acc5: 96.0000 (96.2240)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6838 s / it)
* Acc@1 82.140 Acc@5 96.204 loss 0.985
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.23%
Epoch: [174]  [   0/1251]  eta: 1:38:14  lr: 0.001477  min_lr: 0.001477  loss: 2.2511 (2.2511)  weight_decay: 0.0500 (0.0500)  time: 4.7115  data: 3.9142  max mem: 69511
Epoch: [174]  [ 200/1251]  eta: 0:14:08  lr: 0.001474  min_lr: 0.001474  loss: 3.0875 (2.9016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7169 (0.7476)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [174]  [ 400/1251]  eta: 0:11:17  lr: 0.001471  min_lr: 0.001471  loss: 3.0219 (2.9475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.7252)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [174]  [ 600/1251]  eta: 0:08:36  lr: 0.001468  min_lr: 0.001468  loss: 2.9112 (2.9580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7158 (0.7474)  time: 0.7967  data: 0.0004  max mem: 69511
Epoch: [174]  [ 800/1251]  eta: 0:05:57  lr: 0.001464  min_lr: 0.001464  loss: 2.9700 (2.9617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6909 (0.7481)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [174]  [1000/1251]  eta: 0:03:18  lr: 0.001461  min_lr: 0.001461  loss: 2.9643 (2.9549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.7526)  time: 0.7913  data: 0.0005  max mem: 69511
Epoch: [174]  [1200/1251]  eta: 0:00:40  lr: 0.001458  min_lr: 0.001458  loss: 2.8492 (2.9522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6626 (0.7467)  time: 0.7882  data: 0.0004  max mem: 69511
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001458  min_lr: 0.001458  loss: 2.9484 (2.9555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.7466)  time: 0.6674  data: 0.0007  max mem: 69511
Epoch: [174] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.001458  min_lr: 0.001458  loss: 2.9484 (2.9800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.7466)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7203 (0.7203)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.6999  data: 7.2678  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8824 (0.8564)  acc1: 84.4000 (85.2000)  acc5: 97.6000 (97.5636)  time: 1.0607  data: 0.6610  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9551 (0.9803)  acc1: 81.2000 (82.6476)  acc5: 96.0000 (96.4381)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0553 (0.9905)  acc1: 80.4000 (82.2240)  acc5: 96.0000 (96.4000)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6947 s / it)
* Acc@1 82.382 Acc@5 96.316 loss 0.983
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.38%
Epoch: [175]  [   0/1251]  eta: 1:19:49  lr: 0.001457  min_lr: 0.001457  loss: 3.1100 (3.1100)  weight_decay: 0.0500 (0.0500)  time: 3.8289  data: 3.0393  max mem: 69511
Epoch: [175]  [ 200/1251]  eta: 0:14:01  lr: 0.001454  min_lr: 0.001454  loss: 2.9535 (2.9680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7429 (0.7457)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [175]  [ 400/1251]  eta: 0:11:16  lr: 0.001451  min_lr: 0.001451  loss: 3.0526 (2.9463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6880 (0.7406)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [175]  [ 600/1251]  eta: 0:08:36  lr: 0.001448  min_lr: 0.001448  loss: 3.0099 (2.9488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7046 (0.7315)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [175]  [ 800/1251]  eta: 0:05:56  lr: 0.001445  min_lr: 0.001445  loss: 3.1700 (2.9695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (0.7369)  time: 0.7864  data: 0.0006  max mem: 69511
Epoch: [175]  [1000/1251]  eta: 0:03:18  lr: 0.001442  min_lr: 0.001442  loss: 3.1380 (2.9763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7295 (0.7353)  time: 0.7995  data: 0.0004  max mem: 69511
Epoch: [175]  [1200/1251]  eta: 0:00:40  lr: 0.001439  min_lr: 0.001439  loss: 3.0036 (2.9765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7419 (0.7390)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001438  min_lr: 0.001438  loss: 3.1886 (2.9786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7388)  time: 0.6677  data: 0.0006  max mem: 69511
Epoch: [175] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.001438  min_lr: 0.001438  loss: 3.1886 (2.9832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7388)
Test:  [ 0/25]  eta: 0:03:17  loss: 0.8177 (0.8177)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.9116  data: 7.4890  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 1.0125 (0.9912)  acc1: 85.2000 (85.5273)  acc5: 97.6000 (97.7818)  time: 1.0798  data: 0.6811  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.1150 (1.1293)  acc1: 80.8000 (82.4381)  acc5: 95.6000 (96.2095)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1988 (1.1339)  acc1: 80.0000 (82.0160)  acc5: 95.6000 (96.1280)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7020 s / it)
* Acc@1 82.298 Acc@5 96.342 loss 1.124
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.38%
Epoch: [176]  [   0/1251]  eta: 1:29:28  lr: 0.001438  min_lr: 0.001438  loss: 3.4078 (3.4078)  weight_decay: 0.0500 (0.0500)  time: 4.2916  data: 3.0154  max mem: 69511
Epoch: [176]  [ 200/1251]  eta: 0:14:04  lr: 0.001435  min_lr: 0.001435  loss: 2.9969 (2.9921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.7420)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [176]  [ 400/1251]  eta: 0:11:18  lr: 0.001432  min_lr: 0.001432  loss: 3.0545 (2.9736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.7532)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [176]  [ 600/1251]  eta: 0:08:36  lr: 0.001429  min_lr: 0.001429  loss: 2.9410 (2.9760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7882 (0.7773)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [176]  [ 800/1251]  eta: 0:05:57  lr: 0.001426  min_lr: 0.001426  loss: 3.0901 (2.9709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6662 (0.7645)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [176]  [1000/1251]  eta: 0:03:18  lr: 0.001423  min_lr: 0.001423  loss: 3.0803 (2.9713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8588 (0.7687)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [176]  [1200/1251]  eta: 0:00:40  lr: 0.001420  min_lr: 0.001420  loss: 3.2108 (2.9868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.7670)  time: 0.7908  data: 0.0006  max mem: 69511
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001419  min_lr: 0.001419  loss: 3.2869 (2.9933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7670)  time: 0.6641  data: 0.0007  max mem: 69511
Epoch: [176] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.001419  min_lr: 0.001419  loss: 3.2869 (2.9734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7670)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.7829 (0.7829)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 7.3464  data: 6.9394  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9190 (0.9381)  acc1: 84.8000 (85.1636)  acc5: 97.6000 (97.7091)  time: 1.0280  data: 0.6312  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0613 (1.0707)  acc1: 80.8000 (82.2667)  acc5: 96.0000 (96.3810)  time: 0.3949  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1915 (1.0806)  acc1: 80.0000 (81.6800)  acc5: 95.6000 (96.2080)  time: 0.3937  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6834 s / it)
* Acc@1 82.238 Acc@5 96.318 loss 1.069
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.38%
Epoch: [177]  [   0/1251]  eta: 1:40:32  lr: 0.001419  min_lr: 0.001419  loss: 2.4702 (2.4702)  weight_decay: 0.0500 (0.0500)  time: 4.8224  data: 3.2742  max mem: 69511
Epoch: [177]  [ 200/1251]  eta: 0:14:09  lr: 0.001416  min_lr: 0.001416  loss: 3.1460 (2.9793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (0.7597)  time: 0.7969  data: 0.0005  max mem: 69511
Epoch: [177]  [ 400/1251]  eta: 0:11:18  lr: 0.001413  min_lr: 0.001413  loss: 3.1098 (2.9708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.7606)  time: 0.7935  data: 0.0006  max mem: 69511
Epoch: [177]  [ 600/1251]  eta: 0:08:36  lr: 0.001410  min_lr: 0.001410  loss: 3.2370 (2.9732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7366 (0.7595)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [177]  [ 800/1251]  eta: 0:05:57  lr: 0.001406  min_lr: 0.001406  loss: 3.0545 (2.9825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7623 (0.7596)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [177]  [1000/1251]  eta: 0:03:18  lr: 0.001403  min_lr: 0.001403  loss: 2.8639 (2.9706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7953 (0.7611)  time: 0.7821  data: 0.0008  max mem: 69511
Epoch: [177]  [1200/1251]  eta: 0:00:40  lr: 0.001400  min_lr: 0.001400  loss: 2.9859 (2.9715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (inf)  time: 0.7827  data: 0.0007  max mem: 69511
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 3.2868 (2.9753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8194 (inf)  time: 0.6701  data: 0.0006  max mem: 69511
Epoch: [177] Total time: 0:16:26 (0.7882 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 3.2868 (2.9714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8194 (inf)
Test:  [ 0/25]  eta: 0:02:52  loss: 0.7295 (0.7295)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.8812  data: 6.4501  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.9837 (0.9236)  acc1: 84.0000 (85.0545)  acc5: 97.6000 (97.7818)  time: 0.9859  data: 0.5866  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0535 (1.0637)  acc1: 80.0000 (81.8476)  acc5: 96.0000 (96.3810)  time: 0.3964  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1862 (1.0692)  acc1: 80.0000 (81.6000)  acc5: 95.6000 (96.2720)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6624 s / it)
* Acc@1 82.156 Acc@5 96.346 loss 1.056
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.38%
Epoch: [178]  [   0/1251]  eta: 1:42:45  lr: 0.001400  min_lr: 0.001400  loss: 3.4403 (3.4403)  weight_decay: 0.0500 (0.0500)  time: 4.9282  data: 4.1389  max mem: 69511
Epoch: [178]  [ 200/1251]  eta: 0:14:09  lr: 0.001396  min_lr: 0.001396  loss: 3.1776 (2.9471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7341 (0.7552)  time: 0.7899  data: 0.0007  max mem: 69511
Epoch: [178]  [ 400/1251]  eta: 0:11:16  lr: 0.001393  min_lr: 0.001393  loss: 3.1249 (2.9654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.7637)  time: 0.7825  data: 0.0005  max mem: 69511
Epoch: [178]  [ 600/1251]  eta: 0:08:36  lr: 0.001390  min_lr: 0.001390  loss: 3.2178 (2.9738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7900 (0.7546)  time: 0.7904  data: 0.0006  max mem: 69511
Epoch: [178]  [ 800/1251]  eta: 0:05:56  lr: 0.001387  min_lr: 0.001387  loss: 2.7622 (2.9720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6538 (0.7512)  time: 0.7855  data: 0.0006  max mem: 69511
Epoch: [178]  [1000/1251]  eta: 0:03:18  lr: 0.001384  min_lr: 0.001384  loss: 3.0001 (2.9670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8383 (0.7626)  time: 0.7829  data: 0.0006  max mem: 69511
Epoch: [178]  [1200/1251]  eta: 0:00:40  lr: 0.001381  min_lr: 0.001381  loss: 3.0009 (2.9666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6377 (0.7564)  time: 0.7849  data: 0.0005  max mem: 69511
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001380  min_lr: 0.001380  loss: 3.0401 (2.9677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7583 (0.7581)  time: 0.6640  data: 0.0007  max mem: 69511
Epoch: [178] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.001380  min_lr: 0.001380  loss: 3.0401 (2.9693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7583 (0.7581)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.7012 (0.7012)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 7.4174  data: 6.9910  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8847 (0.8635)  acc1: 85.6000 (85.7455)  acc5: 97.6000 (97.4909)  time: 1.0321  data: 0.6358  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9637 (1.0015)  acc1: 81.2000 (82.5905)  acc5: 96.0000 (96.1524)  time: 0.3935  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0219 (1.0071)  acc1: 80.8000 (82.2560)  acc5: 96.0000 (96.1440)  time: 0.3934  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6845 s / it)
* Acc@1 82.316 Acc@5 96.350 loss 0.992
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.38%
Epoch: [179]  [   0/1251]  eta: 1:39:39  lr: 0.001380  min_lr: 0.001380  loss: 3.2703 (3.2703)  weight_decay: 0.0500 (0.0500)  time: 4.7798  data: 3.9924  max mem: 69511
Epoch: [179]  [ 200/1251]  eta: 0:14:03  lr: 0.001377  min_lr: 0.001377  loss: 2.9950 (2.9586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7211 (0.7501)  time: 0.7832  data: 0.0005  max mem: 69511
Epoch: [179]  [ 400/1251]  eta: 0:11:15  lr: 0.001374  min_lr: 0.001374  loss: 3.1721 (2.9555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.7615)  time: 0.7823  data: 0.0006  max mem: 69511
Epoch: [179]  [ 600/1251]  eta: 0:08:35  lr: 0.001371  min_lr: 0.001371  loss: 3.1897 (2.9541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7531 (0.7751)  time: 0.7848  data: 0.0005  max mem: 69511
Epoch: [179]  [ 800/1251]  eta: 0:05:56  lr: 0.001368  min_lr: 0.001368  loss: 2.8490 (2.9638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8648 (0.7864)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [179]  [1000/1251]  eta: 0:03:18  lr: 0.001365  min_lr: 0.001365  loss: 3.0411 (2.9775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7191 (0.7778)  time: 0.7929  data: 0.0005  max mem: 69511
Epoch: [179]  [1200/1251]  eta: 0:00:40  lr: 0.001362  min_lr: 0.001362  loss: 3.1387 (2.9826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7435 (0.7788)  time: 0.7818  data: 0.0006  max mem: 69511
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 3.2464 (2.9834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7192 (0.7795)  time: 0.6641  data: 0.0005  max mem: 69511
Epoch: [179] Total time: 0:16:24 (0.7866 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 3.2464 (2.9676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7192 (0.7795)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.7381 (0.7381)  acc1: 92.4000 (92.4000)  acc5: 98.4000 (98.4000)  time: 7.7790  data: 7.3575  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8753 (0.8852)  acc1: 85.6000 (85.7091)  acc5: 98.0000 (97.7091)  time: 1.0654  data: 0.6691  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0234 (1.0173)  acc1: 80.4000 (82.5714)  acc5: 96.0000 (96.4191)  time: 0.3942  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0267 (1.0254)  acc1: 80.4000 (82.1600)  acc5: 96.0000 (96.2880)  time: 0.3942  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7005 s / it)
* Acc@1 82.378 Acc@5 96.456 loss 1.013
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.38%
Epoch: [180]  [   0/1251]  eta: 1:34:29  lr: 0.001361  min_lr: 0.001361  loss: 3.1571 (3.1571)  weight_decay: 0.0500 (0.0500)  time: 4.5323  data: 2.1653  max mem: 69511
Epoch: [180]  [ 200/1251]  eta: 0:14:05  lr: 0.001358  min_lr: 0.001358  loss: 3.1752 (2.9402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (0.8006)  time: 0.7936  data: 0.0004  max mem: 69511
Epoch: [180]  [ 400/1251]  eta: 0:11:18  lr: 0.001355  min_lr: 0.001355  loss: 3.1760 (2.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.7797)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [180]  [ 600/1251]  eta: 0:08:36  lr: 0.001352  min_lr: 0.001352  loss: 2.8076 (2.9665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (0.7857)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [180]  [ 800/1251]  eta: 0:05:57  lr: 0.001349  min_lr: 0.001349  loss: 2.9782 (2.9581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8300 (0.7850)  time: 0.7918  data: 0.0005  max mem: 69511
Epoch: [180]  [1000/1251]  eta: 0:03:18  lr: 0.001346  min_lr: 0.001346  loss: 3.1509 (2.9609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8053 (0.7819)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [180]  [1200/1251]  eta: 0:00:40  lr: 0.001343  min_lr: 0.001343  loss: 3.2251 (2.9598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7334 (0.7828)  time: 0.7830  data: 0.0005  max mem: 69511
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001342  min_lr: 0.001342  loss: 3.1717 (2.9599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7820)  time: 0.6649  data: 0.0007  max mem: 69511
Epoch: [180] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.001342  min_lr: 0.001342  loss: 3.1717 (2.9597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7820)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.7400 (0.7400)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 7.2558  data: 6.8244  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8169 (0.8480)  acc1: 86.8000 (85.3818)  acc5: 97.6000 (97.7818)  time: 1.0176  data: 0.6207  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0033 (0.9946)  acc1: 80.8000 (82.4571)  acc5: 96.0000 (96.5905)  time: 0.3937  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1049 (1.0048)  acc1: 80.8000 (81.7120)  acc5: 96.0000 (96.4480)  time: 0.3936  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6772 s / it)
* Acc@1 82.350 Acc@5 96.338 loss 0.988
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.38%
Epoch: [181]  [   0/1251]  eta: 1:39:34  lr: 0.001342  min_lr: 0.001342  loss: 3.3891 (3.3891)  weight_decay: 0.0500 (0.0500)  time: 4.7755  data: 3.9663  max mem: 69511
Epoch: [181]  [ 200/1251]  eta: 0:14:10  lr: 0.001339  min_lr: 0.001339  loss: 3.0793 (2.9506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7341)  time: 0.7845  data: 0.0004  max mem: 69511
Epoch: [181]  [ 400/1251]  eta: 0:11:18  lr: 0.001336  min_lr: 0.001336  loss: 3.1036 (2.9681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6796 (0.7423)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [181]  [ 600/1251]  eta: 0:08:36  lr: 0.001333  min_lr: 0.001333  loss: 2.9887 (2.9588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7681 (0.7574)  time: 0.7912  data: 0.0004  max mem: 69511
Epoch: [181]  [ 800/1251]  eta: 0:05:57  lr: 0.001330  min_lr: 0.001330  loss: 3.1825 (2.9621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6890 (0.7691)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [181]  [1000/1251]  eta: 0:03:18  lr: 0.001327  min_lr: 0.001327  loss: 2.9037 (2.9609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8183 (0.7672)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [181]  [1200/1251]  eta: 0:00:40  lr: 0.001324  min_lr: 0.001324  loss: 3.0013 (2.9610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7395 (0.7673)  time: 0.7858  data: 0.0006  max mem: 69511
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001323  min_lr: 0.001323  loss: 3.2081 (2.9628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.7687)  time: 0.6716  data: 0.0006  max mem: 69511
Epoch: [181] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.001323  min_lr: 0.001323  loss: 3.2081 (2.9602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.7687)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6899 (0.6899)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 7.5102  data: 7.0883  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8765 (0.8860)  acc1: 86.4000 (85.7455)  acc5: 98.0000 (97.8545)  time: 1.0434  data: 0.6447  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0453 (1.0241)  acc1: 81.6000 (82.4571)  acc5: 96.0000 (96.4571)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1038 (1.0407)  acc1: 79.6000 (81.7600)  acc5: 96.0000 (96.3360)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6876 s / it)
* Acc@1 82.460 Acc@5 96.490 loss 1.022
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.46%
Epoch: [182]  [   0/1251]  eta: 1:22:40  lr: 0.001323  min_lr: 0.001323  loss: 3.1062 (3.1062)  weight_decay: 0.0500 (0.0500)  time: 3.9654  data: 3.1666  max mem: 69511
Epoch: [182]  [ 200/1251]  eta: 0:14:03  lr: 0.001320  min_lr: 0.001320  loss: 2.9538 (2.9177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6987 (0.7440)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [182]  [ 400/1251]  eta: 0:11:16  lr: 0.001317  min_lr: 0.001317  loss: 3.0119 (2.9039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.7509)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [182]  [ 600/1251]  eta: 0:08:36  lr: 0.001314  min_lr: 0.001314  loss: 2.9066 (2.9109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7803 (0.7611)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [182]  [ 800/1251]  eta: 0:05:56  lr: 0.001311  min_lr: 0.001311  loss: 2.6863 (2.9280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.7657)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [182]  [1000/1251]  eta: 0:03:18  lr: 0.001308  min_lr: 0.001308  loss: 3.0842 (2.9424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8655 (0.7783)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [182]  [1200/1251]  eta: 0:00:40  lr: 0.001305  min_lr: 0.001305  loss: 2.8899 (2.9444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7294 (0.7719)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001304  min_lr: 0.001304  loss: 3.0382 (2.9446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.7707)  time: 0.6672  data: 0.0006  max mem: 69511
Epoch: [182] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.001304  min_lr: 0.001304  loss: 3.0382 (2.9493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.7707)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6387 (0.6387)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 7.4583  data: 7.0234  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8682 (0.8057)  acc1: 85.6000 (85.4182)  acc5: 97.2000 (97.7091)  time: 1.0387  data: 0.6388  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9298 (0.9306)  acc1: 82.4000 (82.7619)  acc5: 96.0000 (96.3238)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9989 (0.9378)  acc1: 81.2000 (82.2240)  acc5: 95.6000 (96.2720)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6858 s / it)
* Acc@1 82.522 Acc@5 96.454 loss 0.923
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.52%
Epoch: [183]  [   0/1251]  eta: 1:19:49  lr: 0.001304  min_lr: 0.001304  loss: 2.9675 (2.9675)  weight_decay: 0.0500 (0.0500)  time: 3.8285  data: 3.0335  max mem: 69511
Epoch: [183]  [ 200/1251]  eta: 0:14:01  lr: 0.001301  min_lr: 0.001301  loss: 3.1376 (2.9634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7728 (0.7880)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [183]  [ 400/1251]  eta: 0:11:16  lr: 0.001298  min_lr: 0.001298  loss: 2.9988 (2.9355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7535 (0.7687)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [183]  [ 600/1251]  eta: 0:08:36  lr: 0.001295  min_lr: 0.001295  loss: 2.8989 (2.9205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7594 (0.7742)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [183]  [ 800/1251]  eta: 0:05:56  lr: 0.001292  min_lr: 0.001292  loss: 3.1338 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7574 (0.7724)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [183]  [1000/1251]  eta: 0:03:18  lr: 0.001289  min_lr: 0.001289  loss: 2.9123 (2.9206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7628 (0.7838)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [183]  [1200/1251]  eta: 0:00:40  lr: 0.001286  min_lr: 0.001286  loss: 2.9619 (2.9229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (0.7830)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001285  min_lr: 0.001285  loss: 3.2114 (2.9238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7808)  time: 0.6679  data: 0.0005  max mem: 69511
Epoch: [183] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.001285  min_lr: 0.001285  loss: 3.2114 (2.9441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7808)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7515 (0.7515)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 7.5051  data: 7.0863  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9109 (0.8890)  acc1: 86.4000 (85.6727)  acc5: 98.0000 (97.8182)  time: 1.0428  data: 0.6445  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0283 (1.0143)  acc1: 80.8000 (82.7810)  acc5: 96.8000 (96.5905)  time: 0.3966  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1011 (1.0281)  acc1: 80.8000 (81.9840)  acc5: 96.0000 (96.5120)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6877 s / it)
* Acc@1 82.558 Acc@5 96.448 loss 1.017
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.56%
Epoch: [184]  [   0/1251]  eta: 1:37:59  lr: 0.001285  min_lr: 0.001285  loss: 2.2410 (2.2410)  weight_decay: 0.0500 (0.0500)  time: 4.7002  data: 3.9134  max mem: 69511
Epoch: [184]  [ 200/1251]  eta: 0:14:07  lr: 0.001282  min_lr: 0.001282  loss: 3.2796 (2.9745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7614 (0.7913)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [184]  [ 400/1251]  eta: 0:11:18  lr: 0.001279  min_lr: 0.001279  loss: 2.8585 (2.9600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.7847)  time: 0.7820  data: 0.0004  max mem: 69511
Epoch: [184]  [ 600/1251]  eta: 0:08:35  lr: 0.001276  min_lr: 0.001276  loss: 2.9155 (2.9604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7572 (0.7836)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [184]  [ 800/1251]  eta: 0:05:56  lr: 0.001273  min_lr: 0.001273  loss: 3.1860 (2.9486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.7846)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [184]  [1000/1251]  eta: 0:03:18  lr: 0.001270  min_lr: 0.001270  loss: 2.9416 (2.9429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.7903)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [184]  [1200/1251]  eta: 0:00:40  lr: 0.001267  min_lr: 0.001267  loss: 3.0169 (2.9446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7267 (nan)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001266  min_lr: 0.001266  loss: 3.2106 (2.9449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (nan)  time: 0.6672  data: 0.0006  max mem: 69511
Epoch: [184] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.001266  min_lr: 0.001266  loss: 3.2106 (2.9473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (nan)
Test:  [ 0/25]  eta: 0:03:17  loss: 0.6574 (0.6574)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 7.9133  data: 7.5004  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8255 (0.8473)  acc1: 86.8000 (86.0000)  acc5: 98.0000 (97.9636)  time: 1.0805  data: 0.6821  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9629 (0.9767)  acc1: 81.2000 (82.7810)  acc5: 96.4000 (96.5714)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0430 (0.9845)  acc1: 81.2000 (82.4640)  acc5: 96.0000 (96.3840)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7050 s / it)
* Acc@1 82.676 Acc@5 96.518 loss 0.977
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.68%
Epoch: [185]  [   0/1251]  eta: 1:23:35  lr: 0.001266  min_lr: 0.001266  loss: 2.3786 (2.3786)  weight_decay: 0.0500 (0.0500)  time: 4.0096  data: 3.2219  max mem: 69511
Epoch: [185]  [ 200/1251]  eta: 0:14:05  lr: 0.001263  min_lr: 0.001263  loss: 2.7759 (2.9076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7347 (0.7557)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [185]  [ 400/1251]  eta: 0:11:16  lr: 0.001260  min_lr: 0.001260  loss: 3.0611 (2.9287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8345 (0.7821)  time: 0.7862  data: 0.0006  max mem: 69511
Epoch: [185]  [ 600/1251]  eta: 0:08:36  lr: 0.001257  min_lr: 0.001257  loss: 3.0054 (2.9276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7830 (0.7822)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [185]  [ 800/1251]  eta: 0:05:57  lr: 0.001254  min_lr: 0.001254  loss: 2.9148 (2.9153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7599 (0.7829)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [185]  [1000/1251]  eta: 0:03:18  lr: 0.001251  min_lr: 0.001251  loss: 2.5551 (2.9120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7745 (0.7815)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [185]  [1200/1251]  eta: 0:00:40  lr: 0.001248  min_lr: 0.001248  loss: 3.0410 (2.9152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7810)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001247  min_lr: 0.001247  loss: 2.9986 (2.9146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7794)  time: 0.6687  data: 0.0009  max mem: 69511
Epoch: [185] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.001247  min_lr: 0.001247  loss: 2.9986 (2.9335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7794)
Test:  [ 0/25]  eta: 0:03:20  loss: 0.6336 (0.6336)  acc1: 90.4000 (90.4000)  acc5: 98.0000 (98.0000)  time: 8.0105  data: 7.5899  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8267 (0.7946)  acc1: 86.4000 (85.8909)  acc5: 97.6000 (97.7091)  time: 1.0869  data: 0.6903  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8961 (0.9142)  acc1: 82.4000 (83.1429)  acc5: 96.8000 (96.7238)  time: 0.3944  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9694 (0.9251)  acc1: 81.6000 (82.6880)  acc5: 96.0000 (96.5920)  time: 0.3944  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7099 s / it)
* Acc@1 82.848 Acc@5 96.540 loss 0.919
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.85%
Epoch: [186]  [   0/1251]  eta: 1:20:10  lr: 0.001247  min_lr: 0.001247  loss: 2.4470 (2.4470)  weight_decay: 0.0500 (0.0500)  time: 3.8452  data: 3.0566  max mem: 69511
Epoch: [186]  [ 200/1251]  eta: 0:13:58  lr: 0.001244  min_lr: 0.001244  loss: 2.7213 (2.8842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7768 (0.7874)  time: 0.7821  data: 0.0005  max mem: 69511
Epoch: [186]  [ 400/1251]  eta: 0:11:13  lr: 0.001241  min_lr: 0.001241  loss: 3.0453 (2.9229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (0.7898)  time: 0.7879  data: 0.0004  max mem: 69511
Epoch: [186]  [ 600/1251]  eta: 0:08:33  lr: 0.001238  min_lr: 0.001238  loss: 2.8384 (2.9264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7862 (0.7913)  time: 0.7816  data: 0.0005  max mem: 69511
Epoch: [186]  [ 800/1251]  eta: 0:05:55  lr: 0.001235  min_lr: 0.001235  loss: 3.1743 (2.9310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7219 (0.7870)  time: 0.7845  data: 0.0005  max mem: 69511
Epoch: [186]  [1000/1251]  eta: 0:03:17  lr: 0.001232  min_lr: 0.001232  loss: 2.9854 (2.9427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8105 (0.7891)  time: 0.7821  data: 0.0005  max mem: 69511
Epoch: [186]  [1200/1251]  eta: 0:00:40  lr: 0.001229  min_lr: 0.001229  loss: 2.9391 (2.9409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8621 (0.7928)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001229  min_lr: 0.001229  loss: 2.7484 (2.9380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8474 (0.7940)  time: 0.6675  data: 0.0006  max mem: 69511
Epoch: [186] Total time: 0:16:22 (0.7854 s / it)
Averaged stats: lr: 0.001229  min_lr: 0.001229  loss: 2.7484 (2.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8474 (0.7940)
Test:  [ 0/25]  eta: 0:03:17  loss: 0.5872 (0.5872)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 7.8821  data: 7.4524  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7757 (0.7491)  acc1: 86.8000 (86.3636)  acc5: 98.4000 (97.7818)  time: 1.0768  data: 0.6778  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8578 (0.8813)  acc1: 82.0000 (83.3143)  acc5: 96.4000 (96.5333)  time: 0.3963  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9896 (0.8926)  acc1: 82.0000 (82.8480)  acc5: 96.0000 (96.4640)  time: 0.3963  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7013 s / it)
* Acc@1 82.836 Acc@5 96.488 loss 0.882
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.85%
Epoch: [187]  [   0/1251]  eta: 1:40:45  lr: 0.001229  min_lr: 0.001229  loss: 3.0461 (3.0461)  weight_decay: 0.0500 (0.0500)  time: 4.8322  data: 2.2380  max mem: 69511
Epoch: [187]  [ 200/1251]  eta: 0:14:07  lr: 0.001226  min_lr: 0.001226  loss: 2.8751 (2.8684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7520 (0.7790)  time: 0.7899  data: 0.0004  max mem: 69511
Epoch: [187]  [ 400/1251]  eta: 0:11:18  lr: 0.001223  min_lr: 0.001223  loss: 2.9957 (2.9036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (0.8028)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [187]  [ 600/1251]  eta: 0:08:36  lr: 0.001220  min_lr: 0.001220  loss: 2.9356 (2.9214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7975 (0.8021)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [187]  [ 800/1251]  eta: 0:05:57  lr: 0.001217  min_lr: 0.001217  loss: 2.9649 (2.9118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (0.8025)  time: 0.7851  data: 0.0004  max mem: 69511
Epoch: [187]  [1000/1251]  eta: 0:03:18  lr: 0.001214  min_lr: 0.001214  loss: 3.0397 (2.9104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7565 (0.7948)  time: 0.7819  data: 0.0004  max mem: 69511
Epoch: [187]  [1200/1251]  eta: 0:00:40  lr: 0.001211  min_lr: 0.001211  loss: 3.1267 (2.9106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8339 (0.8066)  time: 0.7814  data: 0.0004  max mem: 69511
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001210  min_lr: 0.001210  loss: 3.1303 (2.9113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (0.8056)  time: 0.6640  data: 0.0007  max mem: 69511
Epoch: [187] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.001210  min_lr: 0.001210  loss: 3.1303 (2.9198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (0.8056)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.7313 (0.7313)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.0429  data: 4.6077  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8847 (0.8632)  acc1: 85.2000 (85.8182)  acc5: 97.6000 (97.8545)  time: 0.9789  data: 0.5803  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9892 (0.9994)  acc1: 81.6000 (82.7048)  acc5: 96.8000 (96.6476)  time: 0.4829  data: 0.0888  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0710 (1.0093)  acc1: 81.2000 (82.2720)  acc5: 96.4000 (96.5920)  time: 0.3934  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6556 s / it)
* Acc@1 82.796 Acc@5 96.484 loss 0.997
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.85%
Epoch: [188]  [   0/1251]  eta: 1:42:40  lr: 0.001210  min_lr: 0.001210  loss: 3.0909 (3.0909)  weight_decay: 0.0500 (0.0500)  time: 4.9249  data: 2.0623  max mem: 69511
Epoch: [188]  [ 200/1251]  eta: 0:14:07  lr: 0.001207  min_lr: 0.001207  loss: 2.9711 (2.8530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7449 (0.7727)  time: 0.7814  data: 0.0007  max mem: 69511
Epoch: [188]  [ 400/1251]  eta: 0:11:16  lr: 0.001204  min_lr: 0.001204  loss: 3.0246 (2.8858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7384 (0.7841)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [188]  [ 600/1251]  eta: 0:08:35  lr: 0.001201  min_lr: 0.001201  loss: 3.0387 (2.8947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8187 (0.7899)  time: 0.7879  data: 0.0007  max mem: 69511
Epoch: [188]  [ 800/1251]  eta: 0:05:56  lr: 0.001198  min_lr: 0.001198  loss: 2.9863 (2.9101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8588 (0.8137)  time: 0.7998  data: 0.0005  max mem: 69511
Epoch: [188]  [1000/1251]  eta: 0:03:18  lr: 0.001195  min_lr: 0.001195  loss: 2.8673 (2.9088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.8127)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [188]  [1200/1251]  eta: 0:00:40  lr: 0.001192  min_lr: 0.001192  loss: 2.9553 (2.9115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.8101)  time: 0.7817  data: 0.0004  max mem: 69511
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001191  min_lr: 0.001191  loss: 3.0323 (2.9117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.8088)  time: 0.6640  data: 0.0006  max mem: 69511
Epoch: [188] Total time: 0:16:24 (0.7872 s / it)
Averaged stats: lr: 0.001191  min_lr: 0.001191  loss: 3.0323 (2.9193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.8088)
Test:  [ 0/25]  eta: 0:02:58  loss: 0.6789 (0.6789)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 7.1259  data: 6.6856  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8853 (0.8507)  acc1: 85.6000 (85.8909)  acc5: 98.0000 (97.8545)  time: 1.0057  data: 0.6081  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9645 (0.9726)  acc1: 82.0000 (82.9905)  acc5: 96.4000 (96.5714)  time: 0.3936  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0430 (0.9829)  acc1: 81.2000 (82.4800)  acc5: 96.0000 (96.5920)  time: 0.3936  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6681 s / it)
* Acc@1 82.704 Acc@5 96.502 loss 0.972
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.85%
Epoch: [189]  [   0/1251]  eta: 1:35:30  lr: 0.001191  min_lr: 0.001191  loss: 3.1964 (3.1964)  weight_decay: 0.0500 (0.0500)  time: 4.5811  data: 3.7717  max mem: 69511
Epoch: [189]  [ 200/1251]  eta: 0:14:06  lr: 0.001188  min_lr: 0.001188  loss: 2.9058 (2.9837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7816 (0.7991)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [189]  [ 400/1251]  eta: 0:11:16  lr: 0.001185  min_lr: 0.001185  loss: 3.0027 (2.9298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8274 (0.8034)  time: 0.7848  data: 0.0004  max mem: 69511
Epoch: [189]  [ 600/1251]  eta: 0:08:36  lr: 0.001182  min_lr: 0.001182  loss: 3.1190 (2.9228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8016)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [189]  [ 800/1251]  eta: 0:05:56  lr: 0.001179  min_lr: 0.001179  loss: 3.0710 (2.9305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8067 (nan)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [189]  [1000/1251]  eta: 0:03:18  lr: 0.001176  min_lr: 0.001176  loss: 2.8993 (2.9268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8590 (nan)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [189]  [1200/1251]  eta: 0:00:40  lr: 0.001173  min_lr: 0.001173  loss: 3.1728 (2.9349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (nan)  time: 0.7910  data: 0.0004  max mem: 69511
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001173  min_lr: 0.001173  loss: 3.0974 (2.9350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (nan)  time: 0.6670  data: 0.0005  max mem: 69511
Epoch: [189] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.001173  min_lr: 0.001173  loss: 3.0974 (2.9293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (nan)
Test:  [ 0/25]  eta: 0:03:19  loss: 0.6840 (0.6840)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.9877  data: 7.5604  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8872 (0.8574)  acc1: 86.4000 (86.1818)  acc5: 98.0000 (97.9273)  time: 1.0867  data: 0.6876  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9951 (0.9902)  acc1: 81.2000 (82.4381)  acc5: 96.4000 (96.6667)  time: 0.3965  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0999 (0.9996)  acc1: 80.8000 (81.8400)  acc5: 95.6000 (96.5280)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7069 s / it)
* Acc@1 82.750 Acc@5 96.492 loss 0.986
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.85%
Epoch: [190]  [   0/1251]  eta: 1:32:40  lr: 0.001173  min_lr: 0.001173  loss: 2.9170 (2.9170)  weight_decay: 0.0500 (0.0500)  time: 4.4446  data: 2.1648  max mem: 69511
Epoch: [190]  [ 200/1251]  eta: 0:14:03  lr: 0.001170  min_lr: 0.001170  loss: 3.0071 (2.9515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7741 (0.8046)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [190]  [ 400/1251]  eta: 0:11:16  lr: 0.001167  min_lr: 0.001167  loss: 3.1351 (2.9425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8321 (0.8383)  time: 0.7944  data: 0.0004  max mem: 69511
Epoch: [190]  [ 600/1251]  eta: 0:08:36  lr: 0.001164  min_lr: 0.001164  loss: 3.1318 (2.9294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6859 (0.8249)  time: 0.7849  data: 0.0005  max mem: 69511
Epoch: [190]  [ 800/1251]  eta: 0:05:56  lr: 0.001161  min_lr: 0.001161  loss: 3.0632 (2.9282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7376 (0.8219)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [190]  [1000/1251]  eta: 0:03:18  lr: 0.001158  min_lr: 0.001158  loss: 3.1042 (2.9259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7570 (0.8178)  time: 0.7822  data: 0.0004  max mem: 69511
Epoch: [190]  [1200/1251]  eta: 0:00:40  lr: 0.001155  min_lr: 0.001155  loss: 3.0821 (2.9257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8101 (0.8181)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001154  min_lr: 0.001154  loss: 2.7139 (2.9216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.8163)  time: 0.6653  data: 0.0005  max mem: 69511
Epoch: [190] Total time: 0:16:25 (0.7874 s / it)
Averaged stats: lr: 0.001154  min_lr: 0.001154  loss: 2.7139 (2.9161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.8163)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.5969 (0.5969)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.3538  data: 6.9145  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7927 (0.7458)  acc1: 86.4000 (86.8000)  acc5: 98.4000 (97.8909)  time: 1.0281  data: 0.6289  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8814 (0.8836)  acc1: 82.0000 (83.1810)  acc5: 96.4000 (96.4571)  time: 0.3957  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9578 (0.8905)  acc1: 81.2000 (82.7680)  acc5: 96.0000 (96.4800)  time: 0.3958  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6866 s / it)
* Acc@1 83.004 Acc@5 96.598 loss 0.879
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.00%
Epoch: [191]  [   0/1251]  eta: 1:27:17  lr: 0.001154  min_lr: 0.001154  loss: 2.3314 (2.3314)  weight_decay: 0.0500 (0.0500)  time: 4.1870  data: 3.3967  max mem: 69511
Epoch: [191]  [ 200/1251]  eta: 0:14:02  lr: 0.001151  min_lr: 0.001151  loss: 2.8149 (2.8400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7659 (0.8088)  time: 0.7819  data: 0.0004  max mem: 69511
Epoch: [191]  [ 400/1251]  eta: 0:11:15  lr: 0.001148  min_lr: 0.001148  loss: 3.0655 (2.8970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7970 (0.8277)  time: 0.7836  data: 0.0005  max mem: 69511
Epoch: [191]  [ 600/1251]  eta: 0:08:34  lr: 0.001145  min_lr: 0.001145  loss: 2.9897 (2.9116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7574 (0.8228)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [191]  [ 800/1251]  eta: 0:05:56  lr: 0.001142  min_lr: 0.001142  loss: 2.8509 (2.9103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.8088)  time: 0.7905  data: 0.0003  max mem: 69511
Epoch: [191]  [1000/1251]  eta: 0:03:18  lr: 0.001139  min_lr: 0.001139  loss: 2.8003 (2.9159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7271 (0.8103)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [191]  [1200/1251]  eta: 0:00:40  lr: 0.001136  min_lr: 0.001136  loss: 3.0045 (2.9141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7115 (0.8028)  time: 0.7855  data: 0.0003  max mem: 69511
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001136  min_lr: 0.001136  loss: 3.0312 (2.9142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7542 (0.8043)  time: 0.6719  data: 0.0004  max mem: 69511
Epoch: [191] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.001136  min_lr: 0.001136  loss: 3.0312 (2.9047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7542 (0.8043)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.7264 (0.7264)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 7.2611  data: 6.8365  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8923 (0.8520)  acc1: 85.6000 (86.0727)  acc5: 97.6000 (97.6000)  time: 1.0206  data: 0.6218  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9483 (0.9806)  acc1: 80.8000 (82.8762)  acc5: 96.4000 (96.3810)  time: 0.3965  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0615 (0.9895)  acc1: 80.8000 (82.4960)  acc5: 96.0000 (96.1760)  time: 0.3965  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6773 s / it)
* Acc@1 82.882 Acc@5 96.508 loss 0.980
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.00%
Epoch: [192]  [   0/1251]  eta: 1:33:23  lr: 0.001136  min_lr: 0.001136  loss: 3.0261 (3.0261)  weight_decay: 0.0500 (0.0500)  time: 4.4791  data: 3.6908  max mem: 69511
Epoch: [192]  [ 200/1251]  eta: 0:14:08  lr: 0.001133  min_lr: 0.001133  loss: 2.9249 (2.8959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8421 (0.8198)  time: 0.7935  data: 0.0005  max mem: 69511
Epoch: [192]  [ 400/1251]  eta: 0:11:17  lr: 0.001130  min_lr: 0.001130  loss: 2.7564 (2.8946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (0.8138)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [192]  [ 600/1251]  eta: 0:08:36  lr: 0.001127  min_lr: 0.001127  loss: 2.7805 (2.8764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7634 (0.8299)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [192]  [ 800/1251]  eta: 0:05:57  lr: 0.001124  min_lr: 0.001124  loss: 3.1351 (2.8908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (0.8299)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [192]  [1000/1251]  eta: 0:03:18  lr: 0.001121  min_lr: 0.001121  loss: 3.1980 (2.8921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7597 (0.8224)  time: 0.7824  data: 0.0004  max mem: 69511
Epoch: [192]  [1200/1251]  eta: 0:00:40  lr: 0.001118  min_lr: 0.001118  loss: 2.8315 (2.9027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8810 (0.8231)  time: 0.7842  data: 0.0004  max mem: 69511
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001117  min_lr: 0.001117  loss: 2.8795 (2.8998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8223)  time: 0.6663  data: 0.0005  max mem: 69511
Epoch: [192] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.001117  min_lr: 0.001117  loss: 2.8795 (2.9085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8223)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.6592 (0.6592)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 7.2500  data: 6.8182  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7984 (0.7542)  acc1: 86.0000 (86.5091)  acc5: 98.0000 (97.8546)  time: 1.0253  data: 0.6267  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8885 (0.8893)  acc1: 82.0000 (83.2000)  acc5: 96.4000 (96.5714)  time: 0.3992  data: 0.0038  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9737 (0.8985)  acc1: 81.6000 (82.5280)  acc5: 96.0000 (96.5440)  time: 0.3957  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6799 s / it)
* Acc@1 82.946 Acc@5 96.540 loss 0.884
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.00%
Epoch: [193]  [   0/1251]  eta: 1:45:46  lr: 0.001117  min_lr: 0.001117  loss: 2.9020 (2.9020)  weight_decay: 0.0500 (0.0500)  time: 5.0731  data: 4.2746  max mem: 69511
Epoch: [193]  [ 200/1251]  eta: 0:14:09  lr: 0.001114  min_lr: 0.001114  loss: 2.7541 (2.9194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8398 (0.8591)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [193]  [ 400/1251]  eta: 0:11:17  lr: 0.001112  min_lr: 0.001112  loss: 2.9301 (2.9163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7785 (0.8252)  time: 0.7901  data: 0.0004  max mem: 69511
Epoch: [193]  [ 600/1251]  eta: 0:08:36  lr: 0.001109  min_lr: 0.001109  loss: 2.8574 (2.9226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8134 (0.8235)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [193]  [ 800/1251]  eta: 0:05:56  lr: 0.001106  min_lr: 0.001106  loss: 2.9891 (2.9282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8118 (0.8177)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [193]  [1000/1251]  eta: 0:03:18  lr: 0.001103  min_lr: 0.001103  loss: 2.6903 (2.9239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8264 (0.8236)  time: 0.7903  data: 0.0005  max mem: 69511
Epoch: [193]  [1200/1251]  eta: 0:00:40  lr: 0.001100  min_lr: 0.001100  loss: 3.0045 (2.9213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8260)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001099  min_lr: 0.001099  loss: 3.0403 (2.9226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7969 (0.8267)  time: 0.6673  data: 0.0007  max mem: 69511
Epoch: [193] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.001099  min_lr: 0.001099  loss: 3.0403 (2.9001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7969 (0.8267)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.7071 (0.7071)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.1967  data: 6.7795  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8838 (0.8538)  acc1: 87.2000 (86.5818)  acc5: 97.6000 (97.7091)  time: 1.0149  data: 0.6166  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9387 (0.9854)  acc1: 82.4000 (83.3714)  acc5: 96.0000 (96.4191)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0454 (0.9919)  acc1: 82.0000 (82.9120)  acc5: 95.6000 (96.3520)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6747 s / it)
* Acc@1 83.046 Acc@5 96.522 loss 0.988
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.05%
Epoch: [194]  [   0/1251]  eta: 1:21:36  lr: 0.001099  min_lr: 0.001099  loss: 2.8151 (2.8151)  weight_decay: 0.0500 (0.0500)  time: 3.9144  data: 3.1335  max mem: 69511
Epoch: [194]  [ 200/1251]  eta: 0:14:01  lr: 0.001096  min_lr: 0.001096  loss: 2.8213 (2.9132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8580 (0.8667)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [194]  [ 400/1251]  eta: 0:11:15  lr: 0.001093  min_lr: 0.001093  loss: 3.0870 (2.8819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8362 (0.8531)  time: 0.7932  data: 0.0005  max mem: 69511
Epoch: [194]  [ 600/1251]  eta: 0:08:35  lr: 0.001090  min_lr: 0.001090  loss: 2.9530 (2.8913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7102 (0.8300)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [194]  [ 800/1251]  eta: 0:05:56  lr: 0.001088  min_lr: 0.001088  loss: 3.0222 (2.8981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8740 (0.8396)  time: 0.7930  data: 0.0004  max mem: 69511
Epoch: [194]  [1000/1251]  eta: 0:03:18  lr: 0.001085  min_lr: 0.001085  loss: 2.9986 (2.8962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8263 (0.8392)  time: 0.7880  data: 0.0004  max mem: 69511
Epoch: [194]  [1200/1251]  eta: 0:00:40  lr: 0.001082  min_lr: 0.001082  loss: 2.9448 (2.8897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7524 (0.8387)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001081  min_lr: 0.001081  loss: 2.9650 (2.8904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7913 (0.8399)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [194] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.001081  min_lr: 0.001081  loss: 2.9650 (2.8920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7913 (0.8399)
Test:  [ 0/25]  eta: 0:03:19  loss: 0.6227 (0.6227)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.9932  data: 7.5809  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7873 (0.7861)  acc1: 86.8000 (86.7273)  acc5: 98.0000 (97.9636)  time: 1.0879  data: 0.6894  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8785 (0.9131)  acc1: 82.0000 (83.4667)  acc5: 96.4000 (96.6857)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0183 (0.9216)  acc1: 81.2000 (82.9600)  acc5: 96.0000 (96.5760)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7076 s / it)
* Acc@1 83.052 Acc@5 96.556 loss 0.917
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.05%
Epoch: [195]  [   0/1251]  eta: 1:33:04  lr: 0.001081  min_lr: 0.001081  loss: 3.3520 (3.3520)  weight_decay: 0.0500 (0.0500)  time: 4.4640  data: 3.6851  max mem: 69511
Epoch: [195]  [ 200/1251]  eta: 0:14:06  lr: 0.001078  min_lr: 0.001078  loss: 2.8768 (2.8594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7911 (0.8281)  time: 0.7929  data: 0.0004  max mem: 69511
Epoch: [195]  [ 400/1251]  eta: 0:11:18  lr: 0.001075  min_lr: 0.001075  loss: 2.8795 (2.8783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.8304)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [195]  [ 600/1251]  eta: 0:08:36  lr: 0.001072  min_lr: 0.001072  loss: 3.1622 (2.8990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8121 (0.8246)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [195]  [ 800/1251]  eta: 0:05:57  lr: 0.001069  min_lr: 0.001069  loss: 2.9108 (2.8909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.8160)  time: 0.7914  data: 0.0004  max mem: 69511
Epoch: [195]  [1000/1251]  eta: 0:03:18  lr: 0.001066  min_lr: 0.001066  loss: 3.1095 (2.8950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (0.8126)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [195]  [1200/1251]  eta: 0:00:40  lr: 0.001064  min_lr: 0.001064  loss: 2.7464 (2.8979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.8179)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001063  min_lr: 0.001063  loss: 3.1538 (2.9035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (0.8184)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [195] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.001063  min_lr: 0.001063  loss: 3.1538 (2.8929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (0.8184)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7425 (0.7425)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 7.4906  data: 7.0633  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9058 (0.8990)  acc1: 86.4000 (86.6182)  acc5: 98.0000 (97.6364)  time: 1.0412  data: 0.6424  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9923 (1.0256)  acc1: 82.8000 (83.4667)  acc5: 96.0000 (96.5905)  time: 0.3963  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1317 (1.0380)  acc1: 80.4000 (82.9120)  acc5: 96.0000 (96.4960)  time: 0.3962  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6869 s / it)
* Acc@1 82.924 Acc@5 96.502 loss 1.036
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.05%
Epoch: [196]  [   0/1251]  eta: 1:42:56  lr: 0.001063  min_lr: 0.001063  loss: 2.2832 (2.2832)  weight_decay: 0.0500 (0.0500)  time: 4.9375  data: 4.1522  max mem: 69511
Epoch: [196]  [ 200/1251]  eta: 0:14:08  lr: 0.001060  min_lr: 0.001060  loss: 2.5681 (2.8961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.8018)  time: 0.7876  data: 0.0005  max mem: 69511
Epoch: [196]  [ 400/1251]  eta: 0:11:17  lr: 0.001057  min_lr: 0.001057  loss: 2.9781 (2.8739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.8154)  time: 0.7854  data: 0.0006  max mem: 69511
Epoch: [196]  [ 600/1251]  eta: 0:08:36  lr: 0.001054  min_lr: 0.001054  loss: 2.9854 (2.8772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9149 (0.8492)  time: 0.7913  data: 0.0005  max mem: 69511
Epoch: [196]  [ 800/1251]  eta: 0:05:56  lr: 0.001051  min_lr: 0.001051  loss: 3.0905 (2.8919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8085 (0.8479)  time: 0.7820  data: 0.0006  max mem: 69511
Epoch: [196]  [1000/1251]  eta: 0:03:18  lr: 0.001048  min_lr: 0.001048  loss: 2.9869 (2.8870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8858 (0.8500)  time: 0.7848  data: 0.0006  max mem: 69511
Epoch: [196]  [1200/1251]  eta: 0:00:40  lr: 0.001046  min_lr: 0.001046  loss: 3.0264 (2.8947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7757 (0.8386)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001045  min_lr: 0.001045  loss: 3.0053 (2.8946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7806 (0.8376)  time: 0.6721  data: 0.0007  max mem: 69511
Epoch: [196] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.001045  min_lr: 0.001045  loss: 3.0053 (2.8886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7806 (0.8376)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.6979 (0.6979)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 7.8535  data: 7.4202  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8541 (0.8501)  acc1: 86.0000 (86.1455)  acc5: 97.6000 (97.9273)  time: 1.0746  data: 0.6748  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9703 (0.9844)  acc1: 81.2000 (83.0476)  acc5: 96.4000 (96.6857)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0838 (0.9920)  acc1: 81.2000 (82.6880)  acc5: 96.0000 (96.6080)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7005 s / it)
* Acc@1 82.802 Acc@5 96.564 loss 0.990
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 83.05%
Epoch: [197]  [   0/1251]  eta: 1:37:45  lr: 0.001045  min_lr: 0.001045  loss: 2.9909 (2.9909)  weight_decay: 0.0500 (0.0500)  time: 4.6887  data: 3.2179  max mem: 69511
Epoch: [197]  [ 200/1251]  eta: 0:14:05  lr: 0.001042  min_lr: 0.001042  loss: 2.9863 (2.9353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8508 (0.8505)  time: 0.7823  data: 0.0005  max mem: 69511
Epoch: [197]  [ 400/1251]  eta: 0:11:16  lr: 0.001039  min_lr: 0.001039  loss: 2.9659 (2.8966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7416 (0.8261)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [197]  [ 600/1251]  eta: 0:08:35  lr: 0.001036  min_lr: 0.001036  loss: 2.8361 (2.8843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7798 (0.8277)  time: 0.7881  data: 0.0008  max mem: 69511
Epoch: [197]  [ 800/1251]  eta: 0:05:56  lr: 0.001033  min_lr: 0.001033  loss: 2.9740 (2.8923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8915 (0.8583)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [197]  [1000/1251]  eta: 0:03:18  lr: 0.001031  min_lr: 0.001031  loss: 3.0649 (2.8912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.8617)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [197]  [1200/1251]  eta: 0:00:40  lr: 0.001028  min_lr: 0.001028  loss: 3.0737 (2.8943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7952 (0.8589)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001027  min_lr: 0.001027  loss: 3.1263 (2.8933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.8531)  time: 0.6680  data: 0.0005  max mem: 69511
Epoch: [197] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.001027  min_lr: 0.001027  loss: 3.1263 (2.8883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.8531)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.8426 (0.8426)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 7.8763  data: 7.4564  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9402 (0.9316)  acc1: 86.8000 (86.3273)  acc5: 97.2000 (97.8909)  time: 1.0769  data: 0.6781  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0519 (1.0607)  acc1: 81.6000 (83.2571)  acc5: 96.8000 (96.5714)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1555 (1.0722)  acc1: 81.6000 (82.8320)  acc5: 96.0000 (96.4480)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7024 s / it)
* Acc@1 82.906 Acc@5 96.524 loss 1.066
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.05%
Epoch: [198]  [   0/1251]  eta: 1:41:52  lr: 0.001027  min_lr: 0.001027  loss: 2.1121 (2.1121)  weight_decay: 0.0500 (0.0500)  time: 4.8863  data: 4.0990  max mem: 69511
Epoch: [198]  [ 200/1251]  eta: 0:14:07  lr: 0.001024  min_lr: 0.001024  loss: 2.9401 (2.8660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8938 (0.8403)  time: 0.7885  data: 0.0006  max mem: 69511
Epoch: [198]  [ 400/1251]  eta: 0:11:17  lr: 0.001021  min_lr: 0.001021  loss: 2.8838 (2.8849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7841 (0.8233)  time: 0.7895  data: 0.0006  max mem: 69511
Epoch: [198]  [ 600/1251]  eta: 0:08:36  lr: 0.001018  min_lr: 0.001018  loss: 2.9596 (2.8863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8169 (0.8228)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [198]  [ 800/1251]  eta: 0:05:56  lr: 0.001016  min_lr: 0.001016  loss: 2.9201 (2.8945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8915 (0.8318)  time: 0.7851  data: 0.0009  max mem: 69511
Epoch: [198]  [1000/1251]  eta: 0:03:18  lr: 0.001013  min_lr: 0.001013  loss: 2.9271 (2.8914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8907 (0.8359)  time: 0.7853  data: 0.0007  max mem: 69511
Epoch: [198]  [1200/1251]  eta: 0:00:40  lr: 0.001010  min_lr: 0.001010  loss: 2.9679 (2.8912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.8366)  time: 0.7879  data: 0.0007  max mem: 69511
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001009  min_lr: 0.001009  loss: 3.0633 (2.8915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7378 (0.8339)  time: 0.6668  data: 0.0006  max mem: 69511
Epoch: [198] Total time: 0:16:26 (0.7882 s / it)
Averaged stats: lr: 0.001009  min_lr: 0.001009  loss: 3.0633 (2.8769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7378 (0.8339)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.7010 (0.7010)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.8395  data: 7.4045  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.9096 (0.8719)  acc1: 86.8000 (86.2909)  acc5: 97.6000 (97.6727)  time: 1.0735  data: 0.6734  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0240 (1.0018)  acc1: 81.6000 (83.1238)  acc5: 96.8000 (96.7048)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0908 (1.0124)  acc1: 80.0000 (82.5920)  acc5: 96.0000 (96.5120)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7000 s / it)
* Acc@1 82.948 Acc@5 96.550 loss 1.002
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.05%
Epoch: [199]  [   0/1251]  eta: 1:32:24  lr: 0.001009  min_lr: 0.001009  loss: 3.1205 (3.1205)  weight_decay: 0.0500 (0.0500)  time: 4.4324  data: 3.2222  max mem: 69511
Epoch: [199]  [ 200/1251]  eta: 0:14:06  lr: 0.001006  min_lr: 0.001006  loss: 2.9429 (2.8583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (inf)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [199]  [ 400/1251]  eta: 0:11:18  lr: 0.001003  min_lr: 0.001003  loss: 2.8016 (2.8894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8153 (inf)  time: 0.7958  data: 0.0005  max mem: 69511
Epoch: [199]  [ 600/1251]  eta: 0:08:36  lr: 0.001001  min_lr: 0.001001  loss: 2.8417 (2.8837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7739 (inf)  time: 0.7849  data: 0.0005  max mem: 69511
Epoch: [199]  [ 800/1251]  eta: 0:05:57  lr: 0.000998  min_lr: 0.000998  loss: 2.7519 (2.8792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (inf)  time: 0.7909  data: 0.0006  max mem: 69511
Epoch: [199]  [1000/1251]  eta: 0:03:18  lr: 0.000995  min_lr: 0.000995  loss: 2.7943 (2.8790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7927 (inf)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [199]  [1200/1251]  eta: 0:00:40  lr: 0.000992  min_lr: 0.000992  loss: 2.9372 (2.8752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8208 (inf)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.000991  min_lr: 0.000991  loss: 3.0587 (2.8798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8286 (inf)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [199] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.000991  min_lr: 0.000991  loss: 3.0587 (2.8745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8286 (inf)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.7161 (0.7161)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 6.1262  data: 5.6950  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8619 (0.8568)  acc1: 86.0000 (86.4727)  acc5: 98.0000 (98.0000)  time: 0.9943  data: 0.5867  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0125 (0.9905)  acc1: 82.4000 (83.3905)  acc5: 96.4000 (96.6476)  time: 0.4387  data: 0.0379  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0679 (0.9995)  acc1: 81.6000 (82.9280)  acc5: 96.4000 (96.6080)  time: 0.3973  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6653 s / it)
* Acc@1 83.138 Acc@5 96.620 loss 0.998
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.14%
Epoch: [200]  [   0/1251]  eta: 1:25:29  lr: 0.000991  min_lr: 0.000991  loss: 2.2006 (2.2006)  weight_decay: 0.0500 (0.0500)  time: 4.1006  data: 3.3083  max mem: 69511
Epoch: [200]  [ 200/1251]  eta: 0:14:06  lr: 0.000989  min_lr: 0.000989  loss: 2.6944 (2.8565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8042 (0.8339)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [200]  [ 400/1251]  eta: 0:11:17  lr: 0.000986  min_lr: 0.000986  loss: 2.7660 (2.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8108 (0.8462)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [200]  [ 600/1251]  eta: 0:08:36  lr: 0.000983  min_lr: 0.000983  loss: 2.7875 (2.8523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7975 (0.8411)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [200]  [ 800/1251]  eta: 0:05:57  lr: 0.000980  min_lr: 0.000980  loss: 3.0555 (2.8566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8244 (0.8499)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [200]  [1000/1251]  eta: 0:03:18  lr: 0.000977  min_lr: 0.000977  loss: 2.8828 (2.8640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8462 (0.8471)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [200]  [1200/1251]  eta: 0:00:40  lr: 0.000974  min_lr: 0.000974  loss: 3.1033 (2.8690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8966 (0.8535)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.000974  min_lr: 0.000974  loss: 2.9625 (2.8685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8343 (0.8552)  time: 0.6732  data: 0.0006  max mem: 69511
Epoch: [200] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.000974  min_lr: 0.000974  loss: 2.9625 (2.8695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8343 (0.8552)
Test:  [ 0/25]  eta: 0:02:51  loss: 0.7148 (0.7148)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 6.8758  data: 6.4526  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8632 (0.8197)  acc1: 87.6000 (86.7273)  acc5: 97.6000 (97.6000)  time: 0.9860  data: 0.5869  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9250 (0.9439)  acc1: 81.6000 (83.5429)  acc5: 96.4000 (96.5333)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0501 (0.9544)  acc1: 80.8000 (83.0720)  acc5: 96.4000 (96.4160)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6618 s / it)
* Acc@1 83.158 Acc@5 96.586 loss 0.946
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.16%
Epoch: [201]  [   0/1251]  eta: 1:20:06  lr: 0.000974  min_lr: 0.000974  loss: 2.8914 (2.8914)  weight_decay: 0.0500 (0.0500)  time: 3.8425  data: 3.0465  max mem: 69511
Epoch: [201]  [ 200/1251]  eta: 0:14:01  lr: 0.000971  min_lr: 0.000971  loss: 2.8904 (2.8209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8152 (0.8408)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [201]  [ 400/1251]  eta: 0:11:15  lr: 0.000968  min_lr: 0.000968  loss: 2.7210 (2.8341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7469 (0.8395)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [201]  [ 600/1251]  eta: 0:08:36  lr: 0.000965  min_lr: 0.000965  loss: 3.0202 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8931 (0.8613)  time: 0.7865  data: 0.0007  max mem: 69511
Epoch: [201]  [ 800/1251]  eta: 0:05:56  lr: 0.000963  min_lr: 0.000963  loss: 2.7537 (2.8496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8622 (0.8720)  time: 0.7865  data: 0.0007  max mem: 69511
Epoch: [201]  [1000/1251]  eta: 0:03:18  lr: 0.000960  min_lr: 0.000960  loss: 2.8640 (2.8526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7948 (0.8640)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [201]  [1200/1251]  eta: 0:00:40  lr: 0.000957  min_lr: 0.000957  loss: 3.0320 (2.8526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8136 (0.8606)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 3.1050 (2.8557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8595)  time: 0.6680  data: 0.0008  max mem: 69511
Epoch: [201] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 3.1050 (2.8644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8595)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.6636 (0.6636)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.5500  data: 7.1234  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8412 (0.8323)  acc1: 86.4000 (86.2182)  acc5: 97.6000 (97.7091)  time: 1.0469  data: 0.6478  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9548 (0.9511)  acc1: 81.2000 (83.4095)  acc5: 96.0000 (96.6476)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0412 (0.9613)  acc1: 81.2000 (83.0240)  acc5: 96.0000 (96.5760)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6889 s / it)
* Acc@1 83.130 Acc@5 96.614 loss 0.954
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.16%
Epoch: [202]  [   0/1251]  eta: 1:41:04  lr: 0.000956  min_lr: 0.000956  loss: 1.8946 (1.8946)  weight_decay: 0.0500 (0.0500)  time: 4.8475  data: 4.0475  max mem: 69511
Epoch: [202]  [ 200/1251]  eta: 0:14:06  lr: 0.000953  min_lr: 0.000953  loss: 2.9967 (2.8295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8238 (0.8681)  time: 0.7818  data: 0.0004  max mem: 69511
Epoch: [202]  [ 400/1251]  eta: 0:11:17  lr: 0.000951  min_lr: 0.000951  loss: 2.9678 (2.8492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7998 (0.8576)  time: 0.7897  data: 0.0005  max mem: 69511
Epoch: [202]  [ 600/1251]  eta: 0:08:35  lr: 0.000948  min_lr: 0.000948  loss: 2.9875 (2.8393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8970 (0.8742)  time: 0.7846  data: 0.0004  max mem: 69511
Epoch: [202]  [ 800/1251]  eta: 0:05:56  lr: 0.000945  min_lr: 0.000945  loss: 2.8581 (2.8413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8352 (0.8673)  time: 0.7854  data: 0.0007  max mem: 69511
Epoch: [202]  [1000/1251]  eta: 0:03:18  lr: 0.000942  min_lr: 0.000942  loss: 2.7313 (2.8344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9104 (0.8688)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [202]  [1200/1251]  eta: 0:00:40  lr: 0.000939  min_lr: 0.000939  loss: 2.9270 (2.8386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.8624)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.000939  min_lr: 0.000939  loss: 2.7858 (2.8404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7660 (0.8607)  time: 0.6680  data: 0.0006  max mem: 69511
Epoch: [202] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.000939  min_lr: 0.000939  loss: 2.7858 (2.8553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7660 (0.8607)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.6399 (0.6399)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5482  data: 6.1125  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8710 (0.8150)  acc1: 87.6000 (86.6182)  acc5: 98.0000 (98.0727)  time: 0.9565  data: 0.5560  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9221 (0.9435)  acc1: 80.8000 (83.3333)  acc5: 96.4000 (96.8571)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0282 (0.9545)  acc1: 80.8000 (82.7520)  acc5: 96.0000 (96.7840)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6489 s / it)
* Acc@1 83.272 Acc@5 96.598 loss 0.947
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.27%
Epoch: [203]  [   0/1251]  eta: 1:25:38  lr: 0.000939  min_lr: 0.000939  loss: 2.7056 (2.7056)  weight_decay: 0.0500 (0.0500)  time: 4.1075  data: 3.3177  max mem: 69511
Epoch: [203]  [ 200/1251]  eta: 0:14:05  lr: 0.000936  min_lr: 0.000936  loss: 2.5827 (2.8797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8923 (0.8669)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [203]  [ 400/1251]  eta: 0:11:17  lr: 0.000933  min_lr: 0.000933  loss: 3.1289 (2.8694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8903 (0.8717)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [203]  [ 600/1251]  eta: 0:08:36  lr: 0.000930  min_lr: 0.000930  loss: 2.6784 (2.8586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8708 (0.8616)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [203]  [ 800/1251]  eta: 0:05:57  lr: 0.000928  min_lr: 0.000928  loss: 2.7526 (2.8569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (0.8716)  time: 0.7917  data: 0.0005  max mem: 69511
Epoch: [203]  [1000/1251]  eta: 0:03:18  lr: 0.000925  min_lr: 0.000925  loss: 3.0864 (2.8690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8174 (0.8683)  time: 0.7840  data: 0.0005  max mem: 69511
Epoch: [203]  [1200/1251]  eta: 0:00:40  lr: 0.000922  min_lr: 0.000922  loss: 3.0453 (2.8696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8716)  time: 0.7848  data: 0.0004  max mem: 69511
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.000922  min_lr: 0.000922  loss: 2.6788 (2.8653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8716)  time: 0.6670  data: 0.0006  max mem: 69511
Epoch: [203] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.000922  min_lr: 0.000922  loss: 2.6788 (2.8592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8716)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.6195 (0.6195)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 6.7441  data: 6.3047  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.7913 (0.7832)  acc1: 86.4000 (86.2909)  acc5: 98.0000 (97.8545)  time: 0.9743  data: 0.5735  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8933 (0.8919)  acc1: 81.6000 (83.5238)  acc5: 96.4000 (96.7238)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9257 (0.9044)  acc1: 81.6000 (83.0720)  acc5: 96.4000 (96.5600)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6598 s / it)
* Acc@1 83.290 Acc@5 96.628 loss 0.905
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.29%
Epoch: [204]  [   0/1251]  eta: 1:29:32  lr: 0.000921  min_lr: 0.000921  loss: 2.0596 (2.0596)  weight_decay: 0.0500 (0.0500)  time: 4.2945  data: 3.5039  max mem: 69511
Epoch: [204]  [ 200/1251]  eta: 0:14:07  lr: 0.000919  min_lr: 0.000919  loss: 3.0057 (2.8429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8261 (0.8343)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [204]  [ 400/1251]  eta: 0:11:16  lr: 0.000916  min_lr: 0.000916  loss: 3.0805 (2.8356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8071 (0.8383)  time: 0.7825  data: 0.0006  max mem: 69511
Epoch: [204]  [ 600/1251]  eta: 0:08:35  lr: 0.000913  min_lr: 0.000913  loss: 2.8297 (2.8498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8239 (0.8414)  time: 0.7829  data: 0.0005  max mem: 69511
Epoch: [204]  [ 800/1251]  eta: 0:05:56  lr: 0.000910  min_lr: 0.000910  loss: 2.8063 (2.8491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7682 (0.8439)  time: 0.7824  data: 0.0004  max mem: 69511
Epoch: [204]  [1000/1251]  eta: 0:03:17  lr: 0.000908  min_lr: 0.000908  loss: 2.8760 (2.8419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8861 (0.8502)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [204]  [1200/1251]  eta: 0:00:40  lr: 0.000905  min_lr: 0.000905  loss: 2.9886 (2.8504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8458 (0.8571)  time: 0.7927  data: 0.0004  max mem: 69511
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.000904  min_lr: 0.000904  loss: 2.9885 (2.8521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8176 (0.8571)  time: 0.6716  data: 0.0005  max mem: 69511
Epoch: [204] Total time: 0:16:25 (0.7874 s / it)
Averaged stats: lr: 0.000904  min_lr: 0.000904  loss: 2.9885 (2.8488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8176 (0.8571)
Test:  [ 0/25]  eta: 0:02:32  loss: 0.6648 (0.6648)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 6.0848  data: 5.6300  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8462 (0.8279)  acc1: 86.4000 (86.6182)  acc5: 97.6000 (97.6727)  time: 0.9719  data: 0.5701  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9235 (0.9498)  acc1: 81.2000 (83.7714)  acc5: 96.8000 (96.7429)  time: 0.4287  data: 0.0321  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0544 (0.9621)  acc1: 81.2000 (83.3120)  acc5: 96.0000 (96.5600)  time: 0.3975  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6559 s / it)
* Acc@1 83.272 Acc@5 96.614 loss 0.957
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.29%
Epoch: [205]  [   0/1251]  eta: 1:45:43  lr: 0.000904  min_lr: 0.000904  loss: 2.6554 (2.6554)  weight_decay: 0.0500 (0.0500)  time: 5.0709  data: 3.6849  max mem: 69511
Epoch: [205]  [ 200/1251]  eta: 0:14:07  lr: 0.000901  min_lr: 0.000901  loss: 3.0130 (2.9078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8924 (0.8883)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [205]  [ 400/1251]  eta: 0:11:18  lr: 0.000899  min_lr: 0.000899  loss: 2.9359 (2.8846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (0.8853)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [205]  [ 600/1251]  eta: 0:08:36  lr: 0.000896  min_lr: 0.000896  loss: 2.9766 (2.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9183 (0.8899)  time: 0.7832  data: 0.0005  max mem: 69511
Epoch: [205]  [ 800/1251]  eta: 0:05:56  lr: 0.000893  min_lr: 0.000893  loss: 2.8996 (2.8702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8024 (0.8789)  time: 0.7849  data: 0.0006  max mem: 69511
Epoch: [205]  [1000/1251]  eta: 0:03:18  lr: 0.000891  min_lr: 0.000891  loss: 3.1410 (2.8678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7708 (0.8706)  time: 0.7921  data: 0.0004  max mem: 69511
Epoch: [205]  [1200/1251]  eta: 0:00:40  lr: 0.000888  min_lr: 0.000888  loss: 3.0111 (2.8627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.8788)  time: 0.7840  data: 0.0004  max mem: 69511
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.000887  min_lr: 0.000887  loss: 3.0470 (2.8635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (0.8764)  time: 0.6646  data: 0.0005  max mem: 69511
Epoch: [205] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.000887  min_lr: 0.000887  loss: 3.0470 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (0.8764)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.6815 (0.6815)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 7.8146  data: 7.3763  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8124 (0.8322)  acc1: 86.4000 (86.7636)  acc5: 97.6000 (97.7818)  time: 1.0684  data: 0.6708  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9501 (0.9594)  acc1: 80.8000 (83.2191)  acc5: 96.0000 (96.6667)  time: 0.3936  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0482 (0.9680)  acc1: 80.0000 (82.6560)  acc5: 96.0000 (96.5920)  time: 0.3935  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6945 s / it)
* Acc@1 83.268 Acc@5 96.610 loss 0.954
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.29%
Epoch: [206]  [   0/1251]  eta: 1:38:31  lr: 0.000887  min_lr: 0.000887  loss: 2.7105 (2.7105)  weight_decay: 0.0500 (0.0500)  time: 4.7255  data: 3.9382  max mem: 69511
Epoch: [206]  [ 200/1251]  eta: 0:14:05  lr: 0.000884  min_lr: 0.000884  loss: 2.9089 (2.8056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8634 (0.8469)  time: 0.7913  data: 0.0004  max mem: 69511
Epoch: [206]  [ 400/1251]  eta: 0:11:17  lr: 0.000882  min_lr: 0.000882  loss: 2.8514 (2.8262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7644 (0.8419)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [206]  [ 600/1251]  eta: 0:08:36  lr: 0.000879  min_lr: 0.000879  loss: 3.0036 (2.8190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8233 (0.8475)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [206]  [ 800/1251]  eta: 0:05:57  lr: 0.000876  min_lr: 0.000876  loss: 3.1818 (2.8231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8787 (0.8545)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [206]  [1000/1251]  eta: 0:03:18  lr: 0.000873  min_lr: 0.000873  loss: 2.7515 (2.8232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8761 (0.8684)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [206]  [1200/1251]  eta: 0:00:40  lr: 0.000871  min_lr: 0.000871  loss: 2.7006 (2.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.8636)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000870  min_lr: 0.000870  loss: 2.9800 (2.8276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.8618)  time: 0.6719  data: 0.0005  max mem: 69511
Epoch: [206] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.000870  min_lr: 0.000870  loss: 2.9800 (2.8409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.8618)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.6352 (0.6352)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 7.8052  data: 7.3901  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8127 (0.8036)  acc1: 86.4000 (86.5091)  acc5: 98.0000 (97.8546)  time: 1.0705  data: 0.6721  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9272 (0.9425)  acc1: 82.4000 (83.4857)  acc5: 96.0000 (96.6857)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0447 (0.9543)  acc1: 81.6000 (83.0400)  acc5: 96.0000 (96.6240)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7016 s / it)
* Acc@1 83.236 Acc@5 96.626 loss 0.944
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.29%
Epoch: [207]  [   0/1251]  eta: 1:40:23  lr: 0.000870  min_lr: 0.000870  loss: 2.9891 (2.9891)  weight_decay: 0.0500 (0.0500)  time: 4.8149  data: 4.0360  max mem: 69511
Epoch: [207]  [ 200/1251]  eta: 0:14:09  lr: 0.000867  min_lr: 0.000867  loss: 2.7141 (2.8168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8694 (0.8634)  time: 0.7943  data: 0.0004  max mem: 69511
Epoch: [207]  [ 400/1251]  eta: 0:11:18  lr: 0.000865  min_lr: 0.000865  loss: 3.0200 (2.8276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9062 (0.8783)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [207]  [ 600/1251]  eta: 0:08:36  lr: 0.000862  min_lr: 0.000862  loss: 3.0959 (2.8378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9561 (0.9086)  time: 0.7877  data: 0.0006  max mem: 69511
Epoch: [207]  [ 800/1251]  eta: 0:05:57  lr: 0.000859  min_lr: 0.000859  loss: 3.0178 (2.8332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.9034)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [207]  [1000/1251]  eta: 0:03:18  lr: 0.000857  min_lr: 0.000857  loss: 2.8906 (2.8354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8520 (0.9020)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [207]  [1200/1251]  eta: 0:00:40  lr: 0.000854  min_lr: 0.000854  loss: 2.8676 (2.8468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9551 (0.9039)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000853  min_lr: 0.000853  loss: 3.0249 (2.8477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9551 (0.9062)  time: 0.6690  data: 0.0005  max mem: 69511
Epoch: [207] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.000853  min_lr: 0.000853  loss: 3.0249 (2.8419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9551 (0.9062)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.7286 (0.7286)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.1014  data: 6.6671  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8684 (0.8654)  acc1: 86.8000 (86.6182)  acc5: 98.0000 (97.8182)  time: 1.0065  data: 0.6064  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0123 (0.9775)  acc1: 81.6000 (83.3333)  acc5: 96.4000 (96.5714)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0489 (0.9926)  acc1: 80.8000 (82.8480)  acc5: 96.0000 (96.5120)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6711 s / it)
* Acc@1 83.320 Acc@5 96.590 loss 0.978
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.32%
Epoch: [208]  [   0/1251]  eta: 1:34:21  lr: 0.000853  min_lr: 0.000853  loss: 3.3910 (3.3910)  weight_decay: 0.0500 (0.0500)  time: 4.5255  data: 3.7381  max mem: 69511
Epoch: [208]  [ 200/1251]  eta: 0:14:06  lr: 0.000850  min_lr: 0.000850  loss: 3.1129 (2.8282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8872 (0.9044)  time: 0.7847  data: 0.0005  max mem: 69511
Epoch: [208]  [ 400/1251]  eta: 0:11:17  lr: 0.000848  min_lr: 0.000848  loss: 2.8972 (2.8484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (0.8957)  time: 0.7928  data: 0.0004  max mem: 69511
Epoch: [208]  [ 600/1251]  eta: 0:08:36  lr: 0.000845  min_lr: 0.000845  loss: 2.9901 (2.8359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8143 (0.8925)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [208]  [ 800/1251]  eta: 0:05:57  lr: 0.000842  min_lr: 0.000842  loss: 2.8475 (2.8327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9453 (0.8843)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [208]  [1000/1251]  eta: 0:03:18  lr: 0.000840  min_lr: 0.000840  loss: 2.9359 (2.8382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.8805)  time: 0.7926  data: 0.0005  max mem: 69511
Epoch: [208]  [1200/1251]  eta: 0:00:40  lr: 0.000837  min_lr: 0.000837  loss: 2.9021 (2.8395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (0.8730)  time: 0.7917  data: 0.0004  max mem: 69511
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000836  min_lr: 0.000836  loss: 3.0211 (2.8406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9017 (0.8764)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [208] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.000836  min_lr: 0.000836  loss: 3.0211 (2.8396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9017 (0.8764)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6887 (0.6887)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.5053  data: 7.0919  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8717 (0.8357)  acc1: 86.8000 (86.9818)  acc5: 98.0000 (98.0364)  time: 1.0433  data: 0.6450  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9391 (0.9667)  acc1: 82.0000 (83.4667)  acc5: 96.8000 (96.8381)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0514 (0.9785)  acc1: 81.2000 (83.0240)  acc5: 96.0000 (96.7200)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6873 s / it)
* Acc@1 83.354 Acc@5 96.662 loss 0.968
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.35%
Epoch: [209]  [   0/1251]  eta: 1:36:31  lr: 0.000836  min_lr: 0.000836  loss: 2.0086 (2.0086)  weight_decay: 0.0500 (0.0500)  time: 4.6298  data: 3.8420  max mem: 69511
Epoch: [209]  [ 200/1251]  eta: 0:14:05  lr: 0.000834  min_lr: 0.000834  loss: 2.8521 (2.7988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8193 (0.8714)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [209]  [ 400/1251]  eta: 0:11:17  lr: 0.000831  min_lr: 0.000831  loss: 2.8725 (2.7738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8264 (0.8873)  time: 0.7872  data: 0.0005  max mem: 69511
Epoch: [209]  [ 600/1251]  eta: 0:08:37  lr: 0.000828  min_lr: 0.000828  loss: 2.7969 (2.7866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8785 (0.8982)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [209]  [ 800/1251]  eta: 0:05:57  lr: 0.000826  min_lr: 0.000826  loss: 2.9281 (2.7882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.8926)  time: 0.7927  data: 0.0004  max mem: 69511
Epoch: [209]  [1000/1251]  eta: 0:03:18  lr: 0.000823  min_lr: 0.000823  loss: 3.0367 (2.8066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9060 (0.9023)  time: 0.7948  data: 0.0004  max mem: 69511
Epoch: [209]  [1200/1251]  eta: 0:00:40  lr: 0.000820  min_lr: 0.000820  loss: 2.6662 (2.8190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8568 (0.8960)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000820  min_lr: 0.000820  loss: 2.6675 (2.8208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9650 (0.9005)  time: 0.6676  data: 0.0004  max mem: 69511
Epoch: [209] Total time: 0:16:27 (0.7897 s / it)
Averaged stats: lr: 0.000820  min_lr: 0.000820  loss: 2.6675 (2.8215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9650 (0.9005)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6039 (0.6039)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.3417  data: 4.9117  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8028 (0.7821)  acc1: 87.2000 (86.5455)  acc5: 98.0000 (98.0364)  time: 1.0015  data: 0.5998  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9180 (0.9054)  acc1: 81.6000 (83.3714)  acc5: 96.8000 (97.0095)  time: 0.4821  data: 0.0843  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9536 (0.9182)  acc1: 81.6000 (82.8800)  acc5: 96.0000 (96.8480)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6690 s / it)
* Acc@1 83.338 Acc@5 96.772 loss 0.911
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.35%
Epoch: [210]  [   0/1251]  eta: 1:26:23  lr: 0.000820  min_lr: 0.000820  loss: 3.0407 (3.0407)  weight_decay: 0.0500 (0.0500)  time: 4.1432  data: 2.1308  max mem: 69511
Epoch: [210]  [ 200/1251]  eta: 0:14:06  lr: 0.000817  min_lr: 0.000817  loss: 3.0158 (2.8161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8727 (0.8610)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [210]  [ 400/1251]  eta: 0:11:18  lr: 0.000814  min_lr: 0.000814  loss: 2.9051 (2.8020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8818 (0.8717)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [210]  [ 600/1251]  eta: 0:08:36  lr: 0.000812  min_lr: 0.000812  loss: 2.9495 (2.8194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8232 (0.8748)  time: 0.7864  data: 0.0003  max mem: 69511
Epoch: [210]  [ 800/1251]  eta: 0:05:57  lr: 0.000809  min_lr: 0.000809  loss: 2.9348 (2.8188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8925)  time: 0.7921  data: 0.0004  max mem: 69511
Epoch: [210]  [1000/1251]  eta: 0:03:18  lr: 0.000806  min_lr: 0.000806  loss: 2.9047 (2.8248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (0.8869)  time: 0.7843  data: 0.0004  max mem: 69511
Epoch: [210]  [1200/1251]  eta: 0:00:40  lr: 0.000804  min_lr: 0.000804  loss: 2.8559 (2.8210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8694 (0.8921)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000803  min_lr: 0.000803  loss: 2.9311 (2.8206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (0.8958)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [210] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.000803  min_lr: 0.000803  loss: 2.9311 (2.8185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (0.8958)
Test:  [ 0/25]  eta: 0:03:25  loss: 0.6621 (0.6621)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 8.2198  data: 7.8016  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8634 (0.8153)  acc1: 85.6000 (86.2545)  acc5: 97.6000 (97.8182)  time: 1.1076  data: 0.7095  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9121 (0.9524)  acc1: 82.4000 (83.3524)  acc5: 96.4000 (96.5714)  time: 0.3964  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0550 (0.9634)  acc1: 82.0000 (83.0080)  acc5: 96.0000 (96.4960)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7156 s / it)
* Acc@1 83.488 Acc@5 96.588 loss 0.954
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.49%
Epoch: [211]  [   0/1251]  eta: 1:26:47  lr: 0.000803  min_lr: 0.000803  loss: 3.0063 (3.0063)  weight_decay: 0.0500 (0.0500)  time: 4.1626  data: 3.3833  max mem: 69511
Epoch: [211]  [ 200/1251]  eta: 0:14:04  lr: 0.000801  min_lr: 0.000801  loss: 2.9211 (2.7902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8521 (0.8951)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [211]  [ 400/1251]  eta: 0:11:16  lr: 0.000798  min_lr: 0.000798  loss: 2.6305 (2.7885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9583 (0.9025)  time: 0.7854  data: 0.0003  max mem: 69511
Epoch: [211]  [ 600/1251]  eta: 0:08:35  lr: 0.000795  min_lr: 0.000795  loss: 2.7844 (2.7932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (0.8831)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [211]  [ 800/1251]  eta: 0:05:56  lr: 0.000793  min_lr: 0.000793  loss: 2.8858 (2.7910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8699 (0.8851)  time: 0.7830  data: 0.0004  max mem: 69511
Epoch: [211]  [1000/1251]  eta: 0:03:18  lr: 0.000790  min_lr: 0.000790  loss: 2.8943 (2.7887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8525 (0.8891)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [211]  [1200/1251]  eta: 0:00:40  lr: 0.000787  min_lr: 0.000787  loss: 2.9521 (2.7956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8421 (nan)  time: 0.7851  data: 0.0004  max mem: 69511
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000787  min_lr: 0.000787  loss: 2.9824 (2.7960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8986 (nan)  time: 0.6714  data: 0.0004  max mem: 69511
Epoch: [211] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.000787  min_lr: 0.000787  loss: 2.9824 (2.8211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8986 (nan)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.7899 (0.7899)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 7.3266  data: 6.9064  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9250 (0.9352)  acc1: 86.4000 (86.2545)  acc5: 98.0000 (98.0000)  time: 1.0265  data: 0.6281  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0739 (1.0444)  acc1: 82.8000 (83.3524)  acc5: 96.8000 (96.9714)  time: 0.3965  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0994 (1.0560)  acc1: 81.6000 (82.8480)  acc5: 96.4000 (96.9440)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6790 s / it)
* Acc@1 83.332 Acc@5 96.748 loss 1.048
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.49%
Epoch: [212]  [   0/1251]  eta: 1:42:08  lr: 0.000787  min_lr: 0.000787  loss: 2.6295 (2.6295)  weight_decay: 0.0500 (0.0500)  time: 4.8988  data: 4.1039  max mem: 69511
Epoch: [212]  [ 200/1251]  eta: 0:14:06  lr: 0.000784  min_lr: 0.000784  loss: 2.7948 (2.7886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8823 (0.8836)  time: 0.7822  data: 0.0004  max mem: 69511
Epoch: [212]  [ 400/1251]  eta: 0:11:16  lr: 0.000781  min_lr: 0.000781  loss: 2.7267 (2.7846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8445 (0.8851)  time: 0.7950  data: 0.0004  max mem: 69511
Epoch: [212]  [ 600/1251]  eta: 0:08:36  lr: 0.000779  min_lr: 0.000779  loss: 2.6723 (2.7946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9482 (0.9080)  time: 0.7855  data: 0.0006  max mem: 69511
Epoch: [212]  [ 800/1251]  eta: 0:05:56  lr: 0.000776  min_lr: 0.000776  loss: 2.8802 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8616 (0.9165)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [212]  [1000/1251]  eta: 0:03:18  lr: 0.000774  min_lr: 0.000774  loss: 2.9123 (2.8132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9527 (0.9255)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [212]  [1200/1251]  eta: 0:00:40  lr: 0.000771  min_lr: 0.000771  loss: 2.6650 (2.8175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8176 (0.9209)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000770  min_lr: 0.000770  loss: 2.9716 (2.8220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7918 (0.9168)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [212] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.000770  min_lr: 0.000770  loss: 2.9716 (2.8139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7918 (0.9168)
Test:  [ 0/25]  eta: 0:02:54  loss: 0.6589 (0.6589)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.9700  data: 6.5515  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8795 (0.8315)  acc1: 87.6000 (86.9091)  acc5: 98.0000 (98.0727)  time: 0.9948  data: 0.5959  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9521 (0.9624)  acc1: 82.8000 (83.6952)  acc5: 96.8000 (96.8952)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0646 (0.9812)  acc1: 81.6000 (83.0240)  acc5: 96.8000 (96.8160)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6671 s / it)
* Acc@1 83.442 Acc@5 96.692 loss 0.970
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.49%
Epoch: [213]  [   0/1251]  eta: 1:35:18  lr: 0.000770  min_lr: 0.000770  loss: 2.9061 (2.9061)  weight_decay: 0.0500 (0.0500)  time: 4.5708  data: 3.1824  max mem: 69511
Epoch: [213]  [ 200/1251]  eta: 0:14:07  lr: 0.000768  min_lr: 0.000768  loss: 2.7038 (2.7872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.9161)  time: 0.7940  data: 0.0007  max mem: 69511
Epoch: [213]  [ 400/1251]  eta: 0:11:18  lr: 0.000765  min_lr: 0.000765  loss: 2.7413 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8318 (0.9179)  time: 0.7940  data: 0.0005  max mem: 69511
Epoch: [213]  [ 600/1251]  eta: 0:08:37  lr: 0.000763  min_lr: 0.000763  loss: 2.9693 (2.7903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (0.9086)  time: 0.7932  data: 0.0011  max mem: 69511
Epoch: [213]  [ 800/1251]  eta: 0:05:57  lr: 0.000760  min_lr: 0.000760  loss: 2.9760 (2.7832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9877 (0.9168)  time: 0.7859  data: 0.0009  max mem: 69511
Epoch: [213]  [1000/1251]  eta: 0:03:18  lr: 0.000757  min_lr: 0.000757  loss: 2.9571 (2.7824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8125 (0.9223)  time: 0.7905  data: 0.0008  max mem: 69511
Epoch: [213]  [1200/1251]  eta: 0:00:40  lr: 0.000755  min_lr: 0.000755  loss: 2.8415 (2.7887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8518 (0.9149)  time: 0.7864  data: 0.0008  max mem: 69511
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 3.0703 (2.7907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8413 (0.9153)  time: 0.6677  data: 0.0007  max mem: 69511
Epoch: [213] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 3.0703 (2.7943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8413 (0.9153)
Test:  [ 0/25]  eta: 0:03:00  loss: 0.7372 (0.7372)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.2044  data: 6.7717  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9148 (0.8993)  acc1: 85.6000 (86.6182)  acc5: 98.0000 (98.0727)  time: 1.0159  data: 0.6159  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0108 (1.0280)  acc1: 82.0000 (83.6571)  acc5: 96.4000 (96.8381)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0973 (1.0386)  acc1: 81.6000 (83.2480)  acc5: 96.0000 (96.7040)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6756 s / it)
* Acc@1 83.470 Acc@5 96.714 loss 1.026
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.49%
Epoch: [214]  [   0/1251]  eta: 1:44:00  lr: 0.000754  min_lr: 0.000754  loss: 2.1626 (2.1626)  weight_decay: 0.0500 (0.0500)  time: 4.9882  data: 3.5341  max mem: 69511
Epoch: [214]  [ 200/1251]  eta: 0:14:10  lr: 0.000752  min_lr: 0.000752  loss: 2.7512 (2.7953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8952 (0.9108)  time: 0.7939  data: 0.0005  max mem: 69511
Epoch: [214]  [ 400/1251]  eta: 0:11:18  lr: 0.000749  min_lr: 0.000749  loss: 2.9645 (2.7953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (0.8985)  time: 0.7822  data: 0.0005  max mem: 69511
Epoch: [214]  [ 600/1251]  eta: 0:08:36  lr: 0.000746  min_lr: 0.000746  loss: 2.7572 (2.7982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8891 (0.9141)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [214]  [ 800/1251]  eta: 0:05:56  lr: 0.000744  min_lr: 0.000744  loss: 2.9451 (2.8000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (0.9194)  time: 0.7937  data: 0.0006  max mem: 69511
Epoch: [214]  [1000/1251]  eta: 0:03:18  lr: 0.000741  min_lr: 0.000741  loss: 2.8314 (2.7989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9006 (0.9191)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [214]  [1200/1251]  eta: 0:00:40  lr: 0.000739  min_lr: 0.000739  loss: 2.8519 (2.8003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9203 (0.9147)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000738  min_lr: 0.000738  loss: 2.9977 (2.8017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8923 (0.9143)  time: 0.6676  data: 0.0007  max mem: 69511
Epoch: [214] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.000738  min_lr: 0.000738  loss: 2.9977 (2.8007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8923 (0.9143)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6564 (0.6564)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 7.5049  data: 7.0863  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8772 (0.8288)  acc1: 86.0000 (86.1091)  acc5: 98.0000 (97.8545)  time: 1.0432  data: 0.6445  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9735 (0.9605)  acc1: 82.4000 (83.5238)  acc5: 96.4000 (96.8381)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0380 (0.9741)  acc1: 82.4000 (83.2320)  acc5: 96.4000 (96.8000)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6882 s / it)
* Acc@1 83.564 Acc@5 96.784 loss 0.963
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.56%
Epoch: [215]  [   0/1251]  eta: 1:28:53  lr: 0.000738  min_lr: 0.000738  loss: 1.8483 (1.8483)  weight_decay: 0.0500 (0.0500)  time: 4.2631  data: 3.4717  max mem: 69511
Epoch: [215]  [ 200/1251]  eta: 0:14:07  lr: 0.000736  min_lr: 0.000736  loss: 2.8180 (2.7903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9202 (0.9239)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [215]  [ 400/1251]  eta: 0:11:17  lr: 0.000733  min_lr: 0.000733  loss: 3.0239 (2.7753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8939 (0.9169)  time: 0.7828  data: 0.0006  max mem: 69511
Epoch: [215]  [ 600/1251]  eta: 0:08:35  lr: 0.000730  min_lr: 0.000730  loss: 2.8973 (2.7742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8876 (0.9143)  time: 0.7823  data: 0.0005  max mem: 69511
Epoch: [215]  [ 800/1251]  eta: 0:05:56  lr: 0.000728  min_lr: 0.000728  loss: 2.5836 (2.7663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9251 (0.9235)  time: 0.7838  data: 0.0004  max mem: 69511
Epoch: [215]  [1000/1251]  eta: 0:03:18  lr: 0.000725  min_lr: 0.000725  loss: 2.6692 (2.7799)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0377 (0.9324)  time: 0.7829  data: 0.0005  max mem: 69511
Epoch: [215]  [1200/1251]  eta: 0:00:40  lr: 0.000723  min_lr: 0.000723  loss: 2.7731 (2.7775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.9324)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000722  min_lr: 0.000722  loss: 2.9790 (2.7779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (0.9326)  time: 0.6671  data: 0.0006  max mem: 69511
Epoch: [215] Total time: 0:16:24 (0.7874 s / it)
Averaged stats: lr: 0.000722  min_lr: 0.000722  loss: 2.9790 (2.8006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (0.9326)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6456 (0.6456)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 7.5137  data: 7.0851  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8395 (0.8047)  acc1: 87.2000 (86.4727)  acc5: 98.0000 (98.2546)  time: 1.0438  data: 0.6444  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9473 (0.9089)  acc1: 81.6000 (83.6381)  acc5: 96.8000 (97.1810)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9511 (0.9207)  acc1: 81.2000 (83.1360)  acc5: 96.4000 (97.0400)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6871 s / it)
* Acc@1 83.634 Acc@5 96.850 loss 0.912
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.63%
Epoch: [216]  [   0/1251]  eta: 1:26:53  lr: 0.000722  min_lr: 0.000722  loss: 2.7574 (2.7574)  weight_decay: 0.0500 (0.0500)  time: 4.1676  data: 3.3830  max mem: 69511
Epoch: [216]  [ 200/1251]  eta: 0:14:00  lr: 0.000720  min_lr: 0.000720  loss: 2.9715 (2.8408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (0.8806)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [216]  [ 400/1251]  eta: 0:11:15  lr: 0.000717  min_lr: 0.000717  loss: 2.8374 (2.8167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9449 (0.9066)  time: 0.7932  data: 0.0004  max mem: 69511
Epoch: [216]  [ 600/1251]  eta: 0:08:34  lr: 0.000715  min_lr: 0.000715  loss: 2.9993 (2.8091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8615 (0.9089)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [216]  [ 800/1251]  eta: 0:05:55  lr: 0.000712  min_lr: 0.000712  loss: 2.7733 (2.8002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8406 (0.9275)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [216]  [1000/1251]  eta: 0:03:18  lr: 0.000709  min_lr: 0.000709  loss: 2.4635 (2.7886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (0.9276)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [216]  [1200/1251]  eta: 0:00:40  lr: 0.000707  min_lr: 0.000707  loss: 2.9541 (2.7874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.9277)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000706  min_lr: 0.000706  loss: 2.7671 (2.7889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7646 (0.9244)  time: 0.6684  data: 0.0006  max mem: 69511
Epoch: [216] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.000706  min_lr: 0.000706  loss: 2.7671 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7646 (0.9244)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6568 (0.6568)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 7.4860  data: 7.0551  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8247 (0.7922)  acc1: 86.4000 (86.5818)  acc5: 98.0000 (98.0364)  time: 1.0414  data: 0.6417  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9048 (0.8938)  acc1: 82.4000 (83.9619)  acc5: 97.2000 (96.9143)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9509 (0.9106)  acc1: 82.0000 (83.3280)  acc5: 96.0000 (96.7520)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6861 s / it)
* Acc@1 83.844 Acc@5 96.806 loss 0.900
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.84%
Epoch: [217]  [   0/1251]  eta: 1:11:05  lr: 0.000706  min_lr: 0.000706  loss: 2.9635 (2.9635)  weight_decay: 0.0500 (0.0500)  time: 3.4097  data: 2.6275  max mem: 69511
Epoch: [217]  [ 200/1251]  eta: 0:14:00  lr: 0.000704  min_lr: 0.000704  loss: 2.8432 (2.8005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8316 (0.9124)  time: 0.7941  data: 0.0005  max mem: 69511
Epoch: [217]  [ 400/1251]  eta: 0:11:16  lr: 0.000701  min_lr: 0.000701  loss: 2.8665 (2.8038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9485 (0.9414)  time: 0.7941  data: 0.0004  max mem: 69511
Epoch: [217]  [ 600/1251]  eta: 0:08:35  lr: 0.000699  min_lr: 0.000699  loss: 2.5829 (2.7904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9649 (0.9469)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [217]  [ 800/1251]  eta: 0:05:56  lr: 0.000696  min_lr: 0.000696  loss: 2.8716 (2.7931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9729 (inf)  time: 0.7921  data: 0.0007  max mem: 69511
Epoch: [217]  [1000/1251]  eta: 0:03:18  lr: 0.000694  min_lr: 0.000694  loss: 2.9936 (2.7894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9922 (inf)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [217]  [1200/1251]  eta: 0:00:40  lr: 0.000691  min_lr: 0.000691  loss: 2.6807 (2.7864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9182 (inf)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000691  min_lr: 0.000691  loss: 2.8837 (2.7866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (inf)  time: 0.6676  data: 0.0006  max mem: 69511
Epoch: [217] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.000691  min_lr: 0.000691  loss: 2.8837 (2.7886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (inf)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7156 (0.7156)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.7014  data: 7.2888  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7940 (0.8148)  acc1: 86.0000 (86.4727)  acc5: 97.6000 (98.0000)  time: 1.0613  data: 0.6629  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9810 (0.9346)  acc1: 82.4000 (83.7143)  acc5: 96.8000 (97.0095)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0132 (0.9487)  acc1: 82.0000 (83.3120)  acc5: 96.0000 (96.8160)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6954 s / it)
* Acc@1 83.716 Acc@5 96.822 loss 0.938
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.84%
Epoch: [218]  [   0/1251]  eta: 1:35:00  lr: 0.000691  min_lr: 0.000691  loss: 3.1781 (3.1781)  weight_decay: 0.0500 (0.0500)  time: 4.5567  data: 3.3568  max mem: 69511
Epoch: [218]  [ 200/1251]  eta: 0:14:07  lr: 0.000688  min_lr: 0.000688  loss: 2.9110 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9065 (0.8745)  time: 0.7981  data: 0.0004  max mem: 69511
Epoch: [218]  [ 400/1251]  eta: 0:11:16  lr: 0.000686  min_lr: 0.000686  loss: 2.6840 (2.7762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9242 (0.9136)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [218]  [ 600/1251]  eta: 0:08:34  lr: 0.000683  min_lr: 0.000683  loss: 3.0130 (2.7780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8320 (0.9172)  time: 0.7825  data: 0.0004  max mem: 69511
Epoch: [218]  [ 800/1251]  eta: 0:05:56  lr: 0.000681  min_lr: 0.000681  loss: 2.8392 (2.7823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9060 (0.9282)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [218]  [1000/1251]  eta: 0:03:18  lr: 0.000678  min_lr: 0.000678  loss: 2.7517 (2.7880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9037 (0.9256)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [218]  [1200/1251]  eta: 0:00:40  lr: 0.000676  min_lr: 0.000676  loss: 2.9657 (2.7809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9617 (0.9313)  time: 0.7933  data: 0.0003  max mem: 69511
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000675  min_lr: 0.000675  loss: 2.8490 (2.7789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9644 (0.9328)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [218] Total time: 0:16:24 (0.7872 s / it)
Averaged stats: lr: 0.000675  min_lr: 0.000675  loss: 2.8490 (2.7798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9644 (0.9328)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.6325 (0.6325)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.7651  data: 7.3404  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7923 (0.7734)  acc1: 85.6000 (85.9273)  acc5: 98.0000 (98.1091)  time: 1.0668  data: 0.6676  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8806 (0.8926)  acc1: 82.0000 (83.6571)  acc5: 97.2000 (96.9714)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9930 (0.9062)  acc1: 81.6000 (83.2640)  acc5: 96.4000 (96.8320)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6988 s / it)
* Acc@1 83.718 Acc@5 96.806 loss 0.893
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.84%
Epoch: [219]  [   0/1251]  eta: 1:34:46  lr: 0.000675  min_lr: 0.000675  loss: 3.0468 (3.0468)  weight_decay: 0.0500 (0.0500)  time: 4.5459  data: 3.0462  max mem: 69511
Epoch: [219]  [ 200/1251]  eta: 0:14:08  lr: 0.000673  min_lr: 0.000673  loss: 2.9631 (2.8016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9961 (1.0151)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [219]  [ 400/1251]  eta: 0:11:18  lr: 0.000670  min_lr: 0.000670  loss: 2.7799 (2.7733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9813 (1.0018)  time: 0.7841  data: 0.0004  max mem: 69511
Epoch: [219]  [ 600/1251]  eta: 0:08:36  lr: 0.000668  min_lr: 0.000668  loss: 2.9403 (2.7614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9581 (0.9786)  time: 0.7890  data: 0.0004  max mem: 69511
Epoch: [219]  [ 800/1251]  eta: 0:05:56  lr: 0.000665  min_lr: 0.000665  loss: 2.8878 (2.7651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8830 (0.9709)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [219]  [1000/1251]  eta: 0:03:18  lr: 0.000663  min_lr: 0.000663  loss: 2.6425 (2.7606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8981 (0.9652)  time: 0.7842  data: 0.0004  max mem: 69511
Epoch: [219]  [1200/1251]  eta: 0:00:40  lr: 0.000660  min_lr: 0.000660  loss: 2.7382 (2.7599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8643 (0.9508)  time: 0.7820  data: 0.0005  max mem: 69511
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000660  min_lr: 0.000660  loss: 2.7053 (2.7613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9134 (0.9520)  time: 0.6640  data: 0.0006  max mem: 69511
Epoch: [219] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.000660  min_lr: 0.000660  loss: 2.7053 (2.7737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9134 (0.9520)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.6162 (0.6162)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.7395  data: 7.3216  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7422 (0.7220)  acc1: 87.6000 (86.3636)  acc5: 97.6000 (97.9636)  time: 1.0614  data: 0.6658  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8382 (0.8416)  acc1: 81.2000 (83.6381)  acc5: 96.8000 (96.8000)  time: 0.3936  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9315 (0.8502)  acc1: 81.2000 (83.2960)  acc5: 96.0000 (96.7040)  time: 0.3936  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6922 s / it)
* Acc@1 83.780 Acc@5 96.796 loss 0.831
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.84%
Epoch: [220]  [   0/1251]  eta: 1:42:48  lr: 0.000660  min_lr: 0.000660  loss: 2.5262 (2.5262)  weight_decay: 0.0500 (0.0500)  time: 4.9306  data: 4.1428  max mem: 69511
Epoch: [220]  [ 200/1251]  eta: 0:14:04  lr: 0.000657  min_lr: 0.000657  loss: 2.6968 (2.7578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9427 (0.9840)  time: 0.7839  data: 0.0006  max mem: 69511
Epoch: [220]  [ 400/1251]  eta: 0:11:17  lr: 0.000655  min_lr: 0.000655  loss: 2.7885 (2.7738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8625 (0.9429)  time: 0.7847  data: 0.0005  max mem: 69511
Epoch: [220]  [ 600/1251]  eta: 0:08:36  lr: 0.000652  min_lr: 0.000652  loss: 2.6692 (2.7723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8817 (0.9398)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [220]  [ 800/1251]  eta: 0:05:56  lr: 0.000650  min_lr: 0.000650  loss: 2.7577 (2.7712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8959 (0.9414)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [220]  [1000/1251]  eta: 0:03:18  lr: 0.000647  min_lr: 0.000647  loss: 2.8485 (2.7718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9398 (0.9447)  time: 0.7915  data: 0.0004  max mem: 69511
Epoch: [220]  [1200/1251]  eta: 0:00:40  lr: 0.000645  min_lr: 0.000645  loss: 2.7355 (2.7721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (0.9452)  time: 0.7851  data: 0.0006  max mem: 69511
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000644  min_lr: 0.000644  loss: 2.9652 (2.7752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (0.9454)  time: 0.6671  data: 0.0006  max mem: 69511
Epoch: [220] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.000644  min_lr: 0.000644  loss: 2.9652 (2.7749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (0.9454)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.7036 (0.7036)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.2503  data: 6.8221  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8261 (0.8318)  acc1: 86.0000 (86.3636)  acc5: 98.0000 (97.8909)  time: 1.0200  data: 0.6205  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9448 (0.9545)  acc1: 82.4000 (83.4857)  acc5: 96.8000 (96.8952)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0200 (0.9706)  acc1: 80.8000 (82.9120)  acc5: 96.0000 (96.8000)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6774 s / it)
* Acc@1 83.694 Acc@5 96.794 loss 0.951
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.84%
Epoch: [221]  [   0/1251]  eta: 1:33:42  lr: 0.000644  min_lr: 0.000644  loss: 2.9353 (2.9353)  weight_decay: 0.0500 (0.0500)  time: 4.4946  data: 3.1625  max mem: 69511
Epoch: [221]  [ 200/1251]  eta: 0:14:08  lr: 0.000642  min_lr: 0.000642  loss: 2.9378 (2.7647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9098 (0.9668)  time: 0.7939  data: 0.0004  max mem: 69511
Epoch: [221]  [ 400/1251]  eta: 0:11:18  lr: 0.000640  min_lr: 0.000640  loss: 2.7677 (2.7367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8862 (0.9400)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [221]  [ 600/1251]  eta: 0:08:36  lr: 0.000637  min_lr: 0.000637  loss: 2.8249 (2.7468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9397 (0.9366)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [221]  [ 800/1251]  eta: 0:05:57  lr: 0.000635  min_lr: 0.000635  loss: 2.8485 (2.7384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9352 (0.9464)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [221]  [1000/1251]  eta: 0:03:18  lr: 0.000632  min_lr: 0.000632  loss: 2.9262 (2.7532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9062 (0.9373)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [221]  [1200/1251]  eta: 0:00:40  lr: 0.000630  min_lr: 0.000630  loss: 2.8616 (2.7610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9055 (0.9339)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000629  min_lr: 0.000629  loss: 2.8655 (2.7617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9256 (0.9374)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [221] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.000629  min_lr: 0.000629  loss: 2.8655 (2.7705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9256 (0.9374)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.7405 (0.7405)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.0845  data: 6.6669  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8865 (0.8748)  acc1: 86.4000 (86.6182)  acc5: 98.0000 (98.0000)  time: 1.0052  data: 0.6064  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0106 (1.0029)  acc1: 81.6000 (84.0191)  acc5: 97.2000 (97.0667)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1053 (1.0136)  acc1: 81.6000 (83.4720)  acc5: 96.4000 (96.8800)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6721 s / it)
* Acc@1 83.870 Acc@5 96.848 loss 0.994
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.87%
Epoch: [222]  [   0/1251]  eta: 1:20:45  lr: 0.000629  min_lr: 0.000629  loss: 3.1433 (3.1433)  weight_decay: 0.0500 (0.0500)  time: 3.8732  data: 3.0830  max mem: 69511
Epoch: [222]  [ 200/1251]  eta: 0:14:03  lr: 0.000627  min_lr: 0.000627  loss: 2.7739 (2.7209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8541 (0.9336)  time: 0.7909  data: 0.0005  max mem: 69511
Epoch: [222]  [ 400/1251]  eta: 0:11:15  lr: 0.000624  min_lr: 0.000624  loss: 2.8291 (2.7450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (0.9350)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [222]  [ 600/1251]  eta: 0:08:35  lr: 0.000622  min_lr: 0.000622  loss: 2.9687 (2.7496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9524 (0.9374)  time: 0.7962  data: 0.0006  max mem: 69511
Epoch: [222]  [ 800/1251]  eta: 0:05:56  lr: 0.000620  min_lr: 0.000620  loss: 2.7700 (2.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8921 (0.9382)  time: 0.7830  data: 0.0006  max mem: 69511
Epoch: [222]  [1000/1251]  eta: 0:03:18  lr: 0.000617  min_lr: 0.000617  loss: 2.9174 (2.7578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8621 (0.9381)  time: 0.7828  data: 0.0005  max mem: 69511
Epoch: [222]  [1200/1251]  eta: 0:00:40  lr: 0.000615  min_lr: 0.000615  loss: 2.8189 (2.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0278 (0.9515)  time: 0.7848  data: 0.0006  max mem: 69511
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000614  min_lr: 0.000614  loss: 2.7211 (2.7570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9592 (0.9512)  time: 0.6672  data: 0.0005  max mem: 69511
Epoch: [222] Total time: 0:16:24 (0.7870 s / it)
Averaged stats: lr: 0.000614  min_lr: 0.000614  loss: 2.7211 (2.7623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9592 (0.9512)
Test:  [ 0/25]  eta: 0:03:23  loss: 0.5845 (0.5845)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 8.1475  data: 7.7330  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7788 (0.7422)  acc1: 87.2000 (87.1636)  acc5: 98.0000 (97.9636)  time: 1.1014  data: 0.7033  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8703 (0.8629)  acc1: 82.4000 (84.1143)  acc5: 97.2000 (97.1048)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9408 (0.8761)  acc1: 82.0000 (83.7760)  acc5: 96.8000 (96.9920)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7128 s / it)
* Acc@1 83.916 Acc@5 96.870 loss 0.867
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.92%
Epoch: [223]  [   0/1251]  eta: 1:33:15  lr: 0.000614  min_lr: 0.000614  loss: 3.2529 (3.2529)  weight_decay: 0.0500 (0.0500)  time: 4.4729  data: 3.6858  max mem: 69511
Epoch: [223]  [ 200/1251]  eta: 0:14:07  lr: 0.000612  min_lr: 0.000612  loss: 2.6948 (2.7455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9986 (0.9542)  time: 0.7852  data: 0.0006  max mem: 69511
Epoch: [223]  [ 400/1251]  eta: 0:11:17  lr: 0.000610  min_lr: 0.000610  loss: 2.6844 (2.7591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8772 (0.9498)  time: 0.7819  data: 0.0005  max mem: 69511
Epoch: [223]  [ 600/1251]  eta: 0:08:35  lr: 0.000607  min_lr: 0.000607  loss: 2.8090 (2.7386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9937 (0.9425)  time: 0.7823  data: 0.0006  max mem: 69511
Epoch: [223]  [ 800/1251]  eta: 0:05:56  lr: 0.000605  min_lr: 0.000605  loss: 2.9764 (2.7555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9324 (0.9487)  time: 0.7852  data: 0.0008  max mem: 69511
Epoch: [223]  [1000/1251]  eta: 0:03:18  lr: 0.000602  min_lr: 0.000602  loss: 2.9370 (2.7466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8957 (0.9454)  time: 0.7957  data: 0.0007  max mem: 69511
Epoch: [223]  [1200/1251]  eta: 0:00:40  lr: 0.000600  min_lr: 0.000600  loss: 2.8087 (2.7449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0084 (0.9509)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 2.9936 (2.7468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9820 (0.9545)  time: 0.6674  data: 0.0006  max mem: 69511
Epoch: [223] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 2.9936 (2.7567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9820 (0.9545)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7432 (0.7432)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 7.5086  data: 7.0821  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.9038 (0.8631)  acc1: 87.2000 (86.8000)  acc5: 98.0000 (97.9636)  time: 1.0438  data: 0.6441  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9995 (0.9820)  acc1: 82.4000 (84.0381)  acc5: 96.4000 (96.9524)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0477 (0.9964)  acc1: 82.4000 (83.5200)  acc5: 96.4000 (96.8800)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6888 s / it)
* Acc@1 83.782 Acc@5 96.756 loss 0.987
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.92%
Epoch: [224]  [   0/1251]  eta: 1:44:47  lr: 0.000599  min_lr: 0.000599  loss: 3.0860 (3.0860)  weight_decay: 0.0500 (0.0500)  time: 5.0259  data: 4.2475  max mem: 69511
Epoch: [224]  [ 200/1251]  eta: 0:14:08  lr: 0.000597  min_lr: 0.000597  loss: 2.9904 (2.7442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (nan)  time: 0.7867  data: 0.0006  max mem: 69511
Epoch: [224]  [ 400/1251]  eta: 0:11:19  lr: 0.000595  min_lr: 0.000595  loss: 2.7890 (2.7533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9220 (nan)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [224]  [ 600/1251]  eta: 0:08:37  lr: 0.000592  min_lr: 0.000592  loss: 2.8969 (2.7512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8728 (nan)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [224]  [ 800/1251]  eta: 0:05:57  lr: 0.000590  min_lr: 0.000590  loss: 2.8337 (2.7523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8871 (nan)  time: 0.7862  data: 0.0008  max mem: 69511
Epoch: [224]  [1000/1251]  eta: 0:03:18  lr: 0.000588  min_lr: 0.000588  loss: 2.9689 (2.7542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9187 (nan)  time: 0.7861  data: 0.0007  max mem: 69511
Epoch: [224]  [1200/1251]  eta: 0:00:40  lr: 0.000585  min_lr: 0.000585  loss: 2.8192 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (nan)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000585  min_lr: 0.000585  loss: 2.9925 (2.7472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (nan)  time: 0.6677  data: 0.0006  max mem: 69511
Epoch: [224] Total time: 0:16:28 (0.7900 s / it)
Averaged stats: lr: 0.000585  min_lr: 0.000585  loss: 2.9925 (2.7509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (nan)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.6498 (0.6498)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 7.5279  data: 7.0853  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8609 (0.8165)  acc1: 87.2000 (87.3818)  acc5: 97.6000 (97.7455)  time: 1.0452  data: 0.6444  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9323 (0.9429)  acc1: 83.6000 (84.0762)  acc5: 96.8000 (96.9143)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0520 (0.9552)  acc1: 82.0000 (83.6800)  acc5: 96.0000 (96.7840)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6887 s / it)
* Acc@1 83.924 Acc@5 96.788 loss 0.945
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.92%
Epoch: [225]  [   0/1251]  eta: 1:27:35  lr: 0.000585  min_lr: 0.000585  loss: 2.5080 (2.5080)  weight_decay: 0.0500 (0.0500)  time: 4.2014  data: 3.4082  max mem: 69511
Epoch: [225]  [ 200/1251]  eta: 0:14:05  lr: 0.000582  min_lr: 0.000582  loss: 2.5171 (2.6986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (0.9264)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [225]  [ 400/1251]  eta: 0:11:18  lr: 0.000580  min_lr: 0.000580  loss: 2.7563 (2.7049)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0587 (inf)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [225]  [ 600/1251]  eta: 0:08:36  lr: 0.000578  min_lr: 0.000578  loss: 2.7482 (2.7093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9527 (inf)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [225]  [ 800/1251]  eta: 0:05:57  lr: 0.000575  min_lr: 0.000575  loss: 2.8204 (2.7143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8450 (inf)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [225]  [1000/1251]  eta: 0:03:18  lr: 0.000573  min_lr: 0.000573  loss: 2.7347 (2.7152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9505 (inf)  time: 0.7825  data: 0.0005  max mem: 69511
Epoch: [225]  [1200/1251]  eta: 0:00:40  lr: 0.000571  min_lr: 0.000571  loss: 2.7075 (2.7241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9859 (inf)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000570  min_lr: 0.000570  loss: 2.9399 (2.7283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9582 (inf)  time: 0.6711  data: 0.0006  max mem: 69511
Epoch: [225] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.000570  min_lr: 0.000570  loss: 2.9399 (2.7444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9582 (inf)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6359 (0.6359)  acc1: 91.6000 (91.6000)  acc5: 98.8000 (98.8000)  time: 6.3842  data: 5.9596  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8235 (0.8027)  acc1: 87.6000 (87.4546)  acc5: 97.6000 (97.9636)  time: 0.9851  data: 0.5864  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9202 (0.9198)  acc1: 83.2000 (84.5143)  acc5: 96.8000 (96.9143)  time: 0.4208  data: 0.0246  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0030 (0.9318)  acc1: 82.4000 (84.0160)  acc5: 96.4000 (96.7520)  time: 0.3965  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6603 s / it)
* Acc@1 84.034 Acc@5 96.784 loss 0.924
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.03%
Epoch: [226]  [   0/1251]  eta: 1:34:36  lr: 0.000570  min_lr: 0.000570  loss: 3.0117 (3.0117)  weight_decay: 0.0500 (0.0500)  time: 4.5375  data: 3.7582  max mem: 69511
Epoch: [226]  [ 200/1251]  eta: 0:14:08  lr: 0.000568  min_lr: 0.000568  loss: 2.9333 (2.7604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8667 (0.9558)  time: 0.7851  data: 0.0004  max mem: 69511
Epoch: [226]  [ 400/1251]  eta: 0:11:18  lr: 0.000566  min_lr: 0.000566  loss: 2.6916 (2.7711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (0.9578)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [226]  [ 600/1251]  eta: 0:08:36  lr: 0.000563  min_lr: 0.000563  loss: 2.8414 (2.7668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (0.9514)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [226]  [ 800/1251]  eta: 0:05:57  lr: 0.000561  min_lr: 0.000561  loss: 2.7435 (2.7577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9900 (0.9694)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [226]  [1000/1251]  eta: 0:03:18  lr: 0.000559  min_lr: 0.000559  loss: 2.7643 (2.7523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.9662)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [226]  [1200/1251]  eta: 0:00:40  lr: 0.000556  min_lr: 0.000556  loss: 2.8013 (2.7619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9445 (0.9649)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000556  min_lr: 0.000556  loss: 2.7794 (2.7606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9445 (0.9657)  time: 0.6682  data: 0.0006  max mem: 69511
Epoch: [226] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.000556  min_lr: 0.000556  loss: 2.7794 (2.7473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9445 (0.9657)
Test:  [ 0/25]  eta: 0:03:23  loss: 0.6798 (0.6798)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 8.1595  data: 7.7422  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8484 (0.8240)  acc1: 86.8000 (87.2000)  acc5: 98.0000 (97.8909)  time: 1.1027  data: 0.7041  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9172 (0.9267)  acc1: 83.2000 (84.1905)  acc5: 96.8000 (96.9714)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9676 (0.9379)  acc1: 82.8000 (83.6160)  acc5: 96.4000 (96.8640)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7157 s / it)
* Acc@1 84.024 Acc@5 96.864 loss 0.925
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.03%
Epoch: [227]  [   0/1251]  eta: 1:47:01  lr: 0.000556  min_lr: 0.000556  loss: 2.6772 (2.6772)  weight_decay: 0.0500 (0.0500)  time: 5.1332  data: 4.3398  max mem: 69511
Epoch: [227]  [ 200/1251]  eta: 0:14:09  lr: 0.000553  min_lr: 0.000553  loss: 2.7853 (2.7253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9483 (0.9717)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [227]  [ 400/1251]  eta: 0:11:18  lr: 0.000551  min_lr: 0.000551  loss: 2.8817 (2.7274)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0187 (0.9639)  time: 0.7907  data: 0.0005  max mem: 69511
Epoch: [227]  [ 600/1251]  eta: 0:08:37  lr: 0.000549  min_lr: 0.000549  loss: 2.8344 (2.7311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0273 (0.9722)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [227]  [ 800/1251]  eta: 0:05:57  lr: 0.000547  min_lr: 0.000547  loss: 2.8770 (2.7329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9614 (0.9743)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [227]  [1000/1251]  eta: 0:03:18  lr: 0.000544  min_lr: 0.000544  loss: 2.8834 (2.7372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9232 (0.9733)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [227]  [1200/1251]  eta: 0:00:40  lr: 0.000542  min_lr: 0.000542  loss: 2.7289 (2.7456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0070 (0.9683)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000542  min_lr: 0.000542  loss: 2.8359 (2.7480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9054 (0.9665)  time: 0.6680  data: 0.0005  max mem: 69511
Epoch: [227] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.000542  min_lr: 0.000542  loss: 2.8359 (2.7453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9054 (0.9665)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.7656 (0.7656)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 7.8521  data: 7.4375  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8774 (0.8715)  acc1: 86.8000 (86.9818)  acc5: 98.0000 (97.8546)  time: 1.0750  data: 0.6764  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9653 (0.9875)  acc1: 83.2000 (84.0952)  acc5: 96.8000 (96.9143)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0490 (0.9998)  acc1: 83.2000 (83.6320)  acc5: 96.4000 (96.8000)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7022 s / it)
* Acc@1 83.884 Acc@5 96.838 loss 0.993
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 84.03%
Epoch: [228]  [   0/1251]  eta: 1:34:58  lr: 0.000541  min_lr: 0.000541  loss: 3.0316 (3.0316)  weight_decay: 0.0500 (0.0500)  time: 4.5554  data: 3.7426  max mem: 69511
Epoch: [228]  [ 200/1251]  eta: 0:14:06  lr: 0.000539  min_lr: 0.000539  loss: 2.8240 (2.7849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0169 (0.9862)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [228]  [ 400/1251]  eta: 0:11:18  lr: 0.000537  min_lr: 0.000537  loss: 2.8831 (2.7682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0104 (0.9711)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [228]  [ 600/1251]  eta: 0:08:37  lr: 0.000535  min_lr: 0.000535  loss: 2.8112 (2.7610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9228 (0.9770)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [228]  [ 800/1251]  eta: 0:05:57  lr: 0.000532  min_lr: 0.000532  loss: 2.8477 (2.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9364 (0.9854)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [228]  [1000/1251]  eta: 0:03:18  lr: 0.000530  min_lr: 0.000530  loss: 2.7634 (2.7601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0430 (0.9931)  time: 0.7884  data: 0.0005  max mem: 69511
Epoch: [228]  [1200/1251]  eta: 0:00:40  lr: 0.000528  min_lr: 0.000528  loss: 2.7232 (2.7519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8656 (0.9841)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000527  min_lr: 0.000527  loss: 2.6698 (2.7499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8425 (0.9792)  time: 0.6678  data: 0.0007  max mem: 69511
Epoch: [228] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.000527  min_lr: 0.000527  loss: 2.6698 (2.7307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8425 (0.9792)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.6120 (0.6120)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 7.5385  data: 7.1082  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7665 (0.7297)  acc1: 87.6000 (86.9091)  acc5: 98.0000 (97.9273)  time: 1.0461  data: 0.6465  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8710 (0.8592)  acc1: 83.2000 (84.0571)  acc5: 96.8000 (96.9333)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9122 (0.8726)  acc1: 81.2000 (83.6320)  acc5: 96.4000 (96.8160)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6874 s / it)
* Acc@1 83.962 Acc@5 96.816 loss 0.857
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.03%
Epoch: [229]  [   0/1251]  eta: 1:42:28  lr: 0.000527  min_lr: 0.000527  loss: 2.7997 (2.7997)  weight_decay: 0.0500 (0.0500)  time: 4.9145  data: 3.0612  max mem: 69511
Epoch: [229]  [ 200/1251]  eta: 0:14:08  lr: 0.000525  min_lr: 0.000525  loss: 2.9416 (2.7087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9007 (1.0042)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [229]  [ 400/1251]  eta: 0:11:18  lr: 0.000523  min_lr: 0.000523  loss: 2.8432 (2.7372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9596 (1.0124)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [229]  [ 600/1251]  eta: 0:08:36  lr: 0.000521  min_lr: 0.000521  loss: 2.5689 (2.7493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9666 (1.0025)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [229]  [ 800/1251]  eta: 0:05:57  lr: 0.000518  min_lr: 0.000518  loss: 2.8636 (2.7412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (1.0096)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [229]  [1000/1251]  eta: 0:03:18  lr: 0.000516  min_lr: 0.000516  loss: 2.9368 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0135 (1.0028)  time: 0.7840  data: 0.0005  max mem: 69511
Epoch: [229]  [1200/1251]  eta: 0:00:40  lr: 0.000514  min_lr: 0.000514  loss: 2.5848 (2.7322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9104 (0.9953)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000513  min_lr: 0.000513  loss: 2.8337 (2.7294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9426 (0.9967)  time: 0.6730  data: 0.0006  max mem: 69511
Epoch: [229] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.000513  min_lr: 0.000513  loss: 2.8337 (2.7336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9426 (0.9967)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.6727 (0.6727)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 7.7319  data: 7.3186  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8545 (0.8098)  acc1: 86.8000 (86.6546)  acc5: 98.0000 (98.0364)  time: 1.0641  data: 0.6656  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9279 (0.9322)  acc1: 83.2000 (83.7905)  acc5: 96.8000 (97.0857)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0061 (0.9456)  acc1: 82.4000 (83.4400)  acc5: 96.4000 (96.8640)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6973 s / it)
* Acc@1 83.912 Acc@5 96.868 loss 0.930
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 84.03%
Epoch: [230]  [   0/1251]  eta: 1:38:36  lr: 0.000513  min_lr: 0.000513  loss: 3.0627 (3.0627)  weight_decay: 0.0500 (0.0500)  time: 4.7298  data: 3.9291  max mem: 69511
Epoch: [230]  [ 200/1251]  eta: 0:14:08  lr: 0.000511  min_lr: 0.000511  loss: 2.5761 (2.6815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9255 (1.0672)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [230]  [ 400/1251]  eta: 0:11:18  lr: 0.000509  min_lr: 0.000509  loss: 2.8255 (2.7143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9978 (1.0393)  time: 0.7932  data: 0.0005  max mem: 69511
Epoch: [230]  [ 600/1251]  eta: 0:08:36  lr: 0.000507  min_lr: 0.000507  loss: 2.9581 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9325 (1.0256)  time: 0.7887  data: 0.0004  max mem: 69511
Epoch: [230]  [ 800/1251]  eta: 0:05:57  lr: 0.000505  min_lr: 0.000505  loss: 2.9419 (2.7256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9463 (1.0136)  time: 0.7829  data: 0.0004  max mem: 69511
Epoch: [230]  [1000/1251]  eta: 0:03:18  lr: 0.000502  min_lr: 0.000502  loss: 2.8941 (2.7170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0602 (1.0173)  time: 0.7907  data: 0.0005  max mem: 69511
Epoch: [230]  [1200/1251]  eta: 0:00:40  lr: 0.000500  min_lr: 0.000500  loss: 2.8760 (2.7198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8860 (1.0106)  time: 0.7878  data: 0.0004  max mem: 69511
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000500  min_lr: 0.000500  loss: 2.6614 (2.7182)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.0142)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [230] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.000500  min_lr: 0.000500  loss: 2.6614 (2.7212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.0142)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.6491 (0.6491)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 7.4895  data: 7.0626  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8380 (0.7975)  acc1: 86.8000 (86.7273)  acc5: 98.4000 (98.0000)  time: 1.0416  data: 0.6423  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9057 (0.9163)  acc1: 82.4000 (83.8095)  acc5: 97.2000 (97.1238)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0031 (0.9259)  acc1: 82.0000 (83.4720)  acc5: 96.4000 (96.9760)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6868 s / it)
* Acc@1 84.034 Acc@5 96.844 loss 0.915
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.03%
Epoch: [231]  [   0/1251]  eta: 1:37:36  lr: 0.000500  min_lr: 0.000500  loss: 2.3159 (2.3159)  weight_decay: 0.0500 (0.0500)  time: 4.6815  data: 3.8931  max mem: 69511
Epoch: [231]  [ 200/1251]  eta: 0:14:06  lr: 0.000497  min_lr: 0.000497  loss: 2.7389 (2.7315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9854 (1.0208)  time: 0.7901  data: 0.0004  max mem: 69511
Epoch: [231]  [ 400/1251]  eta: 0:11:18  lr: 0.000495  min_lr: 0.000495  loss: 2.5701 (2.7102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0634 (1.0345)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [231]  [ 600/1251]  eta: 0:08:36  lr: 0.000493  min_lr: 0.000493  loss: 2.7919 (2.7076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8920 (1.0041)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [231]  [ 800/1251]  eta: 0:05:57  lr: 0.000491  min_lr: 0.000491  loss: 2.7552 (2.7082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0659 (1.0106)  time: 0.7926  data: 0.0009  max mem: 69511
Epoch: [231]  [1000/1251]  eta: 0:03:18  lr: 0.000489  min_lr: 0.000489  loss: 2.5201 (2.7048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9823 (1.0231)  time: 0.7878  data: 0.0004  max mem: 69511
Epoch: [231]  [1200/1251]  eta: 0:00:40  lr: 0.000486  min_lr: 0.000486  loss: 2.7574 (2.7138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0382 (1.0216)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000486  min_lr: 0.000486  loss: 2.7849 (2.7137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9307 (1.0194)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [231] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.000486  min_lr: 0.000486  loss: 2.7849 (2.7148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9307 (1.0194)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.6859 (0.6859)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.7034  data: 7.2647  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8534 (0.8246)  acc1: 86.4000 (86.5091)  acc5: 97.6000 (97.7818)  time: 1.0611  data: 0.6607  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9458 (0.9442)  acc1: 82.4000 (83.8095)  acc5: 96.4000 (96.9143)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0299 (0.9541)  acc1: 81.6000 (83.4080)  acc5: 96.4000 (96.8480)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6944 s / it)
* Acc@1 83.940 Acc@5 96.846 loss 0.943
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 84.03%
Epoch: [232]  [   0/1251]  eta: 1:29:37  lr: 0.000486  min_lr: 0.000486  loss: 2.8667 (2.8667)  weight_decay: 0.0500 (0.0500)  time: 4.2984  data: 3.4879  max mem: 69511
Epoch: [232]  [ 200/1251]  eta: 0:14:04  lr: 0.000484  min_lr: 0.000484  loss: 2.8374 (2.7097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0229 (0.9738)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [232]  [ 400/1251]  eta: 0:11:18  lr: 0.000482  min_lr: 0.000482  loss: 2.6901 (2.7268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1890 (1.0462)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [232]  [ 600/1251]  eta: 0:08:36  lr: 0.000479  min_lr: 0.000479  loss: 2.7064 (2.7201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0312 (1.0505)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [232]  [ 800/1251]  eta: 0:05:57  lr: 0.000477  min_lr: 0.000477  loss: 2.6485 (2.7148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9449 (1.0286)  time: 0.7933  data: 0.0005  max mem: 69511
Epoch: [232]  [1000/1251]  eta: 0:03:18  lr: 0.000475  min_lr: 0.000475  loss: 2.7486 (2.7190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9886 (1.0219)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [232]  [1200/1251]  eta: 0:00:40  lr: 0.000473  min_lr: 0.000473  loss: 2.5057 (2.7136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0045 (1.0180)  time: 0.7896  data: 0.0005  max mem: 69511
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000473  min_lr: 0.000473  loss: 2.9055 (2.7135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9592 (1.0168)  time: 0.6683  data: 0.0006  max mem: 69511
Epoch: [232] Total time: 0:16:28 (0.7898 s / it)
Averaged stats: lr: 0.000473  min_lr: 0.000473  loss: 2.9055 (2.7143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9592 (1.0168)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6563 (0.6563)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.2236  data: 4.7866  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7589 (0.7468)  acc1: 86.4000 (86.5091)  acc5: 97.6000 (97.8182)  time: 1.0244  data: 0.6220  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8594 (0.8666)  acc1: 82.8000 (84.0000)  acc5: 96.8000 (96.8191)  time: 0.5008  data: 0.1028  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9313 (0.8789)  acc1: 82.4000 (83.6000)  acc5: 96.8000 (96.8160)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6778 s / it)
* Acc@1 84.026 Acc@5 96.868 loss 0.863
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.03%
Epoch: [233]  [   0/1251]  eta: 1:40:37  lr: 0.000472  min_lr: 0.000472  loss: 3.1118 (3.1118)  weight_decay: 0.0500 (0.0500)  time: 4.8262  data: 3.1246  max mem: 69511
Epoch: [233]  [ 200/1251]  eta: 0:14:09  lr: 0.000470  min_lr: 0.000470  loss: 2.8373 (2.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9832 (0.9747)  time: 0.7907  data: 0.0005  max mem: 69511
Epoch: [233]  [ 400/1251]  eta: 0:11:19  lr: 0.000468  min_lr: 0.000468  loss: 2.9697 (2.7098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9583 (0.9922)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [233]  [ 600/1251]  eta: 0:08:37  lr: 0.000466  min_lr: 0.000466  loss: 2.7635 (2.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (1.0069)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [233]  [ 800/1251]  eta: 0:05:57  lr: 0.000464  min_lr: 0.000464  loss: 2.9501 (2.7164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9649 (1.0196)  time: 0.7910  data: 0.0005  max mem: 69511
Epoch: [233]  [1000/1251]  eta: 0:03:18  lr: 0.000462  min_lr: 0.000462  loss: 2.8870 (2.7233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9193 (1.0089)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [233]  [1200/1251]  eta: 0:00:40  lr: 0.000460  min_lr: 0.000460  loss: 2.6505 (2.7233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9496 (1.0067)  time: 0.7916  data: 0.0004  max mem: 69511
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000459  min_lr: 0.000459  loss: 2.6837 (2.7213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9155 (1.0019)  time: 0.6679  data: 0.0006  max mem: 69511
Epoch: [233] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.000459  min_lr: 0.000459  loss: 2.6837 (2.7122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9155 (1.0019)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.6342 (0.6342)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 7.3467  data: 6.9045  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7687 (0.7517)  acc1: 87.2000 (87.0909)  acc5: 97.6000 (98.0364)  time: 1.0292  data: 0.6280  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8470 (0.8832)  acc1: 82.8000 (84.0191)  acc5: 96.8000 (96.9905)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0004 (0.8983)  acc1: 82.4000 (83.5840)  acc5: 96.0000 (96.8000)  time: 0.3971  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6826 s / it)
* Acc@1 84.076 Acc@5 96.816 loss 0.881
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.08%
Epoch: [234]  [   0/1251]  eta: 1:18:00  lr: 0.000459  min_lr: 0.000459  loss: 1.8560 (1.8560)  weight_decay: 0.0500 (0.0500)  time: 3.7416  data: 2.9482  max mem: 69511
Epoch: [234]  [ 200/1251]  eta: 0:14:03  lr: 0.000457  min_lr: 0.000457  loss: 2.8272 (2.7015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0356 (1.0657)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [234]  [ 400/1251]  eta: 0:11:15  lr: 0.000455  min_lr: 0.000455  loss: 2.8021 (2.7157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9380 (1.0188)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [234]  [ 600/1251]  eta: 0:08:36  lr: 0.000453  min_lr: 0.000453  loss: 2.7259 (2.7032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (1.0219)  time: 0.7911  data: 0.0004  max mem: 69511
Epoch: [234]  [ 800/1251]  eta: 0:05:56  lr: 0.000451  min_lr: 0.000451  loss: 2.7726 (2.6994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0391 (1.0214)  time: 0.7827  data: 0.0004  max mem: 69511
Epoch: [234]  [1000/1251]  eta: 0:03:18  lr: 0.000449  min_lr: 0.000449  loss: 2.9059 (2.7085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9636 (1.0135)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [234]  [1200/1251]  eta: 0:00:40  lr: 0.000447  min_lr: 0.000447  loss: 2.7618 (2.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0431 (1.0143)  time: 0.7913  data: 0.0004  max mem: 69511
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000446  min_lr: 0.000446  loss: 2.8556 (2.7022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (1.0102)  time: 0.6672  data: 0.0005  max mem: 69511
Epoch: [234] Total time: 0:16:25 (0.7874 s / it)
Averaged stats: lr: 0.000446  min_lr: 0.000446  loss: 2.8556 (2.7045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (1.0102)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.7258 (0.7258)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 7.3821  data: 6.9375  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8629 (0.8450)  acc1: 86.0000 (86.8727)  acc5: 97.6000 (97.9636)  time: 1.0322  data: 0.6310  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9451 (0.9762)  acc1: 82.8000 (84.1524)  acc5: 96.8000 (96.9905)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0946 (0.9916)  acc1: 81.6000 (83.5840)  acc5: 96.4000 (96.8960)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6823 s / it)
* Acc@1 83.954 Acc@5 96.872 loss 0.978
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.08%
Epoch: [235]  [   0/1251]  eta: 1:41:42  lr: 0.000446  min_lr: 0.000446  loss: 1.8418 (1.8418)  weight_decay: 0.0500 (0.0500)  time: 4.8779  data: 2.2218  max mem: 69511
Epoch: [235]  [ 200/1251]  eta: 0:14:06  lr: 0.000444  min_lr: 0.000444  loss: 2.7849 (2.6548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0487 (0.9817)  time: 0.7828  data: 0.0005  max mem: 69511
Epoch: [235]  [ 400/1251]  eta: 0:11:16  lr: 0.000442  min_lr: 0.000442  loss: 2.9128 (2.6921)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0453 (1.0039)  time: 0.7829  data: 0.0005  max mem: 69511
Epoch: [235]  [ 600/1251]  eta: 0:08:36  lr: 0.000440  min_lr: 0.000440  loss: 2.6604 (2.6941)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0297 (1.0111)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [235]  [ 800/1251]  eta: 0:05:56  lr: 0.000438  min_lr: 0.000438  loss: 2.7783 (2.7040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9672 (1.0147)  time: 0.7820  data: 0.0006  max mem: 69511
Epoch: [235]  [1000/1251]  eta: 0:03:18  lr: 0.000436  min_lr: 0.000436  loss: 2.6209 (2.6943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9621 (1.0116)  time: 0.7907  data: 0.0005  max mem: 69511
Epoch: [235]  [1200/1251]  eta: 0:00:40  lr: 0.000433  min_lr: 0.000433  loss: 2.8292 (2.6947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1000 (1.0214)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000433  min_lr: 0.000433  loss: 2.8261 (2.6946)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0256 (1.0218)  time: 0.6680  data: 0.0007  max mem: 69511
Epoch: [235] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.000433  min_lr: 0.000433  loss: 2.8261 (2.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0256 (1.0218)
Test:  [ 0/25]  eta: 0:02:54  loss: 0.7383 (0.7383)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 6.9914  data: 6.5597  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8641 (0.8520)  acc1: 87.6000 (87.2727)  acc5: 98.0000 (97.8545)  time: 0.9966  data: 0.5967  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9539 (0.9608)  acc1: 83.2000 (84.4191)  acc5: 96.8000 (96.9143)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0193 (0.9748)  acc1: 83.2000 (83.8720)  acc5: 96.0000 (96.7680)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6692 s / it)
* Acc@1 84.150 Acc@5 96.828 loss 0.964
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.15%
Epoch: [236]  [   0/1251]  eta: 1:26:07  lr: 0.000433  min_lr: 0.000433  loss: 1.9486 (1.9486)  weight_decay: 0.0500 (0.0500)  time: 4.1308  data: 3.3500  max mem: 69511
Epoch: [236]  [ 200/1251]  eta: 0:14:05  lr: 0.000431  min_lr: 0.000431  loss: 2.7622 (2.6846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9581 (1.0230)  time: 0.7962  data: 0.0006  max mem: 69511
Epoch: [236]  [ 400/1251]  eta: 0:11:17  lr: 0.000429  min_lr: 0.000429  loss: 2.8810 (2.7061)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0146 (1.0108)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [236]  [ 600/1251]  eta: 0:08:36  lr: 0.000427  min_lr: 0.000427  loss: 2.9210 (2.6941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9619 (1.0087)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [236]  [ 800/1251]  eta: 0:05:57  lr: 0.000425  min_lr: 0.000425  loss: 2.6638 (2.6877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9688 (1.0132)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [236]  [1000/1251]  eta: 0:03:18  lr: 0.000423  min_lr: 0.000423  loss: 2.8780 (2.6804)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0644 (1.0224)  time: 0.7825  data: 0.0005  max mem: 69511
Epoch: [236]  [1200/1251]  eta: 0:00:40  lr: 0.000421  min_lr: 0.000421  loss: 2.7835 (2.6815)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0204 (1.0138)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000420  min_lr: 0.000420  loss: 2.3504 (2.6784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9395 (1.0136)  time: 0.6668  data: 0.0006  max mem: 69511
Epoch: [236] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.000420  min_lr: 0.000420  loss: 2.3504 (2.6914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9395 (1.0136)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.5589 (0.5589)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.6435  data: 7.2326  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7103 (0.6818)  acc1: 86.8000 (87.0909)  acc5: 98.0000 (98.0000)  time: 1.0564  data: 0.6578  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8235 (0.7998)  acc1: 82.8000 (84.3619)  acc5: 96.8000 (97.0857)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.8827 (0.8105)  acc1: 82.0000 (83.9200)  acc5: 96.4000 (96.9920)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6947 s / it)
* Acc@1 84.268 Acc@5 96.916 loss 0.797
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.27%
Epoch: [237]  [   0/1251]  eta: 1:21:25  lr: 0.000420  min_lr: 0.000420  loss: 2.9635 (2.9635)  weight_decay: 0.0500 (0.0500)  time: 3.9050  data: 3.1256  max mem: 69511
Epoch: [237]  [ 200/1251]  eta: 0:14:03  lr: 0.000418  min_lr: 0.000418  loss: 2.8668 (2.6838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0218 (1.0533)  time: 0.7901  data: 0.0004  max mem: 69511
Epoch: [237]  [ 400/1251]  eta: 0:11:14  lr: 0.000416  min_lr: 0.000416  loss: 2.4898 (2.6942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0343 (1.0622)  time: 0.7836  data: 0.0005  max mem: 69511
Epoch: [237]  [ 600/1251]  eta: 0:08:34  lr: 0.000414  min_lr: 0.000414  loss: 2.6890 (2.6772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9183 (1.0399)  time: 0.7897  data: 0.0004  max mem: 69511
Epoch: [237]  [ 800/1251]  eta: 0:05:56  lr: 0.000412  min_lr: 0.000412  loss: 2.6742 (2.6842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9690 (1.0222)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [237]  [1000/1251]  eta: 0:03:18  lr: 0.000410  min_lr: 0.000410  loss: 2.6645 (2.6841)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0925 (1.0236)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [237]  [1200/1251]  eta: 0:00:40  lr: 0.000408  min_lr: 0.000408  loss: 2.7680 (2.6895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8882 (1.0260)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000408  min_lr: 0.000408  loss: 2.7892 (2.6914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9420 (1.0235)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [237] Total time: 0:16:24 (0.7872 s / it)
Averaged stats: lr: 0.000408  min_lr: 0.000408  loss: 2.7892 (2.6875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9420 (1.0235)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.6813 (0.6813)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.1616  data: 6.7338  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8338 (0.8060)  acc1: 86.4000 (86.6182)  acc5: 98.0000 (97.9636)  time: 1.0121  data: 0.6124  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9355 (0.9224)  acc1: 83.2000 (84.3238)  acc5: 96.8000 (96.9143)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0118 (0.9372)  acc1: 83.2000 (83.8400)  acc5: 96.0000 (96.7680)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6732 s / it)
* Acc@1 84.046 Acc@5 96.884 loss 0.925
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.27%
Epoch: [238]  [   0/1251]  eta: 1:38:30  lr: 0.000408  min_lr: 0.000408  loss: 2.7759 (2.7759)  weight_decay: 0.0500 (0.0500)  time: 4.7250  data: 3.4306  max mem: 69511
Epoch: [238]  [ 200/1251]  eta: 0:14:08  lr: 0.000405  min_lr: 0.000405  loss: 2.8497 (2.7217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1260 (1.0442)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [238]  [ 400/1251]  eta: 0:11:18  lr: 0.000403  min_lr: 0.000403  loss: 2.2596 (2.6987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0513 (1.0772)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [238]  [ 600/1251]  eta: 0:08:36  lr: 0.000401  min_lr: 0.000401  loss: 2.8746 (2.6905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9292 (1.0536)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [238]  [ 800/1251]  eta: 0:05:57  lr: 0.000399  min_lr: 0.000399  loss: 2.9231 (2.6903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (1.0504)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [238]  [1000/1251]  eta: 0:03:18  lr: 0.000397  min_lr: 0.000397  loss: 2.2725 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (1.0441)  time: 0.7832  data: 0.0004  max mem: 69511
Epoch: [238]  [1200/1251]  eta: 0:00:40  lr: 0.000395  min_lr: 0.000395  loss: 2.4656 (2.6941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9704 (1.0475)  time: 0.7883  data: 0.0004  max mem: 69511
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000395  min_lr: 0.000395  loss: 2.7025 (2.6923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9800 (1.0478)  time: 0.6640  data: 0.0006  max mem: 69511
Epoch: [238] Total time: 0:16:26 (0.7883 s / it)
Averaged stats: lr: 0.000395  min_lr: 0.000395  loss: 2.7025 (2.6899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9800 (1.0478)
Test:  [ 0/25]  eta: 0:03:00  loss: 0.6363 (0.6363)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.2323  data: 6.7954  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7937 (0.7665)  acc1: 86.0000 (86.5455)  acc5: 98.0000 (98.0000)  time: 1.0157  data: 0.6180  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8678 (0.8864)  acc1: 82.0000 (83.8857)  acc5: 97.2000 (97.1048)  time: 0.3940  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9628 (0.9008)  acc1: 82.0000 (83.3280)  acc5: 96.8000 (96.9600)  time: 0.3939  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6788 s / it)
* Acc@1 84.144 Acc@5 96.928 loss 0.890
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.27%
Epoch: [239]  [   0/1251]  eta: 1:34:23  lr: 0.000395  min_lr: 0.000395  loss: 2.3320 (2.3320)  weight_decay: 0.0500 (0.0500)  time: 4.5272  data: 3.7315  max mem: 69511
Epoch: [239]  [ 200/1251]  eta: 0:14:03  lr: 0.000393  min_lr: 0.000393  loss: 2.6769 (2.6396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9039 (0.9837)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [239]  [ 400/1251]  eta: 0:11:17  lr: 0.000391  min_lr: 0.000391  loss: 2.7752 (2.6593)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0165 (1.0050)  time: 0.7988  data: 0.0004  max mem: 69511
Epoch: [239]  [ 600/1251]  eta: 0:08:36  lr: 0.000389  min_lr: 0.000389  loss: 2.8323 (2.6674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0081 (1.0214)  time: 0.7868  data: 0.0005  max mem: 69511
Epoch: [239]  [ 800/1251]  eta: 0:05:57  lr: 0.000387  min_lr: 0.000387  loss: 2.5506 (2.6610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9781 (1.0207)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [239]  [1000/1251]  eta: 0:03:18  lr: 0.000385  min_lr: 0.000385  loss: 2.9079 (2.6574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9639 (1.0329)  time: 0.7939  data: 0.0007  max mem: 69511
Epoch: [239]  [1200/1251]  eta: 0:00:40  lr: 0.000383  min_lr: 0.000383  loss: 2.8998 (2.6670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0535 (1.0260)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.8221 (2.6695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.0216)  time: 0.6680  data: 0.0005  max mem: 69511
Epoch: [239] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.8221 (2.6817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.0216)
Test:  [ 0/25]  eta: 0:02:54  loss: 0.6648 (0.6648)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.9748  data: 6.5527  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8143 (0.7989)  acc1: 87.2000 (87.1636)  acc5: 98.0000 (98.0727)  time: 0.9952  data: 0.5960  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9040 (0.9310)  acc1: 82.8000 (84.3048)  acc5: 97.2000 (97.1810)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0325 (0.9425)  acc1: 82.4000 (83.9200)  acc5: 96.4000 (97.0080)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6675 s / it)
* Acc@1 84.228 Acc@5 96.958 loss 0.933
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [240]  [   0/1251]  eta: 1:38:31  lr: 0.000383  min_lr: 0.000383  loss: 2.4974 (2.4974)  weight_decay: 0.0500 (0.0500)  time: 4.7258  data: 3.9391  max mem: 69511
Epoch: [240]  [ 200/1251]  eta: 0:14:07  lr: 0.000381  min_lr: 0.000381  loss: 2.7801 (2.6982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9245 (inf)  time: 0.7816  data: 0.0004  max mem: 69511
Epoch: [240]  [ 400/1251]  eta: 0:11:16  lr: 0.000379  min_lr: 0.000379  loss: 2.8861 (2.6693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0905 (inf)  time: 0.7827  data: 0.0004  max mem: 69511
Epoch: [240]  [ 600/1251]  eta: 0:08:35  lr: 0.000377  min_lr: 0.000377  loss: 2.7103 (2.6624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9389 (inf)  time: 0.7819  data: 0.0005  max mem: 69511
Epoch: [240]  [ 800/1251]  eta: 0:05:56  lr: 0.000375  min_lr: 0.000375  loss: 2.7672 (2.6674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1459 (inf)  time: 0.7872  data: 0.0004  max mem: 69511
Epoch: [240]  [1000/1251]  eta: 0:03:17  lr: 0.000373  min_lr: 0.000373  loss: 2.7237 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0185 (inf)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [240]  [1200/1251]  eta: 0:00:40  lr: 0.000371  min_lr: 0.000371  loss: 2.5536 (2.6771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9605 (inf)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000371  min_lr: 0.000371  loss: 2.6379 (2.6747)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1624 (inf)  time: 0.6713  data: 0.0006  max mem: 69511
Epoch: [240] Total time: 0:16:24 (0.7869 s / it)
Averaged stats: lr: 0.000371  min_lr: 0.000371  loss: 2.6379 (2.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1624 (inf)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6007 (0.6007)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 6.3804  data: 5.9526  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.7659 (0.7256)  acc1: 87.6000 (87.5636)  acc5: 98.0000 (98.0727)  time: 0.9410  data: 0.5415  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8528 (0.8530)  acc1: 83.2000 (84.5524)  acc5: 96.8000 (97.0286)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9542 (0.8662)  acc1: 83.2000 (84.1120)  acc5: 96.4000 (96.9440)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6423 s / it)
* Acc@1 84.184 Acc@5 96.980 loss 0.856
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [241]  [   0/1251]  eta: 1:42:01  lr: 0.000371  min_lr: 0.000371  loss: 2.9706 (2.9706)  weight_decay: 0.0500 (0.0500)  time: 4.8934  data: 3.9784  max mem: 69511
Epoch: [241]  [ 200/1251]  eta: 0:14:07  lr: 0.000369  min_lr: 0.000369  loss: 2.8222 (2.7186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9953 (1.0351)  time: 0.7901  data: 0.0004  max mem: 69511
Epoch: [241]  [ 400/1251]  eta: 0:11:16  lr: 0.000367  min_lr: 0.000367  loss: 2.8502 (2.7026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0304 (1.0372)  time: 0.7894  data: 0.0004  max mem: 69511
Epoch: [241]  [ 600/1251]  eta: 0:08:35  lr: 0.000365  min_lr: 0.000365  loss: 2.8088 (2.6968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0768 (1.0544)  time: 0.7822  data: 0.0004  max mem: 69511
Epoch: [241]  [ 800/1251]  eta: 0:05:56  lr: 0.000363  min_lr: 0.000363  loss: 2.6777 (2.6981)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0236 (1.0513)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [241]  [1000/1251]  eta: 0:03:18  lr: 0.000361  min_lr: 0.000361  loss: 2.8259 (2.6914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0141 (1.0488)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [241]  [1200/1251]  eta: 0:00:40  lr: 0.000359  min_lr: 0.000359  loss: 2.7317 (2.6917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0356 (1.0456)  time: 0.7908  data: 0.0004  max mem: 69511
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000359  min_lr: 0.000359  loss: 2.5722 (2.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (1.0451)  time: 0.6729  data: 0.0005  max mem: 69511
Epoch: [241] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.000359  min_lr: 0.000359  loss: 2.5722 (2.6701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (1.0451)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.6092 (0.6092)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 7.1136  data: 6.6771  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7594 (0.7321)  acc1: 86.4000 (87.1273)  acc5: 98.0000 (98.0364)  time: 1.0074  data: 0.6073  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8314 (0.8539)  acc1: 83.2000 (84.5905)  acc5: 96.8000 (97.1238)  time: 0.3967  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9159 (0.8690)  acc1: 82.8000 (84.0800)  acc5: 96.4000 (96.9760)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6720 s / it)
* Acc@1 84.202 Acc@5 96.956 loss 0.858
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [242]  [   0/1251]  eta: 1:38:54  lr: 0.000359  min_lr: 0.000359  loss: 2.1262 (2.1262)  weight_decay: 0.0500 (0.0500)  time: 4.7435  data: 3.9501  max mem: 69511
Epoch: [242]  [ 200/1251]  eta: 0:14:06  lr: 0.000357  min_lr: 0.000357  loss: 2.6807 (2.6209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9850 (1.0344)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [242]  [ 400/1251]  eta: 0:11:17  lr: 0.000355  min_lr: 0.000355  loss: 2.8366 (2.6396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9406 (1.0128)  time: 0.7900  data: 0.0005  max mem: 69511
Epoch: [242]  [ 600/1251]  eta: 0:08:36  lr: 0.000353  min_lr: 0.000353  loss: 2.6499 (2.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0702 (1.0195)  time: 0.7874  data: 0.0004  max mem: 69511
Epoch: [242]  [ 800/1251]  eta: 0:05:56  lr: 0.000351  min_lr: 0.000351  loss: 2.6970 (2.6542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9747 (1.0157)  time: 0.7903  data: 0.0004  max mem: 69511
Epoch: [242]  [1000/1251]  eta: 0:03:18  lr: 0.000349  min_lr: 0.000349  loss: 2.6268 (2.6565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9552 (1.0189)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [242]  [1200/1251]  eta: 0:00:40  lr: 0.000347  min_lr: 0.000347  loss: 2.8585 (2.6648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9971 (1.0195)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000347  min_lr: 0.000347  loss: 2.8173 (2.6667)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0589 (1.0229)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [242] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.000347  min_lr: 0.000347  loss: 2.8173 (2.6633)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0589 (1.0229)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7397 (0.7397)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.5984  data: 7.1745  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8833 (0.8698)  acc1: 87.2000 (87.4182)  acc5: 97.6000 (97.8909)  time: 1.0518  data: 0.6525  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9901 (1.0012)  acc1: 83.6000 (84.5905)  acc5: 96.8000 (97.0286)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1100 (1.0148)  acc1: 83.6000 (84.0480)  acc5: 96.4000 (96.8320)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6907 s / it)
* Acc@1 84.268 Acc@5 96.914 loss 1.002
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.27%
Epoch: [243]  [   0/1251]  eta: 1:33:24  lr: 0.000347  min_lr: 0.000347  loss: 1.8745 (1.8745)  weight_decay: 0.0500 (0.0500)  time: 4.4800  data: 3.6100  max mem: 69511
Epoch: [243]  [ 200/1251]  eta: 0:14:03  lr: 0.000345  min_lr: 0.000345  loss: 2.7193 (2.6290)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0200 (1.1129)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [243]  [ 400/1251]  eta: 0:11:15  lr: 0.000343  min_lr: 0.000343  loss: 2.6116 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0293 (1.0804)  time: 0.7958  data: 0.0004  max mem: 69511
Epoch: [243]  [ 600/1251]  eta: 0:08:35  lr: 0.000341  min_lr: 0.000341  loss: 2.7505 (2.6450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0173 (1.0807)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [243]  [ 800/1251]  eta: 0:05:56  lr: 0.000339  min_lr: 0.000339  loss: 2.6310 (2.6461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1203 (1.0763)  time: 0.7902  data: 0.0005  max mem: 69511
Epoch: [243]  [1000/1251]  eta: 0:03:18  lr: 0.000337  min_lr: 0.000337  loss: 2.5382 (2.6487)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0746 (1.0859)  time: 0.7879  data: 0.0005  max mem: 69511
Epoch: [243]  [1200/1251]  eta: 0:00:40  lr: 0.000336  min_lr: 0.000336  loss: 2.9077 (2.6526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0147 (1.0787)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000335  min_lr: 0.000335  loss: 2.7969 (2.6498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9936 (1.0774)  time: 0.6671  data: 0.0005  max mem: 69511
Epoch: [243] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.000335  min_lr: 0.000335  loss: 2.7969 (2.6673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9936 (1.0774)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6566 (0.6566)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 6.3565  data: 5.9108  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.7736 (0.7818)  acc1: 87.6000 (87.3091)  acc5: 98.4000 (98.0364)  time: 0.9633  data: 0.5623  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8814 (0.9084)  acc1: 83.2000 (84.4762)  acc5: 96.8000 (97.0857)  time: 0.4128  data: 0.0137  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9768 (0.9214)  acc1: 82.8000 (84.0000)  acc5: 96.8000 (96.9920)  time: 0.4020  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6605 s / it)
* Acc@1 84.224 Acc@5 96.954 loss 0.914
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [244]  [   0/1251]  eta: 1:43:36  lr: 0.000335  min_lr: 0.000335  loss: 2.9458 (2.9458)  weight_decay: 0.0500 (0.0500)  time: 4.9690  data: 4.1753  max mem: 69511
Epoch: [244]  [ 200/1251]  eta: 0:14:07  lr: 0.000333  min_lr: 0.000333  loss: 2.7022 (2.6560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0553 (1.0555)  time: 0.7852  data: 0.0006  max mem: 69511
Epoch: [244]  [ 400/1251]  eta: 0:11:18  lr: 0.000331  min_lr: 0.000331  loss: 2.7697 (2.6534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9575 (1.0429)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [244]  [ 600/1251]  eta: 0:08:36  lr: 0.000330  min_lr: 0.000330  loss: 2.9105 (2.6730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0620 (1.0571)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [244]  [ 800/1251]  eta: 0:05:57  lr: 0.000328  min_lr: 0.000328  loss: 2.7619 (2.6595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (nan)  time: 0.7884  data: 0.0006  max mem: 69511
Epoch: [244]  [1000/1251]  eta: 0:03:18  lr: 0.000326  min_lr: 0.000326  loss: 2.5019 (2.6562)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0082 (nan)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [244]  [1200/1251]  eta: 0:00:40  lr: 0.000324  min_lr: 0.000324  loss: 2.7677 (2.6631)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1068 (nan)  time: 0.7912  data: 0.0005  max mem: 69511
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000324  min_lr: 0.000324  loss: 2.8819 (2.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0497 (nan)  time: 0.6682  data: 0.0005  max mem: 69511
Epoch: [244] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.000324  min_lr: 0.000324  loss: 2.8819 (2.6561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0497 (nan)
Test:  [ 0/25]  eta: 0:03:22  loss: 0.7283 (0.7283)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 8.0929  data: 7.6775  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8551 (0.8656)  acc1: 86.8000 (87.4545)  acc5: 98.0000 (98.0364)  time: 1.0966  data: 0.6982  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0058 (0.9992)  acc1: 84.4000 (84.5524)  acc5: 96.8000 (97.0476)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0653 (1.0120)  acc1: 82.0000 (84.1280)  acc5: 96.4000 (96.8480)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7108 s / it)
* Acc@1 84.300 Acc@5 96.954 loss 1.002
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.30%
Epoch: [245]  [   0/1251]  eta: 1:32:00  lr: 0.000324  min_lr: 0.000324  loss: 2.9915 (2.9915)  weight_decay: 0.0500 (0.0500)  time: 4.4130  data: 3.6228  max mem: 69511
Epoch: [245]  [ 200/1251]  eta: 0:14:04  lr: 0.000322  min_lr: 0.000322  loss: 2.5118 (2.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0558 (1.0932)  time: 0.7829  data: 0.0005  max mem: 69511
Epoch: [245]  [ 400/1251]  eta: 0:11:15  lr: 0.000320  min_lr: 0.000320  loss: 2.6176 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (1.0838)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [245]  [ 600/1251]  eta: 0:08:35  lr: 0.000318  min_lr: 0.000318  loss: 2.7636 (2.6493)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1604 (1.0819)  time: 0.7832  data: 0.0004  max mem: 69511
Epoch: [245]  [ 800/1251]  eta: 0:05:56  lr: 0.000316  min_lr: 0.000316  loss: 2.4845 (2.6634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1504 (1.0982)  time: 0.7817  data: 0.0004  max mem: 69511
Epoch: [245]  [1000/1251]  eta: 0:03:17  lr: 0.000315  min_lr: 0.000315  loss: 2.8366 (2.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0033 (1.0862)  time: 0.7844  data: 0.0004  max mem: 69511
Epoch: [245]  [1200/1251]  eta: 0:00:40  lr: 0.000313  min_lr: 0.000313  loss: 2.4830 (2.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0414 (1.0843)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000312  min_lr: 0.000312  loss: 2.7591 (2.6551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (1.0870)  time: 0.6668  data: 0.0006  max mem: 69511
Epoch: [245] Total time: 0:16:24 (0.7868 s / it)
Averaged stats: lr: 0.000312  min_lr: 0.000312  loss: 2.7591 (2.6594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (1.0870)
Test:  [ 0/25]  eta: 0:03:04  loss: 0.6506 (0.6506)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 7.3645  data: 6.9485  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7855 (0.7741)  acc1: 86.4000 (87.1273)  acc5: 97.6000 (97.9636)  time: 1.0303  data: 0.6320  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9230 (0.8976)  acc1: 84.0000 (84.6095)  acc5: 96.8000 (97.0667)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9651 (0.9116)  acc1: 83.2000 (84.0800)  acc5: 96.4000 (96.8160)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6803 s / it)
* Acc@1 84.352 Acc@5 96.936 loss 0.901
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.35%
Epoch: [246]  [   0/1251]  eta: 1:23:01  lr: 0.000312  min_lr: 0.000312  loss: 2.4031 (2.4031)  weight_decay: 0.0500 (0.0500)  time: 3.9818  data: 3.1874  max mem: 69511
Epoch: [246]  [ 200/1251]  eta: 0:14:01  lr: 0.000311  min_lr: 0.000311  loss: 2.9002 (2.6418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1049 (1.1347)  time: 0.7881  data: 0.0004  max mem: 69511
Epoch: [246]  [ 400/1251]  eta: 0:11:14  lr: 0.000309  min_lr: 0.000309  loss: 2.5976 (2.6189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1752 (1.1176)  time: 0.7919  data: 0.0008  max mem: 69511
Epoch: [246]  [ 600/1251]  eta: 0:08:34  lr: 0.000307  min_lr: 0.000307  loss: 2.8361 (2.6486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0049 (1.1012)  time: 0.7837  data: 0.0005  max mem: 69511
Epoch: [246]  [ 800/1251]  eta: 0:05:55  lr: 0.000305  min_lr: 0.000305  loss: 2.8259 (2.6461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (1.0891)  time: 0.7849  data: 0.0005  max mem: 69511
Epoch: [246]  [1000/1251]  eta: 0:03:17  lr: 0.000304  min_lr: 0.000304  loss: 2.3073 (2.6485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0525 (1.0780)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [246]  [1200/1251]  eta: 0:00:40  lr: 0.000302  min_lr: 0.000302  loss: 2.5543 (2.6513)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0235 (1.0763)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000301  min_lr: 0.000301  loss: 2.4479 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1528 (1.0798)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [246] Total time: 0:16:24 (0.7867 s / it)
Averaged stats: lr: 0.000301  min_lr: 0.000301  loss: 2.4479 (2.6520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1528 (1.0798)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.5907 (0.5907)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 7.4914  data: 7.0614  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7403 (0.7217)  acc1: 87.2000 (87.5636)  acc5: 98.0000 (98.0364)  time: 1.0418  data: 0.6422  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8374 (0.8390)  acc1: 84.0000 (84.7238)  acc5: 97.2000 (97.2762)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9137 (0.8525)  acc1: 82.8000 (84.3680)  acc5: 96.8000 (97.0560)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6862 s / it)
* Acc@1 84.422 Acc@5 96.998 loss 0.839
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.42%
Epoch: [247]  [   0/1251]  eta: 1:25:27  lr: 0.000301  min_lr: 0.000301  loss: 2.2016 (2.2016)  weight_decay: 0.0500 (0.0500)  time: 4.0986  data: 3.3096  max mem: 69511
Epoch: [247]  [ 200/1251]  eta: 0:14:03  lr: 0.000300  min_lr: 0.000300  loss: 2.7163 (2.6184)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0197 (1.1117)  time: 0.7934  data: 0.0008  max mem: 69511
Epoch: [247]  [ 400/1251]  eta: 0:11:17  lr: 0.000298  min_lr: 0.000298  loss: 2.7997 (2.6293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1421 (1.0921)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [247]  [ 600/1251]  eta: 0:08:36  lr: 0.000296  min_lr: 0.000296  loss: 2.7817 (2.6288)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0297 (1.0787)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [247]  [ 800/1251]  eta: 0:05:57  lr: 0.000294  min_lr: 0.000294  loss: 2.6143 (2.6331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1329 (1.0892)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [247]  [1000/1251]  eta: 0:03:18  lr: 0.000293  min_lr: 0.000293  loss: 2.5824 (2.6388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0157 (1.0827)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [247]  [1200/1251]  eta: 0:00:40  lr: 0.000291  min_lr: 0.000291  loss: 2.7372 (2.6529)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0353 (1.0854)  time: 0.7873  data: 0.0004  max mem: 69511
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000290  min_lr: 0.000290  loss: 2.6178 (2.6492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0991 (1.0852)  time: 0.6678  data: 0.0005  max mem: 69511
Epoch: [247] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000290  min_lr: 0.000290  loss: 2.6178 (2.6473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0991 (1.0852)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6425 (0.6425)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.4422  data: 7.0182  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8045 (0.7753)  acc1: 86.8000 (86.6182)  acc5: 97.6000 (97.8909)  time: 1.0374  data: 0.6383  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8954 (0.8948)  acc1: 83.2000 (84.0952)  acc5: 97.2000 (97.2952)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9733 (0.9073)  acc1: 82.8000 (83.6640)  acc5: 96.4000 (97.1040)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6850 s / it)
* Acc@1 84.236 Acc@5 96.992 loss 0.896
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.42%
Epoch: [248]  [   0/1251]  eta: 1:43:58  lr: 0.000290  min_lr: 0.000290  loss: 2.9759 (2.9759)  weight_decay: 0.0500 (0.0500)  time: 4.9870  data: 3.1837  max mem: 69511
Epoch: [248]  [ 200/1251]  eta: 0:14:09  lr: 0.000289  min_lr: 0.000289  loss: 2.3979 (2.6597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0811 (1.1357)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [248]  [ 400/1251]  eta: 0:11:20  lr: 0.000287  min_lr: 0.000287  loss: 2.8798 (2.6453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9756 (1.0936)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [248]  [ 600/1251]  eta: 0:08:37  lr: 0.000285  min_lr: 0.000285  loss: 2.7781 (2.6502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1010 (1.0942)  time: 0.7901  data: 0.0005  max mem: 69511
Epoch: [248]  [ 800/1251]  eta: 0:05:57  lr: 0.000284  min_lr: 0.000284  loss: 2.5657 (2.6365)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1719 (1.1120)  time: 0.7866  data: 0.0004  max mem: 69511
Epoch: [248]  [1000/1251]  eta: 0:03:18  lr: 0.000282  min_lr: 0.000282  loss: 2.5442 (2.6365)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0764 (1.1145)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [248]  [1200/1251]  eta: 0:00:40  lr: 0.000280  min_lr: 0.000280  loss: 2.5899 (2.6271)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1705 (1.1210)  time: 0.7928  data: 0.0005  max mem: 69511
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000280  min_lr: 0.000280  loss: 2.7932 (2.6279)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1158 (1.1179)  time: 0.6656  data: 0.0005  max mem: 69511
Epoch: [248] Total time: 0:16:28 (0.7901 s / it)
Averaged stats: lr: 0.000280  min_lr: 0.000280  loss: 2.7932 (2.6429)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1158 (1.1179)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.7075 (0.7075)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 7.8022  data: 7.3706  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8446 (0.8242)  acc1: 87.6000 (87.0546)  acc5: 98.0000 (97.9636)  time: 1.0676  data: 0.6703  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9588 (0.9467)  acc1: 82.8000 (84.1905)  acc5: 97.2000 (97.2191)  time: 0.3943  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0402 (0.9609)  acc1: 81.6000 (83.7120)  acc5: 96.4000 (97.0240)  time: 0.3943  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7017 s / it)
* Acc@1 84.348 Acc@5 97.026 loss 0.947
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.42%
Epoch: [249]  [   0/1251]  eta: 1:29:52  lr: 0.000280  min_lr: 0.000280  loss: 3.1460 (3.1460)  weight_decay: 0.0500 (0.0500)  time: 4.3109  data: 2.1077  max mem: 69511
Epoch: [249]  [ 200/1251]  eta: 0:14:05  lr: 0.000278  min_lr: 0.000278  loss: 2.6119 (2.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0657 (1.0632)  time: 0.7846  data: 0.0004  max mem: 69511
Epoch: [249]  [ 400/1251]  eta: 0:11:16  lr: 0.000276  min_lr: 0.000276  loss: 2.6262 (2.6223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0317 (1.0732)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [249]  [ 600/1251]  eta: 0:08:36  lr: 0.000275  min_lr: 0.000275  loss: 2.7155 (2.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9595 (1.0505)  time: 0.7855  data: 0.0006  max mem: 69511
Epoch: [249]  [ 800/1251]  eta: 0:05:56  lr: 0.000273  min_lr: 0.000273  loss: 2.6524 (2.6291)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1335 (1.0631)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [249]  [1000/1251]  eta: 0:03:18  lr: 0.000271  min_lr: 0.000271  loss: 2.6234 (2.6274)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0059 (1.0730)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [249]  [1200/1251]  eta: 0:00:40  lr: 0.000270  min_lr: 0.000270  loss: 2.8004 (2.6371)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (1.0715)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000269  min_lr: 0.000269  loss: 2.8064 (2.6380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0287 (1.0709)  time: 0.6663  data: 0.0005  max mem: 69511
Epoch: [249] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.000269  min_lr: 0.000269  loss: 2.8064 (2.6419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0287 (1.0709)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.6858 (0.6858)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 7.3302  data: 6.9069  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8676 (0.8429)  acc1: 86.4000 (87.4909)  acc5: 98.0000 (98.0000)  time: 1.0269  data: 0.6282  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9617 (0.9691)  acc1: 83.6000 (84.4381)  acc5: 97.2000 (97.0857)  time: 0.3966  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0491 (0.9827)  acc1: 83.2000 (84.1120)  acc5: 96.4000 (96.9280)  time: 0.3966  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6804 s / it)
* Acc@1 84.414 Acc@5 97.000 loss 0.968
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.42%
Epoch: [250]  [   0/1251]  eta: 1:35:31  lr: 0.000269  min_lr: 0.000269  loss: 2.5572 (2.5572)  weight_decay: 0.0500 (0.0500)  time: 4.5818  data: 2.9219  max mem: 69511
Epoch: [250]  [ 200/1251]  eta: 0:14:05  lr: 0.000267  min_lr: 0.000267  loss: 2.3672 (2.5829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0714 (1.0995)  time: 0.7854  data: 0.0008  max mem: 69511
Epoch: [250]  [ 400/1251]  eta: 0:11:17  lr: 0.000266  min_lr: 0.000266  loss: 2.6899 (2.5886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0120 (1.0850)  time: 0.7854  data: 0.0009  max mem: 69511
Epoch: [250]  [ 600/1251]  eta: 0:08:36  lr: 0.000264  min_lr: 0.000264  loss: 2.6466 (2.6107)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1313 (1.0898)  time: 0.7856  data: 0.0006  max mem: 69511
Epoch: [250]  [ 800/1251]  eta: 0:05:57  lr: 0.000262  min_lr: 0.000262  loss: 2.6952 (2.6143)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1028 (1.0934)  time: 0.7855  data: 0.0008  max mem: 69511
Epoch: [250]  [1000/1251]  eta: 0:03:18  lr: 0.000261  min_lr: 0.000261  loss: 2.4501 (2.6110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0516 (1.0994)  time: 0.7864  data: 0.0007  max mem: 69511
Epoch: [250]  [1200/1251]  eta: 0:00:40  lr: 0.000259  min_lr: 0.000259  loss: 2.7640 (2.6166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0939 (1.1067)  time: 0.7865  data: 0.0007  max mem: 69511
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000259  min_lr: 0.000259  loss: 2.7364 (2.6148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0939 (1.1079)  time: 0.6677  data: 0.0006  max mem: 69511
Epoch: [250] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.000259  min_lr: 0.000259  loss: 2.7364 (2.6278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0939 (1.1079)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.6310 (0.6310)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 7.7906  data: 7.3573  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7937 (0.7713)  acc1: 86.8000 (87.3818)  acc5: 97.6000 (97.8909)  time: 1.0692  data: 0.6691  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9188 (0.8882)  acc1: 83.6000 (84.7810)  acc5: 96.8000 (96.9143)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9697 (0.9021)  acc1: 83.2000 (84.3680)  acc5: 96.4000 (96.7520)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7005 s / it)
* Acc@1 84.360 Acc@5 96.982 loss 0.889
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.42%
Epoch: [251]  [   0/1251]  eta: 1:41:51  lr: 0.000259  min_lr: 0.000259  loss: 3.2250 (3.2250)  weight_decay: 0.0500 (0.0500)  time: 4.8855  data: 2.2264  max mem: 69511
Epoch: [251]  [ 200/1251]  eta: 0:14:09  lr: 0.000257  min_lr: 0.000257  loss: 2.8116 (2.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0733 (1.1749)  time: 0.7902  data: 0.0004  max mem: 69511
Epoch: [251]  [ 400/1251]  eta: 0:11:19  lr: 0.000256  min_lr: 0.000256  loss: 2.7142 (2.6395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9903 (1.1343)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [251]  [ 600/1251]  eta: 0:08:37  lr: 0.000254  min_lr: 0.000254  loss: 2.8375 (2.6219)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0618 (1.1048)  time: 0.7915  data: 0.0005  max mem: 69511
Epoch: [251]  [ 800/1251]  eta: 0:05:57  lr: 0.000252  min_lr: 0.000252  loss: 2.6126 (2.6112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1160 (1.1010)  time: 0.7913  data: 0.0004  max mem: 69511
Epoch: [251]  [1000/1251]  eta: 0:03:18  lr: 0.000251  min_lr: 0.000251  loss: 2.7373 (2.6162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0714 (1.1044)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [251]  [1200/1251]  eta: 0:00:40  lr: 0.000249  min_lr: 0.000249  loss: 2.8393 (2.6209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (1.1046)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000249  min_lr: 0.000249  loss: 2.3866 (2.6205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0136 (1.1024)  time: 0.6681  data: 0.0007  max mem: 69511
Epoch: [251] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000249  min_lr: 0.000249  loss: 2.3866 (2.6331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0136 (1.1024)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.5662 (0.5662)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 7.5291  data: 7.0890  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7316 (0.7156)  acc1: 86.0000 (87.4909)  acc5: 98.0000 (97.8909)  time: 1.0455  data: 0.6447  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8350 (0.8375)  acc1: 84.0000 (84.8000)  acc5: 96.8000 (97.0095)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9271 (0.8490)  acc1: 83.6000 (84.3360)  acc5: 96.4000 (96.8800)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6880 s / it)
* Acc@1 84.476 Acc@5 97.030 loss 0.836
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.48%
Epoch: [252]  [   0/1251]  eta: 1:27:58  lr: 0.000249  min_lr: 0.000249  loss: 2.5089 (2.5089)  weight_decay: 0.0500 (0.0500)  time: 4.2193  data: 3.4210  max mem: 69511
Epoch: [252]  [ 200/1251]  eta: 0:14:06  lr: 0.000247  min_lr: 0.000247  loss: 2.5866 (2.6016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1249 (1.1448)  time: 0.7968  data: 0.0004  max mem: 69511
Epoch: [252]  [ 400/1251]  eta: 0:11:17  lr: 0.000245  min_lr: 0.000245  loss: 2.3807 (2.5872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1468 (1.1261)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [252]  [ 600/1251]  eta: 0:08:36  lr: 0.000244  min_lr: 0.000244  loss: 2.7078 (2.5757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0448 (1.1038)  time: 0.7928  data: 0.0004  max mem: 69511
Epoch: [252]  [ 800/1251]  eta: 0:05:57  lr: 0.000242  min_lr: 0.000242  loss: 2.8060 (2.6017)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0539 (1.0887)  time: 0.7875  data: 0.0004  max mem: 69511
Epoch: [252]  [1000/1251]  eta: 0:03:18  lr: 0.000241  min_lr: 0.000241  loss: 2.6880 (2.6138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1411 (1.0976)  time: 0.7939  data: 0.0005  max mem: 69511
Epoch: [252]  [1200/1251]  eta: 0:00:40  lr: 0.000239  min_lr: 0.000239  loss: 2.8447 (2.6166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.0986)  time: 0.7919  data: 0.0004  max mem: 69511
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000239  min_lr: 0.000239  loss: 2.3389 (2.6146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0663 (1.0992)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [252] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000239  min_lr: 0.000239  loss: 2.3389 (2.6272)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0663 (1.0992)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6266 (0.6266)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.4496  data: 7.0201  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7843 (0.7694)  acc1: 86.8000 (87.2364)  acc5: 97.6000 (97.8909)  time: 1.0380  data: 0.6385  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8833 (0.8884)  acc1: 83.6000 (84.7048)  acc5: 97.2000 (97.1429)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9763 (0.9025)  acc1: 83.2000 (84.1760)  acc5: 96.4000 (96.9120)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6840 s / it)
* Acc@1 84.360 Acc@5 96.964 loss 0.893
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.48%
Epoch: [253]  [   0/1251]  eta: 1:39:03  lr: 0.000239  min_lr: 0.000239  loss: 2.9305 (2.9305)  weight_decay: 0.0500 (0.0500)  time: 4.7513  data: 3.3775  max mem: 69511
Epoch: [253]  [ 200/1251]  eta: 0:14:08  lr: 0.000237  min_lr: 0.000237  loss: 2.6269 (2.6013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1267 (1.1048)  time: 0.7966  data: 0.0005  max mem: 69511
Epoch: [253]  [ 400/1251]  eta: 0:11:18  lr: 0.000236  min_lr: 0.000236  loss: 2.8116 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0859 (1.1260)  time: 0.7914  data: 0.0003  max mem: 69511
Epoch: [253]  [ 600/1251]  eta: 0:08:37  lr: 0.000234  min_lr: 0.000234  loss: 2.6077 (2.6431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1083 (1.1389)  time: 0.7925  data: 0.0004  max mem: 69511
Epoch: [253]  [ 800/1251]  eta: 0:05:57  lr: 0.000232  min_lr: 0.000232  loss: 2.7441 (2.6364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1068 (1.1237)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [253]  [1000/1251]  eta: 0:03:18  lr: 0.000231  min_lr: 0.000231  loss: 2.7257 (2.6301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1282 (1.1228)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [253]  [1200/1251]  eta: 0:00:40  lr: 0.000229  min_lr: 0.000229  loss: 2.7754 (2.6287)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1203 (1.1246)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.7259 (2.6295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0513 (1.1258)  time: 0.6681  data: 0.0005  max mem: 69511
Epoch: [253] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.7259 (2.6253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0513 (1.1258)
Test:  [ 0/25]  eta: 0:02:58  loss: 0.6862 (0.6862)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 7.1538  data: 6.7245  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8193 (0.8074)  acc1: 87.2000 (87.4546)  acc5: 97.6000 (97.8909)  time: 1.0114  data: 0.6116  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9331 (0.9324)  acc1: 84.0000 (84.6667)  acc5: 97.2000 (97.1238)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0334 (0.9469)  acc1: 82.8000 (84.2240)  acc5: 96.4000 (96.8960)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6755 s / it)
* Acc@1 84.342 Acc@5 96.930 loss 0.938
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.48%
Epoch: [254]  [   0/1251]  eta: 1:38:04  lr: 0.000229  min_lr: 0.000229  loss: 2.7501 (2.7501)  weight_decay: 0.0500 (0.0500)  time: 4.7040  data: 2.4323  max mem: 69511
Epoch: [254]  [ 200/1251]  eta: 0:14:05  lr: 0.000227  min_lr: 0.000227  loss: 2.7718 (2.6130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0537 (1.0800)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [254]  [ 400/1251]  eta: 0:11:17  lr: 0.000226  min_lr: 0.000226  loss: 2.6454 (2.6276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0692 (1.0963)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [254]  [ 600/1251]  eta: 0:08:36  lr: 0.000224  min_lr: 0.000224  loss: 2.7749 (2.6340)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1019 (1.1060)  time: 0.7818  data: 0.0004  max mem: 69511
Epoch: [254]  [ 800/1251]  eta: 0:05:56  lr: 0.000223  min_lr: 0.000223  loss: 2.5487 (2.6294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1444 (1.1214)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [254]  [1000/1251]  eta: 0:03:18  lr: 0.000221  min_lr: 0.000221  loss: 2.7944 (2.6275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0531 (1.1163)  time: 0.7873  data: 0.0005  max mem: 69511
Epoch: [254]  [1200/1251]  eta: 0:00:40  lr: 0.000220  min_lr: 0.000220  loss: 2.5080 (2.6273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0769 (1.1110)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.7051 (2.6286)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0771 (1.1088)  time: 0.6674  data: 0.0005  max mem: 69511
Epoch: [254] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.7051 (2.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0771 (1.1088)
Test:  [ 0/25]  eta: 0:03:15  loss: 0.6935 (0.6935)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.8399  data: 7.4170  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8542 (0.8469)  acc1: 87.6000 (87.5636)  acc5: 97.6000 (98.0727)  time: 1.0736  data: 0.6745  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9651 (0.9738)  acc1: 83.6000 (84.7048)  acc5: 96.8000 (97.1238)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0825 (0.9883)  acc1: 82.8000 (84.3200)  acc5: 96.4000 (96.9120)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7013 s / it)
* Acc@1 84.438 Acc@5 96.952 loss 0.980
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.48%
Epoch: [255]  [   0/1251]  eta: 1:30:59  lr: 0.000219  min_lr: 0.000219  loss: 2.3535 (2.3535)  weight_decay: 0.0500 (0.0500)  time: 4.3639  data: 3.1870  max mem: 69511
Epoch: [255]  [ 200/1251]  eta: 0:14:07  lr: 0.000218  min_lr: 0.000218  loss: 2.6734 (2.6324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0909 (1.0984)  time: 0.7936  data: 0.0004  max mem: 69511
Epoch: [255]  [ 400/1251]  eta: 0:11:17  lr: 0.000216  min_lr: 0.000216  loss: 2.8045 (2.6265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0827 (1.1117)  time: 0.7825  data: 0.0005  max mem: 69511
Epoch: [255]  [ 600/1251]  eta: 0:08:35  lr: 0.000215  min_lr: 0.000215  loss: 2.7597 (2.6193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1383 (1.1030)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [255]  [ 800/1251]  eta: 0:05:56  lr: 0.000213  min_lr: 0.000213  loss: 2.5628 (2.6220)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0371 (1.1172)  time: 0.7832  data: 0.0005  max mem: 69511
Epoch: [255]  [1000/1251]  eta: 0:03:18  lr: 0.000212  min_lr: 0.000212  loss: 2.7623 (2.6231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1040 (1.1127)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [255]  [1200/1251]  eta: 0:00:40  lr: 0.000210  min_lr: 0.000210  loss: 2.7245 (2.6187)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.1067)  time: 0.7909  data: 0.0005  max mem: 69511
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000210  min_lr: 0.000210  loss: 2.5781 (2.6144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0132 (1.1048)  time: 0.6675  data: 0.0006  max mem: 69511
Epoch: [255] Total time: 0:16:25 (0.7874 s / it)
Averaged stats: lr: 0.000210  min_lr: 0.000210  loss: 2.5781 (2.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0132 (1.1048)
Test:  [ 0/25]  eta: 0:03:00  loss: 0.5940 (0.5940)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.2174  data: 6.7690  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7547 (0.7331)  acc1: 86.8000 (87.7091)  acc5: 98.0000 (97.9636)  time: 1.0170  data: 0.6156  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8657 (0.8643)  acc1: 84.0000 (84.8571)  acc5: 97.2000 (97.0095)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9660 (0.8802)  acc1: 83.2000 (84.3680)  acc5: 96.0000 (96.8160)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6757 s / it)
* Acc@1 84.552 Acc@5 97.060 loss 0.870
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.55%
Epoch: [256]  [   0/1251]  eta: 1:24:14  lr: 0.000210  min_lr: 0.000210  loss: 2.0696 (2.0696)  weight_decay: 0.0500 (0.0500)  time: 4.0406  data: 3.2554  max mem: 69511
Epoch: [256]  [ 200/1251]  eta: 0:14:05  lr: 0.000208  min_lr: 0.000208  loss: 2.6868 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0459 (1.0927)  time: 0.7954  data: 0.0006  max mem: 69511
Epoch: [256]  [ 400/1251]  eta: 0:11:16  lr: 0.000207  min_lr: 0.000207  loss: 2.7846 (2.6388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1242 (1.1270)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [256]  [ 600/1251]  eta: 0:08:36  lr: 0.000205  min_lr: 0.000205  loss: 2.6921 (2.6351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0799 (1.1270)  time: 0.7901  data: 0.0006  max mem: 69511
Epoch: [256]  [ 800/1251]  eta: 0:05:56  lr: 0.000204  min_lr: 0.000204  loss: 2.5876 (2.6344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1023 (1.1317)  time: 0.7815  data: 0.0005  max mem: 69511
Epoch: [256]  [1000/1251]  eta: 0:03:18  lr: 0.000203  min_lr: 0.000203  loss: 2.7757 (2.6380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2735 (1.1456)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [256]  [1200/1251]  eta: 0:00:40  lr: 0.000201  min_lr: 0.000201  loss: 2.7675 (2.6244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1508 (1.1468)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000201  min_lr: 0.000201  loss: 2.7237 (2.6237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9852 (1.1407)  time: 0.6677  data: 0.0004  max mem: 69511
Epoch: [256] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.000201  min_lr: 0.000201  loss: 2.7237 (2.6113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9852 (1.1407)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.6299 (0.6299)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 7.4024  data: 6.9733  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8098 (0.7787)  acc1: 87.2000 (87.6364)  acc5: 98.0000 (98.0727)  time: 1.0338  data: 0.6342  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8900 (0.9111)  acc1: 83.2000 (84.6286)  acc5: 97.2000 (97.0857)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9914 (0.9244)  acc1: 82.4000 (84.3200)  acc5: 96.4000 (96.9440)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6855 s / it)
* Acc@1 84.538 Acc@5 97.064 loss 0.917
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.55%
Epoch: [257]  [   0/1251]  eta: 1:31:24  lr: 0.000201  min_lr: 0.000201  loss: 2.2562 (2.2562)  weight_decay: 0.0500 (0.0500)  time: 4.3844  data: 3.1964  max mem: 69511
Epoch: [257]  [ 200/1251]  eta: 0:14:06  lr: 0.000199  min_lr: 0.000199  loss: 2.7595 (2.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0886 (1.0985)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [257]  [ 400/1251]  eta: 0:11:18  lr: 0.000198  min_lr: 0.000198  loss: 2.6705 (2.6121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0722 (1.0636)  time: 0.7941  data: 0.0004  max mem: 69511
Epoch: [257]  [ 600/1251]  eta: 0:08:37  lr: 0.000196  min_lr: 0.000196  loss: 2.4512 (2.6075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0331 (1.0796)  time: 0.7965  data: 0.0005  max mem: 69511
Epoch: [257]  [ 800/1251]  eta: 0:05:57  lr: 0.000195  min_lr: 0.000195  loss: 2.6699 (2.6111)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0582 (1.0779)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [257]  [1000/1251]  eta: 0:03:18  lr: 0.000193  min_lr: 0.000193  loss: 2.6912 (2.6084)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1237 (1.0903)  time: 0.7831  data: 0.0004  max mem: 69511
Epoch: [257]  [1200/1251]  eta: 0:00:40  lr: 0.000192  min_lr: 0.000192  loss: 2.7337 (2.6124)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0285 (1.0846)  time: 0.7851  data: 0.0004  max mem: 69511
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000192  min_lr: 0.000192  loss: 2.7786 (2.6141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0723 (1.0861)  time: 0.6671  data: 0.0005  max mem: 69511
Epoch: [257] Total time: 0:16:26 (0.7885 s / it)
Averaged stats: lr: 0.000192  min_lr: 0.000192  loss: 2.7786 (2.6051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0723 (1.0861)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.6484 (0.6484)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 7.5291  data: 7.0873  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8265 (0.8061)  acc1: 86.8000 (87.6000)  acc5: 97.6000 (97.9636)  time: 1.0448  data: 0.6446  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9385 (0.9324)  acc1: 83.6000 (84.8571)  acc5: 97.2000 (97.0857)  time: 0.3963  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0223 (0.9478)  acc1: 83.2000 (84.4000)  acc5: 96.4000 (96.9120)  time: 0.3963  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6872 s / it)
* Acc@1 84.424 Acc@5 96.996 loss 0.938
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.55%
Epoch: [258]  [   0/1251]  eta: 1:34:28  lr: 0.000192  min_lr: 0.000192  loss: 2.9255 (2.9255)  weight_decay: 0.0500 (0.0500)  time: 4.5312  data: 3.2207  max mem: 69511
Epoch: [258]  [ 200/1251]  eta: 0:14:06  lr: 0.000190  min_lr: 0.000190  loss: 2.6266 (2.5856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1777 (1.1827)  time: 0.7853  data: 0.0003  max mem: 69511
Epoch: [258]  [ 400/1251]  eta: 0:11:16  lr: 0.000189  min_lr: 0.000189  loss: 2.8063 (2.5791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1133 (inf)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [258]  [ 600/1251]  eta: 0:08:34  lr: 0.000187  min_lr: 0.000187  loss: 2.7688 (2.5938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0667 (inf)  time: 0.7817  data: 0.0003  max mem: 69511
Epoch: [258]  [ 800/1251]  eta: 0:05:56  lr: 0.000186  min_lr: 0.000186  loss: 2.6858 (2.5927)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0931 (inf)  time: 0.7907  data: 0.0004  max mem: 69511
Epoch: [258]  [1000/1251]  eta: 0:03:18  lr: 0.000185  min_lr: 0.000185  loss: 2.5815 (2.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (inf)  time: 0.7870  data: 0.0004  max mem: 69511
Epoch: [258]  [1200/1251]  eta: 0:00:40  lr: 0.000183  min_lr: 0.000183  loss: 2.7443 (2.5949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (inf)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000183  min_lr: 0.000183  loss: 2.7253 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0671 (inf)  time: 0.6660  data: 0.0005  max mem: 69511
Epoch: [258] Total time: 0:16:23 (0.7865 s / it)
Averaged stats: lr: 0.000183  min_lr: 0.000183  loss: 2.7253 (2.6044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0671 (inf)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.6064 (0.6064)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.6545  data: 6.2341  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.7819 (0.7540)  acc1: 86.0000 (87.6364)  acc5: 98.0000 (98.0727)  time: 0.9651  data: 0.5670  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8826 (0.8705)  acc1: 83.2000 (84.7238)  acc5: 97.2000 (97.2571)  time: 0.3961  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9493 (0.8843)  acc1: 82.8000 (84.3200)  acc5: 96.4000 (97.0400)  time: 0.3960  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6534 s / it)
* Acc@1 84.594 Acc@5 96.992 loss 0.874
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.59%
Epoch: [259]  [   0/1251]  eta: 1:29:33  lr: 0.000183  min_lr: 0.000183  loss: 2.8081 (2.8081)  weight_decay: 0.0500 (0.0500)  time: 4.2951  data: 3.5167  max mem: 69511
Epoch: [259]  [ 200/1251]  eta: 0:14:05  lr: 0.000181  min_lr: 0.000181  loss: 2.7601 (2.5614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0242 (1.0327)  time: 0.7855  data: 0.0003  max mem: 69511
Epoch: [259]  [ 400/1251]  eta: 0:11:17  lr: 0.000180  min_lr: 0.000180  loss: 2.6321 (2.5891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0770 (nan)  time: 0.7858  data: 0.0003  max mem: 69511
Epoch: [259]  [ 600/1251]  eta: 0:08:36  lr: 0.000179  min_lr: 0.000179  loss: 2.2839 (2.5976)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0798 (nan)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [259]  [ 800/1251]  eta: 0:05:57  lr: 0.000177  min_lr: 0.000177  loss: 2.2843 (2.5848)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0854 (nan)  time: 0.7858  data: 0.0003  max mem: 69511
Epoch: [259]  [1000/1251]  eta: 0:03:18  lr: 0.000176  min_lr: 0.000176  loss: 2.7395 (2.5958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0513 (nan)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [259]  [1200/1251]  eta: 0:00:40  lr: 0.000175  min_lr: 0.000175  loss: 2.5592 (2.5941)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0262 (nan)  time: 0.7828  data: 0.0004  max mem: 69511
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000174  min_lr: 0.000174  loss: 2.7445 (2.5979)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0491 (nan)  time: 0.6651  data: 0.0005  max mem: 69511
Epoch: [259] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.000174  min_lr: 0.000174  loss: 2.7445 (2.5929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0491 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6887 (0.6887)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.8578  data: 5.4007  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8361 (0.8081)  acc1: 86.4000 (87.4545)  acc5: 98.0000 (98.0364)  time: 0.9760  data: 0.5770  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9504 (0.9270)  acc1: 84.0000 (84.7619)  acc5: 97.2000 (97.2000)  time: 0.4405  data: 0.0474  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0050 (0.9426)  acc1: 82.4000 (84.3040)  acc5: 96.8000 (97.0400)  time: 0.3934  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6550 s / it)
* Acc@1 84.542 Acc@5 97.028 loss 0.934
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.59%
Epoch: [260]  [   0/1251]  eta: 1:29:52  lr: 0.000174  min_lr: 0.000174  loss: 2.7481 (2.7481)  weight_decay: 0.0500 (0.0500)  time: 4.3108  data: 3.0384  max mem: 69511
Epoch: [260]  [ 200/1251]  eta: 0:14:05  lr: 0.000173  min_lr: 0.000173  loss: 2.7239 (2.6383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1047 (1.1163)  time: 0.7849  data: 0.0006  max mem: 69511
Epoch: [260]  [ 400/1251]  eta: 0:11:16  lr: 0.000172  min_lr: 0.000172  loss: 2.5001 (2.6058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2030 (1.1601)  time: 0.7850  data: 0.0006  max mem: 69511
Epoch: [260]  [ 600/1251]  eta: 0:08:35  lr: 0.000170  min_lr: 0.000170  loss: 2.6670 (2.6142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1630 (1.1497)  time: 0.7857  data: 0.0007  max mem: 69511
Epoch: [260]  [ 800/1251]  eta: 0:05:56  lr: 0.000169  min_lr: 0.000169  loss: 2.6323 (2.6136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0971 (1.1358)  time: 0.7863  data: 0.0008  max mem: 69511
Epoch: [260]  [1000/1251]  eta: 0:03:18  lr: 0.000167  min_lr: 0.000167  loss: 2.5307 (2.6108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2547 (1.1560)  time: 0.7920  data: 0.0005  max mem: 69511
Epoch: [260]  [1200/1251]  eta: 0:00:40  lr: 0.000166  min_lr: 0.000166  loss: 2.6355 (2.6009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0212 (1.1540)  time: 0.7863  data: 0.0007  max mem: 69511
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000166  min_lr: 0.000166  loss: 2.3718 (2.5994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1618 (1.1541)  time: 0.6724  data: 0.0005  max mem: 69511
Epoch: [260] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.000166  min_lr: 0.000166  loss: 2.3718 (2.5939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1618 (1.1541)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.6081 (0.6081)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 6.1533  data: 5.7376  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.7805 (0.7571)  acc1: 86.8000 (87.6727)  acc5: 97.6000 (98.0364)  time: 0.9972  data: 0.5915  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8818 (0.8778)  acc1: 83.6000 (84.8381)  acc5: 97.2000 (97.0857)  time: 0.4393  data: 0.0385  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9583 (0.8925)  acc1: 82.8000 (84.4000)  acc5: 96.4000 (96.9600)  time: 0.3976  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6662 s / it)
* Acc@1 84.590 Acc@5 97.058 loss 0.883
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.59%
Epoch: [261]  [   0/1251]  eta: 1:41:43  lr: 0.000166  min_lr: 0.000166  loss: 2.3285 (2.3285)  weight_decay: 0.0500 (0.0500)  time: 4.8786  data: 3.6695  max mem: 69511
Epoch: [261]  [ 200/1251]  eta: 0:14:07  lr: 0.000165  min_lr: 0.000165  loss: 2.7362 (2.5754)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0182 (1.1469)  time: 0.7863  data: 0.0008  max mem: 69511
Epoch: [261]  [ 400/1251]  eta: 0:11:18  lr: 0.000163  min_lr: 0.000163  loss: 2.7077 (2.5994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1073 (1.1502)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [261]  [ 600/1251]  eta: 0:08:37  lr: 0.000162  min_lr: 0.000162  loss: 2.9163 (2.6100)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0684 (1.1233)  time: 0.7863  data: 0.0005  max mem: 69511
Epoch: [261]  [ 800/1251]  eta: 0:05:57  lr: 0.000161  min_lr: 0.000161  loss: 2.7771 (2.6124)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0740 (1.1133)  time: 0.7874  data: 0.0005  max mem: 69511
Epoch: [261]  [1000/1251]  eta: 0:03:18  lr: 0.000159  min_lr: 0.000159  loss: 2.6341 (2.5988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0848 (1.1290)  time: 0.7850  data: 0.0005  max mem: 69511
Epoch: [261]  [1200/1251]  eta: 0:00:40  lr: 0.000158  min_lr: 0.000158  loss: 2.6034 (2.6022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1171 (1.1269)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000158  min_lr: 0.000158  loss: 2.6290 (2.6047)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1016 (1.1257)  time: 0.6671  data: 0.0007  max mem: 69511
Epoch: [261] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.000158  min_lr: 0.000158  loss: 2.6290 (2.5918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1016 (1.1257)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.7103 (0.7103)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.5242  data: 7.0978  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8696 (0.8527)  acc1: 86.4000 (87.3818)  acc5: 98.0000 (98.0727)  time: 1.0444  data: 0.6455  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9695 (0.9761)  acc1: 83.6000 (84.7048)  acc5: 96.8000 (97.0667)  time: 0.3964  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0728 (0.9910)  acc1: 83.2000 (84.2240)  acc5: 96.0000 (96.8800)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6866 s / it)
* Acc@1 84.404 Acc@5 96.940 loss 0.983
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.59%
Epoch: [262]  [   0/1251]  eta: 1:35:26  lr: 0.000158  min_lr: 0.000158  loss: 2.5918 (2.5918)  weight_decay: 0.0500 (0.0500)  time: 4.5778  data: 2.0845  max mem: 69511
Epoch: [262]  [ 200/1251]  eta: 0:14:06  lr: 0.000156  min_lr: 0.000156  loss: 2.6069 (2.6010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1817 (1.1230)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [262]  [ 400/1251]  eta: 0:11:18  lr: 0.000155  min_lr: 0.000155  loss: 2.5358 (2.6063)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0939 (1.1216)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [262]  [ 600/1251]  eta: 0:08:36  lr: 0.000154  min_lr: 0.000154  loss: 2.6981 (2.5905)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1363 (1.1180)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [262]  [ 800/1251]  eta: 0:05:57  lr: 0.000152  min_lr: 0.000152  loss: 2.8053 (2.5883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2225 (1.1262)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [262]  [1000/1251]  eta: 0:03:18  lr: 0.000151  min_lr: 0.000151  loss: 2.7338 (2.5882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0296 (1.1234)  time: 0.7936  data: 0.0004  max mem: 69511
Epoch: [262]  [1200/1251]  eta: 0:00:40  lr: 0.000150  min_lr: 0.000150  loss: 2.7914 (2.5875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2360 (1.1338)  time: 0.7951  data: 0.0004  max mem: 69511
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000150  min_lr: 0.000150  loss: 2.7354 (2.5895)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1379 (1.1347)  time: 0.6694  data: 0.0007  max mem: 69511
Epoch: [262] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000150  min_lr: 0.000150  loss: 2.7354 (2.5913)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1379 (1.1347)
Test:  [ 0/25]  eta: 0:03:07  loss: 0.7185 (0.7185)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.4820  data: 7.0581  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8639 (0.8506)  acc1: 86.8000 (87.8909)  acc5: 98.0000 (98.0000)  time: 1.0412  data: 0.6419  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9737 (0.9761)  acc1: 82.8000 (85.0286)  acc5: 97.2000 (97.1810)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0701 (0.9931)  acc1: 82.8000 (84.5440)  acc5: 96.4000 (96.9440)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6881 s / it)
* Acc@1 84.562 Acc@5 97.044 loss 0.983
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.59%
Epoch: [263]  [   0/1251]  eta: 1:41:50  lr: 0.000150  min_lr: 0.000150  loss: 2.9994 (2.9994)  weight_decay: 0.0500 (0.0500)  time: 4.8845  data: 3.6072  max mem: 69511
Epoch: [263]  [ 200/1251]  eta: 0:14:08  lr: 0.000148  min_lr: 0.000148  loss: 2.7117 (2.6416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1261 (1.1092)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [263]  [ 400/1251]  eta: 0:11:19  lr: 0.000147  min_lr: 0.000147  loss: 2.6765 (2.6204)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1267 (1.1241)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [263]  [ 600/1251]  eta: 0:08:36  lr: 0.000146  min_lr: 0.000146  loss: 2.6365 (2.6162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1081 (1.1214)  time: 0.7822  data: 0.0004  max mem: 69511
Epoch: [263]  [ 800/1251]  eta: 0:05:57  lr: 0.000145  min_lr: 0.000145  loss: 2.6770 (2.5968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0972 (1.1229)  time: 0.7939  data: 0.0005  max mem: 69511
Epoch: [263]  [1000/1251]  eta: 0:03:18  lr: 0.000143  min_lr: 0.000143  loss: 2.7883 (2.6012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1625 (1.1306)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [263]  [1200/1251]  eta: 0:00:40  lr: 0.000142  min_lr: 0.000142  loss: 2.6883 (2.6019)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1381)  time: 0.7826  data: 0.0006  max mem: 69511
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000142  min_lr: 0.000142  loss: 2.6532 (2.6014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.1389)  time: 0.6649  data: 0.0005  max mem: 69511
Epoch: [263] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.000142  min_lr: 0.000142  loss: 2.6532 (2.5927)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.1389)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.6385 (0.6385)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.7885  data: 7.3600  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8056 (0.7669)  acc1: 86.8000 (87.7091)  acc5: 98.0000 (98.0364)  time: 1.0673  data: 0.6694  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9317 (0.8882)  acc1: 82.8000 (84.7429)  acc5: 97.2000 (97.1810)  time: 0.3951  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9691 (0.9032)  acc1: 82.4000 (84.3360)  acc5: 96.4000 (96.9760)  time: 0.3952  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7001 s / it)
* Acc@1 84.594 Acc@5 97.016 loss 0.891
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.59%
Epoch: [264]  [   0/1251]  eta: 1:22:16  lr: 0.000142  min_lr: 0.000142  loss: 2.0864 (2.0864)  weight_decay: 0.0500 (0.0500)  time: 3.9460  data: 3.1616  max mem: 69511
Epoch: [264]  [ 200/1251]  eta: 0:14:04  lr: 0.000141  min_lr: 0.000141  loss: 2.6377 (2.5522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0780 (1.1065)  time: 0.7847  data: 0.0004  max mem: 69511
Epoch: [264]  [ 400/1251]  eta: 0:11:15  lr: 0.000139  min_lr: 0.000139  loss: 2.7452 (2.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0262 (1.1120)  time: 0.7816  data: 0.0005  max mem: 69511
Epoch: [264]  [ 600/1251]  eta: 0:08:34  lr: 0.000138  min_lr: 0.000138  loss: 2.7590 (2.5742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0582 (1.1031)  time: 0.7824  data: 0.0004  max mem: 69511
Epoch: [264]  [ 800/1251]  eta: 0:05:55  lr: 0.000137  min_lr: 0.000137  loss: 2.6773 (2.5680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0922 (1.1071)  time: 0.7850  data: 0.0006  max mem: 69511
Epoch: [264]  [1000/1251]  eta: 0:03:17  lr: 0.000136  min_lr: 0.000136  loss: 2.4753 (2.5815)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1005 (1.1139)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [264]  [1200/1251]  eta: 0:00:40  lr: 0.000134  min_lr: 0.000134  loss: 2.6725 (2.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0347 (1.1116)  time: 0.7878  data: 0.0004  max mem: 69511
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000134  min_lr: 0.000134  loss: 2.5545 (2.5824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0985 (1.1152)  time: 0.6676  data: 0.0005  max mem: 69511
Epoch: [264] Total time: 0:16:24 (0.7871 s / it)
Averaged stats: lr: 0.000134  min_lr: 0.000134  loss: 2.5545 (2.5819)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0985 (1.1152)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.5727 (0.5727)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.7145  data: 7.2780  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7518 (0.7342)  acc1: 87.2000 (87.4545)  acc5: 98.0000 (98.0000)  time: 1.0622  data: 0.6619  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8930 (0.8585)  acc1: 83.2000 (84.8381)  acc5: 96.8000 (97.1810)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9336 (0.8739)  acc1: 82.4000 (84.4640)  acc5: 96.4000 (96.9120)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6961 s / it)
* Acc@1 84.594 Acc@5 97.018 loss 0.862
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.59%
Epoch: [265]  [   0/1251]  eta: 1:30:40  lr: 0.000134  min_lr: 0.000134  loss: 2.5365 (2.5365)  weight_decay: 0.0500 (0.0500)  time: 4.3491  data: 3.5562  max mem: 69511
Epoch: [265]  [ 200/1251]  eta: 0:14:04  lr: 0.000133  min_lr: 0.000133  loss: 2.6104 (2.5780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1604 (1.1533)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [265]  [ 400/1251]  eta: 0:11:15  lr: 0.000132  min_lr: 0.000132  loss: 2.7996 (2.5841)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0357 (1.1053)  time: 0.7823  data: 0.0004  max mem: 69511
Epoch: [265]  [ 600/1251]  eta: 0:08:35  lr: 0.000131  min_lr: 0.000131  loss: 2.7163 (2.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1601 (1.1212)  time: 0.7843  data: 0.0007  max mem: 69511
Epoch: [265]  [ 800/1251]  eta: 0:05:56  lr: 0.000129  min_lr: 0.000129  loss: 2.6978 (2.5885)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1652 (inf)  time: 0.7845  data: 0.0004  max mem: 69511
Epoch: [265]  [1000/1251]  eta: 0:03:18  lr: 0.000128  min_lr: 0.000128  loss: 2.5958 (2.5896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0737 (inf)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [265]  [1200/1251]  eta: 0:00:40  lr: 0.000127  min_lr: 0.000127  loss: 2.5437 (2.5886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1117 (inf)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000127  min_lr: 0.000127  loss: 2.7980 (2.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1919 (inf)  time: 0.6672  data: 0.0005  max mem: 69511
Epoch: [265] Total time: 0:16:25 (0.7875 s / it)
Averaged stats: lr: 0.000127  min_lr: 0.000127  loss: 2.7980 (2.5800)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1919 (inf)
Test:  [ 0/25]  eta: 0:03:12  loss: 0.7463 (0.7463)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 7.6937  data: 7.2769  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8758 (0.8859)  acc1: 87.6000 (87.6727)  acc5: 97.6000 (97.9636)  time: 1.0602  data: 0.6618  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0216 (1.0038)  acc1: 83.6000 (84.8952)  acc5: 97.2000 (97.1238)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0919 (1.0189)  acc1: 82.8000 (84.4480)  acc5: 96.0000 (96.8960)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6946 s / it)
* Acc@1 84.534 Acc@5 97.014 loss 1.009
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.59%
Epoch: [266]  [   0/1251]  eta: 1:39:44  lr: 0.000127  min_lr: 0.000127  loss: 2.7604 (2.7604)  weight_decay: 0.0500 (0.0500)  time: 4.7835  data: 3.8496  max mem: 69511
Epoch: [266]  [ 200/1251]  eta: 0:14:07  lr: 0.000126  min_lr: 0.000126  loss: 2.8450 (2.5816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0056 (1.0804)  time: 0.7852  data: 0.0005  max mem: 69511
Epoch: [266]  [ 400/1251]  eta: 0:11:17  lr: 0.000124  min_lr: 0.000124  loss: 2.5891 (2.5896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1518 (1.1185)  time: 0.7889  data: 0.0004  max mem: 69511
Epoch: [266]  [ 600/1251]  eta: 0:08:35  lr: 0.000123  min_lr: 0.000123  loss: 2.7670 (2.5980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2259 (1.1348)  time: 0.7870  data: 0.0006  max mem: 69511
Epoch: [266]  [ 800/1251]  eta: 0:05:56  lr: 0.000122  min_lr: 0.000122  loss: 2.7295 (2.5866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1598 (1.1470)  time: 0.7850  data: 0.0004  max mem: 69511
Epoch: [266]  [1000/1251]  eta: 0:03:18  lr: 0.000121  min_lr: 0.000121  loss: 2.6419 (2.5842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1961 (1.1570)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [266]  [1200/1251]  eta: 0:00:40  lr: 0.000120  min_lr: 0.000120  loss: 2.4637 (2.5815)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1250 (1.1603)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000120  min_lr: 0.000120  loss: 2.5060 (2.5768)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1696 (1.1645)  time: 0.6679  data: 0.0006  max mem: 69511
Epoch: [266] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.000120  min_lr: 0.000120  loss: 2.5060 (2.5726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1696 (1.1645)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.5896 (0.5896)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.7953  data: 7.3693  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7618 (0.7400)  acc1: 87.6000 (87.6727)  acc5: 98.0000 (98.1091)  time: 1.0694  data: 0.6702  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8972 (0.8610)  acc1: 83.6000 (84.9714)  acc5: 97.2000 (97.1048)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9328 (0.8760)  acc1: 82.4000 (84.5600)  acc5: 96.4000 (96.8960)  time: 0.3967  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7008 s / it)
* Acc@1 84.606 Acc@5 97.036 loss 0.866
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.61%
Epoch: [267]  [   0/1251]  eta: 1:07:07  lr: 0.000120  min_lr: 0.000120  loss: 2.4324 (2.4324)  weight_decay: 0.0500 (0.0500)  time: 3.2193  data: 2.4219  max mem: 69511
Epoch: [267]  [ 200/1251]  eta: 0:13:59  lr: 0.000118  min_lr: 0.000118  loss: 2.7146 (2.5874)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2743 (1.1795)  time: 0.7950  data: 0.0004  max mem: 69511
Epoch: [267]  [ 400/1251]  eta: 0:11:14  lr: 0.000117  min_lr: 0.000117  loss: 2.6751 (2.5615)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0967 (1.1565)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [267]  [ 600/1251]  eta: 0:08:35  lr: 0.000116  min_lr: 0.000116  loss: 2.7683 (2.5671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0398 (1.1379)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [267]  [ 800/1251]  eta: 0:05:56  lr: 0.000115  min_lr: 0.000115  loss: 2.5627 (2.5598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (1.1348)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [267]  [1000/1251]  eta: 0:03:18  lr: 0.000114  min_lr: 0.000114  loss: 2.6117 (2.5617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1070 (1.1333)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [267]  [1200/1251]  eta: 0:00:40  lr: 0.000113  min_lr: 0.000113  loss: 2.6845 (2.5607)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0516 (1.1273)  time: 0.7871  data: 0.0004  max mem: 69511
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.5861 (2.5590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0686 (1.1285)  time: 0.6716  data: 0.0005  max mem: 69511
Epoch: [267] Total time: 0:16:25 (0.7879 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.5861 (2.5688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0686 (1.1285)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6270 (0.6270)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 7.4673  data: 7.0517  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7994 (0.7663)  acc1: 86.8000 (87.5636)  acc5: 97.6000 (98.0364)  time: 1.0400  data: 0.6413  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9095 (0.8866)  acc1: 83.2000 (84.7619)  acc5: 97.2000 (97.0286)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9635 (0.9017)  acc1: 82.4000 (84.3360)  acc5: 96.4000 (96.8480)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6874 s / it)
* Acc@1 84.602 Acc@5 96.988 loss 0.889
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.61%
Epoch: [268]  [   0/1251]  eta: 1:34:28  lr: 0.000113  min_lr: 0.000113  loss: 2.5878 (2.5878)  weight_decay: 0.0500 (0.0500)  time: 4.5310  data: 3.0249  max mem: 69511
Epoch: [268]  [ 200/1251]  eta: 0:14:07  lr: 0.000111  min_lr: 0.000111  loss: 2.6392 (2.5580)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0965 (1.1392)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [268]  [ 400/1251]  eta: 0:11:18  lr: 0.000110  min_lr: 0.000110  loss: 2.8072 (2.5785)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1090 (1.1453)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [268]  [ 600/1251]  eta: 0:08:36  lr: 0.000109  min_lr: 0.000109  loss: 2.4774 (2.5609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1174 (1.1617)  time: 0.7892  data: 0.0005  max mem: 69511
Epoch: [268]  [ 800/1251]  eta: 0:05:57  lr: 0.000108  min_lr: 0.000108  loss: 2.7277 (2.5666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1289 (1.1683)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [268]  [1000/1251]  eta: 0:03:18  lr: 0.000107  min_lr: 0.000107  loss: 2.3446 (2.5621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1073 (1.1616)  time: 0.7833  data: 0.0004  max mem: 69511
Epoch: [268]  [1200/1251]  eta: 0:00:40  lr: 0.000106  min_lr: 0.000106  loss: 2.5778 (2.5594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0519 (1.1568)  time: 0.7906  data: 0.0005  max mem: 69511
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.6038 (2.5610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0356 (1.1537)  time: 0.6673  data: 0.0006  max mem: 69511
Epoch: [268] Total time: 0:16:26 (0.7889 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.6038 (2.5723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0356 (1.1537)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.5929 (0.5929)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.8827  data: 5.4371  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7607 (0.7327)  acc1: 87.2000 (87.8182)  acc5: 97.6000 (98.0000)  time: 1.0230  data: 0.6223  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8726 (0.8511)  acc1: 83.2000 (84.9905)  acc5: 97.2000 (97.1810)  time: 0.4667  data: 0.0704  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9156 (0.8665)  acc1: 82.8000 (84.5440)  acc5: 96.4000 (96.9760)  time: 0.3964  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6764 s / it)
* Acc@1 84.642 Acc@5 97.082 loss 0.856
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.64%
Epoch: [269]  [   0/1251]  eta: 1:22:57  lr: 0.000106  min_lr: 0.000106  loss: 1.8838 (1.8838)  weight_decay: 0.0500 (0.0500)  time: 3.9787  data: 3.1860  max mem: 69511
Epoch: [269]  [ 200/1251]  eta: 0:14:03  lr: 0.000105  min_lr: 0.000105  loss: 2.4873 (2.5621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1472 (1.1398)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [269]  [ 400/1251]  eta: 0:11:16  lr: 0.000104  min_lr: 0.000104  loss: 2.7169 (2.5677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1883 (1.1439)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [269]  [ 600/1251]  eta: 0:08:36  lr: 0.000103  min_lr: 0.000103  loss: 2.8230 (2.5755)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1430 (1.1659)  time: 0.7920  data: 0.0005  max mem: 69511
Epoch: [269]  [ 800/1251]  eta: 0:05:56  lr: 0.000102  min_lr: 0.000102  loss: 2.4386 (2.5872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1261 (1.1636)  time: 0.7893  data: 0.0005  max mem: 69511
Epoch: [269]  [1000/1251]  eta: 0:03:18  lr: 0.000100  min_lr: 0.000100  loss: 2.5025 (2.5914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0976 (1.1577)  time: 0.7871  data: 0.0005  max mem: 69511
Epoch: [269]  [1200/1251]  eta: 0:00:40  lr: 0.000099  min_lr: 0.000099  loss: 2.6652 (2.5875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.1501)  time: 0.7829  data: 0.0006  max mem: 69511
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.5968 (2.5881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0618 (1.1484)  time: 0.6641  data: 0.0005  max mem: 69511
Epoch: [269] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.5968 (2.5668)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0618 (1.1484)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.5980 (0.5980)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.2739  data: 6.8442  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7768 (0.7467)  acc1: 87.2000 (87.7455)  acc5: 97.6000 (98.0364)  time: 1.0192  data: 0.6225  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8736 (0.8696)  acc1: 84.0000 (85.0286)  acc5: 97.2000 (97.1429)  time: 0.3937  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9577 (0.8829)  acc1: 83.2000 (84.5920)  acc5: 96.4000 (96.8960)  time: 0.3936  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6803 s / it)
* Acc@1 84.784 Acc@5 97.038 loss 0.871
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.78%
Epoch: [270]  [   0/1251]  eta: 1:35:34  lr: 0.000099  min_lr: 0.000099  loss: 2.5820 (2.5820)  weight_decay: 0.0500 (0.0500)  time: 4.5842  data: 3.8065  max mem: 69511
Epoch: [270]  [ 200/1251]  eta: 0:14:04  lr: 0.000098  min_lr: 0.000098  loss: 2.7699 (2.5814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (1.1909)  time: 0.7836  data: 0.0005  max mem: 69511
Epoch: [270]  [ 400/1251]  eta: 0:11:17  lr: 0.000097  min_lr: 0.000097  loss: 2.6854 (2.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0680 (1.1723)  time: 0.7855  data: 0.0005  max mem: 69511
Epoch: [270]  [ 600/1251]  eta: 0:08:35  lr: 0.000096  min_lr: 0.000096  loss: 2.6890 (2.5541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (1.1514)  time: 0.7821  data: 0.0004  max mem: 69511
Epoch: [270]  [ 800/1251]  eta: 0:05:56  lr: 0.000095  min_lr: 0.000095  loss: 2.6595 (2.5553)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0758 (1.1406)  time: 0.7881  data: 0.0005  max mem: 69511
Epoch: [270]  [1000/1251]  eta: 0:03:18  lr: 0.000094  min_lr: 0.000094  loss: 2.6358 (2.5569)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1521 (1.1357)  time: 0.7836  data: 0.0005  max mem: 69511
Epoch: [270]  [1200/1251]  eta: 0:00:40  lr: 0.000093  min_lr: 0.000093  loss: 2.7737 (2.5532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0990 (1.1397)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000093  min_lr: 0.000093  loss: 2.5251 (2.5519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0782 (1.1385)  time: 0.6719  data: 0.0006  max mem: 69511
Epoch: [270] Total time: 0:16:25 (0.7874 s / it)
Averaged stats: lr: 0.000093  min_lr: 0.000093  loss: 2.5251 (2.5578)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0782 (1.1385)
Test:  [ 0/25]  eta: 0:02:52  loss: 0.6374 (0.6374)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 6.8983  data: 6.4557  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8025 (0.7776)  acc1: 86.8000 (87.7091)  acc5: 98.0000 (98.0364)  time: 0.9883  data: 0.5872  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9039 (0.8936)  acc1: 84.0000 (85.1619)  acc5: 96.8000 (96.9524)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9780 (0.9077)  acc1: 83.6000 (84.7040)  acc5: 96.0000 (96.7680)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6628 s / it)
* Acc@1 84.720 Acc@5 97.008 loss 0.895
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.78%
Epoch: [271]  [   0/1251]  eta: 1:26:38  lr: 0.000093  min_lr: 0.000093  loss: 2.7779 (2.7779)  weight_decay: 0.0500 (0.0500)  time: 4.1559  data: 1.9730  max mem: 69511
Epoch: [271]  [ 200/1251]  eta: 0:14:06  lr: 0.000092  min_lr: 0.000092  loss: 2.7302 (2.5590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1031 (1.1478)  time: 0.7946  data: 0.0004  max mem: 69511
Epoch: [271]  [ 400/1251]  eta: 0:11:17  lr: 0.000091  min_lr: 0.000091  loss: 2.6551 (2.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1715 (1.1471)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [271]  [ 600/1251]  eta: 0:08:36  lr: 0.000090  min_lr: 0.000090  loss: 2.5655 (2.5569)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1706 (1.1732)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [271]  [ 800/1251]  eta: 0:05:57  lr: 0.000089  min_lr: 0.000089  loss: 2.6070 (2.5591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0662 (1.1627)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [271]  [1000/1251]  eta: 0:03:18  lr: 0.000088  min_lr: 0.000088  loss: 2.6523 (2.5645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1228 (1.1540)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [271]  [1200/1251]  eta: 0:00:40  lr: 0.000087  min_lr: 0.000087  loss: 2.6428 (2.5681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0399 (1.1466)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000087  min_lr: 0.000087  loss: 2.5204 (2.5691)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0979 (1.1478)  time: 0.6700  data: 0.0005  max mem: 69511
Epoch: [271] Total time: 0:16:27 (0.7895 s / it)
Averaged stats: lr: 0.000087  min_lr: 0.000087  loss: 2.5204 (2.5676)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0979 (1.1478)
Test:  [ 0/25]  eta: 0:03:05  loss: 0.6553 (0.6553)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 7.4228  data: 7.0133  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8121 (0.7923)  acc1: 86.8000 (87.6364)  acc5: 98.0000 (98.0364)  time: 1.0368  data: 0.6379  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9201 (0.9179)  acc1: 83.6000 (84.8571)  acc5: 97.2000 (97.1810)  time: 0.3976  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0120 (0.9322)  acc1: 83.2000 (84.5280)  acc5: 96.4000 (96.9760)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6847 s / it)
* Acc@1 84.700 Acc@5 97.010 loss 0.920
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.78%
Epoch: [272]  [   0/1251]  eta: 1:40:16  lr: 0.000087  min_lr: 0.000087  loss: 3.0348 (3.0348)  weight_decay: 0.0500 (0.0500)  time: 4.8092  data: 3.9700  max mem: 69511
Epoch: [272]  [ 200/1251]  eta: 0:14:06  lr: 0.000086  min_lr: 0.000086  loss: 2.5031 (2.5440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1696 (1.1529)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [272]  [ 400/1251]  eta: 0:11:18  lr: 0.000085  min_lr: 0.000085  loss: 2.7461 (2.5510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1820 (1.1353)  time: 0.7932  data: 0.0005  max mem: 69511
Epoch: [272]  [ 600/1251]  eta: 0:08:37  lr: 0.000084  min_lr: 0.000084  loss: 2.7365 (2.5610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1114 (1.1515)  time: 0.7886  data: 0.0004  max mem: 69511
Epoch: [272]  [ 800/1251]  eta: 0:05:57  lr: 0.000083  min_lr: 0.000083  loss: 2.4871 (2.5579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1366 (1.1435)  time: 0.7825  data: 0.0005  max mem: 69511
Epoch: [272]  [1000/1251]  eta: 0:03:18  lr: 0.000082  min_lr: 0.000082  loss: 2.7065 (2.5568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1067 (1.1411)  time: 0.7838  data: 0.0004  max mem: 69511
Epoch: [272]  [1200/1251]  eta: 0:00:40  lr: 0.000081  min_lr: 0.000081  loss: 2.5613 (2.5492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1511 (1.1425)  time: 0.7851  data: 0.0004  max mem: 69511
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000081  min_lr: 0.000081  loss: 2.7750 (2.5519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0808 (1.1438)  time: 0.6668  data: 0.0006  max mem: 69511
Epoch: [272] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.000081  min_lr: 0.000081  loss: 2.7750 (2.5592)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0808 (1.1438)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.6734 (0.6734)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 6.0118  data: 5.5828  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8330 (0.8131)  acc1: 86.8000 (87.6727)  acc5: 97.6000 (98.0364)  time: 1.0149  data: 0.6093  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9581 (0.9370)  acc1: 84.0000 (84.8952)  acc5: 97.2000 (97.0667)  time: 0.4590  data: 0.0560  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0326 (0.9514)  acc1: 82.8000 (84.5440)  acc5: 96.4000 (96.8800)  time: 0.4032  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6766 s / it)
* Acc@1 84.676 Acc@5 97.006 loss 0.939
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.78%
Epoch: [273]  [   0/1251]  eta: 1:37:38  lr: 0.000081  min_lr: 0.000081  loss: 2.5535 (2.5535)  weight_decay: 0.0500 (0.0500)  time: 4.6828  data: 3.0692  max mem: 69511
Epoch: [273]  [ 200/1251]  eta: 0:14:05  lr: 0.000080  min_lr: 0.000080  loss: 2.7357 (2.5456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1896 (1.1577)  time: 0.7822  data: 0.0005  max mem: 69511
Epoch: [273]  [ 400/1251]  eta: 0:11:17  lr: 0.000079  min_lr: 0.000079  loss: 2.6966 (2.5376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1267 (1.1489)  time: 0.7932  data: 0.0004  max mem: 69511
Epoch: [273]  [ 600/1251]  eta: 0:08:36  lr: 0.000078  min_lr: 0.000078  loss: 2.6052 (2.5437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1457 (1.1656)  time: 0.7859  data: 0.0004  max mem: 69511
Epoch: [273]  [ 800/1251]  eta: 0:05:57  lr: 0.000077  min_lr: 0.000077  loss: 2.6813 (2.5547)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3035 (1.1856)  time: 0.7989  data: 0.0005  max mem: 69511
Epoch: [273]  [1000/1251]  eta: 0:03:18  lr: 0.000076  min_lr: 0.000076  loss: 2.6137 (2.5567)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0819 (1.1749)  time: 0.7923  data: 0.0005  max mem: 69511
Epoch: [273]  [1200/1251]  eta: 0:00:40  lr: 0.000075  min_lr: 0.000075  loss: 2.7008 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0774 (1.1672)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000075  min_lr: 0.000075  loss: 2.5576 (2.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1603 (1.1660)  time: 0.6699  data: 0.0005  max mem: 69511
Epoch: [273] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.000075  min_lr: 0.000075  loss: 2.5576 (2.5558)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1603 (1.1660)
Test:  [ 0/25]  eta: 0:03:00  loss: 0.6526 (0.6526)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.2224  data: 6.7840  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7993 (0.7836)  acc1: 87.2000 (87.5273)  acc5: 97.6000 (97.9273)  time: 1.0175  data: 0.6171  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9217 (0.9046)  acc1: 82.8000 (84.7810)  acc5: 97.2000 (97.0857)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0050 (0.9201)  acc1: 82.4000 (84.3360)  acc5: 96.4000 (96.9120)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6756 s / it)
* Acc@1 84.668 Acc@5 97.042 loss 0.907
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.78%
Epoch: [274]  [   0/1251]  eta: 1:38:50  lr: 0.000075  min_lr: 0.000075  loss: 2.1175 (2.1175)  weight_decay: 0.0500 (0.0500)  time: 4.7410  data: 3.5841  max mem: 69511
Epoch: [274]  [ 200/1251]  eta: 0:14:08  lr: 0.000074  min_lr: 0.000074  loss: 2.7570 (2.5663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.1457)  time: 0.7824  data: 0.0005  max mem: 69511
Epoch: [274]  [ 400/1251]  eta: 0:11:17  lr: 0.000073  min_lr: 0.000073  loss: 2.6442 (2.5519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1401 (1.1536)  time: 0.7844  data: 0.0004  max mem: 69511
Epoch: [274]  [ 600/1251]  eta: 0:08:36  lr: 0.000072  min_lr: 0.000072  loss: 2.5687 (2.5577)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1329 (1.1552)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [274]  [ 800/1251]  eta: 0:05:57  lr: 0.000071  min_lr: 0.000071  loss: 2.6296 (2.5651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1589 (1.1551)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [274]  [1000/1251]  eta: 0:03:18  lr: 0.000070  min_lr: 0.000070  loss: 2.6020 (2.5588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0883 (1.1549)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [274]  [1200/1251]  eta: 0:00:40  lr: 0.000070  min_lr: 0.000070  loss: 2.7886 (2.5590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0841 (1.1571)  time: 0.7835  data: 0.0005  max mem: 69511
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000069  min_lr: 0.000069  loss: 2.7106 (2.5614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1585 (1.1597)  time: 0.6651  data: 0.0006  max mem: 69511
Epoch: [274] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.000069  min_lr: 0.000069  loss: 2.7106 (2.5586)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1585 (1.1597)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.7108 (0.7108)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 6.0203  data: 5.5901  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8790 (0.8545)  acc1: 86.8000 (87.8545)  acc5: 97.6000 (98.0364)  time: 0.9420  data: 0.5407  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9720 (0.9834)  acc1: 84.0000 (85.0095)  acc5: 97.2000 (97.1619)  time: 0.4221  data: 0.0179  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0741 (0.9988)  acc1: 82.8000 (84.6720)  acc5: 96.4000 (96.9600)  time: 0.4046  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6466 s / it)
* Acc@1 84.748 Acc@5 97.042 loss 0.986
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.78%
Epoch: [275]  [   0/1251]  eta: 1:40:54  lr: 0.000069  min_lr: 0.000069  loss: 1.8897 (1.8897)  weight_decay: 0.0500 (0.0500)  time: 4.8396  data: 3.1178  max mem: 69511
Epoch: [275]  [ 200/1251]  eta: 0:14:08  lr: 0.000069  min_lr: 0.000069  loss: 2.7733 (2.5345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.1760)  time: 0.7849  data: 0.0004  max mem: 69511
Epoch: [275]  [ 400/1251]  eta: 0:11:18  lr: 0.000068  min_lr: 0.000068  loss: 2.4620 (2.5182)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1784 (1.1772)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [275]  [ 600/1251]  eta: 0:08:36  lr: 0.000067  min_lr: 0.000067  loss: 2.4960 (2.5273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0995 (1.1692)  time: 0.7900  data: 0.0004  max mem: 69511
Epoch: [275]  [ 800/1251]  eta: 0:05:56  lr: 0.000066  min_lr: 0.000066  loss: 2.5879 (2.5324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0601 (1.1545)  time: 0.7915  data: 0.0004  max mem: 69511
Epoch: [275]  [1000/1251]  eta: 0:03:18  lr: 0.000065  min_lr: 0.000065  loss: 2.6432 (2.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1002 (1.1569)  time: 0.7852  data: 0.0004  max mem: 69511
Epoch: [275]  [1200/1251]  eta: 0:00:40  lr: 0.000064  min_lr: 0.000064  loss: 2.6525 (2.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1482 (1.1588)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000064  min_lr: 0.000064  loss: 2.7902 (2.5398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0937 (1.1572)  time: 0.6717  data: 0.0005  max mem: 69511
Epoch: [275] Total time: 0:16:26 (0.7887 s / it)
Averaged stats: lr: 0.000064  min_lr: 0.000064  loss: 2.7902 (2.5513)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0937 (1.1572)
Test:  [ 0/25]  eta: 0:03:13  loss: 0.6206 (0.6206)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 7.7570  data: 7.3426  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7810 (0.7587)  acc1: 87.2000 (87.6727)  acc5: 97.6000 (98.0364)  time: 1.0656  data: 0.6678  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9024 (0.8805)  acc1: 84.0000 (84.9905)  acc5: 97.2000 (97.1429)  time: 0.3962  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9614 (0.8939)  acc1: 83.2000 (84.6400)  acc5: 96.4000 (96.9120)  time: 0.3960  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6964 s / it)
* Acc@1 84.734 Acc@5 97.034 loss 0.883
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.78%
Epoch: [276]  [   0/1251]  eta: 1:38:00  lr: 0.000064  min_lr: 0.000064  loss: 2.6528 (2.6528)  weight_decay: 0.0500 (0.0500)  time: 4.7004  data: 2.1440  max mem: 69511
Epoch: [276]  [ 200/1251]  eta: 0:14:03  lr: 0.000063  min_lr: 0.000063  loss: 2.7334 (2.5692)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1973 (1.2142)  time: 0.7830  data: 0.0006  max mem: 69511
Epoch: [276]  [ 400/1251]  eta: 0:11:16  lr: 0.000062  min_lr: 0.000062  loss: 2.6084 (2.5610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1608 (1.2266)  time: 0.7851  data: 0.0006  max mem: 69511
Epoch: [276]  [ 600/1251]  eta: 0:08:35  lr: 0.000062  min_lr: 0.000062  loss: 2.6428 (2.5567)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0355 (1.2004)  time: 0.7857  data: 0.0006  max mem: 69511
Epoch: [276]  [ 800/1251]  eta: 0:05:56  lr: 0.000061  min_lr: 0.000061  loss: 2.6440 (2.5581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0624 (1.1811)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [276]  [1000/1251]  eta: 0:03:18  lr: 0.000060  min_lr: 0.000060  loss: 2.6127 (2.5579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1679 (1.1836)  time: 0.7853  data: 0.0004  max mem: 69511
Epoch: [276]  [1200/1251]  eta: 0:00:40  lr: 0.000059  min_lr: 0.000059  loss: 2.6449 (2.5622)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1284 (1.1853)  time: 0.7828  data: 0.0005  max mem: 69511
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000059  min_lr: 0.000059  loss: 2.5882 (2.5611)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0705 (1.1828)  time: 0.6654  data: 0.0005  max mem: 69511
Epoch: [276] Total time: 0:16:25 (0.7877 s / it)
Averaged stats: lr: 0.000059  min_lr: 0.000059  loss: 2.5882 (2.5478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0705 (1.1828)
Test:  [ 0/25]  eta: 0:03:10  loss: 0.5843 (0.5843)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 7.6201  data: 7.1988  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7547 (0.7229)  acc1: 86.4000 (87.4545)  acc5: 98.0000 (98.1091)  time: 1.0522  data: 0.6547  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8711 (0.8444)  acc1: 83.2000 (84.8191)  acc5: 97.2000 (97.0857)  time: 0.3957  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9292 (0.8582)  acc1: 83.2000 (84.4800)  acc5: 96.4000 (96.8960)  time: 0.3959  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6912 s / it)
* Acc@1 84.752 Acc@5 97.020 loss 0.846
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.78%
Epoch: [277]  [   0/1251]  eta: 1:42:33  lr: 0.000059  min_lr: 0.000059  loss: 1.6118 (1.6118)  weight_decay: 0.0500 (0.0500)  time: 4.9190  data: 3.5414  max mem: 69511
Epoch: [277]  [ 200/1251]  eta: 0:14:07  lr: 0.000058  min_lr: 0.000058  loss: 2.6831 (2.5161)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1248 (1.1947)  time: 0.7853  data: 0.0005  max mem: 69511
Epoch: [277]  [ 400/1251]  eta: 0:11:18  lr: 0.000057  min_lr: 0.000057  loss: 2.3288 (2.5265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1292 (1.1643)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [277]  [ 600/1251]  eta: 0:08:36  lr: 0.000057  min_lr: 0.000057  loss: 2.7285 (2.5451)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1505 (1.1675)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [277]  [ 800/1251]  eta: 0:05:57  lr: 0.000056  min_lr: 0.000056  loss: 2.6556 (2.5480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1519 (1.1599)  time: 0.7886  data: 0.0004  max mem: 69511
Epoch: [277]  [1000/1251]  eta: 0:03:18  lr: 0.000055  min_lr: 0.000055  loss: 2.4735 (2.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1772 (1.1596)  time: 0.7857  data: 0.0005  max mem: 69511
Epoch: [277]  [1200/1251]  eta: 0:00:40  lr: 0.000054  min_lr: 0.000054  loss: 2.6942 (2.5487)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1075 (1.1596)  time: 0.7856  data: 0.0005  max mem: 69511
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000054  min_lr: 0.000054  loss: 2.6110 (2.5498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.1602)  time: 0.6679  data: 0.0007  max mem: 69511
Epoch: [277] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.000054  min_lr: 0.000054  loss: 2.6110 (2.5459)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.1602)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7107 (0.7107)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.5879  data: 7.1533  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8474 (0.8410)  acc1: 87.2000 (87.9273)  acc5: 97.6000 (98.1091)  time: 1.0508  data: 0.6506  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9866 (0.9625)  acc1: 83.6000 (85.3333)  acc5: 97.2000 (97.1238)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0478 (0.9767)  acc1: 83.2000 (84.8640)  acc5: 96.4000 (96.9280)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6915 s / it)
* Acc@1 84.764 Acc@5 97.052 loss 0.967
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.78%
Epoch: [278]  [   0/1251]  eta: 1:35:41  lr: 0.000054  min_lr: 0.000054  loss: 2.7651 (2.7651)  weight_decay: 0.0500 (0.0500)  time: 4.5894  data: 2.2857  max mem: 69511
Epoch: [278]  [ 200/1251]  eta: 0:14:08  lr: 0.000053  min_lr: 0.000053  loss: 2.5847 (2.5690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0981 (1.1777)  time: 0.7944  data: 0.0004  max mem: 69511
Epoch: [278]  [ 400/1251]  eta: 0:11:18  lr: 0.000053  min_lr: 0.000053  loss: 2.5815 (2.5470)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1324 (1.1688)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [278]  [ 600/1251]  eta: 0:08:36  lr: 0.000052  min_lr: 0.000052  loss: 2.6723 (2.5607)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1504 (1.1686)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [278]  [ 800/1251]  eta: 0:05:57  lr: 0.000051  min_lr: 0.000051  loss: 2.5439 (2.5641)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0942 (1.1651)  time: 0.7942  data: 0.0005  max mem: 69511
Epoch: [278]  [1000/1251]  eta: 0:03:18  lr: 0.000050  min_lr: 0.000050  loss: 2.5067 (2.5752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0439 (inf)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [278]  [1200/1251]  eta: 0:00:40  lr: 0.000050  min_lr: 0.000050  loss: 2.6897 (2.5651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1704 (inf)  time: 0.7997  data: 0.0006  max mem: 69511
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000049  min_lr: 0.000049  loss: 2.6297 (2.5637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1336 (inf)  time: 0.6671  data: 0.0005  max mem: 69511
Epoch: [278] Total time: 0:16:27 (0.7897 s / it)
Averaged stats: lr: 0.000049  min_lr: 0.000049  loss: 2.6297 (2.5523)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1336 (inf)
Test:  [ 0/25]  eta: 0:02:57  loss: 0.7097 (0.7097)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 7.1095  data: 6.6778  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8566 (0.8521)  acc1: 86.8000 (87.6727)  acc5: 97.6000 (98.1455)  time: 1.0072  data: 0.6074  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9758 (0.9756)  acc1: 84.0000 (85.1238)  acc5: 97.2000 (97.1048)  time: 0.3968  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0639 (0.9900)  acc1: 83.6000 (84.7200)  acc5: 96.4000 (96.9280)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6713 s / it)
* Acc@1 84.834 Acc@5 97.032 loss 0.979
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [279]  [   0/1251]  eta: 1:33:28  lr: 0.000049  min_lr: 0.000049  loss: 2.8352 (2.8352)  weight_decay: 0.0500 (0.0500)  time: 4.4829  data: 3.6867  max mem: 69511
Epoch: [279]  [ 200/1251]  eta: 0:14:06  lr: 0.000049  min_lr: 0.000049  loss: 2.1770 (2.5420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1074 (1.1513)  time: 0.7858  data: 0.0007  max mem: 69511
Epoch: [279]  [ 400/1251]  eta: 0:11:17  lr: 0.000048  min_lr: 0.000048  loss: 2.6998 (2.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0891 (1.1508)  time: 0.7864  data: 0.0007  max mem: 69511
Epoch: [279]  [ 600/1251]  eta: 0:08:37  lr: 0.000047  min_lr: 0.000047  loss: 2.7529 (2.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2361 (1.1818)  time: 0.7890  data: 0.0007  max mem: 69511
Epoch: [279]  [ 800/1251]  eta: 0:05:57  lr: 0.000046  min_lr: 0.000046  loss: 2.6223 (2.5597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1621 (1.1848)  time: 0.7861  data: 0.0007  max mem: 69511
Epoch: [279]  [1000/1251]  eta: 0:03:18  lr: 0.000046  min_lr: 0.000046  loss: 2.5056 (2.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (1.1859)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [279]  [1200/1251]  eta: 0:00:40  lr: 0.000045  min_lr: 0.000045  loss: 2.4864 (2.5471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1176 (1.1822)  time: 0.7887  data: 0.0007  max mem: 69511
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000045  min_lr: 0.000045  loss: 2.8768 (2.5499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (1.1834)  time: 0.6689  data: 0.0005  max mem: 69511
Epoch: [279] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.000045  min_lr: 0.000045  loss: 2.8768 (2.5435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (1.1834)
Test:  [ 0/25]  eta: 0:02:55  loss: 0.6900 (0.6900)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 7.0379  data: 6.6149  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8288 (0.8249)  acc1: 86.8000 (87.8545)  acc5: 98.0000 (98.1091)  time: 1.0013  data: 0.6017  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9709 (0.9496)  acc1: 83.6000 (85.2381)  acc5: 97.2000 (97.2191)  time: 0.3976  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0474 (0.9641)  acc1: 83.2000 (84.8000)  acc5: 96.4000 (96.9920)  time: 0.3975  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6709 s / it)
* Acc@1 84.690 Acc@5 97.048 loss 0.955
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.83%
Epoch: [280]  [   0/1251]  eta: 1:34:29  lr: 0.000045  min_lr: 0.000045  loss: 2.6393 (2.6393)  weight_decay: 0.0500 (0.0500)  time: 4.5317  data: 3.7280  max mem: 69511
Epoch: [280]  [ 200/1251]  eta: 0:14:05  lr: 0.000044  min_lr: 0.000044  loss: 2.6691 (2.5153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1884 (1.2439)  time: 0.7849  data: 0.0006  max mem: 69511
Epoch: [280]  [ 400/1251]  eta: 0:11:17  lr: 0.000043  min_lr: 0.000043  loss: 2.6175 (2.5179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1867 (1.1991)  time: 0.7860  data: 0.0006  max mem: 69511
Epoch: [280]  [ 600/1251]  eta: 0:08:36  lr: 0.000043  min_lr: 0.000043  loss: 2.7019 (2.5269)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1363 (1.1768)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [280]  [ 800/1251]  eta: 0:05:56  lr: 0.000042  min_lr: 0.000042  loss: 2.5352 (2.5289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1304 (1.1830)  time: 0.7820  data: 0.0007  max mem: 69511
Epoch: [280]  [1000/1251]  eta: 0:03:18  lr: 0.000041  min_lr: 0.000041  loss: 2.6341 (2.5394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0666 (1.1719)  time: 0.7898  data: 0.0006  max mem: 69511
Epoch: [280]  [1200/1251]  eta: 0:00:40  lr: 0.000041  min_lr: 0.000041  loss: 2.6778 (2.5443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0773 (1.1550)  time: 0.7822  data: 0.0008  max mem: 69511
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000041  min_lr: 0.000041  loss: 2.6586 (2.5447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1862 (inf)  time: 0.6647  data: 0.0005  max mem: 69511
Epoch: [280] Total time: 0:16:25 (0.7876 s / it)
Averaged stats: lr: 0.000041  min_lr: 0.000041  loss: 2.6586 (2.5445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1862 (inf)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.6772 (0.6772)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.2713  data: 6.8534  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8260 (0.8112)  acc1: 87.6000 (87.6364)  acc5: 98.0000 (98.1091)  time: 1.0206  data: 0.6233  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9595 (0.9361)  acc1: 84.0000 (84.9714)  acc5: 97.2000 (97.2191)  time: 0.3957  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0287 (0.9511)  acc1: 82.4000 (84.6240)  acc5: 96.4000 (96.9600)  time: 0.3957  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6768 s / it)
* Acc@1 84.746 Acc@5 97.008 loss 0.941
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.83%
Epoch: [281]  [   0/1251]  eta: 1:43:21  lr: 0.000041  min_lr: 0.000041  loss: 2.7723 (2.7723)  weight_decay: 0.0500 (0.0500)  time: 4.9574  data: 4.1724  max mem: 69511
Epoch: [281]  [ 200/1251]  eta: 0:14:07  lr: 0.000040  min_lr: 0.000040  loss: 2.6171 (2.5325)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0869 (1.1613)  time: 0.7947  data: 0.0008  max mem: 69511
Epoch: [281]  [ 400/1251]  eta: 0:11:19  lr: 0.000039  min_lr: 0.000039  loss: 2.3950 (2.5379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1287 (1.1728)  time: 0.7856  data: 0.0007  max mem: 69511
Epoch: [281]  [ 600/1251]  eta: 0:08:37  lr: 0.000039  min_lr: 0.000039  loss: 2.6020 (2.5518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (1.1669)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [281]  [ 800/1251]  eta: 0:05:57  lr: 0.000038  min_lr: 0.000038  loss: 2.4150 (2.5411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1493 (1.1598)  time: 0.7821  data: 0.0008  max mem: 69511
Epoch: [281]  [1000/1251]  eta: 0:03:18  lr: 0.000037  min_lr: 0.000037  loss: 2.5480 (2.5436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0527 (1.1567)  time: 0.7838  data: 0.0007  max mem: 69511
Epoch: [281]  [1200/1251]  eta: 0:00:40  lr: 0.000037  min_lr: 0.000037  loss: 2.6239 (2.5506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0707 (1.1497)  time: 0.7828  data: 0.0005  max mem: 69511
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.6064 (2.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1240 (1.1496)  time: 0.6654  data: 0.0007  max mem: 69511
Epoch: [281] Total time: 0:16:25 (0.7881 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.6064 (2.5400)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1240 (1.1496)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6538 (0.6538)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.4635  data: 7.0501  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7957 (0.7849)  acc1: 86.8000 (87.8545)  acc5: 97.6000 (98.0364)  time: 1.0381  data: 0.6412  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9382 (0.9047)  acc1: 84.4000 (85.2381)  acc5: 97.2000 (97.1619)  time: 0.3953  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9796 (0.9188)  acc1: 83.2000 (84.7840)  acc5: 96.8000 (96.9280)  time: 0.3952  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6871 s / it)
* Acc@1 84.764 Acc@5 97.032 loss 0.910
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [282]  [   0/1251]  eta: 1:40:26  lr: 0.000037  min_lr: 0.000037  loss: 2.9157 (2.9157)  weight_decay: 0.0500 (0.0500)  time: 4.8172  data: 3.3407  max mem: 69511
Epoch: [282]  [ 200/1251]  eta: 0:14:08  lr: 0.000036  min_lr: 0.000036  loss: 2.5092 (2.4850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1300 (1.2130)  time: 0.7929  data: 0.0005  max mem: 69511
Epoch: [282]  [ 400/1251]  eta: 0:11:18  lr: 0.000035  min_lr: 0.000035  loss: 2.7462 (2.5326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1265 (1.1925)  time: 0.7974  data: 0.0006  max mem: 69511
Epoch: [282]  [ 600/1251]  eta: 0:08:36  lr: 0.000035  min_lr: 0.000035  loss: 2.6978 (2.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1028 (1.1802)  time: 0.7927  data: 0.0006  max mem: 69511
Epoch: [282]  [ 800/1251]  eta: 0:05:57  lr: 0.000034  min_lr: 0.000034  loss: 2.7336 (2.5478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1119 (1.1770)  time: 0.7889  data: 0.0005  max mem: 69511
Epoch: [282]  [1000/1251]  eta: 0:03:18  lr: 0.000033  min_lr: 0.000033  loss: 2.4831 (2.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1528 (1.1867)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [282]  [1200/1251]  eta: 0:00:40  lr: 0.000033  min_lr: 0.000033  loss: 2.7626 (2.5473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (1.1899)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.6045 (2.5491)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (1.1916)  time: 0.6683  data: 0.0007  max mem: 69511
Epoch: [282] Total time: 0:16:27 (0.7891 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.6045 (2.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (1.1916)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.6903 (0.6903)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.3387  data: 6.8992  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8345 (0.8317)  acc1: 86.4000 (87.7091)  acc5: 98.0000 (98.1091)  time: 1.0283  data: 0.6275  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9767 (0.9586)  acc1: 84.4000 (85.0476)  acc5: 97.2000 (97.1238)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0539 (0.9742)  acc1: 82.8000 (84.6880)  acc5: 96.4000 (96.9280)  time: 0.3969  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6800 s / it)
* Acc@1 84.740 Acc@5 97.044 loss 0.964
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.83%
Epoch: [283]  [   0/1251]  eta: 1:40:56  lr: 0.000033  min_lr: 0.000033  loss: 2.4002 (2.4002)  weight_decay: 0.0500 (0.0500)  time: 4.8411  data: 3.2659  max mem: 69511
Epoch: [283]  [ 200/1251]  eta: 0:14:08  lr: 0.000032  min_lr: 0.000032  loss: 2.4863 (2.5177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1334 (1.1660)  time: 0.7826  data: 0.0004  max mem: 69511
Epoch: [283]  [ 400/1251]  eta: 0:11:16  lr: 0.000032  min_lr: 0.000032  loss: 2.6078 (2.5286)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0630 (1.1751)  time: 0.7831  data: 0.0005  max mem: 69511
Epoch: [283]  [ 600/1251]  eta: 0:08:36  lr: 0.000031  min_lr: 0.000031  loss: 2.5866 (2.5449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0583 (1.1643)  time: 0.7910  data: 0.0006  max mem: 69511
Epoch: [283]  [ 800/1251]  eta: 0:05:56  lr: 0.000030  min_lr: 0.000030  loss: 2.7414 (2.5431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0640 (1.1558)  time: 0.7895  data: 0.0004  max mem: 69511
Epoch: [283]  [1000/1251]  eta: 0:03:18  lr: 0.000030  min_lr: 0.000030  loss: 2.5615 (2.5367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2015 (1.1563)  time: 0.7940  data: 0.0004  max mem: 69511
Epoch: [283]  [1200/1251]  eta: 0:00:40  lr: 0.000029  min_lr: 0.000029  loss: 2.5451 (2.5368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1039 (1.1620)  time: 0.7857  data: 0.0004  max mem: 69511
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.6846 (2.5360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0639 (1.1600)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [283] Total time: 0:16:25 (0.7878 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.6846 (2.5465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0639 (1.1600)
Test:  [ 0/25]  eta: 0:03:14  loss: 0.6321 (0.6321)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 7.7638  data: 7.3489  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7807 (0.7651)  acc1: 86.4000 (87.7818)  acc5: 98.0000 (98.0364)  time: 1.0668  data: 0.6683  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9129 (0.8848)  acc1: 84.4000 (85.3333)  acc5: 97.2000 (97.0095)  time: 0.3969  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9714 (0.8987)  acc1: 83.2000 (84.8960)  acc5: 96.4000 (96.8160)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6979 s / it)
* Acc@1 84.746 Acc@5 97.022 loss 0.891
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.83%
Epoch: [284]  [   0/1251]  eta: 1:31:09  lr: 0.000029  min_lr: 0.000029  loss: 3.0636 (3.0636)  weight_decay: 0.0500 (0.0500)  time: 4.3721  data: 3.4349  max mem: 69511
Epoch: [284]  [ 200/1251]  eta: 0:14:03  lr: 0.000029  min_lr: 0.000029  loss: 2.6265 (2.5687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1216 (1.1786)  time: 0.7851  data: 0.0005  max mem: 69511
Epoch: [284]  [ 400/1251]  eta: 0:11:16  lr: 0.000028  min_lr: 0.000028  loss: 2.6004 (2.5595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0381 (1.1587)  time: 0.7901  data: 0.0004  max mem: 69511
Epoch: [284]  [ 600/1251]  eta: 0:08:35  lr: 0.000027  min_lr: 0.000027  loss: 2.6214 (2.5640)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1784 (1.1529)  time: 0.7817  data: 0.0004  max mem: 69511
Epoch: [284]  [ 800/1251]  eta: 0:05:55  lr: 0.000027  min_lr: 0.000027  loss: 2.6974 (2.5527)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0421 (1.1454)  time: 0.7840  data: 0.0004  max mem: 69511
Epoch: [284]  [1000/1251]  eta: 0:03:18  lr: 0.000026  min_lr: 0.000026  loss: 2.6606 (2.5427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0185 (1.1409)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [284]  [1200/1251]  eta: 0:00:40  lr: 0.000026  min_lr: 0.000026  loss: 2.7055 (2.5453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1227 (1.1387)  time: 0.7931  data: 0.0004  max mem: 69511
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.5353 (2.5441)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1369 (1.1379)  time: 0.6644  data: 0.0005  max mem: 69511
Epoch: [284] Total time: 0:16:24 (0.7869 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.5353 (2.5385)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1369 (1.1379)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.6936 (0.6936)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 7.8732  data: 7.4537  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8427 (0.8315)  acc1: 87.2000 (87.7455)  acc5: 98.0000 (98.1455)  time: 1.0748  data: 0.6779  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9689 (0.9520)  acc1: 83.6000 (85.0095)  acc5: 97.2000 (97.0857)  time: 0.3949  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0440 (0.9668)  acc1: 83.2000 (84.6240)  acc5: 96.4000 (96.8640)  time: 0.3950  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7075 s / it)
* Acc@1 84.712 Acc@5 96.986 loss 0.957
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.83%
Epoch: [285]  [   0/1251]  eta: 1:38:10  lr: 0.000026  min_lr: 0.000026  loss: 2.6699 (2.6699)  weight_decay: 0.0500 (0.0500)  time: 4.7084  data: 3.2521  max mem: 69511
Epoch: [285]  [ 200/1251]  eta: 0:14:05  lr: 0.000025  min_lr: 0.000025  loss: 2.7274 (2.5499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1925 (1.1579)  time: 0.7820  data: 0.0005  max mem: 69511
Epoch: [285]  [ 400/1251]  eta: 0:11:16  lr: 0.000025  min_lr: 0.000025  loss: 2.6071 (2.5316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0353 (1.1385)  time: 0.7838  data: 0.0004  max mem: 69511
Epoch: [285]  [ 600/1251]  eta: 0:08:35  lr: 0.000024  min_lr: 0.000024  loss: 2.7042 (2.5345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1774 (1.1424)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [285]  [ 800/1251]  eta: 0:05:56  lr: 0.000024  min_lr: 0.000024  loss: 2.4518 (2.5428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0801 (1.1451)  time: 0.7933  data: 0.0005  max mem: 69511
Epoch: [285]  [1000/1251]  eta: 0:03:18  lr: 0.000023  min_lr: 0.000023  loss: 2.6022 (2.5391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0153 (1.1439)  time: 0.7873  data: 0.0006  max mem: 69511
Epoch: [285]  [1200/1251]  eta: 0:00:40  lr: 0.000023  min_lr: 0.000023  loss: 2.4877 (2.5470)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1067 (1.1406)  time: 0.7830  data: 0.0005  max mem: 69511
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000023  min_lr: 0.000023  loss: 2.6788 (2.5473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1383)  time: 0.6655  data: 0.0005  max mem: 69511
Epoch: [285] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.000023  min_lr: 0.000023  loss: 2.6788 (2.5409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1383)
Test:  [ 0/25]  eta: 0:03:10  loss: 0.6948 (0.6948)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.6125  data: 7.1874  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8477 (0.8338)  acc1: 87.2000 (88.0000)  acc5: 98.0000 (98.0000)  time: 1.0504  data: 0.6537  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9750 (0.9545)  acc1: 83.6000 (85.2191)  acc5: 97.2000 (97.1238)  time: 0.3944  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0428 (0.9681)  acc1: 83.2000 (84.8160)  acc5: 96.4000 (96.9760)  time: 0.3945  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6933 s / it)
* Acc@1 84.722 Acc@5 97.018 loss 0.958
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.83%
Epoch: [286]  [   0/1251]  eta: 1:33:49  lr: 0.000023  min_lr: 0.000023  loss: 2.3044 (2.3044)  weight_decay: 0.0500 (0.0500)  time: 4.5004  data: 3.4108  max mem: 69511
Epoch: [286]  [ 200/1251]  eta: 0:14:09  lr: 0.000022  min_lr: 0.000022  loss: 2.4286 (2.5363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0880 (1.1201)  time: 0.7933  data: 0.0005  max mem: 69511
Epoch: [286]  [ 400/1251]  eta: 0:11:18  lr: 0.000022  min_lr: 0.000022  loss: 2.7608 (2.5133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1596 (1.1451)  time: 0.7855  data: 0.0006  max mem: 69511
Epoch: [286]  [ 600/1251]  eta: 0:08:36  lr: 0.000021  min_lr: 0.000021  loss: 2.6381 (2.5296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0335 (1.1404)  time: 0.7863  data: 0.0007  max mem: 69511
Epoch: [286]  [ 800/1251]  eta: 0:05:57  lr: 0.000021  min_lr: 0.000021  loss: 2.5878 (2.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0906 (1.1481)  time: 0.7859  data: 0.0006  max mem: 69511
Epoch: [286]  [1000/1251]  eta: 0:03:18  lr: 0.000020  min_lr: 0.000020  loss: 2.4658 (2.5214)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0753 (1.1488)  time: 0.7891  data: 0.0004  max mem: 69511
Epoch: [286]  [1200/1251]  eta: 0:00:40  lr: 0.000020  min_lr: 0.000020  loss: 2.2974 (2.5159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1352 (1.1428)  time: 0.7912  data: 0.0007  max mem: 69511
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000020  min_lr: 0.000020  loss: 2.6008 (2.5133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1063 (1.1427)  time: 0.6674  data: 0.0007  max mem: 69511
Epoch: [286] Total time: 0:16:27 (0.7892 s / it)
Averaged stats: lr: 0.000020  min_lr: 0.000020  loss: 2.6008 (2.5346)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1063 (1.1427)
Test:  [ 0/25]  eta: 0:03:02  loss: 0.6298 (0.6298)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.3106  data: 6.8682  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7801 (0.7659)  acc1: 87.2000 (87.7091)  acc5: 98.0000 (98.0000)  time: 1.0257  data: 0.6247  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9140 (0.8894)  acc1: 83.6000 (84.9714)  acc5: 97.2000 (97.0476)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9803 (0.9037)  acc1: 82.4000 (84.6080)  acc5: 96.4000 (96.8320)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6786 s / it)
* Acc@1 84.794 Acc@5 97.038 loss 0.894
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [287]  [   0/1251]  eta: 1:35:09  lr: 0.000020  min_lr: 0.000020  loss: 2.4968 (2.4968)  weight_decay: 0.0500 (0.0500)  time: 4.5643  data: 3.1213  max mem: 69511
Epoch: [287]  [ 200/1251]  eta: 0:14:08  lr: 0.000019  min_lr: 0.000019  loss: 2.7102 (2.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2812 (1.2354)  time: 0.7958  data: 0.0006  max mem: 69511
Epoch: [287]  [ 400/1251]  eta: 0:11:17  lr: 0.000019  min_lr: 0.000019  loss: 2.6130 (2.5772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (1.1990)  time: 0.7874  data: 0.0005  max mem: 69511
Epoch: [287]  [ 600/1251]  eta: 0:08:36  lr: 0.000018  min_lr: 0.000018  loss: 2.7903 (2.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0786 (1.1804)  time: 0.7917  data: 0.0005  max mem: 69511
Epoch: [287]  [ 800/1251]  eta: 0:05:56  lr: 0.000018  min_lr: 0.000018  loss: 2.5477 (2.5491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9812 (1.1613)  time: 0.7844  data: 0.0005  max mem: 69511
Epoch: [287]  [1000/1251]  eta: 0:03:18  lr: 0.000017  min_lr: 0.000017  loss: 2.5439 (2.5520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1595 (1.1645)  time: 0.7851  data: 0.0006  max mem: 69511
Epoch: [287]  [1200/1251]  eta: 0:00:40  lr: 0.000017  min_lr: 0.000017  loss: 2.7591 (2.5497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0924 (1.1597)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000017  min_lr: 0.000017  loss: 2.6511 (2.5466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1145 (1.1584)  time: 0.6680  data: 0.0007  max mem: 69511
Epoch: [287] Total time: 0:16:25 (0.7880 s / it)
Averaged stats: lr: 0.000017  min_lr: 0.000017  loss: 2.6511 (2.5375)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1145 (1.1584)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.6316 (0.6316)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 7.8427  data: 7.4148  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7966 (0.7715)  acc1: 87.2000 (87.8182)  acc5: 98.0000 (98.1818)  time: 1.0741  data: 0.6744  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9161 (0.8933)  acc1: 84.0000 (85.1619)  acc5: 97.2000 (97.2381)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9904 (0.9081)  acc1: 82.8000 (84.7680)  acc5: 96.4000 (97.0400)  time: 0.3972  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7007 s / it)
* Acc@1 84.806 Acc@5 97.066 loss 0.898
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [288]  [   0/1251]  eta: 1:34:40  lr: 0.000017  min_lr: 0.000017  loss: 2.4244 (2.4244)  weight_decay: 0.0500 (0.0500)  time: 4.5404  data: 3.0183  max mem: 69511
Epoch: [288]  [ 200/1251]  eta: 0:14:06  lr: 0.000016  min_lr: 0.000016  loss: 2.5124 (2.5308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0068 (1.1359)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [288]  [ 400/1251]  eta: 0:11:17  lr: 0.000016  min_lr: 0.000016  loss: 2.7195 (2.5303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1115 (1.1343)  time: 0.7819  data: 0.0004  max mem: 69511
Epoch: [288]  [ 600/1251]  eta: 0:08:35  lr: 0.000016  min_lr: 0.000016  loss: 2.6883 (2.5228)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0936 (1.1439)  time: 0.7832  data: 0.0005  max mem: 69511
Epoch: [288]  [ 800/1251]  eta: 0:05:56  lr: 0.000015  min_lr: 0.000015  loss: 2.4808 (2.5177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.1346)  time: 0.7938  data: 0.0004  max mem: 69511
Epoch: [288]  [1000/1251]  eta: 0:03:18  lr: 0.000015  min_lr: 0.000015  loss: 2.5483 (2.5162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1229 (1.1455)  time: 0.7953  data: 0.0005  max mem: 69511
Epoch: [288]  [1200/1251]  eta: 0:00:40  lr: 0.000014  min_lr: 0.000014  loss: 2.6587 (2.5149)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0972 (1.1461)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.7104 (2.5195)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1640 (1.1480)  time: 0.6659  data: 0.0007  max mem: 69511
Epoch: [288] Total time: 0:16:26 (0.7884 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.7104 (2.5350)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1640 (1.1480)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.7476 (0.7476)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.8555  data: 7.4350  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8839 (0.8816)  acc1: 87.2000 (87.9273)  acc5: 98.0000 (98.0727)  time: 1.0724  data: 0.6762  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 1.0103 (1.0048)  acc1: 83.6000 (84.9714)  acc5: 97.2000 (97.0857)  time: 0.3941  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.1043 (1.0198)  acc1: 82.4000 (84.5760)  acc5: 96.4000 (96.8640)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7035 s / it)
* Acc@1 84.764 Acc@5 97.024 loss 1.010
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [289]  [   0/1251]  eta: 1:26:38  lr: 0.000014  min_lr: 0.000014  loss: 2.6568 (2.6568)  weight_decay: 0.0500 (0.0500)  time: 4.1552  data: 3.0400  max mem: 69511
Epoch: [289]  [ 200/1251]  eta: 0:14:02  lr: 0.000014  min_lr: 0.000014  loss: 2.6895 (2.4842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1179 (1.1054)  time: 0.7839  data: 0.0006  max mem: 69511
Epoch: [289]  [ 400/1251]  eta: 0:11:16  lr: 0.000014  min_lr: 0.000014  loss: 2.6532 (2.5274)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2156 (1.1385)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [289]  [ 600/1251]  eta: 0:08:35  lr: 0.000013  min_lr: 0.000013  loss: 2.5537 (2.5294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1363 (1.1401)  time: 0.7865  data: 0.0006  max mem: 69511
Epoch: [289]  [ 800/1251]  eta: 0:05:56  lr: 0.000013  min_lr: 0.000013  loss: 2.7995 (2.5300)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1348 (1.1305)  time: 0.7866  data: 0.0005  max mem: 69511
Epoch: [289]  [1000/1251]  eta: 0:03:18  lr: 0.000012  min_lr: 0.000012  loss: 2.4389 (2.5269)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1911 (1.1391)  time: 0.7837  data: 0.0005  max mem: 69511
Epoch: [289]  [1200/1251]  eta: 0:00:40  lr: 0.000012  min_lr: 0.000012  loss: 2.5644 (2.5236)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0621 (1.1419)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000012  min_lr: 0.000012  loss: 2.6700 (2.5259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1408 (1.1420)  time: 0.6672  data: 0.0007  max mem: 69511
Epoch: [289] Total time: 0:16:25 (0.7882 s / it)
Averaged stats: lr: 0.000012  min_lr: 0.000012  loss: 2.6700 (2.5313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1408 (1.1420)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.6958 (0.6958)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 6.1542  data: 5.6903  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.8326 (0.8258)  acc1: 86.8000 (87.8545)  acc5: 98.0000 (98.1818)  time: 0.9989  data: 0.5848  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9710 (0.9517)  acc1: 84.0000 (85.0857)  acc5: 97.2000 (97.1238)  time: 0.4404  data: 0.0371  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0494 (0.9666)  acc1: 82.8000 (84.6880)  acc5: 96.4000 (96.9440)  time: 0.4019  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6678 s / it)
* Acc@1 84.814 Acc@5 97.038 loss 0.956
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [290]  [   0/1251]  eta: 1:42:38  lr: 0.000012  min_lr: 0.000012  loss: 2.5915 (2.5915)  weight_decay: 0.0500 (0.0500)  time: 4.9228  data: 3.2513  max mem: 69511
Epoch: [290]  [ 200/1251]  eta: 0:14:08  lr: 0.000012  min_lr: 0.000012  loss: 2.7093 (2.5217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0890 (1.1747)  time: 0.7816  data: 0.0003  max mem: 69511
Epoch: [290]  [ 400/1251]  eta: 0:11:17  lr: 0.000011  min_lr: 0.000011  loss: 2.5620 (2.5332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0690 (1.1720)  time: 0.7855  data: 0.0004  max mem: 69511
Epoch: [290]  [ 600/1251]  eta: 0:08:36  lr: 0.000011  min_lr: 0.000011  loss: 2.7280 (2.5373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0597 (1.1593)  time: 0.7858  data: 0.0004  max mem: 69511
Epoch: [290]  [ 800/1251]  eta: 0:05:56  lr: 0.000011  min_lr: 0.000011  loss: 2.5027 (2.5366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1126 (1.1675)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [290]  [1000/1251]  eta: 0:03:18  lr: 0.000010  min_lr: 0.000010  loss: 2.7004 (2.5445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1165 (1.1730)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [290]  [1200/1251]  eta: 0:00:40  lr: 0.000010  min_lr: 0.000010  loss: 2.6314 (2.5471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1422 (1.1645)  time: 0.7861  data: 0.0004  max mem: 69511
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000010  min_lr: 0.000010  loss: 2.5713 (2.5461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0639 (1.1648)  time: 0.6695  data: 0.0005  max mem: 69511
Epoch: [290] Total time: 0:16:26 (0.7886 s / it)
Averaged stats: lr: 0.000010  min_lr: 0.000010  loss: 2.5713 (2.5352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0639 (1.1648)
Test:  [ 0/25]  eta: 0:03:06  loss: 0.6485 (0.6485)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.4718  data: 7.0353  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7929 (0.7815)  acc1: 87.2000 (88.0364)  acc5: 98.0000 (98.0727)  time: 1.0373  data: 0.6398  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9302 (0.9012)  acc1: 84.4000 (85.3143)  acc5: 97.2000 (97.0857)  time: 0.3940  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9937 (0.9148)  acc1: 83.6000 (84.9280)  acc5: 96.4000 (96.8800)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6891 s / it)
* Acc@1 84.784 Acc@5 97.030 loss 0.905
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.83%
Epoch: [291]  [   0/1251]  eta: 1:31:32  lr: 0.000010  min_lr: 0.000010  loss: 2.5752 (2.5752)  weight_decay: 0.0500 (0.0500)  time: 4.3902  data: 2.1515  max mem: 69511
Epoch: [291]  [ 200/1251]  eta: 0:14:03  lr: 0.000010  min_lr: 0.000010  loss: 2.6571 (2.5290)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1196 (1.1582)  time: 0.7844  data: 0.0005  max mem: 69511
Epoch: [291]  [ 400/1251]  eta: 0:11:16  lr: 0.000009  min_lr: 0.000009  loss: 2.6288 (2.5432)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0074 (1.1341)  time: 0.7854  data: 0.0005  max mem: 69511
Epoch: [291]  [ 600/1251]  eta: 0:08:36  lr: 0.000009  min_lr: 0.000009  loss: 2.5861 (2.5522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1481 (1.1336)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [291]  [ 800/1251]  eta: 0:05:57  lr: 0.000009  min_lr: 0.000009  loss: 2.5041 (2.5517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0915 (1.1438)  time: 0.7861  data: 0.0005  max mem: 69511
Epoch: [291]  [1000/1251]  eta: 0:03:18  lr: 0.000008  min_lr: 0.000008  loss: 2.6411 (2.5498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1005 (1.1339)  time: 0.7931  data: 0.0005  max mem: 69511
Epoch: [291]  [1200/1251]  eta: 0:00:40  lr: 0.000008  min_lr: 0.000008  loss: 2.7724 (2.5534)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0989 (1.1368)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000008  min_lr: 0.000008  loss: 2.6467 (2.5522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0880 (1.1377)  time: 0.6675  data: 0.0005  max mem: 69511
Epoch: [291] Total time: 0:16:26 (0.7888 s / it)
Averaged stats: lr: 0.000008  min_lr: 0.000008  loss: 2.6467 (2.5363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0880 (1.1377)
Test:  [ 0/25]  eta: 0:02:59  loss: 0.6336 (0.6336)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 7.1730  data: 6.7595  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7777 (0.7639)  acc1: 87.2000 (88.0364)  acc5: 97.6000 (98.1091)  time: 1.0133  data: 0.6148  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9091 (0.8856)  acc1: 84.8000 (85.4095)  acc5: 97.2000 (97.1429)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9755 (0.8998)  acc1: 83.2000 (85.0400)  acc5: 96.8000 (96.9920)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6734 s / it)
* Acc@1 84.844 Acc@5 97.036 loss 0.891
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.84%
Epoch: [292]  [   0/1251]  eta: 1:33:47  lr: 0.000008  min_lr: 0.000008  loss: 2.3946 (2.3946)  weight_decay: 0.0500 (0.0500)  time: 4.4981  data: 3.7148  max mem: 69511
Epoch: [292]  [ 200/1251]  eta: 0:14:05  lr: 0.000008  min_lr: 0.000008  loss: 2.5489 (2.5180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1463 (1.1460)  time: 0.7860  data: 0.0005  max mem: 69511
Epoch: [292]  [ 400/1251]  eta: 0:11:18  lr: 0.000007  min_lr: 0.000007  loss: 2.5823 (2.5006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1404 (1.1469)  time: 0.7947  data: 0.0004  max mem: 69511
Epoch: [292]  [ 600/1251]  eta: 0:08:37  lr: 0.000007  min_lr: 0.000007  loss: 2.3661 (2.4988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0616 (1.1306)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [292]  [ 800/1251]  eta: 0:05:57  lr: 0.000007  min_lr: 0.000007  loss: 2.6429 (2.5073)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.1353)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [292]  [1000/1251]  eta: 0:03:18  lr: 0.000007  min_lr: 0.000007  loss: 2.7324 (2.5113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1416 (1.1450)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [292]  [1200/1251]  eta: 0:00:40  lr: 0.000006  min_lr: 0.000006  loss: 2.6806 (2.5191)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1168 (inf)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.6325 (2.5176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1013 (inf)  time: 0.6676  data: 0.0006  max mem: 69511
Epoch: [292] Total time: 0:16:27 (0.7894 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.6325 (2.5362)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1013 (inf)
Test:  [ 0/25]  eta: 0:03:03  loss: 0.6151 (0.6151)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.3254  data: 6.8904  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.7599 (0.7440)  acc1: 86.8000 (87.9636)  acc5: 97.6000 (98.0000)  time: 1.0271  data: 0.6267  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8989 (0.8668)  acc1: 84.8000 (85.3333)  acc5: 97.2000 (97.1048)  time: 0.3971  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9578 (0.8803)  acc1: 82.8000 (84.9440)  acc5: 96.4000 (96.9280)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6796 s / it)
* Acc@1 84.890 Acc@5 97.024 loss 0.869
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.89%
Epoch: [293]  [   0/1251]  eta: 1:18:15  lr: 0.000006  min_lr: 0.000006  loss: 2.5745 (2.5745)  weight_decay: 0.0500 (0.0500)  time: 3.7530  data: 2.9710  max mem: 69511
Epoch: [293]  [ 200/1251]  eta: 0:14:00  lr: 0.000006  min_lr: 0.000006  loss: 2.6096 (2.5453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1978 (1.1821)  time: 0.7818  data: 0.0004  max mem: 69511
Epoch: [293]  [ 400/1251]  eta: 0:11:14  lr: 0.000006  min_lr: 0.000006  loss: 2.6862 (2.5407)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1268 (1.1668)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [293]  [ 600/1251]  eta: 0:08:34  lr: 0.000006  min_lr: 0.000006  loss: 2.7258 (2.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1134 (1.1621)  time: 0.7860  data: 0.0004  max mem: 69511
Epoch: [293]  [ 800/1251]  eta: 0:05:56  lr: 0.000005  min_lr: 0.000005  loss: 2.5726 (2.5528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0359 (1.1538)  time: 0.7856  data: 0.0004  max mem: 69511
Epoch: [293]  [1000/1251]  eta: 0:03:17  lr: 0.000005  min_lr: 0.000005  loss: 2.5133 (2.5533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0523 (1.1507)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [293]  [1200/1251]  eta: 0:00:40  lr: 0.000005  min_lr: 0.000005  loss: 2.5696 (2.5500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0404 (1.1380)  time: 0.7827  data: 0.0005  max mem: 69511
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000005  min_lr: 0.000005  loss: 2.3942 (2.5430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0960 (1.1362)  time: 0.6655  data: 0.0006  max mem: 69511
Epoch: [293] Total time: 0:16:23 (0.7865 s / it)
Averaged stats: lr: 0.000005  min_lr: 0.000005  loss: 2.3942 (2.5349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0960 (1.1362)
Test:  [ 0/25]  eta: 0:02:53  loss: 0.5907 (0.5907)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.9564  data: 6.5279  max mem: 69511
Test:  [10/25]  eta: 0:00:14  loss: 0.7488 (0.7298)  acc1: 86.4000 (87.9636)  acc5: 98.0000 (97.9273)  time: 0.9918  data: 0.5937  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.8807 (0.8522)  acc1: 84.4000 (85.2191)  acc5: 97.2000 (97.0476)  time: 0.3952  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9410 (0.8670)  acc1: 83.2000 (84.8000)  acc5: 96.4000 (96.8800)  time: 0.3952  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6686 s / it)
* Acc@1 84.814 Acc@5 97.018 loss 0.858
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.89%
Epoch: [294]  [   0/1251]  eta: 1:45:26  lr: 0.000005  min_lr: 0.000005  loss: 1.7144 (1.7144)  weight_decay: 0.0500 (0.0500)  time: 5.0573  data: 2.9682  max mem: 69511
Epoch: [294]  [ 200/1251]  eta: 0:14:09  lr: 0.000005  min_lr: 0.000005  loss: 2.6755 (2.5318)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.1446)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [294]  [ 400/1251]  eta: 0:11:19  lr: 0.000005  min_lr: 0.000005  loss: 2.6767 (2.5443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0835 (1.1287)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [294]  [ 600/1251]  eta: 0:08:37  lr: 0.000004  min_lr: 0.000004  loss: 2.5846 (2.5497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1188 (1.1220)  time: 0.7868  data: 0.0004  max mem: 69511
Epoch: [294]  [ 800/1251]  eta: 0:05:57  lr: 0.000004  min_lr: 0.000004  loss: 2.2596 (2.5356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1166 (1.1266)  time: 0.7864  data: 0.0004  max mem: 69511
Epoch: [294]  [1000/1251]  eta: 0:03:18  lr: 0.000004  min_lr: 0.000004  loss: 2.7855 (2.5301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1670 (1.1295)  time: 0.7829  data: 0.0004  max mem: 69511
Epoch: [294]  [1200/1251]  eta: 0:00:40  lr: 0.000004  min_lr: 0.000004  loss: 2.5288 (2.5295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0589 (1.1239)  time: 0.7941  data: 0.0004  max mem: 69511
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.5655 (2.5306)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1303 (1.1237)  time: 0.6670  data: 0.0005  max mem: 69511
Epoch: [294] Total time: 0:16:28 (0.7899 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.5655 (2.5342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1303 (1.1237)
Test:  [ 0/25]  eta: 0:03:18  loss: 0.6166 (0.6166)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.9416  data: 7.5341  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.7686 (0.7516)  acc1: 86.8000 (87.8909)  acc5: 97.6000 (97.9636)  time: 1.0841  data: 0.6852  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9047 (0.8753)  acc1: 84.4000 (85.2000)  acc5: 97.2000 (97.1048)  time: 0.3976  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 0.9642 (0.8898)  acc1: 82.8000 (84.8160)  acc5: 96.4000 (96.9120)  time: 0.3968  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7049 s / it)
* Acc@1 84.854 Acc@5 97.034 loss 0.879
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.89%
Epoch: [295]  [   0/1251]  eta: 1:37:48  lr: 0.000004  min_lr: 0.000004  loss: 2.7559 (2.7559)  weight_decay: 0.0500 (0.0500)  time: 4.6910  data: 3.3759  max mem: 69511
Epoch: [295]  [ 200/1251]  eta: 0:14:07  lr: 0.000004  min_lr: 0.000004  loss: 2.4722 (2.5094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2310 (1.1719)  time: 0.7854  data: 0.0004  max mem: 69511
Epoch: [295]  [ 400/1251]  eta: 0:11:18  lr: 0.000003  min_lr: 0.000003  loss: 2.5860 (2.5040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0817 (1.1453)  time: 0.7869  data: 0.0004  max mem: 69511
Epoch: [295]  [ 600/1251]  eta: 0:08:37  lr: 0.000003  min_lr: 0.000003  loss: 2.6401 (2.4971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0733 (1.1352)  time: 0.7870  data: 0.0005  max mem: 69511
Epoch: [295]  [ 800/1251]  eta: 0:05:57  lr: 0.000003  min_lr: 0.000003  loss: 2.6703 (2.5194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1112 (1.1395)  time: 0.7873  data: 0.0005  max mem: 69511
Epoch: [295]  [1000/1251]  eta: 0:03:18  lr: 0.000003  min_lr: 0.000003  loss: 2.5002 (2.5312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1162 (1.1376)  time: 0.7925  data: 0.0004  max mem: 69511
Epoch: [295]  [1200/1251]  eta: 0:00:40  lr: 0.000003  min_lr: 0.000003  loss: 2.5588 (2.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1308 (1.1310)  time: 0.7863  data: 0.0004  max mem: 69511
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.7434 (2.5315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1308 (1.1304)  time: 0.6688  data: 0.0006  max mem: 69511
Epoch: [295] Total time: 0:16:28 (0.7903 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.7434 (2.5364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1308 (1.1304)
Test:  [ 0/25]  eta: 0:03:09  loss: 0.7161 (0.7161)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.5671  data: 7.1548  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8666 (0.8506)  acc1: 86.8000 (87.8909)  acc5: 98.0000 (98.0727)  time: 1.0494  data: 0.6507  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9825 (0.9732)  acc1: 84.0000 (85.1619)  acc5: 97.2000 (97.1429)  time: 0.3973  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0670 (0.9882)  acc1: 83.2000 (84.7840)  acc5: 96.4000 (96.9440)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6900 s / it)
* Acc@1 84.774 Acc@5 97.024 loss 0.978
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.89%
Epoch: [296]  [   0/1251]  eta: 1:36:41  lr: 0.000003  min_lr: 0.000003  loss: 2.6998 (2.6998)  weight_decay: 0.0500 (0.0500)  time: 4.6371  data: 3.8274  max mem: 69511
Epoch: [296]  [ 200/1251]  eta: 0:14:06  lr: 0.000003  min_lr: 0.000003  loss: 2.5714 (2.5413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1015 (1.1035)  time: 0.7869  data: 0.0005  max mem: 69511
Epoch: [296]  [ 400/1251]  eta: 0:11:18  lr: 0.000002  min_lr: 0.000002  loss: 2.4568 (2.5335)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0970 (1.1342)  time: 0.7918  data: 0.0006  max mem: 69511
Epoch: [296]  [ 600/1251]  eta: 0:08:36  lr: 0.000002  min_lr: 0.000002  loss: 2.6958 (2.5210)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1249 (1.1414)  time: 0.7865  data: 0.0004  max mem: 69511
Epoch: [296]  [ 800/1251]  eta: 0:05:57  lr: 0.000002  min_lr: 0.000002  loss: 2.7139 (2.5284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0429 (1.1524)  time: 0.7858  data: 0.0005  max mem: 69511
Epoch: [296]  [1000/1251]  eta: 0:03:18  lr: 0.000002  min_lr: 0.000002  loss: 2.6407 (2.5328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0918 (1.1479)  time: 0.7862  data: 0.0005  max mem: 69511
Epoch: [296]  [1200/1251]  eta: 0:00:40  lr: 0.000002  min_lr: 0.000002  loss: 2.4744 (2.5318)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1140 (1.1479)  time: 0.7868  data: 0.0007  max mem: 69511
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.6435 (2.5324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0550 (1.1452)  time: 0.6679  data: 0.0006  max mem: 69511
Epoch: [296] Total time: 0:16:27 (0.7896 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.6435 (2.5383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0550 (1.1452)
Test:  [ 0/25]  eta: 0:03:16  loss: 0.6801 (0.6801)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.8747  data: 7.4636  max mem: 69511
Test:  [10/25]  eta: 0:00:16  loss: 0.8289 (0.8133)  acc1: 86.8000 (87.9273)  acc5: 98.0000 (98.0000)  time: 1.0776  data: 0.6788  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9526 (0.9383)  acc1: 84.0000 (85.1048)  acc5: 97.2000 (97.0857)  time: 0.3974  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0370 (0.9530)  acc1: 82.8000 (84.7520)  acc5: 96.4000 (96.8960)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.7015 s / it)
* Acc@1 84.844 Acc@5 97.028 loss 0.942
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.89%
Epoch: [297]  [   0/1251]  eta: 1:41:10  lr: 0.000002  min_lr: 0.000002  loss: 1.9659 (1.9659)  weight_decay: 0.0500 (0.0500)  time: 4.8524  data: 3.3250  max mem: 69511
Epoch: [297]  [ 200/1251]  eta: 0:14:10  lr: 0.000002  min_lr: 0.000002  loss: 2.5199 (2.5303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0877 (1.1364)  time: 0.7926  data: 0.0004  max mem: 69511
Epoch: [297]  [ 400/1251]  eta: 0:11:19  lr: 0.000002  min_lr: 0.000002  loss: 2.6043 (2.5442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (1.1794)  time: 0.7936  data: 0.0004  max mem: 69511
Epoch: [297]  [ 600/1251]  eta: 0:08:37  lr: 0.000002  min_lr: 0.000002  loss: 2.4517 (2.5222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (1.1677)  time: 0.7908  data: 0.0004  max mem: 69511
Epoch: [297]  [ 800/1251]  eta: 0:05:57  lr: 0.000002  min_lr: 0.000002  loss: 2.6540 (2.5192)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0736 (1.1553)  time: 0.7879  data: 0.0004  max mem: 69511
Epoch: [297]  [1000/1251]  eta: 0:03:18  lr: 0.000002  min_lr: 0.000002  loss: 2.6509 (2.5094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0995 (1.1548)  time: 0.7862  data: 0.0004  max mem: 69511
Epoch: [297]  [1200/1251]  eta: 0:00:40  lr: 0.000001  min_lr: 0.000001  loss: 2.5516 (2.5148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1620 (1.1558)  time: 0.7859  data: 0.0005  max mem: 69511
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.6112 (2.5185)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0754 (1.1537)  time: 0.6686  data: 0.0005  max mem: 69511
Epoch: [297] Total time: 0:16:27 (0.7893 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.6112 (2.5268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0754 (1.1537)
Test:  [ 0/25]  eta: 0:03:01  loss: 0.6700 (0.6700)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.2622  data: 6.8333  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8161 (0.8070)  acc1: 86.8000 (87.8545)  acc5: 98.0000 (98.0000)  time: 1.0212  data: 0.6215  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9469 (0.9306)  acc1: 84.4000 (85.0476)  acc5: 97.2000 (97.0857)  time: 0.3970  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0249 (0.9451)  acc1: 82.8000 (84.6720)  acc5: 96.4000 (96.8640)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:16 (0.6768 s / it)
* Acc@1 84.772 Acc@5 97.002 loss 0.935
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.89%
Epoch: [298]  [   0/1251]  eta: 1:39:49  lr: 0.000001  min_lr: 0.000001  loss: 2.7258 (2.7258)  weight_decay: 0.0500 (0.0500)  time: 4.7876  data: 3.4236  max mem: 69511
Epoch: [298]  [ 200/1251]  eta: 0:14:07  lr: 0.000001  min_lr: 0.000001  loss: 2.5905 (2.5428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1126 (1.1198)  time: 0.7863  data: 0.0006  max mem: 69511
Epoch: [298]  [ 400/1251]  eta: 0:11:18  lr: 0.000001  min_lr: 0.000001  loss: 2.5912 (2.5405)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1076 (1.1138)  time: 0.7865  data: 0.0005  max mem: 69511
Epoch: [298]  [ 600/1251]  eta: 0:08:37  lr: 0.000001  min_lr: 0.000001  loss: 2.2383 (2.5427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0655 (1.1207)  time: 0.7921  data: 0.0010  max mem: 69511
Epoch: [298]  [ 800/1251]  eta: 0:05:57  lr: 0.000001  min_lr: 0.000001  loss: 2.4243 (2.5406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1321 (1.1246)  time: 0.7864  data: 0.0009  max mem: 69511
Epoch: [298]  [1000/1251]  eta: 0:03:18  lr: 0.000001  min_lr: 0.000001  loss: 2.5855 (2.5352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0788 (1.1304)  time: 0.7944  data: 0.0007  max mem: 69511
Epoch: [298]  [1200/1251]  eta: 0:00:40  lr: 0.000001  min_lr: 0.000001  loss: 2.6648 (2.5252)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0873 (1.1319)  time: 0.7883  data: 0.0007  max mem: 69511
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.8271 (2.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0897 (1.1310)  time: 0.6650  data: 0.0007  max mem: 69511
Epoch: [298] Total time: 0:16:28 (0.7900 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.8271 (2.5314)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0897 (1.1310)
Test:  [ 0/25]  eta: 0:03:08  loss: 0.7139 (0.7139)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 7.5291  data: 7.1151  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8578 (0.8481)  acc1: 86.8000 (87.9273)  acc5: 97.6000 (98.1091)  time: 1.0428  data: 0.6471  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9851 (0.9683)  acc1: 84.0000 (85.1810)  acc5: 97.2000 (97.1238)  time: 0.3942  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0612 (0.9827)  acc1: 83.6000 (84.8000)  acc5: 96.0000 (96.9120)  time: 0.3941  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6900 s / it)
* Acc@1 84.818 Acc@5 97.014 loss 0.972
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.89%
Epoch: [299]  [   0/1251]  eta: 1:42:09  lr: 0.000001  min_lr: 0.000001  loss: 2.8868 (2.8868)  weight_decay: 0.0500 (0.0500)  time: 4.9000  data: 3.1765  max mem: 69511
Epoch: [299]  [ 200/1251]  eta: 0:14:05  lr: 0.000001  min_lr: 0.000001  loss: 2.4133 (2.5304)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1168 (1.1331)  time: 0.7838  data: 0.0008  max mem: 69511
Epoch: [299]  [ 400/1251]  eta: 0:11:18  lr: 0.000001  min_lr: 0.000001  loss: 2.8076 (2.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.1497)  time: 0.7978  data: 0.0006  max mem: 69511
Epoch: [299]  [ 600/1251]  eta: 0:08:37  lr: 0.000001  min_lr: 0.000001  loss: 2.4876 (2.5366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1210 (1.1419)  time: 0.7867  data: 0.0004  max mem: 69511
Epoch: [299]  [ 800/1251]  eta: 0:05:57  lr: 0.000001  min_lr: 0.000001  loss: 2.4711 (2.5381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1011 (1.1435)  time: 0.7867  data: 0.0005  max mem: 69511
Epoch: [299]  [1000/1251]  eta: 0:03:18  lr: 0.000001  min_lr: 0.000001  loss: 2.6173 (2.5333)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0567 (1.1386)  time: 0.7941  data: 0.0004  max mem: 69511
Epoch: [299]  [1200/1251]  eta: 0:00:40  lr: 0.000001  min_lr: 0.000001  loss: 2.6494 (2.5322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0748 (1.1355)  time: 0.7864  data: 0.0005  max mem: 69511
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.5792 (2.5349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0435 (1.1331)  time: 0.6684  data: 0.0006  max mem: 69511
Epoch: [299] Total time: 0:16:28 (0.7900 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.5792 (2.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0435 (1.1331)
Test:  [ 0/25]  eta: 0:03:11  loss: 0.6660 (0.6660)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 7.6713  data: 7.2382  max mem: 69511
Test:  [10/25]  eta: 0:00:15  loss: 0.8040 (0.7938)  acc1: 86.8000 (87.8545)  acc5: 97.6000 (98.1091)  time: 1.0586  data: 0.6583  max mem: 69511
Test:  [20/25]  eta: 0:00:03  loss: 0.9366 (0.9165)  acc1: 84.4000 (85.2000)  acc5: 97.2000 (97.1048)  time: 0.3972  data: 0.0002  max mem: 69511
Test:  [24/25]  eta: 0:00:00  loss: 1.0060 (0.9306)  acc1: 83.2000 (84.8000)  acc5: 96.4000 (96.9280)  time: 0.3970  data: 0.0001  max mem: 69511
Test: Total time: 0:00:17 (0.6931 s / it)
* Acc@1 84.844 Acc@5 97.048 loss 0.920
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.89%
Training time 3 days, 11:54:11
