| distributed init (rank 0): env://, gpu 0
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 3): env://, gpu 3
Namespace(batch_size=128, epochs=300, update_freq=4, model='nano', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.2, cutmix=0.3, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_nano_1.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f19091110d0>
Mixup is activated!
Model = SFCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 20, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (layer1): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((20,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(20, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=20, bias=False)
        (norm): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(20, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(160, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(20, 20, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=20, bias=False)
          (norm): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(20, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): Identity()
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(40, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(160, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.002)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(40, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(160, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.004)
    )
  )
  (layer2): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(40, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(320, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=40, bias=False)
          (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(40, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.005)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(320, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.007)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(320, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.009)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(320, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(320, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.013)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(320, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.014)
    )
  )
  (layer3): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=80, bias=False)
          (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(80, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.016)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.018)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.020)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.021)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.025)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.027)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.029)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.030)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.032)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.034)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.036)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.038)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.039)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.041)
    )
    (15): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.043)
    )
    (16): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(640, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.045)
    )
  )
  (layer4): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1280, bias=False)
        (norm): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1280, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=160, bias=False)
          (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(160, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.046)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1280, bias=False)
        (norm): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1280, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.048)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1280, bias=False)
        (norm): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1280, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.050)
    )
  )
  (head): ConvX(
    (conv): Conv2d(320, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 10840356
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.le.conv.weight",
      "layer1.0.conv_in.conv.weight",
      "layer1.0.conv.conv.weight",
      "layer1.0.conv_out.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.le.conv.weight",
      "layer1.1.conv_in.conv.weight",
      "layer1.1.conv.conv.weight",
      "layer1.1.conv_out.conv.weight",
      "layer1.2.le.conv.weight",
      "layer1.2.conv_in.conv.weight",
      "layer1.2.conv.conv.weight",
      "layer1.2.conv_out.conv.weight",
      "layer2.0.le.conv.weight",
      "layer2.0.conv_in.conv.weight",
      "layer2.0.conv.conv.weight",
      "layer2.0.conv_out.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.le.conv.weight",
      "layer2.1.conv_in.conv.weight",
      "layer2.1.conv.conv.weight",
      "layer2.1.conv_out.conv.weight",
      "layer2.2.le.conv.weight",
      "layer2.2.conv_in.conv.weight",
      "layer2.2.conv.conv.weight",
      "layer2.2.conv_out.conv.weight",
      "layer2.3.le.conv.weight",
      "layer2.3.conv_in.conv.weight",
      "layer2.3.conv.conv.weight",
      "layer2.3.conv_out.conv.weight",
      "layer2.4.le.conv.weight",
      "layer2.4.conv_in.conv.weight",
      "layer2.4.conv.conv.weight",
      "layer2.4.conv_out.conv.weight",
      "layer2.5.le.conv.weight",
      "layer2.5.conv_in.conv.weight",
      "layer2.5.conv.conv.weight",
      "layer2.5.conv_out.conv.weight",
      "layer3.0.le.conv.weight",
      "layer3.0.conv_in.conv.weight",
      "layer3.0.conv.conv.weight",
      "layer3.0.conv_out.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.le.conv.weight",
      "layer3.1.conv_in.conv.weight",
      "layer3.1.conv.conv.weight",
      "layer3.1.conv_out.conv.weight",
      "layer3.2.le.conv.weight",
      "layer3.2.conv_in.conv.weight",
      "layer3.2.conv.conv.weight",
      "layer3.2.conv_out.conv.weight",
      "layer3.3.le.conv.weight",
      "layer3.3.conv_in.conv.weight",
      "layer3.3.conv.conv.weight",
      "layer3.3.conv_out.conv.weight",
      "layer3.4.le.conv.weight",
      "layer3.4.conv_in.conv.weight",
      "layer3.4.conv.conv.weight",
      "layer3.4.conv_out.conv.weight",
      "layer3.5.le.conv.weight",
      "layer3.5.conv_in.conv.weight",
      "layer3.5.conv.conv.weight",
      "layer3.5.conv_out.conv.weight",
      "layer3.6.le.conv.weight",
      "layer3.6.conv_in.conv.weight",
      "layer3.6.conv.conv.weight",
      "layer3.6.conv_out.conv.weight",
      "layer3.7.le.conv.weight",
      "layer3.7.conv_in.conv.weight",
      "layer3.7.conv.conv.weight",
      "layer3.7.conv_out.conv.weight",
      "layer3.8.le.conv.weight",
      "layer3.8.conv_in.conv.weight",
      "layer3.8.conv.conv.weight",
      "layer3.8.conv_out.conv.weight",
      "layer3.9.le.conv.weight",
      "layer3.9.conv_in.conv.weight",
      "layer3.9.conv.conv.weight",
      "layer3.9.conv_out.conv.weight",
      "layer3.10.le.conv.weight",
      "layer3.10.conv_in.conv.weight",
      "layer3.10.conv.conv.weight",
      "layer3.10.conv_out.conv.weight",
      "layer3.11.le.conv.weight",
      "layer3.11.conv_in.conv.weight",
      "layer3.11.conv.conv.weight",
      "layer3.11.conv_out.conv.weight",
      "layer3.12.le.conv.weight",
      "layer3.12.conv_in.conv.weight",
      "layer3.12.conv.conv.weight",
      "layer3.12.conv_out.conv.weight",
      "layer3.13.le.conv.weight",
      "layer3.13.conv_in.conv.weight",
      "layer3.13.conv.conv.weight",
      "layer3.13.conv_out.conv.weight",
      "layer3.14.le.conv.weight",
      "layer3.14.conv_in.conv.weight",
      "layer3.14.conv.conv.weight",
      "layer3.14.conv_out.conv.weight",
      "layer3.15.le.conv.weight",
      "layer3.15.conv_in.conv.weight",
      "layer3.15.conv.conv.weight",
      "layer3.15.conv_out.conv.weight",
      "layer3.16.le.conv.weight",
      "layer3.16.conv_in.conv.weight",
      "layer3.16.conv.conv.weight",
      "layer3.16.conv_out.conv.weight",
      "layer4.0.le.conv.weight",
      "layer4.0.conv_in.conv.weight",
      "layer4.0.conv.conv.weight",
      "layer4.0.conv_out.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.le.conv.weight",
      "layer4.1.conv_in.conv.weight",
      "layer4.1.conv.conv.weight",
      "layer4.1.conv_out.conv.weight",
      "layer4.2.le.conv.weight",
      "layer4.2.conv_in.conv.weight",
      "layer4.2.conv.conv.weight",
      "layer4.2.conv_out.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.ln.weight",
      "layer1.0.ln.bias",
      "layer1.0.le.norm.weight",
      "layer1.0.le.norm.bias",
      "layer1.0.conv_in.norm.weight",
      "layer1.0.conv_in.norm.bias",
      "layer1.0.conv.norm.weight",
      "layer1.0.conv.norm.bias",
      "layer1.0.conv_out.norm.weight",
      "layer1.0.conv_out.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.le.norm.weight",
      "layer1.1.le.norm.bias",
      "layer1.1.conv_in.norm.weight",
      "layer1.1.conv_in.norm.bias",
      "layer1.1.conv.norm.weight",
      "layer1.1.conv.norm.bias",
      "layer1.1.conv_out.norm.weight",
      "layer1.1.conv_out.norm.bias",
      "layer1.2.le.norm.weight",
      "layer1.2.le.norm.bias",
      "layer1.2.conv_in.norm.weight",
      "layer1.2.conv_in.norm.bias",
      "layer1.2.conv.norm.weight",
      "layer1.2.conv.norm.bias",
      "layer1.2.conv_out.norm.weight",
      "layer1.2.conv_out.norm.bias",
      "layer2.0.ln.weight",
      "layer2.0.ln.bias",
      "layer2.0.le.norm.weight",
      "layer2.0.le.norm.bias",
      "layer2.0.conv_in.norm.weight",
      "layer2.0.conv_in.norm.bias",
      "layer2.0.conv.norm.weight",
      "layer2.0.conv.norm.bias",
      "layer2.0.conv_out.norm.weight",
      "layer2.0.conv_out.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.le.norm.weight",
      "layer2.1.le.norm.bias",
      "layer2.1.conv_in.norm.weight",
      "layer2.1.conv_in.norm.bias",
      "layer2.1.conv.norm.weight",
      "layer2.1.conv.norm.bias",
      "layer2.1.conv_out.norm.weight",
      "layer2.1.conv_out.norm.bias",
      "layer2.2.le.norm.weight",
      "layer2.2.le.norm.bias",
      "layer2.2.conv_in.norm.weight",
      "layer2.2.conv_in.norm.bias",
      "layer2.2.conv.norm.weight",
      "layer2.2.conv.norm.bias",
      "layer2.2.conv_out.norm.weight",
      "layer2.2.conv_out.norm.bias",
      "layer2.3.le.norm.weight",
      "layer2.3.le.norm.bias",
      "layer2.3.conv_in.norm.weight",
      "layer2.3.conv_in.norm.bias",
      "layer2.3.conv.norm.weight",
      "layer2.3.conv.norm.bias",
      "layer2.3.conv_out.norm.weight",
      "layer2.3.conv_out.norm.bias",
      "layer2.4.le.norm.weight",
      "layer2.4.le.norm.bias",
      "layer2.4.conv_in.norm.weight",
      "layer2.4.conv_in.norm.bias",
      "layer2.4.conv.norm.weight",
      "layer2.4.conv.norm.bias",
      "layer2.4.conv_out.norm.weight",
      "layer2.4.conv_out.norm.bias",
      "layer2.5.le.norm.weight",
      "layer2.5.le.norm.bias",
      "layer2.5.conv_in.norm.weight",
      "layer2.5.conv_in.norm.bias",
      "layer2.5.conv.norm.weight",
      "layer2.5.conv.norm.bias",
      "layer2.5.conv_out.norm.weight",
      "layer2.5.conv_out.norm.bias",
      "layer3.0.ln.weight",
      "layer3.0.ln.bias",
      "layer3.0.le.norm.weight",
      "layer3.0.le.norm.bias",
      "layer3.0.conv_in.norm.weight",
      "layer3.0.conv_in.norm.bias",
      "layer3.0.conv.norm.weight",
      "layer3.0.conv.norm.bias",
      "layer3.0.conv_out.norm.weight",
      "layer3.0.conv_out.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.le.norm.weight",
      "layer3.1.le.norm.bias",
      "layer3.1.conv_in.norm.weight",
      "layer3.1.conv_in.norm.bias",
      "layer3.1.conv.norm.weight",
      "layer3.1.conv.norm.bias",
      "layer3.1.conv_out.norm.weight",
      "layer3.1.conv_out.norm.bias",
      "layer3.2.le.norm.weight",
      "layer3.2.le.norm.bias",
      "layer3.2.conv_in.norm.weight",
      "layer3.2.conv_in.norm.bias",
      "layer3.2.conv.norm.weight",
      "layer3.2.conv.norm.bias",
      "layer3.2.conv_out.norm.weight",
      "layer3.2.conv_out.norm.bias",
      "layer3.3.le.norm.weight",
      "layer3.3.le.norm.bias",
      "layer3.3.conv_in.norm.weight",
      "layer3.3.conv_in.norm.bias",
      "layer3.3.conv.norm.weight",
      "layer3.3.conv.norm.bias",
      "layer3.3.conv_out.norm.weight",
      "layer3.3.conv_out.norm.bias",
      "layer3.4.le.norm.weight",
      "layer3.4.le.norm.bias",
      "layer3.4.conv_in.norm.weight",
      "layer3.4.conv_in.norm.bias",
      "layer3.4.conv.norm.weight",
      "layer3.4.conv.norm.bias",
      "layer3.4.conv_out.norm.weight",
      "layer3.4.conv_out.norm.bias",
      "layer3.5.le.norm.weight",
      "layer3.5.le.norm.bias",
      "layer3.5.conv_in.norm.weight",
      "layer3.5.conv_in.norm.bias",
      "layer3.5.conv.norm.weight",
      "layer3.5.conv.norm.bias",
      "layer3.5.conv_out.norm.weight",
      "layer3.5.conv_out.norm.bias",
      "layer3.6.le.norm.weight",
      "layer3.6.le.norm.bias",
      "layer3.6.conv_in.norm.weight",
      "layer3.6.conv_in.norm.bias",
      "layer3.6.conv.norm.weight",
      "layer3.6.conv.norm.bias",
      "layer3.6.conv_out.norm.weight",
      "layer3.6.conv_out.norm.bias",
      "layer3.7.le.norm.weight",
      "layer3.7.le.norm.bias",
      "layer3.7.conv_in.norm.weight",
      "layer3.7.conv_in.norm.bias",
      "layer3.7.conv.norm.weight",
      "layer3.7.conv.norm.bias",
      "layer3.7.conv_out.norm.weight",
      "layer3.7.conv_out.norm.bias",
      "layer3.8.le.norm.weight",
      "layer3.8.le.norm.bias",
      "layer3.8.conv_in.norm.weight",
      "layer3.8.conv_in.norm.bias",
      "layer3.8.conv.norm.weight",
      "layer3.8.conv.norm.bias",
      "layer3.8.conv_out.norm.weight",
      "layer3.8.conv_out.norm.bias",
      "layer3.9.le.norm.weight",
      "layer3.9.le.norm.bias",
      "layer3.9.conv_in.norm.weight",
      "layer3.9.conv_in.norm.bias",
      "layer3.9.conv.norm.weight",
      "layer3.9.conv.norm.bias",
      "layer3.9.conv_out.norm.weight",
      "layer3.9.conv_out.norm.bias",
      "layer3.10.le.norm.weight",
      "layer3.10.le.norm.bias",
      "layer3.10.conv_in.norm.weight",
      "layer3.10.conv_in.norm.bias",
      "layer3.10.conv.norm.weight",
      "layer3.10.conv.norm.bias",
      "layer3.10.conv_out.norm.weight",
      "layer3.10.conv_out.norm.bias",
      "layer3.11.le.norm.weight",
      "layer3.11.le.norm.bias",
      "layer3.11.conv_in.norm.weight",
      "layer3.11.conv_in.norm.bias",
      "layer3.11.conv.norm.weight",
      "layer3.11.conv.norm.bias",
      "layer3.11.conv_out.norm.weight",
      "layer3.11.conv_out.norm.bias",
      "layer3.12.le.norm.weight",
      "layer3.12.le.norm.bias",
      "layer3.12.conv_in.norm.weight",
      "layer3.12.conv_in.norm.bias",
      "layer3.12.conv.norm.weight",
      "layer3.12.conv.norm.bias",
      "layer3.12.conv_out.norm.weight",
      "layer3.12.conv_out.norm.bias",
      "layer3.13.le.norm.weight",
      "layer3.13.le.norm.bias",
      "layer3.13.conv_in.norm.weight",
      "layer3.13.conv_in.norm.bias",
      "layer3.13.conv.norm.weight",
      "layer3.13.conv.norm.bias",
      "layer3.13.conv_out.norm.weight",
      "layer3.13.conv_out.norm.bias",
      "layer3.14.le.norm.weight",
      "layer3.14.le.norm.bias",
      "layer3.14.conv_in.norm.weight",
      "layer3.14.conv_in.norm.bias",
      "layer3.14.conv.norm.weight",
      "layer3.14.conv.norm.bias",
      "layer3.14.conv_out.norm.weight",
      "layer3.14.conv_out.norm.bias",
      "layer3.15.le.norm.weight",
      "layer3.15.le.norm.bias",
      "layer3.15.conv_in.norm.weight",
      "layer3.15.conv_in.norm.bias",
      "layer3.15.conv.norm.weight",
      "layer3.15.conv.norm.bias",
      "layer3.15.conv_out.norm.weight",
      "layer3.15.conv_out.norm.bias",
      "layer3.16.le.norm.weight",
      "layer3.16.le.norm.bias",
      "layer3.16.conv_in.norm.weight",
      "layer3.16.conv_in.norm.bias",
      "layer3.16.conv.norm.weight",
      "layer3.16.conv.norm.bias",
      "layer3.16.conv_out.norm.weight",
      "layer3.16.conv_out.norm.bias",
      "layer4.0.ln.weight",
      "layer4.0.ln.bias",
      "layer4.0.le.norm.weight",
      "layer4.0.le.norm.bias",
      "layer4.0.conv_in.norm.weight",
      "layer4.0.conv_in.norm.bias",
      "layer4.0.conv.norm.weight",
      "layer4.0.conv.norm.bias",
      "layer4.0.conv_out.norm.weight",
      "layer4.0.conv_out.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.le.norm.weight",
      "layer4.1.le.norm.bias",
      "layer4.1.conv_in.norm.weight",
      "layer4.1.conv_in.norm.bias",
      "layer4.1.conv.norm.weight",
      "layer4.1.conv.norm.bias",
      "layer4.1.conv_out.norm.weight",
      "layer4.1.conv_out.norm.bias",
      "layer4.2.le.norm.weight",
      "layer4.2.le.norm.bias",
      "layer4.2.conv_in.norm.weight",
      "layer4.2.conv_in.norm.bias",
      "layer4.2.conv.norm.weight",
      "layer4.2.conv.norm.bias",
      "layer4.2.conv_out.norm.weight",
      "layer4.2.conv_out.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 3:42:21  lr: 0.000000  min_lr: 0.000000  loss: 6.9247 (6.9247)  weight_decay: 0.0500 (0.0500)  time: 10.6650  data: 2.1478  max mem: 13273
Epoch: [0]  [ 200/1251]  eta: 0:04:32  lr: 0.000032  min_lr: 0.000032  loss: 6.8802 (6.9182)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6006 (5.9905)  time: 0.2067  data: 0.0006  max mem: 13273
Epoch: [0]  [ 400/1251]  eta: 0:03:18  lr: 0.000064  min_lr: 0.000064  loss: 6.7260 (6.8523)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6411 (4.5888)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [0]  [ 600/1251]  eta: 0:02:26  lr: 0.000096  min_lr: 0.000096  loss: 6.5797 (6.7932)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.2277 (inf)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [0]  [ 800/1251]  eta: 0:01:39  lr: 0.000128  min_lr: 0.000128  loss: 6.4687 (6.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.4811 (inf)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [0]  [1000/1251]  eta: 0:00:54  lr: 0.000160  min_lr: 0.000160  loss: 6.4030 (6.6900)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.3889 (inf)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [0]  [1200/1251]  eta: 0:00:10  lr: 0.000192  min_lr: 0.000192  loss: 6.3958 (6.6399)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.2600 (inf)  time: 0.2069  data: 0.0006  max mem: 13273
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.2827 (6.6290)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.7022 (inf)  time: 0.1718  data: 0.0007  max mem: 13273
Epoch: [0] Total time: 0:04:29 (0.2153 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.2827 (6.6308)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.7022 (inf)
Test:  [ 0/25]  eta: 0:04:24  loss: 5.6466 (5.6466)  acc1: 8.0000 (8.0000)  acc5: 17.6000 (17.6000)  time: 10.5643  data: 7.4044  max mem: 13273
Test:  [10/25]  eta: 0:00:15  loss: 5.5871 (5.5701)  acc1: 3.6000 (4.6545)  acc5: 14.0000 (14.9091)  time: 1.0426  data: 0.6733  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 5.6171 (5.6281)  acc1: 4.4000 (5.1238)  acc5: 13.6000 (15.0667)  time: 0.0904  data: 0.0002  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 5.6171 (5.5846)  acc1: 4.8000 (5.6160)  acc5: 13.6000 (16.0800)  time: 0.1092  data: 0.0190  max mem: 13273
Test: Total time: 0:00:13 (0.5274 s / it)
* Acc@1 5.436 Acc@5 16.090 loss 5.590
Accuracy of the model on the 50000 test images: 5.4%
Max accuracy: 5.44%
Epoch: [1]  [   0/1251]  eta: 0:58:46  lr: 0.000200  min_lr: 0.000200  loss: 6.5390 (6.5390)  weight_decay: 0.0500 (0.0500)  time: 2.8186  data: 2.4647  max mem: 13273
Epoch: [1]  [ 200/1251]  eta: 0:03:50  lr: 0.000232  min_lr: 0.000232  loss: 6.2033 (6.3231)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.8759 (10.4597)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [1]  [ 400/1251]  eta: 0:03:00  lr: 0.000264  min_lr: 0.000264  loss: 6.1237 (6.2847)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.9170 (10.9533)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [1]  [ 600/1251]  eta: 0:02:16  lr: 0.000296  min_lr: 0.000296  loss: 6.1740 (6.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.1624 (11.4130)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [1]  [ 800/1251]  eta: 0:01:34  lr: 0.000328  min_lr: 0.000328  loss: 6.0668 (6.2221)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.8805 (inf)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [1]  [1000/1251]  eta: 0:00:52  lr: 0.000360  min_lr: 0.000360  loss: 5.8774 (6.1933)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.7867 (inf)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [1]  [1200/1251]  eta: 0:00:10  lr: 0.000392  min_lr: 0.000392  loss: 5.9399 (6.1637)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.4925 (inf)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 5.8082 (6.1563)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.8985 (inf)  time: 0.1705  data: 0.0009  max mem: 13273
Epoch: [1] Total time: 0:04:19 (0.2077 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 5.8082 (6.1460)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.8985 (inf)
Test:  [ 0/25]  eta: 0:02:24  loss: 4.5981 (4.5981)  acc1: 16.4000 (16.4000)  acc5: 33.6000 (33.6000)  time: 5.7841  data: 5.6778  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 4.4327 (4.5118)  acc1: 15.2000 (14.0000)  acc5: 33.6000 (34.8000)  time: 0.7745  data: 0.6767  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 4.7900 (4.7093)  acc1: 12.4000 (13.2381)  acc5: 27.6000 (31.1238)  time: 0.2079  data: 0.1119  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 4.8136 (4.6795)  acc1: 12.8000 (13.8880)  acc5: 28.4000 (31.9360)  time: 0.2075  data: 0.1118  max mem: 13273
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 13.154 Acc@5 31.706 loss 4.670
Accuracy of the model on the 50000 test images: 13.2%
Max accuracy: 13.15%
Epoch: [2]  [   0/1251]  eta: 1:00:05  lr: 0.000400  min_lr: 0.000400  loss: 5.6761 (5.6761)  weight_decay: 0.0500 (0.0500)  time: 2.8821  data: 2.4925  max mem: 13273
Epoch: [2]  [ 200/1251]  eta: 0:03:51  lr: 0.000432  min_lr: 0.000432  loss: 5.7756 (5.9394)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.5934 (12.3102)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [2]  [ 400/1251]  eta: 0:03:01  lr: 0.000464  min_lr: 0.000464  loss: 5.7172 (5.9214)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.5414 (12.4390)  time: 0.2079  data: 0.0004  max mem: 13273
Epoch: [2]  [ 600/1251]  eta: 0:02:17  lr: 0.000496  min_lr: 0.000496  loss: 6.0654 (5.8889)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.8819 (12.0977)  time: 0.2069  data: 0.0004  max mem: 13273
Epoch: [2]  [ 800/1251]  eta: 0:01:34  lr: 0.000528  min_lr: 0.000528  loss: 5.5463 (5.8631)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.1745 (11.8593)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [2]  [1000/1251]  eta: 0:00:52  lr: 0.000560  min_lr: 0.000560  loss: 5.4389 (5.8314)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.5352 (11.7093)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [2]  [1200/1251]  eta: 0:00:10  lr: 0.000592  min_lr: 0.000592  loss: 5.7996 (5.8190)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.7244 (11.6031)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 5.2651 (5.8097)  weight_decay: 0.0500 (0.0500)  grad_norm: 12.0713 (11.6588)  time: 0.1710  data: 0.0009  max mem: 13273
Epoch: [2] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 5.2651 (5.8161)  weight_decay: 0.0500 (0.0500)  grad_norm: 12.0713 (11.6588)
Test:  [ 0/25]  eta: 0:02:23  loss: 3.8603 (3.8603)  acc1: 24.4000 (24.4000)  acc5: 46.8000 (46.8000)  time: 5.7479  data: 5.6419  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 3.7010 (3.8018)  acc1: 24.4000 (23.3455)  acc5: 49.6000 (48.2182)  time: 0.7455  data: 0.6517  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 4.2787 (4.0786)  acc1: 19.2000 (21.1619)  acc5: 40.0000 (43.9619)  time: 0.1910  data: 0.0990  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 4.2882 (4.0356)  acc1: 19.6000 (21.9840)  acc5: 40.0000 (44.3840)  time: 0.1999  data: 0.1079  max mem: 13273
Test: Total time: 0:00:10 (0.4088 s / it)
* Acc@1 22.076 Acc@5 44.822 loss 4.020
Accuracy of the model on the 50000 test images: 22.1%
Max accuracy: 22.08%
Epoch: [3]  [   0/1251]  eta: 1:05:47  lr: 0.000600  min_lr: 0.000600  loss: 5.5961 (5.5961)  weight_decay: 0.0500 (0.0500)  time: 3.1557  data: 2.9172  max mem: 13273
Epoch: [3]  [ 200/1251]  eta: 0:03:52  lr: 0.000632  min_lr: 0.000632  loss: 5.6324 (5.6459)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.2028 (10.3722)  time: 0.2160  data: 0.0003  max mem: 13273
Epoch: [3]  [ 400/1251]  eta: 0:03:01  lr: 0.000664  min_lr: 0.000664  loss: 5.1613 (5.6311)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.2242 (10.4589)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [3]  [ 600/1251]  eta: 0:02:17  lr: 0.000696  min_lr: 0.000696  loss: 5.7718 (5.5972)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.2603 (10.1297)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [3]  [ 800/1251]  eta: 0:01:34  lr: 0.000728  min_lr: 0.000728  loss: 5.6532 (5.5764)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.3093 (10.1345)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [3]  [1000/1251]  eta: 0:00:52  lr: 0.000760  min_lr: 0.000760  loss: 5.3926 (5.5552)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.4744 (9.8610)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [3]  [1200/1251]  eta: 0:00:10  lr: 0.000792  min_lr: 0.000792  loss: 5.1735 (5.5392)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.6077 (9.7582)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.5710 (5.5353)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.0273 (9.7375)  time: 0.1699  data: 0.0005  max mem: 13273
Epoch: [3] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.5710 (5.5393)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.0273 (9.7375)
Test:  [ 0/25]  eta: 0:02:21  loss: 3.2483 (3.2483)  acc1: 35.2000 (35.2000)  acc5: 59.6000 (59.6000)  time: 5.6633  data: 5.5581  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 3.1990 (3.2862)  acc1: 35.2000 (30.6182)  acc5: 59.6000 (58.3273)  time: 0.7444  data: 0.6493  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 3.6489 (3.5900)  acc1: 24.4000 (28.0571)  acc5: 49.2000 (52.9333)  time: 0.1993  data: 0.1071  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 3.8695 (3.5709)  acc1: 25.2000 (28.6080)  acc5: 48.8000 (53.5200)  time: 0.1987  data: 0.1070  max mem: 13273
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 28.978 Acc@5 53.878 loss 3.558
Accuracy of the model on the 50000 test images: 29.0%
Max accuracy: 28.98%
Epoch: [4]  [   0/1251]  eta: 1:00:34  lr: 0.000800  min_lr: 0.000800  loss: 4.7240 (4.7240)  weight_decay: 0.0500 (0.0500)  time: 2.9051  data: 2.6511  max mem: 13273
Epoch: [4]  [ 200/1251]  eta: 0:03:50  lr: 0.000832  min_lr: 0.000832  loss: 5.0657 (5.4423)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.1090 (8.5207)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [4]  [ 400/1251]  eta: 0:03:00  lr: 0.000864  min_lr: 0.000864  loss: 5.0704 (5.3412)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.0083 (8.4330)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [4]  [ 600/1251]  eta: 0:02:16  lr: 0.000896  min_lr: 0.000896  loss: 5.4691 (5.3387)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.0147 (8.4124)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [4]  [ 800/1251]  eta: 0:01:34  lr: 0.000928  min_lr: 0.000928  loss: 5.5741 (5.3318)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.0511 (8.1254)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [4]  [1000/1251]  eta: 0:00:52  lr: 0.000960  min_lr: 0.000960  loss: 5.3209 (5.3092)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.8597 (8.0244)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [4]  [1200/1251]  eta: 0:00:10  lr: 0.000992  min_lr: 0.000992  loss: 4.8315 (5.2856)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.3071 (7.8278)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 5.1348 (5.2813)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.3651 (7.8328)  time: 0.1708  data: 0.0007  max mem: 13273
Epoch: [4] Total time: 0:04:19 (0.2076 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 5.1348 (5.2856)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.3651 (7.8328)
Test:  [ 0/25]  eta: 0:01:48  loss: 2.6070 (2.6070)  acc1: 41.6000 (41.6000)  acc5: 70.8000 (70.8000)  time: 4.3229  data: 4.2075  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 2.6881 (2.8169)  acc1: 40.0000 (38.6545)  acc5: 69.2000 (67.6000)  time: 0.6727  data: 0.5759  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 3.3420 (3.1601)  acc1: 29.6000 (34.5524)  acc5: 56.0000 (60.9524)  time: 0.2415  data: 0.1488  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 3.3929 (3.1403)  acc1: 31.2000 (35.2000)  acc5: 56.0000 (61.2800)  time: 0.2149  data: 0.1233  max mem: 13273
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 35.982 Acc@5 61.484 loss 3.133
Accuracy of the model on the 50000 test images: 36.0%
Max accuracy: 35.98%
Epoch: [5]  [   0/1251]  eta: 1:02:13  lr: 0.001000  min_lr: 0.001000  loss: 5.9916 (5.9916)  weight_decay: 0.0500 (0.0500)  time: 2.9847  data: 2.3961  max mem: 13273
Epoch: [5]  [ 200/1251]  eta: 0:03:51  lr: 0.001032  min_lr: 0.001032  loss: 5.4030 (5.1157)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.2723 (6.9573)  time: 0.2111  data: 0.0004  max mem: 13273
Epoch: [5]  [ 400/1251]  eta: 0:03:00  lr: 0.001064  min_lr: 0.001064  loss: 5.6953 (5.1810)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5336 (6.4709)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [5]  [ 600/1251]  eta: 0:02:16  lr: 0.001096  min_lr: 0.001096  loss: 5.0117 (5.1547)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5387 (6.3712)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [5]  [ 800/1251]  eta: 0:01:34  lr: 0.001128  min_lr: 0.001128  loss: 4.9116 (5.1367)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.0146 (6.2251)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [5]  [1000/1251]  eta: 0:00:52  lr: 0.001160  min_lr: 0.001160  loss: 4.4099 (5.1125)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.0467 (6.1783)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [5]  [1200/1251]  eta: 0:00:10  lr: 0.001192  min_lr: 0.001192  loss: 5.3878 (5.0957)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5999 (6.1135)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 5.4512 (5.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6634 (6.0719)  time: 0.1700  data: 0.0010  max mem: 13273
Epoch: [5] Total time: 0:04:19 (0.2077 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 5.4512 (5.0805)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6634 (6.0719)
Test:  [ 0/25]  eta: 0:02:18  loss: 2.3012 (2.3012)  acc1: 50.4000 (50.4000)  acc5: 74.4000 (74.4000)  time: 5.5268  data: 5.4204  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 2.3268 (2.4445)  acc1: 49.6000 (47.6364)  acc5: 74.4000 (74.3636)  time: 0.7599  data: 0.6610  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 3.0408 (2.8684)  acc1: 39.2000 (41.4476)  acc5: 62.4000 (66.8000)  time: 0.2189  data: 0.1243  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 3.1671 (2.8627)  acc1: 35.6000 (41.5200)  acc5: 61.2000 (66.8160)  time: 0.2201  data: 0.1264  max mem: 13273
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 41.520 Acc@5 67.172 loss 2.859
Accuracy of the model on the 50000 test images: 41.5%
Max accuracy: 41.52%
Epoch: [6]  [   0/1251]  eta: 1:04:43  lr: 0.001200  min_lr: 0.001200  loss: 5.6525 (5.6525)  weight_decay: 0.0500 (0.0500)  time: 3.1041  data: 2.8606  max mem: 13273
Epoch: [6]  [ 200/1251]  eta: 0:03:51  lr: 0.001232  min_lr: 0.001232  loss: 4.3738 (4.9176)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7312 (5.6590)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [6]  [ 400/1251]  eta: 0:03:01  lr: 0.001264  min_lr: 0.001264  loss: 4.8384 (4.9335)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.2888 (5.4148)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [6]  [ 600/1251]  eta: 0:02:17  lr: 0.001296  min_lr: 0.001296  loss: 5.1578 (4.9344)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.9903 (5.3176)  time: 0.2153  data: 0.0003  max mem: 13273
Epoch: [6]  [ 800/1251]  eta: 0:01:34  lr: 0.001328  min_lr: 0.001328  loss: 4.9937 (4.9258)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.0781 (5.2621)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [6]  [1000/1251]  eta: 0:00:52  lr: 0.001360  min_lr: 0.001360  loss: 4.6461 (4.9051)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2933 (5.1270)  time: 0.2069  data: 0.0006  max mem: 13273
Epoch: [6]  [1200/1251]  eta: 0:00:10  lr: 0.001393  min_lr: 0.001393  loss: 4.4128 (4.8888)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1987 (5.0551)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 5.4240 (4.8935)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0875 (5.0321)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [6] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 5.4240 (4.8894)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0875 (5.0321)
Test:  [ 0/25]  eta: 0:02:18  loss: 2.3082 (2.3082)  acc1: 54.4000 (54.4000)  acc5: 78.4000 (78.4000)  time: 5.5567  data: 5.4464  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 2.3082 (2.3932)  acc1: 54.4000 (49.0909)  acc5: 78.4000 (76.2909)  time: 0.7427  data: 0.6479  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.8998 (2.6968)  acc1: 40.0000 (44.3619)  acc5: 66.8000 (70.3238)  time: 0.2045  data: 0.1126  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.9080 (2.6898)  acc1: 39.6000 (44.6560)  acc5: 66.8000 (70.2720)  time: 0.2036  data: 0.1125  max mem: 13273
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 45.604 Acc@5 70.890 loss 2.662
Accuracy of the model on the 50000 test images: 45.6%
Max accuracy: 45.60%
Epoch: [7]  [   0/1251]  eta: 0:59:01  lr: 0.001400  min_lr: 0.001400  loss: 4.3226 (4.3226)  weight_decay: 0.0500 (0.0500)  time: 2.8308  data: 2.5702  max mem: 13273
Epoch: [7]  [ 200/1251]  eta: 0:03:50  lr: 0.001432  min_lr: 0.001432  loss: 5.0280 (4.7448)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.0004 (4.7911)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [7]  [ 400/1251]  eta: 0:03:00  lr: 0.001464  min_lr: 0.001464  loss: 5.0451 (4.7705)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7740 (4.2604)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [7]  [ 600/1251]  eta: 0:02:16  lr: 0.001496  min_lr: 0.001496  loss: 5.0825 (4.7580)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6761 (4.3110)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [7]  [ 800/1251]  eta: 0:01:33  lr: 0.001528  min_lr: 0.001528  loss: 4.2795 (4.7438)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5269 (4.2465)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [7]  [1000/1251]  eta: 0:00:52  lr: 0.001561  min_lr: 0.001561  loss: 5.4901 (4.7294)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7044 (4.1292)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [7]  [1200/1251]  eta: 0:00:10  lr: 0.001593  min_lr: 0.001593  loss: 4.7897 (4.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5868 (4.0431)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 4.9376 (4.7194)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2791 (4.0152)  time: 0.1779  data: 0.0009  max mem: 13273
Epoch: [7] Total time: 0:04:19 (0.2072 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 4.9376 (4.7217)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2791 (4.0152)
Test:  [ 0/25]  eta: 0:02:17  loss: 2.0133 (2.0133)  acc1: 59.6000 (59.6000)  acc5: 81.6000 (81.6000)  time: 5.4979  data: 5.3660  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 2.0396 (2.1320)  acc1: 55.6000 (54.0727)  acc5: 81.6000 (80.2909)  time: 0.7466  data: 0.6438  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.4666 (2.4540)  acc1: 45.2000 (48.8571)  acc5: 72.0000 (74.5905)  time: 0.2001  data: 0.1042  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.6698 (2.4577)  acc1: 44.4000 (48.7200)  acc5: 70.4000 (74.4320)  time: 0.2009  data: 0.1052  max mem: 13273
Test: Total time: 0:00:09 (0.3999 s / it)
* Acc@1 48.978 Acc@5 74.016 loss 2.457
Accuracy of the model on the 50000 test images: 49.0%
Max accuracy: 48.98%
Epoch: [8]  [   0/1251]  eta: 0:56:31  lr: 0.001600  min_lr: 0.001600  loss: 4.6659 (4.6659)  weight_decay: 0.0500 (0.0500)  time: 2.7114  data: 2.4741  max mem: 13273
Epoch: [8]  [ 200/1251]  eta: 0:03:50  lr: 0.001632  min_lr: 0.001632  loss: 4.3232 (4.6215)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3901 (3.4972)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [8]  [ 400/1251]  eta: 0:03:00  lr: 0.001664  min_lr: 0.001664  loss: 4.0032 (4.6021)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1455 (3.4446)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [8]  [ 600/1251]  eta: 0:02:16  lr: 0.001696  min_lr: 0.001696  loss: 4.0120 (4.5899)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3470 (3.4576)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [8]  [ 800/1251]  eta: 0:01:34  lr: 0.001728  min_lr: 0.001728  loss: 4.6087 (4.5679)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2632 (3.4013)  time: 0.2051  data: 0.0003  max mem: 13273
Epoch: [8]  [1000/1251]  eta: 0:00:52  lr: 0.001761  min_lr: 0.001761  loss: 5.0388 (4.5777)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1657 (3.3294)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [8]  [1200/1251]  eta: 0:00:10  lr: 0.001793  min_lr: 0.001793  loss: 3.9669 (4.5713)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2411 (3.2873)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 4.0207 (4.5695)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1787 (3.2802)  time: 0.1715  data: 0.0007  max mem: 13273
Epoch: [8] Total time: 0:04:19 (0.2077 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 4.0207 (4.6031)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1787 (3.2802)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.7363 (1.7363)  acc1: 64.0000 (64.0000)  acc5: 82.4000 (82.4000)  time: 5.4893  data: 5.3841  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.7925 (1.9109)  acc1: 58.4000 (57.7455)  acc5: 84.4000 (82.6545)  time: 0.7324  data: 0.6369  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.3501 (2.2643)  acc1: 48.4000 (51.8095)  acc5: 75.6000 (76.7429)  time: 0.2028  data: 0.1104  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.5272 (2.2665)  acc1: 45.6000 (51.5680)  acc5: 71.6000 (76.8000)  time: 0.2086  data: 0.1170  max mem: 13273
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 51.918 Acc@5 76.836 loss 2.253
Accuracy of the model on the 50000 test images: 51.9%
Max accuracy: 51.92%
Epoch: [9]  [   0/1251]  eta: 1:02:01  lr: 0.001800  min_lr: 0.001800  loss: 4.7084 (4.7084)  weight_decay: 0.0500 (0.0500)  time: 2.9748  data: 2.7522  max mem: 13273
Epoch: [9]  [ 200/1251]  eta: 0:03:50  lr: 0.001832  min_lr: 0.001832  loss: 4.2172 (4.5717)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9181 (3.0751)  time: 0.2078  data: 0.0005  max mem: 13273
Epoch: [9]  [ 400/1251]  eta: 0:03:00  lr: 0.001864  min_lr: 0.001864  loss: 4.1418 (4.5296)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8394 (3.0837)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [9]  [ 600/1251]  eta: 0:02:16  lr: 0.001896  min_lr: 0.001896  loss: 4.6005 (4.5444)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4045 (3.0224)  time: 0.2074  data: 0.0005  max mem: 13273
Epoch: [9]  [ 800/1251]  eta: 0:01:34  lr: 0.001929  min_lr: 0.001929  loss: 4.1531 (4.5344)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4100 (2.9233)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [9]  [1000/1251]  eta: 0:00:52  lr: 0.001961  min_lr: 0.001961  loss: 3.9395 (4.5138)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1971 (2.8417)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [9]  [1200/1251]  eta: 0:00:10  lr: 0.001993  min_lr: 0.001993  loss: 4.3570 (4.5130)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3574 (2.8065)  time: 0.2041  data: 0.0006  max mem: 13273
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 4.8905 (4.5154)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2999 (2.7832)  time: 0.1707  data: 0.0006  max mem: 13273
Epoch: [9] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 4.8905 (4.4961)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2999 (2.7832)
Test:  [ 0/25]  eta: 0:02:09  loss: 1.7035 (1.7035)  acc1: 63.2000 (63.2000)  acc5: 83.2000 (83.2000)  time: 5.1938  data: 5.0875  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.7035 (1.8346)  acc1: 59.2000 (59.0545)  acc5: 85.2000 (84.4727)  time: 0.7148  data: 0.6205  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.2160 (2.1506)  acc1: 49.6000 (53.9048)  acc5: 77.2000 (78.9143)  time: 0.2212  data: 0.1295  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.3754 (2.1567)  acc1: 49.2000 (53.9520)  acc5: 74.8000 (78.5920)  time: 0.2140  data: 0.1228  max mem: 13273
Test: Total time: 0:00:10 (0.4035 s / it)
* Acc@1 54.568 Acc@5 78.570 loss 2.150
Accuracy of the model on the 50000 test images: 54.6%
Max accuracy: 54.57%
Epoch: [10]  [   0/1251]  eta: 0:57:10  lr: 0.002000  min_lr: 0.002000  loss: 3.4478 (3.4478)  weight_decay: 0.0500 (0.0500)  time: 2.7423  data: 2.4844  max mem: 13273
Epoch: [10]  [ 200/1251]  eta: 0:03:51  lr: 0.002032  min_lr: 0.002032  loss: 3.9260 (4.4852)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2984 (2.4723)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [10]  [ 400/1251]  eta: 0:03:00  lr: 0.002064  min_lr: 0.002064  loss: 4.8545 (4.4627)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3805 (2.4243)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [10]  [ 600/1251]  eta: 0:02:16  lr: 0.002096  min_lr: 0.002096  loss: 4.5378 (4.4630)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6344 (2.4369)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [10]  [ 800/1251]  eta: 0:01:34  lr: 0.002129  min_lr: 0.002129  loss: 3.7747 (4.4803)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4403 (2.3787)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [10]  [1000/1251]  eta: 0:00:52  lr: 0.002161  min_lr: 0.002161  loss: 4.1946 (4.4854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8896 (2.3185)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [10]  [1200/1251]  eta: 0:00:10  lr: 0.002193  min_lr: 0.002193  loss: 4.4401 (4.4695)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9683 (2.2812)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 3.9532 (4.4610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9683 (2.2699)  time: 0.1706  data: 0.0008  max mem: 13273
Epoch: [10] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 3.9532 (4.4166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9683 (2.2699)
Test:  [ 0/25]  eta: 0:02:26  loss: 1.7781 (1.7781)  acc1: 65.6000 (65.6000)  acc5: 84.4000 (84.4000)  time: 5.8410  data: 5.7357  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.7010 (1.7812)  acc1: 64.4000 (62.1455)  acc5: 86.4000 (85.6000)  time: 0.7040  data: 0.6087  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.2083 (2.0743)  acc1: 52.4000 (56.3619)  acc5: 76.0000 (80.2667)  time: 0.1803  data: 0.0879  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.2698 (2.0868)  acc1: 51.6000 (55.9840)  acc5: 75.6000 (79.8560)  time: 0.1894  data: 0.0979  max mem: 13273
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 55.966 Acc@5 79.868 loss 2.073
Accuracy of the model on the 50000 test images: 56.0%
Max accuracy: 55.97%
Epoch: [11]  [   0/1251]  eta: 1:07:15  lr: 0.002200  min_lr: 0.002200  loss: 5.4745 (5.4745)  weight_decay: 0.0500 (0.0500)  time: 3.2255  data: 3.0011  max mem: 13273
Epoch: [11]  [ 200/1251]  eta: 0:03:52  lr: 0.002232  min_lr: 0.002232  loss: 3.7942 (4.4196)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7975 (1.9985)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [11]  [ 400/1251]  eta: 0:03:01  lr: 0.002264  min_lr: 0.002264  loss: 3.9343 (4.3814)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1341 (2.0376)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [11]  [ 600/1251]  eta: 0:02:16  lr: 0.002297  min_lr: 0.002297  loss: 3.9371 (4.3299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5716 (1.9736)  time: 0.2035  data: 0.0005  max mem: 13273
Epoch: [11]  [ 800/1251]  eta: 0:01:34  lr: 0.002329  min_lr: 0.002329  loss: 4.0385 (4.3185)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8635 (1.9474)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [11]  [1000/1251]  eta: 0:00:52  lr: 0.002361  min_lr: 0.002361  loss: 4.8887 (4.3212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5998 (1.8963)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [11]  [1200/1251]  eta: 0:00:10  lr: 0.002393  min_lr: 0.002393  loss: 3.4941 (4.3053)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6594 (1.8634)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 3.8898 (4.3002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8660 (1.8657)  time: 0.1702  data: 0.0005  max mem: 13273
Epoch: [11] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 3.8898 (4.3418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8660 (1.8657)
Test:  [ 0/25]  eta: 0:02:28  loss: 1.5859 (1.5859)  acc1: 68.4000 (68.4000)  acc5: 85.2000 (85.2000)  time: 5.9522  data: 5.8219  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.5972 (1.7256)  acc1: 66.4000 (63.4545)  acc5: 87.6000 (85.9273)  time: 0.7681  data: 0.6666  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.1375 (2.0326)  acc1: 52.8000 (57.4286)  acc5: 77.2000 (80.1714)  time: 0.2022  data: 0.1077  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.3007 (2.0382)  acc1: 52.0000 (57.2960)  acc5: 76.4000 (80.1280)  time: 0.2004  data: 0.1076  max mem: 13273
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 56.278 Acc@5 80.202 loss 2.043
Accuracy of the model on the 50000 test images: 56.3%
Max accuracy: 56.28%
Epoch: [12]  [   0/1251]  eta: 1:06:41  lr: 0.002400  min_lr: 0.002400  loss: 3.4115 (3.4115)  weight_decay: 0.0500 (0.0500)  time: 3.1989  data: 2.9808  max mem: 13273
Epoch: [12]  [ 200/1251]  eta: 0:03:51  lr: 0.002432  min_lr: 0.002432  loss: 3.7589 (4.3273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4744 (1.5555)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [12]  [ 400/1251]  eta: 0:03:01  lr: 0.002464  min_lr: 0.002464  loss: 4.3433 (4.3508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4363 (1.5265)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [12]  [ 600/1251]  eta: 0:02:16  lr: 0.002497  min_lr: 0.002497  loss: 4.5415 (4.3304)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3633 (1.4905)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [12]  [ 800/1251]  eta: 0:01:34  lr: 0.002529  min_lr: 0.002529  loss: 3.8768 (4.3202)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6735 (1.5477)  time: 0.2064  data: 0.0007  max mem: 13273
Epoch: [12]  [1000/1251]  eta: 0:00:52  lr: 0.002561  min_lr: 0.002561  loss: 3.8025 (4.3031)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3091 (1.5290)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [12]  [1200/1251]  eta: 0:00:10  lr: 0.002593  min_lr: 0.002593  loss: 4.1352 (4.2940)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3778 (1.5068)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 3.7313 (4.2866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3190 (1.5002)  time: 0.1705  data: 0.0008  max mem: 13273
Epoch: [12] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 3.7313 (4.2923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3190 (1.5002)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.4085 (1.4085)  acc1: 70.4000 (70.4000)  acc5: 88.0000 (88.0000)  time: 5.6398  data: 5.5332  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.4809 (1.5764)  acc1: 64.0000 (64.3636)  acc5: 88.8000 (87.6000)  time: 0.7431  data: 0.6450  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.9094 (1.9007)  acc1: 55.6000 (58.9714)  acc5: 80.8000 (82.3429)  time: 0.2046  data: 0.1108  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.1163 (1.9198)  acc1: 55.2000 (58.6080)  acc5: 77.2000 (81.9840)  time: 0.2034  data: 0.1107  max mem: 13273
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 58.454 Acc@5 81.734 loss 1.923
Accuracy of the model on the 50000 test images: 58.5%
Max accuracy: 58.45%
Epoch: [13]  [   0/1251]  eta: 0:57:37  lr: 0.002600  min_lr: 0.002600  loss: 5.1772 (5.1772)  weight_decay: 0.0500 (0.0500)  time: 2.7635  data: 2.5274  max mem: 13273
Epoch: [13]  [ 200/1251]  eta: 0:03:52  lr: 0.002632  min_lr: 0.002632  loss: 3.6344 (4.2900)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2733 (1.3128)  time: 0.2133  data: 0.0004  max mem: 13273
Epoch: [13]  [ 400/1251]  eta: 0:03:02  lr: 0.002665  min_lr: 0.002665  loss: 4.0454 (4.2695)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1425 (1.3187)  time: 0.2078  data: 0.0005  max mem: 13273
Epoch: [13]  [ 600/1251]  eta: 0:02:17  lr: 0.002697  min_lr: 0.002697  loss: 4.9554 (4.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2469 (1.3333)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [13]  [ 800/1251]  eta: 0:01:34  lr: 0.002729  min_lr: 0.002729  loss: 3.7110 (4.2692)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2976 (1.3171)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [13]  [1000/1251]  eta: 0:00:52  lr: 0.002761  min_lr: 0.002761  loss: 3.4346 (4.2402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3717 (1.3187)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [13]  [1200/1251]  eta: 0:00:10  lr: 0.002793  min_lr: 0.002793  loss: 3.5067 (4.2321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2058 (1.3109)  time: 0.2089  data: 0.0005  max mem: 13273
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.1389 (4.2359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2094 (1.3081)  time: 0.1699  data: 0.0005  max mem: 13273
Epoch: [13] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.1389 (4.2301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2094 (1.3081)
Test:  [ 0/25]  eta: 0:01:55  loss: 1.4385 (1.4385)  acc1: 68.4000 (68.4000)  acc5: 89.6000 (89.6000)  time: 4.6230  data: 4.5151  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.5322 (1.5969)  acc1: 66.8000 (65.3818)  acc5: 88.8000 (87.3818)  time: 0.6489  data: 0.5506  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.9710 (1.9206)  acc1: 56.8000 (59.1810)  acc5: 78.8000 (81.8476)  time: 0.2084  data: 0.1135  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.1822 (1.9367)  acc1: 55.2000 (58.8160)  acc5: 77.2000 (81.5040)  time: 0.2162  data: 0.1222  max mem: 13273
Test: Total time: 0:00:09 (0.3926 s / it)
* Acc@1 58.818 Acc@5 81.962 loss 1.926
Accuracy of the model on the 50000 test images: 58.8%
Max accuracy: 58.82%
Epoch: [14]  [   0/1251]  eta: 1:07:31  lr: 0.002800  min_lr: 0.002800  loss: 4.5903 (4.5903)  weight_decay: 0.0500 (0.0500)  time: 3.2384  data: 2.9973  max mem: 13273
Epoch: [14]  [ 200/1251]  eta: 0:03:51  lr: 0.002833  min_lr: 0.002833  loss: 3.7883 (4.1189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1348 (1.2735)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [14]  [ 400/1251]  eta: 0:03:01  lr: 0.002865  min_lr: 0.002865  loss: 3.8690 (4.1372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2135 (1.2742)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [14]  [ 600/1251]  eta: 0:02:16  lr: 0.002897  min_lr: 0.002897  loss: 4.3353 (4.1379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2651 (1.2480)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [14]  [ 800/1251]  eta: 0:01:34  lr: 0.002929  min_lr: 0.002929  loss: 4.3977 (4.1495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3431 (1.2374)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [14]  [1000/1251]  eta: 0:00:52  lr: 0.002961  min_lr: 0.002961  loss: 4.1083 (4.1438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1837 (1.2283)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [14]  [1200/1251]  eta: 0:00:10  lr: 0.002993  min_lr: 0.002993  loss: 3.8262 (4.1573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1142 (1.2214)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 4.7417 (4.1633)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1178 (1.2245)  time: 0.1699  data: 0.0006  max mem: 13273
Epoch: [14] Total time: 0:04:20 (0.2079 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 4.7417 (4.1903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1178 (1.2245)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.4805 (1.4805)  acc1: 69.6000 (69.6000)  acc5: 88.4000 (88.4000)  time: 5.6444  data: 5.5360  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.5173 (1.5879)  acc1: 68.4000 (65.5273)  acc5: 89.2000 (87.8909)  time: 0.7305  data: 0.6316  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.9289 (1.8958)  acc1: 54.8000 (59.7524)  acc5: 79.6000 (82.7429)  time: 0.1992  data: 0.1045  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.1645 (1.9081)  acc1: 53.6000 (59.2960)  acc5: 78.4000 (82.4000)  time: 0.1977  data: 0.1044  max mem: 13273
Test: Total time: 0:00:10 (0.4046 s / it)
* Acc@1 59.684 Acc@5 82.840 loss 1.893
Accuracy of the model on the 50000 test images: 59.7%
Max accuracy: 59.68%
Epoch: [15]  [   0/1251]  eta: 1:09:47  lr: 0.003000  min_lr: 0.003000  loss: 3.5252 (3.5252)  weight_decay: 0.0500 (0.0500)  time: 3.3471  data: 3.1085  max mem: 13273
Epoch: [15]  [ 200/1251]  eta: 0:03:51  lr: 0.003033  min_lr: 0.003033  loss: 4.1034 (4.1877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0994 (1.1845)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [15]  [ 400/1251]  eta: 0:03:01  lr: 0.003065  min_lr: 0.003065  loss: 3.5888 (4.1581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3532 (1.1602)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [15]  [ 600/1251]  eta: 0:02:16  lr: 0.003097  min_lr: 0.003097  loss: 4.4116 (4.1755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9620 (1.1353)  time: 0.2052  data: 0.0006  max mem: 13273
Epoch: [15]  [ 800/1251]  eta: 0:01:34  lr: 0.003129  min_lr: 0.003129  loss: 4.1117 (4.1601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0970 (1.1247)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [15]  [1000/1251]  eta: 0:00:52  lr: 0.003161  min_lr: 0.003161  loss: 4.4540 (4.1529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (1.1210)  time: 0.2078  data: 0.0004  max mem: 13273
Epoch: [15]  [1200/1251]  eta: 0:00:10  lr: 0.003193  min_lr: 0.003193  loss: 3.2725 (4.1426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0374 (1.1063)  time: 0.2076  data: 0.0005  max mem: 13273
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 4.6202 (4.1497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0521 (1.1038)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [15] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 4.6202 (4.1409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0521 (1.1038)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.4763 (1.4763)  acc1: 71.2000 (71.2000)  acc5: 87.2000 (87.2000)  time: 5.7985  data: 5.6917  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.5337 (1.6345)  acc1: 64.0000 (64.4727)  acc5: 88.4000 (87.0909)  time: 0.7425  data: 0.6471  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.9715 (1.9358)  acc1: 54.8000 (58.7619)  acc5: 80.8000 (82.1714)  time: 0.1985  data: 0.1037  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.2147 (1.9430)  acc1: 55.2000 (58.8480)  acc5: 78.0000 (81.9520)  time: 0.1974  data: 0.1036  max mem: 13273
Test: Total time: 0:00:10 (0.4082 s / it)
* Acc@1 59.592 Acc@5 82.654 loss 1.916
Accuracy of the model on the 50000 test images: 59.6%
Max accuracy: 59.68%
Epoch: [16]  [   0/1251]  eta: 1:05:14  lr: 0.003201  min_lr: 0.003201  loss: 4.8084 (4.8084)  weight_decay: 0.0500 (0.0500)  time: 3.1289  data: 2.2552  max mem: 13273
Epoch: [16]  [ 200/1251]  eta: 0:03:52  lr: 0.003233  min_lr: 0.003233  loss: 3.6863 (4.1240)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0078 (1.0567)  time: 0.2043  data: 0.0006  max mem: 13273
Epoch: [16]  [ 400/1251]  eta: 0:03:02  lr: 0.003265  min_lr: 0.003265  loss: 4.0122 (4.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1501 (1.0846)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [16]  [ 600/1251]  eta: 0:02:17  lr: 0.003297  min_lr: 0.003297  loss: 4.2095 (4.1401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9390 (1.0693)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [16]  [ 800/1251]  eta: 0:01:35  lr: 0.003329  min_lr: 0.003329  loss: 4.1239 (4.1570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1023 (inf)  time: 0.2091  data: 0.0005  max mem: 13273
Epoch: [16]  [1000/1251]  eta: 0:00:52  lr: 0.003361  min_lr: 0.003361  loss: 3.6135 (4.1513)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (inf)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [16]  [1200/1251]  eta: 0:00:10  lr: 0.003393  min_lr: 0.003393  loss: 4.4944 (4.1646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9531 (inf)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 4.2573 (4.1663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8793 (inf)  time: 0.1711  data: 0.0007  max mem: 13273
Epoch: [16] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 4.2573 (4.1317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8793 (inf)
Test:  [ 0/25]  eta: 0:02:26  loss: 1.4136 (1.4136)  acc1: 73.2000 (73.2000)  acc5: 90.4000 (90.4000)  time: 5.8472  data: 5.7164  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.4136 (1.5655)  acc1: 67.2000 (66.6909)  acc5: 90.0000 (88.8000)  time: 0.6727  data: 0.5710  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 2.0185 (1.8918)  acc1: 56.8000 (60.2667)  acc5: 82.4000 (83.4476)  time: 0.1701  data: 0.0756  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.1247 (1.8936)  acc1: 56.0000 (60.1760)  acc5: 79.2000 (83.4080)  time: 0.2041  data: 0.1115  max mem: 13273
Test: Total time: 0:00:10 (0.4170 s / it)
* Acc@1 60.524 Acc@5 83.792 loss 1.867
Accuracy of the model on the 50000 test images: 60.5%
Max accuracy: 60.52%
Epoch: [17]  [   0/1251]  eta: 1:03:46  lr: 0.003401  min_lr: 0.003401  loss: 4.7677 (4.7677)  weight_decay: 0.0500 (0.0500)  time: 3.0587  data: 2.8209  max mem: 13273
Epoch: [17]  [ 200/1251]  eta: 0:03:51  lr: 0.003433  min_lr: 0.003433  loss: 3.5971 (4.0826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0715 (1.0631)  time: 0.2085  data: 0.0004  max mem: 13273
Epoch: [17]  [ 400/1251]  eta: 0:03:01  lr: 0.003465  min_lr: 0.003465  loss: 3.7349 (4.0694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (0.9839)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [17]  [ 600/1251]  eta: 0:02:17  lr: 0.003497  min_lr: 0.003497  loss: 3.5985 (4.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8978 (0.9763)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [17]  [ 800/1251]  eta: 0:01:34  lr: 0.003529  min_lr: 0.003529  loss: 3.8650 (4.0731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0694 (0.9809)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [17]  [1000/1251]  eta: 0:00:52  lr: 0.003561  min_lr: 0.003561  loss: 3.3336 (4.0620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0098 (0.9952)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [17]  [1200/1251]  eta: 0:00:10  lr: 0.003593  min_lr: 0.003593  loss: 4.2139 (4.0636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9384 (0.9904)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 3.6042 (4.0554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9407 (0.9892)  time: 0.1707  data: 0.0007  max mem: 13273
Epoch: [17] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 3.6042 (4.0847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9407 (0.9892)
Test:  [ 0/25]  eta: 0:01:49  loss: 1.2389 (1.2389)  acc1: 74.0000 (74.0000)  acc5: 88.0000 (88.0000)  time: 4.3932  data: 4.2872  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.3961 (1.4119)  acc1: 68.0000 (67.1273)  acc5: 90.0000 (89.2000)  time: 0.7102  data: 0.6117  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.7245 (1.7409)  acc1: 58.4000 (61.6000)  acc5: 82.4000 (84.4191)  time: 0.2566  data: 0.1616  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.9267 (1.7484)  acc1: 58.0000 (61.3760)  acc5: 81.2000 (84.3520)  time: 0.2385  data: 0.1449  max mem: 13273
Test: Total time: 0:00:10 (0.4187 s / it)
* Acc@1 62.164 Acc@5 84.542 loss 1.728
Accuracy of the model on the 50000 test images: 62.2%
Max accuracy: 62.16%
Epoch: [18]  [   0/1251]  eta: 1:14:05  lr: 0.003601  min_lr: 0.003601  loss: 3.3398 (3.3398)  weight_decay: 0.0500 (0.0500)  time: 3.5536  data: 3.3306  max mem: 13273
Epoch: [18]  [ 200/1251]  eta: 0:03:52  lr: 0.003633  min_lr: 0.003633  loss: 3.9159 (4.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9004 (0.9872)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [18]  [ 400/1251]  eta: 0:03:01  lr: 0.003665  min_lr: 0.003665  loss: 3.3874 (4.0881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8630 (1.0029)  time: 0.2083  data: 0.0005  max mem: 13273
Epoch: [18]  [ 600/1251]  eta: 0:02:17  lr: 0.003697  min_lr: 0.003697  loss: 3.4670 (4.0907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8405 (0.9940)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [18]  [ 800/1251]  eta: 0:01:34  lr: 0.003729  min_lr: 0.003729  loss: 4.1450 (4.0607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9116 (0.9876)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [18]  [1000/1251]  eta: 0:00:52  lr: 0.003761  min_lr: 0.003761  loss: 3.8198 (4.0448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9030 (0.9730)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [18]  [1200/1251]  eta: 0:00:10  lr: 0.003793  min_lr: 0.003793  loss: 3.4904 (4.0450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (0.9686)  time: 0.2053  data: 0.0006  max mem: 13273
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 3.6018 (4.0474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8505 (0.9763)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [18] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 3.6018 (4.0465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8505 (0.9763)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.3326 (1.3326)  acc1: 71.6000 (71.6000)  acc5: 90.4000 (90.4000)  time: 5.6210  data: 5.5138  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.3957 (1.5420)  acc1: 68.4000 (66.4727)  acc5: 89.2000 (88.7636)  time: 0.7151  data: 0.6208  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.9668 (1.8345)  acc1: 57.6000 (61.0667)  acc5: 80.8000 (83.8667)  time: 0.1926  data: 0.1008  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.1095 (1.8399)  acc1: 57.2000 (60.7840)  acc5: 79.2000 (83.8880)  time: 0.1915  data: 0.1007  max mem: 13273
Test: Total time: 0:00:09 (0.3965 s / it)
* Acc@1 60.662 Acc@5 83.560 loss 1.837
Accuracy of the model on the 50000 test images: 60.7%
Max accuracy: 62.16%
Epoch: [19]  [   0/1251]  eta: 1:02:07  lr: 0.003801  min_lr: 0.003801  loss: 3.7679 (3.7679)  weight_decay: 0.0500 (0.0500)  time: 2.9796  data: 2.2437  max mem: 13273
Epoch: [19]  [ 200/1251]  eta: 0:03:52  lr: 0.003833  min_lr: 0.003833  loss: 3.4168 (3.9662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8604 (0.9028)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [19]  [ 400/1251]  eta: 0:03:01  lr: 0.003865  min_lr: 0.003865  loss: 4.0135 (4.0034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (0.9299)  time: 0.2033  data: 0.0005  max mem: 13273
Epoch: [19]  [ 600/1251]  eta: 0:02:17  lr: 0.003897  min_lr: 0.003897  loss: 3.5187 (4.0056)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1308 (0.9645)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [19]  [ 800/1251]  eta: 0:01:34  lr: 0.003929  min_lr: 0.003929  loss: 4.0958 (3.9986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7712 (0.9287)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [19]  [1000/1251]  eta: 0:00:52  lr: 0.003961  min_lr: 0.003961  loss: 3.5520 (3.9882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9298 (0.9284)  time: 0.2045  data: 0.0006  max mem: 13273
Epoch: [19]  [1200/1251]  eta: 0:00:10  lr: 0.003993  min_lr: 0.003993  loss: 3.6797 (3.9755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8238 (0.9210)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.3031 (3.9828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8452 (0.9178)  time: 0.1701  data: 0.0008  max mem: 13273
Epoch: [19] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.3031 (4.0223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8452 (0.9178)
Test:  [ 0/25]  eta: 0:02:26  loss: 1.3488 (1.3488)  acc1: 74.4000 (74.4000)  acc5: 90.8000 (90.8000)  time: 5.8432  data: 5.7366  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.4260 (1.5121)  acc1: 70.4000 (68.5818)  acc5: 90.8000 (90.0364)  time: 0.7878  data: 0.6893  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.9034 (1.8114)  acc1: 59.2000 (62.8190)  acc5: 83.2000 (85.2191)  time: 0.2105  data: 0.1161  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 2.0360 (1.8315)  acc1: 57.6000 (62.1760)  acc5: 81.6000 (84.8480)  time: 0.2108  data: 0.1160  max mem: 13273
Test: Total time: 0:00:10 (0.4208 s / it)
* Acc@1 62.218 Acc@5 84.980 loss 1.823
Accuracy of the model on the 50000 test images: 62.2%
Max accuracy: 62.22%
Epoch: [20]  [   0/1251]  eta: 0:54:13  lr: 0.004000  min_lr: 0.004000  loss: 4.4567 (4.4567)  weight_decay: 0.0500 (0.0500)  time: 2.6009  data: 2.0967  max mem: 13273
Epoch: [20]  [ 200/1251]  eta: 0:03:50  lr: 0.004000  min_lr: 0.004000  loss: 3.4037 (3.9106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (0.9692)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [20]  [ 400/1251]  eta: 0:03:01  lr: 0.004000  min_lr: 0.004000  loss: 4.1001 (3.9727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8975 (0.9398)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [20]  [ 600/1251]  eta: 0:02:17  lr: 0.004000  min_lr: 0.004000  loss: 3.3258 (4.0147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8442 (0.9206)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [20]  [ 800/1251]  eta: 0:01:34  lr: 0.004000  min_lr: 0.004000  loss: 4.7497 (4.0175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8085 (inf)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [20]  [1000/1251]  eta: 0:00:52  lr: 0.004000  min_lr: 0.004000  loss: 3.2304 (4.0294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8150 (inf)  time: 0.2077  data: 0.0006  max mem: 13273
Epoch: [20]  [1200/1251]  eta: 0:00:10  lr: 0.004000  min_lr: 0.004000  loss: 3.5753 (4.0333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7449 (inf)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.2034 (4.0343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (inf)  time: 0.1708  data: 0.0007  max mem: 13273
Epoch: [20] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.2034 (3.9827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (inf)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.3180 (1.3180)  acc1: 68.8000 (68.8000)  acc5: 90.4000 (90.4000)  time: 5.4974  data: 5.3879  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.4232 (1.4625)  acc1: 68.8000 (68.1818)  acc5: 90.4000 (89.8182)  time: 0.7139  data: 0.6192  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.7886 (1.7587)  acc1: 60.0000 (62.6476)  acc5: 82.0000 (84.8381)  time: 0.1934  data: 0.1010  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.9729 (1.7749)  acc1: 60.0000 (62.2240)  acc5: 80.8000 (84.4000)  time: 0.2076  data: 0.1153  max mem: 13273
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 62.144 Acc@5 84.614 loss 1.766
Accuracy of the model on the 50000 test images: 62.1%
Max accuracy: 62.22%
Epoch: [21]  [   0/1251]  eta: 1:08:23  lr: 0.004000  min_lr: 0.004000  loss: 3.1077 (3.1077)  weight_decay: 0.0500 (0.0500)  time: 3.2800  data: 1.6813  max mem: 13273
Epoch: [21]  [ 200/1251]  eta: 0:03:54  lr: 0.004000  min_lr: 0.004000  loss: 3.7798 (3.9247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.8656)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [21]  [ 400/1251]  eta: 0:03:02  lr: 0.004000  min_lr: 0.004000  loss: 3.7229 (3.9272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (0.8453)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [21]  [ 600/1251]  eta: 0:02:17  lr: 0.004000  min_lr: 0.004000  loss: 3.5374 (3.8837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8504 (0.8611)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [21]  [ 800/1251]  eta: 0:01:34  lr: 0.004000  min_lr: 0.004000  loss: 4.4767 (3.8832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.8356)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [21]  [1000/1251]  eta: 0:00:52  lr: 0.004000  min_lr: 0.004000  loss: 3.7252 (3.8953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6869 (0.8377)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [21]  [1200/1251]  eta: 0:00:10  lr: 0.004000  min_lr: 0.004000  loss: 3.2298 (3.8979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6373 (0.8217)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 3.8959 (3.9005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7181 (0.8232)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [21] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 3.8959 (3.9216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7181 (0.8232)
Test:  [ 0/25]  eta: 0:02:23  loss: 1.2499 (1.2499)  acc1: 78.0000 (78.0000)  acc5: 91.6000 (91.6000)  time: 5.7288  data: 5.5966  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.2499 (1.4021)  acc1: 71.2000 (70.2182)  acc5: 92.0000 (90.5091)  time: 0.7284  data: 0.6268  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.7604 (1.6907)  acc1: 62.0000 (64.6857)  acc5: 86.8000 (86.1714)  time: 0.1952  data: 0.1008  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.9493 (1.7016)  acc1: 61.2000 (64.3040)  acc5: 82.8000 (85.8880)  time: 0.1933  data: 0.1007  max mem: 13273
Test: Total time: 0:00:10 (0.4035 s / it)
* Acc@1 64.194 Acc@5 86.104 loss 1.688
Accuracy of the model on the 50000 test images: 64.2%
Max accuracy: 64.19%
Epoch: [22]  [   0/1251]  eta: 1:00:42  lr: 0.003999  min_lr: 0.003999  loss: 3.0616 (3.0616)  weight_decay: 0.0500 (0.0500)  time: 2.9118  data: 2.6697  max mem: 13273
Epoch: [22]  [ 200/1251]  eta: 0:03:50  lr: 0.003999  min_lr: 0.003999  loss: 3.7673 (3.9297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8444 (0.8855)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [22]  [ 400/1251]  eta: 0:03:00  lr: 0.003999  min_lr: 0.003999  loss: 3.1669 (3.9384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7897 (0.8328)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [22]  [ 600/1251]  eta: 0:02:16  lr: 0.003999  min_lr: 0.003999  loss: 4.4342 (3.9370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.8138)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [22]  [ 800/1251]  eta: 0:01:34  lr: 0.003999  min_lr: 0.003999  loss: 4.5625 (3.9441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8494 (0.8229)  time: 0.2057  data: 0.0003  max mem: 13273
Epoch: [22]  [1000/1251]  eta: 0:00:52  lr: 0.003999  min_lr: 0.003999  loss: 3.6473 (3.9093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.8083)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [22]  [1200/1251]  eta: 0:00:10  lr: 0.003999  min_lr: 0.003999  loss: 4.5154 (3.9023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7744 (0.8322)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 3.2924 (3.8999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6710 (0.8260)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [22] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 3.2924 (3.8960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6710 (0.8260)
Test:  [ 0/25]  eta: 0:02:23  loss: 1.2953 (1.2953)  acc1: 73.6000 (73.6000)  acc5: 90.0000 (90.0000)  time: 5.7445  data: 5.6308  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.2953 (1.3569)  acc1: 73.2000 (70.6182)  acc5: 92.0000 (90.5455)  time: 0.6844  data: 0.5908  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.6717 (1.6689)  acc1: 61.2000 (64.7619)  acc5: 84.4000 (85.9048)  time: 0.1790  data: 0.0874  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.9405 (1.6817)  acc1: 59.6000 (64.2560)  acc5: 82.4000 (85.8880)  time: 0.1899  data: 0.0986  max mem: 13273
Test: Total time: 0:00:10 (0.4010 s / it)
* Acc@1 64.064 Acc@5 85.868 loss 1.673
Accuracy of the model on the 50000 test images: 64.1%
Max accuracy: 64.19%
Epoch: [23]  [   0/1251]  eta: 1:05:19  lr: 0.003999  min_lr: 0.003999  loss: 3.8591 (3.8591)  weight_decay: 0.0500 (0.0500)  time: 3.1331  data: 2.5574  max mem: 13273
Epoch: [23]  [ 200/1251]  eta: 0:03:53  lr: 0.003999  min_lr: 0.003999  loss: 3.3244 (3.9284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7712 (0.8098)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [23]  [ 400/1251]  eta: 0:03:02  lr: 0.003999  min_lr: 0.003999  loss: 4.2375 (3.8849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6506 (0.8073)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [23]  [ 600/1251]  eta: 0:02:18  lr: 0.003998  min_lr: 0.003998  loss: 3.4976 (3.8896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7745 (0.8010)  time: 0.2068  data: 0.0004  max mem: 13273
Epoch: [23]  [ 800/1251]  eta: 0:01:35  lr: 0.003998  min_lr: 0.003998  loss: 3.2573 (3.8719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7965 (0.7925)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [23]  [1000/1251]  eta: 0:00:52  lr: 0.003998  min_lr: 0.003998  loss: 4.2221 (3.8586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.7927)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [23]  [1200/1251]  eta: 0:00:10  lr: 0.003998  min_lr: 0.003998  loss: 3.8349 (3.8811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7500 (0.8262)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 4.6547 (3.8885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7028 (0.8227)  time: 0.1700  data: 0.0007  max mem: 13273
Epoch: [23] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 4.6547 (3.8767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7028 (0.8227)
Test:  [ 0/25]  eta: 0:02:16  loss: 1.3186 (1.3186)  acc1: 74.4000 (74.4000)  acc5: 92.8000 (92.8000)  time: 5.4571  data: 5.3427  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.3348 (1.4965)  acc1: 69.6000 (69.2727)  acc5: 92.4000 (90.4000)  time: 0.7620  data: 0.6620  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.8545 (1.7894)  acc1: 59.6000 (63.6381)  acc5: 83.6000 (85.5238)  time: 0.2326  data: 0.1377  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.9922 (1.7996)  acc1: 59.6000 (63.3760)  acc5: 81.6000 (85.3600)  time: 0.2317  data: 0.1376  max mem: 13273
Test: Total time: 0:00:10 (0.4218 s / it)
* Acc@1 63.926 Acc@5 85.654 loss 1.792
Accuracy of the model on the 50000 test images: 63.9%
Max accuracy: 64.19%
Epoch: [24]  [   0/1251]  eta: 1:07:19  lr: 0.003998  min_lr: 0.003998  loss: 4.9215 (4.9215)  weight_decay: 0.0500 (0.0500)  time: 3.2289  data: 1.7150  max mem: 13273
Epoch: [24]  [ 200/1251]  eta: 0:03:52  lr: 0.003998  min_lr: 0.003998  loss: 3.6862 (3.9265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6793 (0.7099)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [24]  [ 400/1251]  eta: 0:03:01  lr: 0.003998  min_lr: 0.003998  loss: 4.2652 (3.9214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7440)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [24]  [ 600/1251]  eta: 0:02:17  lr: 0.003997  min_lr: 0.003997  loss: 4.2906 (3.9160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (0.7330)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [24]  [ 800/1251]  eta: 0:01:34  lr: 0.003997  min_lr: 0.003997  loss: 3.1056 (3.9151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7661 (0.7422)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [24]  [1000/1251]  eta: 0:00:52  lr: 0.003997  min_lr: 0.003997  loss: 3.6935 (3.9186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7394)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [24]  [1200/1251]  eta: 0:00:10  lr: 0.003997  min_lr: 0.003997  loss: 3.5524 (3.9002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7326 (0.7413)  time: 0.2065  data: 0.0006  max mem: 13273
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 3.7511 (3.8997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.7442)  time: 0.1707  data: 0.0007  max mem: 13273
Epoch: [24] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 3.7511 (3.8601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.7442)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.1343 (1.1343)  acc1: 74.8000 (74.8000)  acc5: 94.8000 (94.8000)  time: 5.6706  data: 5.5649  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.2231 (1.3931)  acc1: 72.0000 (71.6727)  acc5: 92.8000 (90.8364)  time: 0.7196  data: 0.6228  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.8079 (1.6831)  acc1: 62.8000 (65.2762)  acc5: 83.6000 (86.2476)  time: 0.1936  data: 0.1000  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.8651 (1.6882)  acc1: 62.4000 (65.0400)  acc5: 82.8000 (86.1760)  time: 0.2055  data: 0.1130  max mem: 13273
Test: Total time: 0:00:10 (0.4105 s / it)
* Acc@1 64.890 Acc@5 86.298 loss 1.675
Accuracy of the model on the 50000 test images: 64.9%
Max accuracy: 64.89%
Epoch: [25]  [   0/1251]  eta: 1:08:55  lr: 0.003997  min_lr: 0.003997  loss: 4.5295 (4.5295)  weight_decay: 0.0500 (0.0500)  time: 3.3054  data: 3.0847  max mem: 13273
Epoch: [25]  [ 200/1251]  eta: 0:03:53  lr: 0.003997  min_lr: 0.003997  loss: 4.2001 (3.7838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7945 (0.7942)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [25]  [ 400/1251]  eta: 0:03:02  lr: 0.003996  min_lr: 0.003996  loss: 3.6586 (3.7918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6885 (0.7527)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [25]  [ 600/1251]  eta: 0:02:17  lr: 0.003996  min_lr: 0.003996  loss: 3.4500 (3.7952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7936 (0.7535)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [25]  [ 800/1251]  eta: 0:01:34  lr: 0.003996  min_lr: 0.003996  loss: 3.3905 (3.8029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7412 (0.7485)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [25]  [1000/1251]  eta: 0:00:52  lr: 0.003996  min_lr: 0.003996  loss: 4.0342 (3.8189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.7465)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [25]  [1200/1251]  eta: 0:00:10  lr: 0.003996  min_lr: 0.003996  loss: 4.1382 (3.8298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7157 (0.7511)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 4.1859 (3.8266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7528)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [25] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 4.1859 (3.8184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7528)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.1802 (1.1802)  acc1: 76.4000 (76.4000)  acc5: 93.6000 (93.6000)  time: 5.6739  data: 5.5671  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.2237 (1.3508)  acc1: 71.2000 (71.5636)  acc5: 93.2000 (91.7818)  time: 0.7631  data: 0.6651  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.7161 (1.6496)  acc1: 62.0000 (65.6191)  acc5: 85.2000 (86.9905)  time: 0.2180  data: 0.1234  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.8250 (1.6612)  acc1: 61.2000 (65.3760)  acc5: 84.0000 (86.7200)  time: 0.2176  data: 0.1234  max mem: 13273
Test: Total time: 0:00:10 (0.4185 s / it)
* Acc@1 65.534 Acc@5 86.856 loss 1.652
Accuracy of the model on the 50000 test images: 65.5%
Max accuracy: 65.53%
Epoch: [26]  [   0/1251]  eta: 0:59:31  lr: 0.003995  min_lr: 0.003995  loss: 4.5482 (4.5482)  weight_decay: 0.0500 (0.0500)  time: 2.8551  data: 2.6132  max mem: 13273
Epoch: [26]  [ 200/1251]  eta: 0:03:50  lr: 0.003995  min_lr: 0.003995  loss: 3.0831 (3.6841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.7473)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [26]  [ 400/1251]  eta: 0:03:00  lr: 0.003995  min_lr: 0.003995  loss: 3.2982 (3.7568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7555 (0.7596)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [26]  [ 600/1251]  eta: 0:02:16  lr: 0.003995  min_lr: 0.003995  loss: 3.1998 (3.7611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.7578)  time: 0.2032  data: 0.0005  max mem: 13273
Epoch: [26]  [ 800/1251]  eta: 0:01:34  lr: 0.003994  min_lr: 0.003994  loss: 3.0915 (3.7464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5657 (0.7457)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [26]  [1000/1251]  eta: 0:00:52  lr: 0.003994  min_lr: 0.003994  loss: 3.8868 (3.7517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6639 (0.7450)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [26]  [1200/1251]  eta: 0:00:10  lr: 0.003994  min_lr: 0.003994  loss: 3.3311 (3.7601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8748 (0.7599)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 3.4968 (3.7600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7568 (0.7576)  time: 0.1701  data: 0.0009  max mem: 13273
Epoch: [26] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 3.4968 (3.7925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7568 (0.7576)
Test:  [ 0/25]  eta: 0:02:10  loss: 1.1059 (1.1059)  acc1: 77.2000 (77.2000)  acc5: 93.2000 (93.2000)  time: 5.2191  data: 5.0833  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.2579 (1.3651)  acc1: 75.6000 (71.4182)  acc5: 91.6000 (91.2727)  time: 0.7469  data: 0.6439  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.6722 (1.6120)  acc1: 63.6000 (66.1524)  acc5: 86.8000 (87.5048)  time: 0.2370  data: 0.1417  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.7327 (1.6207)  acc1: 63.2000 (65.9040)  acc5: 84.8000 (87.4080)  time: 0.2352  data: 0.1416  max mem: 13273
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 66.082 Acc@5 87.580 loss 1.606
Accuracy of the model on the 50000 test images: 66.1%
Max accuracy: 66.08%
Epoch: [27]  [   0/1251]  eta: 0:57:08  lr: 0.003994  min_lr: 0.003994  loss: 2.9384 (2.9384)  weight_decay: 0.0500 (0.0500)  time: 2.7402  data: 2.4411  max mem: 13273
Epoch: [27]  [ 200/1251]  eta: 0:03:51  lr: 0.003994  min_lr: 0.003994  loss: 2.9755 (3.7572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7101 (0.7484)  time: 0.2051  data: 0.0008  max mem: 13273
Epoch: [27]  [ 400/1251]  eta: 0:03:00  lr: 0.003993  min_lr: 0.003993  loss: 4.1802 (3.7532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7129 (0.7598)  time: 0.2054  data: 0.0008  max mem: 13273
Epoch: [27]  [ 600/1251]  eta: 0:02:16  lr: 0.003993  min_lr: 0.003993  loss: 3.6567 (3.7670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6718 (0.7442)  time: 0.2056  data: 0.0010  max mem: 13273
Epoch: [27]  [ 800/1251]  eta: 0:01:34  lr: 0.003993  min_lr: 0.003993  loss: 3.4657 (3.7608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6703 (0.7446)  time: 0.2052  data: 0.0008  max mem: 13273
Epoch: [27]  [1000/1251]  eta: 0:00:52  lr: 0.003992  min_lr: 0.003992  loss: 3.6207 (3.7536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.7567)  time: 0.2054  data: 0.0010  max mem: 13273
Epoch: [27]  [1200/1251]  eta: 0:00:10  lr: 0.003992  min_lr: 0.003992  loss: 3.9384 (3.7589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (0.7525)  time: 0.2069  data: 0.0010  max mem: 13273
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 3.3315 (3.7621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7380 (0.7534)  time: 0.1715  data: 0.0007  max mem: 13273
Epoch: [27] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 3.3315 (3.7664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7380 (0.7534)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9917 (0.9917)  acc1: 78.4000 (78.4000)  acc5: 94.4000 (94.4000)  time: 5.7908  data: 5.6853  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.2349 (1.3020)  acc1: 74.8000 (72.9091)  acc5: 92.0000 (91.0909)  time: 0.7727  data: 0.6745  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.7060 (1.5411)  acc1: 61.6000 (67.0667)  acc5: 85.2000 (87.2000)  time: 0.2020  data: 0.1072  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.7325 (1.5561)  acc1: 61.6000 (66.6240)  acc5: 84.4000 (87.2320)  time: 0.2119  data: 0.1179  max mem: 13273
Test: Total time: 0:00:10 (0.4195 s / it)
* Acc@1 66.534 Acc@5 87.974 loss 1.553
Accuracy of the model on the 50000 test images: 66.5%
Max accuracy: 66.53%
Epoch: [28]  [   0/1251]  eta: 0:59:48  lr: 0.003992  min_lr: 0.003992  loss: 4.6248 (4.6248)  weight_decay: 0.0500 (0.0500)  time: 2.8686  data: 2.5951  max mem: 13273
Epoch: [28]  [ 200/1251]  eta: 0:03:51  lr: 0.003992  min_lr: 0.003992  loss: 3.1224 (3.7495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (0.7578)  time: 0.2035  data: 0.0008  max mem: 13273
Epoch: [28]  [ 400/1251]  eta: 0:03:00  lr: 0.003991  min_lr: 0.003991  loss: 3.4537 (3.7535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6749 (0.7302)  time: 0.2077  data: 0.0007  max mem: 13273
Epoch: [28]  [ 600/1251]  eta: 0:02:16  lr: 0.003991  min_lr: 0.003991  loss: 4.3863 (3.7377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.7206)  time: 0.2048  data: 0.0009  max mem: 13273
Epoch: [28]  [ 800/1251]  eta: 0:01:34  lr: 0.003991  min_lr: 0.003991  loss: 4.1194 (3.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6209 (0.7109)  time: 0.2045  data: 0.0008  max mem: 13273
Epoch: [28]  [1000/1251]  eta: 0:00:52  lr: 0.003990  min_lr: 0.003990  loss: 3.9508 (3.7611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6758 (0.7184)  time: 0.2053  data: 0.0010  max mem: 13273
Epoch: [28]  [1200/1251]  eta: 0:00:10  lr: 0.003990  min_lr: 0.003990  loss: 3.3527 (3.7478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6693 (0.7209)  time: 0.2061  data: 0.0007  max mem: 13273
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 3.8078 (3.7521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7177 (0.7226)  time: 0.1710  data: 0.0007  max mem: 13273
Epoch: [28] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 3.8078 (3.7465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7177 (0.7226)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.1415 (1.1415)  acc1: 78.4000 (78.4000)  acc5: 94.0000 (94.0000)  time: 5.6097  data: 5.5027  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.2549 (1.3682)  acc1: 72.8000 (71.9273)  acc5: 93.6000 (91.4909)  time: 0.7568  data: 0.6592  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.7259 (1.6095)  acc1: 62.0000 (66.4762)  acc5: 86.8000 (87.8286)  time: 0.2169  data: 0.1226  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.7719 (1.6126)  acc1: 62.0000 (66.3040)  acc5: 84.8000 (87.6480)  time: 0.2160  data: 0.1225  max mem: 13273
Test: Total time: 0:00:10 (0.4154 s / it)
* Acc@1 66.794 Acc@5 87.736 loss 1.607
Accuracy of the model on the 50000 test images: 66.8%
Max accuracy: 66.79%
Epoch: [29]  [   0/1251]  eta: 1:09:10  lr: 0.003990  min_lr: 0.003990  loss: 3.0244 (3.0244)  weight_decay: 0.0500 (0.0500)  time: 3.3178  data: 3.1094  max mem: 13273
Epoch: [29]  [ 200/1251]  eta: 0:03:52  lr: 0.003989  min_lr: 0.003989  loss: 3.6073 (3.6741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6829 (0.7161)  time: 0.2058  data: 0.0008  max mem: 13273
Epoch: [29]  [ 400/1251]  eta: 0:03:01  lr: 0.003989  min_lr: 0.003989  loss: 3.2436 (3.6867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7406 (0.7447)  time: 0.2063  data: 0.0008  max mem: 13273
Epoch: [29]  [ 600/1251]  eta: 0:02:17  lr: 0.003989  min_lr: 0.003989  loss: 3.1138 (3.6977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7463 (0.7340)  time: 0.2065  data: 0.0008  max mem: 13273
Epoch: [29]  [ 800/1251]  eta: 0:01:34  lr: 0.003988  min_lr: 0.003988  loss: 3.1606 (3.7025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7039 (0.7367)  time: 0.2058  data: 0.0007  max mem: 13273
Epoch: [29]  [1000/1251]  eta: 0:00:52  lr: 0.003988  min_lr: 0.003988  loss: 4.1614 (3.7044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.7305)  time: 0.2085  data: 0.0008  max mem: 13273
Epoch: [29]  [1200/1251]  eta: 0:00:10  lr: 0.003988  min_lr: 0.003988  loss: 3.2595 (3.7107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7364 (0.7347)  time: 0.2077  data: 0.0009  max mem: 13273
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 3.6822 (3.7170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7195 (0.7333)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [29] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 3.6822 (3.7259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7195 (0.7333)
Test:  [ 0/25]  eta: 0:02:16  loss: 1.0899 (1.0899)  acc1: 75.6000 (75.6000)  acc5: 96.0000 (96.0000)  time: 5.4684  data: 5.3631  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1957 (1.2818)  acc1: 74.4000 (73.0545)  acc5: 93.2000 (92.6182)  time: 0.7393  data: 0.6424  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.6718 (1.5525)  acc1: 64.0000 (67.1238)  acc5: 85.6000 (88.2095)  time: 0.2145  data: 0.1213  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.7463 (1.5605)  acc1: 63.2000 (67.0080)  acc5: 84.4000 (88.1440)  time: 0.2138  data: 0.1212  max mem: 13273
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 67.288 Acc@5 88.206 loss 1.546
Accuracy of the model on the 50000 test images: 67.3%
Max accuracy: 67.29%
Epoch: [30]  [   0/1251]  eta: 0:58:21  lr: 0.003987  min_lr: 0.003987  loss: 4.5498 (4.5498)  weight_decay: 0.0500 (0.0500)  time: 2.7993  data: 2.5588  max mem: 13273
Epoch: [30]  [ 200/1251]  eta: 0:03:53  lr: 0.003987  min_lr: 0.003987  loss: 3.6742 (3.6441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6922 (0.6844)  time: 0.2121  data: 0.0010  max mem: 13273
Epoch: [30]  [ 400/1251]  eta: 0:03:01  lr: 0.003987  min_lr: 0.003987  loss: 3.4103 (3.6714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7639 (0.7222)  time: 0.2045  data: 0.0008  max mem: 13273
Epoch: [30]  [ 600/1251]  eta: 0:02:17  lr: 0.003986  min_lr: 0.003986  loss: 2.9949 (3.6842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6754 (0.7317)  time: 0.2048  data: 0.0010  max mem: 13273
Epoch: [30]  [ 800/1251]  eta: 0:01:34  lr: 0.003986  min_lr: 0.003986  loss: 3.0844 (3.6528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6538 (0.7373)  time: 0.2059  data: 0.0008  max mem: 13273
Epoch: [30]  [1000/1251]  eta: 0:00:52  lr: 0.003985  min_lr: 0.003985  loss: 3.7502 (3.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7764 (0.7399)  time: 0.2058  data: 0.0007  max mem: 13273
Epoch: [30]  [1200/1251]  eta: 0:00:10  lr: 0.003985  min_lr: 0.003985  loss: 3.2571 (3.6687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6961 (0.7270)  time: 0.2045  data: 0.0008  max mem: 13273
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.3797 (3.6732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (0.7242)  time: 0.1701  data: 0.0005  max mem: 13273
Epoch: [30] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.3797 (3.6897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (0.7242)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.9580 (0.9580)  acc1: 79.2000 (79.2000)  acc5: 95.6000 (95.6000)  time: 5.3748  data: 5.2656  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1240 (1.1984)  acc1: 74.8000 (74.1091)  acc5: 93.2000 (92.2182)  time: 0.7428  data: 0.6433  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5503 (1.4620)  acc1: 65.2000 (68.8000)  acc5: 85.6000 (88.3048)  time: 0.2051  data: 0.1076  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.7060 (1.4892)  acc1: 65.2000 (68.3520)  acc5: 84.8000 (88.0000)  time: 0.2046  data: 0.1075  max mem: 13273
Test: Total time: 0:00:09 (0.3978 s / it)
* Acc@1 67.910 Acc@5 88.358 loss 1.486
Accuracy of the model on the 50000 test images: 67.9%
Max accuracy: 67.91%
Epoch: [31]  [   0/1251]  eta: 1:07:55  lr: 0.003985  min_lr: 0.003985  loss: 4.2240 (4.2240)  weight_decay: 0.0500 (0.0500)  time: 3.2575  data: 3.0351  max mem: 13273
Epoch: [31]  [ 200/1251]  eta: 0:03:51  lr: 0.003984  min_lr: 0.003984  loss: 3.5320 (3.6148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.7578)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [31]  [ 400/1251]  eta: 0:03:01  lr: 0.003984  min_lr: 0.003984  loss: 3.3989 (3.5904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7409 (0.7500)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [31]  [ 600/1251]  eta: 0:02:16  lr: 0.003983  min_lr: 0.003983  loss: 3.1884 (3.6089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7095 (0.7387)  time: 0.2053  data: 0.0006  max mem: 13273
Epoch: [31]  [ 800/1251]  eta: 0:01:34  lr: 0.003983  min_lr: 0.003983  loss: 3.2783 (3.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.7351)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [31]  [1000/1251]  eta: 0:00:52  lr: 0.003982  min_lr: 0.003982  loss: 3.1285 (3.6420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.7333)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [31]  [1200/1251]  eta: 0:00:10  lr: 0.003982  min_lr: 0.003982  loss: 3.1458 (3.6555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6629 (0.7297)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 3.1115 (3.6615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.7296)  time: 0.1715  data: 0.0006  max mem: 13273
Epoch: [31] Total time: 0:04:19 (0.2078 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 3.1115 (3.6892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.7296)
Test:  [ 0/25]  eta: 0:02:18  loss: 1.0600 (1.0600)  acc1: 80.0000 (80.0000)  acc5: 94.4000 (94.4000)  time: 5.5203  data: 5.4121  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1078 (1.2091)  acc1: 76.4000 (73.5273)  acc5: 94.4000 (92.9091)  time: 0.6850  data: 0.5871  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5616 (1.5074)  acc1: 64.0000 (68.0952)  acc5: 87.6000 (88.6286)  time: 0.1902  data: 0.0966  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6907 (1.5150)  acc1: 64.0000 (68.1280)  acc5: 86.0000 (88.5120)  time: 0.2091  data: 0.1166  max mem: 13273
Test: Total time: 0:00:10 (0.4065 s / it)
* Acc@1 67.886 Acc@5 88.332 loss 1.507
Accuracy of the model on the 50000 test images: 67.9%
Max accuracy: 67.91%
Epoch: [32]  [   0/1251]  eta: 1:11:40  lr: 0.003982  min_lr: 0.003982  loss: 2.6931 (2.6931)  weight_decay: 0.0500 (0.0500)  time: 3.4375  data: 1.6462  max mem: 13273
Epoch: [32]  [ 200/1251]  eta: 0:03:53  lr: 0.003981  min_lr: 0.003981  loss: 3.7569 (3.6526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7717 (0.7183)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [32]  [ 400/1251]  eta: 0:03:01  lr: 0.003981  min_lr: 0.003981  loss: 3.3710 (3.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7241)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [32]  [ 600/1251]  eta: 0:02:17  lr: 0.003980  min_lr: 0.003980  loss: 3.9056 (3.6815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9002 (0.7502)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [32]  [ 800/1251]  eta: 0:01:34  lr: 0.003980  min_lr: 0.003980  loss: 3.1800 (3.6819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6034 (0.7362)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [32]  [1000/1251]  eta: 0:00:52  lr: 0.003979  min_lr: 0.003979  loss: 4.3475 (3.6847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7323 (0.7379)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [32]  [1200/1251]  eta: 0:00:10  lr: 0.003979  min_lr: 0.003979  loss: 3.5208 (3.7009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6587 (0.7315)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 3.6037 (3.6990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7306)  time: 0.1709  data: 0.0007  max mem: 13273
Epoch: [32] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 3.6037 (3.6841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7306)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9798 (0.9798)  acc1: 82.8000 (82.8000)  acc5: 94.4000 (94.4000)  time: 5.6575  data: 5.5522  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1635 (1.1917)  acc1: 76.4000 (75.4182)  acc5: 94.4000 (93.4182)  time: 0.7236  data: 0.6279  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4742 (1.4430)  acc1: 67.6000 (69.6952)  acc5: 88.4000 (89.2762)  time: 0.2045  data: 0.1119  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5978 (1.4629)  acc1: 66.4000 (69.1200)  acc5: 86.8000 (89.0080)  time: 0.2043  data: 0.1118  max mem: 13273
Test: Total time: 0:00:10 (0.4076 s / it)
* Acc@1 68.954 Acc@5 89.094 loss 1.456
Accuracy of the model on the 50000 test images: 69.0%
Max accuracy: 68.95%
Epoch: [33]  [   0/1251]  eta: 1:00:38  lr: 0.003979  min_lr: 0.003979  loss: 3.6454 (3.6454)  weight_decay: 0.0500 (0.0500)  time: 2.9087  data: 2.6597  max mem: 13273
Epoch: [33]  [ 200/1251]  eta: 0:03:51  lr: 0.003978  min_lr: 0.003978  loss: 3.0982 (3.6375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6487 (0.7027)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [33]  [ 400/1251]  eta: 0:03:01  lr: 0.003978  min_lr: 0.003978  loss: 4.1640 (3.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8306 (0.7467)  time: 0.2067  data: 0.0004  max mem: 13273
Epoch: [33]  [ 600/1251]  eta: 0:02:17  lr: 0.003977  min_lr: 0.003977  loss: 3.8115 (3.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7503)  time: 0.2067  data: 0.0006  max mem: 13273
Epoch: [33]  [ 800/1251]  eta: 0:01:35  lr: 0.003977  min_lr: 0.003977  loss: 3.8186 (3.6398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7295 (0.7424)  time: 0.2150  data: 0.0004  max mem: 13273
Epoch: [33]  [1000/1251]  eta: 0:00:52  lr: 0.003976  min_lr: 0.003976  loss: 3.4260 (3.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6622 (0.7372)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [33]  [1200/1251]  eta: 0:00:10  lr: 0.003976  min_lr: 0.003976  loss: 2.9631 (3.6598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7970 (0.7447)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 3.6729 (3.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7429)  time: 0.1702  data: 0.0010  max mem: 13273
Epoch: [33] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 3.6729 (3.6632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7429)
Test:  [ 0/25]  eta: 0:02:23  loss: 1.0658 (1.0658)  acc1: 79.6000 (79.6000)  acc5: 94.4000 (94.4000)  time: 5.7516  data: 5.6307  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1201 (1.2203)  acc1: 75.6000 (73.7091)  acc5: 94.4000 (92.8000)  time: 0.7533  data: 0.6526  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4903 (1.4878)  acc1: 66.0000 (68.5905)  acc5: 88.0000 (88.9143)  time: 0.2088  data: 0.1143  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.7017 (1.5050)  acc1: 65.6000 (68.3360)  acc5: 86.0000 (88.5760)  time: 0.2068  data: 0.1142  max mem: 13273
Test: Total time: 0:00:10 (0.4147 s / it)
* Acc@1 68.492 Acc@5 88.896 loss 1.503
Accuracy of the model on the 50000 test images: 68.5%
Max accuracy: 68.95%
Epoch: [34]  [   0/1251]  eta: 1:10:23  lr: 0.003975  min_lr: 0.003975  loss: 2.6430 (2.6430)  weight_decay: 0.0500 (0.0500)  time: 3.3758  data: 2.1020  max mem: 13273
Epoch: [34]  [ 200/1251]  eta: 0:03:52  lr: 0.003975  min_lr: 0.003975  loss: 2.8898 (3.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.7117)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [34]  [ 400/1251]  eta: 0:03:01  lr: 0.003974  min_lr: 0.003974  loss: 4.2582 (3.5927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7199)  time: 0.2048  data: 0.0003  max mem: 13273
Epoch: [34]  [ 600/1251]  eta: 0:02:17  lr: 0.003974  min_lr: 0.003974  loss: 3.6751 (3.6255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.7351)  time: 0.2067  data: 0.0006  max mem: 13273
Epoch: [34]  [ 800/1251]  eta: 0:01:34  lr: 0.003973  min_lr: 0.003973  loss: 3.3102 (3.6207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.7167)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [34]  [1000/1251]  eta: 0:00:52  lr: 0.003972  min_lr: 0.003972  loss: 3.2027 (3.6222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5872 (0.7097)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [34]  [1200/1251]  eta: 0:00:10  lr: 0.003972  min_lr: 0.003972  loss: 3.6921 (3.6261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6463 (0.7165)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 3.1070 (3.6212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6747 (0.7187)  time: 0.1717  data: 0.0008  max mem: 13273
Epoch: [34] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 3.1070 (3.6487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6747 (0.7187)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.9986 (0.9986)  acc1: 80.8000 (80.8000)  acc5: 94.0000 (94.0000)  time: 5.4181  data: 5.3101  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.1337 (1.2174)  acc1: 76.8000 (74.5455)  acc5: 93.6000 (92.5455)  time: 0.6437  data: 0.5463  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5974 (1.5011)  acc1: 64.4000 (68.6667)  acc5: 86.0000 (88.2667)  time: 0.1670  data: 0.0736  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6537 (1.5061)  acc1: 64.4000 (68.2560)  acc5: 85.6000 (88.2080)  time: 0.1874  data: 0.0947  max mem: 13273
Test: Total time: 0:00:09 (0.3865 s / it)
* Acc@1 68.084 Acc@5 88.720 loss 1.488
Accuracy of the model on the 50000 test images: 68.1%
Max accuracy: 68.95%
Epoch: [35]  [   0/1251]  eta: 1:06:44  lr: 0.003972  min_lr: 0.003972  loss: 2.9315 (2.9315)  weight_decay: 0.0500 (0.0500)  time: 3.2007  data: 1.5699  max mem: 13273
Epoch: [35]  [ 200/1251]  eta: 0:03:53  lr: 0.003971  min_lr: 0.003971  loss: 3.4465 (3.5961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7129 (0.7311)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [35]  [ 400/1251]  eta: 0:03:02  lr: 0.003971  min_lr: 0.003971  loss: 3.8524 (3.6063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.7630)  time: 0.2031  data: 0.0004  max mem: 13273
Epoch: [35]  [ 600/1251]  eta: 0:02:17  lr: 0.003970  min_lr: 0.003970  loss: 2.9892 (3.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6411 (0.7390)  time: 0.2034  data: 0.0004  max mem: 13273
Epoch: [35]  [ 800/1251]  eta: 0:01:34  lr: 0.003969  min_lr: 0.003969  loss: 3.4509 (3.6267)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [35]  [1000/1251]  eta: 0:00:52  lr: 0.003969  min_lr: 0.003969  loss: 3.1606 (3.6449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (nan)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [35]  [1200/1251]  eta: 0:00:10  lr: 0.003968  min_lr: 0.003968  loss: 3.7055 (3.6474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6662 (nan)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 3.7498 (3.6477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7143 (nan)  time: 0.1709  data: 0.0008  max mem: 13273
Epoch: [35] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 3.7498 (3.6180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7143 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 1.0879 (1.0879)  acc1: 79.2000 (79.2000)  acc5: 94.4000 (94.4000)  time: 5.4761  data: 5.3697  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1940 (1.2270)  acc1: 75.2000 (74.5091)  acc5: 94.4000 (93.4909)  time: 0.7308  data: 0.6329  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5494 (1.5025)  acc1: 66.0000 (68.9905)  acc5: 87.6000 (89.3714)  time: 0.2059  data: 0.1111  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6412 (1.5186)  acc1: 65.6000 (68.8960)  acc5: 87.2000 (89.0240)  time: 0.2051  data: 0.1110  max mem: 13273
Test: Total time: 0:00:10 (0.4022 s / it)
* Acc@1 68.656 Acc@5 89.168 loss 1.515
Accuracy of the model on the 50000 test images: 68.7%
Max accuracy: 68.95%
Epoch: [36]  [   0/1251]  eta: 1:03:26  lr: 0.003968  min_lr: 0.003968  loss: 4.3215 (4.3215)  weight_decay: 0.0500 (0.0500)  time: 3.0429  data: 1.6288  max mem: 13273
Epoch: [36]  [ 200/1251]  eta: 0:03:53  lr: 0.003967  min_lr: 0.003967  loss: 2.9217 (3.5856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7535 (0.7333)  time: 0.2120  data: 0.0003  max mem: 13273
Epoch: [36]  [ 400/1251]  eta: 0:03:02  lr: 0.003967  min_lr: 0.003967  loss: 3.7040 (3.5650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.7545)  time: 0.2069  data: 0.0004  max mem: 13273
Epoch: [36]  [ 600/1251]  eta: 0:02:17  lr: 0.003966  min_lr: 0.003966  loss: 3.7244 (3.5781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7505 (0.7616)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [36]  [ 800/1251]  eta: 0:01:34  lr: 0.003965  min_lr: 0.003965  loss: 3.4738 (3.5885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.7451)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [36]  [1000/1251]  eta: 0:00:52  lr: 0.003965  min_lr: 0.003965  loss: 3.2982 (3.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7533 (0.7598)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [36]  [1200/1251]  eta: 0:00:10  lr: 0.003964  min_lr: 0.003964  loss: 4.3755 (3.6256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (0.7426)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 3.4984 (3.6294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.7433)  time: 0.1704  data: 0.0008  max mem: 13273
Epoch: [36] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 3.4984 (3.6235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.7433)
Test:  [ 0/25]  eta: 0:02:25  loss: 1.0746 (1.0746)  acc1: 82.8000 (82.8000)  acc5: 94.0000 (94.0000)  time: 5.8313  data: 5.7247  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1859 (1.2285)  acc1: 76.4000 (75.9636)  acc5: 94.0000 (93.2727)  time: 0.7605  data: 0.6641  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4640 (1.4972)  acc1: 66.0000 (69.6381)  acc5: 88.4000 (89.4286)  time: 0.2051  data: 0.1122  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6483 (1.5037)  acc1: 65.6000 (69.5360)  acc5: 86.4000 (89.2480)  time: 0.2043  data: 0.1121  max mem: 13273
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 69.162 Acc@5 89.338 loss 1.495
Accuracy of the model on the 50000 test images: 69.2%
Max accuracy: 69.16%
Epoch: [37]  [   0/1251]  eta: 1:08:08  lr: 0.003964  min_lr: 0.003964  loss: 3.6057 (3.6057)  weight_decay: 0.0500 (0.0500)  time: 3.2678  data: 3.0175  max mem: 13273
Epoch: [37]  [ 200/1251]  eta: 0:03:52  lr: 0.003963  min_lr: 0.003963  loss: 2.8292 (3.5028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6898 (0.6737)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [37]  [ 400/1251]  eta: 0:03:01  lr: 0.003962  min_lr: 0.003962  loss: 3.4229 (3.5436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.6978)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [37]  [ 600/1251]  eta: 0:02:17  lr: 0.003962  min_lr: 0.003962  loss: 2.9657 (3.5747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5917 (0.7063)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [37]  [ 800/1251]  eta: 0:01:34  lr: 0.003961  min_lr: 0.003961  loss: 3.1126 (3.5778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.7225)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [37]  [1000/1251]  eta: 0:00:52  lr: 0.003960  min_lr: 0.003960  loss: 3.0733 (3.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7339)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [37]  [1200/1251]  eta: 0:00:10  lr: 0.003960  min_lr: 0.003960  loss: 3.3790 (3.5936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.7279)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 4.0116 (3.5991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7371 (0.7273)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [37] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 4.0116 (3.6051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7371 (0.7273)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.9278 (0.9278)  acc1: 83.2000 (83.2000)  acc5: 94.4000 (94.4000)  time: 5.2938  data: 5.1884  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0841 (1.1903)  acc1: 76.4000 (75.1636)  acc5: 93.6000 (93.2364)  time: 0.6937  data: 0.5987  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5532 (1.4726)  acc1: 65.2000 (69.5048)  acc5: 88.0000 (89.4095)  time: 0.1903  data: 0.0951  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6704 (1.4824)  acc1: 64.4000 (69.2160)  acc5: 85.6000 (89.2160)  time: 0.2000  data: 0.1049  max mem: 13273
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 69.164 Acc@5 89.400 loss 1.471
Accuracy of the model on the 50000 test images: 69.2%
Max accuracy: 69.16%
Epoch: [38]  [   0/1251]  eta: 1:00:48  lr: 0.003959  min_lr: 0.003959  loss: 3.9100 (3.9100)  weight_decay: 0.0500 (0.0500)  time: 2.9163  data: 2.6545  max mem: 13273
Epoch: [38]  [ 200/1251]  eta: 0:03:51  lr: 0.003959  min_lr: 0.003959  loss: 2.8608 (3.5371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6385 (0.7448)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [38]  [ 400/1251]  eta: 0:03:01  lr: 0.003958  min_lr: 0.003958  loss: 2.9085 (3.5578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6597 (0.7280)  time: 0.2081  data: 0.0004  max mem: 13273
Epoch: [38]  [ 600/1251]  eta: 0:02:16  lr: 0.003957  min_lr: 0.003957  loss: 3.6867 (3.5905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (0.7432)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [38]  [ 800/1251]  eta: 0:01:34  lr: 0.003956  min_lr: 0.003956  loss: 3.8847 (3.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6450 (0.7270)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [38]  [1000/1251]  eta: 0:00:52  lr: 0.003956  min_lr: 0.003956  loss: 3.1067 (3.5788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6796 (0.7267)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [38]  [1200/1251]  eta: 0:00:10  lr: 0.003955  min_lr: 0.003955  loss: 3.8153 (3.5759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7188)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 4.1328 (3.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7192)  time: 0.1700  data: 0.0007  max mem: 13273
Epoch: [38] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 4.1328 (3.5978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7192)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9816 (0.9816)  acc1: 81.6000 (81.6000)  acc5: 94.4000 (94.4000)  time: 5.6045  data: 5.4991  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1639 (1.1998)  acc1: 76.4000 (75.6364)  acc5: 94.0000 (93.3455)  time: 0.6675  data: 0.5734  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5768 (1.4446)  acc1: 66.0000 (69.7905)  acc5: 87.6000 (89.5810)  time: 0.1696  data: 0.0779  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6451 (1.4585)  acc1: 66.0000 (69.6640)  acc5: 86.4000 (89.3760)  time: 0.2002  data: 0.1094  max mem: 13273
Test: Total time: 0:00:10 (0.4030 s / it)
* Acc@1 69.690 Acc@5 89.578 loss 1.448
Accuracy of the model on the 50000 test images: 69.7%
Max accuracy: 69.69%
Epoch: [39]  [   0/1251]  eta: 0:53:39  lr: 0.003955  min_lr: 0.003955  loss: 2.7055 (2.7055)  weight_decay: 0.0500 (0.0500)  time: 2.5737  data: 2.3218  max mem: 13273
Epoch: [39]  [ 200/1251]  eta: 0:03:52  lr: 0.003954  min_lr: 0.003954  loss: 2.8928 (3.6289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6635 (0.7031)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [39]  [ 400/1251]  eta: 0:03:01  lr: 0.003953  min_lr: 0.003953  loss: 3.0273 (3.5917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6607 (0.7196)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [39]  [ 600/1251]  eta: 0:02:17  lr: 0.003952  min_lr: 0.003952  loss: 3.3969 (3.6055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6721 (0.7141)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [39]  [ 800/1251]  eta: 0:01:34  lr: 0.003952  min_lr: 0.003952  loss: 3.2797 (3.5879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.7319)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [39]  [1000/1251]  eta: 0:00:52  lr: 0.003951  min_lr: 0.003951  loss: 2.8891 (3.5703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7287 (0.7274)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [39]  [1200/1251]  eta: 0:00:10  lr: 0.003950  min_lr: 0.003950  loss: 2.7976 (3.5547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5986 (0.7206)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 3.4962 (3.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.7232)  time: 0.1703  data: 0.0006  max mem: 13273
Epoch: [39] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 3.4962 (3.5751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.7232)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.9915 (0.9915)  acc1: 83.2000 (83.2000)  acc5: 95.2000 (95.2000)  time: 5.1149  data: 4.9724  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.1317 (1.1948)  acc1: 76.0000 (76.2182)  acc5: 94.8000 (93.7455)  time: 0.6481  data: 0.5456  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4988 (1.4594)  acc1: 66.4000 (70.0571)  acc5: 88.8000 (89.7714)  time: 0.1881  data: 0.0907  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6003 (1.4708)  acc1: 66.4000 (69.9200)  acc5: 86.0000 (89.5520)  time: 0.2114  data: 0.1148  max mem: 13273
Test: Total time: 0:00:09 (0.3950 s / it)
* Acc@1 69.632 Acc@5 89.488 loss 1.470
Accuracy of the model on the 50000 test images: 69.6%
Max accuracy: 69.69%
Epoch: [40]  [   0/1251]  eta: 1:02:17  lr: 0.003950  min_lr: 0.003950  loss: 2.8231 (2.8231)  weight_decay: 0.0500 (0.0500)  time: 2.9877  data: 1.6348  max mem: 13273
Epoch: [40]  [ 200/1251]  eta: 0:03:54  lr: 0.003949  min_lr: 0.003949  loss: 3.8636 (3.6052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7837 (0.7754)  time: 0.2125  data: 0.0005  max mem: 13273
Epoch: [40]  [ 400/1251]  eta: 0:03:03  lr: 0.003948  min_lr: 0.003948  loss: 3.9993 (3.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5830 (0.7685)  time: 0.2044  data: 0.0006  max mem: 13273
Epoch: [40]  [ 600/1251]  eta: 0:02:18  lr: 0.003947  min_lr: 0.003947  loss: 3.3248 (3.5705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6501 (0.7523)  time: 0.2057  data: 0.0007  max mem: 13273
Epoch: [40]  [ 800/1251]  eta: 0:01:35  lr: 0.003947  min_lr: 0.003947  loss: 2.9052 (3.5721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5939 (0.7371)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [40]  [1000/1251]  eta: 0:00:52  lr: 0.003946  min_lr: 0.003946  loss: 3.7856 (3.5832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6493 (0.7331)  time: 0.2048  data: 0.0007  max mem: 13273
Epoch: [40]  [1200/1251]  eta: 0:00:10  lr: 0.003945  min_lr: 0.003945  loss: 4.2509 (3.5981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7189 (0.7402)  time: 0.2053  data: 0.0008  max mem: 13273
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 3.0221 (3.5929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (0.7395)  time: 0.1707  data: 0.0009  max mem: 13273
Epoch: [40] Total time: 0:04:22 (0.2095 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 3.0221 (3.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (0.7395)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.8548 (0.8548)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 3.3558  data: 3.2491  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.0777 (1.1034)  acc1: 78.0000 (75.6000)  acc5: 94.8000 (93.7455)  time: 0.6134  data: 0.5195  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4637 (1.3573)  acc1: 67.2000 (70.4571)  acc5: 88.4000 (89.9810)  time: 0.2747  data: 0.1832  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5214 (1.3714)  acc1: 67.2000 (70.0480)  acc5: 87.2000 (89.7120)  time: 0.2010  data: 0.1092  max mem: 13273
Test: Total time: 0:00:10 (0.4039 s / it)
* Acc@1 69.968 Acc@5 89.848 loss 1.374
Accuracy of the model on the 50000 test images: 70.0%
Max accuracy: 69.97%
Epoch: [41]  [   0/1251]  eta: 0:59:05  lr: 0.003945  min_lr: 0.003945  loss: 2.5914 (2.5914)  weight_decay: 0.0500 (0.0500)  time: 2.8343  data: 2.6035  max mem: 13273
Epoch: [41]  [ 200/1251]  eta: 0:03:50  lr: 0.003944  min_lr: 0.003944  loss: 4.0566 (3.5218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7087 (0.7020)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [41]  [ 400/1251]  eta: 0:03:00  lr: 0.003943  min_lr: 0.003943  loss: 2.7266 (3.5241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.7034)  time: 0.2051  data: 0.0003  max mem: 13273
Epoch: [41]  [ 600/1251]  eta: 0:02:16  lr: 0.003942  min_lr: 0.003942  loss: 4.1400 (3.5623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7457 (0.7247)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [41]  [ 800/1251]  eta: 0:01:34  lr: 0.003941  min_lr: 0.003941  loss: 3.0212 (3.5556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9071 (0.7399)  time: 0.2077  data: 0.0004  max mem: 13273
Epoch: [41]  [1000/1251]  eta: 0:00:52  lr: 0.003940  min_lr: 0.003940  loss: 2.8678 (3.5552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6923 (0.7286)  time: 0.2070  data: 0.0004  max mem: 13273
Epoch: [41]  [1200/1251]  eta: 0:00:10  lr: 0.003940  min_lr: 0.003940  loss: 3.2901 (3.5464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6831 (0.7229)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 3.0940 (3.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6528 (0.7224)  time: 0.1708  data: 0.0010  max mem: 13273
Epoch: [41] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 3.0940 (3.5589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6528 (0.7224)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.9104 (0.9104)  acc1: 82.4000 (82.4000)  acc5: 93.2000 (93.2000)  time: 5.6914  data: 5.5838  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0799 (1.0974)  acc1: 76.8000 (75.8182)  acc5: 94.0000 (93.6727)  time: 0.6764  data: 0.5799  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4441 (1.3674)  acc1: 65.2000 (70.3810)  acc5: 88.8000 (89.9619)  time: 0.1710  data: 0.0780  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5672 (1.3751)  acc1: 65.2000 (70.0640)  acc5: 86.4000 (89.8720)  time: 0.2040  data: 0.1118  max mem: 13273
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 70.134 Acc@5 89.930 loss 1.374
Accuracy of the model on the 50000 test images: 70.1%
Max accuracy: 70.13%
Epoch: [42]  [   0/1251]  eta: 1:06:00  lr: 0.003939  min_lr: 0.003939  loss: 4.7534 (4.7534)  weight_decay: 0.0500 (0.0500)  time: 3.1660  data: 2.9435  max mem: 13273
Epoch: [42]  [ 200/1251]  eta: 0:03:51  lr: 0.003939  min_lr: 0.003939  loss: 3.2035 (3.4746)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [42]  [ 400/1251]  eta: 0:03:01  lr: 0.003938  min_lr: 0.003938  loss: 3.2545 (3.4551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6995 (nan)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [42]  [ 600/1251]  eta: 0:02:17  lr: 0.003937  min_lr: 0.003937  loss: 3.1764 (3.4672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6635 (nan)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [42]  [ 800/1251]  eta: 0:01:34  lr: 0.003936  min_lr: 0.003936  loss: 3.5150 (3.4715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7005 (nan)  time: 0.2035  data: 0.0005  max mem: 13273
Epoch: [42]  [1000/1251]  eta: 0:00:52  lr: 0.003935  min_lr: 0.003935  loss: 3.3440 (3.4973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5928 (nan)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [42]  [1200/1251]  eta: 0:00:10  lr: 0.003934  min_lr: 0.003934  loss: 3.0654 (3.5057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8141 (nan)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 2.9235 (3.4997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6218 (nan)  time: 0.1705  data: 0.0006  max mem: 13273
Epoch: [42] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 2.9235 (3.5572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6218 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9350 (0.9350)  acc1: 82.8000 (82.8000)  acc5: 95.2000 (95.2000)  time: 5.6737  data: 5.5676  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0492 (1.0958)  acc1: 74.8000 (75.9273)  acc5: 94.4000 (94.1818)  time: 0.7447  data: 0.6497  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3448 (1.3712)  acc1: 65.6000 (70.2095)  acc5: 87.6000 (89.7524)  time: 0.1925  data: 0.1004  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5304 (1.3756)  acc1: 65.2000 (69.9360)  acc5: 86.8000 (89.8400)  time: 0.1980  data: 0.1066  max mem: 13273
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 69.980 Acc@5 89.952 loss 1.376
Accuracy of the model on the 50000 test images: 70.0%
Max accuracy: 70.13%
Epoch: [43]  [   0/1251]  eta: 1:02:56  lr: 0.003934  min_lr: 0.003934  loss: 2.5779 (2.5779)  weight_decay: 0.0500 (0.0500)  time: 3.0185  data: 2.7731  max mem: 13273
Epoch: [43]  [ 200/1251]  eta: 0:03:52  lr: 0.003933  min_lr: 0.003933  loss: 3.7738 (3.4867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7088 (0.7217)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [43]  [ 400/1251]  eta: 0:03:01  lr: 0.003932  min_lr: 0.003932  loss: 2.9439 (3.5301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7135 (0.7251)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [43]  [ 600/1251]  eta: 0:02:17  lr: 0.003931  min_lr: 0.003931  loss: 4.1325 (3.5374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6919 (0.7193)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [43]  [ 800/1251]  eta: 0:01:35  lr: 0.003930  min_lr: 0.003930  loss: 3.2211 (3.5593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7177)  time: 0.2156  data: 0.0005  max mem: 13273
Epoch: [43]  [1000/1251]  eta: 0:00:52  lr: 0.003929  min_lr: 0.003929  loss: 3.1766 (3.5721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7387 (0.7323)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [43]  [1200/1251]  eta: 0:00:10  lr: 0.003928  min_lr: 0.003928  loss: 3.1153 (3.5641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6916 (0.7307)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 4.1477 (3.5651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7290)  time: 0.1703  data: 0.0006  max mem: 13273
Epoch: [43] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 4.1477 (3.5321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7290)
Test:  [ 0/25]  eta: 0:02:26  loss: 1.0991 (1.0991)  acc1: 81.6000 (81.6000)  acc5: 94.4000 (94.4000)  time: 5.8731  data: 5.7664  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.2112 (1.2055)  acc1: 78.0000 (75.8545)  acc5: 94.4000 (93.8909)  time: 0.7131  data: 0.6157  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5074 (1.4750)  acc1: 67.6000 (70.0762)  acc5: 90.0000 (89.9619)  time: 0.1786  data: 0.0852  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.6792 (1.4878)  acc1: 65.2000 (69.7120)  acc5: 87.2000 (89.7280)  time: 0.1851  data: 0.0925  max mem: 13273
Test: Total time: 0:00:10 (0.4032 s / it)
* Acc@1 69.926 Acc@5 89.834 loss 1.484
Accuracy of the model on the 50000 test images: 69.9%
Max accuracy: 70.13%
Epoch: [44]  [   0/1251]  eta: 1:08:29  lr: 0.003928  min_lr: 0.003928  loss: 4.4585 (4.4585)  weight_decay: 0.0500 (0.0500)  time: 3.2849  data: 2.8017  max mem: 13273
Epoch: [44]  [ 200/1251]  eta: 0:03:52  lr: 0.003927  min_lr: 0.003927  loss: 3.1826 (3.6476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7083 (0.7097)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [44]  [ 400/1251]  eta: 0:03:01  lr: 0.003926  min_lr: 0.003926  loss: 3.7029 (3.5892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6934 (0.6958)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [44]  [ 600/1251]  eta: 0:02:17  lr: 0.003925  min_lr: 0.003925  loss: 3.7064 (3.5926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7256)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [44]  [ 800/1251]  eta: 0:01:34  lr: 0.003924  min_lr: 0.003924  loss: 3.1171 (3.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6386 (0.7158)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [44]  [1000/1251]  eta: 0:00:52  lr: 0.003923  min_lr: 0.003923  loss: 3.3114 (3.5545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7462 (0.7204)  time: 0.2037  data: 0.0004  max mem: 13273
Epoch: [44]  [1200/1251]  eta: 0:00:10  lr: 0.003922  min_lr: 0.003922  loss: 3.4773 (3.5598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (0.7121)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 3.0562 (3.5578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (0.7126)  time: 0.1706  data: 0.0011  max mem: 13273
Epoch: [44] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 3.0562 (3.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (0.7126)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.9113 (0.9113)  acc1: 81.6000 (81.6000)  acc5: 95.2000 (95.2000)  time: 5.6877  data: 5.5801  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9872 (1.0505)  acc1: 76.4000 (76.6182)  acc5: 94.8000 (94.0727)  time: 0.7320  data: 0.6374  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3098 (1.3221)  acc1: 68.8000 (71.5429)  acc5: 89.2000 (90.3429)  time: 0.2004  data: 0.1084  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5470 (1.3360)  acc1: 67.6000 (70.9280)  acc5: 88.4000 (90.2720)  time: 0.1998  data: 0.1083  max mem: 13273
Test: Total time: 0:00:10 (0.4072 s / it)
* Acc@1 71.036 Acc@5 90.466 loss 1.326
Accuracy of the model on the 50000 test images: 71.0%
Max accuracy: 71.04%
Epoch: [45]  [   0/1251]  eta: 1:01:01  lr: 0.003922  min_lr: 0.003922  loss: 4.6941 (4.6941)  weight_decay: 0.0500 (0.0500)  time: 2.9267  data: 2.6910  max mem: 13273
Epoch: [45]  [ 200/1251]  eta: 0:03:50  lr: 0.003921  min_lr: 0.003921  loss: 3.7800 (3.6292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6123 (0.7084)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [45]  [ 400/1251]  eta: 0:03:00  lr: 0.003920  min_lr: 0.003920  loss: 3.2415 (3.6457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6190 (0.7250)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [45]  [ 600/1251]  eta: 0:02:16  lr: 0.003919  min_lr: 0.003919  loss: 3.0715 (3.6074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6092 (0.7101)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [45]  [ 800/1251]  eta: 0:01:34  lr: 0.003918  min_lr: 0.003918  loss: 3.0219 (3.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7344 (0.7184)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [45]  [1000/1251]  eta: 0:00:52  lr: 0.003917  min_lr: 0.003917  loss: 3.2730 (3.5751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6900 (0.7178)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [45]  [1200/1251]  eta: 0:00:10  lr: 0.003916  min_lr: 0.003916  loss: 2.8039 (3.5575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6844 (0.7240)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.8625 (3.5628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.7247)  time: 0.1714  data: 0.0005  max mem: 13273
Epoch: [45] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.8625 (3.5285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.7247)
Test:  [ 0/25]  eta: 0:02:00  loss: 1.0457 (1.0457)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 4.8013  data: 4.6895  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0925 (1.1771)  acc1: 78.8000 (75.6364)  acc5: 94.8000 (94.1455)  time: 0.7276  data: 0.6321  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.5054 (1.4166)  acc1: 67.6000 (70.3810)  acc5: 88.4000 (90.4381)  time: 0.2376  data: 0.1455  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5426 (1.4112)  acc1: 67.2000 (70.5280)  acc5: 88.0000 (90.4640)  time: 0.1958  data: 0.1050  max mem: 13273
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 70.562 Acc@5 90.334 loss 1.413
Accuracy of the model on the 50000 test images: 70.6%
Max accuracy: 71.04%
Epoch: [46]  [   0/1251]  eta: 1:03:45  lr: 0.003916  min_lr: 0.003916  loss: 2.4727 (2.4727)  weight_decay: 0.0500 (0.0500)  time: 3.0576  data: 1.5066  max mem: 13273
Epoch: [46]  [ 200/1251]  eta: 0:03:52  lr: 0.003914  min_lr: 0.003914  loss: 3.9900 (3.5546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6555 (0.6979)  time: 0.2046  data: 0.0003  max mem: 13273
Epoch: [46]  [ 400/1251]  eta: 0:03:01  lr: 0.003913  min_lr: 0.003913  loss: 3.3092 (3.5545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6659 (0.7060)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [46]  [ 600/1251]  eta: 0:02:17  lr: 0.003912  min_lr: 0.003912  loss: 2.9823 (3.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6620 (0.6894)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [46]  [ 800/1251]  eta: 0:01:34  lr: 0.003911  min_lr: 0.003911  loss: 3.5761 (3.5519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6467 (0.7161)  time: 0.2085  data: 0.0007  max mem: 13273
Epoch: [46]  [1000/1251]  eta: 0:00:52  lr: 0.003910  min_lr: 0.003910  loss: 3.3653 (3.5465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6907 (0.7180)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [46]  [1200/1251]  eta: 0:00:10  lr: 0.003909  min_lr: 0.003909  loss: 3.2002 (3.5647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.7187)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 2.9817 (3.5591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6114 (0.7138)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [46] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 2.9817 (3.5526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6114 (0.7138)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.9204 (0.9204)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 5.8074  data: 5.7003  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1332 (1.1235)  acc1: 76.4000 (75.7091)  acc5: 94.8000 (93.8909)  time: 0.7595  data: 0.6649  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4233 (1.3568)  acc1: 67.2000 (70.4762)  acc5: 88.8000 (90.2857)  time: 0.2091  data: 0.1173  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4865 (1.3610)  acc1: 65.6000 (70.3840)  acc5: 87.6000 (90.1440)  time: 0.2079  data: 0.1172  max mem: 13273
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 70.734 Acc@5 90.462 loss 1.354
Accuracy of the model on the 50000 test images: 70.7%
Max accuracy: 71.04%
Epoch: [47]  [   0/1251]  eta: 1:07:17  lr: 0.003909  min_lr: 0.003909  loss: 2.3605 (2.3605)  weight_decay: 0.0500 (0.0500)  time: 3.2278  data: 2.5716  max mem: 13273
Epoch: [47]  [ 200/1251]  eta: 0:03:55  lr: 0.003908  min_lr: 0.003908  loss: 4.2480 (3.4477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7237 (0.7297)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [47]  [ 400/1251]  eta: 0:03:03  lr: 0.003907  min_lr: 0.003907  loss: 2.8960 (3.5104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7173 (0.7073)  time: 0.2064  data: 0.0006  max mem: 13273
Epoch: [47]  [ 600/1251]  eta: 0:02:18  lr: 0.003906  min_lr: 0.003906  loss: 2.7831 (3.5137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.7093)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [47]  [ 800/1251]  eta: 0:01:35  lr: 0.003905  min_lr: 0.003905  loss: 3.9008 (3.5107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6544 (0.7086)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [47]  [1000/1251]  eta: 0:00:52  lr: 0.003904  min_lr: 0.003904  loss: 4.0334 (3.5260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (0.7071)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [47]  [1200/1251]  eta: 0:00:10  lr: 0.003902  min_lr: 0.003902  loss: 4.0931 (3.5228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6396 (0.7038)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 3.2356 (3.5240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6587 (0.7033)  time: 0.1704  data: 0.0011  max mem: 13273
Epoch: [47] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 3.2356 (3.5174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6587 (0.7033)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9060 (0.9060)  acc1: 84.0000 (84.0000)  acc5: 96.4000 (96.4000)  time: 5.6569  data: 5.5507  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0892 (1.1192)  acc1: 76.4000 (75.9636)  acc5: 94.8000 (94.3636)  time: 0.6986  data: 0.6011  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4310 (1.3904)  acc1: 67.2000 (70.4191)  acc5: 89.6000 (90.5714)  time: 0.1901  data: 0.0965  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5227 (1.3979)  acc1: 66.4000 (70.1280)  acc5: 87.6000 (90.4160)  time: 0.1889  data: 0.0965  max mem: 13273
Test: Total time: 0:00:09 (0.3964 s / it)
* Acc@1 70.694 Acc@5 90.406 loss 1.393
Accuracy of the model on the 50000 test images: 70.7%
Max accuracy: 71.04%
Epoch: [48]  [   0/1251]  eta: 1:12:23  lr: 0.003902  min_lr: 0.003902  loss: 4.2911 (4.2911)  weight_decay: 0.0500 (0.0500)  time: 3.4717  data: 2.3947  max mem: 13273
Epoch: [48]  [ 200/1251]  eta: 0:03:53  lr: 0.003901  min_lr: 0.003901  loss: 3.6655 (3.4952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.7065)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [48]  [ 400/1251]  eta: 0:03:01  lr: 0.003900  min_lr: 0.003900  loss: 3.9492 (3.5380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.7336)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [48]  [ 600/1251]  eta: 0:02:16  lr: 0.003899  min_lr: 0.003899  loss: 3.9974 (3.5477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7137 (0.7170)  time: 0.2037  data: 0.0004  max mem: 13273
Epoch: [48]  [ 800/1251]  eta: 0:01:34  lr: 0.003898  min_lr: 0.003898  loss: 3.8946 (3.5446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6187 (0.7227)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [48]  [1000/1251]  eta: 0:00:52  lr: 0.003897  min_lr: 0.003897  loss: 3.3170 (3.5307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6592 (0.7142)  time: 0.2080  data: 0.0006  max mem: 13273
Epoch: [48]  [1200/1251]  eta: 0:00:10  lr: 0.003895  min_lr: 0.003895  loss: 4.1319 (3.5411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7509 (0.7170)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 2.9246 (3.5401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6970 (0.7161)  time: 0.1709  data: 0.0007  max mem: 13273
Epoch: [48] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 2.9246 (3.5039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6970 (0.7161)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8850 (0.8850)  acc1: 82.0000 (82.0000)  acc5: 94.8000 (94.8000)  time: 5.6796  data: 5.5744  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9826 (1.1230)  acc1: 78.8000 (76.5818)  acc5: 95.2000 (93.9636)  time: 0.7625  data: 0.6678  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4636 (1.3835)  acc1: 66.8000 (71.6762)  acc5: 89.2000 (90.4952)  time: 0.2032  data: 0.1107  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5731 (1.3900)  acc1: 66.8000 (71.3280)  acc5: 88.4000 (90.4320)  time: 0.2026  data: 0.1106  max mem: 13273
Test: Total time: 0:00:10 (0.4084 s / it)
* Acc@1 70.930 Acc@5 90.554 loss 1.388
Accuracy of the model on the 50000 test images: 70.9%
Max accuracy: 71.04%
Epoch: [49]  [   0/1251]  eta: 1:05:13  lr: 0.003895  min_lr: 0.003895  loss: 3.5769 (3.5769)  weight_decay: 0.0500 (0.0500)  time: 3.1279  data: 2.0986  max mem: 13273
Epoch: [49]  [ 200/1251]  eta: 0:03:53  lr: 0.003894  min_lr: 0.003894  loss: 2.9314 (3.4561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6056 (0.6595)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [49]  [ 400/1251]  eta: 0:03:01  lr: 0.003893  min_lr: 0.003893  loss: 3.4869 (3.4781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6230 (0.6481)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [49]  [ 600/1251]  eta: 0:02:17  lr: 0.003892  min_lr: 0.003892  loss: 3.2147 (3.4765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6988 (0.6764)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [49]  [ 800/1251]  eta: 0:01:34  lr: 0.003890  min_lr: 0.003890  loss: 3.4045 (3.4848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6742 (0.6780)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [49]  [1000/1251]  eta: 0:00:52  lr: 0.003889  min_lr: 0.003889  loss: 3.0891 (3.4931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6316 (0.6778)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [49]  [1200/1251]  eta: 0:00:10  lr: 0.003888  min_lr: 0.003888  loss: 3.8338 (3.4948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6205 (0.6791)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 2.9537 (3.4941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6105 (0.6750)  time: 0.1704  data: 0.0008  max mem: 13273
Epoch: [49] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 2.9537 (3.4855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6105 (0.6750)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8910 (0.8910)  acc1: 82.8000 (82.8000)  acc5: 96.8000 (96.8000)  time: 5.7078  data: 5.5989  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0682 (1.1071)  acc1: 78.0000 (76.9091)  acc5: 95.6000 (94.6909)  time: 0.7441  data: 0.6454  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3586 (1.3547)  acc1: 68.4000 (71.4476)  acc5: 91.2000 (90.9524)  time: 0.2043  data: 0.1103  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4750 (1.3690)  acc1: 66.8000 (71.1840)  acc5: 88.8000 (90.6400)  time: 0.2029  data: 0.1102  max mem: 13273
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 71.246 Acc@5 90.638 loss 1.375
Accuracy of the model on the 50000 test images: 71.2%
Max accuracy: 71.25%
Epoch: [50]  [   0/1251]  eta: 0:57:53  lr: 0.003888  min_lr: 0.003888  loss: 2.7199 (2.7199)  weight_decay: 0.0500 (0.0500)  time: 2.7764  data: 1.9381  max mem: 13273
Epoch: [50]  [ 200/1251]  eta: 0:03:49  lr: 0.003887  min_lr: 0.003887  loss: 3.2998 (3.5301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.7633)  time: 0.2038  data: 0.0004  max mem: 13273
Epoch: [50]  [ 400/1251]  eta: 0:03:00  lr: 0.003885  min_lr: 0.003885  loss: 3.4476 (3.5375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7644 (0.7399)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [50]  [ 600/1251]  eta: 0:02:16  lr: 0.003884  min_lr: 0.003884  loss: 4.1109 (3.5446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7898 (0.7410)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [50]  [ 800/1251]  eta: 0:01:34  lr: 0.003883  min_lr: 0.003883  loss: 3.4976 (3.5283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6234 (0.7156)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [50]  [1000/1251]  eta: 0:00:52  lr: 0.003882  min_lr: 0.003882  loss: 2.9540 (3.5175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6312 (0.7085)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [50]  [1200/1251]  eta: 0:00:10  lr: 0.003881  min_lr: 0.003881  loss: 2.8005 (3.5161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7577 (0.7098)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 3.2333 (3.5122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7556 (0.7100)  time: 0.1700  data: 0.0007  max mem: 13273
Epoch: [50] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 3.2333 (3.4982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7556 (0.7100)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.9456 (0.9456)  acc1: 83.2000 (83.2000)  acc5: 94.0000 (94.0000)  time: 5.8300  data: 5.7244  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1682 (1.1232)  acc1: 76.8000 (76.5455)  acc5: 94.0000 (93.8909)  time: 0.7010  data: 0.6055  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3814 (1.3653)  acc1: 68.8000 (71.3333)  acc5: 90.4000 (90.4762)  time: 0.1773  data: 0.0848  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5411 (1.3765)  acc1: 68.4000 (70.9280)  acc5: 88.4000 (90.3360)  time: 0.1815  data: 0.0889  max mem: 13273
Test: Total time: 0:00:09 (0.3975 s / it)
* Acc@1 71.072 Acc@5 90.546 loss 1.367
Accuracy of the model on the 50000 test images: 71.1%
Max accuracy: 71.25%
Epoch: [51]  [   0/1251]  eta: 1:05:24  lr: 0.003880  min_lr: 0.003880  loss: 3.7987 (3.7987)  weight_decay: 0.0500 (0.0500)  time: 3.1369  data: 1.5976  max mem: 13273
Epoch: [51]  [ 200/1251]  eta: 0:03:53  lr: 0.003879  min_lr: 0.003879  loss: 3.9376 (3.4285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6916 (0.7404)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [51]  [ 400/1251]  eta: 0:03:02  lr: 0.003878  min_lr: 0.003878  loss: 2.9752 (3.4464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7376 (0.7395)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [51]  [ 600/1251]  eta: 0:02:17  lr: 0.003877  min_lr: 0.003877  loss: 3.8683 (3.4837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6773 (0.7261)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [51]  [ 800/1251]  eta: 0:01:34  lr: 0.003875  min_lr: 0.003875  loss: 2.8878 (3.4776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5681 (0.7109)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [51]  [1000/1251]  eta: 0:00:52  lr: 0.003874  min_lr: 0.003874  loss: 2.8730 (3.4534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6999 (0.7080)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [51]  [1200/1251]  eta: 0:00:10  lr: 0.003873  min_lr: 0.003873  loss: 3.3048 (3.4567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7583 (0.7091)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 3.4243 (3.4553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6490 (0.7092)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [51] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 3.4243 (3.4952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6490 (0.7092)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.9503 (0.9503)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 5.4321  data: 5.3267  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1471 (1.1665)  acc1: 77.6000 (76.4000)  acc5: 95.2000 (94.1818)  time: 0.7157  data: 0.6201  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4717 (1.4168)  acc1: 68.0000 (71.2000)  acc5: 87.6000 (90.4762)  time: 0.2021  data: 0.1097  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5738 (1.4317)  acc1: 68.0000 (70.7360)  acc5: 87.6000 (90.3360)  time: 0.2008  data: 0.1096  max mem: 13273
Test: Total time: 0:00:09 (0.3978 s / it)
* Acc@1 71.048 Acc@5 90.468 loss 1.428
Accuracy of the model on the 50000 test images: 71.0%
Max accuracy: 71.25%
Epoch: [52]  [   0/1251]  eta: 1:07:47  lr: 0.003873  min_lr: 0.003873  loss: 4.9519 (4.9519)  weight_decay: 0.0500 (0.0500)  time: 3.2515  data: 2.5782  max mem: 13273
Epoch: [52]  [ 200/1251]  eta: 0:03:53  lr: 0.003871  min_lr: 0.003871  loss: 3.2445 (3.5388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7758 (0.7204)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [52]  [ 400/1251]  eta: 0:03:01  lr: 0.003870  min_lr: 0.003870  loss: 3.1313 (3.5156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7081 (0.7160)  time: 0.2044  data: 0.0006  max mem: 13273
Epoch: [52]  [ 600/1251]  eta: 0:02:17  lr: 0.003869  min_lr: 0.003869  loss: 2.8239 (3.4790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6011 (0.7116)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [52]  [ 800/1251]  eta: 0:01:34  lr: 0.003867  min_lr: 0.003867  loss: 2.9996 (3.4577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.6946)  time: 0.2052  data: 0.0007  max mem: 13273
Epoch: [52]  [1000/1251]  eta: 0:00:52  lr: 0.003866  min_lr: 0.003866  loss: 3.6483 (3.4706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5656 (0.6867)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [52]  [1200/1251]  eta: 0:00:10  lr: 0.003865  min_lr: 0.003865  loss: 2.9483 (3.4802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6215 (0.6823)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 3.0644 (3.4736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5816 (0.6798)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [52] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 3.0644 (3.4868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5816 (0.6798)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.8082 (0.8082)  acc1: 84.8000 (84.8000)  acc5: 96.8000 (96.8000)  time: 5.7735  data: 5.6681  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9275 (1.0076)  acc1: 78.4000 (78.2545)  acc5: 95.6000 (94.5455)  time: 0.7381  data: 0.6390  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3405 (1.3196)  acc1: 70.0000 (72.1524)  acc5: 90.4000 (90.6476)  time: 0.1891  data: 0.0928  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5047 (1.3193)  acc1: 68.4000 (71.8560)  acc5: 88.0000 (90.5120)  time: 0.1880  data: 0.0927  max mem: 13273
Test: Total time: 0:00:09 (0.3996 s / it)
* Acc@1 71.920 Acc@5 90.780 loss 1.310
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.92%
Epoch: [53]  [   0/1251]  eta: 1:07:15  lr: 0.003865  min_lr: 0.003865  loss: 3.8460 (3.8460)  weight_decay: 0.0500 (0.0500)  time: 3.2259  data: 2.9923  max mem: 13273
Epoch: [53]  [ 200/1251]  eta: 0:03:51  lr: 0.003863  min_lr: 0.003863  loss: 3.2333 (3.4302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7324 (0.6987)  time: 0.2072  data: 0.0004  max mem: 13273
Epoch: [53]  [ 400/1251]  eta: 0:03:01  lr: 0.003862  min_lr: 0.003862  loss: 3.7339 (3.4424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.7188)  time: 0.2050  data: 0.0012  max mem: 13273
Epoch: [53]  [ 600/1251]  eta: 0:02:16  lr: 0.003861  min_lr: 0.003861  loss: 2.9899 (3.4340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6535 (0.7143)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [53]  [ 800/1251]  eta: 0:01:34  lr: 0.003859  min_lr: 0.003859  loss: 3.1497 (3.4530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6360 (0.7126)  time: 0.2038  data: 0.0004  max mem: 13273
Epoch: [53]  [1000/1251]  eta: 0:00:52  lr: 0.003858  min_lr: 0.003858  loss: 2.9137 (3.4373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7197 (0.7185)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [53]  [1200/1251]  eta: 0:00:10  lr: 0.003857  min_lr: 0.003857  loss: 3.0442 (3.4521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5689 (0.7145)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 3.1112 (3.4491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5586 (0.7114)  time: 0.1700  data: 0.0006  max mem: 13273
Epoch: [53] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 3.1112 (3.4593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5586 (0.7114)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.9494 (0.9494)  acc1: 83.2000 (83.2000)  acc5: 94.8000 (94.8000)  time: 5.7082  data: 5.5913  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0616 (1.1056)  acc1: 78.4000 (77.7818)  acc5: 94.8000 (94.0000)  time: 0.7315  data: 0.6345  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4335 (1.3578)  acc1: 69.6000 (72.6476)  acc5: 89.2000 (90.4381)  time: 0.2023  data: 0.1097  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5116 (1.3659)  acc1: 68.8000 (72.0960)  acc5: 88.4000 (90.2880)  time: 0.2012  data: 0.1098  max mem: 13273
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 71.468 Acc@5 90.726 loss 1.373
Accuracy of the model on the 50000 test images: 71.5%
Max accuracy: 71.92%
Epoch: [54]  [   0/1251]  eta: 1:08:15  lr: 0.003856  min_lr: 0.003856  loss: 3.2852 (3.2852)  weight_decay: 0.0500 (0.0500)  time: 3.2742  data: 2.7305  max mem: 13273
Epoch: [54]  [ 200/1251]  eta: 0:03:53  lr: 0.003855  min_lr: 0.003855  loss: 3.7537 (3.3998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5946 (0.6639)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [54]  [ 400/1251]  eta: 0:03:01  lr: 0.003854  min_lr: 0.003854  loss: 2.9016 (3.3757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6912 (0.6706)  time: 0.2048  data: 0.0007  max mem: 13273
Epoch: [54]  [ 600/1251]  eta: 0:02:17  lr: 0.003852  min_lr: 0.003852  loss: 2.8823 (3.4196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6073 (0.6558)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [54]  [ 800/1251]  eta: 0:01:34  lr: 0.003851  min_lr: 0.003851  loss: 3.5422 (3.4393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7815 (0.6769)  time: 0.2081  data: 0.0006  max mem: 13273
Epoch: [54]  [1000/1251]  eta: 0:00:52  lr: 0.003849  min_lr: 0.003849  loss: 4.2339 (3.4702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6921 (0.6908)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [54]  [1200/1251]  eta: 0:00:10  lr: 0.003848  min_lr: 0.003848  loss: 3.2463 (3.4777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (0.6957)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 3.7626 (3.4782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6186 (0.6950)  time: 0.1711  data: 0.0011  max mem: 13273
Epoch: [54] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 3.7626 (3.4633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6186 (0.6950)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9619 (0.9619)  acc1: 80.8000 (80.8000)  acc5: 95.6000 (95.6000)  time: 5.6293  data: 5.4878  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1373 (1.1501)  acc1: 78.4000 (75.9636)  acc5: 95.2000 (94.0364)  time: 0.7465  data: 0.6434  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4492 (1.3885)  acc1: 67.6000 (71.0667)  acc5: 89.6000 (90.8762)  time: 0.2010  data: 0.1054  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5637 (1.3923)  acc1: 67.6000 (71.0400)  acc5: 88.4000 (90.8320)  time: 0.2038  data: 0.1100  max mem: 13273
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 71.366 Acc@5 90.912 loss 1.388
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.92%
Epoch: [55]  [   0/1251]  eta: 1:05:37  lr: 0.003848  min_lr: 0.003848  loss: 2.7091 (2.7091)  weight_decay: 0.0500 (0.0500)  time: 3.1471  data: 1.7548  max mem: 13273
Epoch: [55]  [ 200/1251]  eta: 0:03:52  lr: 0.003846  min_lr: 0.003846  loss: 2.9764 (3.4397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5284 (0.6143)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [55]  [ 400/1251]  eta: 0:03:01  lr: 0.003845  min_lr: 0.003845  loss: 3.5972 (3.3825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6001 (0.6691)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [55]  [ 600/1251]  eta: 0:02:17  lr: 0.003844  min_lr: 0.003844  loss: 3.8595 (3.4194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7029 (0.6929)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [55]  [ 800/1251]  eta: 0:01:34  lr: 0.003842  min_lr: 0.003842  loss: 3.7730 (3.4138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6921 (0.6940)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [55]  [1000/1251]  eta: 0:00:52  lr: 0.003841  min_lr: 0.003841  loss: 3.8108 (3.4264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.7059)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [55]  [1200/1251]  eta: 0:00:10  lr: 0.003839  min_lr: 0.003839  loss: 3.7862 (3.4323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6069 (0.6952)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 2.8814 (3.4273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7410 (0.7012)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [55] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 2.8814 (3.4544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7410 (0.7012)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9396 (0.9396)  acc1: 82.8000 (82.8000)  acc5: 95.2000 (95.2000)  time: 5.4921  data: 5.3837  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1101 (1.0843)  acc1: 78.4000 (78.1455)  acc5: 95.2000 (94.3273)  time: 0.7544  data: 0.6577  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4111 (1.3356)  acc1: 68.8000 (72.5905)  acc5: 90.0000 (90.8762)  time: 0.2131  data: 0.1199  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4619 (1.3370)  acc1: 68.0000 (72.1280)  acc5: 88.8000 (90.8320)  time: 0.2196  data: 0.1266  max mem: 13273
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 71.670 Acc@5 90.802 loss 1.338
Accuracy of the model on the 50000 test images: 71.7%
Max accuracy: 71.92%
Epoch: [56]  [   0/1251]  eta: 1:11:10  lr: 0.003839  min_lr: 0.003839  loss: 2.8926 (2.8926)  weight_decay: 0.0500 (0.0500)  time: 3.4133  data: 1.5421  max mem: 13273
Epoch: [56]  [ 200/1251]  eta: 0:03:53  lr: 0.003838  min_lr: 0.003838  loss: 3.4551 (3.4476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6076 (0.6721)  time: 0.2045  data: 0.0003  max mem: 13273
Epoch: [56]  [ 400/1251]  eta: 0:03:01  lr: 0.003836  min_lr: 0.003836  loss: 2.7893 (3.4477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6118 (0.6760)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [56]  [ 600/1251]  eta: 0:02:17  lr: 0.003835  min_lr: 0.003835  loss: 3.7177 (3.4227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7698 (0.6952)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [56]  [ 800/1251]  eta: 0:01:34  lr: 0.003833  min_lr: 0.003833  loss: 3.8483 (3.4582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6926 (0.7006)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [56]  [1000/1251]  eta: 0:00:52  lr: 0.003832  min_lr: 0.003832  loss: 3.4572 (3.4744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6638 (0.6958)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [56]  [1200/1251]  eta: 0:00:10  lr: 0.003831  min_lr: 0.003831  loss: 3.0927 (3.4835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6186 (0.6888)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 2.8864 (3.4735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.6874)  time: 0.1703  data: 0.0010  max mem: 13273
Epoch: [56] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 2.8864 (3.4565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.6874)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8526 (0.8526)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.6000  data: 5.4666  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9952 (1.0210)  acc1: 78.4000 (77.5636)  acc5: 95.2000 (94.4000)  time: 0.7167  data: 0.6123  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3238 (1.2440)  acc1: 69.2000 (72.1714)  acc5: 89.2000 (90.9905)  time: 0.1910  data: 0.0950  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3363 (1.2520)  acc1: 69.2000 (72.0160)  acc5: 89.2000 (91.0080)  time: 0.1884  data: 0.0949  max mem: 13273
Test: Total time: 0:00:09 (0.3944 s / it)
* Acc@1 72.156 Acc@5 91.198 loss 1.248
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.16%
Epoch: [57]  [   0/1251]  eta: 0:59:54  lr: 0.003830  min_lr: 0.003830  loss: 2.5991 (2.5991)  weight_decay: 0.0500 (0.0500)  time: 2.8733  data: 2.6413  max mem: 13273
Epoch: [57]  [ 200/1251]  eta: 0:03:54  lr: 0.003829  min_lr: 0.003829  loss: 3.8342 (3.4556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7486 (0.7063)  time: 0.2140  data: 0.0005  max mem: 13273
Epoch: [57]  [ 400/1251]  eta: 0:03:02  lr: 0.003827  min_lr: 0.003827  loss: 3.8305 (3.4527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6092 (0.7142)  time: 0.2174  data: 0.0004  max mem: 13273
Epoch: [57]  [ 600/1251]  eta: 0:02:17  lr: 0.003826  min_lr: 0.003826  loss: 3.1230 (3.4529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6471 (0.7105)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [57]  [ 800/1251]  eta: 0:01:34  lr: 0.003824  min_lr: 0.003824  loss: 2.7654 (3.4448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6819 (0.7135)  time: 0.2073  data: 0.0004  max mem: 13273
Epoch: [57]  [1000/1251]  eta: 0:00:52  lr: 0.003823  min_lr: 0.003823  loss: 3.9717 (3.4525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (0.7059)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [57]  [1200/1251]  eta: 0:00:10  lr: 0.003821  min_lr: 0.003821  loss: 3.1239 (3.4489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6625 (0.7041)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 2.8374 (3.4515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6974 (0.7052)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [57] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 2.8374 (3.4522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6974 (0.7052)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7856 (0.7856)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.7629  data: 5.6577  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9559 (1.0528)  acc1: 78.4000 (77.8545)  acc5: 96.0000 (94.9818)  time: 0.7674  data: 0.6719  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3773 (1.3127)  acc1: 68.0000 (72.1714)  acc5: 90.4000 (91.3524)  time: 0.2259  data: 0.1327  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4213 (1.3273)  acc1: 67.6000 (71.8560)  acc5: 89.6000 (91.2320)  time: 0.2253  data: 0.1326  max mem: 13273
Test: Total time: 0:00:10 (0.4292 s / it)
* Acc@1 72.136 Acc@5 91.038 loss 1.331
Accuracy of the model on the 50000 test images: 72.1%
Max accuracy: 72.16%
Epoch: [58]  [   0/1251]  eta: 1:10:20  lr: 0.003821  min_lr: 0.003821  loss: 2.4030 (2.4030)  weight_decay: 0.0500 (0.0500)  time: 3.3737  data: 2.5335  max mem: 13273
Epoch: [58]  [ 200/1251]  eta: 0:03:53  lr: 0.003820  min_lr: 0.003820  loss: 3.2075 (3.2937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5938 (0.6610)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [58]  [ 400/1251]  eta: 0:03:02  lr: 0.003818  min_lr: 0.003818  loss: 3.6202 (3.3994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7018 (0.6890)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [58]  [ 600/1251]  eta: 0:02:17  lr: 0.003817  min_lr: 0.003817  loss: 3.6552 (3.4159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6932 (0.6872)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [58]  [ 800/1251]  eta: 0:01:34  lr: 0.003815  min_lr: 0.003815  loss: 3.0336 (3.4315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (0.6847)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [58]  [1000/1251]  eta: 0:00:52  lr: 0.003813  min_lr: 0.003813  loss: 3.5739 (3.4418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6275 (0.6817)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [58]  [1200/1251]  eta: 0:00:10  lr: 0.003812  min_lr: 0.003812  loss: 3.1948 (3.4360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6805 (0.6890)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 3.7749 (3.4386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6574 (0.6874)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [58] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 3.7749 (3.4432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6574 (0.6874)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9047 (0.9047)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.7796  data: 5.6740  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0513 (1.1116)  acc1: 76.4000 (77.3818)  acc5: 95.6000 (94.7636)  time: 0.7636  data: 0.6674  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4010 (1.3421)  acc1: 68.4000 (72.4571)  acc5: 91.2000 (91.5238)  time: 0.1990  data: 0.1059  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5053 (1.3497)  acc1: 68.0000 (71.8400)  acc5: 90.0000 (91.4880)  time: 0.1986  data: 0.1059  max mem: 13273
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 72.018 Acc@5 91.160 loss 1.353
Accuracy of the model on the 50000 test images: 72.0%
Max accuracy: 72.16%
Epoch: [59]  [   0/1251]  eta: 1:06:59  lr: 0.003812  min_lr: 0.003812  loss: 3.3825 (3.3825)  weight_decay: 0.0500 (0.0500)  time: 3.2127  data: 1.7037  max mem: 13273
Epoch: [59]  [ 200/1251]  eta: 0:03:53  lr: 0.003810  min_lr: 0.003810  loss: 3.7011 (3.5247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (0.7217)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [59]  [ 400/1251]  eta: 0:03:01  lr: 0.003809  min_lr: 0.003809  loss: 3.6442 (3.4823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7056 (0.7229)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [59]  [ 600/1251]  eta: 0:02:17  lr: 0.003807  min_lr: 0.003807  loss: 3.2302 (3.4899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6516 (0.7198)  time: 0.2123  data: 0.0004  max mem: 13273
Epoch: [59]  [ 800/1251]  eta: 0:01:34  lr: 0.003805  min_lr: 0.003805  loss: 3.7249 (3.4601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6445 (0.7176)  time: 0.2114  data: 0.0005  max mem: 13273
Epoch: [59]  [1000/1251]  eta: 0:00:52  lr: 0.003804  min_lr: 0.003804  loss: 3.5290 (3.4666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6095 (0.7088)  time: 0.2081  data: 0.0006  max mem: 13273
Epoch: [59]  [1200/1251]  eta: 0:00:10  lr: 0.003802  min_lr: 0.003802  loss: 3.1332 (3.4554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6279 (0.7082)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 3.2566 (3.4560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6257 (0.7060)  time: 0.1706  data: 0.0010  max mem: 13273
Epoch: [59] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 3.2566 (3.4432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6257 (0.7060)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8188 (0.8188)  acc1: 84.4000 (84.4000)  acc5: 96.4000 (96.4000)  time: 5.5660  data: 5.4568  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9954 (1.0319)  acc1: 75.2000 (77.1273)  acc5: 96.0000 (94.6546)  time: 0.7422  data: 0.6477  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3271 (1.2886)  acc1: 69.2000 (72.1905)  acc5: 90.8000 (91.2571)  time: 0.2038  data: 0.1120  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4572 (1.3004)  acc1: 68.0000 (71.8240)  acc5: 88.4000 (91.1040)  time: 0.2028  data: 0.1119  max mem: 13273
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 72.136 Acc@5 91.226 loss 1.300
Accuracy of the model on the 50000 test images: 72.1%
Max accuracy: 72.16%
Epoch: [60]  [   0/1251]  eta: 1:07:51  lr: 0.003802  min_lr: 0.003802  loss: 4.9431 (4.9431)  weight_decay: 0.0500 (0.0500)  time: 3.2542  data: 1.6091  max mem: 13273
Epoch: [60]  [ 200/1251]  eta: 0:03:52  lr: 0.003800  min_lr: 0.003800  loss: 2.9437 (3.3436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.6949)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [60]  [ 400/1251]  eta: 0:03:01  lr: 0.003799  min_lr: 0.003799  loss: 3.0134 (3.3994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5445 (0.7040)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [60]  [ 600/1251]  eta: 0:02:17  lr: 0.003797  min_lr: 0.003797  loss: 2.8922 (3.4205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5903 (0.6887)  time: 0.2166  data: 0.0003  max mem: 13273
Epoch: [60]  [ 800/1251]  eta: 0:01:34  lr: 0.003796  min_lr: 0.003796  loss: 3.0046 (3.4146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6287 (inf)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [60]  [1000/1251]  eta: 0:00:52  lr: 0.003794  min_lr: 0.003794  loss: 2.8408 (3.4186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6790 (inf)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [60]  [1200/1251]  eta: 0:00:10  lr: 0.003793  min_lr: 0.003793  loss: 3.3053 (3.4256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7176 (inf)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 3.5736 (3.4295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7528 (inf)  time: 0.1719  data: 0.0007  max mem: 13273
Epoch: [60] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 3.5736 (3.4333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7528 (inf)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.9127 (0.9127)  acc1: 82.8000 (82.8000)  acc5: 97.2000 (97.2000)  time: 5.3380  data: 5.2308  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1127 (1.1625)  acc1: 78.8000 (76.2909)  acc5: 94.8000 (94.4727)  time: 0.7544  data: 0.6561  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.4502 (1.3964)  acc1: 67.2000 (71.5048)  acc5: 90.0000 (90.8381)  time: 0.2294  data: 0.1353  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5670 (1.4086)  acc1: 67.2000 (71.2160)  acc5: 88.8000 (90.6400)  time: 0.2127  data: 0.1203  max mem: 13273
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 71.354 Acc@5 90.902 loss 1.404
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 72.16%
Epoch: [61]  [   0/1251]  eta: 1:03:50  lr: 0.003792  min_lr: 0.003792  loss: 3.8686 (3.8686)  weight_decay: 0.0500 (0.0500)  time: 3.0621  data: 2.1806  max mem: 13273
Epoch: [61]  [ 200/1251]  eta: 0:03:53  lr: 0.003791  min_lr: 0.003791  loss: 2.9471 (3.3965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5881 (0.6662)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [61]  [ 400/1251]  eta: 0:03:02  lr: 0.003789  min_lr: 0.003789  loss: 3.8416 (3.4653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6255 (nan)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [61]  [ 600/1251]  eta: 0:02:17  lr: 0.003787  min_lr: 0.003787  loss: 3.3904 (3.4718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6059 (nan)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [61]  [ 800/1251]  eta: 0:01:34  lr: 0.003786  min_lr: 0.003786  loss: 2.9528 (3.4781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6880 (nan)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [61]  [1000/1251]  eta: 0:00:52  lr: 0.003784  min_lr: 0.003784  loss: 3.1518 (3.4848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6596 (nan)  time: 0.2072  data: 0.0003  max mem: 13273
Epoch: [61]  [1200/1251]  eta: 0:00:10  lr: 0.003782  min_lr: 0.003782  loss: 4.1164 (3.4724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8135 (nan)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 3.0158 (3.4672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7566 (nan)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [61] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 3.0158 (3.4435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7566 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9604 (0.9604)  acc1: 82.8000 (82.8000)  acc5: 96.8000 (96.8000)  time: 5.5915  data: 5.4847  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1539 (1.1427)  acc1: 76.8000 (77.1636)  acc5: 94.8000 (94.2909)  time: 0.7405  data: 0.6451  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3590 (1.3367)  acc1: 70.0000 (72.3619)  acc5: 91.2000 (90.9714)  time: 0.2016  data: 0.1092  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4458 (1.3443)  acc1: 69.2000 (72.0320)  acc5: 89.6000 (90.8480)  time: 0.2014  data: 0.1092  max mem: 13273
Test: Total time: 0:00:10 (0.4046 s / it)
* Acc@1 72.056 Acc@5 91.098 loss 1.339
Accuracy of the model on the 50000 test images: 72.1%
Max accuracy: 72.16%
Epoch: [62]  [   0/1251]  eta: 1:07:37  lr: 0.003782  min_lr: 0.003782  loss: 2.4614 (2.4614)  weight_decay: 0.0500 (0.0500)  time: 3.2436  data: 1.6747  max mem: 13273
Epoch: [62]  [ 200/1251]  eta: 0:03:53  lr: 0.003780  min_lr: 0.003780  loss: 3.4078 (3.4408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.6639)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [62]  [ 400/1251]  eta: 0:03:02  lr: 0.003779  min_lr: 0.003779  loss: 3.9200 (3.4541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.6933)  time: 0.2164  data: 0.0006  max mem: 13273
Epoch: [62]  [ 600/1251]  eta: 0:02:17  lr: 0.003777  min_lr: 0.003777  loss: 3.3849 (3.4498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6548 (0.6793)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [62]  [ 800/1251]  eta: 0:01:34  lr: 0.003775  min_lr: 0.003775  loss: 3.2275 (3.4219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7039 (0.6842)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [62]  [1000/1251]  eta: 0:00:52  lr: 0.003774  min_lr: 0.003774  loss: 3.1475 (3.4133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7166 (0.6982)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [62]  [1200/1251]  eta: 0:00:10  lr: 0.003772  min_lr: 0.003772  loss: 3.7937 (3.4295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5803 (0.7095)  time: 0.2048  data: 0.0006  max mem: 13273
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 3.2099 (3.4294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5451 (0.7062)  time: 0.1713  data: 0.0010  max mem: 13273
Epoch: [62] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 3.2099 (3.4203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5451 (0.7062)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8936 (0.8936)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 5.5295  data: 5.4187  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0477 (1.1122)  acc1: 81.6000 (77.2727)  acc5: 94.8000 (94.5455)  time: 0.6921  data: 0.5940  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3615 (1.3497)  acc1: 70.0000 (72.3619)  acc5: 91.2000 (91.2000)  time: 0.1861  data: 0.0909  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4281 (1.3679)  acc1: 70.0000 (71.9680)  acc5: 88.4000 (90.9600)  time: 0.2041  data: 0.1098  max mem: 13273
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 72.196 Acc@5 91.164 loss 1.364
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.20%
Epoch: [63]  [   0/1251]  eta: 1:03:32  lr: 0.003772  min_lr: 0.003772  loss: 4.2233 (4.2233)  weight_decay: 0.0500 (0.0500)  time: 3.0478  data: 2.8369  max mem: 13273
Epoch: [63]  [ 200/1251]  eta: 0:03:51  lr: 0.003770  min_lr: 0.003770  loss: 3.5015 (3.3659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5945 (0.6173)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [63]  [ 400/1251]  eta: 0:03:01  lr: 0.003768  min_lr: 0.003768  loss: 3.5112 (3.3802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6621 (0.6371)  time: 0.2044  data: 0.0006  max mem: 13273
Epoch: [63]  [ 600/1251]  eta: 0:02:17  lr: 0.003767  min_lr: 0.003767  loss: 3.1016 (3.3910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6212 (0.6462)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [63]  [ 800/1251]  eta: 0:01:34  lr: 0.003765  min_lr: 0.003765  loss: 3.3488 (3.3775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6456 (0.6625)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [63]  [1000/1251]  eta: 0:00:52  lr: 0.003763  min_lr: 0.003763  loss: 2.6847 (3.3734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6138 (0.6688)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [63]  [1200/1251]  eta: 0:00:10  lr: 0.003762  min_lr: 0.003762  loss: 3.7276 (3.4058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.6719)  time: 0.2180  data: 0.0006  max mem: 13273
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 3.4051 (3.4073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.6711)  time: 0.1700  data: 0.0005  max mem: 13273
Epoch: [63] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 3.4051 (3.4171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.6711)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7963 (0.7963)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.8127  data: 5.7033  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0544 (1.0512)  acc1: 79.2000 (78.4727)  acc5: 94.8000 (94.6182)  time: 0.7695  data: 0.6750  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3415 (1.3266)  acc1: 69.6000 (72.9905)  acc5: 90.0000 (91.1048)  time: 0.2024  data: 0.1105  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4721 (1.3282)  acc1: 68.8000 (72.8320)  acc5: 89.6000 (91.2320)  time: 0.2018  data: 0.1104  max mem: 13273
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 72.370 Acc@5 91.180 loss 1.336
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.37%
Epoch: [64]  [   0/1251]  eta: 1:04:10  lr: 0.003761  min_lr: 0.003761  loss: 4.1783 (4.1783)  weight_decay: 0.0500 (0.0500)  time: 3.0777  data: 2.8247  max mem: 13273
Epoch: [64]  [ 200/1251]  eta: 0:03:52  lr: 0.003760  min_lr: 0.003760  loss: 3.1384 (3.3478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6469 (0.7233)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [64]  [ 400/1251]  eta: 0:03:01  lr: 0.003758  min_lr: 0.003758  loss: 3.1521 (3.4041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7249 (0.7412)  time: 0.2071  data: 0.0004  max mem: 13273
Epoch: [64]  [ 600/1251]  eta: 0:02:17  lr: 0.003756  min_lr: 0.003756  loss: 3.5368 (3.4340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6207 (0.7404)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [64]  [ 800/1251]  eta: 0:01:34  lr: 0.003754  min_lr: 0.003754  loss: 4.0951 (3.4386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6461 (0.7168)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [64]  [1000/1251]  eta: 0:00:52  lr: 0.003753  min_lr: 0.003753  loss: 3.5452 (3.4330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6054 (0.7106)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [64]  [1200/1251]  eta: 0:00:10  lr: 0.003751  min_lr: 0.003751  loss: 3.2037 (3.4352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6796 (0.7017)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 3.2891 (3.4398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7013)  time: 0.1706  data: 0.0009  max mem: 13273
Epoch: [64] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 3.2891 (3.4149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7013)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8872 (0.8872)  acc1: 82.4000 (82.4000)  acc5: 96.0000 (96.0000)  time: 5.6931  data: 5.5869  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0551 (1.1036)  acc1: 78.8000 (77.4909)  acc5: 95.2000 (94.6909)  time: 0.7087  data: 0.6109  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3407 (1.3410)  acc1: 70.4000 (72.3810)  acc5: 89.6000 (91.0095)  time: 0.1888  data: 0.0931  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4973 (1.3547)  acc1: 70.4000 (72.2400)  acc5: 88.4000 (90.9760)  time: 0.1880  data: 0.0930  max mem: 13273
Test: Total time: 0:00:09 (0.3971 s / it)
* Acc@1 72.764 Acc@5 91.362 loss 1.349
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 72.76%
Epoch: [65]  [   0/1251]  eta: 1:04:15  lr: 0.003751  min_lr: 0.003751  loss: 2.3735 (2.3735)  weight_decay: 0.0500 (0.0500)  time: 3.0820  data: 2.8322  max mem: 13273
Epoch: [65]  [ 200/1251]  eta: 0:03:52  lr: 0.003749  min_lr: 0.003749  loss: 3.2747 (3.3655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6244 (0.6852)  time: 0.2068  data: 0.0004  max mem: 13273
Epoch: [65]  [ 400/1251]  eta: 0:03:01  lr: 0.003747  min_lr: 0.003747  loss: 2.8434 (3.4521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7006 (0.6956)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [65]  [ 600/1251]  eta: 0:02:17  lr: 0.003745  min_lr: 0.003745  loss: 2.7476 (3.4407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.6739)  time: 0.2076  data: 0.0005  max mem: 13273
Epoch: [65]  [ 800/1251]  eta: 0:01:34  lr: 0.003744  min_lr: 0.003744  loss: 3.1360 (3.4130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5748 (0.6651)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [65]  [1000/1251]  eta: 0:00:52  lr: 0.003742  min_lr: 0.003742  loss: 2.8757 (3.4107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7645 (0.6736)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [65]  [1200/1251]  eta: 0:00:10  lr: 0.003740  min_lr: 0.003740  loss: 3.6956 (3.4186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5953 (0.6679)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 3.0497 (3.4177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.6668)  time: 0.1707  data: 0.0007  max mem: 13273
Epoch: [65] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 3.0497 (3.4177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.6668)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7872 (0.7872)  acc1: 84.4000 (84.4000)  acc5: 96.4000 (96.4000)  time: 5.7051  data: 5.5958  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0381 (1.0168)  acc1: 76.8000 (78.0364)  acc5: 94.8000 (94.5091)  time: 0.6880  data: 0.5893  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3069 (1.2748)  acc1: 70.0000 (72.5524)  acc5: 90.0000 (91.2000)  time: 0.1770  data: 0.0829  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4163 (1.2856)  acc1: 69.2000 (72.2240)  acc5: 90.0000 (91.0880)  time: 0.2050  data: 0.1123  max mem: 13273
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 72.400 Acc@5 91.344 loss 1.282
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.76%
Epoch: [66]  [   0/1251]  eta: 1:02:23  lr: 0.003740  min_lr: 0.003740  loss: 4.4019 (4.4019)  weight_decay: 0.0500 (0.0500)  time: 2.9928  data: 2.5483  max mem: 13273
Epoch: [66]  [ 200/1251]  eta: 0:03:52  lr: 0.003738  min_lr: 0.003738  loss: 2.7979 (3.4338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7350 (0.7357)  time: 0.2038  data: 0.0005  max mem: 13273
Epoch: [66]  [ 400/1251]  eta: 0:03:01  lr: 0.003736  min_lr: 0.003736  loss: 2.9217 (3.4100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7273 (0.7398)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [66]  [ 600/1251]  eta: 0:02:16  lr: 0.003734  min_lr: 0.003734  loss: 3.7408 (3.3697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8297 (0.7297)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [66]  [ 800/1251]  eta: 0:01:34  lr: 0.003732  min_lr: 0.003732  loss: 2.7088 (3.3833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.7290)  time: 0.2069  data: 0.0006  max mem: 13273
Epoch: [66]  [1000/1251]  eta: 0:00:52  lr: 0.003731  min_lr: 0.003731  loss: 3.5714 (3.3999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6075 (0.7134)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [66]  [1200/1251]  eta: 0:00:10  lr: 0.003729  min_lr: 0.003729  loss: 3.5587 (3.3976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6117 (0.7053)  time: 0.2083  data: 0.0004  max mem: 13273
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 2.7794 (3.3945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6913 (0.7060)  time: 0.1701  data: 0.0008  max mem: 13273
Epoch: [66] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 2.7794 (3.4046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6913 (0.7060)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7842 (0.7842)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.4586  data: 5.3532  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9881 (1.0658)  acc1: 80.8000 (77.9273)  acc5: 95.6000 (94.8727)  time: 0.7552  data: 0.6561  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3716 (1.2995)  acc1: 70.0000 (73.4476)  acc5: 92.0000 (91.6952)  time: 0.2236  data: 0.1282  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4470 (1.3077)  acc1: 69.2000 (73.1360)  acc5: 90.0000 (91.7120)  time: 0.2222  data: 0.1282  max mem: 13273
Test: Total time: 0:00:10 (0.4145 s / it)
* Acc@1 72.576 Acc@5 91.458 loss 1.310
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.76%
Epoch: [67]  [   0/1251]  eta: 1:05:17  lr: 0.003728  min_lr: 0.003728  loss: 4.1543 (4.1543)  weight_decay: 0.0500 (0.0500)  time: 3.1312  data: 2.3837  max mem: 13273
Epoch: [67]  [ 200/1251]  eta: 0:03:52  lr: 0.003727  min_lr: 0.003727  loss: 2.9964 (3.4216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7527 (0.7009)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [67]  [ 400/1251]  eta: 0:03:03  lr: 0.003725  min_lr: 0.003725  loss: 4.0141 (3.4563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6330 (0.6904)  time: 0.2176  data: 0.0006  max mem: 13273
Epoch: [67]  [ 600/1251]  eta: 0:02:18  lr: 0.003723  min_lr: 0.003723  loss: 3.5737 (3.4399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.7121)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [67]  [ 800/1251]  eta: 0:01:35  lr: 0.003721  min_lr: 0.003721  loss: 2.9429 (3.4257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6188 (0.7087)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [67]  [1000/1251]  eta: 0:00:52  lr: 0.003719  min_lr: 0.003719  loss: 3.8842 (3.4388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5265 (0.6871)  time: 0.2075  data: 0.0004  max mem: 13273
Epoch: [67]  [1200/1251]  eta: 0:00:10  lr: 0.003717  min_lr: 0.003717  loss: 2.7335 (3.4536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6234 (0.6821)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 3.7656 (3.4567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6562 (0.6813)  time: 0.1706  data: 0.0007  max mem: 13273
Epoch: [67] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 3.7656 (3.4172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6562 (0.6813)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.9675 (0.9675)  acc1: 82.0000 (82.0000)  acc5: 95.2000 (95.2000)  time: 5.7502  data: 5.6446  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0437 (1.0757)  acc1: 78.8000 (78.2182)  acc5: 95.2000 (94.8364)  time: 0.6779  data: 0.5801  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3392 (1.3136)  acc1: 70.4000 (73.0095)  acc5: 90.4000 (91.5810)  time: 0.1691  data: 0.0754  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5302 (1.3287)  acc1: 68.8000 (72.5280)  acc5: 88.8000 (91.3280)  time: 0.1801  data: 0.0874  max mem: 13273
Test: Total time: 0:00:09 (0.3969 s / it)
* Acc@1 72.612 Acc@5 91.452 loss 1.336
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.76%
Epoch: [68]  [   0/1251]  eta: 1:04:23  lr: 0.003717  min_lr: 0.003717  loss: 2.5182 (2.5182)  weight_decay: 0.0500 (0.0500)  time: 3.0880  data: 2.8346  max mem: 13273
Epoch: [68]  [ 200/1251]  eta: 0:03:51  lr: 0.003715  min_lr: 0.003715  loss: 2.9002 (3.2717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6267 (0.6495)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [68]  [ 400/1251]  eta: 0:03:01  lr: 0.003713  min_lr: 0.003713  loss: 3.1355 (3.3154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.7081)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [68]  [ 600/1251]  eta: 0:02:17  lr: 0.003711  min_lr: 0.003711  loss: 3.0294 (3.3574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6102 (0.7001)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [68]  [ 800/1251]  eta: 0:01:34  lr: 0.003710  min_lr: 0.003710  loss: 3.4822 (3.3725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.6978)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [68]  [1000/1251]  eta: 0:00:52  lr: 0.003708  min_lr: 0.003708  loss: 2.8447 (3.3949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6144 (0.6959)  time: 0.2053  data: 0.0006  max mem: 13273
Epoch: [68]  [1200/1251]  eta: 0:00:10  lr: 0.003706  min_lr: 0.003706  loss: 3.8525 (3.3989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5621 (0.6876)  time: 0.2084  data: 0.0005  max mem: 13273
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.1602 (3.3950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.6882)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [68] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.1602 (3.3922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.6882)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8673 (0.8673)  acc1: 84.0000 (84.0000)  acc5: 96.4000 (96.4000)  time: 5.5392  data: 5.4306  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0658 (1.0547)  acc1: 78.8000 (78.1091)  acc5: 95.6000 (94.7273)  time: 0.7546  data: 0.6556  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3151 (1.2882)  acc1: 69.2000 (73.4476)  acc5: 91.2000 (91.6571)  time: 0.2148  data: 0.1205  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4435 (1.2957)  acc1: 69.2000 (73.0720)  acc5: 89.6000 (91.6320)  time: 0.2131  data: 0.1204  max mem: 13273
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 72.998 Acc@5 91.598 loss 1.305
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.00%
Epoch: [69]  [   0/1251]  eta: 1:02:50  lr: 0.003705  min_lr: 0.003705  loss: 3.3861 (3.3861)  weight_decay: 0.0500 (0.0500)  time: 3.0139  data: 2.7707  max mem: 13273
Epoch: [69]  [ 200/1251]  eta: 0:03:51  lr: 0.003703  min_lr: 0.003703  loss: 3.4140 (3.4576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6272 (0.7037)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [69]  [ 400/1251]  eta: 0:03:00  lr: 0.003702  min_lr: 0.003702  loss: 3.1978 (3.4132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7019 (0.7159)  time: 0.2053  data: 0.0006  max mem: 13273
Epoch: [69]  [ 600/1251]  eta: 0:02:16  lr: 0.003700  min_lr: 0.003700  loss: 3.7048 (3.3961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.7248)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [69]  [ 800/1251]  eta: 0:01:34  lr: 0.003698  min_lr: 0.003698  loss: 2.8037 (3.3997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.7176)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [69]  [1000/1251]  eta: 0:00:52  lr: 0.003696  min_lr: 0.003696  loss: 3.9882 (3.4106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6908 (0.7111)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [69]  [1200/1251]  eta: 0:00:10  lr: 0.003694  min_lr: 0.003694  loss: 3.8561 (3.4249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6597 (0.7077)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 3.1054 (3.4197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.7084)  time: 0.1713  data: 0.0006  max mem: 13273
Epoch: [69] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 3.1054 (3.4023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.7084)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.0317 (1.0317)  acc1: 80.8000 (80.8000)  acc5: 96.8000 (96.8000)  time: 5.6085  data: 5.5022  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0859 (1.1047)  acc1: 76.0000 (77.5273)  acc5: 95.6000 (95.2000)  time: 0.7386  data: 0.6412  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3428 (1.3234)  acc1: 70.8000 (73.1810)  acc5: 91.6000 (91.8286)  time: 0.2004  data: 0.1070  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4662 (1.3283)  acc1: 70.4000 (72.7680)  acc5: 90.8000 (91.8080)  time: 0.2024  data: 0.1098  max mem: 13273
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 72.762 Acc@5 91.468 loss 1.334
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 73.00%
Epoch: [70]  [   0/1251]  eta: 1:05:19  lr: 0.003694  min_lr: 0.003694  loss: 2.5663 (2.5663)  weight_decay: 0.0500 (0.0500)  time: 3.1330  data: 2.5183  max mem: 13273
Epoch: [70]  [ 200/1251]  eta: 0:03:53  lr: 0.003692  min_lr: 0.003692  loss: 3.2553 (3.3751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5796 (0.6626)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [70]  [ 400/1251]  eta: 0:03:01  lr: 0.003690  min_lr: 0.003690  loss: 3.8996 (3.3850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6236 (0.6721)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [70]  [ 600/1251]  eta: 0:02:17  lr: 0.003688  min_lr: 0.003688  loss: 3.0709 (3.3663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.6761)  time: 0.2076  data: 0.0006  max mem: 13273
Epoch: [70]  [ 800/1251]  eta: 0:01:34  lr: 0.003686  min_lr: 0.003686  loss: 3.5580 (3.3584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5844 (0.6735)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [70]  [1000/1251]  eta: 0:00:52  lr: 0.003684  min_lr: 0.003684  loss: 3.0124 (3.3585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6739 (0.6745)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [70]  [1200/1251]  eta: 0:00:10  lr: 0.003682  min_lr: 0.003682  loss: 3.7373 (3.3668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5921 (0.6729)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 3.3638 (3.3690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6189 (0.6755)  time: 0.1701  data: 0.0009  max mem: 13273
Epoch: [70] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 3.3638 (3.3982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6189 (0.6755)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9124 (0.9124)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.6305  data: 5.5253  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1237 (1.1211)  acc1: 79.6000 (78.5455)  acc5: 95.6000 (94.6909)  time: 0.7416  data: 0.6467  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3804 (1.3465)  acc1: 70.0000 (72.9714)  acc5: 89.2000 (91.4857)  time: 0.2105  data: 0.1184  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4775 (1.3505)  acc1: 70.0000 (72.5600)  acc5: 88.8000 (91.3120)  time: 0.2095  data: 0.1183  max mem: 13273
Test: Total time: 0:00:10 (0.4121 s / it)
* Acc@1 72.770 Acc@5 91.564 loss 1.343
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 73.00%
Epoch: [71]  [   0/1251]  eta: 1:06:30  lr: 0.003681  min_lr: 0.003681  loss: 4.0526 (4.0526)  weight_decay: 0.0500 (0.0500)  time: 3.1896  data: 2.3905  max mem: 13273
Epoch: [71]  [ 200/1251]  eta: 0:03:53  lr: 0.003680  min_lr: 0.003680  loss: 3.1092 (3.3495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6794 (0.7062)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [71]  [ 400/1251]  eta: 0:03:01  lr: 0.003678  min_lr: 0.003678  loss: 3.0657 (3.3907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.7084)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [71]  [ 600/1251]  eta: 0:02:17  lr: 0.003676  min_lr: 0.003676  loss: 3.8244 (3.3693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7175)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [71]  [ 800/1251]  eta: 0:01:34  lr: 0.003674  min_lr: 0.003674  loss: 2.9174 (3.3739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6610 (0.7066)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [71]  [1000/1251]  eta: 0:00:52  lr: 0.003672  min_lr: 0.003672  loss: 3.1088 (3.3751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (0.7079)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [71]  [1200/1251]  eta: 0:00:10  lr: 0.003670  min_lr: 0.003670  loss: 3.9060 (3.3925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6481 (0.7106)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.2140 (3.3950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6054 (0.7087)  time: 0.1709  data: 0.0007  max mem: 13273
Epoch: [71] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.2140 (3.3886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6054 (0.7087)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7613 (0.7613)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.7283  data: 5.6191  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0353 (1.0302)  acc1: 78.4000 (77.9273)  acc5: 96.0000 (94.9091)  time: 0.7376  data: 0.6394  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3465 (1.2523)  acc1: 70.8000 (73.2571)  acc5: 90.4000 (91.6000)  time: 0.2022  data: 0.1065  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3574 (1.2566)  acc1: 70.8000 (72.8960)  acc5: 89.6000 (91.7120)  time: 0.2022  data: 0.1073  max mem: 13273
Test: Total time: 0:00:10 (0.4095 s / it)
* Acc@1 72.810 Acc@5 91.652 loss 1.260
Accuracy of the model on the 50000 test images: 72.8%
Max accuracy: 73.00%
Epoch: [72]  [   0/1251]  eta: 1:07:51  lr: 0.003669  min_lr: 0.003669  loss: 3.8447 (3.8447)  weight_decay: 0.0500 (0.0500)  time: 3.2549  data: 2.9092  max mem: 13273
Epoch: [72]  [ 200/1251]  eta: 0:03:53  lr: 0.003667  min_lr: 0.003667  loss: 2.7639 (3.2279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8221 (0.7624)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [72]  [ 400/1251]  eta: 0:03:01  lr: 0.003665  min_lr: 0.003665  loss: 2.8601 (3.2708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (0.7957)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [72]  [ 600/1251]  eta: 0:02:17  lr: 0.003663  min_lr: 0.003663  loss: 2.9834 (3.3305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7188 (0.7538)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [72]  [ 800/1251]  eta: 0:01:34  lr: 0.003661  min_lr: 0.003661  loss: 3.0385 (3.3290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6061 (0.7336)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [72]  [1000/1251]  eta: 0:00:52  lr: 0.003659  min_lr: 0.003659  loss: 3.7297 (3.3434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7051 (0.7247)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [72]  [1200/1251]  eta: 0:00:10  lr: 0.003657  min_lr: 0.003657  loss: 2.8261 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6190 (0.7140)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 2.8538 (3.3401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7450 (0.7309)  time: 0.1707  data: 0.0008  max mem: 13273
Epoch: [72] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 2.8538 (3.3722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7450 (0.7309)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9540 (0.9540)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.4910  data: 5.3479  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0029 (1.0904)  acc1: 77.6000 (77.4182)  acc5: 95.2000 (94.7636)  time: 0.6995  data: 0.5969  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3412 (1.3246)  acc1: 69.2000 (72.6286)  acc5: 90.0000 (91.3714)  time: 0.1948  data: 0.0999  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.5042 (1.3343)  acc1: 69.2000 (72.2560)  acc5: 88.8000 (91.2640)  time: 0.2089  data: 0.1148  max mem: 13273
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 72.698 Acc@5 91.416 loss 1.324
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 73.00%
Epoch: [73]  [   0/1251]  eta: 1:04:29  lr: 0.003657  min_lr: 0.003657  loss: 2.6741 (2.6741)  weight_decay: 0.0500 (0.0500)  time: 3.0933  data: 2.6403  max mem: 13273
Epoch: [73]  [ 200/1251]  eta: 0:03:53  lr: 0.003655  min_lr: 0.003655  loss: 3.8535 (3.3815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.6586)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [73]  [ 400/1251]  eta: 0:03:01  lr: 0.003653  min_lr: 0.003653  loss: 3.0100 (3.3644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5751 (0.6701)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [73]  [ 600/1251]  eta: 0:02:17  lr: 0.003651  min_lr: 0.003651  loss: 3.1429 (3.3135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7029 (0.6810)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [73]  [ 800/1251]  eta: 0:01:34  lr: 0.003649  min_lr: 0.003649  loss: 3.3437 (3.3425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6428 (0.6731)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [73]  [1000/1251]  eta: 0:00:52  lr: 0.003647  min_lr: 0.003647  loss: 2.6296 (3.3544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6497 (0.6690)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [73]  [1200/1251]  eta: 0:00:10  lr: 0.003645  min_lr: 0.003645  loss: 3.2012 (3.3563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.6733)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 3.0144 (3.3555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7172 (0.6744)  time: 0.1703  data: 0.0005  max mem: 13273
Epoch: [73] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 3.0144 (3.3806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7172 (0.6744)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8086 (0.8086)  acc1: 83.6000 (83.6000)  acc5: 97.2000 (97.2000)  time: 5.5110  data: 5.4057  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0102 (1.0122)  acc1: 77.6000 (78.0000)  acc5: 95.6000 (95.1273)  time: 0.7454  data: 0.6502  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2766 (1.2339)  acc1: 70.0000 (73.4286)  acc5: 92.0000 (92.0381)  time: 0.2059  data: 0.1130  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3249 (1.2435)  acc1: 70.0000 (72.8960)  acc5: 90.4000 (92.0320)  time: 0.2047  data: 0.1129  max mem: 13273
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 72.910 Acc@5 91.692 loss 1.248
Accuracy of the model on the 50000 test images: 72.9%
Max accuracy: 73.00%
Epoch: [74]  [   0/1251]  eta: 1:06:04  lr: 0.003644  min_lr: 0.003644  loss: 2.6359 (2.6359)  weight_decay: 0.0500 (0.0500)  time: 3.1687  data: 2.8786  max mem: 13273
Epoch: [74]  [ 200/1251]  eta: 0:03:54  lr: 0.003642  min_lr: 0.003642  loss: 3.6598 (3.3547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6098 (0.6638)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [74]  [ 400/1251]  eta: 0:03:02  lr: 0.003640  min_lr: 0.003640  loss: 3.7953 (3.3427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6812 (inf)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [74]  [ 600/1251]  eta: 0:02:18  lr: 0.003638  min_lr: 0.003638  loss: 3.3930 (3.3626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (inf)  time: 0.2084  data: 0.0005  max mem: 13273
Epoch: [74]  [ 800/1251]  eta: 0:01:35  lr: 0.003636  min_lr: 0.003636  loss: 2.9789 (3.3662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6817 (inf)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [74]  [1000/1251]  eta: 0:00:52  lr: 0.003634  min_lr: 0.003634  loss: 2.8224 (3.3749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6248 (inf)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [74]  [1200/1251]  eta: 0:00:10  lr: 0.003632  min_lr: 0.003632  loss: 3.4647 (3.3832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5970 (inf)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 2.7677 (3.3783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5914 (inf)  time: 0.1704  data: 0.0009  max mem: 13273
Epoch: [74] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 2.7677 (3.3679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5914 (inf)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7618 (0.7618)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.7195  data: 5.6120  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9631 (0.9881)  acc1: 78.4000 (77.7455)  acc5: 95.6000 (95.3091)  time: 0.7568  data: 0.6626  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3043 (1.2180)  acc1: 70.4000 (73.2762)  acc5: 90.8000 (92.1905)  time: 0.2227  data: 0.1303  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3593 (1.2227)  acc1: 70.4000 (73.1200)  acc5: 90.4000 (92.0000)  time: 0.2223  data: 0.1302  max mem: 13273
Test: Total time: 0:00:10 (0.4245 s / it)
* Acc@1 73.220 Acc@5 91.932 loss 1.223
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.22%
Epoch: [75]  [   0/1251]  eta: 1:05:17  lr: 0.003631  min_lr: 0.003631  loss: 3.3154 (3.3154)  weight_decay: 0.0500 (0.0500)  time: 3.1316  data: 2.9070  max mem: 13273
Epoch: [75]  [ 200/1251]  eta: 0:03:51  lr: 0.003629  min_lr: 0.003629  loss: 2.8126 (3.4120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7724 (0.7406)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [75]  [ 400/1251]  eta: 0:03:01  lr: 0.003627  min_lr: 0.003627  loss: 3.1863 (3.3350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5920 (0.6852)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [75]  [ 600/1251]  eta: 0:02:16  lr: 0.003625  min_lr: 0.003625  loss: 2.7565 (3.3107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6159 (0.6824)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [75]  [ 800/1251]  eta: 0:01:34  lr: 0.003623  min_lr: 0.003623  loss: 3.2763 (3.3256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6224 (0.6798)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [75]  [1000/1251]  eta: 0:00:52  lr: 0.003621  min_lr: 0.003621  loss: 2.7903 (3.3515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6199 (0.6795)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [75]  [1200/1251]  eta: 0:00:10  lr: 0.003619  min_lr: 0.003619  loss: 2.8743 (3.3474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7473 (0.6777)  time: 0.2074  data: 0.0004  max mem: 13273
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 2.7729 (3.3418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7774 (0.6824)  time: 0.1706  data: 0.0007  max mem: 13273
Epoch: [75] Total time: 0:04:19 (0.2077 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 2.7729 (3.3768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7774 (0.6824)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.8031 (0.8031)  acc1: 80.0000 (80.0000)  acc5: 98.0000 (98.0000)  time: 5.1746  data: 5.0662  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0006 (0.9871)  acc1: 78.4000 (78.3636)  acc5: 95.6000 (95.1273)  time: 0.7135  data: 0.6158  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2447 (1.2166)  acc1: 70.4000 (73.6191)  acc5: 91.2000 (92.0762)  time: 0.2158  data: 0.1223  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3108 (1.2295)  acc1: 70.0000 (73.0240)  acc5: 90.8000 (91.7440)  time: 0.2149  data: 0.1222  max mem: 13273
Test: Total time: 0:00:09 (0.3973 s / it)
* Acc@1 73.014 Acc@5 91.748 loss 1.231
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.22%
Epoch: [76]  [   0/1251]  eta: 1:06:39  lr: 0.003618  min_lr: 0.003618  loss: 3.3018 (3.3018)  weight_decay: 0.0500 (0.0500)  time: 3.1973  data: 2.0363  max mem: 13273
Epoch: [76]  [ 200/1251]  eta: 0:03:52  lr: 0.003616  min_lr: 0.003616  loss: 3.6019 (3.3989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6598 (0.7426)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [76]  [ 400/1251]  eta: 0:03:01  lr: 0.003614  min_lr: 0.003614  loss: 3.9499 (3.3811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6357 (0.7000)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [76]  [ 600/1251]  eta: 0:02:17  lr: 0.003612  min_lr: 0.003612  loss: 3.7476 (3.3963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6494 (0.6892)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [76]  [ 800/1251]  eta: 0:01:34  lr: 0.003610  min_lr: 0.003610  loss: 2.8878 (3.3962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6529 (0.6908)  time: 0.2070  data: 0.0004  max mem: 13273
Epoch: [76]  [1000/1251]  eta: 0:00:52  lr: 0.003607  min_lr: 0.003607  loss: 3.0870 (3.3947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.6899)  time: 0.2060  data: 0.0003  max mem: 13273
Epoch: [76]  [1200/1251]  eta: 0:00:10  lr: 0.003605  min_lr: 0.003605  loss: 2.7138 (3.3958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6887 (0.6902)  time: 0.2069  data: 0.0003  max mem: 13273
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 3.5002 (3.3961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.6944)  time: 0.1699  data: 0.0005  max mem: 13273
Epoch: [76] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 3.5002 (3.3698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.6944)
Test:  [ 0/25]  eta: 0:01:30  loss: 0.8768 (0.8768)  acc1: 86.4000 (86.4000)  acc5: 96.0000 (96.0000)  time: 3.6080  data: 3.4961  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.0416 (1.0631)  acc1: 77.2000 (77.7455)  acc5: 96.0000 (94.6546)  time: 0.6007  data: 0.5048  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3427 (1.2882)  acc1: 70.0000 (73.3524)  acc5: 90.8000 (91.8476)  time: 0.2711  data: 0.1788  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4038 (1.2983)  acc1: 70.0000 (72.9760)  acc5: 90.4000 (91.6960)  time: 0.1926  data: 0.1021  max mem: 13273
Test: Total time: 0:00:09 (0.3994 s / it)
* Acc@1 73.354 Acc@5 92.000 loss 1.283
Accuracy of the model on the 50000 test images: 73.4%
Max accuracy: 73.35%
Epoch: [77]  [   0/1251]  eta: 0:49:34  lr: 0.003605  min_lr: 0.003605  loss: 2.3645 (2.3645)  weight_decay: 0.0500 (0.0500)  time: 2.3775  data: 1.9418  max mem: 13273
Epoch: [77]  [ 200/1251]  eta: 0:03:51  lr: 0.003603  min_lr: 0.003603  loss: 2.6713 (3.3294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7263 (0.7427)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [77]  [ 400/1251]  eta: 0:03:02  lr: 0.003601  min_lr: 0.003601  loss: 3.0601 (3.3366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6598 (0.6956)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [77]  [ 600/1251]  eta: 0:02:17  lr: 0.003598  min_lr: 0.003598  loss: 3.5572 (3.3852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.6955)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [77]  [ 800/1251]  eta: 0:01:34  lr: 0.003596  min_lr: 0.003596  loss: 3.0515 (3.3512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.6956)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [77]  [1000/1251]  eta: 0:00:52  lr: 0.003594  min_lr: 0.003594  loss: 3.8439 (3.3522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6437 (0.6977)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [77]  [1200/1251]  eta: 0:00:10  lr: 0.003592  min_lr: 0.003592  loss: 3.8730 (3.3723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5863 (0.6906)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 2.7801 (3.3678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6024 (0.6905)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [77] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 2.7801 (3.3642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6024 (0.6905)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7909 (0.7909)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.8074  data: 5.6810  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0112 (1.0069)  acc1: 79.2000 (78.6182)  acc5: 96.0000 (95.0909)  time: 0.7658  data: 0.6648  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3086 (1.2396)  acc1: 71.6000 (73.8857)  acc5: 91.2000 (91.9429)  time: 0.2091  data: 0.1147  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4144 (1.2506)  acc1: 70.4000 (73.4720)  acc5: 90.4000 (91.9360)  time: 0.2073  data: 0.1146  max mem: 13273
Test: Total time: 0:00:10 (0.4172 s / it)
* Acc@1 73.326 Acc@5 91.916 loss 1.249
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.35%
Epoch: [78]  [   0/1251]  eta: 1:02:51  lr: 0.003591  min_lr: 0.003591  loss: 3.3595 (3.3595)  weight_decay: 0.0500 (0.0500)  time: 3.0146  data: 2.1357  max mem: 13273
Epoch: [78]  [ 200/1251]  eta: 0:03:52  lr: 0.003589  min_lr: 0.003589  loss: 3.5978 (3.4007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6699 (0.6630)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [78]  [ 400/1251]  eta: 0:03:01  lr: 0.003587  min_lr: 0.003587  loss: 3.4531 (3.4155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6450 (0.6974)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [78]  [ 600/1251]  eta: 0:02:17  lr: 0.003585  min_lr: 0.003585  loss: 3.0285 (3.3948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5998 (0.7072)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [78]  [ 800/1251]  eta: 0:01:34  lr: 0.003583  min_lr: 0.003583  loss: 2.5889 (3.3752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5792 (0.7207)  time: 0.2075  data: 0.0006  max mem: 13273
Epoch: [78]  [1000/1251]  eta: 0:00:52  lr: 0.003580  min_lr: 0.003580  loss: 3.7464 (3.3903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6064 (0.7048)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [78]  [1200/1251]  eta: 0:00:10  lr: 0.003578  min_lr: 0.003578  loss: 3.4603 (3.3783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6547 (0.6964)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 3.1087 (3.3808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.7000)  time: 0.1702  data: 0.0005  max mem: 13273
Epoch: [78] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 3.1087 (3.3557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.7000)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8127 (0.8127)  acc1: 82.4000 (82.4000)  acc5: 97.6000 (97.6000)  time: 5.6036  data: 5.4975  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9295 (1.0014)  acc1: 81.2000 (78.7273)  acc5: 96.0000 (95.3091)  time: 0.7316  data: 0.6340  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2704 (1.2362)  acc1: 70.8000 (73.9619)  acc5: 91.6000 (92.0952)  time: 0.2127  data: 0.1191  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3791 (1.2448)  acc1: 70.8000 (73.7600)  acc5: 89.6000 (91.9360)  time: 0.2118  data: 0.1191  max mem: 13273
Test: Total time: 0:00:10 (0.4121 s / it)
* Acc@1 73.464 Acc@5 91.874 loss 1.249
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.46%
Epoch: [79]  [   0/1251]  eta: 0:57:23  lr: 0.003578  min_lr: 0.003578  loss: 3.5911 (3.5911)  weight_decay: 0.0500 (0.0500)  time: 2.7525  data: 2.5213  max mem: 13273
Epoch: [79]  [ 200/1251]  eta: 0:03:49  lr: 0.003575  min_lr: 0.003575  loss: 3.8298 (3.3726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5757 (0.6455)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [79]  [ 400/1251]  eta: 0:03:00  lr: 0.003573  min_lr: 0.003573  loss: 3.0467 (3.3594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7804 (0.6780)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [79]  [ 600/1251]  eta: 0:02:16  lr: 0.003571  min_lr: 0.003571  loss: 3.4803 (3.3661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6310 (0.6821)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [79]  [ 800/1251]  eta: 0:01:34  lr: 0.003569  min_lr: 0.003569  loss: 3.2520 (3.3501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5880 (0.6910)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [79]  [1000/1251]  eta: 0:00:52  lr: 0.003567  min_lr: 0.003567  loss: 2.5545 (3.3273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7231 (0.6923)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [79]  [1200/1251]  eta: 0:00:10  lr: 0.003564  min_lr: 0.003564  loss: 3.2045 (3.3266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.6890)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 3.5291 (3.3314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6262 (0.6874)  time: 0.1707  data: 0.0007  max mem: 13273
Epoch: [79] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 3.5291 (3.3470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6262 (0.6874)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9016 (0.9016)  acc1: 82.4000 (82.4000)  acc5: 98.0000 (98.0000)  time: 5.6253  data: 5.5169  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0371 (1.0714)  acc1: 80.4000 (78.2909)  acc5: 95.6000 (95.1636)  time: 0.7843  data: 0.6843  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3598 (1.2978)  acc1: 70.4000 (73.8095)  acc5: 91.6000 (91.9810)  time: 0.2168  data: 0.1215  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4652 (1.3134)  acc1: 70.4000 (73.2960)  acc5: 89.6000 (91.9200)  time: 0.2155  data: 0.1214  max mem: 13273
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 73.556 Acc@5 91.956 loss 1.312
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.56%
Epoch: [80]  [   0/1251]  eta: 1:01:34  lr: 0.003564  min_lr: 0.003564  loss: 3.0631 (3.0631)  weight_decay: 0.0500 (0.0500)  time: 2.9533  data: 2.7074  max mem: 13273
Epoch: [80]  [ 200/1251]  eta: 0:03:50  lr: 0.003562  min_lr: 0.003562  loss: 2.7420 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6229 (0.6619)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [80]  [ 400/1251]  eta: 0:03:01  lr: 0.003559  min_lr: 0.003559  loss: 3.1133 (3.3010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.6970)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [80]  [ 600/1251]  eta: 0:02:17  lr: 0.003557  min_lr: 0.003557  loss: 4.0115 (3.3263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7630 (0.7092)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [80]  [ 800/1251]  eta: 0:01:34  lr: 0.003555  min_lr: 0.003555  loss: 3.1656 (3.3269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6276 (0.7020)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [80]  [1000/1251]  eta: 0:00:52  lr: 0.003553  min_lr: 0.003553  loss: 3.3440 (3.3337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (inf)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [80]  [1200/1251]  eta: 0:00:10  lr: 0.003550  min_lr: 0.003550  loss: 3.3599 (3.3609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7027 (inf)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 2.7199 (3.3458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6651 (inf)  time: 0.1706  data: 0.0007  max mem: 13273
Epoch: [80] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 2.7199 (3.3627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6651 (inf)
Test:  [ 0/25]  eta: 0:01:54  loss: 0.7383 (0.7383)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 4.5953  data: 4.4893  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.9818 (0.9506)  acc1: 78.8000 (78.5091)  acc5: 95.6000 (95.0182)  time: 0.6536  data: 0.5564  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2051 (1.1698)  acc1: 70.8000 (73.7524)  acc5: 91.2000 (91.9048)  time: 0.2135  data: 0.1188  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2875 (1.1768)  acc1: 70.4000 (73.4560)  acc5: 90.0000 (92.1280)  time: 0.2321  data: 0.1379  max mem: 13273
Test: Total time: 0:00:10 (0.4020 s / it)
* Acc@1 73.534 Acc@5 92.054 loss 1.183
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.56%
Epoch: [81]  [   0/1251]  eta: 1:03:45  lr: 0.003550  min_lr: 0.003550  loss: 4.2791 (4.2791)  weight_decay: 0.0500 (0.0500)  time: 3.0583  data: 1.6232  max mem: 13273
Epoch: [81]  [ 200/1251]  eta: 0:03:54  lr: 0.003547  min_lr: 0.003547  loss: 2.8951 (3.3396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6346 (0.7344)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [81]  [ 400/1251]  eta: 0:03:02  lr: 0.003545  min_lr: 0.003545  loss: 2.5819 (3.3196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6351 (0.7191)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [81]  [ 600/1251]  eta: 0:02:17  lr: 0.003543  min_lr: 0.003543  loss: 3.1012 (3.3087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6969 (0.7246)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [81]  [ 800/1251]  eta: 0:01:34  lr: 0.003541  min_lr: 0.003541  loss: 2.7695 (3.3326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.7215)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [81]  [1000/1251]  eta: 0:00:52  lr: 0.003538  min_lr: 0.003538  loss: 3.1729 (3.3378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6146 (0.7139)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [81]  [1200/1251]  eta: 0:00:10  lr: 0.003536  min_lr: 0.003536  loss: 2.9374 (3.3294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6273 (0.6990)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 2.7938 (3.3313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5967 (0.6961)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [81] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 2.7938 (3.3428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5967 (0.6961)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.7789 (0.7789)  acc1: 80.8000 (80.8000)  acc5: 97.2000 (97.2000)  time: 4.6021  data: 4.4931  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.9880 (0.9792)  acc1: 80.0000 (78.6909)  acc5: 95.2000 (95.1636)  time: 0.6614  data: 0.5637  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2131 (1.2013)  acc1: 71.2000 (74.0000)  acc5: 91.2000 (92.0191)  time: 0.2215  data: 0.1278  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3867 (1.2136)  acc1: 71.2000 (73.8880)  acc5: 90.0000 (92.0000)  time: 0.2045  data: 0.1126  max mem: 13273
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 73.652 Acc@5 92.120 loss 1.213
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 73.65%
Epoch: [82]  [   0/1251]  eta: 0:56:56  lr: 0.003535  min_lr: 0.003535  loss: 2.1821 (2.1821)  weight_decay: 0.0500 (0.0500)  time: 2.7309  data: 2.5061  max mem: 13273
Epoch: [82]  [ 200/1251]  eta: 0:03:50  lr: 0.003533  min_lr: 0.003533  loss: 2.6400 (3.2867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6827 (0.7265)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [82]  [ 400/1251]  eta: 0:03:00  lr: 0.003531  min_lr: 0.003531  loss: 3.6427 (3.2856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6901 (0.7292)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [82]  [ 600/1251]  eta: 0:02:16  lr: 0.003528  min_lr: 0.003528  loss: 3.5772 (3.2872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.7301)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [82]  [ 800/1251]  eta: 0:01:34  lr: 0.003526  min_lr: 0.003526  loss: 3.6249 (3.3049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.7290)  time: 0.2048  data: 0.0003  max mem: 13273
Epoch: [82]  [1000/1251]  eta: 0:00:52  lr: 0.003524  min_lr: 0.003524  loss: 2.7123 (3.3079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7046 (0.7255)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [82]  [1200/1251]  eta: 0:00:10  lr: 0.003521  min_lr: 0.003521  loss: 2.8459 (3.3068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6714 (0.7237)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 2.6674 (3.2985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.7234)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [82] Total time: 0:04:19 (0.2078 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 2.6674 (3.3561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.7234)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7258 (0.7258)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.4600  data: 5.3512  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9876 (0.9976)  acc1: 79.2000 (78.3636)  acc5: 95.2000 (94.9455)  time: 0.7067  data: 0.6111  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2282 (1.2099)  acc1: 70.4000 (73.3524)  acc5: 91.6000 (92.0191)  time: 0.2022  data: 0.1099  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3514 (1.2150)  acc1: 70.0000 (73.3120)  acc5: 90.0000 (91.9680)  time: 0.2019  data: 0.1098  max mem: 13273
Test: Total time: 0:00:09 (0.3991 s / it)
* Acc@1 73.634 Acc@5 92.126 loss 1.206
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.65%
Epoch: [83]  [   0/1251]  eta: 1:08:53  lr: 0.003521  min_lr: 0.003521  loss: 4.0057 (4.0057)  weight_decay: 0.0500 (0.0500)  time: 3.3044  data: 1.5816  max mem: 13273
Epoch: [83]  [ 200/1251]  eta: 0:03:53  lr: 0.003519  min_lr: 0.003519  loss: 2.8340 (3.2919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7549 (0.7812)  time: 0.2068  data: 0.0004  max mem: 13273
Epoch: [83]  [ 400/1251]  eta: 0:03:01  lr: 0.003516  min_lr: 0.003516  loss: 3.5654 (3.3121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6092 (0.7368)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [83]  [ 600/1251]  eta: 0:02:17  lr: 0.003514  min_lr: 0.003514  loss: 3.7140 (3.2980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6925 (0.7314)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [83]  [ 800/1251]  eta: 0:01:34  lr: 0.003512  min_lr: 0.003512  loss: 2.9800 (3.3358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6490 (inf)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [83]  [1000/1251]  eta: 0:00:52  lr: 0.003509  min_lr: 0.003509  loss: 3.0239 (3.3462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6464 (inf)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [83]  [1200/1251]  eta: 0:00:10  lr: 0.003507  min_lr: 0.003507  loss: 2.8820 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7292 (inf)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 2.7081 (3.3453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7441 (inf)  time: 0.1705  data: 0.0007  max mem: 13273
Epoch: [83] Total time: 0:04:19 (0.2077 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 2.7081 (3.3504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7441 (inf)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.9190 (0.9190)  acc1: 84.4000 (84.4000)  acc5: 98.0000 (98.0000)  time: 5.7451  data: 5.6398  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0900 (1.1085)  acc1: 78.0000 (78.0727)  acc5: 96.4000 (95.2727)  time: 0.7674  data: 0.6729  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3769 (1.3418)  acc1: 69.6000 (73.4095)  acc5: 91.2000 (91.9810)  time: 0.2159  data: 0.1241  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4669 (1.3476)  acc1: 69.6000 (73.1520)  acc5: 89.6000 (91.9520)  time: 0.2150  data: 0.1240  max mem: 13273
Test: Total time: 0:00:10 (0.4200 s / it)
* Acc@1 73.174 Acc@5 91.816 loss 1.347
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.65%
Epoch: [84]  [   0/1251]  eta: 1:05:16  lr: 0.003506  min_lr: 0.003506  loss: 2.7078 (2.7078)  weight_decay: 0.0500 (0.0500)  time: 3.1307  data: 2.3888  max mem: 13273
Epoch: [84]  [ 200/1251]  eta: 0:03:55  lr: 0.003504  min_lr: 0.003504  loss: 2.8871 (3.3581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6363 (0.6972)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [84]  [ 400/1251]  eta: 0:03:03  lr: 0.003502  min_lr: 0.003502  loss: 3.0799 (3.3402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6456 (0.6837)  time: 0.2031  data: 0.0006  max mem: 13273
Epoch: [84]  [ 600/1251]  eta: 0:02:17  lr: 0.003499  min_lr: 0.003499  loss: 3.3285 (3.3532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6313 (0.6928)  time: 0.2036  data: 0.0004  max mem: 13273
Epoch: [84]  [ 800/1251]  eta: 0:01:34  lr: 0.003497  min_lr: 0.003497  loss: 2.8408 (3.3354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6152 (0.6917)  time: 0.2084  data: 0.0004  max mem: 13273
Epoch: [84]  [1000/1251]  eta: 0:00:52  lr: 0.003494  min_lr: 0.003494  loss: 3.6359 (3.3545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7565 (nan)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [84]  [1200/1251]  eta: 0:00:10  lr: 0.003492  min_lr: 0.003492  loss: 3.5949 (3.3544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7395 (nan)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 2.7435 (3.3536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (nan)  time: 0.1707  data: 0.0006  max mem: 13273
Epoch: [84] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 2.7435 (3.3480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8024 (0.8024)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.6689  data: 5.5615  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0046 (0.9921)  acc1: 77.6000 (78.6909)  acc5: 95.6000 (95.4546)  time: 0.7636  data: 0.6672  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2382 (1.2176)  acc1: 71.2000 (74.1333)  acc5: 93.2000 (92.4191)  time: 0.2006  data: 0.1071  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3966 (1.2292)  acc1: 71.2000 (73.5520)  acc5: 90.4000 (92.3840)  time: 0.2073  data: 0.1138  max mem: 13273
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 73.890 Acc@5 92.272 loss 1.222
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.89%
Epoch: [85]  [   0/1251]  eta: 1:02:00  lr: 0.003491  min_lr: 0.003491  loss: 2.4654 (2.4654)  weight_decay: 0.0500 (0.0500)  time: 2.9739  data: 2.7235  max mem: 13273
Epoch: [85]  [ 200/1251]  eta: 0:03:51  lr: 0.003489  min_lr: 0.003489  loss: 3.0182 (3.2600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6996 (0.6932)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [85]  [ 400/1251]  eta: 0:03:01  lr: 0.003487  min_lr: 0.003487  loss: 2.7572 (3.2749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (0.7339)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [85]  [ 600/1251]  eta: 0:02:16  lr: 0.003484  min_lr: 0.003484  loss: 3.1712 (3.2777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8409 (0.7343)  time: 0.2044  data: 0.0006  max mem: 13273
Epoch: [85]  [ 800/1251]  eta: 0:01:34  lr: 0.003482  min_lr: 0.003482  loss: 3.1909 (3.2995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.7290)  time: 0.2068  data: 0.0007  max mem: 13273
Epoch: [85]  [1000/1251]  eta: 0:00:52  lr: 0.003479  min_lr: 0.003479  loss: 2.8435 (3.3287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6706 (0.7219)  time: 0.2065  data: 0.0006  max mem: 13273
Epoch: [85]  [1200/1251]  eta: 0:00:10  lr: 0.003477  min_lr: 0.003477  loss: 2.7094 (3.3186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6085 (0.7135)  time: 0.2081  data: 0.0005  max mem: 13273
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 2.9329 (3.3173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6117 (0.7098)  time: 0.1708  data: 0.0006  max mem: 13273
Epoch: [85] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 2.9329 (3.3327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6117 (0.7098)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8425 (0.8425)  acc1: 83.2000 (83.2000)  acc5: 96.0000 (96.0000)  time: 5.5651  data: 5.4599  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9187 (1.0070)  acc1: 78.8000 (78.2909)  acc5: 96.0000 (95.1636)  time: 0.7372  data: 0.6426  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2811 (1.2077)  acc1: 71.2000 (73.8857)  acc5: 92.0000 (92.5333)  time: 0.2138  data: 0.1209  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3860 (1.2251)  acc1: 69.6000 (73.5680)  acc5: 91.2000 (92.4000)  time: 0.2136  data: 0.1208  max mem: 13273
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 73.672 Acc@5 92.192 loss 1.232
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 73.89%
Epoch: [86]  [   0/1251]  eta: 1:06:23  lr: 0.003476  min_lr: 0.003476  loss: 4.0378 (4.0378)  weight_decay: 0.0500 (0.0500)  time: 3.1843  data: 2.5819  max mem: 13273
Epoch: [86]  [ 200/1251]  eta: 0:03:52  lr: 0.003474  min_lr: 0.003474  loss: 3.8350 (3.3496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6976 (0.7518)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [86]  [ 400/1251]  eta: 0:03:01  lr: 0.003472  min_lr: 0.003472  loss: 4.0595 (3.3498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6212 (0.7157)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [86]  [ 600/1251]  eta: 0:02:17  lr: 0.003469  min_lr: 0.003469  loss: 3.2243 (3.3263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7601 (0.7170)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [86]  [ 800/1251]  eta: 0:01:34  lr: 0.003467  min_lr: 0.003467  loss: 3.7463 (3.3345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7371 (0.7171)  time: 0.2074  data: 0.0004  max mem: 13273
Epoch: [86]  [1000/1251]  eta: 0:00:52  lr: 0.003464  min_lr: 0.003464  loss: 2.5743 (3.3144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (0.7152)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [86]  [1200/1251]  eta: 0:00:10  lr: 0.003462  min_lr: 0.003462  loss: 3.3939 (3.3280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6175 (0.7130)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 2.7919 (3.3259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6129 (0.7100)  time: 0.1708  data: 0.0010  max mem: 13273
Epoch: [86] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 2.7919 (3.3271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6129 (0.7100)
Test:  [ 0/25]  eta: 0:01:37  loss: 0.7645 (0.7645)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 3.8916  data: 3.7851  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.0076 (0.9689)  acc1: 79.2000 (79.1636)  acc5: 95.2000 (95.1636)  time: 0.6311  data: 0.5371  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2048 (1.1780)  acc1: 70.8000 (74.2476)  acc5: 91.6000 (92.3429)  time: 0.2511  data: 0.1572  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2803 (1.1924)  acc1: 70.8000 (73.7760)  acc5: 90.8000 (92.2720)  time: 0.2267  data: 0.1335  max mem: 13273
Test: Total time: 0:00:10 (0.4122 s / it)
* Acc@1 73.942 Acc@5 92.292 loss 1.200
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.94%
Epoch: [87]  [   0/1251]  eta: 1:03:59  lr: 0.003461  min_lr: 0.003461  loss: 3.8287 (3.8287)  weight_decay: 0.0500 (0.0500)  time: 3.0694  data: 2.8342  max mem: 13273
Epoch: [87]  [ 200/1251]  eta: 0:03:51  lr: 0.003459  min_lr: 0.003459  loss: 2.9413 (3.3196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6393 (0.6792)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [87]  [ 400/1251]  eta: 0:03:01  lr: 0.003456  min_lr: 0.003456  loss: 2.9880 (3.2872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6767 (0.7102)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [87]  [ 600/1251]  eta: 0:02:17  lr: 0.003454  min_lr: 0.003454  loss: 3.1092 (3.2904)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [87]  [ 800/1251]  eta: 0:01:34  lr: 0.003451  min_lr: 0.003451  loss: 2.5763 (3.2819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6561 (nan)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [87]  [1000/1251]  eta: 0:00:52  lr: 0.003449  min_lr: 0.003449  loss: 2.9731 (3.2919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6540 (nan)  time: 0.2070  data: 0.0004  max mem: 13273
Epoch: [87]  [1200/1251]  eta: 0:00:10  lr: 0.003446  min_lr: 0.003446  loss: 2.5847 (3.2877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6352 (nan)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 3.1849 (3.2912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6300 (nan)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [87] Total time: 0:04:22 (0.2095 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 3.1849 (3.3294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6300 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8913 (0.8913)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 5.6434  data: 5.5059  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9840 (1.0274)  acc1: 81.6000 (78.6546)  acc5: 95.6000 (95.0909)  time: 0.7358  data: 0.6338  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2912 (1.2435)  acc1: 72.0000 (74.1524)  acc5: 91.6000 (92.0762)  time: 0.2097  data: 0.1115  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3972 (1.2518)  acc1: 70.0000 (73.7280)  acc5: 89.6000 (92.0320)  time: 0.2090  data: 0.1114  max mem: 13273
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 73.924 Acc@5 92.164 loss 1.250
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.94%
Epoch: [88]  [   0/1251]  eta: 1:04:14  lr: 0.003446  min_lr: 0.003446  loss: 3.4618 (3.4618)  weight_decay: 0.0500 (0.0500)  time: 3.0812  data: 2.4442  max mem: 13273
Epoch: [88]  [ 200/1251]  eta: 0:03:53  lr: 0.003443  min_lr: 0.003443  loss: 2.6717 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6338 (0.7274)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [88]  [ 400/1251]  eta: 0:03:01  lr: 0.003441  min_lr: 0.003441  loss: 3.1207 (3.2804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.7187)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [88]  [ 600/1251]  eta: 0:02:17  lr: 0.003438  min_lr: 0.003438  loss: 2.6804 (3.2898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6481 (0.7150)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [88]  [ 800/1251]  eta: 0:01:34  lr: 0.003436  min_lr: 0.003436  loss: 3.1233 (3.3057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6737 (0.7217)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [88]  [1000/1251]  eta: 0:00:52  lr: 0.003433  min_lr: 0.003433  loss: 3.4356 (3.3087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6023 (0.7141)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [88]  [1200/1251]  eta: 0:00:10  lr: 0.003431  min_lr: 0.003431  loss: 2.7920 (3.3052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6785 (0.7105)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 3.7926 (3.3108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7097)  time: 0.1701  data: 0.0010  max mem: 13273
Epoch: [88] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 3.7926 (3.3309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7097)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.9174 (0.9174)  acc1: 82.4000 (82.4000)  acc5: 98.4000 (98.4000)  time: 5.1423  data: 5.0276  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0461 (1.0798)  acc1: 80.8000 (78.7636)  acc5: 96.4000 (95.9273)  time: 0.7042  data: 0.6064  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3968 (1.3122)  acc1: 70.4000 (73.4857)  acc5: 92.0000 (92.5905)  time: 0.2075  data: 0.1142  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4696 (1.3277)  acc1: 69.6000 (73.1680)  acc5: 90.8000 (92.5440)  time: 0.2167  data: 0.1245  max mem: 13273
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 73.656 Acc@5 92.134 loss 1.333
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 73.94%
Epoch: [89]  [   0/1251]  eta: 1:04:58  lr: 0.003430  min_lr: 0.003430  loss: 2.7599 (2.7599)  weight_decay: 0.0500 (0.0500)  time: 3.1161  data: 2.0904  max mem: 13273
Epoch: [89]  [ 200/1251]  eta: 0:03:53  lr: 0.003428  min_lr: 0.003428  loss: 2.7290 (3.3069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6659 (0.7001)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [89]  [ 400/1251]  eta: 0:03:02  lr: 0.003425  min_lr: 0.003425  loss: 2.7296 (3.2986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7242 (0.6960)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [89]  [ 600/1251]  eta: 0:02:17  lr: 0.003423  min_lr: 0.003423  loss: 3.1128 (3.2928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6282 (0.6976)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [89]  [ 800/1251]  eta: 0:01:34  lr: 0.003420  min_lr: 0.003420  loss: 2.7275 (3.3035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5880 (0.6892)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [89]  [1000/1251]  eta: 0:00:52  lr: 0.003418  min_lr: 0.003418  loss: 3.1999 (3.3018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6816 (0.6931)  time: 0.2114  data: 0.0004  max mem: 13273
Epoch: [89]  [1200/1251]  eta: 0:00:10  lr: 0.003415  min_lr: 0.003415  loss: 2.8561 (3.3028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6151 (0.6898)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.2240 (3.3041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.6926)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [89] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.2240 (3.3215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.6926)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9181 (0.9181)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.5661  data: 5.4605  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.1003 (1.0913)  acc1: 79.6000 (79.3455)  acc5: 96.0000 (95.4182)  time: 0.7563  data: 0.6595  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2891 (1.3126)  acc1: 71.6000 (74.4191)  acc5: 91.2000 (92.3810)  time: 0.2131  data: 0.1196  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4487 (1.3273)  acc1: 71.6000 (74.0160)  acc5: 90.0000 (92.1600)  time: 0.2120  data: 0.1195  max mem: 13273
Test: Total time: 0:00:10 (0.4107 s / it)
* Acc@1 73.842 Acc@5 92.158 loss 1.335
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 73.94%
Epoch: [90]  [   0/1251]  eta: 1:01:57  lr: 0.003414  min_lr: 0.003414  loss: 4.2717 (4.2717)  weight_decay: 0.0500 (0.0500)  time: 2.9714  data: 2.4859  max mem: 13273
Epoch: [90]  [ 200/1251]  eta: 0:03:52  lr: 0.003412  min_lr: 0.003412  loss: 3.5683 (3.2874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6873 (0.6880)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [90]  [ 400/1251]  eta: 0:03:01  lr: 0.003409  min_lr: 0.003409  loss: 3.3149 (3.3185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.6941)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [90]  [ 600/1251]  eta: 0:02:17  lr: 0.003407  min_lr: 0.003407  loss: 2.7242 (3.3092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.6918)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [90]  [ 800/1251]  eta: 0:01:34  lr: 0.003404  min_lr: 0.003404  loss: 3.1294 (3.3115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.6910)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [90]  [1000/1251]  eta: 0:00:52  lr: 0.003402  min_lr: 0.003402  loss: 3.3683 (3.3155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7064 (0.6900)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [90]  [1200/1251]  eta: 0:00:10  lr: 0.003399  min_lr: 0.003399  loss: 3.5410 (3.3143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (0.7015)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 3.4887 (3.3163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6072 (0.6967)  time: 0.1703  data: 0.0006  max mem: 13273
Epoch: [90] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 3.4887 (3.3199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6072 (0.6967)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8589 (0.8589)  acc1: 84.4000 (84.4000)  acc5: 96.4000 (96.4000)  time: 5.3772  data: 5.2700  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9637 (1.0224)  acc1: 78.0000 (78.4000)  acc5: 95.6000 (95.5273)  time: 0.7054  data: 0.6119  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2614 (1.2333)  acc1: 72.0000 (73.7524)  acc5: 92.0000 (92.5524)  time: 0.2018  data: 0.1106  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3380 (1.2434)  acc1: 71.2000 (73.6320)  acc5: 90.8000 (92.6080)  time: 0.2189  data: 0.1277  max mem: 13273
Test: Total time: 0:00:10 (0.4084 s / it)
* Acc@1 74.156 Acc@5 92.394 loss 1.249
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.16%
Epoch: [91]  [   0/1251]  eta: 1:00:30  lr: 0.003398  min_lr: 0.003398  loss: 3.5932 (3.5932)  weight_decay: 0.0500 (0.0500)  time: 2.9018  data: 2.4570  max mem: 13273
Epoch: [91]  [ 200/1251]  eta: 0:03:53  lr: 0.003396  min_lr: 0.003396  loss: 3.2748 (3.3199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.6585)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [91]  [ 400/1251]  eta: 0:03:02  lr: 0.003393  min_lr: 0.003393  loss: 2.9191 (3.2943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6580 (0.6772)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [91]  [ 600/1251]  eta: 0:02:17  lr: 0.003391  min_lr: 0.003391  loss: 4.0515 (3.3011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (0.6759)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [91]  [ 800/1251]  eta: 0:01:34  lr: 0.003388  min_lr: 0.003388  loss: 3.7985 (3.3192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.6897)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [91]  [1000/1251]  eta: 0:00:52  lr: 0.003385  min_lr: 0.003385  loss: 3.1319 (3.3382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6737 (0.6953)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [91]  [1200/1251]  eta: 0:00:10  lr: 0.003383  min_lr: 0.003383  loss: 2.6658 (3.3219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (0.6917)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 3.3541 (3.3266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6728 (0.6909)  time: 0.1706  data: 0.0012  max mem: 13273
Epoch: [91] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 3.3541 (3.3141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6728 (0.6909)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.8357 (0.8357)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 5.9816  data: 5.8751  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0097 (1.0649)  acc1: 78.8000 (77.7818)  acc5: 95.6000 (95.0909)  time: 0.7363  data: 0.6399  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2800 (1.2608)  acc1: 72.0000 (73.7333)  acc5: 92.0000 (92.5905)  time: 0.1875  data: 0.0945  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4126 (1.2756)  acc1: 72.0000 (73.5360)  acc5: 91.2000 (92.4000)  time: 0.1867  data: 0.0945  max mem: 13273
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 73.634 Acc@5 92.116 loss 1.277
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 74.16%
Epoch: [92]  [   0/1251]  eta: 1:00:37  lr: 0.003382  min_lr: 0.003382  loss: 3.0577 (3.0577)  weight_decay: 0.0500 (0.0500)  time: 2.9077  data: 2.2973  max mem: 13273
Epoch: [92]  [ 200/1251]  eta: 0:03:52  lr: 0.003380  min_lr: 0.003380  loss: 3.6749 (3.2851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7234 (0.7267)  time: 0.2066  data: 0.0008  max mem: 13273
Epoch: [92]  [ 400/1251]  eta: 0:03:01  lr: 0.003377  min_lr: 0.003377  loss: 3.0375 (3.2875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.6938)  time: 0.2121  data: 0.0003  max mem: 13273
Epoch: [92]  [ 600/1251]  eta: 0:02:16  lr: 0.003374  min_lr: 0.003374  loss: 2.9416 (3.2867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6387 (0.6848)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [92]  [ 800/1251]  eta: 0:01:34  lr: 0.003372  min_lr: 0.003372  loss: 3.2043 (3.2689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6452 (0.6865)  time: 0.2037  data: 0.0004  max mem: 13273
Epoch: [92]  [1000/1251]  eta: 0:00:52  lr: 0.003369  min_lr: 0.003369  loss: 2.8191 (3.2683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.6901)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [92]  [1200/1251]  eta: 0:00:10  lr: 0.003367  min_lr: 0.003367  loss: 3.4385 (3.2677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6127 (0.6874)  time: 0.2067  data: 0.0004  max mem: 13273
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 3.2034 (3.2674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6519 (0.6861)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [92] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 3.2034 (3.3089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6519 (0.6861)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8469 (0.8469)  acc1: 85.2000 (85.2000)  acc5: 96.0000 (96.0000)  time: 5.4515  data: 5.3460  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9693 (1.0259)  acc1: 79.6000 (78.5455)  acc5: 96.0000 (95.1273)  time: 0.7485  data: 0.6505  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3030 (1.2481)  acc1: 70.0000 (73.8095)  acc5: 91.6000 (92.1714)  time: 0.2148  data: 0.1201  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3392 (1.2570)  acc1: 70.0000 (73.7600)  acc5: 90.4000 (92.1280)  time: 0.2191  data: 0.1252  max mem: 13273
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 74.018 Acc@5 92.254 loss 1.256
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.16%
Epoch: [93]  [   0/1251]  eta: 1:05:15  lr: 0.003366  min_lr: 0.003366  loss: 4.1235 (4.1235)  weight_decay: 0.0500 (0.0500)  time: 3.1302  data: 1.6128  max mem: 13273
Epoch: [93]  [ 200/1251]  eta: 0:03:53  lr: 0.003363  min_lr: 0.003363  loss: 3.4711 (3.3169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (0.7294)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [93]  [ 400/1251]  eta: 0:03:02  lr: 0.003361  min_lr: 0.003361  loss: 3.7611 (3.3618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6323 (0.7180)  time: 0.2064  data: 0.0006  max mem: 13273
Epoch: [93]  [ 600/1251]  eta: 0:02:17  lr: 0.003358  min_lr: 0.003358  loss: 2.7421 (3.3566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7083 (0.7101)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [93]  [ 800/1251]  eta: 0:01:34  lr: 0.003355  min_lr: 0.003355  loss: 2.8592 (3.3371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6852 (0.7448)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [93]  [1000/1251]  eta: 0:00:52  lr: 0.003353  min_lr: 0.003353  loss: 3.6764 (3.3355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6696 (0.7368)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [93]  [1200/1251]  eta: 0:00:10  lr: 0.003350  min_lr: 0.003350  loss: 2.6740 (3.3213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6751 (0.7611)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 2.5932 (3.3161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5473 (0.7548)  time: 0.1711  data: 0.0010  max mem: 13273
Epoch: [93] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 2.5932 (3.3292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5473 (0.7548)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.7423 (0.7423)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 5.9226  data: 5.8155  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9228 (0.9612)  acc1: 80.4000 (78.8727)  acc5: 96.4000 (94.9091)  time: 0.7642  data: 0.6655  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2486 (1.1833)  acc1: 71.6000 (73.9619)  acc5: 89.6000 (91.9619)  time: 0.1933  data: 0.0986  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2663 (1.1890)  acc1: 72.0000 (73.9040)  acc5: 89.6000 (92.0320)  time: 0.2020  data: 0.1087  max mem: 13273
Test: Total time: 0:00:10 (0.4177 s / it)
* Acc@1 74.114 Acc@5 92.338 loss 1.183
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.16%
Epoch: [94]  [   0/1251]  eta: 1:04:17  lr: 0.003350  min_lr: 0.003350  loss: 3.0331 (3.0331)  weight_decay: 0.0500 (0.0500)  time: 3.0834  data: 2.3891  max mem: 13273
Epoch: [94]  [ 200/1251]  eta: 0:03:52  lr: 0.003347  min_lr: 0.003347  loss: 3.0594 (3.2152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6791 (0.6783)  time: 0.2102  data: 0.0004  max mem: 13273
Epoch: [94]  [ 400/1251]  eta: 0:03:02  lr: 0.003344  min_lr: 0.003344  loss: 2.6104 (3.2762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.6923)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [94]  [ 600/1251]  eta: 0:02:17  lr: 0.003342  min_lr: 0.003342  loss: 2.8077 (3.2754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5683 (0.6771)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [94]  [ 800/1251]  eta: 0:01:35  lr: 0.003339  min_lr: 0.003339  loss: 3.8711 (3.3093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6534 (0.6632)  time: 0.2149  data: 0.0005  max mem: 13273
Epoch: [94]  [1000/1251]  eta: 0:00:52  lr: 0.003336  min_lr: 0.003336  loss: 3.4988 (3.3096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8098 (0.6821)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [94]  [1200/1251]  eta: 0:00:10  lr: 0.003334  min_lr: 0.003334  loss: 3.6887 (3.3151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6379 (0.6808)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 2.6164 (3.3115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7020 (0.6826)  time: 0.1701  data: 0.0006  max mem: 13273
Epoch: [94] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 2.6164 (3.3118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7020 (0.6826)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7928 (0.7928)  acc1: 83.2000 (83.2000)  acc5: 95.6000 (95.6000)  time: 5.6726  data: 5.5254  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9835 (0.9941)  acc1: 81.2000 (79.0545)  acc5: 95.6000 (95.2000)  time: 0.7201  data: 0.6170  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2750 (1.2178)  acc1: 71.2000 (74.4381)  acc5: 91.6000 (92.4571)  time: 0.1909  data: 0.0960  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3896 (1.2280)  acc1: 71.2000 (73.9200)  acc5: 90.0000 (92.5120)  time: 0.2030  data: 0.1098  max mem: 13273
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 74.078 Acc@5 92.358 loss 1.230
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.16%
Epoch: [95]  [   0/1251]  eta: 1:05:50  lr: 0.003333  min_lr: 0.003333  loss: 2.3897 (2.3897)  weight_decay: 0.0500 (0.0500)  time: 3.1578  data: 2.7940  max mem: 13273
Epoch: [95]  [ 200/1251]  eta: 0:03:55  lr: 0.003330  min_lr: 0.003330  loss: 2.7030 (3.2606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.6846)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [95]  [ 400/1251]  eta: 0:03:02  lr: 0.003327  min_lr: 0.003327  loss: 3.0649 (3.2874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6212 (0.6961)  time: 0.2077  data: 0.0005  max mem: 13273
Epoch: [95]  [ 600/1251]  eta: 0:02:17  lr: 0.003325  min_lr: 0.003325  loss: 3.4514 (3.2917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7510 (0.7041)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [95]  [ 800/1251]  eta: 0:01:34  lr: 0.003322  min_lr: 0.003322  loss: 2.9841 (3.3034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6036 (0.6976)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [95]  [1000/1251]  eta: 0:00:52  lr: 0.003319  min_lr: 0.003319  loss: 3.6242 (3.3263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7491 (0.7092)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [95]  [1200/1251]  eta: 0:00:10  lr: 0.003317  min_lr: 0.003317  loss: 3.5335 (3.3259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6105 (0.7053)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 3.6754 (3.3339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7070)  time: 0.1706  data: 0.0008  max mem: 13273
Epoch: [95] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 3.6754 (3.3096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7070)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.8484 (0.8484)  acc1: 81.6000 (81.6000)  acc5: 96.4000 (96.4000)  time: 5.1494  data: 5.0390  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1065 (1.0780)  acc1: 79.2000 (77.8909)  acc5: 96.4000 (95.3091)  time: 0.6773  data: 0.5793  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3173 (1.2736)  acc1: 71.2000 (73.7143)  acc5: 91.6000 (92.3619)  time: 0.2050  data: 0.1103  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4239 (1.2822)  acc1: 71.2000 (73.3920)  acc5: 90.4000 (92.3200)  time: 0.2066  data: 0.1127  max mem: 13273
Test: Total time: 0:00:10 (0.4026 s / it)
* Acc@1 74.182 Acc@5 92.316 loss 1.274
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.18%
Epoch: [96]  [   0/1251]  eta: 1:03:11  lr: 0.003316  min_lr: 0.003316  loss: 3.2333 (3.2333)  weight_decay: 0.0500 (0.0500)  time: 3.0305  data: 2.7853  max mem: 13273
Epoch: [96]  [ 200/1251]  eta: 0:03:52  lr: 0.003313  min_lr: 0.003313  loss: 3.3632 (3.2680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6163 (0.7023)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [96]  [ 400/1251]  eta: 0:03:01  lr: 0.003311  min_lr: 0.003311  loss: 2.6691 (3.2711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6643 (0.6942)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [96]  [ 600/1251]  eta: 0:02:17  lr: 0.003308  min_lr: 0.003308  loss: 3.4463 (3.2756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7715 (0.7183)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [96]  [ 800/1251]  eta: 0:01:34  lr: 0.003305  min_lr: 0.003305  loss: 3.0265 (3.2740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6294 (0.7128)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [96]  [1000/1251]  eta: 0:00:52  lr: 0.003302  min_lr: 0.003302  loss: 2.9415 (3.2755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6694 (0.7029)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [96]  [1200/1251]  eta: 0:00:10  lr: 0.003300  min_lr: 0.003300  loss: 3.2842 (3.2956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6332 (0.7077)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 3.5242 (3.2972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (0.7064)  time: 0.1708  data: 0.0005  max mem: 13273
Epoch: [96] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 3.5242 (3.3046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6261 (0.7064)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8145 (0.8145)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 5.6364  data: 5.4913  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0081 (0.9974)  acc1: 78.8000 (79.3455)  acc5: 95.2000 (95.4182)  time: 0.7507  data: 0.6476  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3237 (1.2348)  acc1: 72.0000 (74.4381)  acc5: 90.8000 (92.3619)  time: 0.2053  data: 0.1103  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3766 (1.2424)  acc1: 72.0000 (74.4480)  acc5: 90.8000 (92.2880)  time: 0.2035  data: 0.1102  max mem: 13273
Test: Total time: 0:00:10 (0.4080 s / it)
* Acc@1 74.234 Acc@5 92.522 loss 1.242
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.23%
Epoch: [97]  [   0/1251]  eta: 1:00:46  lr: 0.003299  min_lr: 0.003299  loss: 2.2772 (2.2772)  weight_decay: 0.0500 (0.0500)  time: 2.9147  data: 2.6736  max mem: 13273
Epoch: [97]  [ 200/1251]  eta: 0:03:52  lr: 0.003296  min_lr: 0.003296  loss: 3.0357 (3.3291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7241 (0.7189)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [97]  [ 400/1251]  eta: 0:03:01  lr: 0.003294  min_lr: 0.003294  loss: 2.8248 (3.3532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7650 (0.7312)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [97]  [ 600/1251]  eta: 0:02:17  lr: 0.003291  min_lr: 0.003291  loss: 3.0156 (3.3397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.7166)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [97]  [ 800/1251]  eta: 0:01:34  lr: 0.003288  min_lr: 0.003288  loss: 2.7812 (3.3351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6848 (0.7072)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [97]  [1000/1251]  eta: 0:00:52  lr: 0.003285  min_lr: 0.003285  loss: 3.2881 (3.3260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7093)  time: 0.2131  data: 0.0005  max mem: 13273
Epoch: [97]  [1200/1251]  eta: 0:00:10  lr: 0.003283  min_lr: 0.003283  loss: 3.8628 (3.3108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.7027)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.4880 (3.3118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6183 (0.6986)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [97] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.4880 (3.3009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6183 (0.6986)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8562 (0.8562)  acc1: 82.4000 (82.4000)  acc5: 96.4000 (96.4000)  time: 5.3416  data: 5.2347  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0258 (1.0544)  acc1: 79.2000 (78.8727)  acc5: 96.0000 (95.5636)  time: 0.7533  data: 0.6538  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3662 (1.2635)  acc1: 71.2000 (74.8000)  acc5: 92.0000 (92.8000)  time: 0.2393  data: 0.1447  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4112 (1.2782)  acc1: 71.6000 (74.5600)  acc5: 91.6000 (92.6720)  time: 0.2379  data: 0.1446  max mem: 13273
Test: Total time: 0:00:10 (0.4223 s / it)
* Acc@1 74.318 Acc@5 92.414 loss 1.278
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.32%
Epoch: [98]  [   0/1251]  eta: 1:01:39  lr: 0.003282  min_lr: 0.003282  loss: 4.4558 (4.4558)  weight_decay: 0.0500 (0.0500)  time: 2.9570  data: 2.7329  max mem: 13273
Epoch: [98]  [ 200/1251]  eta: 0:03:52  lr: 0.003279  min_lr: 0.003279  loss: 2.4834 (3.2639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (0.7658)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [98]  [ 400/1251]  eta: 0:03:01  lr: 0.003276  min_lr: 0.003276  loss: 3.1756 (3.2864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7150 (0.7392)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [98]  [ 600/1251]  eta: 0:02:17  lr: 0.003274  min_lr: 0.003274  loss: 3.4620 (3.2441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6796 (0.7212)  time: 0.2033  data: 0.0004  max mem: 13273
Epoch: [98]  [ 800/1251]  eta: 0:01:34  lr: 0.003271  min_lr: 0.003271  loss: 3.0579 (3.2329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.7192)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [98]  [1000/1251]  eta: 0:00:52  lr: 0.003268  min_lr: 0.003268  loss: 2.9474 (3.2328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6612 (0.7143)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [98]  [1200/1251]  eta: 0:00:10  lr: 0.003265  min_lr: 0.003265  loss: 3.7263 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6007 (0.7085)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 3.4369 (3.2524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6942 (0.7105)  time: 0.1708  data: 0.0008  max mem: 13273
Epoch: [98] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 3.4369 (3.2876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6942 (0.7105)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7663 (0.7663)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.5732  data: 5.4650  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0440 (1.0252)  acc1: 79.2000 (79.3818)  acc5: 95.6000 (95.2000)  time: 0.7158  data: 0.6170  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2351 (1.2270)  acc1: 73.2000 (74.6667)  acc5: 92.0000 (92.3810)  time: 0.1859  data: 0.0919  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3695 (1.2344)  acc1: 72.4000 (74.3040)  acc5: 91.2000 (92.4160)  time: 0.1860  data: 0.0934  max mem: 13273
Test: Total time: 0:00:09 (0.3908 s / it)
* Acc@1 74.148 Acc@5 92.320 loss 1.231
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.32%
Epoch: [99]  [   0/1251]  eta: 1:03:32  lr: 0.003265  min_lr: 0.003265  loss: 3.9198 (3.9198)  weight_decay: 0.0500 (0.0500)  time: 3.0476  data: 1.7110  max mem: 13273
Epoch: [99]  [ 200/1251]  eta: 0:03:54  lr: 0.003262  min_lr: 0.003262  loss: 3.4355 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6653 (0.7090)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [99]  [ 400/1251]  eta: 0:03:02  lr: 0.003259  min_lr: 0.003259  loss: 2.7849 (3.2795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (0.7077)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [99]  [ 600/1251]  eta: 0:02:17  lr: 0.003256  min_lr: 0.003256  loss: 3.4959 (3.2612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6343 (0.6943)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [99]  [ 800/1251]  eta: 0:01:34  lr: 0.003253  min_lr: 0.003253  loss: 3.1432 (3.2865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7659 (0.7077)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [99]  [1000/1251]  eta: 0:00:52  lr: 0.003251  min_lr: 0.003251  loss: 2.7413 (3.2778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6448 (0.7058)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [99]  [1200/1251]  eta: 0:00:10  lr: 0.003248  min_lr: 0.003248  loss: 3.3159 (3.2920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5457 (0.6968)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 3.1537 (3.2965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6266 (0.6977)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [99] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 3.1537 (3.2994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6266 (0.6977)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.9122 (0.9122)  acc1: 84.4000 (84.4000)  acc5: 96.0000 (96.0000)  time: 5.7472  data: 5.6167  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.1661 (1.0964)  acc1: 78.8000 (78.4364)  acc5: 95.6000 (95.2364)  time: 0.7068  data: 0.6052  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2977 (1.2893)  acc1: 71.6000 (73.8857)  acc5: 91.2000 (92.2857)  time: 0.1862  data: 0.0916  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3965 (1.2958)  acc1: 71.6000 (73.4880)  acc5: 90.4000 (92.2560)  time: 0.1847  data: 0.0920  max mem: 13273
Test: Total time: 0:00:10 (0.4006 s / it)
* Acc@1 74.112 Acc@5 92.348 loss 1.288
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.32%
Epoch: [100]  [   0/1251]  eta: 1:04:59  lr: 0.003247  min_lr: 0.003247  loss: 2.4142 (2.4142)  weight_decay: 0.0500 (0.0500)  time: 3.1170  data: 1.6674  max mem: 13273
Epoch: [100]  [ 200/1251]  eta: 0:03:53  lr: 0.003244  min_lr: 0.003244  loss: 3.9206 (3.2934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7140 (0.7055)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [100]  [ 400/1251]  eta: 0:03:01  lr: 0.003242  min_lr: 0.003242  loss: 2.7288 (3.2811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7345 (0.7179)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [100]  [ 600/1251]  eta: 0:02:17  lr: 0.003239  min_lr: 0.003239  loss: 3.5591 (3.2843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6417 (0.7101)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [100]  [ 800/1251]  eta: 0:01:34  lr: 0.003236  min_lr: 0.003236  loss: 3.3816 (3.2983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6497 (0.7103)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [100]  [1000/1251]  eta: 0:00:52  lr: 0.003233  min_lr: 0.003233  loss: 2.9539 (3.3041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7275 (0.7104)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [100]  [1200/1251]  eta: 0:00:10  lr: 0.003230  min_lr: 0.003230  loss: 2.7780 (3.3185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.7144)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 3.0386 (3.3138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.7098)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [100] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 3.0386 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5784 (0.7098)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7428 (0.7428)  acc1: 85.6000 (85.6000)  acc5: 96.4000 (96.4000)  time: 5.6235  data: 5.4925  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9806 (0.9568)  acc1: 80.8000 (79.7818)  acc5: 96.0000 (95.0909)  time: 0.7566  data: 0.6551  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2012 (1.1772)  acc1: 72.8000 (75.2000)  acc5: 91.2000 (92.5714)  time: 0.2052  data: 0.1107  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3222 (1.1904)  acc1: 72.4000 (74.6880)  acc5: 90.4000 (92.4960)  time: 0.2046  data: 0.1118  max mem: 13273
Test: Total time: 0:00:10 (0.4079 s / it)
* Acc@1 74.696 Acc@5 92.578 loss 1.183
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.70%
Epoch: [101]  [   0/1251]  eta: 0:56:19  lr: 0.003230  min_lr: 0.003230  loss: 2.3091 (2.3091)  weight_decay: 0.0500 (0.0500)  time: 2.7012  data: 2.4323  max mem: 13273
Epoch: [101]  [ 200/1251]  eta: 0:03:54  lr: 0.003227  min_lr: 0.003227  loss: 2.5737 (3.2295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6199 (0.6956)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [101]  [ 400/1251]  eta: 0:03:02  lr: 0.003224  min_lr: 0.003224  loss: 3.1041 (3.2147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6944 (0.7146)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [101]  [ 600/1251]  eta: 0:02:17  lr: 0.003221  min_lr: 0.003221  loss: 3.3635 (3.2339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6984 (0.7208)  time: 0.2031  data: 0.0004  max mem: 13273
Epoch: [101]  [ 800/1251]  eta: 0:01:34  lr: 0.003218  min_lr: 0.003218  loss: 3.4467 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7131)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [101]  [1000/1251]  eta: 0:00:52  lr: 0.003215  min_lr: 0.003215  loss: 3.3274 (3.2730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6910 (0.7080)  time: 0.2077  data: 0.0005  max mem: 13273
Epoch: [101]  [1200/1251]  eta: 0:00:10  lr: 0.003212  min_lr: 0.003212  loss: 2.8257 (3.2701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6996 (0.7120)  time: 0.2072  data: 0.0006  max mem: 13273
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 3.8770 (3.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (0.7099)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [101] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 3.8770 (3.2885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (0.7099)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.8685 (0.8685)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 3.2622  data: 3.1369  max mem: 13273
Test:  [10/25]  eta: 0:00:08  loss: 1.0447 (1.0914)  acc1: 79.6000 (78.8000)  acc5: 96.0000 (95.1636)  time: 0.5935  data: 0.4991  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3099 (1.2992)  acc1: 72.4000 (73.8095)  acc5: 91.2000 (92.2476)  time: 0.2722  data: 0.1813  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4381 (1.3080)  acc1: 71.2000 (73.6480)  acc5: 90.8000 (92.1280)  time: 0.2326  data: 0.1422  max mem: 13273
Test: Total time: 0:00:09 (0.3990 s / it)
* Acc@1 74.016 Acc@5 92.364 loss 1.306
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.70%
Epoch: [102]  [   0/1251]  eta: 1:02:27  lr: 0.003212  min_lr: 0.003212  loss: 2.5504 (2.5504)  weight_decay: 0.0500 (0.0500)  time: 2.9955  data: 2.7195  max mem: 13273
Epoch: [102]  [ 200/1251]  eta: 0:03:52  lr: 0.003209  min_lr: 0.003209  loss: 3.1601 (3.2454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7930 (0.7441)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [102]  [ 400/1251]  eta: 0:03:01  lr: 0.003206  min_lr: 0.003206  loss: 2.9621 (3.2711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6674 (0.7326)  time: 0.2039  data: 0.0003  max mem: 13273
Epoch: [102]  [ 600/1251]  eta: 0:02:17  lr: 0.003203  min_lr: 0.003203  loss: 3.2444 (3.3106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6270 (0.7258)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [102]  [ 800/1251]  eta: 0:01:34  lr: 0.003200  min_lr: 0.003200  loss: 4.0152 (3.3078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (0.7209)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [102]  [1000/1251]  eta: 0:00:52  lr: 0.003197  min_lr: 0.003197  loss: 2.7033 (3.3033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6465 (0.7178)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [102]  [1200/1251]  eta: 0:00:10  lr: 0.003195  min_lr: 0.003195  loss: 2.9899 (3.3045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8532 (0.7225)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 2.8024 (3.3021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6808 (0.7207)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [102] Total time: 0:04:20 (0.2079 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 2.8024 (3.2938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6808 (0.7207)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.8120 (0.8120)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.7822  data: 5.6760  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0832 (1.0574)  acc1: 80.8000 (78.7273)  acc5: 95.6000 (95.2364)  time: 0.7043  data: 0.6053  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3283 (1.2547)  acc1: 71.6000 (74.2476)  acc5: 92.4000 (92.4381)  time: 0.1832  data: 0.0883  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3878 (1.2662)  acc1: 72.0000 (73.9680)  acc5: 90.8000 (92.3680)  time: 0.1939  data: 0.1001  max mem: 13273
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 74.340 Acc@5 92.500 loss 1.267
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.70%
Epoch: [103]  [   0/1251]  eta: 1:05:29  lr: 0.003194  min_lr: 0.003194  loss: 4.2795 (4.2795)  weight_decay: 0.0500 (0.0500)  time: 3.1414  data: 2.5743  max mem: 13273
Epoch: [103]  [ 200/1251]  eta: 0:03:52  lr: 0.003191  min_lr: 0.003191  loss: 3.4736 (3.3542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6823 (0.6978)  time: 0.2048  data: 0.0006  max mem: 13273
Epoch: [103]  [ 400/1251]  eta: 0:03:02  lr: 0.003188  min_lr: 0.003188  loss: 3.4863 (3.3242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6608 (0.6798)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [103]  [ 600/1251]  eta: 0:02:17  lr: 0.003185  min_lr: 0.003185  loss: 2.8252 (3.3110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (0.6806)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [103]  [ 800/1251]  eta: 0:01:34  lr: 0.003182  min_lr: 0.003182  loss: 2.4882 (3.3109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8151 (0.7055)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [103]  [1000/1251]  eta: 0:00:52  lr: 0.003179  min_lr: 0.003179  loss: 3.4434 (3.3136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6548 (0.7072)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [103]  [1200/1251]  eta: 0:00:10  lr: 0.003176  min_lr: 0.003176  loss: 2.7528 (3.3145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (0.7083)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 2.7396 (3.3103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6610 (0.7071)  time: 0.1755  data: 0.0006  max mem: 13273
Epoch: [103] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 2.7396 (3.2839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6610 (0.7071)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8543 (0.8543)  acc1: 83.6000 (83.6000)  acc5: 96.0000 (96.0000)  time: 5.6532  data: 5.5436  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0540 (1.0436)  acc1: 80.4000 (79.2364)  acc5: 94.8000 (94.7273)  time: 0.7050  data: 0.6062  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2572 (1.2301)  acc1: 72.8000 (74.2667)  acc5: 90.4000 (91.9810)  time: 0.1883  data: 0.0933  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3144 (1.2408)  acc1: 72.8000 (73.8400)  acc5: 90.4000 (91.9520)  time: 0.2006  data: 0.1070  max mem: 13273
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 74.250 Acc@5 92.528 loss 1.232
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.70%
Epoch: [104]  [   0/1251]  eta: 1:06:08  lr: 0.003176  min_lr: 0.003176  loss: 2.4999 (2.4999)  weight_decay: 0.0500 (0.0500)  time: 3.1719  data: 2.4081  max mem: 13273
Epoch: [104]  [ 200/1251]  eta: 0:03:53  lr: 0.003173  min_lr: 0.003173  loss: 3.2050 (3.2678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.7018)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [104]  [ 400/1251]  eta: 0:03:02  lr: 0.003170  min_lr: 0.003170  loss: 3.8407 (3.2705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.7143)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [104]  [ 600/1251]  eta: 0:02:18  lr: 0.003167  min_lr: 0.003167  loss: 2.8728 (3.2973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6600 (0.7043)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [104]  [ 800/1251]  eta: 0:01:35  lr: 0.003164  min_lr: 0.003164  loss: 3.3829 (3.3136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6799 (0.6999)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [104]  [1000/1251]  eta: 0:00:52  lr: 0.003161  min_lr: 0.003161  loss: 3.4560 (3.3239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8101 (0.7095)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [104]  [1200/1251]  eta: 0:00:10  lr: 0.003158  min_lr: 0.003158  loss: 3.1940 (3.3180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6590 (0.7094)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 2.5934 (3.3102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7107)  time: 0.1704  data: 0.0011  max mem: 13273
Epoch: [104] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 2.5934 (3.2819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7107)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7369 (0.7369)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 5.7133  data: 5.6063  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9968 (0.9674)  acc1: 80.0000 (79.1636)  acc5: 96.8000 (95.4546)  time: 0.7058  data: 0.6121  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2158 (1.1513)  acc1: 73.6000 (75.2571)  acc5: 91.6000 (92.8571)  time: 0.1841  data: 0.0927  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2965 (1.1717)  acc1: 73.2000 (74.5120)  acc5: 91.2000 (92.7680)  time: 0.2033  data: 0.1127  max mem: 13273
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 74.870 Acc@5 92.764 loss 1.167
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.87%
Epoch: [105]  [   0/1251]  eta: 1:04:14  lr: 0.003158  min_lr: 0.003158  loss: 2.6130 (2.6130)  weight_decay: 0.0500 (0.0500)  time: 3.0814  data: 2.8259  max mem: 13273
Epoch: [105]  [ 200/1251]  eta: 0:03:51  lr: 0.003155  min_lr: 0.003155  loss: 2.8512 (3.2617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7122 (0.7694)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [105]  [ 400/1251]  eta: 0:03:01  lr: 0.003152  min_lr: 0.003152  loss: 3.5084 (3.2972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.7426)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [105]  [ 600/1251]  eta: 0:02:16  lr: 0.003149  min_lr: 0.003149  loss: 3.0281 (3.2971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7068 (0.7287)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [105]  [ 800/1251]  eta: 0:01:34  lr: 0.003146  min_lr: 0.003146  loss: 3.0348 (3.2824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.7252)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [105]  [1000/1251]  eta: 0:00:52  lr: 0.003143  min_lr: 0.003143  loss: 2.5777 (3.2966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (0.7322)  time: 0.2098  data: 0.0006  max mem: 13273
Epoch: [105]  [1200/1251]  eta: 0:00:10  lr: 0.003140  min_lr: 0.003140  loss: 3.8478 (3.2840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6758 (0.7294)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 3.1479 (3.2795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5743 (0.7263)  time: 0.1709  data: 0.0007  max mem: 13273
Epoch: [105] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 3.1479 (3.2865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5743 (0.7263)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8603 (0.8603)  acc1: 83.2000 (83.2000)  acc5: 95.6000 (95.6000)  time: 5.7434  data: 5.6380  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0089 (1.0238)  acc1: 79.2000 (79.0182)  acc5: 95.6000 (95.4182)  time: 0.7693  data: 0.6734  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2957 (1.2402)  acc1: 73.2000 (74.4571)  acc5: 92.0000 (92.6667)  time: 0.2105  data: 0.1168  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4172 (1.2537)  acc1: 70.4000 (74.1120)  acc5: 90.8000 (92.5760)  time: 0.2103  data: 0.1167  max mem: 13273
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 74.146 Acc@5 92.446 loss 1.261
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.87%
Epoch: [106]  [   0/1251]  eta: 1:01:23  lr: 0.003139  min_lr: 0.003139  loss: 2.5479 (2.5479)  weight_decay: 0.0500 (0.0500)  time: 2.9442  data: 2.3960  max mem: 13273
Epoch: [106]  [ 200/1251]  eta: 0:03:53  lr: 0.003136  min_lr: 0.003136  loss: 3.3339 (3.3262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (0.8018)  time: 0.2037  data: 0.0006  max mem: 13273
Epoch: [106]  [ 400/1251]  eta: 0:03:02  lr: 0.003133  min_lr: 0.003133  loss: 3.1546 (3.2676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6729 (0.7608)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [106]  [ 600/1251]  eta: 0:02:17  lr: 0.003130  min_lr: 0.003130  loss: 2.9617 (3.2691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6492 (0.7325)  time: 0.2069  data: 0.0006  max mem: 13273
Epoch: [106]  [ 800/1251]  eta: 0:01:34  lr: 0.003127  min_lr: 0.003127  loss: 2.9289 (3.2722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5881 (0.7388)  time: 0.2076  data: 0.0006  max mem: 13273
Epoch: [106]  [1000/1251]  eta: 0:00:52  lr: 0.003124  min_lr: 0.003124  loss: 2.9762 (3.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7963 (0.7343)  time: 0.2057  data: 0.0007  max mem: 13273
Epoch: [106]  [1200/1251]  eta: 0:00:10  lr: 0.003121  min_lr: 0.003121  loss: 3.5943 (3.2878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7690 (0.7399)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 2.5030 (3.2801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (0.7386)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [106] Total time: 0:04:21 (0.2086 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 2.5030 (3.2748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (0.7386)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6824 (0.6824)  acc1: 84.0000 (84.0000)  acc5: 98.4000 (98.4000)  time: 5.6180  data: 5.4869  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9646 (0.9272)  acc1: 77.6000 (78.4727)  acc5: 95.6000 (95.7455)  time: 0.6730  data: 0.5714  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1882 (1.1234)  acc1: 72.0000 (74.4571)  acc5: 92.4000 (92.9905)  time: 0.1749  data: 0.0796  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2503 (1.1320)  acc1: 70.0000 (74.2880)  acc5: 91.2000 (92.9280)  time: 0.2022  data: 0.1086  max mem: 13273
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 74.778 Acc@5 92.734 loss 1.137
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.87%
Epoch: [107]  [   0/1251]  eta: 1:00:17  lr: 0.003121  min_lr: 0.003121  loss: 2.4162 (2.4162)  weight_decay: 0.0500 (0.0500)  time: 2.8914  data: 2.0473  max mem: 13273
Epoch: [107]  [ 200/1251]  eta: 0:03:53  lr: 0.003118  min_lr: 0.003118  loss: 3.6966 (3.2211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7630 (0.7659)  time: 0.2063  data: 0.0007  max mem: 13273
Epoch: [107]  [ 400/1251]  eta: 0:03:02  lr: 0.003115  min_lr: 0.003115  loss: 2.8422 (3.2170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7069 (0.7159)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [107]  [ 600/1251]  eta: 0:02:17  lr: 0.003112  min_lr: 0.003112  loss: 3.0250 (3.2321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6633 (0.7030)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [107]  [ 800/1251]  eta: 0:01:34  lr: 0.003109  min_lr: 0.003109  loss: 3.0454 (3.2556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6986 (0.7038)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [107]  [1000/1251]  eta: 0:00:52  lr: 0.003106  min_lr: 0.003106  loss: 3.3446 (3.2583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7546 (0.7088)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [107]  [1200/1251]  eta: 0:00:10  lr: 0.003103  min_lr: 0.003103  loss: 2.9702 (3.2686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.7062)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 2.6161 (3.2704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6642 (0.7095)  time: 0.1756  data: 0.0005  max mem: 13273
Epoch: [107] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 2.6161 (3.2847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6642 (0.7095)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7419 (0.7419)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 5.4701  data: 5.3649  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9907 (0.9621)  acc1: 80.8000 (79.3091)  acc5: 96.0000 (95.5273)  time: 0.7272  data: 0.6313  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2198 (1.1811)  acc1: 73.2000 (74.7429)  acc5: 92.0000 (92.6667)  time: 0.2059  data: 0.1134  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3115 (1.1979)  acc1: 72.4000 (74.3840)  acc5: 91.6000 (92.4960)  time: 0.2075  data: 0.1133  max mem: 13273
Test: Total time: 0:00:10 (0.4026 s / it)
* Acc@1 74.460 Acc@5 92.676 loss 1.191
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.87%
Epoch: [108]  [   0/1251]  eta: 1:07:12  lr: 0.003102  min_lr: 0.003102  loss: 3.4472 (3.4472)  weight_decay: 0.0500 (0.0500)  time: 3.2234  data: 1.5374  max mem: 13273
Epoch: [108]  [ 200/1251]  eta: 0:03:53  lr: 0.003099  min_lr: 0.003099  loss: 3.5868 (3.2998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6023 (0.6569)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [108]  [ 400/1251]  eta: 0:03:02  lr: 0.003096  min_lr: 0.003096  loss: 3.7984 (3.3226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.7024)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [108]  [ 600/1251]  eta: 0:02:17  lr: 0.003093  min_lr: 0.003093  loss: 3.3125 (3.2922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.7060)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [108]  [ 800/1251]  eta: 0:01:34  lr: 0.003090  min_lr: 0.003090  loss: 2.6884 (3.3044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6601 (0.7042)  time: 0.2085  data: 0.0006  max mem: 13273
Epoch: [108]  [1000/1251]  eta: 0:00:52  lr: 0.003087  min_lr: 0.003087  loss: 3.4454 (3.2975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6052 (0.7104)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [108]  [1200/1251]  eta: 0:00:10  lr: 0.003084  min_lr: 0.003084  loss: 3.2016 (3.3114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6949 (0.7018)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 2.6124 (3.3091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (0.7043)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [108] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 2.6124 (3.2652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (0.7043)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7673 (0.7673)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.6068  data: 5.5016  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0640 (1.0007)  acc1: 78.8000 (79.0909)  acc5: 95.2000 (95.3455)  time: 0.7520  data: 0.6538  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2181 (1.1986)  acc1: 71.2000 (74.8952)  acc5: 92.4000 (92.6476)  time: 0.2071  data: 0.1132  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3556 (1.2115)  acc1: 70.4000 (74.4480)  acc5: 90.0000 (92.4960)  time: 0.2067  data: 0.1139  max mem: 13273
Test: Total time: 0:00:10 (0.4113 s / it)
* Acc@1 74.746 Acc@5 92.454 loss 1.203
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.87%
Epoch: [109]  [   0/1251]  eta: 1:04:43  lr: 0.003083  min_lr: 0.003083  loss: 2.5358 (2.5358)  weight_decay: 0.0500 (0.0500)  time: 3.1046  data: 2.5149  max mem: 13273
Epoch: [109]  [ 200/1251]  eta: 0:03:52  lr: 0.003080  min_lr: 0.003080  loss: 3.2862 (3.2782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6895 (0.7660)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [109]  [ 400/1251]  eta: 0:03:01  lr: 0.003077  min_lr: 0.003077  loss: 3.2498 (3.2683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6760 (0.7237)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [109]  [ 600/1251]  eta: 0:02:17  lr: 0.003074  min_lr: 0.003074  loss: 2.6539 (3.2733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7003 (0.7310)  time: 0.2077  data: 0.0005  max mem: 13273
Epoch: [109]  [ 800/1251]  eta: 0:01:34  lr: 0.003071  min_lr: 0.003071  loss: 2.7315 (3.2582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6084 (0.7140)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [109]  [1000/1251]  eta: 0:00:52  lr: 0.003068  min_lr: 0.003068  loss: 2.7765 (3.2574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6779 (0.7195)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [109]  [1200/1251]  eta: 0:00:10  lr: 0.003065  min_lr: 0.003065  loss: 3.3518 (3.2601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7168 (0.7185)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 3.1705 (3.2649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7526 (0.7258)  time: 0.1711  data: 0.0008  max mem: 13273
Epoch: [109] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 3.1705 (3.2698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7526 (0.7258)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7000 (0.7000)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.6053  data: 5.4975  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9397 (0.9495)  acc1: 78.4000 (79.4182)  acc5: 95.6000 (95.4909)  time: 0.7497  data: 0.6557  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2258 (1.1846)  acc1: 71.6000 (75.1048)  acc5: 92.0000 (92.7619)  time: 0.2142  data: 0.1227  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.4029 (1.1929)  acc1: 71.6000 (74.7680)  acc5: 90.4000 (92.7200)  time: 0.2125  data: 0.1221  max mem: 13273
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 74.840 Acc@5 92.790 loss 1.187
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.87%
Epoch: [110]  [   0/1251]  eta: 1:01:14  lr: 0.003064  min_lr: 0.003064  loss: 2.5942 (2.5942)  weight_decay: 0.0500 (0.0500)  time: 2.9373  data: 1.7913  max mem: 13273
Epoch: [110]  [ 200/1251]  eta: 0:03:53  lr: 0.003061  min_lr: 0.003061  loss: 3.7697 (3.2900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7091)  time: 0.2119  data: 0.0003  max mem: 13273
Epoch: [110]  [ 400/1251]  eta: 0:03:02  lr: 0.003058  min_lr: 0.003058  loss: 3.3787 (3.2864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6575 (0.6898)  time: 0.2041  data: 0.0003  max mem: 13273
Epoch: [110]  [ 600/1251]  eta: 0:02:17  lr: 0.003055  min_lr: 0.003055  loss: 2.6414 (3.2688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6720 (0.6874)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [110]  [ 800/1251]  eta: 0:01:34  lr: 0.003052  min_lr: 0.003052  loss: 2.7713 (3.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.6919)  time: 0.2077  data: 0.0006  max mem: 13273
Epoch: [110]  [1000/1251]  eta: 0:00:52  lr: 0.003049  min_lr: 0.003049  loss: 2.7310 (3.2677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6718 (0.7049)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [110]  [1200/1251]  eta: 0:00:10  lr: 0.003046  min_lr: 0.003046  loss: 3.1917 (3.2758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.7019)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 2.6049 (3.2772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6465 (0.6992)  time: 0.1710  data: 0.0008  max mem: 13273
Epoch: [110] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 2.6049 (3.2635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6465 (0.6992)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7262 (0.7262)  acc1: 84.0000 (84.0000)  acc5: 96.8000 (96.8000)  time: 5.2233  data: 5.1174  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0033 (0.9462)  acc1: 78.0000 (79.5636)  acc5: 95.6000 (95.3091)  time: 0.7165  data: 0.6208  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1757 (1.1609)  acc1: 74.0000 (75.3905)  acc5: 92.0000 (92.8381)  time: 0.2068  data: 0.1089  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3184 (1.1742)  acc1: 72.8000 (74.9440)  acc5: 91.6000 (92.8160)  time: 0.2057  data: 0.1088  max mem: 13273
Test: Total time: 0:00:09 (0.3920 s / it)
* Acc@1 75.072 Acc@5 92.882 loss 1.173
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.07%
Epoch: [111]  [   0/1251]  eta: 1:07:23  lr: 0.003045  min_lr: 0.003045  loss: 2.6827 (2.6827)  weight_decay: 0.0500 (0.0500)  time: 3.2320  data: 2.9968  max mem: 13273
Epoch: [111]  [ 200/1251]  eta: 0:03:52  lr: 0.003042  min_lr: 0.003042  loss: 3.2031 (3.2752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6494 (0.7004)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [111]  [ 400/1251]  eta: 0:03:02  lr: 0.003039  min_lr: 0.003039  loss: 2.8770 (3.2455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5781 (0.6912)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [111]  [ 600/1251]  eta: 0:02:18  lr: 0.003036  min_lr: 0.003036  loss: 3.4837 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.6941)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [111]  [ 800/1251]  eta: 0:01:35  lr: 0.003033  min_lr: 0.003033  loss: 2.6476 (3.2587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6521 (0.7098)  time: 0.2073  data: 0.0004  max mem: 13273
Epoch: [111]  [1000/1251]  eta: 0:00:52  lr: 0.003030  min_lr: 0.003030  loss: 2.9735 (3.2534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.7107)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [111]  [1200/1251]  eta: 0:00:10  lr: 0.003027  min_lr: 0.003027  loss: 3.8677 (3.2595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7116)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 2.7950 (3.2525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6547 (0.7107)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [111] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 2.7950 (3.2629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6547 (0.7107)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7979 (0.7979)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.5859  data: 5.4374  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9802 (0.9694)  acc1: 79.2000 (79.2000)  acc5: 96.0000 (95.7818)  time: 0.7643  data: 0.6615  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2095 (1.1802)  acc1: 73.2000 (75.1238)  acc5: 92.0000 (92.7048)  time: 0.2136  data: 0.1193  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3593 (1.1901)  acc1: 73.2000 (74.6240)  acc5: 90.0000 (92.4800)  time: 0.2119  data: 0.1193  max mem: 13273
Test: Total time: 0:00:10 (0.4126 s / it)
* Acc@1 74.996 Acc@5 92.668 loss 1.181
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.07%
Epoch: [112]  [   0/1251]  eta: 1:07:00  lr: 0.003026  min_lr: 0.003026  loss: 4.3130 (4.3130)  weight_decay: 0.0500 (0.0500)  time: 3.2137  data: 2.5070  max mem: 13273
Epoch: [112]  [ 200/1251]  eta: 0:03:53  lr: 0.003023  min_lr: 0.003023  loss: 2.9142 (3.2801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6829 (0.7196)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [112]  [ 400/1251]  eta: 0:03:01  lr: 0.003020  min_lr: 0.003020  loss: 2.7882 (3.2702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (0.7312)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [112]  [ 600/1251]  eta: 0:02:17  lr: 0.003017  min_lr: 0.003017  loss: 2.9141 (3.2405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7560 (0.7329)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [112]  [ 800/1251]  eta: 0:01:34  lr: 0.003014  min_lr: 0.003014  loss: 3.2996 (3.2324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6518 (0.7288)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [112]  [1000/1251]  eta: 0:00:52  lr: 0.003011  min_lr: 0.003011  loss: 3.4722 (3.2601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6291 (0.7247)  time: 0.2052  data: 0.0006  max mem: 13273
Epoch: [112]  [1200/1251]  eta: 0:00:10  lr: 0.003007  min_lr: 0.003007  loss: 2.8265 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6678 (0.7236)  time: 0.2046  data: 0.0003  max mem: 13273
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 3.0041 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7590 (0.7232)  time: 0.1710  data: 0.0006  max mem: 13273
Epoch: [112] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 3.0041 (3.2621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7590 (0.7232)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7608 (0.7608)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.7249  data: 5.5780  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9807 (0.9701)  acc1: 79.2000 (79.6727)  acc5: 95.2000 (95.2727)  time: 0.7420  data: 0.6391  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2067 (1.1865)  acc1: 71.6000 (75.2000)  acc5: 92.4000 (92.6476)  time: 0.1969  data: 0.1024  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2993 (1.1882)  acc1: 71.6000 (75.0880)  acc5: 91.2000 (92.6720)  time: 0.1951  data: 0.1024  max mem: 13273
Test: Total time: 0:00:10 (0.4042 s / it)
* Acc@1 74.856 Acc@5 92.852 loss 1.190
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 75.07%
Epoch: [113]  [   0/1251]  eta: 1:07:27  lr: 0.003007  min_lr: 0.003007  loss: 3.1927 (3.1927)  weight_decay: 0.0500 (0.0500)  time: 3.2358  data: 1.7552  max mem: 13273
Epoch: [113]  [ 200/1251]  eta: 0:03:52  lr: 0.003004  min_lr: 0.003004  loss: 3.1728 (3.2436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.6920)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [113]  [ 400/1251]  eta: 0:03:01  lr: 0.003000  min_lr: 0.003000  loss: 3.8436 (3.2532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6185 (0.6811)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [113]  [ 600/1251]  eta: 0:02:17  lr: 0.002997  min_lr: 0.002997  loss: 3.6118 (3.2620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6355 (0.6950)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [113]  [ 800/1251]  eta: 0:01:34  lr: 0.002994  min_lr: 0.002994  loss: 3.7195 (3.2618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6989 (0.6960)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [113]  [1000/1251]  eta: 0:00:52  lr: 0.002991  min_lr: 0.002991  loss: 2.7938 (3.2614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8268 (0.7158)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [113]  [1200/1251]  eta: 0:00:10  lr: 0.002988  min_lr: 0.002988  loss: 2.9528 (3.2650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6641 (0.7145)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 2.6975 (3.2628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.7167)  time: 0.1711  data: 0.0006  max mem: 13273
Epoch: [113] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 2.6975 (3.2640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.7167)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.8259 (0.8259)  acc1: 84.0000 (84.0000)  acc5: 96.0000 (96.0000)  time: 5.6552  data: 5.5498  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0189 (0.9877)  acc1: 78.4000 (79.4545)  acc5: 95.6000 (95.2364)  time: 0.7554  data: 0.6584  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2027 (1.1871)  acc1: 72.8000 (74.8952)  acc5: 92.4000 (92.4191)  time: 0.2179  data: 0.1246  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2320 (1.1897)  acc1: 72.4000 (74.5760)  acc5: 90.4000 (92.4160)  time: 0.2172  data: 0.1245  max mem: 13273
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 75.134 Acc@5 92.862 loss 1.177
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.13%
Epoch: [114]  [   0/1251]  eta: 1:09:44  lr: 0.002987  min_lr: 0.002987  loss: 3.1314 (3.1314)  weight_decay: 0.0500 (0.0500)  time: 3.3447  data: 3.1127  max mem: 13273
Epoch: [114]  [ 200/1251]  eta: 0:03:53  lr: 0.002984  min_lr: 0.002984  loss: 2.7764 (3.1691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7112 (0.7403)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [114]  [ 400/1251]  eta: 0:03:02  lr: 0.002981  min_lr: 0.002981  loss: 3.4974 (3.2259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (0.7106)  time: 0.2095  data: 0.0005  max mem: 13273
Epoch: [114]  [ 600/1251]  eta: 0:02:17  lr: 0.002978  min_lr: 0.002978  loss: 3.3562 (3.1971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6907 (0.7135)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [114]  [ 800/1251]  eta: 0:01:35  lr: 0.002975  min_lr: 0.002975  loss: 2.9203 (3.2088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7604 (0.7222)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [114]  [1000/1251]  eta: 0:00:52  lr: 0.002972  min_lr: 0.002972  loss: 3.9212 (3.2415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5896 (0.7085)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [114]  [1200/1251]  eta: 0:00:10  lr: 0.002968  min_lr: 0.002968  loss: 3.1820 (3.2482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6303 (0.7043)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 3.8417 (3.2462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6247 (0.7029)  time: 0.1775  data: 0.0005  max mem: 13273
Epoch: [114] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 3.8417 (3.2487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6247 (0.7029)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8161 (0.8161)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.6883  data: 5.5830  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 1.0297 (1.0280)  acc1: 82.0000 (79.8909)  acc5: 96.8000 (95.8182)  time: 0.7282  data: 0.6310  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3042 (1.2560)  acc1: 71.2000 (74.8952)  acc5: 90.8000 (92.8762)  time: 0.1900  data: 0.0966  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3830 (1.2599)  acc1: 72.8000 (74.9440)  acc5: 90.4000 (92.7360)  time: 0.2205  data: 0.1282  max mem: 13273
Test: Total time: 0:00:10 (0.4223 s / it)
* Acc@1 74.740 Acc@5 92.736 loss 1.270
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 75.13%
Epoch: [115]  [   0/1251]  eta: 1:03:29  lr: 0.002968  min_lr: 0.002968  loss: 2.4617 (2.4617)  weight_decay: 0.0500 (0.0500)  time: 3.0449  data: 2.3651  max mem: 13273
Epoch: [115]  [ 200/1251]  eta: 0:03:52  lr: 0.002965  min_lr: 0.002965  loss: 3.9490 (3.2953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7482 (0.7107)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [115]  [ 400/1251]  eta: 0:03:01  lr: 0.002961  min_lr: 0.002961  loss: 3.4455 (3.2530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6159 (0.7057)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [115]  [ 600/1251]  eta: 0:02:17  lr: 0.002958  min_lr: 0.002958  loss: 3.4879 (3.2392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6491 (0.7018)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [115]  [ 800/1251]  eta: 0:01:34  lr: 0.002955  min_lr: 0.002955  loss: 2.5567 (3.2242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8418 (0.7223)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [115]  [1000/1251]  eta: 0:00:52  lr: 0.002952  min_lr: 0.002952  loss: 2.7568 (3.2313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7588 (0.7219)  time: 0.2069  data: 0.0004  max mem: 13273
Epoch: [115]  [1200/1251]  eta: 0:00:10  lr: 0.002949  min_lr: 0.002949  loss: 2.9148 (3.2365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6811 (0.7214)  time: 0.2085  data: 0.0006  max mem: 13273
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 2.8269 (3.2312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6529 (0.7248)  time: 0.1704  data: 0.0008  max mem: 13273
Epoch: [115] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 2.8269 (3.2512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6529 (0.7248)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.7134 (0.7134)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.8032  data: 5.6978  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8725 (0.9006)  acc1: 80.0000 (79.9273)  acc5: 96.0000 (96.1818)  time: 0.7262  data: 0.6303  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1539 (1.1221)  acc1: 73.6000 (75.0667)  acc5: 92.4000 (93.0095)  time: 0.1917  data: 0.0987  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2873 (1.1327)  acc1: 72.0000 (74.6080)  acc5: 92.0000 (92.8800)  time: 0.1969  data: 0.1026  max mem: 13273
Test: Total time: 0:00:10 (0.4087 s / it)
* Acc@1 75.082 Acc@5 92.882 loss 1.131
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.13%
Epoch: [116]  [   0/1251]  eta: 1:03:40  lr: 0.002948  min_lr: 0.002948  loss: 3.9263 (3.9263)  weight_decay: 0.0500 (0.0500)  time: 3.0536  data: 2.3893  max mem: 13273
Epoch: [116]  [ 200/1251]  eta: 0:03:51  lr: 0.002945  min_lr: 0.002945  loss: 3.4397 (3.2825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6385 (0.6771)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [116]  [ 400/1251]  eta: 0:03:01  lr: 0.002942  min_lr: 0.002942  loss: 2.4984 (3.2753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8348 (0.7127)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [116]  [ 600/1251]  eta: 0:02:17  lr: 0.002938  min_lr: 0.002938  loss: 2.7305 (3.2194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6291 (0.7254)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [116]  [ 800/1251]  eta: 0:01:34  lr: 0.002935  min_lr: 0.002935  loss: 2.8398 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6299 (0.7270)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [116]  [1000/1251]  eta: 0:00:52  lr: 0.002932  min_lr: 0.002932  loss: 3.2187 (3.2311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6466 (0.7242)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [116]  [1200/1251]  eta: 0:00:10  lr: 0.002929  min_lr: 0.002929  loss: 2.7668 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.7220)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 2.7730 (3.2373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.7226)  time: 0.1702  data: 0.0010  max mem: 13273
Epoch: [116] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 2.7730 (3.2430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.7226)
Test:  [ 0/25]  eta: 0:01:23  loss: 0.7549 (0.7549)  acc1: 84.0000 (84.0000)  acc5: 96.4000 (96.4000)  time: 3.3557  data: 3.2496  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.9165 (0.9331)  acc1: 80.0000 (79.2364)  acc5: 96.0000 (95.6727)  time: 0.6063  data: 0.5073  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1635 (1.1446)  acc1: 72.0000 (74.7810)  acc5: 92.4000 (92.9333)  time: 0.2796  data: 0.1824  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2624 (1.1474)  acc1: 72.4000 (74.6560)  acc5: 91.2000 (92.9440)  time: 0.2119  data: 0.1166  max mem: 13273
Test: Total time: 0:00:10 (0.4065 s / it)
* Acc@1 75.016 Acc@5 92.890 loss 1.150
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.13%
Epoch: [117]  [   0/1251]  eta: 1:01:44  lr: 0.002928  min_lr: 0.002928  loss: 3.1752 (3.1752)  weight_decay: 0.0500 (0.0500)  time: 2.9614  data: 1.6750  max mem: 13273
Epoch: [117]  [ 200/1251]  eta: 0:03:53  lr: 0.002925  min_lr: 0.002925  loss: 2.5061 (3.1430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (inf)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [117]  [ 400/1251]  eta: 0:03:01  lr: 0.002922  min_lr: 0.002922  loss: 3.3163 (3.1981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6066 (inf)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [117]  [ 600/1251]  eta: 0:02:17  lr: 0.002919  min_lr: 0.002919  loss: 2.7149 (3.2067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7912 (inf)  time: 0.2071  data: 0.0004  max mem: 13273
Epoch: [117]  [ 800/1251]  eta: 0:01:34  lr: 0.002915  min_lr: 0.002915  loss: 2.7011 (3.2153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7881 (inf)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [117]  [1000/1251]  eta: 0:00:52  lr: 0.002912  min_lr: 0.002912  loss: 3.2303 (3.2288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (inf)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [117]  [1200/1251]  eta: 0:00:10  lr: 0.002909  min_lr: 0.002909  loss: 3.5076 (3.2326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6673 (inf)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 3.1204 (3.2332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (inf)  time: 0.1753  data: 0.0008  max mem: 13273
Epoch: [117] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 3.1204 (3.2514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (inf)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8888 (0.8888)  acc1: 83.6000 (83.6000)  acc5: 98.0000 (98.0000)  time: 5.3619  data: 5.2487  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9554 (1.0449)  acc1: 83.6000 (80.1455)  acc5: 96.4000 (95.6364)  time: 0.7184  data: 0.6236  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.3125 (1.2318)  acc1: 73.6000 (75.7143)  acc5: 92.0000 (93.1238)  time: 0.2175  data: 0.1258  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3125 (1.2381)  acc1: 73.6000 (75.2960)  acc5: 91.6000 (93.0720)  time: 0.2307  data: 0.1390  max mem: 13273
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 74.978 Acc@5 92.796 loss 1.246
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.13%
Epoch: [118]  [   0/1251]  eta: 1:06:27  lr: 0.002908  min_lr: 0.002908  loss: 3.6431 (3.6431)  weight_decay: 0.0500 (0.0500)  time: 3.1874  data: 1.7226  max mem: 13273
Epoch: [118]  [ 200/1251]  eta: 0:03:54  lr: 0.002905  min_lr: 0.002905  loss: 2.5555 (3.1671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.7634)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [118]  [ 400/1251]  eta: 0:03:02  lr: 0.002902  min_lr: 0.002902  loss: 2.6541 (3.2481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (0.7552)  time: 0.2119  data: 0.0005  max mem: 13273
Epoch: [118]  [ 600/1251]  eta: 0:02:17  lr: 0.002899  min_lr: 0.002899  loss: 3.6123 (3.2842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7293 (0.7489)  time: 0.2053  data: 0.0006  max mem: 13273
Epoch: [118]  [ 800/1251]  eta: 0:01:34  lr: 0.002895  min_lr: 0.002895  loss: 3.6137 (3.2767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6589 (0.7435)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [118]  [1000/1251]  eta: 0:00:52  lr: 0.002892  min_lr: 0.002892  loss: 3.4318 (3.2634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.7270)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [118]  [1200/1251]  eta: 0:00:10  lr: 0.002889  min_lr: 0.002889  loss: 3.2412 (3.2749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5754 (0.7211)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 3.6938 (3.2711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6001 (0.7174)  time: 0.1704  data: 0.0010  max mem: 13273
Epoch: [118] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 3.6938 (3.2456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6001 (0.7174)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7671 (0.7671)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.7227  data: 5.6166  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9778 (1.0171)  acc1: 80.4000 (79.2364)  acc5: 95.6000 (95.3455)  time: 0.7540  data: 0.6598  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2765 (1.2062)  acc1: 72.4000 (75.2381)  acc5: 92.0000 (92.7810)  time: 0.2022  data: 0.1104  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3317 (1.2138)  acc1: 72.8000 (74.9760)  acc5: 91.2000 (92.7680)  time: 0.2015  data: 0.1104  max mem: 13273
Test: Total time: 0:00:10 (0.4088 s / it)
* Acc@1 75.170 Acc@5 92.780 loss 1.211
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.17%
Epoch: [119]  [   0/1251]  eta: 1:01:34  lr: 0.002888  min_lr: 0.002888  loss: 4.8476 (4.8476)  weight_decay: 0.0500 (0.0500)  time: 2.9535  data: 2.7157  max mem: 13273
Epoch: [119]  [ 200/1251]  eta: 0:03:50  lr: 0.002885  min_lr: 0.002885  loss: 2.6092 (3.2369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6403 (0.7269)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [119]  [ 400/1251]  eta: 0:03:00  lr: 0.002882  min_lr: 0.002882  loss: 2.5414 (3.1882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6702 (0.7416)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [119]  [ 600/1251]  eta: 0:02:16  lr: 0.002879  min_lr: 0.002879  loss: 3.1543 (3.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6712 (inf)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [119]  [ 800/1251]  eta: 0:01:34  lr: 0.002875  min_lr: 0.002875  loss: 3.6478 (3.2256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6387 (inf)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [119]  [1000/1251]  eta: 0:00:52  lr: 0.002872  min_lr: 0.002872  loss: 2.8527 (3.2341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (inf)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [119]  [1200/1251]  eta: 0:00:10  lr: 0.002869  min_lr: 0.002869  loss: 2.7188 (3.2188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6802 (inf)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 3.0285 (3.2289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6401 (inf)  time: 0.1711  data: 0.0007  max mem: 13273
Epoch: [119] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 3.0285 (3.2378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6401 (inf)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7754 (0.7754)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 5.7099  data: 5.6007  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0053 (1.0047)  acc1: 80.4000 (79.8182)  acc5: 95.6000 (95.5273)  time: 0.7537  data: 0.6578  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2224 (1.2055)  acc1: 71.2000 (75.2762)  acc5: 93.2000 (92.8000)  time: 0.1992  data: 0.1062  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3198 (1.2106)  acc1: 71.2000 (75.1520)  acc5: 90.4000 (92.8000)  time: 0.1985  data: 0.1061  max mem: 13273
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 75.052 Acc@5 92.896 loss 1.211
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.17%
Epoch: [120]  [   0/1251]  eta: 1:08:39  lr: 0.002868  min_lr: 0.002868  loss: 4.3651 (4.3651)  weight_decay: 0.0500 (0.0500)  time: 3.2930  data: 3.0373  max mem: 13273
Epoch: [120]  [ 200/1251]  eta: 0:03:52  lr: 0.002865  min_lr: 0.002865  loss: 2.6259 (3.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6456 (0.7518)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [120]  [ 400/1251]  eta: 0:03:01  lr: 0.002862  min_lr: 0.002862  loss: 3.2803 (3.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.7378)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [120]  [ 600/1251]  eta: 0:02:17  lr: 0.002858  min_lr: 0.002858  loss: 3.3821 (3.2031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (0.7393)  time: 0.2084  data: 0.0005  max mem: 13273
Epoch: [120]  [ 800/1251]  eta: 0:01:34  lr: 0.002855  min_lr: 0.002855  loss: 2.6406 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.7266)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [120]  [1000/1251]  eta: 0:00:52  lr: 0.002852  min_lr: 0.002852  loss: 2.9942 (3.2313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6107 (0.7324)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [120]  [1200/1251]  eta: 0:00:10  lr: 0.002849  min_lr: 0.002849  loss: 2.7539 (3.2281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7720 (0.7489)  time: 0.2048  data: 0.0006  max mem: 13273
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 3.5469 (3.2298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.7500)  time: 0.1700  data: 0.0006  max mem: 13273
Epoch: [120] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 3.5469 (3.2425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.7500)
Test:  [ 0/25]  eta: 0:01:20  loss: 0.8436 (0.8436)  acc1: 84.4000 (84.4000)  acc5: 96.4000 (96.4000)  time: 3.2111  data: 3.1050  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.9262 (0.9615)  acc1: 81.6000 (79.4909)  acc5: 96.0000 (95.6364)  time: 0.6138  data: 0.5173  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2216 (1.1685)  acc1: 71.6000 (75.0857)  acc5: 92.4000 (92.4381)  time: 0.2939  data: 0.1981  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2216 (1.1753)  acc1: 71.6000 (74.8160)  acc5: 90.8000 (92.3680)  time: 0.2171  data: 0.1225  max mem: 13273
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 75.068 Acc@5 92.884 loss 1.169
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.17%
Epoch: [121]  [   0/1251]  eta: 1:07:06  lr: 0.002848  min_lr: 0.002848  loss: 3.3551 (3.3551)  weight_decay: 0.0500 (0.0500)  time: 3.2187  data: 2.8983  max mem: 13273
Epoch: [121]  [ 200/1251]  eta: 0:03:53  lr: 0.002845  min_lr: 0.002845  loss: 2.8645 (3.1792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7685 (0.7865)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [121]  [ 400/1251]  eta: 0:03:03  lr: 0.002841  min_lr: 0.002841  loss: 2.6257 (3.2099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6508 (0.7804)  time: 0.2154  data: 0.0005  max mem: 13273
Epoch: [121]  [ 600/1251]  eta: 0:02:18  lr: 0.002838  min_lr: 0.002838  loss: 2.7298 (3.1976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7273 (0.7577)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [121]  [ 800/1251]  eta: 0:01:35  lr: 0.002835  min_lr: 0.002835  loss: 3.6891 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.7491)  time: 0.2158  data: 0.0004  max mem: 13273
Epoch: [121]  [1000/1251]  eta: 0:00:52  lr: 0.002831  min_lr: 0.002831  loss: 2.6744 (3.2267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6627 (0.7373)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [121]  [1200/1251]  eta: 0:00:10  lr: 0.002828  min_lr: 0.002828  loss: 2.5987 (3.2102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7138 (0.7386)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 3.1177 (3.2122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.7352)  time: 0.1703  data: 0.0006  max mem: 13273
Epoch: [121] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 3.1177 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6431 (0.7352)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8057 (0.8057)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.7287  data: 5.5953  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0037 (0.9867)  acc1: 80.0000 (79.7455)  acc5: 96.4000 (96.1818)  time: 0.7929  data: 0.6897  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2608 (1.1779)  acc1: 74.4000 (75.6381)  acc5: 92.8000 (93.4667)  time: 0.2446  data: 0.1482  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2728 (1.1884)  acc1: 73.6000 (75.3760)  acc5: 92.4000 (93.3440)  time: 0.2426  data: 0.1481  max mem: 13273
Test: Total time: 0:00:11 (0.4424 s / it)
* Acc@1 75.544 Acc@5 93.054 loss 1.187
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.54%
Epoch: [122]  [   0/1251]  eta: 1:05:09  lr: 0.002827  min_lr: 0.002827  loss: 3.0400 (3.0400)  weight_decay: 0.0500 (0.0500)  time: 3.1253  data: 2.8779  max mem: 13273
Epoch: [122]  [ 200/1251]  eta: 0:03:52  lr: 0.002824  min_lr: 0.002824  loss: 2.7096 (3.2674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7575)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [122]  [ 400/1251]  eta: 0:03:01  lr: 0.002821  min_lr: 0.002821  loss: 2.7243 (3.2425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (0.7446)  time: 0.2052  data: 0.0006  max mem: 13273
Epoch: [122]  [ 600/1251]  eta: 0:02:17  lr: 0.002818  min_lr: 0.002818  loss: 3.0896 (3.2232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7584 (0.7455)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [122]  [ 800/1251]  eta: 0:01:34  lr: 0.002814  min_lr: 0.002814  loss: 3.1347 (3.2246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.7501)  time: 0.2040  data: 0.0005  max mem: 13273
Epoch: [122]  [1000/1251]  eta: 0:00:52  lr: 0.002811  min_lr: 0.002811  loss: 3.2402 (3.2415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7655 (0.7429)  time: 0.2077  data: 0.0006  max mem: 13273
Epoch: [122]  [1200/1251]  eta: 0:00:10  lr: 0.002808  min_lr: 0.002808  loss: 2.5283 (3.2532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5777 (0.7288)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 2.7907 (3.2543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7495 (0.7309)  time: 0.1702  data: 0.0005  max mem: 13273
Epoch: [122] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 2.7907 (3.2330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7495 (0.7309)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.8130 (0.8130)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 4.4876  data: 4.3770  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9914 (0.9764)  acc1: 78.8000 (79.2364)  acc5: 96.0000 (95.8182)  time: 0.7377  data: 0.6374  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2053 (1.1714)  acc1: 72.8000 (75.0857)  acc5: 92.8000 (92.8952)  time: 0.2561  data: 0.1605  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2779 (1.1752)  acc1: 71.2000 (74.9120)  acc5: 90.4000 (92.9440)  time: 0.2312  data: 0.1372  max mem: 13273
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 75.102 Acc@5 92.922 loss 1.177
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.54%
Epoch: [123]  [   0/1251]  eta: 1:08:15  lr: 0.002807  min_lr: 0.002807  loss: 3.7056 (3.7056)  weight_decay: 0.0500 (0.0500)  time: 3.2741  data: 2.4872  max mem: 13273
Epoch: [123]  [ 200/1251]  eta: 0:03:54  lr: 0.002804  min_lr: 0.002804  loss: 2.7749 (3.2667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.7379)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [123]  [ 400/1251]  eta: 0:03:02  lr: 0.002800  min_lr: 0.002800  loss: 3.3562 (3.2525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7524 (0.7194)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [123]  [ 600/1251]  eta: 0:02:17  lr: 0.002797  min_lr: 0.002797  loss: 3.4057 (3.2677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6524 (0.7241)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [123]  [ 800/1251]  eta: 0:01:34  lr: 0.002794  min_lr: 0.002794  loss: 2.7368 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.7293)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [123]  [1000/1251]  eta: 0:00:52  lr: 0.002790  min_lr: 0.002790  loss: 3.5260 (3.2544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8375 (0.7450)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [123]  [1200/1251]  eta: 0:00:10  lr: 0.002787  min_lr: 0.002787  loss: 3.6808 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6602 (0.7400)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 2.6872 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6602 (0.7378)  time: 0.1701  data: 0.0006  max mem: 13273
Epoch: [123] Total time: 0:04:21 (0.2086 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 2.6872 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6602 (0.7378)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7487 (0.7487)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.6452  data: 5.5399  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8628 (0.8873)  acc1: 82.0000 (80.8727)  acc5: 96.8000 (95.8546)  time: 0.7603  data: 0.6651  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1394 (1.0850)  acc1: 74.0000 (76.1524)  acc5: 92.0000 (93.0857)  time: 0.2041  data: 0.1110  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2440 (1.0960)  acc1: 72.8000 (75.8400)  acc5: 91.2000 (93.0400)  time: 0.2031  data: 0.1109  max mem: 13273
Test: Total time: 0:00:10 (0.4066 s / it)
* Acc@1 75.734 Acc@5 93.164 loss 1.099
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.73%
Epoch: [124]  [   0/1251]  eta: 0:58:00  lr: 0.002786  min_lr: 0.002786  loss: 3.7876 (3.7876)  weight_decay: 0.0500 (0.0500)  time: 2.7818  data: 2.5157  max mem: 13273
Epoch: [124]  [ 200/1251]  eta: 0:03:50  lr: 0.002783  min_lr: 0.002783  loss: 3.5659 (3.3306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7275 (0.7357)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [124]  [ 400/1251]  eta: 0:03:00  lr: 0.002780  min_lr: 0.002780  loss: 3.1071 (3.2898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5883 (0.7515)  time: 0.2038  data: 0.0005  max mem: 13273
Epoch: [124]  [ 600/1251]  eta: 0:02:16  lr: 0.002776  min_lr: 0.002776  loss: 3.3750 (3.2534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6472 (0.7286)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [124]  [ 800/1251]  eta: 0:01:34  lr: 0.002773  min_lr: 0.002773  loss: 2.5493 (3.2297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6317 (0.7170)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [124]  [1000/1251]  eta: 0:00:52  lr: 0.002770  min_lr: 0.002770  loss: 3.3531 (3.2348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6718 (0.7226)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [124]  [1200/1251]  eta: 0:00:10  lr: 0.002766  min_lr: 0.002766  loss: 2.7804 (3.2233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7248)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 3.4162 (3.2212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6980 (0.7226)  time: 0.1703  data: 0.0005  max mem: 13273
Epoch: [124] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 3.4162 (3.2297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6980 (0.7226)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6869 (0.6869)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.7278  data: 5.5981  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9236 (0.9229)  acc1: 80.4000 (80.2182)  acc5: 96.0000 (95.8909)  time: 0.7482  data: 0.6460  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1434 (1.1428)  acc1: 73.2000 (75.4286)  acc5: 92.8000 (92.8381)  time: 0.2129  data: 0.1173  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3188 (1.1594)  acc1: 73.2000 (75.1200)  acc5: 91.6000 (92.8640)  time: 0.2164  data: 0.1226  max mem: 13273
Test: Total time: 0:00:10 (0.4211 s / it)
* Acc@1 75.244 Acc@5 92.952 loss 1.159
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.73%
Epoch: [125]  [   0/1251]  eta: 1:10:58  lr: 0.002766  min_lr: 0.002766  loss: 2.3455 (2.3455)  weight_decay: 0.0500 (0.0500)  time: 3.4041  data: 1.7322  max mem: 13273
Epoch: [125]  [ 200/1251]  eta: 0:03:55  lr: 0.002762  min_lr: 0.002762  loss: 3.3123 (3.2190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7183 (0.7193)  time: 0.2043  data: 0.0003  max mem: 13273
Epoch: [125]  [ 400/1251]  eta: 0:03:02  lr: 0.002759  min_lr: 0.002759  loss: 2.8255 (3.1481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6227 (0.6988)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [125]  [ 600/1251]  eta: 0:02:17  lr: 0.002756  min_lr: 0.002756  loss: 3.1625 (3.1603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6410 (0.6929)  time: 0.2032  data: 0.0005  max mem: 13273
Epoch: [125]  [ 800/1251]  eta: 0:01:34  lr: 0.002752  min_lr: 0.002752  loss: 3.7860 (3.1693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7027)  time: 0.2060  data: 0.0007  max mem: 13273
Epoch: [125]  [1000/1251]  eta: 0:00:52  lr: 0.002749  min_lr: 0.002749  loss: 2.8514 (3.1713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6412 (0.7105)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [125]  [1200/1251]  eta: 0:00:10  lr: 0.002746  min_lr: 0.002746  loss: 2.8510 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6640 (0.7106)  time: 0.2092  data: 0.0005  max mem: 13273
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 2.8539 (3.1834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6475 (0.7085)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [125] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 2.8539 (3.2189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6475 (0.7085)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6931 (0.6931)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.4508  data: 5.3450  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8745 (0.8898)  acc1: 81.2000 (80.6909)  acc5: 96.4000 (96.0727)  time: 0.7554  data: 0.6553  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1207 (1.1101)  acc1: 73.2000 (76.0000)  acc5: 92.0000 (93.2762)  time: 0.2183  data: 0.1216  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2809 (1.1283)  acc1: 73.2000 (75.3440)  acc5: 91.6000 (93.2320)  time: 0.2166  data: 0.1215  max mem: 13273
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 75.688 Acc@5 93.152 loss 1.121
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.73%
Epoch: [126]  [   0/1251]  eta: 1:09:07  lr: 0.002745  min_lr: 0.002745  loss: 3.9929 (3.9929)  weight_decay: 0.0500 (0.0500)  time: 3.3154  data: 1.5788  max mem: 13273
Epoch: [126]  [ 200/1251]  eta: 0:03:52  lr: 0.002742  min_lr: 0.002742  loss: 2.7697 (3.2501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7808 (0.7377)  time: 0.2072  data: 0.0003  max mem: 13273
Epoch: [126]  [ 400/1251]  eta: 0:03:02  lr: 0.002738  min_lr: 0.002738  loss: 3.0057 (3.2669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6758 (0.7351)  time: 0.2084  data: 0.0006  max mem: 13273
Epoch: [126]  [ 600/1251]  eta: 0:02:17  lr: 0.002735  min_lr: 0.002735  loss: 2.8603 (3.2466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.7331)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [126]  [ 800/1251]  eta: 0:01:34  lr: 0.002732  min_lr: 0.002732  loss: 2.5483 (3.2290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7394 (0.7572)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [126]  [1000/1251]  eta: 0:00:52  lr: 0.002728  min_lr: 0.002728  loss: 3.6800 (3.2414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7019 (0.7585)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [126]  [1200/1251]  eta: 0:00:10  lr: 0.002725  min_lr: 0.002725  loss: 3.2972 (3.2396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7332 (0.7561)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 2.7073 (3.2363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8074 (0.7599)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [126] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 2.7073 (3.2253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8074 (0.7599)
Test:  [ 0/25]  eta: 0:02:03  loss: 0.6806 (0.6806)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 4.9408  data: 4.8353  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9710 (0.9162)  acc1: 80.4000 (80.1455)  acc5: 96.0000 (95.8909)  time: 0.7062  data: 0.6107  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1106 (1.1215)  acc1: 72.4000 (75.5048)  acc5: 92.8000 (92.7619)  time: 0.2214  data: 0.1279  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2970 (1.1376)  acc1: 72.4000 (75.0240)  acc5: 90.4000 (92.6400)  time: 0.2210  data: 0.1279  max mem: 13273
Test: Total time: 0:00:09 (0.3942 s / it)
* Acc@1 75.596 Acc@5 93.100 loss 1.127
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.73%
Epoch: [127]  [   0/1251]  eta: 1:03:11  lr: 0.002724  min_lr: 0.002724  loss: 2.3585 (2.3585)  weight_decay: 0.0500 (0.0500)  time: 3.0305  data: 1.7174  max mem: 13273
Epoch: [127]  [ 200/1251]  eta: 0:03:52  lr: 0.002721  min_lr: 0.002721  loss: 3.3046 (3.1869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (0.7435)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [127]  [ 400/1251]  eta: 0:03:01  lr: 0.002717  min_lr: 0.002717  loss: 3.2056 (3.1829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7270 (0.7342)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [127]  [ 600/1251]  eta: 0:02:16  lr: 0.002714  min_lr: 0.002714  loss: 2.9583 (3.2072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7572 (0.7487)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [127]  [ 800/1251]  eta: 0:01:34  lr: 0.002711  min_lr: 0.002711  loss: 3.5924 (3.2062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6593 (0.7438)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [127]  [1000/1251]  eta: 0:00:52  lr: 0.002707  min_lr: 0.002707  loss: 2.7525 (3.2129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7736 (0.7397)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [127]  [1200/1251]  eta: 0:00:10  lr: 0.002704  min_lr: 0.002704  loss: 3.3392 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7781 (0.7513)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 3.0462 (3.2260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6795 (0.7483)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [127] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 3.0462 (3.2107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6795 (0.7483)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8784 (0.8784)  acc1: 86.0000 (86.0000)  acc5: 99.2000 (99.2000)  time: 5.5448  data: 5.4184  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0170 (1.0183)  acc1: 79.2000 (79.5636)  acc5: 96.0000 (96.1091)  time: 0.7555  data: 0.6535  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2736 (1.2183)  acc1: 74.0000 (75.4095)  acc5: 92.8000 (93.2000)  time: 0.2136  data: 0.1175  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3057 (1.2341)  acc1: 74.0000 (75.0240)  acc5: 90.8000 (92.9920)  time: 0.2181  data: 0.1238  max mem: 13273
Test: Total time: 0:00:10 (0.4160 s / it)
* Acc@1 75.174 Acc@5 92.996 loss 1.238
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.73%
Epoch: [128]  [   0/1251]  eta: 1:08:24  lr: 0.002703  min_lr: 0.002703  loss: 2.8877 (2.8877)  weight_decay: 0.0500 (0.0500)  time: 3.2808  data: 2.6883  max mem: 13273
Epoch: [128]  [ 200/1251]  eta: 0:03:56  lr: 0.002700  min_lr: 0.002700  loss: 2.9887 (3.1638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (0.7639)  time: 0.2174  data: 0.0005  max mem: 13273
Epoch: [128]  [ 400/1251]  eta: 0:03:03  lr: 0.002696  min_lr: 0.002696  loss: 2.8756 (3.1840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7240 (0.7377)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [128]  [ 600/1251]  eta: 0:02:18  lr: 0.002693  min_lr: 0.002693  loss: 3.7617 (3.2129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8358 (0.7495)  time: 0.2176  data: 0.0004  max mem: 13273
Epoch: [128]  [ 800/1251]  eta: 0:01:35  lr: 0.002690  min_lr: 0.002690  loss: 3.8265 (3.2207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7049 (0.7428)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [128]  [1000/1251]  eta: 0:00:52  lr: 0.002686  min_lr: 0.002686  loss: 3.5246 (3.2407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.7479)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [128]  [1200/1251]  eta: 0:00:10  lr: 0.002683  min_lr: 0.002683  loss: 2.8090 (3.2384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.7403)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 2.6284 (3.2386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (0.7385)  time: 0.1707  data: 0.0008  max mem: 13273
Epoch: [128] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 2.6284 (3.2255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (0.7385)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6727 (0.6727)  acc1: 88.0000 (88.0000)  acc5: 96.4000 (96.4000)  time: 5.6905  data: 5.5851  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8784 (0.8955)  acc1: 80.8000 (80.0364)  acc5: 96.4000 (95.5273)  time: 0.7658  data: 0.6693  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1486 (1.1049)  acc1: 72.8000 (75.8667)  acc5: 91.6000 (92.6095)  time: 0.2072  data: 0.1123  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2399 (1.1197)  acc1: 72.8000 (75.5200)  acc5: 91.2000 (92.6240)  time: 0.2074  data: 0.1123  max mem: 13273
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 76.046 Acc@5 93.316 loss 1.099
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.05%
Epoch: [129]  [   0/1251]  eta: 0:59:52  lr: 0.002682  min_lr: 0.002682  loss: 2.3731 (2.3731)  weight_decay: 0.0500 (0.0500)  time: 2.8717  data: 2.6102  max mem: 13273
Epoch: [129]  [ 200/1251]  eta: 0:03:50  lr: 0.002679  min_lr: 0.002679  loss: 2.6689 (3.2044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.7283)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [129]  [ 400/1251]  eta: 0:03:00  lr: 0.002675  min_lr: 0.002675  loss: 3.4061 (3.2125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6506 (0.7072)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [129]  [ 600/1251]  eta: 0:02:16  lr: 0.002672  min_lr: 0.002672  loss: 3.0276 (3.2187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.7266)  time: 0.2040  data: 0.0005  max mem: 13273
Epoch: [129]  [ 800/1251]  eta: 0:01:34  lr: 0.002668  min_lr: 0.002668  loss: 3.2631 (3.2195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7403 (0.7269)  time: 0.2089  data: 0.0005  max mem: 13273
Epoch: [129]  [1000/1251]  eta: 0:00:52  lr: 0.002665  min_lr: 0.002665  loss: 3.1412 (3.2313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.7277)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [129]  [1200/1251]  eta: 0:00:10  lr: 0.002662  min_lr: 0.002662  loss: 3.8388 (3.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7686 (0.7393)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 2.6238 (3.2339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.7419)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [129] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 2.6238 (3.2177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.7419)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8465 (0.8465)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 5.7508  data: 5.6441  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 1.0121 (0.9942)  acc1: 80.8000 (79.4545)  acc5: 96.4000 (95.6364)  time: 0.7505  data: 0.6530  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2575 (1.2040)  acc1: 72.0000 (75.6952)  acc5: 91.6000 (92.9333)  time: 0.2022  data: 0.1087  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3101 (1.2122)  acc1: 72.4000 (75.3920)  acc5: 91.2000 (92.8160)  time: 0.2013  data: 0.1086  max mem: 13273
Test: Total time: 0:00:10 (0.4095 s / it)
* Acc@1 75.680 Acc@5 93.068 loss 1.202
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 76.05%
Epoch: [130]  [   0/1251]  eta: 1:03:17  lr: 0.002661  min_lr: 0.002661  loss: 2.4646 (2.4646)  weight_decay: 0.0500 (0.0500)  time: 3.0356  data: 2.7847  max mem: 13273
Epoch: [130]  [ 200/1251]  eta: 0:03:53  lr: 0.002657  min_lr: 0.002657  loss: 2.9593 (3.2350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.7110)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [130]  [ 400/1251]  eta: 0:03:01  lr: 0.002654  min_lr: 0.002654  loss: 3.1791 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.7091)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [130]  [ 600/1251]  eta: 0:02:17  lr: 0.002651  min_lr: 0.002651  loss: 2.8236 (3.2603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6806 (0.7220)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [130]  [ 800/1251]  eta: 0:01:34  lr: 0.002647  min_lr: 0.002647  loss: 3.3572 (3.2365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6737 (0.7185)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [130]  [1000/1251]  eta: 0:00:52  lr: 0.002644  min_lr: 0.002644  loss: 2.7819 (3.2416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7324)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [130]  [1200/1251]  eta: 0:00:10  lr: 0.002640  min_lr: 0.002640  loss: 2.8106 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7236 (0.7345)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 2.9628 (3.2228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8223 (0.7383)  time: 0.1702  data: 0.0011  max mem: 13273
Epoch: [130] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 2.9628 (3.2074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8223 (0.7383)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7186 (0.7186)  acc1: 86.4000 (86.4000)  acc5: 97.2000 (97.2000)  time: 5.4580  data: 5.3478  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9158 (0.9455)  acc1: 80.4000 (79.7818)  acc5: 95.6000 (95.6000)  time: 0.7171  data: 0.6199  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1959 (1.1661)  acc1: 73.2000 (75.4857)  acc5: 91.6000 (92.6667)  time: 0.1988  data: 0.1056  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3485 (1.1762)  acc1: 73.2000 (75.4240)  acc5: 91.2000 (92.6400)  time: 0.1975  data: 0.1055  max mem: 13273
Test: Total time: 0:00:09 (0.3980 s / it)
* Acc@1 75.604 Acc@5 93.070 loss 1.160
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 76.05%
Epoch: [131]  [   0/1251]  eta: 1:02:36  lr: 0.002640  min_lr: 0.002640  loss: 3.2686 (3.2686)  weight_decay: 0.0500 (0.0500)  time: 3.0030  data: 2.2542  max mem: 13273
Epoch: [131]  [ 200/1251]  eta: 0:03:53  lr: 0.002636  min_lr: 0.002636  loss: 3.0887 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6810 (0.7495)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [131]  [ 400/1251]  eta: 0:03:01  lr: 0.002633  min_lr: 0.002633  loss: 3.7067 (3.2102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7319 (0.7580)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [131]  [ 600/1251]  eta: 0:02:17  lr: 0.002629  min_lr: 0.002629  loss: 3.6726 (3.2269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6723 (0.7483)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [131]  [ 800/1251]  eta: 0:01:34  lr: 0.002626  min_lr: 0.002626  loss: 3.0655 (3.2240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7815 (0.7622)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [131]  [1000/1251]  eta: 0:00:52  lr: 0.002623  min_lr: 0.002623  loss: 3.0937 (3.2280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6471 (0.7520)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [131]  [1200/1251]  eta: 0:00:10  lr: 0.002619  min_lr: 0.002619  loss: 2.6363 (3.2117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7442 (0.7504)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 3.1156 (3.2159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (0.7489)  time: 0.1710  data: 0.0011  max mem: 13273
Epoch: [131] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 3.1156 (3.2138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (0.7489)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8219 (0.8219)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.5349  data: 5.4242  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 1.0263 (1.0013)  acc1: 78.8000 (79.0182)  acc5: 95.6000 (95.6000)  time: 0.6629  data: 0.5639  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2504 (1.1907)  acc1: 71.2000 (74.7810)  acc5: 92.8000 (92.5905)  time: 0.1755  data: 0.0801  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3125 (1.2026)  acc1: 71.2000 (74.4320)  acc5: 92.0000 (92.6240)  time: 0.1950  data: 0.1007  max mem: 13273
Test: Total time: 0:00:09 (0.3968 s / it)
* Acc@1 75.208 Acc@5 93.016 loss 1.191
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 76.05%
Epoch: [132]  [   0/1251]  eta: 1:06:24  lr: 0.002618  min_lr: 0.002618  loss: 4.0722 (4.0722)  weight_decay: 0.0500 (0.0500)  time: 3.1854  data: 2.9206  max mem: 13273
Epoch: [132]  [ 200/1251]  eta: 0:03:52  lr: 0.002615  min_lr: 0.002615  loss: 3.3117 (3.2106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7352 (0.7481)  time: 0.2037  data: 0.0004  max mem: 13273
Epoch: [132]  [ 400/1251]  eta: 0:03:01  lr: 0.002612  min_lr: 0.002612  loss: 3.8973 (3.2330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7873 (0.7551)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [132]  [ 600/1251]  eta: 0:02:17  lr: 0.002608  min_lr: 0.002608  loss: 3.7326 (3.2201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6721 (0.7439)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [132]  [ 800/1251]  eta: 0:01:34  lr: 0.002605  min_lr: 0.002605  loss: 3.2001 (3.2274)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7801 (0.7404)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [132]  [1000/1251]  eta: 0:00:52  lr: 0.002601  min_lr: 0.002601  loss: 3.4113 (3.2271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6990 (0.7482)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [132]  [1200/1251]  eta: 0:00:10  lr: 0.002598  min_lr: 0.002598  loss: 2.5283 (3.2152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6856 (0.7469)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.0480 (3.2133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7466)  time: 0.1701  data: 0.0006  max mem: 13273
Epoch: [132] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.0480 (3.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6691 (0.7466)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8208 (0.8208)  acc1: 84.8000 (84.8000)  acc5: 96.4000 (96.4000)  time: 5.4523  data: 5.3469  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9922 (1.0021)  acc1: 80.8000 (78.8364)  acc5: 96.0000 (95.7818)  time: 0.7189  data: 0.6246  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1938 (1.2001)  acc1: 71.6000 (74.7810)  acc5: 92.8000 (92.8952)  time: 0.1937  data: 0.1018  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3443 (1.2077)  acc1: 72.0000 (74.4640)  acc5: 91.6000 (92.8960)  time: 0.1944  data: 0.1033  max mem: 13273
Test: Total time: 0:00:09 (0.3953 s / it)
* Acc@1 75.110 Acc@5 93.004 loss 1.198
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 76.05%
Epoch: [133]  [   0/1251]  eta: 1:05:37  lr: 0.002597  min_lr: 0.002597  loss: 4.3819 (4.3819)  weight_decay: 0.0500 (0.0500)  time: 3.1478  data: 2.6090  max mem: 13273
Epoch: [133]  [ 200/1251]  eta: 0:03:52  lr: 0.002594  min_lr: 0.002594  loss: 2.8768 (3.2262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7276 (0.7392)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [133]  [ 400/1251]  eta: 0:03:01  lr: 0.002590  min_lr: 0.002590  loss: 2.6283 (3.2075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8084 (0.7638)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [133]  [ 600/1251]  eta: 0:02:16  lr: 0.002587  min_lr: 0.002587  loss: 2.4769 (3.1749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6813 (0.7470)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [133]  [ 800/1251]  eta: 0:01:34  lr: 0.002583  min_lr: 0.002583  loss: 2.7934 (3.1781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6607 (0.7429)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [133]  [1000/1251]  eta: 0:00:52  lr: 0.002580  min_lr: 0.002580  loss: 2.9977 (3.1983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7529 (0.7524)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [133]  [1200/1251]  eta: 0:00:10  lr: 0.002576  min_lr: 0.002576  loss: 3.3581 (3.1937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7570 (0.7581)  time: 0.2052  data: 0.0006  max mem: 13273
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 3.5284 (3.1946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7489 (0.7570)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [133] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 3.5284 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7489 (0.7570)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7578 (0.7578)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 5.5750  data: 5.4683  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9342 (0.9483)  acc1: 79.6000 (79.8545)  acc5: 95.6000 (95.7818)  time: 0.7383  data: 0.6435  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1634 (1.1633)  acc1: 72.8000 (75.4857)  acc5: 93.2000 (93.1619)  time: 0.2007  data: 0.1087  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2640 (1.1702)  acc1: 72.8000 (75.1520)  acc5: 92.8000 (93.2480)  time: 0.2066  data: 0.1156  max mem: 13273
Test: Total time: 0:00:10 (0.4066 s / it)
* Acc@1 75.792 Acc@5 93.274 loss 1.176
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.05%
Epoch: [134]  [   0/1251]  eta: 1:07:09  lr: 0.002576  min_lr: 0.002576  loss: 2.2764 (2.2764)  weight_decay: 0.0500 (0.0500)  time: 3.2213  data: 1.6529  max mem: 13273
Epoch: [134]  [ 200/1251]  eta: 0:03:52  lr: 0.002572  min_lr: 0.002572  loss: 2.9233 (3.0546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7951 (0.7995)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [134]  [ 400/1251]  eta: 0:03:01  lr: 0.002569  min_lr: 0.002569  loss: 2.8119 (3.1575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.7598)  time: 0.2049  data: 0.0003  max mem: 13273
Epoch: [134]  [ 600/1251]  eta: 0:02:17  lr: 0.002565  min_lr: 0.002565  loss: 2.7899 (3.1765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.7460)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [134]  [ 800/1251]  eta: 0:01:34  lr: 0.002562  min_lr: 0.002562  loss: 3.0573 (3.1920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.7398)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [134]  [1000/1251]  eta: 0:00:52  lr: 0.002558  min_lr: 0.002558  loss: 3.1946 (3.1945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6627 (0.7390)  time: 0.2045  data: 0.0003  max mem: 13273
Epoch: [134]  [1200/1251]  eta: 0:00:10  lr: 0.002555  min_lr: 0.002555  loss: 3.1975 (3.1978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (0.7404)  time: 0.2127  data: 0.0004  max mem: 13273
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 2.5748 (3.1928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.7401)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [134] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 2.5748 (3.1906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.7401)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6832 (0.6832)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.7689  data: 5.6596  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8668 (0.8705)  acc1: 81.2000 (80.0727)  acc5: 96.4000 (95.9273)  time: 0.7466  data: 0.6480  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1171 (1.0741)  acc1: 74.4000 (75.9429)  acc5: 92.0000 (93.2571)  time: 0.1960  data: 0.1009  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2264 (1.0802)  acc1: 72.0000 (75.4400)  acc5: 91.6000 (93.2800)  time: 0.1949  data: 0.1008  max mem: 13273
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 75.836 Acc@5 93.396 loss 1.070
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.05%
Epoch: [135]  [   0/1251]  eta: 1:04:26  lr: 0.002554  min_lr: 0.002554  loss: 2.2824 (2.2824)  weight_decay: 0.0500 (0.0500)  time: 3.0908  data: 2.3822  max mem: 13273
Epoch: [135]  [ 200/1251]  eta: 0:03:53  lr: 0.002551  min_lr: 0.002551  loss: 3.1588 (3.1733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6631 (0.7221)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [135]  [ 400/1251]  eta: 0:03:02  lr: 0.002547  min_lr: 0.002547  loss: 3.4289 (3.1695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6393 (0.7232)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [135]  [ 600/1251]  eta: 0:02:17  lr: 0.002544  min_lr: 0.002544  loss: 2.6631 (3.1692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.7408)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [135]  [ 800/1251]  eta: 0:01:34  lr: 0.002540  min_lr: 0.002540  loss: 3.4066 (3.1748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7322 (0.7448)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [135]  [1000/1251]  eta: 0:00:52  lr: 0.002537  min_lr: 0.002537  loss: 3.0435 (3.2046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8712 (0.7557)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [135]  [1200/1251]  eta: 0:00:10  lr: 0.002533  min_lr: 0.002533  loss: 3.2307 (3.2128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7023 (0.7561)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 3.1674 (3.2132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.7604)  time: 0.1711  data: 0.0008  max mem: 13273
Epoch: [135] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 3.1674 (3.2051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.7604)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8016 (0.8016)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.7039  data: 5.5942  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9948 (0.9683)  acc1: 82.4000 (80.4727)  acc5: 96.8000 (96.2182)  time: 0.7612  data: 0.6673  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2348 (1.1816)  acc1: 73.2000 (75.7333)  acc5: 92.8000 (93.2571)  time: 0.2049  data: 0.1129  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3434 (1.1897)  acc1: 71.6000 (75.3280)  acc5: 92.0000 (93.1840)  time: 0.2046  data: 0.1128  max mem: 13273
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 75.632 Acc@5 93.304 loss 1.175
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 76.05%
Epoch: [136]  [   0/1251]  eta: 1:04:30  lr: 0.002532  min_lr: 0.002532  loss: 2.2956 (2.2956)  weight_decay: 0.0500 (0.0500)  time: 3.0942  data: 2.6156  max mem: 13273
Epoch: [136]  [ 200/1251]  eta: 0:03:53  lr: 0.002529  min_lr: 0.002529  loss: 2.8354 (3.2067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7508)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [136]  [ 400/1251]  eta: 0:03:01  lr: 0.002526  min_lr: 0.002526  loss: 2.9155 (3.2006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7738 (0.7594)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [136]  [ 600/1251]  eta: 0:02:17  lr: 0.002522  min_lr: 0.002522  loss: 2.9968 (3.1845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7680 (nan)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [136]  [ 800/1251]  eta: 0:01:34  lr: 0.002519  min_lr: 0.002519  loss: 2.6596 (3.1801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7394 (nan)  time: 0.2033  data: 0.0004  max mem: 13273
Epoch: [136]  [1000/1251]  eta: 0:00:52  lr: 0.002515  min_lr: 0.002515  loss: 3.0162 (3.1880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6341 (nan)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [136]  [1200/1251]  eta: 0:00:10  lr: 0.002512  min_lr: 0.002512  loss: 2.9252 (3.1852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (nan)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 3.0587 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6729 (nan)  time: 0.1708  data: 0.0008  max mem: 13273
Epoch: [136] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 3.0587 (3.2014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6729 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8112 (0.8112)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.5795  data: 5.4743  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9823 (0.9811)  acc1: 80.8000 (80.1818)  acc5: 96.4000 (96.0364)  time: 0.7370  data: 0.6392  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2736 (1.1963)  acc1: 72.8000 (75.5810)  acc5: 91.2000 (93.0095)  time: 0.2077  data: 0.1131  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3187 (1.2025)  acc1: 73.2000 (75.3920)  acc5: 91.2000 (93.1360)  time: 0.2066  data: 0.1130  max mem: 13273
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 75.938 Acc@5 93.390 loss 1.193
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.05%
Epoch: [137]  [   0/1251]  eta: 1:04:23  lr: 0.002511  min_lr: 0.002511  loss: 2.5497 (2.5497)  weight_decay: 0.0500 (0.0500)  time: 3.0887  data: 2.3461  max mem: 13273
Epoch: [137]  [ 200/1251]  eta: 0:03:52  lr: 0.002507  min_lr: 0.002507  loss: 3.6012 (3.3280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.7541)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [137]  [ 400/1251]  eta: 0:03:01  lr: 0.002504  min_lr: 0.002504  loss: 2.8107 (3.2530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.7492)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [137]  [ 600/1251]  eta: 0:02:17  lr: 0.002500  min_lr: 0.002500  loss: 2.9464 (3.2287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8345 (0.7624)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [137]  [ 800/1251]  eta: 0:01:34  lr: 0.002497  min_lr: 0.002497  loss: 3.0757 (3.2056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.7709)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [137]  [1000/1251]  eta: 0:00:52  lr: 0.002493  min_lr: 0.002493  loss: 3.6727 (3.2145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.7643)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [137]  [1200/1251]  eta: 0:00:10  lr: 0.002490  min_lr: 0.002490  loss: 2.6778 (3.2080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6389 (0.7619)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 2.7236 (3.2059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7609 (0.7664)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [137] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 2.7236 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7609 (0.7664)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6699 (0.6699)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.5930  data: 5.4850  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8723 (0.8925)  acc1: 81.6000 (80.7636)  acc5: 96.4000 (96.0364)  time: 0.7020  data: 0.6038  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1634 (1.1106)  acc1: 73.2000 (76.7429)  acc5: 92.8000 (93.0667)  time: 0.1888  data: 0.0949  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2680 (1.1241)  acc1: 71.6000 (76.0480)  acc5: 90.8000 (92.8960)  time: 0.1877  data: 0.0948  max mem: 13273
Test: Total time: 0:00:09 (0.3965 s / it)
* Acc@1 75.860 Acc@5 93.248 loss 1.119
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.05%
Epoch: [138]  [   0/1251]  eta: 1:04:06  lr: 0.002489  min_lr: 0.002489  loss: 3.6641 (3.6641)  weight_decay: 0.0500 (0.0500)  time: 3.0751  data: 2.6219  max mem: 13273
Epoch: [138]  [ 200/1251]  eta: 0:03:53  lr: 0.002486  min_lr: 0.002486  loss: 3.5220 (3.0892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7550 (0.7533)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [138]  [ 400/1251]  eta: 0:03:02  lr: 0.002482  min_lr: 0.002482  loss: 2.7547 (3.1033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7258 (0.7465)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [138]  [ 600/1251]  eta: 0:02:17  lr: 0.002479  min_lr: 0.002479  loss: 3.4965 (3.1409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7823 (0.7349)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [138]  [ 800/1251]  eta: 0:01:35  lr: 0.002475  min_lr: 0.002475  loss: 2.7933 (3.1454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7061 (0.7504)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [138]  [1000/1251]  eta: 0:00:52  lr: 0.002472  min_lr: 0.002472  loss: 2.3857 (3.1613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6983 (0.7502)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [138]  [1200/1251]  eta: 0:00:10  lr: 0.002468  min_lr: 0.002468  loss: 3.3859 (3.1752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7370 (0.7518)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 2.7767 (3.1759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8040 (0.7553)  time: 0.1710  data: 0.0006  max mem: 13273
Epoch: [138] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 2.7767 (3.1847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8040 (0.7553)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6893 (0.6893)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.7509  data: 5.6188  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9014 (0.9198)  acc1: 81.6000 (80.1818)  acc5: 96.8000 (96.2182)  time: 0.7279  data: 0.6251  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2044 (1.1331)  acc1: 72.8000 (75.7714)  acc5: 93.2000 (93.4286)  time: 0.1986  data: 0.1017  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2751 (1.1446)  acc1: 72.8000 (75.2320)  acc5: 91.2000 (93.3120)  time: 0.1967  data: 0.1016  max mem: 13273
Test: Total time: 0:00:10 (0.4067 s / it)
* Acc@1 75.926 Acc@5 93.244 loss 1.131
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.05%
Epoch: [139]  [   0/1251]  eta: 1:04:27  lr: 0.002467  min_lr: 0.002467  loss: 3.2490 (3.2490)  weight_decay: 0.0500 (0.0500)  time: 3.0916  data: 2.2824  max mem: 13273
Epoch: [139]  [ 200/1251]  eta: 0:03:53  lr: 0.002464  min_lr: 0.002464  loss: 2.7648 (3.1933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7115 (0.7594)  time: 0.2041  data: 0.0006  max mem: 13273
Epoch: [139]  [ 400/1251]  eta: 0:03:01  lr: 0.002460  min_lr: 0.002460  loss: 3.5514 (3.1832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7901)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [139]  [ 600/1251]  eta: 0:02:17  lr: 0.002457  min_lr: 0.002457  loss: 2.4799 (3.1974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.7786)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [139]  [ 800/1251]  eta: 0:01:34  lr: 0.002453  min_lr: 0.002453  loss: 3.6731 (3.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.7706)  time: 0.2054  data: 0.0007  max mem: 13273
Epoch: [139]  [1000/1251]  eta: 0:00:52  lr: 0.002450  min_lr: 0.002450  loss: 3.3098 (3.1974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7090 (0.7696)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [139]  [1200/1251]  eta: 0:00:10  lr: 0.002446  min_lr: 0.002446  loss: 3.1307 (3.2218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6234 (0.7596)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 2.6388 (3.2161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8251 (0.7654)  time: 0.1709  data: 0.0009  max mem: 13273
Epoch: [139] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 2.6388 (3.1803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8251 (0.7654)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7052 (0.7052)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.5303  data: 5.3919  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9437 (0.9311)  acc1: 80.4000 (80.3636)  acc5: 96.4000 (95.9273)  time: 0.7149  data: 0.6172  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1764 (1.1241)  acc1: 73.2000 (75.9619)  acc5: 93.6000 (93.0667)  time: 0.1978  data: 0.1045  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2447 (1.1298)  acc1: 73.2000 (75.5200)  acc5: 91.2000 (93.0720)  time: 0.2060  data: 0.1137  max mem: 13273
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 75.774 Acc@5 93.352 loss 1.123
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.05%
Epoch: [140]  [   0/1251]  eta: 0:57:49  lr: 0.002445  min_lr: 0.002445  loss: 4.0120 (4.0120)  weight_decay: 0.0500 (0.0500)  time: 2.7735  data: 1.5662  max mem: 13273
Epoch: [140]  [ 200/1251]  eta: 0:03:52  lr: 0.002442  min_lr: 0.002442  loss: 3.2169 (3.1877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6263 (0.7711)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [140]  [ 400/1251]  eta: 0:03:01  lr: 0.002438  min_lr: 0.002438  loss: 2.8150 (3.2327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.7762)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [140]  [ 600/1251]  eta: 0:02:17  lr: 0.002435  min_lr: 0.002435  loss: 2.5877 (3.2312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7819 (0.7631)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [140]  [ 800/1251]  eta: 0:01:34  lr: 0.002431  min_lr: 0.002431  loss: 3.5599 (3.2378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.7677)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [140]  [1000/1251]  eta: 0:00:52  lr: 0.002428  min_lr: 0.002428  loss: 3.0477 (3.2284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.7657)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [140]  [1200/1251]  eta: 0:00:10  lr: 0.002424  min_lr: 0.002424  loss: 2.6869 (3.2193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.7640)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 2.9379 (3.2199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.7620)  time: 0.1708  data: 0.0011  max mem: 13273
Epoch: [140] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 2.9379 (3.1885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.7620)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7382 (0.7382)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.7151  data: 5.6098  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9122 (0.9396)  acc1: 81.6000 (80.5091)  acc5: 96.4000 (95.7455)  time: 0.7837  data: 0.6878  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1969 (1.1247)  acc1: 74.4000 (76.1524)  acc5: 92.0000 (93.0286)  time: 0.2101  data: 0.1173  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2757 (1.1333)  acc1: 73.2000 (75.7760)  acc5: 92.0000 (93.1200)  time: 0.2092  data: 0.1172  max mem: 13273
Test: Total time: 0:00:10 (0.4144 s / it)
* Acc@1 76.106 Acc@5 93.506 loss 1.130
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.11%
Epoch: [141]  [   0/1251]  eta: 1:09:41  lr: 0.002424  min_lr: 0.002424  loss: 1.9838 (1.9838)  weight_decay: 0.0500 (0.0500)  time: 3.3427  data: 3.0914  max mem: 13273
Epoch: [141]  [ 200/1251]  eta: 0:03:52  lr: 0.002420  min_lr: 0.002420  loss: 2.6517 (3.1724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7942 (0.7659)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [141]  [ 400/1251]  eta: 0:03:01  lr: 0.002417  min_lr: 0.002417  loss: 2.6522 (3.1673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7353 (0.7422)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [141]  [ 600/1251]  eta: 0:02:17  lr: 0.002413  min_lr: 0.002413  loss: 3.1520 (3.1647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (0.7443)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [141]  [ 800/1251]  eta: 0:01:34  lr: 0.002409  min_lr: 0.002409  loss: 3.1677 (3.1864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7866 (0.7527)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [141]  [1000/1251]  eta: 0:00:52  lr: 0.002406  min_lr: 0.002406  loss: 2.8790 (3.1817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.7615)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [141]  [1200/1251]  eta: 0:00:10  lr: 0.002402  min_lr: 0.002402  loss: 2.6759 (3.1841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6962 (0.7521)  time: 0.2135  data: 0.0005  max mem: 13273
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 2.5220 (3.1862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (0.7591)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [141] Total time: 0:04:22 (0.2095 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 2.5220 (3.1834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (0.7591)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7135 (0.7135)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.2119  data: 5.0723  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9151 (0.9189)  acc1: 81.2000 (80.1091)  acc5: 96.0000 (96.2909)  time: 0.7450  data: 0.6425  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1842 (1.1101)  acc1: 73.6000 (75.9238)  acc5: 93.6000 (93.4667)  time: 0.2189  data: 0.1243  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1927 (1.1199)  acc1: 73.6000 (75.5840)  acc5: 92.0000 (93.4080)  time: 0.2171  data: 0.1237  max mem: 13273
Test: Total time: 0:00:10 (0.4012 s / it)
* Acc@1 75.828 Acc@5 93.314 loss 1.118
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.11%
Epoch: [142]  [   0/1251]  eta: 1:05:36  lr: 0.002402  min_lr: 0.002402  loss: 2.2763 (2.2763)  weight_decay: 0.0500 (0.0500)  time: 3.1467  data: 2.9157  max mem: 13273
Epoch: [142]  [ 200/1251]  eta: 0:03:52  lr: 0.002398  min_lr: 0.002398  loss: 2.3877 (3.1785)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6753 (0.7780)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [142]  [ 400/1251]  eta: 0:03:01  lr: 0.002395  min_lr: 0.002395  loss: 2.5241 (3.1437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.7697)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [142]  [ 600/1251]  eta: 0:02:17  lr: 0.002391  min_lr: 0.002391  loss: 2.8178 (3.1617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7175 (0.7650)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [142]  [ 800/1251]  eta: 0:01:34  lr: 0.002387  min_lr: 0.002387  loss: 2.9807 (3.1743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8003 (0.7607)  time: 0.2074  data: 0.0006  max mem: 13273
Epoch: [142]  [1000/1251]  eta: 0:00:52  lr: 0.002384  min_lr: 0.002384  loss: 2.6488 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7879 (0.7876)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [142]  [1200/1251]  eta: 0:00:10  lr: 0.002380  min_lr: 0.002380  loss: 3.4412 (3.1870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6815 (0.7802)  time: 0.2071  data: 0.0006  max mem: 13273
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 3.5517 (3.1911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7597 (0.7801)  time: 0.1715  data: 0.0009  max mem: 13273
Epoch: [142] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 3.5517 (3.1850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7597 (0.7801)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7332 (0.7332)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 5.3833  data: 5.2777  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9237 (0.9572)  acc1: 80.0000 (80.0364)  acc5: 96.4000 (96.0727)  time: 0.7678  data: 0.6691  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1977 (1.1775)  acc1: 72.8000 (75.8857)  acc5: 92.4000 (93.4476)  time: 0.2219  data: 0.1271  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3804 (1.1973)  acc1: 72.4000 (75.4400)  acc5: 91.6000 (93.1840)  time: 0.2209  data: 0.1270  max mem: 13273
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 75.700 Acc@5 93.194 loss 1.189
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 76.11%
Epoch: [143]  [   0/1251]  eta: 1:02:37  lr: 0.002380  min_lr: 0.002380  loss: 4.3046 (4.3046)  weight_decay: 0.0500 (0.0500)  time: 3.0035  data: 1.6871  max mem: 13273
Epoch: [143]  [ 200/1251]  eta: 0:03:53  lr: 0.002376  min_lr: 0.002376  loss: 2.4954 (3.1439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8078 (0.7802)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [143]  [ 400/1251]  eta: 0:03:01  lr: 0.002373  min_lr: 0.002373  loss: 3.4486 (3.1746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6941 (0.7569)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [143]  [ 600/1251]  eta: 0:02:17  lr: 0.002369  min_lr: 0.002369  loss: 2.5051 (3.2006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (0.7628)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [143]  [ 800/1251]  eta: 0:01:34  lr: 0.002365  min_lr: 0.002365  loss: 3.0433 (3.1737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7842 (0.7581)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [143]  [1000/1251]  eta: 0:00:52  lr: 0.002362  min_lr: 0.002362  loss: 2.6209 (3.1741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7614 (0.7552)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [143]  [1200/1251]  eta: 0:00:10  lr: 0.002358  min_lr: 0.002358  loss: 2.5476 (3.1611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6750 (0.7546)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 2.8292 (3.1628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7394 (0.7565)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [143] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 2.8292 (3.1619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7394 (0.7565)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7403 (0.7403)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.3802  data: 5.2748  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9167 (0.9494)  acc1: 79.6000 (80.4000)  acc5: 96.0000 (95.8909)  time: 0.7442  data: 0.6434  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2066 (1.1545)  acc1: 72.8000 (75.9048)  acc5: 93.6000 (93.4476)  time: 0.2142  data: 0.1186  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3226 (1.1669)  acc1: 72.8000 (75.6960)  acc5: 91.2000 (93.1200)  time: 0.2127  data: 0.1186  max mem: 13273
Test: Total time: 0:00:10 (0.4042 s / it)
* Acc@1 76.108 Acc@5 93.502 loss 1.154
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.11%
Epoch: [144]  [   0/1251]  eta: 1:08:21  lr: 0.002358  min_lr: 0.002358  loss: 2.5662 (2.5662)  weight_decay: 0.0500 (0.0500)  time: 3.2785  data: 3.0606  max mem: 13273
Epoch: [144]  [ 200/1251]  eta: 0:03:51  lr: 0.002354  min_lr: 0.002354  loss: 3.4420 (3.1761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6709 (0.6807)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [144]  [ 400/1251]  eta: 0:03:00  lr: 0.002350  min_lr: 0.002350  loss: 3.0962 (3.1871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7933 (0.7122)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [144]  [ 600/1251]  eta: 0:02:16  lr: 0.002347  min_lr: 0.002347  loss: 2.5159 (3.1733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7171 (0.7185)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [144]  [ 800/1251]  eta: 0:01:34  lr: 0.002343  min_lr: 0.002343  loss: 2.6984 (3.1609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6936 (0.7228)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [144]  [1000/1251]  eta: 0:00:52  lr: 0.002340  min_lr: 0.002340  loss: 2.5887 (3.1695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (0.7378)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [144]  [1200/1251]  eta: 0:00:10  lr: 0.002336  min_lr: 0.002336  loss: 3.5743 (3.1687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.7459)  time: 0.2122  data: 0.0004  max mem: 13273
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.3254 (3.1780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.7459)  time: 0.1749  data: 0.0005  max mem: 13273
Epoch: [144] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.3254 (3.1806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.7459)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7840 (0.7840)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.6925  data: 5.5578  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9678 (0.9488)  acc1: 81.6000 (81.3091)  acc5: 96.4000 (95.7091)  time: 0.7157  data: 0.6134  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2188 (1.1516)  acc1: 74.0000 (76.4762)  acc5: 92.4000 (93.1619)  time: 0.1929  data: 0.0964  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.3144 (1.1687)  acc1: 72.4000 (75.7760)  acc5: 91.6000 (92.9440)  time: 0.1952  data: 0.1007  max mem: 13273
Test: Total time: 0:00:10 (0.4048 s / it)
* Acc@1 75.890 Acc@5 93.236 loss 1.162
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.11%
Epoch: [145]  [   0/1251]  eta: 1:04:51  lr: 0.002335  min_lr: 0.002335  loss: 3.0400 (3.0400)  weight_decay: 0.0500 (0.0500)  time: 3.1108  data: 1.6399  max mem: 13273
Epoch: [145]  [ 200/1251]  eta: 0:03:54  lr: 0.002332  min_lr: 0.002332  loss: 3.2590 (3.2247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6470 (0.7240)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [145]  [ 400/1251]  eta: 0:03:03  lr: 0.002328  min_lr: 0.002328  loss: 2.7344 (3.2193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7370 (0.7344)  time: 0.2211  data: 0.0004  max mem: 13273
Epoch: [145]  [ 600/1251]  eta: 0:02:18  lr: 0.002325  min_lr: 0.002325  loss: 3.2801 (3.2084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7134 (0.7398)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [145]  [ 800/1251]  eta: 0:01:35  lr: 0.002321  min_lr: 0.002321  loss: 2.5705 (3.1795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.7503)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [145]  [1000/1251]  eta: 0:00:52  lr: 0.002318  min_lr: 0.002318  loss: 2.8130 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7813 (0.7571)  time: 0.2076  data: 0.0005  max mem: 13273
Epoch: [145]  [1200/1251]  eta: 0:00:10  lr: 0.002314  min_lr: 0.002314  loss: 2.8160 (3.1632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7409 (0.7604)  time: 0.2083  data: 0.0006  max mem: 13273
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 2.6814 (3.1641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8143 (0.7619)  time: 0.1707  data: 0.0009  max mem: 13273
Epoch: [145] Total time: 0:04:22 (0.2099 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 2.6814 (3.1662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8143 (0.7619)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6942 (0.6942)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 5.8825  data: 5.7743  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9321 (0.9063)  acc1: 81.6000 (80.2182)  acc5: 96.0000 (95.7091)  time: 0.7702  data: 0.6751  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1228 (1.1117)  acc1: 74.4000 (76.1905)  acc5: 92.4000 (93.3524)  time: 0.2040  data: 0.1119  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2864 (1.1241)  acc1: 73.6000 (75.7920)  acc5: 92.4000 (93.3760)  time: 0.2030  data: 0.1118  max mem: 13273
Test: Total time: 0:00:10 (0.4160 s / it)
* Acc@1 76.074 Acc@5 93.400 loss 1.120
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.11%
Epoch: [146]  [   0/1251]  eta: 1:06:38  lr: 0.002313  min_lr: 0.002313  loss: 4.0040 (4.0040)  weight_decay: 0.0500 (0.0500)  time: 3.1966  data: 2.3519  max mem: 13273
Epoch: [146]  [ 200/1251]  eta: 0:03:52  lr: 0.002310  min_lr: 0.002310  loss: 3.5274 (3.1570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7925 (0.7815)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [146]  [ 400/1251]  eta: 0:03:01  lr: 0.002306  min_lr: 0.002306  loss: 2.9338 (3.1608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6968 (0.7945)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [146]  [ 600/1251]  eta: 0:02:17  lr: 0.002303  min_lr: 0.002303  loss: 2.8880 (3.1917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7519 (0.7866)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [146]  [ 800/1251]  eta: 0:01:34  lr: 0.002299  min_lr: 0.002299  loss: 2.4456 (3.1589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7462 (nan)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [146]  [1000/1251]  eta: 0:00:52  lr: 0.002296  min_lr: 0.002296  loss: 3.2954 (3.1758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7714 (nan)  time: 0.2053  data: 0.0003  max mem: 13273
Epoch: [146]  [1200/1251]  eta: 0:00:10  lr: 0.002292  min_lr: 0.002292  loss: 2.5412 (3.1745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8018 (nan)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 2.4901 (3.1697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (nan)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [146] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 2.4901 (3.1813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7737 (0.7737)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 5.4533  data: 5.3475  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9526 (0.9522)  acc1: 81.6000 (79.9636)  acc5: 96.4000 (95.9273)  time: 0.7324  data: 0.6362  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1674 (1.1512)  acc1: 72.8000 (75.5429)  acc5: 93.2000 (93.5048)  time: 0.1966  data: 0.1026  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2671 (1.1572)  acc1: 72.4000 (75.4080)  acc5: 92.4000 (93.5040)  time: 0.1979  data: 0.1025  max mem: 13273
Test: Total time: 0:00:09 (0.3969 s / it)
* Acc@1 75.996 Acc@5 93.528 loss 1.149
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.11%
Epoch: [147]  [   0/1251]  eta: 1:05:15  lr: 0.002291  min_lr: 0.002291  loss: 3.6976 (3.6976)  weight_decay: 0.0500 (0.0500)  time: 3.1298  data: 2.8503  max mem: 13273
Epoch: [147]  [ 200/1251]  eta: 0:03:52  lr: 0.002288  min_lr: 0.002288  loss: 3.3456 (3.0738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (0.7889)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [147]  [ 400/1251]  eta: 0:03:01  lr: 0.002284  min_lr: 0.002284  loss: 3.4688 (3.1662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8289 (0.7841)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [147]  [ 600/1251]  eta: 0:02:17  lr: 0.002280  min_lr: 0.002280  loss: 2.6882 (3.1768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8086 (0.7868)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [147]  [ 800/1251]  eta: 0:01:34  lr: 0.002277  min_lr: 0.002277  loss: 2.9220 (3.1861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.7669)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [147]  [1000/1251]  eta: 0:00:52  lr: 0.002273  min_lr: 0.002273  loss: 3.2621 (3.1659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7555 (0.7635)  time: 0.2073  data: 0.0004  max mem: 13273
Epoch: [147]  [1200/1251]  eta: 0:00:10  lr: 0.002270  min_lr: 0.002270  loss: 2.6189 (3.1705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7834 (0.7705)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 3.0655 (3.1716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7778 (0.7724)  time: 0.1706  data: 0.0008  max mem: 13273
Epoch: [147] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 3.0655 (3.1644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7778 (0.7724)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7429 (0.7429)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.4952  data: 5.3833  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9407 (0.9524)  acc1: 81.2000 (80.0727)  acc5: 96.8000 (96.3273)  time: 0.7171  data: 0.6202  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2222 (1.1553)  acc1: 74.4000 (75.9810)  acc5: 91.6000 (93.1238)  time: 0.1957  data: 0.1015  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2536 (1.1722)  acc1: 74.0000 (75.5520)  acc5: 90.4000 (92.9760)  time: 0.2062  data: 0.1129  max mem: 13273
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 75.820 Acc@5 93.472 loss 1.160
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.11%
Epoch: [148]  [   0/1251]  eta: 1:04:23  lr: 0.002269  min_lr: 0.002269  loss: 2.5451 (2.5451)  weight_decay: 0.0500 (0.0500)  time: 3.0887  data: 2.3921  max mem: 13273
Epoch: [148]  [ 200/1251]  eta: 0:03:53  lr: 0.002265  min_lr: 0.002265  loss: 3.2009 (3.1255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7451 (0.7859)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [148]  [ 400/1251]  eta: 0:03:02  lr: 0.002262  min_lr: 0.002262  loss: 3.1392 (3.0915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.7749)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [148]  [ 600/1251]  eta: 0:02:18  lr: 0.002258  min_lr: 0.002258  loss: 3.1365 (3.1206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7791)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [148]  [ 800/1251]  eta: 0:01:35  lr: 0.002255  min_lr: 0.002255  loss: 2.7419 (3.1367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7997 (nan)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [148]  [1000/1251]  eta: 0:00:52  lr: 0.002251  min_lr: 0.002251  loss: 3.0982 (3.1230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (nan)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [148]  [1200/1251]  eta: 0:00:10  lr: 0.002248  min_lr: 0.002248  loss: 2.7251 (3.1305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (nan)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 2.5636 (3.1235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8732 (nan)  time: 0.1712  data: 0.0008  max mem: 13273
Epoch: [148] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 2.5636 (3.1603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8732 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6803 (0.6803)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.7189  data: 5.6122  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8764 (0.8613)  acc1: 80.8000 (79.8909)  acc5: 96.4000 (95.9636)  time: 0.7567  data: 0.6602  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0882 (1.0617)  acc1: 74.0000 (76.3238)  acc5: 92.8000 (93.3143)  time: 0.2005  data: 0.1075  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2269 (1.0775)  acc1: 74.0000 (75.9360)  acc5: 91.2000 (93.2000)  time: 0.1996  data: 0.1075  max mem: 13273
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 76.096 Acc@5 93.562 loss 1.075
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.11%
Epoch: [149]  [   0/1251]  eta: 1:06:44  lr: 0.002247  min_lr: 0.002247  loss: 3.9610 (3.9610)  weight_decay: 0.0500 (0.0500)  time: 3.2010  data: 2.6064  max mem: 13273
Epoch: [149]  [ 200/1251]  eta: 0:03:53  lr: 0.002243  min_lr: 0.002243  loss: 3.3334 (3.2492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7650 (0.8216)  time: 0.2054  data: 0.0014  max mem: 13273
Epoch: [149]  [ 400/1251]  eta: 0:03:02  lr: 0.002240  min_lr: 0.002240  loss: 2.9070 (3.1963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.7784)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [149]  [ 600/1251]  eta: 0:02:17  lr: 0.002236  min_lr: 0.002236  loss: 2.8005 (3.1557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6477 (0.7675)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [149]  [ 800/1251]  eta: 0:01:34  lr: 0.002232  min_lr: 0.002232  loss: 3.2169 (3.1589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7253 (0.7709)  time: 0.2052  data: 0.0006  max mem: 13273
Epoch: [149]  [1000/1251]  eta: 0:00:52  lr: 0.002229  min_lr: 0.002229  loss: 2.5167 (3.1562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7270 (0.7643)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [149]  [1200/1251]  eta: 0:00:10  lr: 0.002225  min_lr: 0.002225  loss: 2.6579 (3.1597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8284 (0.7794)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 3.2477 (3.1642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7719 (0.7781)  time: 0.1706  data: 0.0009  max mem: 13273
Epoch: [149] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 3.2477 (3.1604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7719 (0.7781)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7465 (0.7465)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.7265  data: 5.6212  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8651 (0.8898)  acc1: 81.6000 (81.0182)  acc5: 96.4000 (96.2182)  time: 0.7474  data: 0.6534  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1343 (1.0883)  acc1: 74.4000 (76.5714)  acc5: 94.0000 (93.8667)  time: 0.2061  data: 0.1144  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2553 (1.0989)  acc1: 73.6000 (76.1120)  acc5: 92.4000 (93.8240)  time: 0.2055  data: 0.1143  max mem: 13273
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 76.428 Acc@5 93.626 loss 1.094
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.43%
Epoch: [150]  [   0/1251]  eta: 1:02:05  lr: 0.002224  min_lr: 0.002224  loss: 2.4548 (2.4548)  weight_decay: 0.0500 (0.0500)  time: 2.9784  data: 2.7264  max mem: 13273
Epoch: [150]  [ 200/1251]  eta: 0:03:51  lr: 0.002221  min_lr: 0.002221  loss: 2.6518 (3.0767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6906 (0.7611)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [150]  [ 400/1251]  eta: 0:03:00  lr: 0.002217  min_lr: 0.002217  loss: 3.5049 (3.1097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7273 (0.7693)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [150]  [ 600/1251]  eta: 0:02:16  lr: 0.002214  min_lr: 0.002214  loss: 2.5433 (3.1073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.7809)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [150]  [ 800/1251]  eta: 0:01:34  lr: 0.002210  min_lr: 0.002210  loss: 3.4783 (3.1195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7579 (0.7890)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [150]  [1000/1251]  eta: 0:00:52  lr: 0.002207  min_lr: 0.002207  loss: 3.1526 (3.1286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8132 (0.8037)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [150]  [1200/1251]  eta: 0:00:10  lr: 0.002203  min_lr: 0.002203  loss: 3.0432 (3.1369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8330 (0.8005)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 3.0564 (3.1339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.7982)  time: 0.1718  data: 0.0011  max mem: 13273
Epoch: [150] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 3.0564 (3.1508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.7982)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7459 (0.7459)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.4533  data: 5.3481  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9379 (0.9160)  acc1: 79.6000 (79.8909)  acc5: 96.4000 (96.0364)  time: 0.7058  data: 0.6122  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1533 (1.1126)  acc1: 74.0000 (76.4381)  acc5: 93.2000 (93.6952)  time: 0.2082  data: 0.1168  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2751 (1.1241)  acc1: 74.0000 (76.3040)  acc5: 92.0000 (93.7280)  time: 0.2087  data: 0.1181  max mem: 13273
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 76.256 Acc@5 93.620 loss 1.122
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.43%
Epoch: [151]  [   0/1251]  eta: 1:02:40  lr: 0.002202  min_lr: 0.002202  loss: 3.1061 (3.1061)  weight_decay: 0.0500 (0.0500)  time: 3.0059  data: 2.4228  max mem: 13273
Epoch: [151]  [ 200/1251]  eta: 0:03:53  lr: 0.002198  min_lr: 0.002198  loss: 3.3897 (3.1221)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [151]  [ 400/1251]  eta: 0:03:01  lr: 0.002195  min_lr: 0.002195  loss: 3.0280 (3.1081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8235 (nan)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [151]  [ 600/1251]  eta: 0:02:17  lr: 0.002191  min_lr: 0.002191  loss: 2.9568 (3.1538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (nan)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [151]  [ 800/1251]  eta: 0:01:34  lr: 0.002188  min_lr: 0.002188  loss: 3.0445 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6930 (nan)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [151]  [1000/1251]  eta: 0:00:52  lr: 0.002184  min_lr: 0.002184  loss: 3.2946 (3.1691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (nan)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [151]  [1200/1251]  eta: 0:00:10  lr: 0.002181  min_lr: 0.002181  loss: 3.1920 (3.1631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7535 (nan)  time: 0.2124  data: 0.0004  max mem: 13273
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 3.5817 (3.1622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8178 (nan)  time: 0.1707  data: 0.0008  max mem: 13273
Epoch: [151] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 3.5817 (3.1584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8178 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7439 (0.7439)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 5.8716  data: 5.7650  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9983 (0.9595)  acc1: 82.4000 (80.8364)  acc5: 96.0000 (95.8545)  time: 0.7480  data: 0.6535  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1834 (1.1587)  acc1: 73.2000 (76.5143)  acc5: 92.4000 (93.6381)  time: 0.2004  data: 0.1086  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2351 (1.1630)  acc1: 74.4000 (76.1760)  acc5: 92.4000 (93.5680)  time: 0.1994  data: 0.1085  max mem: 13273
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 76.304 Acc@5 93.514 loss 1.161
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.43%
Epoch: [152]  [   0/1251]  eta: 1:07:49  lr: 0.002180  min_lr: 0.002180  loss: 2.1059 (2.1059)  weight_decay: 0.0500 (0.0500)  time: 3.2534  data: 2.2914  max mem: 13273
Epoch: [152]  [ 200/1251]  eta: 0:03:55  lr: 0.002176  min_lr: 0.002176  loss: 2.6631 (3.0369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.7780)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [152]  [ 400/1251]  eta: 0:03:02  lr: 0.002173  min_lr: 0.002173  loss: 3.2317 (3.0721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.7418)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [152]  [ 600/1251]  eta: 0:02:17  lr: 0.002169  min_lr: 0.002169  loss: 2.8110 (3.0935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8157 (0.7542)  time: 0.2038  data: 0.0004  max mem: 13273
Epoch: [152]  [ 800/1251]  eta: 0:01:34  lr: 0.002165  min_lr: 0.002165  loss: 3.4630 (3.1196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8200 (0.7751)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [152]  [1000/1251]  eta: 0:00:52  lr: 0.002162  min_lr: 0.002162  loss: 3.1159 (3.1124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.7694)  time: 0.2048  data: 0.0006  max mem: 13273
Epoch: [152]  [1200/1251]  eta: 0:00:10  lr: 0.002158  min_lr: 0.002158  loss: 2.5588 (3.1316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8777 (0.7783)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 3.2183 (3.1367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8777 (0.7839)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [152] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 3.2183 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8777 (0.7839)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7685 (0.7685)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 5.6560  data: 5.5505  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9659 (0.9834)  acc1: 82.0000 (80.8000)  acc5: 96.0000 (95.7818)  time: 0.7487  data: 0.6559  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2040 (1.1665)  acc1: 73.2000 (76.7429)  acc5: 93.2000 (93.6191)  time: 0.2098  data: 0.1186  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2906 (1.1775)  acc1: 73.2000 (76.2880)  acc5: 92.0000 (93.5840)  time: 0.2095  data: 0.1186  max mem: 13273
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 76.132 Acc@5 93.470 loss 1.179
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.43%
Epoch: [153]  [   0/1251]  eta: 1:05:12  lr: 0.002157  min_lr: 0.002157  loss: 2.6016 (2.6016)  weight_decay: 0.0500 (0.0500)  time: 3.1272  data: 1.7984  max mem: 13273
Epoch: [153]  [ 200/1251]  eta: 0:03:53  lr: 0.002154  min_lr: 0.002154  loss: 3.5206 (3.2131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (0.7546)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [153]  [ 400/1251]  eta: 0:03:02  lr: 0.002150  min_lr: 0.002150  loss: 3.1548 (3.1859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.7861)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [153]  [ 600/1251]  eta: 0:02:17  lr: 0.002147  min_lr: 0.002147  loss: 2.9456 (3.1786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.7665)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [153]  [ 800/1251]  eta: 0:01:34  lr: 0.002143  min_lr: 0.002143  loss: 3.6463 (3.1762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8236 (0.7721)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [153]  [1000/1251]  eta: 0:00:52  lr: 0.002139  min_lr: 0.002139  loss: 2.6215 (3.1710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7337 (0.7799)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [153]  [1200/1251]  eta: 0:00:10  lr: 0.002136  min_lr: 0.002136  loss: 3.1056 (3.1644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7464 (0.7805)  time: 0.2068  data: 0.0004  max mem: 13273
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 2.6384 (3.1618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.7797)  time: 0.1710  data: 0.0006  max mem: 13273
Epoch: [153] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 2.6384 (3.1466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.7797)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6914 (0.6914)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 5.8393  data: 5.7323  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8189 (0.8485)  acc1: 82.8000 (81.5273)  acc5: 96.8000 (96.4727)  time: 0.7794  data: 0.6817  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0718 (1.0449)  acc1: 75.6000 (77.3143)  acc5: 93.2000 (94.0762)  time: 0.2186  data: 0.1251  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1618 (1.0610)  acc1: 74.8000 (77.0240)  acc5: 92.8000 (94.0320)  time: 0.2177  data: 0.1250  max mem: 13273
Test: Total time: 0:00:10 (0.4263 s / it)
* Acc@1 76.562 Acc@5 93.782 loss 1.062
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.56%
Epoch: [154]  [   0/1251]  eta: 1:03:52  lr: 0.002135  min_lr: 0.002135  loss: 2.0505 (2.0505)  weight_decay: 0.0500 (0.0500)  time: 3.0639  data: 2.8463  max mem: 13273
Epoch: [154]  [ 200/1251]  eta: 0:03:51  lr: 0.002131  min_lr: 0.002131  loss: 3.2531 (3.1108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7881 (0.7972)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [154]  [ 400/1251]  eta: 0:03:01  lr: 0.002128  min_lr: 0.002128  loss: 2.5475 (3.1427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7826 (0.7725)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [154]  [ 600/1251]  eta: 0:02:17  lr: 0.002124  min_lr: 0.002124  loss: 2.3709 (3.1356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7642 (0.7638)  time: 0.2080  data: 0.0005  max mem: 13273
Epoch: [154]  [ 800/1251]  eta: 0:01:34  lr: 0.002121  min_lr: 0.002121  loss: 2.8131 (3.1396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8293 (0.7791)  time: 0.2044  data: 0.0006  max mem: 13273
Epoch: [154]  [1000/1251]  eta: 0:00:52  lr: 0.002117  min_lr: 0.002117  loss: 3.2045 (3.1493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7462 (0.7815)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [154]  [1200/1251]  eta: 0:00:10  lr: 0.002113  min_lr: 0.002113  loss: 3.8566 (3.1543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7855 (0.7790)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 2.7509 (3.1491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.7758)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [154] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 2.7509 (3.1325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.7758)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6960 (0.6960)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 5.6366  data: 5.5303  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8801 (0.8754)  acc1: 81.6000 (80.2909)  acc5: 96.8000 (96.2545)  time: 0.7053  data: 0.6099  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1012 (1.0868)  acc1: 74.0000 (76.2476)  acc5: 93.6000 (93.5619)  time: 0.1988  data: 0.1064  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1895 (1.0967)  acc1: 74.0000 (75.9360)  acc5: 92.0000 (93.5360)  time: 0.2029  data: 0.1101  max mem: 13273
Test: Total time: 0:00:10 (0.4075 s / it)
* Acc@1 76.496 Acc@5 93.606 loss 1.087
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.56%
Epoch: [155]  [   0/1251]  eta: 1:10:03  lr: 0.002113  min_lr: 0.002113  loss: 3.5262 (3.5262)  weight_decay: 0.0500 (0.0500)  time: 3.3600  data: 1.6236  max mem: 13273
Epoch: [155]  [ 200/1251]  eta: 0:03:55  lr: 0.002109  min_lr: 0.002109  loss: 2.3913 (3.1157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8616 (0.8119)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [155]  [ 400/1251]  eta: 0:03:03  lr: 0.002105  min_lr: 0.002105  loss: 3.5148 (3.1351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7061 (0.7927)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [155]  [ 600/1251]  eta: 0:02:18  lr: 0.002102  min_lr: 0.002102  loss: 2.5381 (3.1370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7206 (0.7887)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [155]  [ 800/1251]  eta: 0:01:35  lr: 0.002098  min_lr: 0.002098  loss: 3.5890 (3.1494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.7770)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [155]  [1000/1251]  eta: 0:00:52  lr: 0.002095  min_lr: 0.002095  loss: 3.6636 (3.1526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7155 (0.7737)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [155]  [1200/1251]  eta: 0:00:10  lr: 0.002091  min_lr: 0.002091  loss: 2.5449 (3.1520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7747 (0.7817)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 2.8441 (3.1516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.7841)  time: 0.1701  data: 0.0008  max mem: 13273
Epoch: [155] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 2.8441 (3.1434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.7841)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.7600 (0.7600)  acc1: 87.2000 (87.2000)  acc5: 97.2000 (97.2000)  time: 5.9591  data: 5.8535  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9249 (0.9069)  acc1: 81.6000 (81.3091)  acc5: 96.8000 (96.2182)  time: 0.6689  data: 0.5755  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1536 (1.0969)  acc1: 74.0000 (76.9333)  acc5: 93.6000 (93.7905)  time: 0.1580  data: 0.0662  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1735 (1.1052)  acc1: 73.6000 (76.6240)  acc5: 93.2000 (93.9040)  time: 0.2154  data: 0.1239  max mem: 13273
Test: Total time: 0:00:10 (0.4284 s / it)
* Acc@1 76.744 Acc@5 93.684 loss 1.106
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.74%
Epoch: [156]  [   0/1251]  eta: 0:52:38  lr: 0.002090  min_lr: 0.002090  loss: 2.2518 (2.2518)  weight_decay: 0.0500 (0.0500)  time: 2.5251  data: 2.2389  max mem: 13273
Epoch: [156]  [ 200/1251]  eta: 0:03:50  lr: 0.002087  min_lr: 0.002087  loss: 3.2652 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7768 (0.7439)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [156]  [ 400/1251]  eta: 0:03:00  lr: 0.002083  min_lr: 0.002083  loss: 2.4588 (3.1918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7449 (0.7744)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [156]  [ 600/1251]  eta: 0:02:16  lr: 0.002079  min_lr: 0.002079  loss: 2.4170 (3.1962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7676 (0.7579)  time: 0.2070  data: 0.0006  max mem: 13273
Epoch: [156]  [ 800/1251]  eta: 0:01:34  lr: 0.002076  min_lr: 0.002076  loss: 3.3290 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9045 (0.7897)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [156]  [1000/1251]  eta: 0:00:52  lr: 0.002072  min_lr: 0.002072  loss: 2.9449 (3.1649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.7918)  time: 0.2039  data: 0.0007  max mem: 13273
Epoch: [156]  [1200/1251]  eta: 0:00:10  lr: 0.002069  min_lr: 0.002069  loss: 3.2763 (3.1669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6258 (0.7785)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 2.9081 (3.1632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6774 (0.7781)  time: 0.1704  data: 0.0013  max mem: 13273
Epoch: [156] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 2.9081 (3.1251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6774 (0.7781)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7882 (0.7882)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.7144  data: 5.5828  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8514 (0.9179)  acc1: 83.2000 (80.9455)  acc5: 97.2000 (96.3636)  time: 0.7145  data: 0.6129  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0896 (1.1072)  acc1: 73.2000 (76.6476)  acc5: 93.2000 (93.8667)  time: 0.1868  data: 0.0923  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2503 (1.1144)  acc1: 73.2000 (76.3360)  acc5: 92.0000 (93.7600)  time: 0.1959  data: 0.1030  max mem: 13273
Test: Total time: 0:00:10 (0.4041 s / it)
* Acc@1 76.664 Acc@5 93.896 loss 1.109
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.74%
Epoch: [157]  [   0/1251]  eta: 0:59:58  lr: 0.002068  min_lr: 0.002068  loss: 3.2876 (3.2876)  weight_decay: 0.0500 (0.0500)  time: 2.8763  data: 1.6551  max mem: 13273
Epoch: [157]  [ 200/1251]  eta: 0:03:52  lr: 0.002064  min_lr: 0.002064  loss: 2.4675 (3.1545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7745 (0.8235)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [157]  [ 400/1251]  eta: 0:03:01  lr: 0.002061  min_lr: 0.002061  loss: 2.8948 (3.1077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7475 (0.8112)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [157]  [ 600/1251]  eta: 0:02:17  lr: 0.002057  min_lr: 0.002057  loss: 2.7606 (3.0545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6834 (0.7976)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [157]  [ 800/1251]  eta: 0:01:34  lr: 0.002053  min_lr: 0.002053  loss: 3.5116 (3.1068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8256 (0.8066)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [157]  [1000/1251]  eta: 0:00:52  lr: 0.002050  min_lr: 0.002050  loss: 3.5412 (3.0899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7560 (0.7913)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [157]  [1200/1251]  eta: 0:00:10  lr: 0.002046  min_lr: 0.002046  loss: 3.7564 (3.1048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7973 (0.7916)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 2.5690 (3.1011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8198 (0.7934)  time: 0.1705  data: 0.0007  max mem: 13273
Epoch: [157] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 2.5690 (3.1268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8198 (0.7934)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7008 (0.7008)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.5869  data: 5.4466  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9750 (0.9357)  acc1: 82.0000 (81.0909)  acc5: 95.6000 (95.8182)  time: 0.7186  data: 0.6163  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1888 (1.1235)  acc1: 75.2000 (76.7048)  acc5: 92.4000 (93.5238)  time: 0.1907  data: 0.0958  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2868 (1.1362)  acc1: 74.0000 (76.3040)  acc5: 92.4000 (93.3440)  time: 0.2009  data: 0.1076  max mem: 13273
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 76.472 Acc@5 93.732 loss 1.130
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.74%
Epoch: [158]  [   0/1251]  eta: 1:06:54  lr: 0.002045  min_lr: 0.002045  loss: 2.4944 (2.4944)  weight_decay: 0.0500 (0.0500)  time: 3.2087  data: 2.9330  max mem: 13273
Epoch: [158]  [ 200/1251]  eta: 0:03:51  lr: 0.002042  min_lr: 0.002042  loss: 3.2521 (3.1719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7399 (0.8040)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [158]  [ 400/1251]  eta: 0:03:01  lr: 0.002038  min_lr: 0.002038  loss: 3.5185 (3.1667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7601 (0.8139)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [158]  [ 600/1251]  eta: 0:02:17  lr: 0.002035  min_lr: 0.002035  loss: 3.3269 (3.1591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6634 (0.7864)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [158]  [ 800/1251]  eta: 0:01:34  lr: 0.002031  min_lr: 0.002031  loss: 3.4805 (3.1777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7768 (0.7906)  time: 0.2138  data: 0.0005  max mem: 13273
Epoch: [158]  [1000/1251]  eta: 0:00:52  lr: 0.002027  min_lr: 0.002027  loss: 3.4276 (3.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7455 (0.7949)  time: 0.2148  data: 0.0005  max mem: 13273
Epoch: [158]  [1200/1251]  eta: 0:00:10  lr: 0.002024  min_lr: 0.002024  loss: 3.6983 (3.1565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8162 (0.8038)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 3.3028 (3.1558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7690 (0.8017)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [158] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 3.3028 (3.1279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7690 (0.8017)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7186 (0.7186)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 5.7726  data: 5.6629  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8560 (0.9059)  acc1: 81.2000 (81.3091)  acc5: 96.0000 (95.8182)  time: 0.7417  data: 0.6455  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1344 (1.0997)  acc1: 75.6000 (77.1238)  acc5: 93.2000 (93.2191)  time: 0.1942  data: 0.1016  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1580 (1.1106)  acc1: 75.2000 (76.9120)  acc5: 91.2000 (93.2320)  time: 0.1937  data: 0.1015  max mem: 13273
Test: Total time: 0:00:10 (0.4042 s / it)
* Acc@1 76.802 Acc@5 93.716 loss 1.110
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.80%
Epoch: [159]  [   0/1251]  eta: 0:57:44  lr: 0.002023  min_lr: 0.002023  loss: 4.4518 (4.4518)  weight_decay: 0.0500 (0.0500)  time: 2.7696  data: 2.5034  max mem: 13273
Epoch: [159]  [ 200/1251]  eta: 0:03:50  lr: 0.002019  min_lr: 0.002019  loss: 2.5982 (3.0116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7581 (0.7731)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [159]  [ 400/1251]  eta: 0:03:00  lr: 0.002016  min_lr: 0.002016  loss: 2.5042 (3.0522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7821)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [159]  [ 600/1251]  eta: 0:02:16  lr: 0.002012  min_lr: 0.002012  loss: 2.5895 (3.0942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8530 (0.8153)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [159]  [ 800/1251]  eta: 0:01:34  lr: 0.002009  min_lr: 0.002009  loss: 2.6872 (3.0976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.8037)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [159]  [1000/1251]  eta: 0:00:52  lr: 0.002005  min_lr: 0.002005  loss: 2.7966 (3.1017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8373 (0.8045)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [159]  [1200/1251]  eta: 0:00:10  lr: 0.002001  min_lr: 0.002001  loss: 3.5183 (3.1092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.8115)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 2.6813 (3.1036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7607 (0.8097)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [159] Total time: 0:04:20 (0.2079 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 2.6813 (3.1313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7607 (0.8097)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7364 (0.7364)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.5012  data: 5.3929  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8958 (0.9033)  acc1: 81.2000 (81.2727)  acc5: 96.0000 (96.0364)  time: 0.6989  data: 0.6010  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1137 (1.0851)  acc1: 76.0000 (77.2952)  acc5: 93.6000 (93.9429)  time: 0.1872  data: 0.0936  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1930 (1.0971)  acc1: 74.8000 (76.9120)  acc5: 92.4000 (93.8240)  time: 0.2163  data: 0.1235  max mem: 13273
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 76.868 Acc@5 93.886 loss 1.100
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.87%
Epoch: [160]  [   0/1251]  eta: 1:07:45  lr: 0.002001  min_lr: 0.002001  loss: 4.6132 (4.6132)  weight_decay: 0.0500 (0.0500)  time: 3.2498  data: 3.0054  max mem: 13273
Epoch: [160]  [ 200/1251]  eta: 0:03:52  lr: 0.001997  min_lr: 0.001997  loss: 2.6542 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6820 (0.7951)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [160]  [ 400/1251]  eta: 0:03:01  lr: 0.001993  min_lr: 0.001993  loss: 3.4335 (3.0984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7129 (0.7674)  time: 0.2124  data: 0.0003  max mem: 13273
Epoch: [160]  [ 600/1251]  eta: 0:02:17  lr: 0.001990  min_lr: 0.001990  loss: 3.4264 (3.0857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7223 (0.7707)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [160]  [ 800/1251]  eta: 0:01:34  lr: 0.001986  min_lr: 0.001986  loss: 2.7838 (3.1205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.7845)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [160]  [1000/1251]  eta: 0:00:52  lr: 0.001983  min_lr: 0.001983  loss: 2.5876 (3.1108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7214 (0.7842)  time: 0.2043  data: 0.0006  max mem: 13273
Epoch: [160]  [1200/1251]  eta: 0:00:10  lr: 0.001979  min_lr: 0.001979  loss: 3.2485 (3.1010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7991 (0.7976)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 2.5423 (3.1046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8038 (0.7988)  time: 0.1708  data: 0.0006  max mem: 13273
Epoch: [160] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 2.5423 (3.1260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8038 (0.7988)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7489 (0.7489)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 5.7047  data: 5.5994  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9319 (0.8818)  acc1: 79.2000 (81.0545)  acc5: 96.8000 (96.2182)  time: 0.7477  data: 0.6511  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0849 (1.0762)  acc1: 73.6000 (76.8762)  acc5: 93.2000 (93.7333)  time: 0.2026  data: 0.1082  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2197 (1.0924)  acc1: 73.6000 (76.4000)  acc5: 92.8000 (93.6960)  time: 0.2024  data: 0.1081  max mem: 13273
Test: Total time: 0:00:10 (0.4078 s / it)
* Acc@1 76.790 Acc@5 93.800 loss 1.093
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.87%
Epoch: [161]  [   0/1251]  eta: 1:06:34  lr: 0.001978  min_lr: 0.001978  loss: 2.6830 (2.6830)  weight_decay: 0.0500 (0.0500)  time: 3.1933  data: 2.4659  max mem: 13273
Epoch: [161]  [ 200/1251]  eta: 0:03:53  lr: 0.001974  min_lr: 0.001974  loss: 3.1316 (3.0673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8806 (0.8213)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [161]  [ 400/1251]  eta: 0:03:02  lr: 0.001971  min_lr: 0.001971  loss: 3.5651 (3.0736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7163 (0.8078)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [161]  [ 600/1251]  eta: 0:02:17  lr: 0.001967  min_lr: 0.001967  loss: 2.7765 (3.0676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7933 (0.8059)  time: 0.2080  data: 0.0004  max mem: 13273
Epoch: [161]  [ 800/1251]  eta: 0:01:34  lr: 0.001964  min_lr: 0.001964  loss: 3.1752 (3.0892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7967)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [161]  [1000/1251]  eta: 0:00:52  lr: 0.001960  min_lr: 0.001960  loss: 2.9772 (3.0916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.7943)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [161]  [1200/1251]  eta: 0:00:10  lr: 0.001956  min_lr: 0.001956  loss: 3.1831 (3.1091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8323 (0.7995)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 2.7842 (3.1101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8480 (0.8009)  time: 0.1705  data: 0.0012  max mem: 13273
Epoch: [161] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 2.7842 (3.1159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8480 (0.8009)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7406 (0.7406)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.7794  data: 5.6721  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9437 (0.9273)  acc1: 82.4000 (80.4727)  acc5: 96.8000 (96.1455)  time: 0.7120  data: 0.6155  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1539 (1.1150)  acc1: 73.6000 (76.3810)  acc5: 92.8000 (93.8667)  time: 0.1905  data: 0.0970  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1605 (1.1264)  acc1: 73.6000 (76.0320)  acc5: 92.0000 (93.7120)  time: 0.1928  data: 0.1000  max mem: 13273
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 76.664 Acc@5 93.692 loss 1.121
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.87%
Epoch: [162]  [   0/1251]  eta: 1:04:58  lr: 0.001956  min_lr: 0.001956  loss: 2.2724 (2.2724)  weight_decay: 0.0500 (0.0500)  time: 3.1166  data: 1.6109  max mem: 13273
Epoch: [162]  [ 200/1251]  eta: 0:03:56  lr: 0.001952  min_lr: 0.001952  loss: 2.6459 (3.0699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.8386)  time: 0.2105  data: 0.0004  max mem: 13273
Epoch: [162]  [ 400/1251]  eta: 0:03:03  lr: 0.001948  min_lr: 0.001948  loss: 2.8773 (3.0744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.8051)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [162]  [ 600/1251]  eta: 0:02:18  lr: 0.001945  min_lr: 0.001945  loss: 2.6143 (3.0548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8942 (0.8324)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [162]  [ 800/1251]  eta: 0:01:35  lr: 0.001941  min_lr: 0.001941  loss: 2.7292 (3.0653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8962 (0.8269)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [162]  [1000/1251]  eta: 0:00:52  lr: 0.001938  min_lr: 0.001938  loss: 2.9855 (3.0840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.8150)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [162]  [1200/1251]  eta: 0:00:10  lr: 0.001934  min_lr: 0.001934  loss: 3.7272 (3.1016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8649 (0.8123)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 2.3851 (3.0968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8325 (0.8112)  time: 0.1703  data: 0.0007  max mem: 13273
Epoch: [162] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 2.3851 (3.1226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8325 (0.8112)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7240 (0.7240)  acc1: 87.2000 (87.2000)  acc5: 96.8000 (96.8000)  time: 5.4704  data: 5.3537  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8509 (0.8975)  acc1: 82.0000 (81.1273)  acc5: 96.4000 (95.9636)  time: 0.7119  data: 0.6159  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1334 (1.0788)  acc1: 75.2000 (77.3905)  acc5: 92.8000 (93.9048)  time: 0.1927  data: 0.1006  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1381 (1.0900)  acc1: 74.0000 (76.8640)  acc5: 92.4000 (93.7760)  time: 0.2072  data: 0.1156  max mem: 13273
Test: Total time: 0:00:10 (0.4030 s / it)
* Acc@1 76.930 Acc@5 93.868 loss 1.083
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.93%
Epoch: [163]  [   0/1251]  eta: 0:56:30  lr: 0.001933  min_lr: 0.001933  loss: 4.0885 (4.0885)  weight_decay: 0.0500 (0.0500)  time: 2.7101  data: 2.0029  max mem: 13273
Epoch: [163]  [ 200/1251]  eta: 0:03:50  lr: 0.001930  min_lr: 0.001930  loss: 2.5728 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.8144)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [163]  [ 400/1251]  eta: 0:03:00  lr: 0.001926  min_lr: 0.001926  loss: 2.3685 (3.0698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7959 (0.7963)  time: 0.2071  data: 0.0004  max mem: 13273
Epoch: [163]  [ 600/1251]  eta: 0:02:16  lr: 0.001922  min_lr: 0.001922  loss: 3.1349 (3.0576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7317 (0.7943)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [163]  [ 800/1251]  eta: 0:01:34  lr: 0.001919  min_lr: 0.001919  loss: 2.6494 (3.0582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.7854)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [163]  [1000/1251]  eta: 0:00:52  lr: 0.001915  min_lr: 0.001915  loss: 3.2441 (3.0748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.7943)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [163]  [1200/1251]  eta: 0:00:10  lr: 0.001912  min_lr: 0.001912  loss: 2.4936 (3.0773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8106)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 3.2688 (3.0869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8302 (0.8111)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [163] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 3.2688 (3.1094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8302 (0.8111)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.7448 (0.7448)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 4.2840  data: 4.1735  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.9151 (0.9684)  acc1: 83.2000 (80.8000)  acc5: 96.8000 (96.0364)  time: 0.6574  data: 0.5644  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2422 (1.1485)  acc1: 72.4000 (76.6095)  acc5: 93.2000 (93.7905)  time: 0.2344  data: 0.1436  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2422 (1.1595)  acc1: 74.0000 (76.2400)  acc5: 92.0000 (93.5680)  time: 0.2063  data: 0.1155  max mem: 13273
Test: Total time: 0:00:10 (0.4109 s / it)
* Acc@1 76.776 Acc@5 93.734 loss 1.155
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.93%
Epoch: [164]  [   0/1251]  eta: 1:06:28  lr: 0.001911  min_lr: 0.001911  loss: 2.3747 (2.3747)  weight_decay: 0.0500 (0.0500)  time: 3.1882  data: 2.6975  max mem: 13273
Epoch: [164]  [ 200/1251]  eta: 0:03:52  lr: 0.001907  min_lr: 0.001907  loss: 2.4337 (3.0692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (0.8339)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [164]  [ 400/1251]  eta: 0:03:01  lr: 0.001904  min_lr: 0.001904  loss: 2.7561 (3.0998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.8289)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [164]  [ 600/1251]  eta: 0:02:17  lr: 0.001900  min_lr: 0.001900  loss: 3.6071 (3.1125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (0.8225)  time: 0.2080  data: 0.0006  max mem: 13273
Epoch: [164]  [ 800/1251]  eta: 0:01:34  lr: 0.001896  min_lr: 0.001896  loss: 2.9289 (3.0968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (0.8143)  time: 0.2071  data: 0.0006  max mem: 13273
Epoch: [164]  [1000/1251]  eta: 0:00:52  lr: 0.001893  min_lr: 0.001893  loss: 3.6168 (3.1137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6437 (0.8064)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [164]  [1200/1251]  eta: 0:00:10  lr: 0.001889  min_lr: 0.001889  loss: 2.5795 (3.1113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7934 (0.8060)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 2.9990 (3.1038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8045 (0.8056)  time: 0.1708  data: 0.0006  max mem: 13273
Epoch: [164] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 2.9990 (3.1057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8045 (0.8056)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6868 (0.6868)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.6976  data: 5.5910  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8678 (0.8931)  acc1: 82.0000 (81.3091)  acc5: 96.4000 (95.8909)  time: 0.7489  data: 0.6545  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1366 (1.0717)  acc1: 76.0000 (77.3714)  acc5: 93.2000 (93.5619)  time: 0.2210  data: 0.1293  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1721 (1.0877)  acc1: 75.6000 (76.9120)  acc5: 92.4000 (93.4560)  time: 0.2198  data: 0.1292  max mem: 13273
Test: Total time: 0:00:10 (0.4223 s / it)
* Acc@1 77.164 Acc@5 93.928 loss 1.084
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.16%
Epoch: [165]  [   0/1251]  eta: 0:59:27  lr: 0.001888  min_lr: 0.001888  loss: 4.2687 (4.2687)  weight_decay: 0.0500 (0.0500)  time: 2.8520  data: 2.5829  max mem: 13273
Epoch: [165]  [ 200/1251]  eta: 0:03:51  lr: 0.001885  min_lr: 0.001885  loss: 3.5133 (3.1344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8245 (0.8606)  time: 0.2056  data: 0.0007  max mem: 13273
Epoch: [165]  [ 400/1251]  eta: 0:03:01  lr: 0.001881  min_lr: 0.001881  loss: 3.4697 (3.0976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7599 (0.8403)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [165]  [ 600/1251]  eta: 0:02:17  lr: 0.001878  min_lr: 0.001878  loss: 3.2073 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.8407)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [165]  [ 800/1251]  eta: 0:01:35  lr: 0.001874  min_lr: 0.001874  loss: 2.6802 (3.0808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7847 (0.8164)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [165]  [1000/1251]  eta: 0:00:52  lr: 0.001870  min_lr: 0.001870  loss: 2.6945 (3.0887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.8177)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [165]  [1200/1251]  eta: 0:00:10  lr: 0.001867  min_lr: 0.001867  loss: 2.9400 (3.0808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8050 (0.8182)  time: 0.2074  data: 0.0007  max mem: 13273
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 2.9490 (3.0820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.8202)  time: 0.1706  data: 0.0007  max mem: 13273
Epoch: [165] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 2.9490 (3.0985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.8202)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6867 (0.6867)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.5722  data: 5.4651  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8461 (0.8809)  acc1: 83.2000 (81.2364)  acc5: 97.6000 (96.2545)  time: 0.7264  data: 0.6307  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0941 (1.0808)  acc1: 74.0000 (77.0476)  acc5: 92.8000 (93.7333)  time: 0.1999  data: 0.1070  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2278 (1.0957)  acc1: 73.2000 (76.6560)  acc5: 92.8000 (93.7920)  time: 0.1998  data: 0.1069  max mem: 13273
Test: Total time: 0:00:10 (0.4003 s / it)
* Acc@1 76.852 Acc@5 93.898 loss 1.092
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.16%
Epoch: [166]  [   0/1251]  eta: 0:59:09  lr: 0.001866  min_lr: 0.001866  loss: 3.7615 (3.7615)  weight_decay: 0.0500 (0.0500)  time: 2.8376  data: 2.2371  max mem: 13273
Epoch: [166]  [ 200/1251]  eta: 0:03:53  lr: 0.001862  min_lr: 0.001862  loss: 3.3351 (3.1139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.8289)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [166]  [ 400/1251]  eta: 0:03:02  lr: 0.001859  min_lr: 0.001859  loss: 3.4291 (3.1470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7729 (0.8195)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [166]  [ 600/1251]  eta: 0:02:17  lr: 0.001855  min_lr: 0.001855  loss: 3.0313 (3.1497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.8095)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [166]  [ 800/1251]  eta: 0:01:34  lr: 0.001852  min_lr: 0.001852  loss: 3.3382 (3.1498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7610 (inf)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [166]  [1000/1251]  eta: 0:00:52  lr: 0.001848  min_lr: 0.001848  loss: 2.4258 (3.1119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7514 (inf)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [166]  [1200/1251]  eta: 0:00:10  lr: 0.001844  min_lr: 0.001844  loss: 2.7811 (3.1142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (inf)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 2.4338 (3.1069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8531 (inf)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [166] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 2.4338 (3.1073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8531 (inf)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7595 (0.7595)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.6749  data: 5.5689  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9061 (0.9142)  acc1: 83.6000 (81.2000)  acc5: 96.8000 (95.9636)  time: 0.7587  data: 0.6619  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1508 (1.0761)  acc1: 73.6000 (77.1619)  acc5: 92.4000 (93.7524)  time: 0.2092  data: 0.1157  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1508 (1.0825)  acc1: 73.6000 (76.8320)  acc5: 92.0000 (93.7760)  time: 0.2140  data: 0.1211  max mem: 13273
Test: Total time: 0:00:10 (0.4168 s / it)
* Acc@1 77.304 Acc@5 93.944 loss 1.074
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.30%
Epoch: [167]  [   0/1251]  eta: 0:58:37  lr: 0.001844  min_lr: 0.001844  loss: 2.3348 (2.3348)  weight_decay: 0.0500 (0.0500)  time: 2.8117  data: 2.5064  max mem: 13273
Epoch: [167]  [ 200/1251]  eta: 0:03:49  lr: 0.001840  min_lr: 0.001840  loss: 2.8475 (3.0451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7155 (0.8756)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [167]  [ 400/1251]  eta: 0:03:00  lr: 0.001836  min_lr: 0.001836  loss: 2.5940 (3.0258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8797 (0.8538)  time: 0.2079  data: 0.0006  max mem: 13273
Epoch: [167]  [ 600/1251]  eta: 0:02:16  lr: 0.001833  min_lr: 0.001833  loss: 3.0894 (3.0518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7922 (0.8402)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [167]  [ 800/1251]  eta: 0:01:34  lr: 0.001829  min_lr: 0.001829  loss: 2.8193 (3.0674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8410 (0.8319)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [167]  [1000/1251]  eta: 0:00:52  lr: 0.001826  min_lr: 0.001826  loss: 2.5297 (3.0734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8270 (0.8387)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [167]  [1200/1251]  eta: 0:00:10  lr: 0.001822  min_lr: 0.001822  loss: 3.5891 (3.0704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9339 (0.8344)  time: 0.2072  data: 0.0006  max mem: 13273
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 2.9716 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9004 (0.8342)  time: 0.1705  data: 0.0007  max mem: 13273
Epoch: [167] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 2.9716 (3.0979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9004 (0.8342)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.7676 (0.7676)  acc1: 86.0000 (86.0000)  acc5: 96.0000 (96.0000)  time: 5.0792  data: 4.9725  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8961 (0.9141)  acc1: 79.6000 (81.3091)  acc5: 96.4000 (95.8909)  time: 0.6981  data: 0.6015  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1453 (1.1102)  acc1: 74.8000 (76.9905)  acc5: 92.4000 (93.6000)  time: 0.2118  data: 0.1187  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2521 (1.1179)  acc1: 74.0000 (76.6240)  acc5: 92.0000 (93.6160)  time: 0.2250  data: 0.1322  max mem: 13273
Test: Total time: 0:00:10 (0.4009 s / it)
* Acc@1 76.874 Acc@5 93.794 loss 1.111
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.30%
Epoch: [168]  [   0/1251]  eta: 1:07:57  lr: 0.001821  min_lr: 0.001821  loss: 2.9330 (2.9330)  weight_decay: 0.0500 (0.0500)  time: 3.2597  data: 1.5508  max mem: 13273
Epoch: [168]  [ 200/1251]  eta: 0:03:52  lr: 0.001818  min_lr: 0.001818  loss: 3.7378 (3.1175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7987 (0.8415)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [168]  [ 400/1251]  eta: 0:03:01  lr: 0.001814  min_lr: 0.001814  loss: 2.6835 (3.1028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7321 (0.7951)  time: 0.2054  data: 0.0010  max mem: 13273
Epoch: [168]  [ 600/1251]  eta: 0:02:17  lr: 0.001811  min_lr: 0.001811  loss: 3.2045 (3.1281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8690 (0.7991)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [168]  [ 800/1251]  eta: 0:01:34  lr: 0.001807  min_lr: 0.001807  loss: 3.6992 (3.1304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.7967)  time: 0.2048  data: 0.0003  max mem: 13273
Epoch: [168]  [1000/1251]  eta: 0:00:52  lr: 0.001803  min_lr: 0.001803  loss: 2.6702 (3.1333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8093 (0.8016)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [168]  [1200/1251]  eta: 0:00:10  lr: 0.001800  min_lr: 0.001800  loss: 3.4222 (3.1349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.8002)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 2.6960 (3.1338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7404 (0.7963)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [168] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 2.6960 (3.1131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7404 (0.7963)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7266 (0.7266)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.5520  data: 5.4443  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9186 (0.8949)  acc1: 82.4000 (81.5636)  acc5: 96.0000 (96.2909)  time: 0.6806  data: 0.5866  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1176 (1.0973)  acc1: 74.4000 (77.0476)  acc5: 93.6000 (93.9238)  time: 0.1834  data: 0.0915  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2453 (1.1045)  acc1: 74.4000 (76.6560)  acc5: 92.4000 (93.9360)  time: 0.1994  data: 0.1081  max mem: 13273
Test: Total time: 0:00:10 (0.4004 s / it)
* Acc@1 77.002 Acc@5 93.766 loss 1.103
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.30%
Epoch: [169]  [   0/1251]  eta: 1:04:41  lr: 0.001799  min_lr: 0.001799  loss: 3.0304 (3.0304)  weight_decay: 0.0500 (0.0500)  time: 3.1024  data: 2.8241  max mem: 13273
Epoch: [169]  [ 200/1251]  eta: 0:03:54  lr: 0.001795  min_lr: 0.001795  loss: 3.3032 (3.1076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8158 (1.0216)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [169]  [ 400/1251]  eta: 0:03:02  lr: 0.001792  min_lr: 0.001792  loss: 2.6633 (3.1192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.8954)  time: 0.2083  data: 0.0005  max mem: 13273
Epoch: [169]  [ 600/1251]  eta: 0:02:17  lr: 0.001788  min_lr: 0.001788  loss: 3.7486 (3.1296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.8654)  time: 0.2033  data: 0.0005  max mem: 13273
Epoch: [169]  [ 800/1251]  eta: 0:01:34  lr: 0.001785  min_lr: 0.001785  loss: 3.2987 (3.1332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7772 (0.8448)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [169]  [1000/1251]  eta: 0:00:52  lr: 0.001781  min_lr: 0.001781  loss: 3.2309 (3.1293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8443 (0.8450)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [169]  [1200/1251]  eta: 0:00:10  lr: 0.001777  min_lr: 0.001777  loss: 2.9831 (3.1058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.8489)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.4938 (3.1121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7514 (0.8468)  time: 0.1699  data: 0.0010  max mem: 13273
Epoch: [169] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.4938 (3.1090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7514 (0.8468)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8257 (0.8257)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 5.5278  data: 5.4212  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8846 (0.9606)  acc1: 83.6000 (80.9091)  acc5: 96.4000 (96.1818)  time: 0.7237  data: 0.6266  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1768 (1.1279)  acc1: 75.6000 (77.0857)  acc5: 93.6000 (93.8857)  time: 0.2037  data: 0.1084  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2180 (1.1340)  acc1: 75.6000 (76.8640)  acc5: 93.2000 (93.8400)  time: 0.2157  data: 0.1213  max mem: 13273
Test: Total time: 0:00:10 (0.4121 s / it)
* Acc@1 77.122 Acc@5 93.960 loss 1.131
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.30%
Epoch: [170]  [   0/1251]  eta: 1:01:18  lr: 0.001777  min_lr: 0.001777  loss: 3.8715 (3.8715)  weight_decay: 0.0500 (0.0500)  time: 2.9401  data: 1.8468  max mem: 13273
Epoch: [170]  [ 200/1251]  eta: 0:03:52  lr: 0.001773  min_lr: 0.001773  loss: 3.1808 (3.0494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7850 (0.8079)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [170]  [ 400/1251]  eta: 0:03:01  lr: 0.001769  min_lr: 0.001769  loss: 3.1578 (3.0946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9696 (0.8574)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [170]  [ 600/1251]  eta: 0:02:17  lr: 0.001766  min_lr: 0.001766  loss: 2.8844 (3.0786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7153 (0.8477)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [170]  [ 800/1251]  eta: 0:01:34  lr: 0.001762  min_lr: 0.001762  loss: 3.0666 (3.0933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7965 (0.8354)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [170]  [1000/1251]  eta: 0:00:52  lr: 0.001759  min_lr: 0.001759  loss: 3.0433 (3.1047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.8287)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [170]  [1200/1251]  eta: 0:00:10  lr: 0.001755  min_lr: 0.001755  loss: 2.9473 (3.1140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7569 (0.8160)  time: 0.2073  data: 0.0004  max mem: 13273
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 3.7510 (3.1208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7973 (0.8168)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [170] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 3.7510 (3.0920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7973 (0.8168)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8258 (0.8258)  acc1: 86.0000 (86.0000)  acc5: 96.4000 (96.4000)  time: 5.6318  data: 5.5233  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9076 (0.9538)  acc1: 84.0000 (82.0727)  acc5: 96.8000 (96.2545)  time: 0.7206  data: 0.6261  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1644 (1.1375)  acc1: 75.6000 (77.7333)  acc5: 93.6000 (93.9810)  time: 0.1880  data: 0.0960  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2477 (1.1501)  acc1: 73.2000 (77.1520)  acc5: 92.4000 (93.8560)  time: 0.1963  data: 0.1045  max mem: 13273
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 77.122 Acc@5 93.948 loss 1.150
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.30%
Epoch: [171]  [   0/1251]  eta: 0:59:21  lr: 0.001754  min_lr: 0.001754  loss: 3.4398 (3.4398)  weight_decay: 0.0500 (0.0500)  time: 2.8467  data: 2.5558  max mem: 13273
Epoch: [171]  [ 200/1251]  eta: 0:03:52  lr: 0.001751  min_lr: 0.001751  loss: 2.5950 (3.1361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.8152)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [171]  [ 400/1251]  eta: 0:03:01  lr: 0.001747  min_lr: 0.001747  loss: 3.5473 (3.1242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8151 (0.8233)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [171]  [ 600/1251]  eta: 0:02:17  lr: 0.001744  min_lr: 0.001744  loss: 3.4851 (3.1360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (0.8158)  time: 0.2083  data: 0.0005  max mem: 13273
Epoch: [171]  [ 800/1251]  eta: 0:01:34  lr: 0.001740  min_lr: 0.001740  loss: 2.5928 (3.1333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8005 (0.8404)  time: 0.2068  data: 0.0004  max mem: 13273
Epoch: [171]  [1000/1251]  eta: 0:00:52  lr: 0.001737  min_lr: 0.001737  loss: 2.5945 (3.1222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8076 (0.8424)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [171]  [1200/1251]  eta: 0:00:10  lr: 0.001733  min_lr: 0.001733  loss: 3.3091 (3.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7810 (0.8353)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 2.8185 (3.1117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8533 (0.8364)  time: 0.1697  data: 0.0006  max mem: 13273
Epoch: [171] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 2.8185 (3.0917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8533 (0.8364)
Test:  [ 0/25]  eta: 0:01:28  loss: 0.7091 (0.7091)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 3.5448  data: 3.4363  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.8526 (0.8488)  acc1: 83.2000 (81.9273)  acc5: 96.8000 (96.6545)  time: 0.6011  data: 0.5029  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0876 (1.0399)  acc1: 75.2000 (77.7524)  acc5: 92.8000 (94.0381)  time: 0.2774  data: 0.1835  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1794 (1.0532)  acc1: 75.2000 (77.3440)  acc5: 92.4000 (93.9840)  time: 0.2128  data: 0.1207  max mem: 13273
Test: Total time: 0:00:10 (0.4113 s / it)
* Acc@1 77.210 Acc@5 94.094 loss 1.054
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.30%
Epoch: [172]  [   0/1251]  eta: 0:56:51  lr: 0.001732  min_lr: 0.001732  loss: 3.7571 (3.7571)  weight_decay: 0.0500 (0.0500)  time: 2.7267  data: 2.3785  max mem: 13273
Epoch: [172]  [ 200/1251]  eta: 0:03:54  lr: 0.001729  min_lr: 0.001729  loss: 3.2515 (3.1100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7577 (0.8072)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [172]  [ 400/1251]  eta: 0:03:03  lr: 0.001725  min_lr: 0.001725  loss: 2.6290 (3.0652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7416 (0.8057)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [172]  [ 600/1251]  eta: 0:02:18  lr: 0.001721  min_lr: 0.001721  loss: 3.1928 (3.0734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7979)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [172]  [ 800/1251]  eta: 0:01:35  lr: 0.001718  min_lr: 0.001718  loss: 3.7265 (3.0730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8688 (0.8102)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [172]  [1000/1251]  eta: 0:00:52  lr: 0.001714  min_lr: 0.001714  loss: 3.5794 (3.0842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (0.8131)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [172]  [1200/1251]  eta: 0:00:10  lr: 0.001711  min_lr: 0.001711  loss: 2.8195 (3.0797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7799 (0.8233)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 2.7821 (3.0785)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8222)  time: 0.1705  data: 0.0008  max mem: 13273
Epoch: [172] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 2.7821 (3.0698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.8222)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6797 (0.6797)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.7180  data: 5.6124  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8427 (0.8759)  acc1: 82.8000 (81.7455)  acc5: 96.4000 (96.4727)  time: 0.7651  data: 0.6664  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0608 (1.0553)  acc1: 76.8000 (77.6952)  acc5: 93.6000 (93.9810)  time: 0.2136  data: 0.1184  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1669 (1.0654)  acc1: 76.8000 (77.2960)  acc5: 93.2000 (93.8560)  time: 0.2124  data: 0.1183  max mem: 13273
Test: Total time: 0:00:10 (0.4180 s / it)
* Acc@1 77.404 Acc@5 94.030 loss 1.063
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.40%
Epoch: [173]  [   0/1251]  eta: 1:01:41  lr: 0.001710  min_lr: 0.001710  loss: 3.1836 (3.1836)  weight_decay: 0.0500 (0.0500)  time: 2.9588  data: 2.7169  max mem: 13273
Epoch: [173]  [ 200/1251]  eta: 0:03:50  lr: 0.001706  min_lr: 0.001706  loss: 2.7012 (3.0096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7510 (0.7856)  time: 0.2047  data: 0.0008  max mem: 13273
Epoch: [173]  [ 400/1251]  eta: 0:03:00  lr: 0.001703  min_lr: 0.001703  loss: 3.3110 (3.0743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7274 (0.7935)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [173]  [ 600/1251]  eta: 0:02:16  lr: 0.001699  min_lr: 0.001699  loss: 3.3522 (3.1101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (0.8293)  time: 0.2069  data: 0.0006  max mem: 13273
Epoch: [173]  [ 800/1251]  eta: 0:01:34  lr: 0.001696  min_lr: 0.001696  loss: 2.4756 (3.1078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.8180)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [173]  [1000/1251]  eta: 0:00:52  lr: 0.001692  min_lr: 0.001692  loss: 3.6226 (3.1220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.8148)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [173]  [1200/1251]  eta: 0:00:10  lr: 0.001689  min_lr: 0.001689  loss: 3.5024 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9008 (0.8232)  time: 0.2061  data: 0.0003  max mem: 13273
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 2.4923 (3.1154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8428 (0.8224)  time: 0.1709  data: 0.0005  max mem: 13273
Epoch: [173] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 2.4923 (3.1000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8428 (0.8224)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6951 (0.6951)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.7983  data: 5.6931  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8763 (0.8976)  acc1: 82.0000 (81.5636)  acc5: 96.4000 (96.0727)  time: 0.7681  data: 0.6730  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1170 (1.0771)  acc1: 75.6000 (77.5429)  acc5: 94.0000 (93.8857)  time: 0.1968  data: 0.1046  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2290 (1.0891)  acc1: 74.8000 (77.2160)  acc5: 92.4000 (93.8080)  time: 0.1960  data: 0.1046  max mem: 13273
Test: Total time: 0:00:10 (0.4072 s / it)
* Acc@1 77.272 Acc@5 94.006 loss 1.082
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.40%
Epoch: [174]  [   0/1251]  eta: 1:07:57  lr: 0.001688  min_lr: 0.001688  loss: 2.1701 (2.1701)  weight_decay: 0.0500 (0.0500)  time: 3.2594  data: 2.9993  max mem: 13273
Epoch: [174]  [ 200/1251]  eta: 0:03:53  lr: 0.001684  min_lr: 0.001684  loss: 2.4585 (3.0587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8487 (0.8319)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [174]  [ 400/1251]  eta: 0:03:01  lr: 0.001681  min_lr: 0.001681  loss: 2.8982 (3.0780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8053 (0.8378)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [174]  [ 600/1251]  eta: 0:02:17  lr: 0.001677  min_lr: 0.001677  loss: 2.4957 (3.0678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7770 (0.8325)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [174]  [ 800/1251]  eta: 0:01:34  lr: 0.001674  min_lr: 0.001674  loss: 2.5896 (3.0717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.8351)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [174]  [1000/1251]  eta: 0:00:52  lr: 0.001670  min_lr: 0.001670  loss: 2.5272 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7721 (0.8289)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [174]  [1200/1251]  eta: 0:00:10  lr: 0.001666  min_lr: 0.001666  loss: 2.9627 (3.0817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (0.8325)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.6437 (3.0813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8409 (0.8332)  time: 0.1714  data: 0.0008  max mem: 13273
Epoch: [174] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.6437 (3.0776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8409 (0.8332)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6909 (0.6909)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.2412  data: 5.1315  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8474 (0.8899)  acc1: 82.4000 (81.6364)  acc5: 96.4000 (96.6909)  time: 0.7623  data: 0.6647  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1138 (1.0850)  acc1: 76.0000 (77.6000)  acc5: 94.0000 (94.1905)  time: 0.2196  data: 0.1256  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1641 (1.0938)  acc1: 74.8000 (77.0880)  acc5: 92.8000 (94.2560)  time: 0.2188  data: 0.1255  max mem: 13273
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 77.530 Acc@5 94.086 loss 1.086
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.53%
Epoch: [175]  [   0/1251]  eta: 1:02:49  lr: 0.001666  min_lr: 0.001666  loss: 2.2610 (2.2610)  weight_decay: 0.0500 (0.0500)  time: 3.0128  data: 2.7721  max mem: 13273
Epoch: [175]  [ 200/1251]  eta: 0:03:51  lr: 0.001662  min_lr: 0.001662  loss: 3.4751 (3.0652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8593 (0.8404)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [175]  [ 400/1251]  eta: 0:03:00  lr: 0.001658  min_lr: 0.001658  loss: 2.7957 (3.0768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8772 (0.8386)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [175]  [ 600/1251]  eta: 0:02:17  lr: 0.001655  min_lr: 0.001655  loss: 3.1231 (3.1197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8045 (0.8259)  time: 0.2154  data: 0.0005  max mem: 13273
Epoch: [175]  [ 800/1251]  eta: 0:01:34  lr: 0.001651  min_lr: 0.001651  loss: 2.4239 (3.1113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8157 (0.8363)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [175]  [1000/1251]  eta: 0:00:52  lr: 0.001648  min_lr: 0.001648  loss: 2.3580 (3.0852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9098 (0.8388)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [175]  [1200/1251]  eta: 0:00:10  lr: 0.001644  min_lr: 0.001644  loss: 3.5445 (3.0787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7552 (0.8336)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 2.7898 (3.0789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8060 (0.8341)  time: 0.1700  data: 0.0007  max mem: 13273
Epoch: [175] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 2.7898 (3.0671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8060 (0.8341)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6637 (0.6637)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6651  data: 5.5287  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8908 (0.8838)  acc1: 83.6000 (81.5273)  acc5: 96.0000 (96.2546)  time: 0.6725  data: 0.5704  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0986 (1.0686)  acc1: 74.0000 (77.3333)  acc5: 93.6000 (94.0952)  time: 0.1718  data: 0.0772  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2104 (1.0771)  acc1: 74.0000 (76.8480)  acc5: 92.4000 (93.8720)  time: 0.1996  data: 0.1069  max mem: 13273
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 77.384 Acc@5 94.082 loss 1.072
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.53%
Epoch: [176]  [   0/1251]  eta: 1:05:43  lr: 0.001643  min_lr: 0.001643  loss: 3.8212 (3.8212)  weight_decay: 0.0500 (0.0500)  time: 3.1525  data: 1.6152  max mem: 13273
Epoch: [176]  [ 200/1251]  eta: 0:03:52  lr: 0.001640  min_lr: 0.001640  loss: 2.4056 (2.9792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9602 (0.9519)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [176]  [ 400/1251]  eta: 0:03:01  lr: 0.001636  min_lr: 0.001636  loss: 2.6959 (3.0529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7238 (0.9087)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [176]  [ 600/1251]  eta: 0:02:17  lr: 0.001633  min_lr: 0.001633  loss: 2.8972 (3.0591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8025 (0.8755)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [176]  [ 800/1251]  eta: 0:01:34  lr: 0.001629  min_lr: 0.001629  loss: 3.0252 (3.0822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.8641)  time: 0.2042  data: 0.0006  max mem: 13273
Epoch: [176]  [1000/1251]  eta: 0:00:52  lr: 0.001626  min_lr: 0.001626  loss: 3.4609 (3.0767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7862 (0.8605)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [176]  [1200/1251]  eta: 0:00:10  lr: 0.001622  min_lr: 0.001622  loss: 3.0421 (3.0730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8173 (0.8631)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 3.0458 (3.0713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7938 (0.8611)  time: 0.1707  data: 0.0007  max mem: 13273
Epoch: [176] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 3.0458 (3.0720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7938 (0.8611)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6794 (0.6794)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.3382  data: 5.2329  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9018 (0.9096)  acc1: 83.2000 (81.4909)  acc5: 96.8000 (96.4364)  time: 0.7527  data: 0.6525  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1137 (1.0898)  acc1: 74.4000 (77.4857)  acc5: 92.8000 (93.8476)  time: 0.2218  data: 0.1252  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2126 (1.0965)  acc1: 73.6000 (77.0400)  acc5: 92.4000 (93.7440)  time: 0.2201  data: 0.1251  max mem: 13273
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 77.516 Acc@5 94.096 loss 1.092
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.53%
Epoch: [177]  [   0/1251]  eta: 1:07:37  lr: 0.001621  min_lr: 0.001621  loss: 3.7441 (3.7441)  weight_decay: 0.0500 (0.0500)  time: 3.2432  data: 2.3614  max mem: 13273
Epoch: [177]  [ 200/1251]  eta: 0:03:53  lr: 0.001618  min_lr: 0.001618  loss: 3.6430 (3.0279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9703 (0.8897)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [177]  [ 400/1251]  eta: 0:03:01  lr: 0.001614  min_lr: 0.001614  loss: 2.5888 (3.0631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8273 (0.9085)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [177]  [ 600/1251]  eta: 0:02:17  lr: 0.001611  min_lr: 0.001611  loss: 3.6251 (3.0918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8380 (inf)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [177]  [ 800/1251]  eta: 0:01:34  lr: 0.001607  min_lr: 0.001607  loss: 2.4945 (3.0796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (inf)  time: 0.2079  data: 0.0006  max mem: 13273
Epoch: [177]  [1000/1251]  eta: 0:00:52  lr: 0.001604  min_lr: 0.001604  loss: 2.9395 (3.0853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8047 (inf)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [177]  [1200/1251]  eta: 0:00:10  lr: 0.001600  min_lr: 0.001600  loss: 3.1190 (3.0779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8108 (inf)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 2.9358 (3.0741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (inf)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [177] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 2.9358 (3.0675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (inf)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7176 (0.7176)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.8752  data: 5.7685  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8409 (0.8763)  acc1: 82.0000 (81.6000)  acc5: 96.8000 (96.4364)  time: 0.7345  data: 0.6355  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0946 (1.0573)  acc1: 74.8000 (77.6000)  acc5: 93.6000 (93.9810)  time: 0.1861  data: 0.0918  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0961 (1.0639)  acc1: 74.4000 (77.0720)  acc5: 92.8000 (93.8720)  time: 0.1902  data: 0.0973  max mem: 13273
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 77.318 Acc@5 94.040 loss 1.058
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.53%
Epoch: [178]  [   0/1251]  eta: 1:07:25  lr: 0.001599  min_lr: 0.001599  loss: 3.9527 (3.9527)  weight_decay: 0.0500 (0.0500)  time: 3.2335  data: 2.9782  max mem: 13273
Epoch: [178]  [ 200/1251]  eta: 0:03:53  lr: 0.001596  min_lr: 0.001596  loss: 3.4901 (3.1426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8585)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [178]  [ 400/1251]  eta: 0:03:02  lr: 0.001592  min_lr: 0.001592  loss: 2.8301 (3.1068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.8671)  time: 0.2088  data: 0.0005  max mem: 13273
Epoch: [178]  [ 600/1251]  eta: 0:02:17  lr: 0.001589  min_lr: 0.001589  loss: 2.8544 (3.0776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6924 (0.8515)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [178]  [ 800/1251]  eta: 0:01:34  lr: 0.001585  min_lr: 0.001585  loss: 2.8104 (3.0505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8644)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [178]  [1000/1251]  eta: 0:00:52  lr: 0.001582  min_lr: 0.001582  loss: 2.7429 (3.0582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8611 (0.8706)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [178]  [1200/1251]  eta: 0:00:10  lr: 0.001578  min_lr: 0.001578  loss: 3.3059 (3.0635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (0.8594)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 2.6712 (3.0556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.8565)  time: 0.1706  data: 0.0007  max mem: 13273
Epoch: [178] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 2.6712 (3.0627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.8565)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6479 (0.6479)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.5974  data: 5.4613  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8660 (0.8662)  acc1: 83.6000 (81.7818)  acc5: 96.4000 (96.4727)  time: 0.7475  data: 0.6455  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0764 (1.0571)  acc1: 75.6000 (77.4476)  acc5: 93.6000 (94.1143)  time: 0.2031  data: 0.1086  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1492 (1.0623)  acc1: 74.4000 (77.1360)  acc5: 93.2000 (94.0480)  time: 0.2019  data: 0.1086  max mem: 13273
Test: Total time: 0:00:10 (0.4051 s / it)
* Acc@1 77.702 Acc@5 94.186 loss 1.055
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.70%
Epoch: [179]  [   0/1251]  eta: 1:06:15  lr: 0.001577  min_lr: 0.001577  loss: 3.8774 (3.8774)  weight_decay: 0.0500 (0.0500)  time: 3.1778  data: 2.9606  max mem: 13273
Epoch: [179]  [ 200/1251]  eta: 0:03:54  lr: 0.001574  min_lr: 0.001574  loss: 3.5491 (3.0115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7568 (0.7872)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [179]  [ 400/1251]  eta: 0:03:02  lr: 0.001570  min_lr: 0.001570  loss: 3.6327 (3.0496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8090 (0.8278)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [179]  [ 600/1251]  eta: 0:02:17  lr: 0.001567  min_lr: 0.001567  loss: 2.7226 (3.0457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.8127)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [179]  [ 800/1251]  eta: 0:01:34  lr: 0.001563  min_lr: 0.001563  loss: 2.7113 (3.0368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8248 (0.8261)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [179]  [1000/1251]  eta: 0:00:52  lr: 0.001560  min_lr: 0.001560  loss: 3.0742 (3.0729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7438 (0.8216)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [179]  [1200/1251]  eta: 0:00:10  lr: 0.001556  min_lr: 0.001556  loss: 2.6787 (3.0839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8406 (0.8321)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 2.4714 (3.0782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8315)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [179] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 2.4714 (3.0579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8315)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6472 (0.6472)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.5298  data: 5.4229  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8546 (0.8364)  acc1: 81.2000 (81.4909)  acc5: 96.8000 (96.5455)  time: 0.7375  data: 0.6427  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0818 (1.0132)  acc1: 74.8000 (77.5238)  acc5: 93.6000 (94.3048)  time: 0.2025  data: 0.1067  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0818 (1.0212)  acc1: 74.4000 (77.2640)  acc5: 93.2000 (94.2240)  time: 0.2050  data: 0.1095  max mem: 13273
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 77.678 Acc@5 94.190 loss 1.014
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.70%
Epoch: [180]  [   0/1251]  eta: 1:02:35  lr: 0.001556  min_lr: 0.001556  loss: 3.7709 (3.7709)  weight_decay: 0.0500 (0.0500)  time: 3.0018  data: 2.4825  max mem: 13273
Epoch: [180]  [ 200/1251]  eta: 0:03:53  lr: 0.001552  min_lr: 0.001552  loss: 3.2772 (3.0986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.8781)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [180]  [ 400/1251]  eta: 0:03:01  lr: 0.001549  min_lr: 0.001549  loss: 3.4540 (3.0982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8635 (0.8673)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [180]  [ 600/1251]  eta: 0:02:16  lr: 0.001545  min_lr: 0.001545  loss: 2.7483 (3.0748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7597 (0.8447)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [180]  [ 800/1251]  eta: 0:01:34  lr: 0.001542  min_lr: 0.001542  loss: 2.9493 (3.0705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.8472)  time: 0.2074  data: 0.0005  max mem: 13273
Epoch: [180]  [1000/1251]  eta: 0:00:52  lr: 0.001538  min_lr: 0.001538  loss: 2.8528 (3.0848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8157 (0.8622)  time: 0.2059  data: 0.0007  max mem: 13273
Epoch: [180]  [1200/1251]  eta: 0:00:10  lr: 0.001535  min_lr: 0.001535  loss: 2.7854 (3.0690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8216 (0.8631)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 2.6411 (3.0618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7618 (0.8594)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [180] Total time: 0:04:19 (0.2076 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 2.6411 (3.0511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7618 (0.8594)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7341 (0.7341)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.6201  data: 5.5136  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8788 (0.8784)  acc1: 84.4000 (82.0364)  acc5: 96.4000 (96.2909)  time: 0.7519  data: 0.6559  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1292 (1.0689)  acc1: 76.4000 (77.5619)  acc5: 93.6000 (94.0000)  time: 0.2093  data: 0.1153  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1383 (1.0798)  acc1: 74.8000 (77.1520)  acc5: 92.8000 (93.9040)  time: 0.2081  data: 0.1152  max mem: 13273
Test: Total time: 0:00:10 (0.4101 s / it)
* Acc@1 77.672 Acc@5 94.098 loss 1.075
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.70%
Epoch: [181]  [   0/1251]  eta: 1:06:07  lr: 0.001534  min_lr: 0.001534  loss: 2.1797 (2.1797)  weight_decay: 0.0500 (0.0500)  time: 3.1716  data: 1.8594  max mem: 13273
Epoch: [181]  [ 200/1251]  eta: 0:03:53  lr: 0.001530  min_lr: 0.001530  loss: 2.6855 (3.0739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8987 (0.8204)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [181]  [ 400/1251]  eta: 0:03:02  lr: 0.001527  min_lr: 0.001527  loss: 2.8367 (3.0231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8430 (0.8364)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [181]  [ 600/1251]  eta: 0:02:17  lr: 0.001523  min_lr: 0.001523  loss: 2.8700 (3.0256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8094 (0.8379)  time: 0.2070  data: 0.0010  max mem: 13273
Epoch: [181]  [ 800/1251]  eta: 0:01:34  lr: 0.001520  min_lr: 0.001520  loss: 3.0495 (3.0289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8820 (0.8467)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [181]  [1000/1251]  eta: 0:00:52  lr: 0.001516  min_lr: 0.001516  loss: 3.3519 (3.0487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8413 (0.8521)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [181]  [1200/1251]  eta: 0:00:10  lr: 0.001513  min_lr: 0.001513  loss: 3.0264 (3.0533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.8494)  time: 0.2078  data: 0.0005  max mem: 13273
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 3.5963 (3.0597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.8458)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [181] Total time: 0:04:21 (0.2086 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 3.5963 (3.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.8458)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7361 (0.7361)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.6209  data: 5.5139  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8761 (0.9053)  acc1: 82.0000 (81.7455)  acc5: 96.4000 (96.2909)  time: 0.6981  data: 0.5996  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1467 (1.0999)  acc1: 74.4000 (77.7524)  acc5: 94.0000 (94.2286)  time: 0.1820  data: 0.0876  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1879 (1.1131)  acc1: 74.4000 (77.2480)  acc5: 92.8000 (94.0960)  time: 0.2015  data: 0.1083  max mem: 13273
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 77.658 Acc@5 94.100 loss 1.109
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.70%
Epoch: [182]  [   0/1251]  eta: 1:03:48  lr: 0.001512  min_lr: 0.001512  loss: 3.5557 (3.5557)  weight_decay: 0.0500 (0.0500)  time: 3.0603  data: 2.2490  max mem: 13273
Epoch: [182]  [ 200/1251]  eta: 0:03:53  lr: 0.001508  min_lr: 0.001508  loss: 2.5593 (3.0516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7928 (0.8139)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [182]  [ 400/1251]  eta: 0:03:02  lr: 0.001505  min_lr: 0.001505  loss: 2.4265 (3.0378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.8465)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [182]  [ 600/1251]  eta: 0:02:18  lr: 0.001501  min_lr: 0.001501  loss: 3.0630 (3.0365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8522 (0.8596)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [182]  [ 800/1251]  eta: 0:01:35  lr: 0.001498  min_lr: 0.001498  loss: 2.3562 (3.0487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9211 (0.8702)  time: 0.2081  data: 0.0008  max mem: 13273
Epoch: [182]  [1000/1251]  eta: 0:00:52  lr: 0.001495  min_lr: 0.001495  loss: 2.6320 (3.0461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8236 (0.8715)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [182]  [1200/1251]  eta: 0:00:10  lr: 0.001491  min_lr: 0.001491  loss: 2.5754 (3.0452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8743 (0.8680)  time: 0.2075  data: 0.0005  max mem: 13273
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 3.0215 (3.0461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8994 (0.8689)  time: 0.1710  data: 0.0008  max mem: 13273
Epoch: [182] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 3.0215 (3.0569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8994 (0.8689)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7121 (0.7121)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.6651  data: 5.5209  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8603 (0.8846)  acc1: 81.2000 (81.2364)  acc5: 96.4000 (96.0364)  time: 0.7613  data: 0.6587  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0883 (1.0559)  acc1: 76.0000 (77.6762)  acc5: 93.6000 (93.8857)  time: 0.2069  data: 0.1124  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1561 (1.0586)  acc1: 74.8000 (77.2480)  acc5: 93.2000 (94.0000)  time: 0.2127  data: 0.1201  max mem: 13273
Test: Total time: 0:00:10 (0.4157 s / it)
* Acc@1 77.560 Acc@5 94.134 loss 1.055
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.70%
Epoch: [183]  [   0/1251]  eta: 1:03:54  lr: 0.001490  min_lr: 0.001490  loss: 2.4210 (2.4210)  weight_decay: 0.0500 (0.0500)  time: 3.0654  data: 2.2212  max mem: 13273
Epoch: [183]  [ 200/1251]  eta: 0:03:52  lr: 0.001487  min_lr: 0.001487  loss: 2.6366 (3.0612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (0.9466)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [183]  [ 400/1251]  eta: 0:03:01  lr: 0.001483  min_lr: 0.001483  loss: 3.1272 (3.0142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9074 (0.9096)  time: 0.2056  data: 0.0003  max mem: 13273
Epoch: [183]  [ 600/1251]  eta: 0:02:17  lr: 0.001480  min_lr: 0.001480  loss: 2.3918 (3.0134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7481 (0.8742)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [183]  [ 800/1251]  eta: 0:01:34  lr: 0.001476  min_lr: 0.001476  loss: 2.7138 (3.0138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8468 (0.8787)  time: 0.2061  data: 0.0003  max mem: 13273
Epoch: [183]  [1000/1251]  eta: 0:00:52  lr: 0.001473  min_lr: 0.001473  loss: 3.2578 (3.0242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7469 (0.8677)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [183]  [1200/1251]  eta: 0:00:10  lr: 0.001469  min_lr: 0.001469  loss: 2.5204 (3.0255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7991 (0.8618)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 3.0441 (3.0252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.8616)  time: 0.1698  data: 0.0006  max mem: 13273
Epoch: [183] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 3.0441 (3.0457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.8616)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6940 (0.6940)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.7189  data: 5.6124  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.9129 (0.8918)  acc1: 83.2000 (82.2909)  acc5: 96.8000 (96.3636)  time: 0.7095  data: 0.6140  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1146 (1.0773)  acc1: 76.0000 (77.6381)  acc5: 94.4000 (93.9429)  time: 0.1826  data: 0.0902  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1578 (1.0833)  acc1: 76.0000 (77.3280)  acc5: 93.2000 (93.9040)  time: 0.1950  data: 0.1030  max mem: 13273
Test: Total time: 0:00:10 (0.4030 s / it)
* Acc@1 77.708 Acc@5 94.144 loss 1.075
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.71%
Epoch: [184]  [   0/1251]  eta: 1:01:27  lr: 0.001469  min_lr: 0.001469  loss: 3.8193 (3.8193)  weight_decay: 0.0500 (0.0500)  time: 2.9477  data: 2.6680  max mem: 13273
Epoch: [184]  [ 200/1251]  eta: 0:03:51  lr: 0.001465  min_lr: 0.001465  loss: 2.9930 (2.9226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.9244)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [184]  [ 400/1251]  eta: 0:03:00  lr: 0.001462  min_lr: 0.001462  loss: 2.7206 (2.9892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8522 (0.9008)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [184]  [ 600/1251]  eta: 0:02:16  lr: 0.001458  min_lr: 0.001458  loss: 2.5574 (3.0025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7822 (0.8653)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [184]  [ 800/1251]  eta: 0:01:34  lr: 0.001455  min_lr: 0.001455  loss: 2.3371 (3.0222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9751 (0.8711)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [184]  [1000/1251]  eta: 0:00:52  lr: 0.001451  min_lr: 0.001451  loss: 2.4369 (3.0262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8491 (0.8636)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [184]  [1200/1251]  eta: 0:00:10  lr: 0.001448  min_lr: 0.001448  loss: 2.7182 (3.0383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7991 (0.8558)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 2.5435 (3.0315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8687 (0.8586)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [184] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 2.5435 (3.0382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8687 (0.8586)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6498 (0.6498)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.3924  data: 5.2842  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8536 (0.8647)  acc1: 84.0000 (82.2909)  acc5: 96.4000 (96.5455)  time: 0.7151  data: 0.6172  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0745 (1.0540)  acc1: 74.8000 (77.8095)  acc5: 94.4000 (94.3048)  time: 0.2181  data: 0.1245  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1114 (1.0614)  acc1: 74.8000 (77.3760)  acc5: 93.6000 (94.2240)  time: 0.2208  data: 0.1281  max mem: 13273
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 77.726 Acc@5 94.252 loss 1.055
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.73%
Epoch: [185]  [   0/1251]  eta: 1:07:10  lr: 0.001447  min_lr: 0.001447  loss: 2.2122 (2.2122)  weight_decay: 0.0500 (0.0500)  time: 3.2218  data: 3.0011  max mem: 13273
Epoch: [185]  [ 200/1251]  eta: 0:03:51  lr: 0.001444  min_lr: 0.001444  loss: 2.8718 (2.9877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.9002)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [185]  [ 400/1251]  eta: 0:03:01  lr: 0.001440  min_lr: 0.001440  loss: 2.3568 (3.0306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.8930)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [185]  [ 600/1251]  eta: 0:02:17  lr: 0.001437  min_lr: 0.001437  loss: 3.3420 (3.0263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8753 (0.8815)  time: 0.2161  data: 0.0005  max mem: 13273
Epoch: [185]  [ 800/1251]  eta: 0:01:34  lr: 0.001433  min_lr: 0.001433  loss: 2.7926 (3.0483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9150 (0.8984)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [185]  [1000/1251]  eta: 0:00:52  lr: 0.001430  min_lr: 0.001430  loss: 3.0134 (3.0540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7620 (0.8781)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [185]  [1200/1251]  eta: 0:00:10  lr: 0.001426  min_lr: 0.001426  loss: 2.3802 (3.0398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.8803)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 2.9956 (3.0427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.8786)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [185] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 2.9956 (3.0424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.8786)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7087 (0.7087)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.2320  data: 5.1153  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8881 (0.8745)  acc1: 82.0000 (81.5273)  acc5: 96.8000 (96.6545)  time: 0.7133  data: 0.6167  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0581 (1.0689)  acc1: 75.2000 (77.3905)  acc5: 94.4000 (94.2286)  time: 0.2083  data: 0.1158  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1617 (1.0762)  acc1: 74.8000 (77.0880)  acc5: 93.2000 (94.2080)  time: 0.2073  data: 0.1157  max mem: 13273
Test: Total time: 0:00:09 (0.3977 s / it)
* Acc@1 77.714 Acc@5 94.280 loss 1.071
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.73%
Epoch: [186]  [   0/1251]  eta: 1:06:02  lr: 0.001425  min_lr: 0.001425  loss: 3.7214 (3.7214)  weight_decay: 0.0500 (0.0500)  time: 3.1678  data: 1.6342  max mem: 13273
Epoch: [186]  [ 200/1251]  eta: 0:03:52  lr: 0.001422  min_lr: 0.001422  loss: 3.2831 (3.0077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9235 (0.9187)  time: 0.2078  data: 0.0004  max mem: 13273
Epoch: [186]  [ 400/1251]  eta: 0:03:01  lr: 0.001419  min_lr: 0.001419  loss: 2.3858 (2.9545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8417 (0.8926)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [186]  [ 600/1251]  eta: 0:02:17  lr: 0.001415  min_lr: 0.001415  loss: 2.5925 (2.9885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (0.8983)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [186]  [ 800/1251]  eta: 0:01:34  lr: 0.001412  min_lr: 0.001412  loss: 2.6734 (3.0129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8060 (0.8780)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [186]  [1000/1251]  eta: 0:00:52  lr: 0.001408  min_lr: 0.001408  loss: 3.1412 (3.0434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7955 (0.8651)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [186]  [1200/1251]  eta: 0:00:10  lr: 0.001405  min_lr: 0.001405  loss: 2.5294 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8401 (0.8626)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 3.5559 (3.0435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8034 (0.8596)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [186] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 3.5559 (3.0372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8034 (0.8596)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8345 (0.8345)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.4608  data: 5.3419  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9349 (0.9641)  acc1: 84.8000 (81.6364)  acc5: 97.2000 (96.6546)  time: 0.7397  data: 0.6390  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.2187 (1.1513)  acc1: 75.2000 (77.9810)  acc5: 93.2000 (94.2286)  time: 0.2138  data: 0.1186  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2441 (1.1609)  acc1: 75.2000 (77.5040)  acc5: 92.8000 (94.0960)  time: 0.2117  data: 0.1185  max mem: 13273
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 77.520 Acc@5 94.332 loss 1.155
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.73%
Epoch: [187]  [   0/1251]  eta: 1:06:51  lr: 0.001404  min_lr: 0.001404  loss: 2.0771 (2.0771)  weight_decay: 0.0500 (0.0500)  time: 3.2066  data: 2.9825  max mem: 13273
Epoch: [187]  [ 200/1251]  eta: 0:03:52  lr: 0.001401  min_lr: 0.001401  loss: 2.4715 (2.9715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.8907)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [187]  [ 400/1251]  eta: 0:03:01  lr: 0.001397  min_lr: 0.001397  loss: 2.4473 (3.0106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8696 (0.8818)  time: 0.2040  data: 0.0005  max mem: 13273
Epoch: [187]  [ 600/1251]  eta: 0:02:16  lr: 0.001394  min_lr: 0.001394  loss: 3.2466 (3.0099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0402 (0.9043)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [187]  [ 800/1251]  eta: 0:01:34  lr: 0.001390  min_lr: 0.001390  loss: 2.7505 (3.0184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8601 (0.9067)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [187]  [1000/1251]  eta: 0:00:52  lr: 0.001387  min_lr: 0.001387  loss: 3.6238 (3.0396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9426 (0.9058)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [187]  [1200/1251]  eta: 0:00:10  lr: 0.001383  min_lr: 0.001383  loss: 3.2294 (3.0452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8335 (0.9028)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 2.9529 (3.0421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.9009)  time: 0.1703  data: 0.0007  max mem: 13273
Epoch: [187] Total time: 0:04:19 (0.2075 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 2.9529 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.9009)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7407 (0.7407)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.5752  data: 5.4701  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8537 (0.9009)  acc1: 84.0000 (82.7273)  acc5: 96.8000 (96.6545)  time: 0.7375  data: 0.6391  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1013 (1.0970)  acc1: 76.4000 (78.1333)  acc5: 94.4000 (94.4762)  time: 0.2017  data: 0.1077  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.2192 (1.1031)  acc1: 75.2000 (77.6320)  acc5: 93.2000 (94.3680)  time: 0.2003  data: 0.1076  max mem: 13273
Test: Total time: 0:00:10 (0.4020 s / it)
* Acc@1 77.974 Acc@5 94.294 loss 1.098
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 77.97%
Epoch: [188]  [   0/1251]  eta: 0:58:39  lr: 0.001383  min_lr: 0.001383  loss: 2.1198 (2.1198)  weight_decay: 0.0500 (0.0500)  time: 2.8130  data: 2.5673  max mem: 13273
Epoch: [188]  [ 200/1251]  eta: 0:03:50  lr: 0.001379  min_lr: 0.001379  loss: 2.9394 (3.1071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8514 (0.8916)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [188]  [ 400/1251]  eta: 0:03:00  lr: 0.001376  min_lr: 0.001376  loss: 3.6041 (3.1122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (0.8774)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [188]  [ 600/1251]  eta: 0:02:16  lr: 0.001372  min_lr: 0.001372  loss: 3.2783 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.8640)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [188]  [ 800/1251]  eta: 0:01:34  lr: 0.001369  min_lr: 0.001369  loss: 2.3841 (3.0766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8209 (0.8577)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [188]  [1000/1251]  eta: 0:00:52  lr: 0.001366  min_lr: 0.001366  loss: 2.5504 (3.0709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8342 (0.8651)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [188]  [1200/1251]  eta: 0:00:10  lr: 0.001362  min_lr: 0.001362  loss: 2.7239 (3.0711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8577 (0.8679)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 3.1191 (3.0743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8624 (0.8704)  time: 0.1703  data: 0.0007  max mem: 13273
Epoch: [188] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 3.1191 (3.0309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8624 (0.8704)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.7183 (0.7183)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 5.9497  data: 5.8437  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8851 (0.8848)  acc1: 84.4000 (82.1091)  acc5: 96.8000 (96.6182)  time: 0.7228  data: 0.6278  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1074 (1.0498)  acc1: 75.2000 (78.2095)  acc5: 94.0000 (94.5714)  time: 0.1835  data: 0.0913  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1493 (1.0642)  acc1: 75.2000 (77.7760)  acc5: 93.2000 (94.4000)  time: 0.1861  data: 0.0948  max mem: 13273
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 78.024 Acc@5 94.374 loss 1.056
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.02%
Epoch: [189]  [   0/1251]  eta: 1:04:23  lr: 0.001361  min_lr: 0.001361  loss: 3.0853 (3.0853)  weight_decay: 0.0500 (0.0500)  time: 3.0881  data: 2.8643  max mem: 13273
Epoch: [189]  [ 200/1251]  eta: 0:03:53  lr: 0.001358  min_lr: 0.001358  loss: 2.3858 (2.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7733 (0.9281)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [189]  [ 400/1251]  eta: 0:03:02  lr: 0.001355  min_lr: 0.001355  loss: 2.7121 (2.9997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8514 (0.8990)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [189]  [ 600/1251]  eta: 0:02:18  lr: 0.001351  min_lr: 0.001351  loss: 2.7567 (2.9934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8028 (0.9099)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [189]  [ 800/1251]  eta: 0:01:34  lr: 0.001348  min_lr: 0.001348  loss: 3.0839 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.8835)  time: 0.2069  data: 0.0007  max mem: 13273
Epoch: [189]  [1000/1251]  eta: 0:00:52  lr: 0.001344  min_lr: 0.001344  loss: 3.4752 (3.0376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8779)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [189]  [1200/1251]  eta: 0:00:10  lr: 0.001341  min_lr: 0.001341  loss: 2.4778 (3.0308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9148 (0.8831)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 2.7799 (3.0236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8872 (0.8865)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [189] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 2.7799 (3.0277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8872 (0.8865)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6337 (0.6337)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.4524  data: 5.3444  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8437 (0.8715)  acc1: 82.4000 (82.2909)  acc5: 96.4000 (96.6546)  time: 0.7753  data: 0.6752  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0952 (1.0701)  acc1: 75.6000 (77.9810)  acc5: 94.4000 (94.3810)  time: 0.2343  data: 0.1381  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1814 (1.0830)  acc1: 75.6000 (77.6000)  acc5: 93.6000 (94.1760)  time: 0.2326  data: 0.1381  max mem: 13273
Test: Total time: 0:00:10 (0.4231 s / it)
* Acc@1 77.894 Acc@5 94.252 loss 1.088
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 78.02%
Epoch: [190]  [   0/1251]  eta: 1:06:59  lr: 0.001340  min_lr: 0.001340  loss: 3.4444 (3.4444)  weight_decay: 0.0500 (0.0500)  time: 3.2128  data: 1.6842  max mem: 13273
Epoch: [190]  [ 200/1251]  eta: 0:03:52  lr: 0.001337  min_lr: 0.001337  loss: 2.4152 (3.0213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8428 (1.0030)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [190]  [ 400/1251]  eta: 0:03:01  lr: 0.001333  min_lr: 0.001333  loss: 2.8943 (2.9984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8663 (0.9262)  time: 0.2060  data: 0.0008  max mem: 13273
Epoch: [190]  [ 600/1251]  eta: 0:02:17  lr: 0.001330  min_lr: 0.001330  loss: 2.3863 (3.0149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9104 (0.8961)  time: 0.2062  data: 0.0007  max mem: 13273
Epoch: [190]  [ 800/1251]  eta: 0:01:34  lr: 0.001327  min_lr: 0.001327  loss: 2.6504 (3.0100)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2051  data: 0.0010  max mem: 13273
Epoch: [190]  [1000/1251]  eta: 0:00:52  lr: 0.001323  min_lr: 0.001323  loss: 2.9163 (2.9964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8992 (nan)  time: 0.2052  data: 0.0007  max mem: 13273
Epoch: [190]  [1200/1251]  eta: 0:00:10  lr: 0.001320  min_lr: 0.001320  loss: 2.5125 (2.9976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8322 (nan)  time: 0.2050  data: 0.0009  max mem: 13273
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 3.2653 (3.0069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8583 (nan)  time: 0.1708  data: 0.0006  max mem: 13273
Epoch: [190] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 3.2653 (3.0190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8583 (nan)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7392 (0.7392)  acc1: 86.8000 (86.8000)  acc5: 99.2000 (99.2000)  time: 5.7820  data: 5.6731  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8888 (0.9119)  acc1: 82.0000 (81.8909)  acc5: 96.4000 (96.3636)  time: 0.7204  data: 0.6243  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1404 (1.0920)  acc1: 75.2000 (78.1714)  acc5: 94.4000 (94.1143)  time: 0.1850  data: 0.0924  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1672 (1.1018)  acc1: 75.2000 (77.8400)  acc5: 93.6000 (94.0960)  time: 0.1897  data: 0.0976  max mem: 13273
Test: Total time: 0:00:10 (0.4026 s / it)
* Acc@1 77.866 Acc@5 94.252 loss 1.097
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 78.02%
Epoch: [191]  [   0/1251]  eta: 1:07:09  lr: 0.001319  min_lr: 0.001319  loss: 3.6377 (3.6377)  weight_decay: 0.0500 (0.0500)  time: 3.2208  data: 1.8760  max mem: 13273
Epoch: [191]  [ 200/1251]  eta: 0:03:54  lr: 0.001316  min_lr: 0.001316  loss: 3.0587 (3.0402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (0.8444)  time: 0.2071  data: 0.0004  max mem: 13273
Epoch: [191]  [ 400/1251]  eta: 0:03:02  lr: 0.001312  min_lr: 0.001312  loss: 2.5186 (3.0253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.8493)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [191]  [ 600/1251]  eta: 0:02:17  lr: 0.001309  min_lr: 0.001309  loss: 2.6735 (3.0424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.8646)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [191]  [ 800/1251]  eta: 0:01:34  lr: 0.001305  min_lr: 0.001305  loss: 2.7184 (3.0462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9511 (0.8965)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [191]  [1000/1251]  eta: 0:00:52  lr: 0.001302  min_lr: 0.001302  loss: 3.1868 (3.0356)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2056  data: 0.0003  max mem: 13273
Epoch: [191]  [1200/1251]  eta: 0:00:10  lr: 0.001299  min_lr: 0.001299  loss: 2.6146 (3.0224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9466 (nan)  time: 0.2054  data: 0.0003  max mem: 13273
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 2.7761 (3.0209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8594 (nan)  time: 0.1710  data: 0.0007  max mem: 13273
Epoch: [191] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 2.7761 (3.0184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8594 (nan)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.7003 (0.7003)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.0979  data: 4.9900  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8607 (0.8590)  acc1: 83.6000 (82.0000)  acc5: 96.8000 (96.6182)  time: 0.6884  data: 0.5939  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0621 (1.0314)  acc1: 76.0000 (78.3619)  acc5: 94.8000 (94.5333)  time: 0.2074  data: 0.1143  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0845 (1.0452)  acc1: 77.2000 (78.0800)  acc5: 93.6000 (94.4960)  time: 0.2080  data: 0.1152  max mem: 13273
Test: Total time: 0:00:09 (0.3920 s / it)
* Acc@1 78.022 Acc@5 94.410 loss 1.043
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.02%
Epoch: [192]  [   0/1251]  eta: 1:04:17  lr: 0.001298  min_lr: 0.001298  loss: 3.4024 (3.4024)  weight_decay: 0.0500 (0.0500)  time: 3.0834  data: 2.3641  max mem: 13273
Epoch: [192]  [ 200/1251]  eta: 0:03:51  lr: 0.001295  min_lr: 0.001295  loss: 2.3506 (2.9901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8434 (0.8833)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [192]  [ 400/1251]  eta: 0:03:01  lr: 0.001291  min_lr: 0.001291  loss: 2.8586 (3.0292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8323 (0.8692)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [192]  [ 600/1251]  eta: 0:02:17  lr: 0.001288  min_lr: 0.001288  loss: 3.3930 (3.0488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8519 (0.8745)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [192]  [ 800/1251]  eta: 0:01:34  lr: 0.001284  min_lr: 0.001284  loss: 3.2363 (3.0466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8555 (0.8838)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [192]  [1000/1251]  eta: 0:00:52  lr: 0.001281  min_lr: 0.001281  loss: 2.8336 (3.0385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8535 (0.8739)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [192]  [1200/1251]  eta: 0:00:10  lr: 0.001278  min_lr: 0.001278  loss: 3.0551 (3.0405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8462 (0.8739)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 2.4908 (3.0378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8462 (0.8716)  time: 0.1703  data: 0.0005  max mem: 13273
Epoch: [192] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 2.4908 (3.0265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8462 (0.8716)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7799 (0.7799)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.4598  data: 5.3475  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8653 (0.8765)  acc1: 82.8000 (81.9273)  acc5: 97.2000 (96.9091)  time: 0.7356  data: 0.6345  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0599 (1.0577)  acc1: 76.0000 (78.2476)  acc5: 93.6000 (94.4571)  time: 0.2046  data: 0.1084  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1826 (1.0656)  acc1: 76.0000 (77.9680)  acc5: 93.6000 (94.4480)  time: 0.2091  data: 0.1150  max mem: 13273
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 78.190 Acc@5 94.378 loss 1.051
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.19%
Epoch: [193]  [   0/1251]  eta: 1:07:34  lr: 0.001277  min_lr: 0.001277  loss: 3.4600 (3.4600)  weight_decay: 0.0500 (0.0500)  time: 3.2409  data: 3.0132  max mem: 13273
Epoch: [193]  [ 200/1251]  eta: 0:03:52  lr: 0.001274  min_lr: 0.001274  loss: 2.8609 (2.9505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9230 (0.9031)  time: 0.2058  data: 0.0003  max mem: 13273
Epoch: [193]  [ 400/1251]  eta: 0:03:01  lr: 0.001270  min_lr: 0.001270  loss: 2.7963 (2.9527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7628 (0.8608)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [193]  [ 600/1251]  eta: 0:02:17  lr: 0.001267  min_lr: 0.001267  loss: 2.7732 (2.9791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7819 (0.8746)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [193]  [ 800/1251]  eta: 0:01:34  lr: 0.001264  min_lr: 0.001264  loss: 2.4355 (2.9772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9313 (0.8832)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [193]  [1000/1251]  eta: 0:00:52  lr: 0.001260  min_lr: 0.001260  loss: 2.4487 (2.9801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9373 (0.8986)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [193]  [1200/1251]  eta: 0:00:10  lr: 0.001257  min_lr: 0.001257  loss: 2.3782 (2.9805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8742 (0.8942)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 3.1798 (2.9793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8865 (0.8945)  time: 0.1701  data: 0.0006  max mem: 13273
Epoch: [193] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 3.1798 (2.9950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8865 (0.8945)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6849 (0.6849)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 5.5393  data: 5.4330  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8558 (0.8712)  acc1: 84.0000 (81.9273)  acc5: 97.2000 (96.5455)  time: 0.7314  data: 0.6335  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0759 (1.0603)  acc1: 76.0000 (78.1524)  acc5: 94.0000 (94.4381)  time: 0.2050  data: 0.1112  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1794 (1.0696)  acc1: 75.6000 (77.8240)  acc5: 93.6000 (94.4320)  time: 0.2038  data: 0.1111  max mem: 13273
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 78.242 Acc@5 94.502 loss 1.061
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.24%
Epoch: [194]  [   0/1251]  eta: 0:57:56  lr: 0.001256  min_lr: 0.001256  loss: 2.1372 (2.1372)  weight_decay: 0.0500 (0.0500)  time: 2.7791  data: 2.5019  max mem: 13273
Epoch: [194]  [ 200/1251]  eta: 0:03:51  lr: 0.001253  min_lr: 0.001253  loss: 3.2050 (3.0553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9280 (0.9204)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [194]  [ 400/1251]  eta: 0:03:01  lr: 0.001249  min_lr: 0.001249  loss: 3.2279 (3.0398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8396 (0.9185)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [194]  [ 600/1251]  eta: 0:02:17  lr: 0.001246  min_lr: 0.001246  loss: 2.8478 (3.0229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8762 (0.9073)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [194]  [ 800/1251]  eta: 0:01:34  lr: 0.001243  min_lr: 0.001243  loss: 2.3432 (3.0102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8632 (0.8949)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [194]  [1000/1251]  eta: 0:00:52  lr: 0.001239  min_lr: 0.001239  loss: 3.4717 (3.0132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9011 (0.8912)  time: 0.2038  data: 0.0004  max mem: 13273
Epoch: [194]  [1200/1251]  eta: 0:00:10  lr: 0.001236  min_lr: 0.001236  loss: 2.5739 (3.0131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8665 (0.8811)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.3862 (3.0185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9326 (0.8817)  time: 0.1711  data: 0.0005  max mem: 13273
Epoch: [194] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.3862 (3.0135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9326 (0.8817)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7289 (0.7289)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.6185  data: 5.5134  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8828 (0.9060)  acc1: 84.8000 (82.6909)  acc5: 96.8000 (96.7273)  time: 0.7012  data: 0.6055  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1671 (1.1130)  acc1: 76.0000 (78.0571)  acc5: 93.6000 (94.4571)  time: 0.1927  data: 0.1001  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1790 (1.1221)  acc1: 76.0000 (77.8560)  acc5: 93.6000 (94.4960)  time: 0.2097  data: 0.1170  max mem: 13273
Test: Total time: 0:00:10 (0.4105 s / it)
* Acc@1 78.242 Acc@5 94.456 loss 1.115
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.24%
Epoch: [195]  [   0/1251]  eta: 1:09:50  lr: 0.001235  min_lr: 0.001235  loss: 2.2824 (2.2824)  weight_decay: 0.0500 (0.0500)  time: 3.3498  data: 2.3246  max mem: 13273
Epoch: [195]  [ 200/1251]  eta: 0:03:54  lr: 0.001232  min_lr: 0.001232  loss: 2.2652 (3.0057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8576 (0.9301)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [195]  [ 400/1251]  eta: 0:03:02  lr: 0.001229  min_lr: 0.001229  loss: 3.2276 (2.9947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9729 (0.9206)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [195]  [ 600/1251]  eta: 0:02:17  lr: 0.001225  min_lr: 0.001225  loss: 2.7889 (2.9979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9446 (0.9210)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [195]  [ 800/1251]  eta: 0:01:34  lr: 0.001222  min_lr: 0.001222  loss: 2.6517 (3.0053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7815 (0.9124)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [195]  [1000/1251]  eta: 0:00:52  lr: 0.001219  min_lr: 0.001219  loss: 2.8448 (3.0019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.9058)  time: 0.2135  data: 0.0005  max mem: 13273
Epoch: [195]  [1200/1251]  eta: 0:00:10  lr: 0.001215  min_lr: 0.001215  loss: 3.2026 (3.0104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9309 (0.9157)  time: 0.2125  data: 0.0005  max mem: 13273
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 2.9018 (3.0088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (0.9242)  time: 0.1698  data: 0.0006  max mem: 13273
Epoch: [195] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 2.9018 (3.0070)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (0.9242)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7578 (0.7578)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.5931  data: 5.4876  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8885 (0.8997)  acc1: 82.4000 (82.3273)  acc5: 96.8000 (96.6182)  time: 0.7754  data: 0.6772  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0742 (1.0663)  acc1: 76.4000 (78.5524)  acc5: 94.4000 (94.5905)  time: 0.2146  data: 0.1197  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1723 (1.0747)  acc1: 75.6000 (78.2240)  acc5: 94.0000 (94.4640)  time: 0.2174  data: 0.1230  max mem: 13273
Test: Total time: 0:00:10 (0.4166 s / it)
* Acc@1 78.232 Acc@5 94.518 loss 1.072
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.24%
Epoch: [196]  [   0/1251]  eta: 1:05:32  lr: 0.001215  min_lr: 0.001215  loss: 3.2801 (3.2801)  weight_decay: 0.0500 (0.0500)  time: 3.1434  data: 2.4968  max mem: 13273
Epoch: [196]  [ 200/1251]  eta: 0:03:53  lr: 0.001211  min_lr: 0.001211  loss: 3.0381 (3.0085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9021 (0.9419)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [196]  [ 400/1251]  eta: 0:03:02  lr: 0.001208  min_lr: 0.001208  loss: 3.0472 (3.0205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8462 (0.9117)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [196]  [ 600/1251]  eta: 0:02:17  lr: 0.001205  min_lr: 0.001205  loss: 2.8583 (3.0241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0078 (0.9245)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [196]  [ 800/1251]  eta: 0:01:34  lr: 0.001201  min_lr: 0.001201  loss: 2.5088 (3.0233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.9209)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [196]  [1000/1251]  eta: 0:00:52  lr: 0.001198  min_lr: 0.001198  loss: 2.9365 (2.9980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8819 (0.9082)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [196]  [1200/1251]  eta: 0:00:10  lr: 0.001195  min_lr: 0.001195  loss: 2.3042 (3.0002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8649 (0.9043)  time: 0.2067  data: 0.0006  max mem: 13273
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 3.0421 (3.0004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8212 (0.9015)  time: 0.1712  data: 0.0007  max mem: 13273
Epoch: [196] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 3.0421 (3.0072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8212 (0.9015)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6491 (0.6491)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.6567  data: 5.5493  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7966 (0.8354)  acc1: 82.8000 (81.9273)  acc5: 96.4000 (96.6909)  time: 0.7345  data: 0.6355  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1125 (1.0136)  acc1: 76.0000 (77.8095)  acc5: 94.0000 (94.3048)  time: 0.1956  data: 0.1014  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0988 (1.0237)  acc1: 74.4000 (77.3600)  acc5: 93.2000 (94.1760)  time: 0.1977  data: 0.1052  max mem: 13273
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 78.200 Acc@5 94.470 loss 1.011
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.24%
Epoch: [197]  [   0/1251]  eta: 1:06:24  lr: 0.001194  min_lr: 0.001194  loss: 2.0528 (2.0528)  weight_decay: 0.0500 (0.0500)  time: 3.1852  data: 1.6912  max mem: 13273
Epoch: [197]  [ 200/1251]  eta: 0:03:52  lr: 0.001191  min_lr: 0.001191  loss: 3.0956 (3.0506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8979 (0.9260)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [197]  [ 400/1251]  eta: 0:03:01  lr: 0.001187  min_lr: 0.001187  loss: 3.0810 (3.0490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8160 (0.9177)  time: 0.2064  data: 0.0006  max mem: 13273
Epoch: [197]  [ 600/1251]  eta: 0:02:17  lr: 0.001184  min_lr: 0.001184  loss: 3.2465 (3.0361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9203 (0.9295)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [197]  [ 800/1251]  eta: 0:01:34  lr: 0.001181  min_lr: 0.001181  loss: 2.8533 (3.0177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (0.9200)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [197]  [1000/1251]  eta: 0:00:52  lr: 0.001178  min_lr: 0.001178  loss: 2.7782 (3.0178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.9188)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [197]  [1200/1251]  eta: 0:00:10  lr: 0.001174  min_lr: 0.001174  loss: 2.9722 (3.0248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.9048)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 2.3319 (3.0230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.9003)  time: 0.1704  data: 0.0008  max mem: 13273
Epoch: [197] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 2.3319 (2.9934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.9003)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6765 (0.6765)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.4913  data: 5.3861  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7941 (0.8223)  acc1: 84.0000 (82.6909)  acc5: 96.8000 (96.4727)  time: 0.7244  data: 0.6280  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0305 (1.0017)  acc1: 76.8000 (78.4571)  acc5: 93.2000 (94.2857)  time: 0.2024  data: 0.1094  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1167 (1.0127)  acc1: 76.8000 (78.0160)  acc5: 92.8000 (94.0960)  time: 0.2028  data: 0.1107  max mem: 13273
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 78.362 Acc@5 94.456 loss 0.998
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.36%
Epoch: [198]  [   0/1251]  eta: 0:59:46  lr: 0.001174  min_lr: 0.001174  loss: 2.0316 (2.0316)  weight_decay: 0.0500 (0.0500)  time: 2.8668  data: 2.6400  max mem: 13273
Epoch: [198]  [ 200/1251]  eta: 0:03:50  lr: 0.001170  min_lr: 0.001170  loss: 3.0862 (2.9294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8902 (0.9383)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [198]  [ 400/1251]  eta: 0:03:01  lr: 0.001167  min_lr: 0.001167  loss: 2.6019 (2.9464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9104 (0.9431)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [198]  [ 600/1251]  eta: 0:02:16  lr: 0.001164  min_lr: 0.001164  loss: 2.3232 (2.9697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9070 (0.9237)  time: 0.2065  data: 0.0003  max mem: 13273
Epoch: [198]  [ 800/1251]  eta: 0:01:34  lr: 0.001161  min_lr: 0.001161  loss: 2.8219 (2.9724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7990 (0.9040)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [198]  [1000/1251]  eta: 0:00:52  lr: 0.001157  min_lr: 0.001157  loss: 3.0220 (2.9854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.9056)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [198]  [1200/1251]  eta: 0:00:10  lr: 0.001154  min_lr: 0.001154  loss: 2.8844 (2.9880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8494 (0.9011)  time: 0.2071  data: 0.0006  max mem: 13273
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 2.6110 (2.9861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9363 (0.9036)  time: 0.1705  data: 0.0005  max mem: 13273
Epoch: [198] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 2.6110 (2.9977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9363 (0.9036)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6539 (0.6539)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.7982  data: 5.6918  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8096 (0.8284)  acc1: 83.2000 (82.8000)  acc5: 96.4000 (96.9091)  time: 0.7588  data: 0.6620  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0641 (1.0009)  acc1: 77.2000 (79.0667)  acc5: 94.0000 (94.4191)  time: 0.1968  data: 0.1038  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1094 (1.0166)  acc1: 75.2000 (78.5120)  acc5: 93.2000 (94.4160)  time: 0.1967  data: 0.1046  max mem: 13273
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 78.664 Acc@5 94.612 loss 1.008
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.66%
Epoch: [199]  [   0/1251]  eta: 1:03:02  lr: 0.001153  min_lr: 0.001153  loss: 3.3555 (3.3555)  weight_decay: 0.0500 (0.0500)  time: 3.0237  data: 2.7598  max mem: 13273
Epoch: [199]  [ 200/1251]  eta: 0:03:53  lr: 0.001150  min_lr: 0.001150  loss: 2.2608 (2.9360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8303 (0.8735)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [199]  [ 400/1251]  eta: 0:03:03  lr: 0.001147  min_lr: 0.001147  loss: 2.8515 (2.9888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8106 (0.8807)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [199]  [ 600/1251]  eta: 0:02:18  lr: 0.001143  min_lr: 0.001143  loss: 2.3829 (2.9971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.8831)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [199]  [ 800/1251]  eta: 0:01:35  lr: 0.001140  min_lr: 0.001140  loss: 2.6692 (2.9941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9778 (0.8789)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [199]  [1000/1251]  eta: 0:00:52  lr: 0.001137  min_lr: 0.001137  loss: 3.2290 (3.0027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (0.8812)  time: 0.2097  data: 0.0005  max mem: 13273
Epoch: [199]  [1200/1251]  eta: 0:00:10  lr: 0.001134  min_lr: 0.001134  loss: 2.5851 (3.0060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7296 (0.8795)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 3.6743 (3.0062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8344 (0.8767)  time: 0.1699  data: 0.0011  max mem: 13273
Epoch: [199] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 3.6743 (2.9895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8344 (0.8767)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7469 (0.7469)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.4203  data: 5.3072  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8876 (0.9113)  acc1: 82.8000 (82.2545)  acc5: 96.8000 (96.7273)  time: 0.7375  data: 0.6418  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1116 (1.0923)  acc1: 76.0000 (78.0762)  acc5: 94.8000 (94.7238)  time: 0.2179  data: 0.1245  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1254 (1.1008)  acc1: 76.0000 (77.8080)  acc5: 93.6000 (94.6240)  time: 0.2187  data: 0.1253  max mem: 13273
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 78.428 Acc@5 94.458 loss 1.091
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.66%
Epoch: [200]  [   0/1251]  eta: 1:05:22  lr: 0.001133  min_lr: 0.001133  loss: 2.2212 (2.2212)  weight_decay: 0.0500 (0.0500)  time: 3.1358  data: 2.3494  max mem: 13273
Epoch: [200]  [ 200/1251]  eta: 0:03:53  lr: 0.001130  min_lr: 0.001130  loss: 2.7988 (2.9716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8781 (0.8920)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [200]  [ 400/1251]  eta: 0:03:01  lr: 0.001126  min_lr: 0.001126  loss: 2.8222 (2.9597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9270 (0.9273)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [200]  [ 600/1251]  eta: 0:02:17  lr: 0.001123  min_lr: 0.001123  loss: 2.4998 (2.9677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9017 (0.9385)  time: 0.2064  data: 0.0006  max mem: 13273
Epoch: [200]  [ 800/1251]  eta: 0:01:34  lr: 0.001120  min_lr: 0.001120  loss: 3.0593 (2.9838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9395 (0.9412)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [200]  [1000/1251]  eta: 0:00:52  lr: 0.001117  min_lr: 0.001117  loss: 2.4796 (2.9617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8943 (0.9344)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [200]  [1200/1251]  eta: 0:00:10  lr: 0.001114  min_lr: 0.001114  loss: 2.3546 (2.9632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9028 (0.9268)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.5235 (2.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8459 (0.9251)  time: 0.1701  data: 0.0006  max mem: 13273
Epoch: [200] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.5235 (2.9810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8459 (0.9251)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6149 (0.6149)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.9897  data: 5.8844  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7814 (0.8007)  acc1: 82.8000 (82.3636)  acc5: 96.8000 (96.6909)  time: 0.7544  data: 0.6605  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0254 (0.9734)  acc1: 77.6000 (78.9524)  acc5: 94.4000 (94.5524)  time: 0.1904  data: 0.0982  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0348 (0.9829)  acc1: 77.6000 (78.4960)  acc5: 93.6000 (94.5280)  time: 0.1928  data: 0.1012  max mem: 13273
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 78.546 Acc@5 94.566 loss 0.976
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.66%
Epoch: [201]  [   0/1251]  eta: 1:05:43  lr: 0.001113  min_lr: 0.001113  loss: 2.4792 (2.4792)  weight_decay: 0.0500 (0.0500)  time: 3.1524  data: 2.3157  max mem: 13273
Epoch: [201]  [ 200/1251]  eta: 0:03:53  lr: 0.001110  min_lr: 0.001110  loss: 3.2984 (3.0841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (0.9361)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [201]  [ 400/1251]  eta: 0:03:02  lr: 0.001106  min_lr: 0.001106  loss: 2.6415 (3.0120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0193 (0.9404)  time: 0.2040  data: 0.0006  max mem: 13273
Epoch: [201]  [ 600/1251]  eta: 0:02:17  lr: 0.001103  min_lr: 0.001103  loss: 2.6927 (2.9833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8570 (0.9302)  time: 0.2037  data: 0.0005  max mem: 13273
Epoch: [201]  [ 800/1251]  eta: 0:01:34  lr: 0.001100  min_lr: 0.001100  loss: 2.4839 (2.9811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9146 (0.9294)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [201]  [1000/1251]  eta: 0:00:52  lr: 0.001097  min_lr: 0.001097  loss: 2.8056 (2.9727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8787 (0.9362)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [201]  [1200/1251]  eta: 0:00:10  lr: 0.001094  min_lr: 0.001094  loss: 2.6839 (2.9658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9152 (0.9376)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 2.4055 (2.9660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9226 (0.9381)  time: 0.1707  data: 0.0006  max mem: 13273
Epoch: [201] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 2.4055 (2.9914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9226 (0.9381)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6937 (0.6937)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 5.8885  data: 5.7832  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8586 (0.8504)  acc1: 81.2000 (82.3273)  acc5: 96.4000 (96.5455)  time: 0.7506  data: 0.6567  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0776 (1.0250)  acc1: 77.6000 (78.8191)  acc5: 94.4000 (94.3048)  time: 0.2026  data: 0.1110  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0997 (1.0331)  acc1: 76.8000 (78.4160)  acc5: 94.0000 (94.2080)  time: 0.2050  data: 0.1141  max mem: 13273
Test: Total time: 0:00:10 (0.4174 s / it)
* Acc@1 78.668 Acc@5 94.518 loss 1.025
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.67%
Epoch: [202]  [   0/1251]  eta: 0:55:57  lr: 0.001093  min_lr: 0.001093  loss: 4.0191 (4.0191)  weight_decay: 0.0500 (0.0500)  time: 2.6842  data: 2.3501  max mem: 13273
Epoch: [202]  [ 200/1251]  eta: 0:03:51  lr: 0.001090  min_lr: 0.001090  loss: 2.3465 (3.1932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8297 (0.9107)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [202]  [ 400/1251]  eta: 0:03:01  lr: 0.001086  min_lr: 0.001086  loss: 3.0321 (3.0791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8853 (0.9116)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [202]  [ 600/1251]  eta: 0:02:17  lr: 0.001083  min_lr: 0.001083  loss: 2.7057 (3.0261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8905 (0.9286)  time: 0.2160  data: 0.0005  max mem: 13273
Epoch: [202]  [ 800/1251]  eta: 0:01:34  lr: 0.001080  min_lr: 0.001080  loss: 2.9038 (3.0236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9261 (0.9309)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [202]  [1000/1251]  eta: 0:00:52  lr: 0.001077  min_lr: 0.001077  loss: 2.6904 (3.0083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8611 (0.9306)  time: 0.2041  data: 0.0006  max mem: 13273
Epoch: [202]  [1200/1251]  eta: 0:00:10  lr: 0.001074  min_lr: 0.001074  loss: 2.6061 (2.9963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9439 (0.9272)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 3.2980 (2.9942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9634 (0.9287)  time: 0.1700  data: 0.0005  max mem: 13273
Epoch: [202] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 3.2980 (2.9816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9634 (0.9287)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6708 (0.6708)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.9379  data: 5.8086  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8858 (0.8709)  acc1: 84.0000 (83.0182)  acc5: 96.8000 (96.8364)  time: 0.7735  data: 0.6721  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1075 (1.0298)  acc1: 76.4000 (78.9143)  acc5: 94.4000 (94.6286)  time: 0.2077  data: 0.1131  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0413 (1.0381)  acc1: 76.4000 (78.5920)  acc5: 94.0000 (94.5760)  time: 0.2058  data: 0.1131  max mem: 13273
Test: Total time: 0:00:10 (0.4213 s / it)
* Acc@1 78.618 Acc@5 94.604 loss 1.037
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.67%
Epoch: [203]  [   0/1251]  eta: 1:03:27  lr: 0.001073  min_lr: 0.001073  loss: 3.6160 (3.6160)  weight_decay: 0.0500 (0.0500)  time: 3.0435  data: 2.7451  max mem: 13273
Epoch: [203]  [ 200/1251]  eta: 0:03:53  lr: 0.001070  min_lr: 0.001070  loss: 3.3300 (2.9707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (0.8971)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [203]  [ 400/1251]  eta: 0:03:02  lr: 0.001066  min_lr: 0.001066  loss: 3.1626 (3.0181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.9011)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [203]  [ 600/1251]  eta: 0:02:17  lr: 0.001063  min_lr: 0.001063  loss: 3.1765 (3.0139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0402 (0.9247)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [203]  [ 800/1251]  eta: 0:01:34  lr: 0.001060  min_lr: 0.001060  loss: 2.9521 (3.0091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8375 (nan)  time: 0.2093  data: 0.0004  max mem: 13273
Epoch: [203]  [1000/1251]  eta: 0:00:52  lr: 0.001057  min_lr: 0.001057  loss: 3.4307 (3.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9342 (nan)  time: 0.2044  data: 0.0006  max mem: 13273
Epoch: [203]  [1200/1251]  eta: 0:00:10  lr: 0.001054  min_lr: 0.001054  loss: 3.1512 (2.9984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8976 (nan)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 2.3969 (2.9945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8631 (nan)  time: 0.1708  data: 0.0006  max mem: 13273
Epoch: [203] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 2.3969 (2.9780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8631 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6932 (0.6932)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.5668  data: 5.4614  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8037 (0.8388)  acc1: 84.0000 (82.0727)  acc5: 96.8000 (96.5818)  time: 0.7445  data: 0.6513  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0549 (1.0063)  acc1: 75.2000 (78.4000)  acc5: 93.6000 (94.3429)  time: 0.2173  data: 0.1261  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0449 (1.0122)  acc1: 75.2000 (78.3040)  acc5: 93.2000 (94.3040)  time: 0.2169  data: 0.1260  max mem: 13273
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 78.624 Acc@5 94.500 loss 1.003
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.67%
Epoch: [204]  [   0/1251]  eta: 1:00:13  lr: 0.001053  min_lr: 0.001053  loss: 2.1192 (2.1192)  weight_decay: 0.0500 (0.0500)  time: 2.8883  data: 2.1763  max mem: 13273
Epoch: [204]  [ 200/1251]  eta: 0:03:53  lr: 0.001050  min_lr: 0.001050  loss: 2.4962 (2.8812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (0.8764)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [204]  [ 400/1251]  eta: 0:03:02  lr: 0.001047  min_lr: 0.001047  loss: 2.8907 (2.9230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8716 (0.9044)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [204]  [ 600/1251]  eta: 0:02:17  lr: 0.001044  min_lr: 0.001044  loss: 3.1925 (2.9373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (0.9140)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [204]  [ 800/1251]  eta: 0:01:34  lr: 0.001040  min_lr: 0.001040  loss: 2.3531 (2.9128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9366 (0.9244)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [204]  [1000/1251]  eta: 0:00:52  lr: 0.001037  min_lr: 0.001037  loss: 2.7251 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9679 (0.9471)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [204]  [1200/1251]  eta: 0:00:10  lr: 0.001034  min_lr: 0.001034  loss: 2.8429 (2.9458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8296 (0.9340)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 2.2849 (2.9406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (0.9332)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [204] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 2.2849 (2.9710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (0.9332)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6654 (0.6654)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.8997  data: 5.7939  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7948 (0.8369)  acc1: 84.4000 (82.6182)  acc5: 97.2000 (96.6546)  time: 0.7590  data: 0.6645  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0758 (1.0131)  acc1: 75.2000 (78.9714)  acc5: 94.0000 (94.6095)  time: 0.1994  data: 0.1073  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0882 (1.0234)  acc1: 76.8000 (78.6880)  acc5: 92.8000 (94.4960)  time: 0.1985  data: 0.1072  max mem: 13273
Test: Total time: 0:00:10 (0.4136 s / it)
* Acc@1 78.680 Acc@5 94.686 loss 1.012
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.68%
Epoch: [205]  [   0/1251]  eta: 1:06:18  lr: 0.001033  min_lr: 0.001033  loss: 3.5607 (3.5607)  weight_decay: 0.0500 (0.0500)  time: 3.1803  data: 2.9537  max mem: 13273
Epoch: [205]  [ 200/1251]  eta: 0:03:51  lr: 0.001030  min_lr: 0.001030  loss: 2.4166 (2.9029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9100 (0.9598)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [205]  [ 400/1251]  eta: 0:03:01  lr: 0.001027  min_lr: 0.001027  loss: 2.9435 (2.9371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8793 (0.9502)  time: 0.2074  data: 0.0005  max mem: 13273
Epoch: [205]  [ 600/1251]  eta: 0:02:17  lr: 0.001024  min_lr: 0.001024  loss: 2.8316 (2.9444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9493 (0.9350)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [205]  [ 800/1251]  eta: 0:01:34  lr: 0.001021  min_lr: 0.001021  loss: 2.4322 (2.9410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0024 (0.9395)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [205]  [1000/1251]  eta: 0:00:52  lr: 0.001018  min_lr: 0.001018  loss: 2.8942 (2.9551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8506 (0.9497)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [205]  [1200/1251]  eta: 0:00:10  lr: 0.001014  min_lr: 0.001014  loss: 2.8817 (2.9590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8789 (0.9485)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 2.3362 (2.9579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (0.9512)  time: 0.1705  data: 0.0006  max mem: 13273
Epoch: [205] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 2.3362 (2.9762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (0.9512)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6788 (0.6788)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.6515  data: 5.5458  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8024 (0.8237)  acc1: 82.8000 (82.5091)  acc5: 97.2000 (96.9091)  time: 0.7746  data: 0.6800  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0299 (1.0082)  acc1: 77.2000 (78.5714)  acc5: 93.6000 (94.7238)  time: 0.2108  data: 0.1186  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0561 (1.0163)  acc1: 76.8000 (78.3520)  acc5: 93.6000 (94.6400)  time: 0.2100  data: 0.1185  max mem: 13273
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 78.868 Acc@5 94.686 loss 1.003
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.87%
Epoch: [206]  [   0/1251]  eta: 1:02:31  lr: 0.001014  min_lr: 0.001014  loss: 2.0135 (2.0135)  weight_decay: 0.0500 (0.0500)  time: 2.9984  data: 2.7703  max mem: 13273
Epoch: [206]  [ 200/1251]  eta: 0:03:53  lr: 0.001011  min_lr: 0.001011  loss: 2.3252 (2.9274)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8049 (0.8893)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [206]  [ 400/1251]  eta: 0:03:02  lr: 0.001007  min_lr: 0.001007  loss: 2.3107 (2.9724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9305 (0.9161)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [206]  [ 600/1251]  eta: 0:02:17  lr: 0.001004  min_lr: 0.001004  loss: 2.5735 (2.9691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8896 (0.9297)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [206]  [ 800/1251]  eta: 0:01:34  lr: 0.001001  min_lr: 0.001001  loss: 2.9135 (2.9784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8691 (0.9317)  time: 0.2037  data: 0.0004  max mem: 13273
Epoch: [206]  [1000/1251]  eta: 0:00:52  lr: 0.000998  min_lr: 0.000998  loss: 2.8554 (2.9840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9043 (0.9308)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [206]  [1200/1251]  eta: 0:00:10  lr: 0.000995  min_lr: 0.000995  loss: 2.6743 (2.9765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8866 (0.9344)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 2.2626 (2.9752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (0.9342)  time: 0.1702  data: 0.0010  max mem: 13273
Epoch: [206] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 2.2626 (2.9710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (0.9342)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6993 (0.6993)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.9074  data: 5.8008  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8346 (0.8487)  acc1: 83.2000 (82.6909)  acc5: 96.8000 (96.5818)  time: 0.7108  data: 0.6123  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0751 (1.0282)  acc1: 76.8000 (78.7429)  acc5: 93.6000 (94.5524)  time: 0.1787  data: 0.0807  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1079 (1.0407)  acc1: 76.4000 (78.4320)  acc5: 92.8000 (94.4640)  time: 0.1863  data: 0.0897  max mem: 13273
Test: Total time: 0:00:10 (0.4066 s / it)
* Acc@1 78.718 Acc@5 94.692 loss 1.025
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.87%
Epoch: [207]  [   0/1251]  eta: 1:12:12  lr: 0.000994  min_lr: 0.000994  loss: 2.6515 (2.6515)  weight_decay: 0.0500 (0.0500)  time: 3.4631  data: 1.7776  max mem: 13273
Epoch: [207]  [ 200/1251]  eta: 0:03:52  lr: 0.000991  min_lr: 0.000991  loss: 2.4518 (2.9199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8604 (0.9134)  time: 0.2041  data: 0.0006  max mem: 13273
Epoch: [207]  [ 400/1251]  eta: 0:03:01  lr: 0.000988  min_lr: 0.000988  loss: 3.0305 (2.9480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (0.9414)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [207]  [ 600/1251]  eta: 0:02:17  lr: 0.000985  min_lr: 0.000985  loss: 3.0547 (2.9691)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0509 (0.9614)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [207]  [ 800/1251]  eta: 0:01:34  lr: 0.000982  min_lr: 0.000982  loss: 2.2924 (2.9361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9001 (0.9628)  time: 0.2067  data: 0.0006  max mem: 13273
Epoch: [207]  [1000/1251]  eta: 0:00:52  lr: 0.000979  min_lr: 0.000979  loss: 2.6619 (2.9287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9174 (0.9585)  time: 0.2073  data: 0.0007  max mem: 13273
Epoch: [207]  [1200/1251]  eta: 0:00:10  lr: 0.000976  min_lr: 0.000976  loss: 3.3580 (2.9408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9388 (0.9585)  time: 0.2071  data: 0.0006  max mem: 13273
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 3.0906 (2.9409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9268 (0.9589)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [207] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 3.0906 (2.9470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9268 (0.9589)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6834 (0.6834)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.2994  data: 5.1921  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.8176 (0.8414)  acc1: 83.2000 (82.1818)  acc5: 97.2000 (96.7273)  time: 0.6628  data: 0.5639  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0593 (1.0281)  acc1: 76.0000 (78.3619)  acc5: 94.0000 (94.4952)  time: 0.1900  data: 0.0937  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1472 (1.0402)  acc1: 75.6000 (77.9360)  acc5: 93.6000 (94.3840)  time: 0.2115  data: 0.1162  max mem: 13273
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 78.522 Acc@5 94.648 loss 1.025
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.87%
Epoch: [208]  [   0/1251]  eta: 1:05:54  lr: 0.000975  min_lr: 0.000975  loss: 2.1220 (2.1220)  weight_decay: 0.0500 (0.0500)  time: 3.1611  data: 2.8113  max mem: 13273
Epoch: [208]  [ 200/1251]  eta: 0:03:53  lr: 0.000972  min_lr: 0.000972  loss: 2.5697 (3.0032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.9275)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [208]  [ 400/1251]  eta: 0:03:02  lr: 0.000969  min_lr: 0.000969  loss: 2.3528 (3.0073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9237 (0.9274)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [208]  [ 600/1251]  eta: 0:02:17  lr: 0.000966  min_lr: 0.000966  loss: 2.7763 (2.9882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9031 (0.9147)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [208]  [ 800/1251]  eta: 0:01:34  lr: 0.000963  min_lr: 0.000963  loss: 2.5572 (2.9687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9245 (0.9257)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [208]  [1000/1251]  eta: 0:00:52  lr: 0.000960  min_lr: 0.000960  loss: 2.5845 (2.9744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9343 (0.9342)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [208]  [1200/1251]  eta: 0:00:10  lr: 0.000956  min_lr: 0.000956  loss: 2.4710 (2.9596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8716 (0.9330)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 3.1123 (2.9615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8520 (0.9318)  time: 0.1711  data: 0.0007  max mem: 13273
Epoch: [208] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 3.1123 (2.9525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8520 (0.9318)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.7147 (0.7147)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.0019  data: 4.8924  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.8506 (0.8703)  acc1: 83.2000 (82.5455)  acc5: 96.4000 (96.5091)  time: 0.6076  data: 0.5087  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0630 (1.0572)  acc1: 77.2000 (78.5333)  acc5: 94.0000 (94.6095)  time: 0.1739  data: 0.0797  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1483 (1.0651)  acc1: 74.4000 (78.1600)  acc5: 93.6000 (94.5760)  time: 0.2110  data: 0.1180  max mem: 13273
Test: Total time: 0:00:09 (0.3915 s / it)
* Acc@1 78.570 Acc@5 94.654 loss 1.050
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.87%
Epoch: [209]  [   0/1251]  eta: 1:06:12  lr: 0.000956  min_lr: 0.000956  loss: 4.0330 (4.0330)  weight_decay: 0.0500 (0.0500)  time: 3.1753  data: 2.7940  max mem: 13273
Epoch: [209]  [ 200/1251]  eta: 0:03:53  lr: 0.000953  min_lr: 0.000953  loss: 3.4642 (2.9146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9446 (1.0266)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [209]  [ 400/1251]  eta: 0:03:01  lr: 0.000950  min_lr: 0.000950  loss: 2.7018 (2.9100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9820)  time: 0.2040  data: 0.0005  max mem: 13273
Epoch: [209]  [ 600/1251]  eta: 0:02:17  lr: 0.000947  min_lr: 0.000947  loss: 2.4851 (2.9498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9478 (0.9833)  time: 0.2062  data: 0.0006  max mem: 13273
Epoch: [209]  [ 800/1251]  eta: 0:01:34  lr: 0.000944  min_lr: 0.000944  loss: 2.7219 (2.9468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9191 (0.9856)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [209]  [1000/1251]  eta: 0:00:52  lr: 0.000940  min_lr: 0.000940  loss: 2.4929 (2.9545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0219 (0.9823)  time: 0.2076  data: 0.0005  max mem: 13273
Epoch: [209]  [1200/1251]  eta: 0:00:10  lr: 0.000937  min_lr: 0.000937  loss: 2.9346 (2.9529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9958 (0.9834)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 2.3930 (2.9508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (0.9800)  time: 0.1717  data: 0.0008  max mem: 13273
Epoch: [209] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 2.3930 (2.9571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9087 (0.9800)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6338 (0.6338)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.6430  data: 5.5375  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8013 (0.8005)  acc1: 83.6000 (83.2727)  acc5: 96.8000 (96.5818)  time: 0.6815  data: 0.5850  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0466 (0.9720)  acc1: 77.2000 (79.0857)  acc5: 93.6000 (94.6095)  time: 0.1906  data: 0.0976  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0362 (0.9806)  acc1: 77.6000 (78.8160)  acc5: 93.6000 (94.5600)  time: 0.2119  data: 0.1193  max mem: 13273
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 78.812 Acc@5 94.662 loss 0.978
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.87%
Epoch: [210]  [   0/1251]  eta: 1:05:34  lr: 0.000937  min_lr: 0.000937  loss: 3.8403 (3.8403)  weight_decay: 0.0500 (0.0500)  time: 3.1454  data: 1.8166  max mem: 13273
Epoch: [210]  [ 200/1251]  eta: 0:03:52  lr: 0.000934  min_lr: 0.000934  loss: 2.3500 (2.8316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8126 (0.9381)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [210]  [ 400/1251]  eta: 0:03:01  lr: 0.000931  min_lr: 0.000931  loss: 3.3369 (2.8888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0190 (0.9626)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [210]  [ 600/1251]  eta: 0:02:17  lr: 0.000928  min_lr: 0.000928  loss: 3.5667 (2.9123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9616 (0.9567)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [210]  [ 800/1251]  eta: 0:01:34  lr: 0.000925  min_lr: 0.000925  loss: 2.9227 (2.9183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8994 (0.9422)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [210]  [1000/1251]  eta: 0:00:52  lr: 0.000922  min_lr: 0.000922  loss: 3.1355 (2.9296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9013 (0.9370)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [210]  [1200/1251]  eta: 0:00:10  lr: 0.000918  min_lr: 0.000918  loss: 2.9870 (2.9384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8621 (0.9303)  time: 0.2092  data: 0.0006  max mem: 13273
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 2.7009 (2.9472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8401 (0.9279)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [210] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 2.7009 (2.9603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8401 (0.9279)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6821 (0.6821)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6090  data: 5.5020  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8046 (0.8246)  acc1: 84.0000 (82.9818)  acc5: 96.4000 (96.6909)  time: 0.7538  data: 0.6578  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0443 (1.0126)  acc1: 76.4000 (78.8762)  acc5: 94.4000 (94.6476)  time: 0.2014  data: 0.1078  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1021 (1.0214)  acc1: 76.4000 (78.6240)  acc5: 93.2000 (94.5120)  time: 0.2011  data: 0.1078  max mem: 13273
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 78.842 Acc@5 94.726 loss 1.011
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.87%
Epoch: [211]  [   0/1251]  eta: 1:04:42  lr: 0.000918  min_lr: 0.000918  loss: 1.8308 (1.8308)  weight_decay: 0.0500 (0.0500)  time: 3.1033  data: 1.7207  max mem: 13273
Epoch: [211]  [ 200/1251]  eta: 0:03:54  lr: 0.000915  min_lr: 0.000915  loss: 2.7884 (2.8683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9394 (0.9224)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [211]  [ 400/1251]  eta: 0:03:02  lr: 0.000912  min_lr: 0.000912  loss: 3.0321 (2.9074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (0.9539)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [211]  [ 600/1251]  eta: 0:02:17  lr: 0.000909  min_lr: 0.000909  loss: 3.3211 (2.9560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9626 (0.9604)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [211]  [ 800/1251]  eta: 0:01:34  lr: 0.000906  min_lr: 0.000906  loss: 2.3002 (2.9285)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0540 (0.9621)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [211]  [1000/1251]  eta: 0:00:52  lr: 0.000903  min_lr: 0.000903  loss: 3.4594 (2.9347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (0.9539)  time: 0.2078  data: 0.0004  max mem: 13273
Epoch: [211]  [1200/1251]  eta: 0:00:10  lr: 0.000900  min_lr: 0.000900  loss: 2.8699 (2.9500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9132 (0.9611)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 3.2011 (2.9604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0385 (0.9660)  time: 0.1699  data: 0.0007  max mem: 13273
Epoch: [211] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 3.2011 (2.9415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0385 (0.9660)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8350 (0.8350)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 5.7426  data: 5.6340  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9039 (0.9477)  acc1: 84.0000 (82.3636)  acc5: 96.8000 (96.8000)  time: 0.7682  data: 0.6687  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1695 (1.1086)  acc1: 76.4000 (78.8571)  acc5: 94.4000 (94.6095)  time: 0.2098  data: 0.1142  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1922 (1.1153)  acc1: 76.4000 (78.5120)  acc5: 94.0000 (94.5120)  time: 0.2082  data: 0.1141  max mem: 13273
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 78.686 Acc@5 94.600 loss 1.105
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.87%
Epoch: [212]  [   0/1251]  eta: 1:06:05  lr: 0.000899  min_lr: 0.000899  loss: 3.4544 (3.4544)  weight_decay: 0.0500 (0.0500)  time: 3.1696  data: 2.8937  max mem: 13273
Epoch: [212]  [ 200/1251]  eta: 0:03:52  lr: 0.000896  min_lr: 0.000896  loss: 2.3143 (2.8301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9425 (0.9579)  time: 0.2057  data: 0.0006  max mem: 13273
Epoch: [212]  [ 400/1251]  eta: 0:03:01  lr: 0.000893  min_lr: 0.000893  loss: 2.5119 (2.8795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9943 (0.9618)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [212]  [ 600/1251]  eta: 0:02:17  lr: 0.000890  min_lr: 0.000890  loss: 2.4586 (2.8881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9864 (0.9856)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [212]  [ 800/1251]  eta: 0:01:34  lr: 0.000887  min_lr: 0.000887  loss: 3.0782 (2.8916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8715 (0.9677)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [212]  [1000/1251]  eta: 0:00:52  lr: 0.000884  min_lr: 0.000884  loss: 2.8037 (2.9032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9399 (0.9642)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [212]  [1200/1251]  eta: 0:00:10  lr: 0.000881  min_lr: 0.000881  loss: 2.2822 (2.9026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9348 (0.9632)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 3.3077 (2.9059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9486 (0.9645)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [212] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 3.3077 (2.9418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9486 (0.9645)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.7325 (0.7325)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.0983  data: 4.9906  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8907 (0.8876)  acc1: 84.4000 (83.0909)  acc5: 96.8000 (96.6182)  time: 0.6927  data: 0.5986  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0795 (1.0622)  acc1: 77.2000 (78.8191)  acc5: 94.0000 (94.6476)  time: 0.2089  data: 0.1159  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1360 (1.0648)  acc1: 76.4000 (78.6080)  acc5: 94.0000 (94.6400)  time: 0.2108  data: 0.1179  max mem: 13273
Test: Total time: 0:00:10 (0.4026 s / it)
* Acc@1 78.626 Acc@5 94.682 loss 1.052
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.87%
Epoch: [213]  [   0/1251]  eta: 1:04:18  lr: 0.000880  min_lr: 0.000880  loss: 1.8848 (1.8848)  weight_decay: 0.0500 (0.0500)  time: 3.0843  data: 2.5849  max mem: 13273
Epoch: [213]  [ 200/1251]  eta: 0:03:52  lr: 0.000877  min_lr: 0.000877  loss: 2.2887 (2.9326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9351 (0.9721)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [213]  [ 400/1251]  eta: 0:03:01  lr: 0.000874  min_lr: 0.000874  loss: 2.9747 (2.9875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8941 (0.9537)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [213]  [ 600/1251]  eta: 0:02:17  lr: 0.000871  min_lr: 0.000871  loss: 2.3246 (2.9681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9422 (0.9587)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [213]  [ 800/1251]  eta: 0:01:34  lr: 0.000868  min_lr: 0.000868  loss: 2.6156 (2.9535)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0004 (0.9781)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [213]  [1000/1251]  eta: 0:00:52  lr: 0.000865  min_lr: 0.000865  loss: 2.4304 (2.9427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9697 (0.9766)  time: 0.2038  data: 0.0005  max mem: 13273
Epoch: [213]  [1200/1251]  eta: 0:00:10  lr: 0.000863  min_lr: 0.000863  loss: 2.7600 (2.9462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8957 (0.9740)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 2.5368 (2.9476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9268 (0.9737)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [213] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 2.5368 (2.9406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9268 (0.9737)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6768 (0.6768)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.6049  data: 5.4987  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8112 (0.8233)  acc1: 84.8000 (82.5818)  acc5: 96.8000 (96.8364)  time: 0.7721  data: 0.6739  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9936 (0.9966)  acc1: 76.0000 (78.6667)  acc5: 94.0000 (94.6667)  time: 0.2282  data: 0.1334  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0811 (1.0058)  acc1: 76.0000 (78.4960)  acc5: 93.6000 (94.6400)  time: 0.2276  data: 0.1334  max mem: 13273
Test: Total time: 0:00:10 (0.4245 s / it)
* Acc@1 79.156 Acc@5 94.786 loss 0.997
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.16%
Epoch: [214]  [   0/1251]  eta: 1:02:55  lr: 0.000862  min_lr: 0.000862  loss: 3.7910 (3.7910)  weight_decay: 0.0500 (0.0500)  time: 3.0180  data: 2.7486  max mem: 13273
Epoch: [214]  [ 200/1251]  eta: 0:03:50  lr: 0.000859  min_lr: 0.000859  loss: 2.7991 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9024 (1.0404)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [214]  [ 400/1251]  eta: 0:03:01  lr: 0.000856  min_lr: 0.000856  loss: 3.2376 (2.9319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8955 (1.0152)  time: 0.2086  data: 0.0005  max mem: 13273
Epoch: [214]  [ 600/1251]  eta: 0:02:17  lr: 0.000853  min_lr: 0.000853  loss: 3.1238 (2.9406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (1.0114)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [214]  [ 800/1251]  eta: 0:01:34  lr: 0.000850  min_lr: 0.000850  loss: 2.7569 (2.9299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0877 (1.0176)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [214]  [1000/1251]  eta: 0:00:52  lr: 0.000847  min_lr: 0.000847  loss: 2.5222 (2.9333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9701 (1.0057)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [214]  [1200/1251]  eta: 0:00:10  lr: 0.000844  min_lr: 0.000844  loss: 3.1072 (2.9276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9419 (0.9948)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 2.6090 (2.9250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (0.9919)  time: 0.1699  data: 0.0006  max mem: 13273
Epoch: [214] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 2.6090 (2.9330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (0.9919)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6483 (0.6483)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.5565  data: 5.4141  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7958 (0.8212)  acc1: 82.8000 (82.8000)  acc5: 97.6000 (97.0182)  time: 0.7692  data: 0.6651  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0257 (1.0139)  acc1: 77.2000 (79.0857)  acc5: 94.0000 (94.7048)  time: 0.2072  data: 0.1116  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0774 (1.0186)  acc1: 77.2000 (78.8960)  acc5: 92.8000 (94.6400)  time: 0.2075  data: 0.1138  max mem: 13273
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 79.094 Acc@5 94.718 loss 1.008
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.16%
Epoch: [215]  [   0/1251]  eta: 1:06:00  lr: 0.000843  min_lr: 0.000843  loss: 2.9924 (2.9924)  weight_decay: 0.0500 (0.0500)  time: 3.1658  data: 2.4684  max mem: 13273
Epoch: [215]  [ 200/1251]  eta: 0:03:52  lr: 0.000841  min_lr: 0.000841  loss: 2.4419 (2.9737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9889 (1.0213)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [215]  [ 400/1251]  eta: 0:03:01  lr: 0.000838  min_lr: 0.000838  loss: 2.3995 (2.9320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9778 (0.9957)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [215]  [ 600/1251]  eta: 0:02:17  lr: 0.000835  min_lr: 0.000835  loss: 2.5513 (2.9321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9523 (1.0034)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [215]  [ 800/1251]  eta: 0:01:34  lr: 0.000832  min_lr: 0.000832  loss: 2.8468 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (0.9868)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [215]  [1000/1251]  eta: 0:00:52  lr: 0.000829  min_lr: 0.000829  loss: 3.1615 (2.9256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9398 (0.9849)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [215]  [1200/1251]  eta: 0:00:10  lr: 0.000826  min_lr: 0.000826  loss: 2.2963 (2.9167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9123 (0.9857)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 2.6146 (2.9169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9267 (0.9856)  time: 0.1706  data: 0.0008  max mem: 13273
Epoch: [215] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 2.6146 (2.9197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9267 (0.9856)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6657 (0.6657)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.7978  data: 5.6916  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7656 (0.8046)  acc1: 84.4000 (83.0909)  acc5: 97.6000 (96.9091)  time: 0.7457  data: 0.6483  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0143 (0.9869)  acc1: 77.6000 (79.3524)  acc5: 94.4000 (94.5905)  time: 0.1997  data: 0.1062  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1129 (0.9966)  acc1: 77.2000 (78.9440)  acc5: 92.8000 (94.4480)  time: 0.2012  data: 0.1085  max mem: 13273
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 78.966 Acc@5 94.856 loss 0.982
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.16%
Epoch: [216]  [   0/1251]  eta: 1:04:23  lr: 0.000825  min_lr: 0.000825  loss: 2.1180 (2.1180)  weight_decay: 0.0500 (0.0500)  time: 3.0884  data: 2.3251  max mem: 13273
Epoch: [216]  [ 200/1251]  eta: 0:03:54  lr: 0.000822  min_lr: 0.000822  loss: 2.5172 (2.8638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8717 (0.9750)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [216]  [ 400/1251]  eta: 0:03:02  lr: 0.000819  min_lr: 0.000819  loss: 2.7073 (2.9135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (0.9866)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [216]  [ 600/1251]  eta: 0:02:18  lr: 0.000817  min_lr: 0.000817  loss: 2.5177 (2.8905)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0851 (1.0175)  time: 0.2075  data: 0.0005  max mem: 13273
Epoch: [216]  [ 800/1251]  eta: 0:01:34  lr: 0.000814  min_lr: 0.000814  loss: 2.5254 (2.9022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9398 (1.0210)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [216]  [1000/1251]  eta: 0:00:52  lr: 0.000811  min_lr: 0.000811  loss: 2.5414 (2.8932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8990 (1.0056)  time: 0.2067  data: 0.0004  max mem: 13273
Epoch: [216]  [1200/1251]  eta: 0:00:10  lr: 0.000808  min_lr: 0.000808  loss: 2.3743 (2.8879)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.8333 (2.8895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9439 (nan)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [216] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.8333 (2.9255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9439 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7155 (0.7155)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.6977  data: 5.5910  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8403 (0.8666)  acc1: 83.6000 (83.2000)  acc5: 96.8000 (96.7273)  time: 0.7170  data: 0.6181  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0416 (1.0499)  acc1: 77.6000 (79.4476)  acc5: 94.4000 (94.6095)  time: 0.1953  data: 0.0999  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1344 (1.0638)  acc1: 77.6000 (78.7680)  acc5: 93.6000 (94.5280)  time: 0.2193  data: 0.1255  max mem: 13273
Test: Total time: 0:00:10 (0.4226 s / it)
* Acc@1 79.042 Acc@5 94.870 loss 1.049
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.16%
Epoch: [217]  [   0/1251]  eta: 1:03:57  lr: 0.000807  min_lr: 0.000807  loss: 4.2283 (4.2283)  weight_decay: 0.0500 (0.0500)  time: 3.0672  data: 2.3797  max mem: 13273
Epoch: [217]  [ 200/1251]  eta: 0:03:53  lr: 0.000804  min_lr: 0.000804  loss: 2.6108 (2.9903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (0.9713)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [217]  [ 400/1251]  eta: 0:03:01  lr: 0.000801  min_lr: 0.000801  loss: 3.0278 (2.9079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1059 (0.9944)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [217]  [ 600/1251]  eta: 0:02:17  lr: 0.000799  min_lr: 0.000799  loss: 2.1722 (2.8982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9270 (0.9827)  time: 0.2070  data: 0.0004  max mem: 13273
Epoch: [217]  [ 800/1251]  eta: 0:01:34  lr: 0.000796  min_lr: 0.000796  loss: 3.3783 (2.9019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9738 (0.9941)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [217]  [1000/1251]  eta: 0:00:52  lr: 0.000793  min_lr: 0.000793  loss: 2.5952 (2.9175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (0.9904)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [217]  [1200/1251]  eta: 0:00:10  lr: 0.000790  min_lr: 0.000790  loss: 2.6763 (2.9106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9976 (0.9994)  time: 0.2085  data: 0.0004  max mem: 13273
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 2.3659 (2.9100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (0.9978)  time: 0.1707  data: 0.0008  max mem: 13273
Epoch: [217] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 2.3659 (2.9164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (0.9978)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6576 (0.6576)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5033  data: 5.3961  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8181 (0.8329)  acc1: 84.0000 (82.9818)  acc5: 96.8000 (97.0545)  time: 0.7402  data: 0.6434  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0632 (0.9924)  acc1: 77.2000 (79.3524)  acc5: 94.0000 (94.8381)  time: 0.2102  data: 0.1172  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0808 (1.0031)  acc1: 77.2000 (79.0240)  acc5: 93.6000 (94.7360)  time: 0.2090  data: 0.1172  max mem: 13273
Test: Total time: 0:00:10 (0.4065 s / it)
* Acc@1 79.222 Acc@5 94.948 loss 0.990
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.22%
Epoch: [218]  [   0/1251]  eta: 1:03:02  lr: 0.000789  min_lr: 0.000789  loss: 2.2424 (2.2424)  weight_decay: 0.0500 (0.0500)  time: 3.0234  data: 2.7636  max mem: 13273
Epoch: [218]  [ 200/1251]  eta: 0:03:51  lr: 0.000786  min_lr: 0.000786  loss: 3.0560 (2.7750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8845 (0.9415)  time: 0.2079  data: 0.0006  max mem: 13273
Epoch: [218]  [ 400/1251]  eta: 0:03:00  lr: 0.000784  min_lr: 0.000784  loss: 2.7194 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9583 (0.9459)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [218]  [ 600/1251]  eta: 0:02:16  lr: 0.000781  min_lr: 0.000781  loss: 2.2234 (2.8312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0235 (0.9592)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [218]  [ 800/1251]  eta: 0:01:34  lr: 0.000778  min_lr: 0.000778  loss: 2.5120 (2.8662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8935 (0.9641)  time: 0.2040  data: 0.0005  max mem: 13273
Epoch: [218]  [1000/1251]  eta: 0:00:52  lr: 0.000775  min_lr: 0.000775  loss: 2.5061 (2.8858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9582 (0.9639)  time: 0.2090  data: 0.0004  max mem: 13273
Epoch: [218]  [1200/1251]  eta: 0:00:10  lr: 0.000772  min_lr: 0.000772  loss: 3.2638 (2.8884)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1269 (0.9824)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 3.0837 (2.8942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0194 (0.9806)  time: 0.1726  data: 0.0006  max mem: 13273
Epoch: [218] Total time: 0:04:20 (0.2080 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 3.0837 (2.9200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0194 (0.9806)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7151 (0.7151)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.7967  data: 5.6906  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8656 (0.8728)  acc1: 84.4000 (82.9455)  acc5: 97.2000 (97.2000)  time: 0.7450  data: 0.6480  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0786 (1.0432)  acc1: 77.2000 (79.2000)  acc5: 94.4000 (94.9143)  time: 0.1942  data: 0.1005  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1294 (1.0538)  acc1: 77.2000 (78.8320)  acc5: 93.2000 (94.7680)  time: 0.1951  data: 0.1018  max mem: 13273
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 79.192 Acc@5 94.870 loss 1.044
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.22%
Epoch: [219]  [   0/1251]  eta: 1:09:56  lr: 0.000771  min_lr: 0.000771  loss: 1.9361 (1.9361)  weight_decay: 0.0500 (0.0500)  time: 3.3548  data: 2.6075  max mem: 13273
Epoch: [219]  [ 200/1251]  eta: 0:03:52  lr: 0.000769  min_lr: 0.000769  loss: 2.7128 (2.9479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0888 (1.0365)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [219]  [ 400/1251]  eta: 0:03:02  lr: 0.000766  min_lr: 0.000766  loss: 2.4415 (2.9281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8330 (0.9927)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [219]  [ 600/1251]  eta: 0:02:17  lr: 0.000763  min_lr: 0.000763  loss: 3.0516 (2.9235)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0149 (0.9965)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [219]  [ 800/1251]  eta: 0:01:34  lr: 0.000760  min_lr: 0.000760  loss: 2.6242 (2.9092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9379 (0.9815)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [219]  [1000/1251]  eta: 0:00:52  lr: 0.000757  min_lr: 0.000757  loss: 2.8413 (2.8976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9817 (0.9826)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [219]  [1200/1251]  eta: 0:00:10  lr: 0.000755  min_lr: 0.000755  loss: 3.5110 (2.9032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0125 (0.9908)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.3673 (2.8960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0125 (0.9926)  time: 0.1706  data: 0.0009  max mem: 13273
Epoch: [219] Total time: 0:04:21 (0.2094 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.3673 (2.9169)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0125 (0.9926)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6382 (0.6382)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.6669  data: 5.5614  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8018 (0.7909)  acc1: 83.2000 (82.8000)  acc5: 96.4000 (96.6182)  time: 0.7472  data: 0.6517  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0011 (0.9551)  acc1: 77.6000 (79.0095)  acc5: 94.0000 (94.8381)  time: 0.2100  data: 0.1175  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0097 (0.9602)  acc1: 76.8000 (78.7200)  acc5: 93.6000 (94.7520)  time: 0.2097  data: 0.1175  max mem: 13273
Test: Total time: 0:00:10 (0.4122 s / it)
* Acc@1 79.258 Acc@5 95.036 loss 0.946
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.26%
Epoch: [220]  [   0/1251]  eta: 1:06:01  lr: 0.000754  min_lr: 0.000754  loss: 3.6842 (3.6842)  weight_decay: 0.0500 (0.0500)  time: 3.1666  data: 2.9190  max mem: 13273
Epoch: [220]  [ 200/1251]  eta: 0:03:52  lr: 0.000751  min_lr: 0.000751  loss: 2.7523 (2.9325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8871 (0.9809)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [220]  [ 400/1251]  eta: 0:03:01  lr: 0.000748  min_lr: 0.000748  loss: 2.7295 (2.9035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9257 (0.9611)  time: 0.2072  data: 0.0004  max mem: 13273
Epoch: [220]  [ 600/1251]  eta: 0:02:17  lr: 0.000745  min_lr: 0.000745  loss: 3.0266 (2.9043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9083 (0.9772)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [220]  [ 800/1251]  eta: 0:01:34  lr: 0.000743  min_lr: 0.000743  loss: 3.1237 (2.9006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9255 (0.9786)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [220]  [1000/1251]  eta: 0:00:52  lr: 0.000740  min_lr: 0.000740  loss: 2.9232 (2.9231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0081 (0.9844)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [220]  [1200/1251]  eta: 0:00:10  lr: 0.000737  min_lr: 0.000737  loss: 2.6807 (2.9138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0263 (0.9899)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.5832 (2.9179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8891 (0.9889)  time: 0.1704  data: 0.0006  max mem: 13273
Epoch: [220] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.5832 (2.9140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8891 (0.9889)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6271 (0.6271)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6561  data: 5.5507  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8275 (0.8248)  acc1: 82.8000 (82.8727)  acc5: 96.4000 (96.5455)  time: 0.7403  data: 0.6426  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0830 (0.9934)  acc1: 77.6000 (79.0667)  acc5: 93.6000 (94.6667)  time: 0.2053  data: 0.1116  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0776 (1.0065)  acc1: 76.8000 (78.6400)  acc5: 93.6000 (94.5920)  time: 0.2163  data: 0.1236  max mem: 13273
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 79.186 Acc@5 94.908 loss 0.993
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.26%
Epoch: [221]  [   0/1251]  eta: 1:04:06  lr: 0.000736  min_lr: 0.000736  loss: 2.6306 (2.6306)  weight_decay: 0.0500 (0.0500)  time: 3.0746  data: 2.8072  max mem: 13273
Epoch: [221]  [ 200/1251]  eta: 0:03:52  lr: 0.000734  min_lr: 0.000734  loss: 3.5034 (2.8466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9897 (0.9980)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [221]  [ 400/1251]  eta: 0:03:01  lr: 0.000731  min_lr: 0.000731  loss: 2.8747 (2.9035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0453 (1.0172)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [221]  [ 600/1251]  eta: 0:02:17  lr: 0.000728  min_lr: 0.000728  loss: 3.6167 (2.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9349 (1.0054)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [221]  [ 800/1251]  eta: 0:01:34  lr: 0.000725  min_lr: 0.000725  loss: 2.7292 (2.8839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0328 (1.0294)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [221]  [1000/1251]  eta: 0:00:52  lr: 0.000722  min_lr: 0.000722  loss: 2.4286 (2.8727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8532 (1.0109)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [221]  [1200/1251]  eta: 0:00:10  lr: 0.000720  min_lr: 0.000720  loss: 3.1872 (2.8891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9463 (1.0066)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.3727 (2.8838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9798 (1.0043)  time: 0.1711  data: 0.0006  max mem: 13273
Epoch: [221] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.3727 (2.9052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9798 (1.0043)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.5975 (0.5975)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.9732  data: 5.8475  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7504 (0.7715)  acc1: 83.2000 (82.8727)  acc5: 97.2000 (96.7636)  time: 0.7416  data: 0.6405  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9686 (0.9541)  acc1: 76.4000 (78.8762)  acc5: 94.4000 (94.6667)  time: 0.1910  data: 0.0961  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0613 (0.9656)  acc1: 75.6000 (78.6240)  acc5: 93.6000 (94.5600)  time: 0.2114  data: 0.1183  max mem: 13273
Test: Total time: 0:00:10 (0.4277 s / it)
* Acc@1 79.204 Acc@5 94.870 loss 0.944
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.26%
Epoch: [222]  [   0/1251]  eta: 1:05:16  lr: 0.000719  min_lr: 0.000719  loss: 2.9306 (2.9306)  weight_decay: 0.0500 (0.0500)  time: 3.1310  data: 1.7020  max mem: 13273
Epoch: [222]  [ 200/1251]  eta: 0:03:53  lr: 0.000716  min_lr: 0.000716  loss: 2.3302 (2.8290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9602 (1.0157)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [222]  [ 400/1251]  eta: 0:03:02  lr: 0.000714  min_lr: 0.000714  loss: 2.2903 (2.8160)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0423 (1.0280)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [222]  [ 600/1251]  eta: 0:02:17  lr: 0.000711  min_lr: 0.000711  loss: 2.3118 (2.8182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9576 (1.0134)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [222]  [ 800/1251]  eta: 0:01:34  lr: 0.000708  min_lr: 0.000708  loss: 3.1322 (2.8775)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2186 (1.0320)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [222]  [1000/1251]  eta: 0:00:52  lr: 0.000705  min_lr: 0.000705  loss: 3.4860 (2.8890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0568 (1.0363)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [222]  [1200/1251]  eta: 0:00:10  lr: 0.000703  min_lr: 0.000703  loss: 3.4706 (2.8943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0295 (1.0358)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.4480 (2.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9722 (1.0340)  time: 0.1704  data: 0.0008  max mem: 13273
Epoch: [222] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.4480 (2.9106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9722 (1.0340)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6528 (0.6528)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.3207  data: 5.2136  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8150 (0.8339)  acc1: 83.2000 (82.7636)  acc5: 97.2000 (96.5091)  time: 0.7475  data: 0.6507  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0676 (1.0130)  acc1: 77.2000 (79.2191)  acc5: 93.6000 (94.5905)  time: 0.2201  data: 0.1260  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0596 (1.0187)  acc1: 77.2000 (78.9280)  acc5: 94.0000 (94.5920)  time: 0.2197  data: 0.1259  max mem: 13273
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 79.236 Acc@5 94.870 loss 1.010
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.26%
Epoch: [223]  [   0/1251]  eta: 1:08:39  lr: 0.000702  min_lr: 0.000702  loss: 1.8506 (1.8506)  weight_decay: 0.0500 (0.0500)  time: 3.2932  data: 3.0539  max mem: 13273
Epoch: [223]  [ 200/1251]  eta: 0:03:54  lr: 0.000699  min_lr: 0.000699  loss: 2.9658 (2.8559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9922 (1.0593)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [223]  [ 400/1251]  eta: 0:03:03  lr: 0.000696  min_lr: 0.000696  loss: 2.4801 (2.8419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9295 (1.0235)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [223]  [ 600/1251]  eta: 0:02:18  lr: 0.000694  min_lr: 0.000694  loss: 2.3685 (2.8343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9778 (1.0200)  time: 0.2070  data: 0.0007  max mem: 13273
Epoch: [223]  [ 800/1251]  eta: 0:01:34  lr: 0.000691  min_lr: 0.000691  loss: 2.7639 (2.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9343 (1.0121)  time: 0.2054  data: 0.0006  max mem: 13273
Epoch: [223]  [1000/1251]  eta: 0:00:52  lr: 0.000688  min_lr: 0.000688  loss: 2.3026 (2.8664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9759 (1.0066)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [223]  [1200/1251]  eta: 0:00:10  lr: 0.000686  min_lr: 0.000686  loss: 2.5578 (2.8710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9901 (1.0004)  time: 0.2069  data: 0.0004  max mem: 13273
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 2.9600 (2.8742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9237 (1.0006)  time: 0.1700  data: 0.0006  max mem: 13273
Epoch: [223] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 2.9600 (2.8945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9237 (1.0006)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6416 (0.6416)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.7088  data: 5.6048  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7775 (0.8086)  acc1: 84.0000 (83.2364)  acc5: 97.2000 (97.1636)  time: 0.7451  data: 0.6509  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0412 (0.9880)  acc1: 77.6000 (79.4476)  acc5: 94.0000 (94.9714)  time: 0.2086  data: 0.1168  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0959 (0.9954)  acc1: 77.6000 (79.2960)  acc5: 94.0000 (94.9440)  time: 0.2106  data: 0.1167  max mem: 13273
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 79.452 Acc@5 95.078 loss 0.987
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.45%
Epoch: [224]  [   0/1251]  eta: 1:09:32  lr: 0.000685  min_lr: 0.000685  loss: 2.3028 (2.3028)  weight_decay: 0.0500 (0.0500)  time: 3.3353  data: 3.0991  max mem: 13273
Epoch: [224]  [ 200/1251]  eta: 0:03:52  lr: 0.000682  min_lr: 0.000682  loss: 3.3980 (2.9217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9265 (1.0735)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [224]  [ 400/1251]  eta: 0:03:00  lr: 0.000680  min_lr: 0.000680  loss: 2.8375 (2.8814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1368 (1.0618)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [224]  [ 600/1251]  eta: 0:02:16  lr: 0.000677  min_lr: 0.000677  loss: 2.7248 (2.8701)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0308 (nan)  time: 0.2035  data: 0.0004  max mem: 13273
Epoch: [224]  [ 800/1251]  eta: 0:01:34  lr: 0.000674  min_lr: 0.000674  loss: 2.9446 (2.8825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9600 (nan)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [224]  [1000/1251]  eta: 0:00:52  lr: 0.000671  min_lr: 0.000671  loss: 2.5385 (2.8874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9939 (nan)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [224]  [1200/1251]  eta: 0:00:10  lr: 0.000669  min_lr: 0.000669  loss: 3.0006 (2.8811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9503 (nan)  time: 0.2046  data: 0.0006  max mem: 13273
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.2698 (2.8787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9944 (nan)  time: 0.1705  data: 0.0006  max mem: 13273
Epoch: [224] Total time: 0:04:19 (0.2077 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.2698 (2.8980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9944 (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6291 (0.6291)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.4048  data: 5.2994  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7706 (0.7799)  acc1: 84.0000 (83.4182)  acc5: 97.2000 (97.0909)  time: 0.7544  data: 0.6567  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9734 (0.9428)  acc1: 78.0000 (79.9619)  acc5: 94.4000 (95.1048)  time: 0.2187  data: 0.1239  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9715 (0.9502)  acc1: 78.0000 (79.6640)  acc5: 94.0000 (95.0240)  time: 0.2180  data: 0.1238  max mem: 13273
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 79.664 Acc@5 95.002 loss 0.938
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.66%
Epoch: [225]  [   0/1251]  eta: 1:11:01  lr: 0.000668  min_lr: 0.000668  loss: 1.9482 (1.9482)  weight_decay: 0.0500 (0.0500)  time: 3.4063  data: 3.1894  max mem: 13273
Epoch: [225]  [ 200/1251]  eta: 0:03:52  lr: 0.000665  min_lr: 0.000665  loss: 2.6924 (2.9212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9836 (1.0930)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [225]  [ 400/1251]  eta: 0:03:01  lr: 0.000663  min_lr: 0.000663  loss: 2.7051 (2.9051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9827 (1.0480)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [225]  [ 600/1251]  eta: 0:02:17  lr: 0.000660  min_lr: 0.000660  loss: 2.8722 (2.8957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1432 (1.0538)  time: 0.2077  data: 0.0004  max mem: 13273
Epoch: [225]  [ 800/1251]  eta: 0:01:34  lr: 0.000657  min_lr: 0.000657  loss: 3.1029 (2.9059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9771 (1.0395)  time: 0.2056  data: 0.0003  max mem: 13273
Epoch: [225]  [1000/1251]  eta: 0:00:52  lr: 0.000655  min_lr: 0.000655  loss: 2.8628 (2.9291)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0551 (1.0402)  time: 0.2184  data: 0.0006  max mem: 13273
Epoch: [225]  [1200/1251]  eta: 0:00:10  lr: 0.000652  min_lr: 0.000652  loss: 2.2845 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9290 (1.0366)  time: 0.2079  data: 0.0005  max mem: 13273
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.7590 (2.9195)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0104 (1.0362)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [225] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.7590 (2.8989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0104 (1.0362)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6801 (0.6801)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.8114  data: 5.7060  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7898 (0.8134)  acc1: 85.2000 (83.3455)  acc5: 97.2000 (97.0182)  time: 0.7249  data: 0.6307  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0170 (1.0044)  acc1: 77.2000 (79.4095)  acc5: 94.4000 (95.0667)  time: 0.1833  data: 0.0914  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0886 (1.0149)  acc1: 77.2000 (79.0880)  acc5: 94.0000 (94.9440)  time: 0.1832  data: 0.0913  max mem: 13273
Test: Total time: 0:00:09 (0.3973 s / it)
* Acc@1 79.470 Acc@5 94.962 loss 1.013
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.66%
Epoch: [226]  [   0/1251]  eta: 1:03:04  lr: 0.000651  min_lr: 0.000651  loss: 2.0153 (2.0153)  weight_decay: 0.0500 (0.0500)  time: 3.0250  data: 2.7253  max mem: 13273
Epoch: [226]  [ 200/1251]  eta: 0:03:54  lr: 0.000649  min_lr: 0.000649  loss: 3.0187 (2.9041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8775 (0.9465)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [226]  [ 400/1251]  eta: 0:03:02  lr: 0.000646  min_lr: 0.000646  loss: 3.2743 (2.8717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9943 (0.9503)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [226]  [ 600/1251]  eta: 0:02:18  lr: 0.000644  min_lr: 0.000644  loss: 3.4716 (2.8527)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0134 (0.9825)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [226]  [ 800/1251]  eta: 0:01:35  lr: 0.000641  min_lr: 0.000641  loss: 3.1656 (2.8512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8865 (0.9760)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [226]  [1000/1251]  eta: 0:00:52  lr: 0.000638  min_lr: 0.000638  loss: 2.4980 (2.8757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1354 (1.0045)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [226]  [1200/1251]  eta: 0:00:10  lr: 0.000636  min_lr: 0.000636  loss: 2.3606 (2.8783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9975 (1.0040)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 2.6807 (2.8824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9521 (1.0043)  time: 0.1707  data: 0.0010  max mem: 13273
Epoch: [226] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 2.6807 (2.8936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9521 (1.0043)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6531 (0.6531)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.9157  data: 5.8085  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8020 (0.8028)  acc1: 82.8000 (83.0545)  acc5: 96.8000 (97.0545)  time: 0.7550  data: 0.6582  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0366 (0.9750)  acc1: 77.2000 (79.2952)  acc5: 94.0000 (95.0857)  time: 0.1947  data: 0.1016  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0209 (0.9846)  acc1: 77.6000 (79.1040)  acc5: 94.0000 (94.9920)  time: 0.1972  data: 0.1046  max mem: 13273
Test: Total time: 0:00:10 (0.4131 s / it)
* Acc@1 79.502 Acc@5 95.072 loss 0.977
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.66%
Epoch: [227]  [   0/1251]  eta: 1:10:10  lr: 0.000635  min_lr: 0.000635  loss: 4.0707 (4.0707)  weight_decay: 0.0500 (0.0500)  time: 3.3657  data: 3.1036  max mem: 13273
Epoch: [227]  [ 200/1251]  eta: 0:03:53  lr: 0.000632  min_lr: 0.000632  loss: 2.5988 (2.9245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9312 (1.0030)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [227]  [ 400/1251]  eta: 0:03:01  lr: 0.000630  min_lr: 0.000630  loss: 2.2437 (2.8583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (1.0351)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [227]  [ 600/1251]  eta: 0:02:17  lr: 0.000627  min_lr: 0.000627  loss: 3.2199 (2.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (1.0197)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [227]  [ 800/1251]  eta: 0:01:34  lr: 0.000625  min_lr: 0.000625  loss: 2.2125 (2.8612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9652 (1.0198)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [227]  [1000/1251]  eta: 0:00:52  lr: 0.000622  min_lr: 0.000622  loss: 2.2445 (2.8670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0491 (1.0318)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [227]  [1200/1251]  eta: 0:00:10  lr: 0.000619  min_lr: 0.000619  loss: 3.3740 (2.8691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9367 (1.0320)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 3.2018 (2.8713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9085 (1.0277)  time: 0.1707  data: 0.0006  max mem: 13273
Epoch: [227] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 3.2018 (2.8849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9085 (1.0277)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8016 (0.8016)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.8373  data: 5.7078  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8952 (0.9228)  acc1: 83.6000 (83.6000)  acc5: 97.2000 (96.9818)  time: 0.6916  data: 0.5902  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.1601 (1.0951)  acc1: 77.2000 (79.3143)  acc5: 93.6000 (94.9333)  time: 0.1688  data: 0.0744  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1860 (1.1084)  acc1: 76.4000 (78.8640)  acc5: 93.6000 (94.7840)  time: 0.1875  data: 0.0949  max mem: 13273
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 79.298 Acc@5 94.890 loss 1.105
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.66%
Epoch: [228]  [   0/1251]  eta: 1:04:26  lr: 0.000619  min_lr: 0.000619  loss: 2.2805 (2.2805)  weight_decay: 0.0500 (0.0500)  time: 3.0911  data: 2.6313  max mem: 13273
Epoch: [228]  [ 200/1251]  eta: 0:03:52  lr: 0.000616  min_lr: 0.000616  loss: 2.6113 (2.8438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9378 (1.0459)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [228]  [ 400/1251]  eta: 0:03:01  lr: 0.000614  min_lr: 0.000614  loss: 2.3536 (2.7724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9647 (1.0408)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [228]  [ 600/1251]  eta: 0:02:17  lr: 0.000611  min_lr: 0.000611  loss: 3.5068 (2.8227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9524 (1.0261)  time: 0.2084  data: 0.0006  max mem: 13273
Epoch: [228]  [ 800/1251]  eta: 0:01:34  lr: 0.000608  min_lr: 0.000608  loss: 3.3834 (2.8547)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0054 (1.0331)  time: 0.2080  data: 0.0004  max mem: 13273
Epoch: [228]  [1000/1251]  eta: 0:00:52  lr: 0.000606  min_lr: 0.000606  loss: 2.5937 (2.8509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0284 (1.0335)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [228]  [1200/1251]  eta: 0:00:10  lr: 0.000603  min_lr: 0.000603  loss: 2.8556 (2.8538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9954 (1.0310)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 3.1522 (2.8567)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0220 (1.0290)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [228] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 3.1522 (2.8923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0220 (1.0290)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7243 (0.7243)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 5.7099  data: 5.6028  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8733 (0.8642)  acc1: 83.6000 (82.5091)  acc5: 96.8000 (96.9818)  time: 0.7453  data: 0.6497  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0720 (1.0321)  acc1: 76.0000 (79.0857)  acc5: 94.0000 (94.9333)  time: 0.2002  data: 0.1078  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0643 (1.0397)  acc1: 77.2000 (78.9280)  acc5: 93.6000 (94.8160)  time: 0.2000  data: 0.1077  max mem: 13273
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 79.446 Acc@5 95.106 loss 1.034
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.66%
Epoch: [229]  [   0/1251]  eta: 1:11:08  lr: 0.000603  min_lr: 0.000603  loss: 1.9500 (1.9500)  weight_decay: 0.0500 (0.0500)  time: 3.4121  data: 1.6473  max mem: 13273
Epoch: [229]  [ 200/1251]  eta: 0:03:53  lr: 0.000600  min_lr: 0.000600  loss: 2.8516 (2.9079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0375 (1.0579)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [229]  [ 400/1251]  eta: 0:03:02  lr: 0.000597  min_lr: 0.000597  loss: 2.1704 (2.8841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9934 (1.0649)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [229]  [ 600/1251]  eta: 0:02:17  lr: 0.000595  min_lr: 0.000595  loss: 2.7924 (2.8953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9996 (1.0587)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [229]  [ 800/1251]  eta: 0:01:34  lr: 0.000592  min_lr: 0.000592  loss: 2.8748 (2.9196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9742 (1.0482)  time: 0.2119  data: 0.0005  max mem: 13273
Epoch: [229]  [1000/1251]  eta: 0:00:52  lr: 0.000590  min_lr: 0.000590  loss: 2.3304 (2.9042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9842 (1.0451)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [229]  [1200/1251]  eta: 0:00:10  lr: 0.000587  min_lr: 0.000587  loss: 2.7564 (2.9127)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0342 (inf)  time: 0.2172  data: 0.0005  max mem: 13273
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.6373 (2.9176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0095 (inf)  time: 0.1704  data: 0.0010  max mem: 13273
Epoch: [229] Total time: 0:04:22 (0.2095 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.6373 (2.8917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0095 (inf)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6862 (0.6862)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.8568  data: 5.7498  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8302 (0.8309)  acc1: 84.4000 (83.2000)  acc5: 97.2000 (97.0545)  time: 0.7590  data: 0.6625  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0527 (0.9995)  acc1: 78.0000 (79.5238)  acc5: 94.4000 (95.2191)  time: 0.2063  data: 0.1134  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0958 (1.0107)  acc1: 78.0000 (79.2480)  acc5: 94.0000 (95.0560)  time: 0.2100  data: 0.1178  max mem: 13273
Test: Total time: 0:00:10 (0.4209 s / it)
* Acc@1 79.512 Acc@5 95.062 loss 1.004
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.66%
Epoch: [230]  [   0/1251]  eta: 1:02:42  lr: 0.000587  min_lr: 0.000587  loss: 3.7386 (3.7386)  weight_decay: 0.0500 (0.0500)  time: 3.0075  data: 2.7188  max mem: 13273
Epoch: [230]  [ 200/1251]  eta: 0:03:53  lr: 0.000584  min_lr: 0.000584  loss: 2.6909 (2.7729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0548 (1.0488)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [230]  [ 400/1251]  eta: 0:03:02  lr: 0.000582  min_lr: 0.000582  loss: 3.3119 (2.8214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9968 (1.0623)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [230]  [ 600/1251]  eta: 0:02:17  lr: 0.000579  min_lr: 0.000579  loss: 3.2455 (2.8375)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0979 (1.0617)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [230]  [ 800/1251]  eta: 0:01:34  lr: 0.000577  min_lr: 0.000577  loss: 3.3314 (2.8582)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0925 (1.0741)  time: 0.2079  data: 0.0006  max mem: 13273
Epoch: [230]  [1000/1251]  eta: 0:00:52  lr: 0.000574  min_lr: 0.000574  loss: 2.2872 (2.8635)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0070 (1.0733)  time: 0.2060  data: 0.0007  max mem: 13273
Epoch: [230]  [1200/1251]  eta: 0:00:10  lr: 0.000571  min_lr: 0.000571  loss: 2.5024 (2.8769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1579 (1.0789)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 3.1262 (2.8733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0468 (1.0763)  time: 0.1708  data: 0.0006  max mem: 13273
Epoch: [230] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 3.1262 (2.8751)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0468 (1.0763)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7317 (0.7317)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.4726  data: 5.3635  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.9004 (0.8694)  acc1: 82.4000 (82.0364)  acc5: 96.8000 (96.6182)  time: 0.7789  data: 0.6791  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0552 (1.0296)  acc1: 76.0000 (78.8000)  acc5: 94.0000 (94.8952)  time: 0.2244  data: 0.1296  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1159 (1.0360)  acc1: 76.0000 (78.5120)  acc5: 93.6000 (94.8320)  time: 0.2231  data: 0.1295  max mem: 13273
Test: Total time: 0:00:10 (0.4167 s / it)
* Acc@1 79.376 Acc@5 95.004 loss 1.026
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.66%
Epoch: [231]  [   0/1251]  eta: 1:04:15  lr: 0.000571  min_lr: 0.000571  loss: 3.3135 (3.3135)  weight_decay: 0.0500 (0.0500)  time: 3.0816  data: 2.8367  max mem: 13273
Epoch: [231]  [ 200/1251]  eta: 0:03:53  lr: 0.000568  min_lr: 0.000568  loss: 2.4500 (2.8980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.1441)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [231]  [ 400/1251]  eta: 0:03:02  lr: 0.000566  min_lr: 0.000566  loss: 2.1735 (2.8675)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0582 (1.1099)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [231]  [ 600/1251]  eta: 0:02:17  lr: 0.000563  min_lr: 0.000563  loss: 3.0045 (2.8693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9665 (1.0760)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [231]  [ 800/1251]  eta: 0:01:34  lr: 0.000561  min_lr: 0.000561  loss: 2.2674 (2.8543)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0576 (1.0938)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [231]  [1000/1251]  eta: 0:00:52  lr: 0.000558  min_lr: 0.000558  loss: 2.8304 (2.8776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9832 (1.0776)  time: 0.2070  data: 0.0004  max mem: 13273
Epoch: [231]  [1200/1251]  eta: 0:00:10  lr: 0.000556  min_lr: 0.000556  loss: 2.2944 (2.8715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0179 (1.0684)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 3.1958 (2.8735)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (1.0676)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [231] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 3.1958 (2.8758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (1.0676)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7464 (0.7464)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.2089  data: 5.0962  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8743 (0.8752)  acc1: 84.0000 (83.3455)  acc5: 97.2000 (97.0545)  time: 0.7136  data: 0.6156  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0743 (1.0387)  acc1: 77.2000 (79.6952)  acc5: 94.0000 (94.9524)  time: 0.2109  data: 0.1169  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1099 (1.0492)  acc1: 77.2000 (79.4400)  acc5: 93.2000 (94.8480)  time: 0.2185  data: 0.1253  max mem: 13273
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 79.690 Acc@5 95.084 loss 1.041
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.69%
Epoch: [232]  [   0/1251]  eta: 1:02:07  lr: 0.000555  min_lr: 0.000555  loss: 3.7242 (3.7242)  weight_decay: 0.0500 (0.0500)  time: 2.9798  data: 2.7092  max mem: 13273
Epoch: [232]  [ 200/1251]  eta: 0:03:51  lr: 0.000553  min_lr: 0.000553  loss: 2.5042 (2.9251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1668 (1.1389)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [232]  [ 400/1251]  eta: 0:03:00  lr: 0.000550  min_lr: 0.000550  loss: 2.5265 (2.8867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9390 (1.0920)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [232]  [ 600/1251]  eta: 0:02:16  lr: 0.000548  min_lr: 0.000548  loss: 2.2783 (2.8923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0406 (1.0859)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [232]  [ 800/1251]  eta: 0:01:34  lr: 0.000545  min_lr: 0.000545  loss: 2.5642 (2.8742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0226 (1.0736)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [232]  [1000/1251]  eta: 0:00:52  lr: 0.000543  min_lr: 0.000543  loss: 2.7169 (2.8849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9527 (1.0550)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [232]  [1200/1251]  eta: 0:00:10  lr: 0.000540  min_lr: 0.000540  loss: 2.6685 (2.8811)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1192 (1.0716)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.2198 (2.8781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1192 (1.0737)  time: 0.1701  data: 0.0008  max mem: 13273
Epoch: [232] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.2198 (2.8709)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1192 (1.0737)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7189 (0.7189)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.6761  data: 5.5710  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8144 (0.8216)  acc1: 84.8000 (83.8545)  acc5: 97.2000 (97.0182)  time: 0.7715  data: 0.6751  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0180 (0.9822)  acc1: 78.8000 (79.7905)  acc5: 94.0000 (94.9905)  time: 0.2094  data: 0.1159  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0842 (0.9921)  acc1: 78.8000 (79.4720)  acc5: 93.6000 (94.9280)  time: 0.2089  data: 0.1159  max mem: 13273
Test: Total time: 0:00:10 (0.4120 s / it)
* Acc@1 79.694 Acc@5 95.148 loss 0.989
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.69%
Epoch: [233]  [   0/1251]  eta: 1:00:55  lr: 0.000540  min_lr: 0.000540  loss: 4.1065 (4.1065)  weight_decay: 0.0500 (0.0500)  time: 2.9221  data: 2.6888  max mem: 13273
Epoch: [233]  [ 200/1251]  eta: 0:03:51  lr: 0.000537  min_lr: 0.000537  loss: 2.8370 (2.8277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0026 (1.0820)  time: 0.2069  data: 0.0008  max mem: 13273
Epoch: [233]  [ 400/1251]  eta: 0:03:02  lr: 0.000535  min_lr: 0.000535  loss: 2.2247 (2.8310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9813 (1.0423)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [233]  [ 600/1251]  eta: 0:02:17  lr: 0.000533  min_lr: 0.000533  loss: 2.9739 (2.8544)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0182 (1.0396)  time: 0.2060  data: 0.0008  max mem: 13273
Epoch: [233]  [ 800/1251]  eta: 0:01:34  lr: 0.000530  min_lr: 0.000530  loss: 2.5221 (2.8548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1781 (1.0730)  time: 0.2054  data: 0.0008  max mem: 13273
Epoch: [233]  [1000/1251]  eta: 0:00:52  lr: 0.000528  min_lr: 0.000528  loss: 2.4531 (2.8637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0152 (1.0718)  time: 0.2052  data: 0.0008  max mem: 13273
Epoch: [233]  [1200/1251]  eta: 0:00:10  lr: 0.000525  min_lr: 0.000525  loss: 2.3473 (2.8639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9941 (1.0658)  time: 0.2067  data: 0.0007  max mem: 13273
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 2.4059 (2.8643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0944 (1.0689)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [233] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 2.4059 (2.8693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0944 (1.0689)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6784 (0.6784)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.6173  data: 5.5117  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7865 (0.7971)  acc1: 83.6000 (83.4182)  acc5: 97.2000 (97.1273)  time: 0.7587  data: 0.6646  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9836 (0.9580)  acc1: 77.6000 (79.7143)  acc5: 94.0000 (95.2381)  time: 0.2117  data: 0.1191  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0427 (0.9682)  acc1: 76.8000 (79.4080)  acc5: 93.6000 (95.1040)  time: 0.2110  data: 0.1190  max mem: 13273
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 79.788 Acc@5 95.122 loss 0.960
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.79%
Epoch: [234]  [   0/1251]  eta: 0:59:16  lr: 0.000525  min_lr: 0.000525  loss: 3.5267 (3.5267)  weight_decay: 0.0500 (0.0500)  time: 2.8429  data: 2.5093  max mem: 13273
Epoch: [234]  [ 200/1251]  eta: 0:03:50  lr: 0.000522  min_lr: 0.000522  loss: 2.5160 (2.7986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9876 (1.0061)  time: 0.2046  data: 0.0007  max mem: 13273
Epoch: [234]  [ 400/1251]  eta: 0:03:00  lr: 0.000520  min_lr: 0.000520  loss: 2.7814 (2.8484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0632 (1.0424)  time: 0.2062  data: 0.0009  max mem: 13273
Epoch: [234]  [ 600/1251]  eta: 0:02:16  lr: 0.000517  min_lr: 0.000517  loss: 3.3167 (2.8476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0136 (1.0530)  time: 0.2048  data: 0.0009  max mem: 13273
Epoch: [234]  [ 800/1251]  eta: 0:01:34  lr: 0.000515  min_lr: 0.000515  loss: 2.7673 (2.8491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9517 (1.0499)  time: 0.2036  data: 0.0008  max mem: 13273
Epoch: [234]  [1000/1251]  eta: 0:00:52  lr: 0.000513  min_lr: 0.000513  loss: 2.3810 (2.8552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9632 (1.0435)  time: 0.2054  data: 0.0009  max mem: 13273
Epoch: [234]  [1200/1251]  eta: 0:00:10  lr: 0.000510  min_lr: 0.000510  loss: 2.2136 (2.8597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1034 (1.0463)  time: 0.2065  data: 0.0007  max mem: 13273
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.6102 (2.8603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.0520)  time: 0.1700  data: 0.0006  max mem: 13273
Epoch: [234] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.6102 (2.8662)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.0520)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6336 (0.6336)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.3643  data: 5.2590  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7767 (0.7953)  acc1: 84.8000 (83.6364)  acc5: 97.2000 (97.1273)  time: 0.7259  data: 0.6309  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9976 (0.9796)  acc1: 77.2000 (80.0762)  acc5: 94.4000 (95.1619)  time: 0.2025  data: 0.1103  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1092 (0.9870)  acc1: 77.2000 (79.7760)  acc5: 94.0000 (95.0240)  time: 0.2223  data: 0.1310  max mem: 13273
Test: Total time: 0:00:10 (0.4107 s / it)
* Acc@1 79.678 Acc@5 95.052 loss 0.983
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.79%
Epoch: [235]  [   0/1251]  eta: 1:08:24  lr: 0.000510  min_lr: 0.000510  loss: 2.2636 (2.2636)  weight_decay: 0.0500 (0.0500)  time: 3.2814  data: 2.4677  max mem: 13273
Epoch: [235]  [ 200/1251]  eta: 0:03:52  lr: 0.000507  min_lr: 0.000507  loss: 2.4241 (2.8450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0595 (1.0834)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [235]  [ 400/1251]  eta: 0:03:01  lr: 0.000505  min_lr: 0.000505  loss: 2.1751 (2.8744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1162 (1.0864)  time: 0.2050  data: 0.0008  max mem: 13273
Epoch: [235]  [ 600/1251]  eta: 0:02:17  lr: 0.000502  min_lr: 0.000502  loss: 2.9217 (2.8653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9714 (1.0824)  time: 0.2060  data: 0.0007  max mem: 13273
Epoch: [235]  [ 800/1251]  eta: 0:01:34  lr: 0.000500  min_lr: 0.000500  loss: 3.4462 (2.8512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0861 (1.0796)  time: 0.2041  data: 0.0006  max mem: 13273
Epoch: [235]  [1000/1251]  eta: 0:00:52  lr: 0.000498  min_lr: 0.000498  loss: 2.9121 (2.8513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9376 (1.0737)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [235]  [1200/1251]  eta: 0:00:10  lr: 0.000495  min_lr: 0.000495  loss: 2.3457 (2.8588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9681 (1.0746)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 2.8437 (2.8588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9580 (1.0719)  time: 0.1706  data: 0.0008  max mem: 13273
Epoch: [235] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 2.8437 (2.8629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9580 (1.0719)
Test:  [ 0/25]  eta: 0:01:26  loss: 0.6763 (0.6763)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 3.4667  data: 3.3590  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.8412 (0.8481)  acc1: 83.6000 (83.2727)  acc5: 97.6000 (97.2000)  time: 0.6254  data: 0.5305  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0378 (1.0161)  acc1: 77.6000 (79.7143)  acc5: 94.8000 (95.4667)  time: 0.2913  data: 0.1992  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0849 (1.0206)  acc1: 77.6000 (79.4560)  acc5: 94.8000 (95.3600)  time: 0.2574  data: 0.1654  max mem: 13273
Test: Total time: 0:00:10 (0.4199 s / it)
* Acc@1 79.576 Acc@5 95.140 loss 1.015
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.79%
Epoch: [236]  [   0/1251]  eta: 1:03:41  lr: 0.000495  min_lr: 0.000495  loss: 3.4916 (3.4916)  weight_decay: 0.0500 (0.0500)  time: 3.0551  data: 2.0655  max mem: 13273
Epoch: [236]  [ 200/1251]  eta: 0:03:52  lr: 0.000492  min_lr: 0.000492  loss: 2.9354 (2.8726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9908 (1.0481)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [236]  [ 400/1251]  eta: 0:03:02  lr: 0.000490  min_lr: 0.000490  loss: 2.9198 (2.9230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0687 (1.0723)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [236]  [ 600/1251]  eta: 0:02:17  lr: 0.000488  min_lr: 0.000488  loss: 2.4638 (2.9075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9507 (1.0721)  time: 0.2037  data: 0.0004  max mem: 13273
Epoch: [236]  [ 800/1251]  eta: 0:01:34  lr: 0.000485  min_lr: 0.000485  loss: 2.3878 (2.8992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0841 (1.0778)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [236]  [1000/1251]  eta: 0:00:52  lr: 0.000483  min_lr: 0.000483  loss: 2.4289 (2.8714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0307 (1.0881)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [236]  [1200/1251]  eta: 0:00:10  lr: 0.000481  min_lr: 0.000481  loss: 2.2857 (2.8745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0054 (1.0761)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 2.7674 (2.8698)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0317 (1.0756)  time: 0.1707  data: 0.0008  max mem: 13273
Epoch: [236] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 2.7674 (2.8551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0317 (1.0756)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6812 (0.6812)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.9150  data: 5.7836  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8108 (0.8328)  acc1: 85.2000 (83.3818)  acc5: 97.2000 (96.9091)  time: 0.7631  data: 0.6616  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0744 (1.0072)  acc1: 77.6000 (79.4667)  acc5: 94.0000 (94.9524)  time: 0.1981  data: 0.1037  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0634 (1.0104)  acc1: 78.0000 (79.3280)  acc5: 93.6000 (94.9280)  time: 0.1963  data: 0.1036  max mem: 13273
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 79.790 Acc@5 95.034 loss 1.006
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.79%
Epoch: [237]  [   0/1251]  eta: 1:06:34  lr: 0.000480  min_lr: 0.000480  loss: 2.2083 (2.2083)  weight_decay: 0.0500 (0.0500)  time: 3.1927  data: 2.9656  max mem: 13273
Epoch: [237]  [ 200/1251]  eta: 0:03:50  lr: 0.000478  min_lr: 0.000478  loss: 3.0693 (2.8614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0948 (1.0609)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [237]  [ 400/1251]  eta: 0:03:00  lr: 0.000475  min_lr: 0.000475  loss: 2.7534 (2.8885)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0569 (1.0703)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [237]  [ 600/1251]  eta: 0:02:17  lr: 0.000473  min_lr: 0.000473  loss: 2.8580 (2.8699)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1701 (1.0900)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [237]  [ 800/1251]  eta: 0:01:34  lr: 0.000471  min_lr: 0.000471  loss: 2.3355 (2.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1052 (1.1017)  time: 0.2100  data: 0.0005  max mem: 13273
Epoch: [237]  [1000/1251]  eta: 0:00:52  lr: 0.000468  min_lr: 0.000468  loss: 2.5836 (2.8564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1112 (1.0928)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [237]  [1200/1251]  eta: 0:00:10  lr: 0.000466  min_lr: 0.000466  loss: 2.8965 (2.8635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9805 (1.0908)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.9614 (2.8652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1045 (1.0906)  time: 0.1704  data: 0.0005  max mem: 13273
Epoch: [237] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.9614 (2.8515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1045 (1.0906)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6736 (0.6736)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.7577  data: 5.6525  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8325 (0.8472)  acc1: 85.2000 (83.6364)  acc5: 97.2000 (97.2727)  time: 0.7777  data: 0.6826  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0557 (1.0252)  acc1: 78.0000 (80.0571)  acc5: 94.0000 (95.3714)  time: 0.2153  data: 0.1221  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1543 (1.0352)  acc1: 78.0000 (79.9040)  acc5: 94.0000 (95.1520)  time: 0.2145  data: 0.1221  max mem: 13273
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 79.812 Acc@5 95.120 loss 1.026
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.81%
Epoch: [238]  [   0/1251]  eta: 0:55:12  lr: 0.000466  min_lr: 0.000466  loss: 2.6095 (2.6095)  weight_decay: 0.0500 (0.0500)  time: 2.6481  data: 2.3747  max mem: 13273
Epoch: [238]  [ 200/1251]  eta: 0:03:51  lr: 0.000463  min_lr: 0.000463  loss: 2.7902 (2.7765)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0423 (1.0585)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [238]  [ 400/1251]  eta: 0:03:01  lr: 0.000461  min_lr: 0.000461  loss: 2.7116 (2.8185)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0412 (1.1051)  time: 0.2111  data: 0.0005  max mem: 13273
Epoch: [238]  [ 600/1251]  eta: 0:02:17  lr: 0.000459  min_lr: 0.000459  loss: 2.5321 (2.8293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0100 (1.0984)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [238]  [ 800/1251]  eta: 0:01:34  lr: 0.000456  min_lr: 0.000456  loss: 2.5877 (2.8428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0711 (1.0891)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [238]  [1000/1251]  eta: 0:00:52  lr: 0.000454  min_lr: 0.000454  loss: 2.4171 (2.8315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0341 (1.0809)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [238]  [1200/1251]  eta: 0:00:10  lr: 0.000452  min_lr: 0.000452  loss: 3.0676 (2.8481)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0245 (1.0805)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.2604 (2.8402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0244 (1.0809)  time: 0.1719  data: 0.0008  max mem: 13273
Epoch: [238] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.2604 (2.8461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0244 (1.0809)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6072 (0.6072)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.4230  data: 5.3174  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7411 (0.7604)  acc1: 85.2000 (84.0364)  acc5: 96.8000 (97.1273)  time: 0.7262  data: 0.6306  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9521 (0.9293)  acc1: 78.0000 (79.9810)  acc5: 94.8000 (95.2381)  time: 0.2118  data: 0.1179  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0179 (0.9391)  acc1: 77.6000 (79.7280)  acc5: 94.4000 (95.1520)  time: 0.2207  data: 0.1275  max mem: 13273
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 79.898 Acc@5 95.248 loss 0.934
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.90%
Epoch: [239]  [   0/1251]  eta: 1:09:43  lr: 0.000451  min_lr: 0.000451  loss: 2.1382 (2.1382)  weight_decay: 0.0500 (0.0500)  time: 3.3442  data: 3.1250  max mem: 13273
Epoch: [239]  [ 200/1251]  eta: 0:03:52  lr: 0.000449  min_lr: 0.000449  loss: 2.6382 (2.7360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0732 (0.9986)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [239]  [ 400/1251]  eta: 0:03:01  lr: 0.000447  min_lr: 0.000447  loss: 3.0328 (2.7623)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0606 (1.0529)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [239]  [ 600/1251]  eta: 0:02:17  lr: 0.000445  min_lr: 0.000445  loss: 2.4582 (2.7837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1375 (1.1000)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [239]  [ 800/1251]  eta: 0:01:34  lr: 0.000442  min_lr: 0.000442  loss: 2.8118 (2.8048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (1.0865)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [239]  [1000/1251]  eta: 0:00:52  lr: 0.000440  min_lr: 0.000440  loss: 2.4025 (2.8197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0571 (1.0823)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [239]  [1200/1251]  eta: 0:00:10  lr: 0.000438  min_lr: 0.000438  loss: 2.5686 (2.8319)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0694 (1.0780)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 3.2763 (2.8414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0730 (1.0801)  time: 0.1698  data: 0.0005  max mem: 13273
Epoch: [239] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 3.2763 (2.8559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0730 (1.0801)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7245 (0.7245)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.7740  data: 5.6444  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8783 (0.8808)  acc1: 84.8000 (83.7455)  acc5: 97.2000 (97.1636)  time: 0.7817  data: 0.6790  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0495 (1.0557)  acc1: 79.6000 (79.9429)  acc5: 94.8000 (95.0286)  time: 0.2134  data: 0.1179  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1218 (1.0613)  acc1: 78.4000 (79.6480)  acc5: 93.2000 (94.9600)  time: 0.2115  data: 0.1178  max mem: 13273
Test: Total time: 0:00:10 (0.4193 s / it)
* Acc@1 79.648 Acc@5 95.116 loss 1.054
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.90%
Epoch: [240]  [   0/1251]  eta: 0:59:24  lr: 0.000437  min_lr: 0.000437  loss: 2.1971 (2.1971)  weight_decay: 0.0500 (0.0500)  time: 2.8495  data: 2.4438  max mem: 13273
Epoch: [240]  [ 200/1251]  eta: 0:03:55  lr: 0.000435  min_lr: 0.000435  loss: 2.7548 (2.7563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1038 (1.1658)  time: 0.2144  data: 0.0005  max mem: 13273
Epoch: [240]  [ 400/1251]  eta: 0:03:02  lr: 0.000433  min_lr: 0.000433  loss: 2.6563 (2.8214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9735 (1.1381)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [240]  [ 600/1251]  eta: 0:02:17  lr: 0.000431  min_lr: 0.000431  loss: 2.6470 (2.8179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0710 (1.1230)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [240]  [ 800/1251]  eta: 0:01:34  lr: 0.000428  min_lr: 0.000428  loss: 2.2201 (2.8110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0124 (1.1085)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [240]  [1000/1251]  eta: 0:00:52  lr: 0.000426  min_lr: 0.000426  loss: 2.4171 (2.8291)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0545 (1.1136)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [240]  [1200/1251]  eta: 0:00:10  lr: 0.000424  min_lr: 0.000424  loss: 2.7281 (2.8347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0922 (1.1124)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.2179 (2.8356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0719 (1.1112)  time: 0.1705  data: 0.0006  max mem: 13273
Epoch: [240] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.2179 (2.8404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0719 (1.1112)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6414 (0.6414)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.6383  data: 5.5296  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8041 (0.7911)  acc1: 84.8000 (83.7091)  acc5: 97.6000 (97.2364)  time: 0.7588  data: 0.6613  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9852 (0.9450)  acc1: 77.6000 (79.9238)  acc5: 94.4000 (95.3143)  time: 0.2159  data: 0.1218  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9970 (0.9543)  acc1: 77.6000 (79.4560)  acc5: 94.0000 (95.2480)  time: 0.2148  data: 0.1217  max mem: 13273
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 79.886 Acc@5 95.246 loss 0.950
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.90%
Epoch: [241]  [   0/1251]  eta: 1:03:20  lr: 0.000423  min_lr: 0.000423  loss: 3.5840 (3.5840)  weight_decay: 0.0500 (0.0500)  time: 3.0377  data: 1.9087  max mem: 13273
Epoch: [241]  [ 200/1251]  eta: 0:03:54  lr: 0.000421  min_lr: 0.000421  loss: 2.1124 (2.8711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0595 (1.1270)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [241]  [ 400/1251]  eta: 0:03:02  lr: 0.000419  min_lr: 0.000419  loss: 3.0082 (2.8999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1113 (1.1324)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [241]  [ 600/1251]  eta: 0:02:17  lr: 0.000417  min_lr: 0.000417  loss: 2.2256 (2.8490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0381 (1.1256)  time: 0.2065  data: 0.0003  max mem: 13273
Epoch: [241]  [ 800/1251]  eta: 0:01:34  lr: 0.000415  min_lr: 0.000415  loss: 3.4061 (2.8657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0183 (1.1288)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [241]  [1000/1251]  eta: 0:00:52  lr: 0.000412  min_lr: 0.000412  loss: 2.6237 (2.8572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0282 (1.1195)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [241]  [1200/1251]  eta: 0:00:10  lr: 0.000410  min_lr: 0.000410  loss: 2.3427 (2.8727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9808 (1.1119)  time: 0.2070  data: 0.0006  max mem: 13273
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 3.1820 (2.8781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0737 (1.1125)  time: 0.1708  data: 0.0008  max mem: 13273
Epoch: [241] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 3.1820 (2.8396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0737 (1.1125)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6532 (0.6532)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.7702  data: 5.6366  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7469 (0.7916)  acc1: 84.4000 (83.8909)  acc5: 97.2000 (97.0909)  time: 0.7709  data: 0.6689  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9996 (0.9696)  acc1: 78.4000 (80.1714)  acc5: 94.0000 (95.1429)  time: 0.2177  data: 0.1230  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0852 (0.9782)  acc1: 78.0000 (79.8880)  acc5: 93.6000 (95.0560)  time: 0.2156  data: 0.1229  max mem: 13273
Test: Total time: 0:00:10 (0.4227 s / it)
* Acc@1 79.902 Acc@5 95.206 loss 0.980
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.90%
Epoch: [242]  [   0/1251]  eta: 1:01:44  lr: 0.000410  min_lr: 0.000410  loss: 1.8050 (1.8050)  weight_decay: 0.0500 (0.0500)  time: 2.9611  data: 2.7270  max mem: 13273
Epoch: [242]  [ 200/1251]  eta: 0:03:51  lr: 0.000407  min_lr: 0.000407  loss: 2.4952 (2.8205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0197 (1.0570)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [242]  [ 400/1251]  eta: 0:03:01  lr: 0.000405  min_lr: 0.000405  loss: 2.6061 (2.8496)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1008 (1.0850)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [242]  [ 600/1251]  eta: 0:02:16  lr: 0.000403  min_lr: 0.000403  loss: 3.2431 (2.8550)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0581 (1.0781)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [242]  [ 800/1251]  eta: 0:01:34  lr: 0.000401  min_lr: 0.000401  loss: 2.5816 (2.8411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1185 (1.0801)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [242]  [1000/1251]  eta: 0:00:52  lr: 0.000399  min_lr: 0.000399  loss: 3.3434 (2.8327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0950 (inf)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [242]  [1200/1251]  eta: 0:00:10  lr: 0.000397  min_lr: 0.000397  loss: 2.9075 (2.8260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0296 (inf)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.1876 (2.8304)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0531 (inf)  time: 0.1699  data: 0.0007  max mem: 13273
Epoch: [242] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.1876 (2.8361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0531 (inf)
Test:  [ 0/25]  eta: 0:01:28  loss: 0.6484 (0.6484)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 3.5437  data: 3.4183  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.7654 (0.7868)  acc1: 85.2000 (83.7091)  acc5: 98.0000 (97.2000)  time: 0.6361  data: 0.5389  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9656 (0.9592)  acc1: 78.0000 (80.0952)  acc5: 94.8000 (95.3333)  time: 0.2798  data: 0.1858  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0072 (0.9624)  acc1: 78.8000 (79.8880)  acc5: 94.4000 (95.3120)  time: 0.2226  data: 0.1296  max mem: 13273
Test: Total time: 0:00:09 (0.3918 s / it)
* Acc@1 79.948 Acc@5 95.234 loss 0.962
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.95%
Epoch: [243]  [   0/1251]  eta: 0:57:24  lr: 0.000396  min_lr: 0.000396  loss: 3.3829 (3.3829)  weight_decay: 0.0500 (0.0500)  time: 2.7531  data: 2.5216  max mem: 13273
Epoch: [243]  [ 200/1251]  eta: 0:03:50  lr: 0.000394  min_lr: 0.000394  loss: 2.2383 (2.8083)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0089 (1.0924)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [243]  [ 400/1251]  eta: 0:03:01  lr: 0.000392  min_lr: 0.000392  loss: 2.1336 (2.8067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.1176)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [243]  [ 600/1251]  eta: 0:02:17  lr: 0.000390  min_lr: 0.000390  loss: 3.1346 (2.8007)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0887 (inf)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [243]  [ 800/1251]  eta: 0:01:34  lr: 0.000388  min_lr: 0.000388  loss: 3.3363 (2.8174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1014 (inf)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [243]  [1000/1251]  eta: 0:00:52  lr: 0.000385  min_lr: 0.000385  loss: 2.6427 (2.8211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1172 (inf)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [243]  [1200/1251]  eta: 0:00:10  lr: 0.000383  min_lr: 0.000383  loss: 2.8337 (2.8361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1789 (inf)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.2285 (2.8369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1069 (inf)  time: 0.1703  data: 0.0010  max mem: 13273
Epoch: [243] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.2285 (2.8324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1069 (inf)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5925 (0.5925)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.6674  data: 5.5595  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7275 (0.7348)  acc1: 84.8000 (83.6727)  acc5: 97.6000 (97.2000)  time: 0.7234  data: 0.6247  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9237 (0.9038)  acc1: 77.6000 (80.1333)  acc5: 94.8000 (95.5048)  time: 0.1999  data: 0.1058  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9552 (0.9110)  acc1: 79.2000 (79.9040)  acc5: 94.4000 (95.3760)  time: 0.1985  data: 0.1057  max mem: 13273
Test: Total time: 0:00:10 (0.4067 s / it)
* Acc@1 80.114 Acc@5 95.268 loss 0.908
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.11%
Epoch: [244]  [   0/1251]  eta: 1:10:47  lr: 0.000383  min_lr: 0.000383  loss: 1.9516 (1.9516)  weight_decay: 0.0500 (0.0500)  time: 3.3955  data: 3.1834  max mem: 13273
Epoch: [244]  [ 200/1251]  eta: 0:03:51  lr: 0.000381  min_lr: 0.000381  loss: 3.3293 (2.8233)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1482 (1.1120)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [244]  [ 400/1251]  eta: 0:03:01  lr: 0.000379  min_lr: 0.000379  loss: 3.1649 (2.7997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0191 (1.0865)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [244]  [ 600/1251]  eta: 0:02:17  lr: 0.000377  min_lr: 0.000377  loss: 3.2290 (2.8200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0077 (1.0985)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [244]  [ 800/1251]  eta: 0:01:34  lr: 0.000374  min_lr: 0.000374  loss: 2.8864 (2.8178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0478 (1.1168)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [244]  [1000/1251]  eta: 0:00:52  lr: 0.000372  min_lr: 0.000372  loss: 3.0175 (2.8017)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1390 (1.1170)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [244]  [1200/1251]  eta: 0:00:10  lr: 0.000370  min_lr: 0.000370  loss: 2.8509 (2.8080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0465 (1.1139)  time: 0.2043  data: 0.0006  max mem: 13273
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.8614 (2.8102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0700 (1.1191)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [244] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.8614 (2.8246)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0700 (1.1191)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6917 (0.6917)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5338  data: 5.4285  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8121 (0.8216)  acc1: 84.8000 (84.1091)  acc5: 97.2000 (97.3455)  time: 0.7873  data: 0.6914  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0145 (1.0005)  acc1: 78.0000 (79.9238)  acc5: 95.2000 (95.3143)  time: 0.2211  data: 0.1284  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0925 (1.0093)  acc1: 78.0000 (79.7280)  acc5: 94.4000 (95.2800)  time: 0.2239  data: 0.1320  max mem: 13273
Test: Total time: 0:00:10 (0.4191 s / it)
* Acc@1 79.936 Acc@5 95.142 loss 1.006
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.11%
Epoch: [245]  [   0/1251]  eta: 1:06:46  lr: 0.000370  min_lr: 0.000370  loss: 2.1694 (2.1694)  weight_decay: 0.0500 (0.0500)  time: 3.2024  data: 1.5259  max mem: 13273
Epoch: [245]  [ 200/1251]  eta: 0:03:53  lr: 0.000368  min_lr: 0.000368  loss: 3.1012 (2.8759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0538 (1.0889)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [245]  [ 400/1251]  eta: 0:03:02  lr: 0.000366  min_lr: 0.000366  loss: 2.6753 (2.8866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0499 (1.1052)  time: 0.2089  data: 0.0007  max mem: 13273
Epoch: [245]  [ 600/1251]  eta: 0:02:17  lr: 0.000364  min_lr: 0.000364  loss: 2.9517 (2.8601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1392 (1.1263)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [245]  [ 800/1251]  eta: 0:01:34  lr: 0.000362  min_lr: 0.000362  loss: 2.0240 (2.8366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1041 (1.1260)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [245]  [1000/1251]  eta: 0:00:52  lr: 0.000359  min_lr: 0.000359  loss: 2.6019 (2.8370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0409 (1.1136)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [245]  [1200/1251]  eta: 0:00:10  lr: 0.000357  min_lr: 0.000357  loss: 2.1258 (2.8268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0323 (1.1105)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 3.4566 (2.8303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0219 (1.1050)  time: 0.1703  data: 0.0007  max mem: 13273
Epoch: [245] Total time: 0:04:21 (0.2086 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 3.4566 (2.8167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0219 (1.1050)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7689 (0.7689)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5572  data: 5.4517  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8706 (0.9000)  acc1: 85.6000 (83.8182)  acc5: 97.2000 (97.3818)  time: 0.7547  data: 0.6580  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0853 (1.0648)  acc1: 77.6000 (79.9048)  acc5: 94.8000 (95.4095)  time: 0.2080  data: 0.1147  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1587 (1.0726)  acc1: 77.2000 (79.6480)  acc5: 93.6000 (95.2480)  time: 0.2231  data: 0.1307  max mem: 13273
Test: Total time: 0:00:10 (0.4193 s / it)
* Acc@1 80.082 Acc@5 95.298 loss 1.068
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.11%
Epoch: [246]  [   0/1251]  eta: 1:02:08  lr: 0.000357  min_lr: 0.000357  loss: 2.0169 (2.0169)  weight_decay: 0.0500 (0.0500)  time: 2.9804  data: 1.6862  max mem: 13273
Epoch: [246]  [ 200/1251]  eta: 0:03:52  lr: 0.000355  min_lr: 0.000355  loss: 3.2834 (2.9076)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1604 (1.1562)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [246]  [ 400/1251]  eta: 0:03:01  lr: 0.000353  min_lr: 0.000353  loss: 2.4917 (2.8761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0470 (1.1308)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [246]  [ 600/1251]  eta: 0:02:17  lr: 0.000351  min_lr: 0.000351  loss: 3.2283 (2.8546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1558 (1.1373)  time: 0.2127  data: 0.0004  max mem: 13273
Epoch: [246]  [ 800/1251]  eta: 0:01:34  lr: 0.000349  min_lr: 0.000349  loss: 2.7334 (2.8478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0777 (1.1306)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [246]  [1000/1251]  eta: 0:00:52  lr: 0.000347  min_lr: 0.000347  loss: 3.2147 (2.8539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0923 (1.1286)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [246]  [1200/1251]  eta: 0:00:10  lr: 0.000345  min_lr: 0.000345  loss: 2.7590 (2.8549)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0401 (1.1180)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 3.4313 (2.8598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0536 (1.1199)  time: 0.1701  data: 0.0008  max mem: 13273
Epoch: [246] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 3.4313 (2.8304)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0536 (1.1199)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7112 (0.7112)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5298  data: 5.4243  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8355 (0.8446)  acc1: 85.2000 (83.9636)  acc5: 97.6000 (97.3818)  time: 0.7564  data: 0.6591  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0380 (1.0166)  acc1: 78.0000 (80.2857)  acc5: 94.4000 (95.2571)  time: 0.2296  data: 0.1330  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.1063 (1.0213)  acc1: 78.0000 (80.0640)  acc5: 93.6000 (95.2160)  time: 0.2301  data: 0.1329  max mem: 13273
Test: Total time: 0:00:10 (0.4233 s / it)
* Acc@1 80.088 Acc@5 95.208 loss 1.016
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.11%
Epoch: [247]  [   0/1251]  eta: 1:04:02  lr: 0.000344  min_lr: 0.000344  loss: 2.7645 (2.7645)  weight_decay: 0.0500 (0.0500)  time: 3.0714  data: 2.5255  max mem: 13273
Epoch: [247]  [ 200/1251]  eta: 0:03:53  lr: 0.000342  min_lr: 0.000342  loss: 2.7307 (2.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (1.1458)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [247]  [ 400/1251]  eta: 0:03:01  lr: 0.000340  min_lr: 0.000340  loss: 2.8390 (2.7571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1758 (1.1587)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [247]  [ 600/1251]  eta: 0:02:17  lr: 0.000338  min_lr: 0.000338  loss: 2.1309 (2.7492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0329 (1.1561)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [247]  [ 800/1251]  eta: 0:01:34  lr: 0.000336  min_lr: 0.000336  loss: 2.8686 (2.7512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0712 (1.1366)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [247]  [1000/1251]  eta: 0:00:52  lr: 0.000334  min_lr: 0.000334  loss: 3.1197 (2.7671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0558 (1.1251)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [247]  [1200/1251]  eta: 0:00:10  lr: 0.000332  min_lr: 0.000332  loss: 3.0597 (2.7752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0834 (1.1242)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.6178 (2.7834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0430 (1.1238)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [247] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.6178 (2.8099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0430 (1.1238)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6551 (0.6551)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.7958  data: 5.6905  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.8359 (0.8382)  acc1: 85.2000 (84.0000)  acc5: 97.6000 (97.2000)  time: 0.7082  data: 0.6131  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0459 (1.0183)  acc1: 78.4000 (80.0000)  acc5: 94.0000 (95.2571)  time: 0.1840  data: 0.0917  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0852 (1.0251)  acc1: 78.0000 (79.8240)  acc5: 94.0000 (95.2000)  time: 0.1948  data: 0.1032  max mem: 13273
Test: Total time: 0:00:10 (0.4064 s / it)
* Acc@1 80.166 Acc@5 95.198 loss 1.016
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.17%
Epoch: [248]  [   0/1251]  eta: 0:57:51  lr: 0.000332  min_lr: 0.000332  loss: 2.1326 (2.1326)  weight_decay: 0.0500 (0.0500)  time: 2.7752  data: 2.4908  max mem: 13273
Epoch: [248]  [ 200/1251]  eta: 0:03:50  lr: 0.000330  min_lr: 0.000330  loss: 2.6045 (2.7829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1284 (1.1111)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [248]  [ 400/1251]  eta: 0:03:00  lr: 0.000328  min_lr: 0.000328  loss: 2.7023 (2.7463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1598 (1.1663)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [248]  [ 600/1251]  eta: 0:02:16  lr: 0.000326  min_lr: 0.000326  loss: 2.4940 (2.7737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1204 (1.1667)  time: 0.2051  data: 0.0006  max mem: 13273
Epoch: [248]  [ 800/1251]  eta: 0:01:34  lr: 0.000324  min_lr: 0.000324  loss: 2.8614 (2.8002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0311 (1.1462)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [248]  [1000/1251]  eta: 0:00:52  lr: 0.000322  min_lr: 0.000322  loss: 3.1671 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0675 (1.1410)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [248]  [1200/1251]  eta: 0:00:10  lr: 0.000320  min_lr: 0.000320  loss: 2.4113 (2.8002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0173 (1.1311)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 3.2880 (2.8025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (1.1275)  time: 0.1710  data: 0.0010  max mem: 13273
Epoch: [248] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 3.2880 (2.8015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (1.1275)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6464 (0.6464)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.8906  data: 5.7843  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7902 (0.8159)  acc1: 85.6000 (83.8909)  acc5: 97.2000 (97.1636)  time: 0.7652  data: 0.6702  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0183 (0.9885)  acc1: 78.0000 (80.0571)  acc5: 94.4000 (95.1048)  time: 0.2117  data: 0.1189  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0448 (0.9944)  acc1: 77.6000 (79.8720)  acc5: 94.4000 (95.1200)  time: 0.2108  data: 0.1188  max mem: 13273
Test: Total time: 0:00:10 (0.4221 s / it)
* Acc@1 80.098 Acc@5 95.290 loss 0.986
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.17%
Epoch: [249]  [   0/1251]  eta: 1:06:32  lr: 0.000320  min_lr: 0.000320  loss: 2.1067 (2.1067)  weight_decay: 0.0500 (0.0500)  time: 3.1913  data: 2.3090  max mem: 13273
Epoch: [249]  [ 200/1251]  eta: 0:03:52  lr: 0.000318  min_lr: 0.000318  loss: 2.6178 (2.8312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0614 (1.1055)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [249]  [ 400/1251]  eta: 0:03:01  lr: 0.000316  min_lr: 0.000316  loss: 2.6840 (2.8296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0949 (1.0844)  time: 0.2080  data: 0.0006  max mem: 13273
Epoch: [249]  [ 600/1251]  eta: 0:02:17  lr: 0.000314  min_lr: 0.000314  loss: 2.7006 (2.8007)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1274 (1.1165)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [249]  [ 800/1251]  eta: 0:01:34  lr: 0.000312  min_lr: 0.000312  loss: 2.9858 (2.7869)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0877 (1.1102)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [249]  [1000/1251]  eta: 0:00:52  lr: 0.000310  min_lr: 0.000310  loss: 2.4994 (2.7998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0850 (1.1147)  time: 0.2116  data: 0.0005  max mem: 13273
Epoch: [249]  [1200/1251]  eta: 0:00:10  lr: 0.000308  min_lr: 0.000308  loss: 2.4952 (2.7984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0291 (1.1121)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 3.0642 (2.7983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1157 (1.1151)  time: 0.1705  data: 0.0006  max mem: 13273
Epoch: [249] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 3.0642 (2.7961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1157 (1.1151)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6286 (0.6286)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.6811  data: 5.5753  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7625 (0.7989)  acc1: 85.6000 (84.1455)  acc5: 97.2000 (97.0545)  time: 0.7473  data: 0.6503  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0374 (0.9590)  acc1: 78.0000 (80.4191)  acc5: 94.8000 (95.3143)  time: 0.2034  data: 0.1101  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0382 (0.9667)  acc1: 78.4000 (80.1920)  acc5: 94.8000 (95.2800)  time: 0.2028  data: 0.1101  max mem: 13273
Test: Total time: 0:00:10 (0.4080 s / it)
* Acc@1 80.288 Acc@5 95.368 loss 0.956
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.29%
Epoch: [250]  [   0/1251]  eta: 1:07:44  lr: 0.000307  min_lr: 0.000307  loss: 3.4888 (3.4888)  weight_decay: 0.0500 (0.0500)  time: 3.2487  data: 3.0156  max mem: 13273
Epoch: [250]  [ 200/1251]  eta: 0:03:54  lr: 0.000306  min_lr: 0.000306  loss: 2.3657 (2.8276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0908 (1.0991)  time: 0.2150  data: 0.0004  max mem: 13273
Epoch: [250]  [ 400/1251]  eta: 0:03:02  lr: 0.000304  min_lr: 0.000304  loss: 3.1470 (2.7894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1259 (1.1342)  time: 0.2040  data: 0.0004  max mem: 13273
Epoch: [250]  [ 600/1251]  eta: 0:02:17  lr: 0.000302  min_lr: 0.000302  loss: 2.0522 (2.7827)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2053 (1.1462)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [250]  [ 800/1251]  eta: 0:01:34  lr: 0.000300  min_lr: 0.000300  loss: 2.5056 (2.7678)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2043 (1.1474)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [250]  [1000/1251]  eta: 0:00:52  lr: 0.000298  min_lr: 0.000298  loss: 3.1245 (2.7793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2155 (1.1705)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [250]  [1200/1251]  eta: 0:00:10  lr: 0.000296  min_lr: 0.000296  loss: 3.0114 (2.7741)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0996 (1.1684)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.1103 (2.7781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0958 (1.1652)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [250] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.1103 (2.8038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0958 (1.1652)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6036 (0.6036)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.3410  data: 5.2355  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7688 (0.7862)  acc1: 85.6000 (84.1091)  acc5: 97.2000 (96.9455)  time: 0.7261  data: 0.6285  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9779 (0.9531)  acc1: 78.0000 (80.0952)  acc5: 94.4000 (95.1429)  time: 0.2160  data: 0.1224  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0141 (0.9587)  acc1: 78.0000 (79.9200)  acc5: 94.0000 (95.1040)  time: 0.2148  data: 0.1223  max mem: 13273
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 80.256 Acc@5 95.290 loss 0.950
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.29%
Epoch: [251]  [   0/1251]  eta: 1:05:15  lr: 0.000296  min_lr: 0.000296  loss: 2.6934 (2.6934)  weight_decay: 0.0500 (0.0500)  time: 3.1295  data: 2.4734  max mem: 13273
Epoch: [251]  [ 200/1251]  eta: 0:03:52  lr: 0.000294  min_lr: 0.000294  loss: 2.9745 (2.8490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9970 (1.1447)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [251]  [ 400/1251]  eta: 0:03:01  lr: 0.000292  min_lr: 0.000292  loss: 2.2132 (2.7948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0830 (1.1597)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [251]  [ 600/1251]  eta: 0:02:17  lr: 0.000290  min_lr: 0.000290  loss: 2.8005 (2.8059)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1229 (1.1665)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [251]  [ 800/1251]  eta: 0:01:34  lr: 0.000288  min_lr: 0.000288  loss: 2.1877 (2.8022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0544 (1.1489)  time: 0.2060  data: 0.0006  max mem: 13273
Epoch: [251]  [1000/1251]  eta: 0:00:52  lr: 0.000286  min_lr: 0.000286  loss: 2.1707 (2.8005)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1930 (1.1454)  time: 0.2095  data: 0.0004  max mem: 13273
Epoch: [251]  [1200/1251]  eta: 0:00:10  lr: 0.000284  min_lr: 0.000284  loss: 3.0321 (2.7918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0445 (1.1516)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.6031 (2.7938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0153 (1.1491)  time: 0.1700  data: 0.0009  max mem: 13273
Epoch: [251] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.6031 (2.8049)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0153 (1.1491)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6225 (0.6225)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.8184  data: 5.7124  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7556 (0.7768)  acc1: 85.2000 (83.9273)  acc5: 97.6000 (97.2727)  time: 0.7533  data: 0.6554  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9719 (0.9424)  acc1: 78.8000 (80.1143)  acc5: 94.4000 (95.3333)  time: 0.2055  data: 0.1116  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0070 (0.9482)  acc1: 78.8000 (80.0000)  acc5: 94.4000 (95.2480)  time: 0.2049  data: 0.1115  max mem: 13273
Test: Total time: 0:00:10 (0.4152 s / it)
* Acc@1 80.402 Acc@5 95.350 loss 0.942
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.40%
Epoch: [252]  [   0/1251]  eta: 1:06:27  lr: 0.000284  min_lr: 0.000284  loss: 2.2203 (2.2203)  weight_decay: 0.0500 (0.0500)  time: 3.1878  data: 2.9638  max mem: 13273
Epoch: [252]  [ 200/1251]  eta: 0:03:50  lr: 0.000282  min_lr: 0.000282  loss: 3.0260 (2.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2053 (1.1541)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [252]  [ 400/1251]  eta: 0:03:01  lr: 0.000280  min_lr: 0.000280  loss: 2.7557 (2.7414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1648 (1.1934)  time: 0.2091  data: 0.0004  max mem: 13273
Epoch: [252]  [ 600/1251]  eta: 0:02:16  lr: 0.000279  min_lr: 0.000279  loss: 2.2283 (2.7503)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1128 (1.1906)  time: 0.2083  data: 0.0005  max mem: 13273
Epoch: [252]  [ 800/1251]  eta: 0:01:34  lr: 0.000277  min_lr: 0.000277  loss: 3.3483 (2.7654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0505 (1.1818)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [252]  [1000/1251]  eta: 0:00:52  lr: 0.000275  min_lr: 0.000275  loss: 2.6975 (2.7688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0507 (1.1597)  time: 0.2039  data: 0.0004  max mem: 13273
Epoch: [252]  [1200/1251]  eta: 0:00:10  lr: 0.000273  min_lr: 0.000273  loss: 2.2035 (2.7773)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0597 (1.1477)  time: 0.2041  data: 0.0006  max mem: 13273
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.3462 (2.7812)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0481 (1.1436)  time: 0.1706  data: 0.0005  max mem: 13273
Epoch: [252] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.3462 (2.7943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0481 (1.1436)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6310 (0.6310)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.6113  data: 5.5049  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.7788 (0.7994)  acc1: 85.2000 (83.7818)  acc5: 97.2000 (97.1636)  time: 0.6651  data: 0.5665  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0141 (0.9647)  acc1: 78.4000 (80.3619)  acc5: 94.4000 (95.3905)  time: 0.1696  data: 0.0744  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0093 (0.9703)  acc1: 78.4000 (80.0640)  acc5: 94.4000 (95.3120)  time: 0.2013  data: 0.1074  max mem: 13273
Test: Total time: 0:00:10 (0.4045 s / it)
* Acc@1 80.232 Acc@5 95.334 loss 0.964
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.40%
Epoch: [253]  [   0/1251]  eta: 1:09:03  lr: 0.000273  min_lr: 0.000273  loss: 2.1191 (2.1191)  weight_decay: 0.0500 (0.0500)  time: 3.3124  data: 2.5812  max mem: 13273
Epoch: [253]  [ 200/1251]  eta: 0:03:53  lr: 0.000271  min_lr: 0.000271  loss: 2.9395 (2.8176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0585 (1.0934)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [253]  [ 400/1251]  eta: 0:03:02  lr: 0.000269  min_lr: 0.000269  loss: 3.2982 (2.7894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2020 (1.1286)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [253]  [ 600/1251]  eta: 0:02:18  lr: 0.000267  min_lr: 0.000267  loss: 2.8196 (2.8040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0593 (1.1269)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [253]  [ 800/1251]  eta: 0:01:35  lr: 0.000265  min_lr: 0.000265  loss: 2.0554 (2.7907)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (1.1159)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [253]  [1000/1251]  eta: 0:00:52  lr: 0.000264  min_lr: 0.000264  loss: 2.3384 (2.7840)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1034 (1.1209)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [253]  [1200/1251]  eta: 0:00:10  lr: 0.000262  min_lr: 0.000262  loss: 2.5919 (2.8029)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0390 (1.1214)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 3.0939 (2.7988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0108 (1.1210)  time: 0.1714  data: 0.0008  max mem: 13273
Epoch: [253] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 3.0939 (2.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0108 (1.1210)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6383 (0.6383)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5426  data: 5.4358  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7857 (0.8123)  acc1: 84.4000 (83.7455)  acc5: 97.2000 (97.2000)  time: 0.7235  data: 0.6239  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0174 (0.9666)  acc1: 78.0000 (80.1333)  acc5: 95.2000 (95.4667)  time: 0.2046  data: 0.1098  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9924 (0.9737)  acc1: 78.0000 (79.9840)  acc5: 95.2000 (95.3120)  time: 0.2092  data: 0.1157  max mem: 13273
Test: Total time: 0:00:10 (0.4078 s / it)
* Acc@1 80.256 Acc@5 95.340 loss 0.970
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.40%
Epoch: [254]  [   0/1251]  eta: 1:06:25  lr: 0.000261  min_lr: 0.000261  loss: 1.8419 (1.8419)  weight_decay: 0.0500 (0.0500)  time: 3.1856  data: 2.3854  max mem: 13273
Epoch: [254]  [ 200/1251]  eta: 0:03:53  lr: 0.000260  min_lr: 0.000260  loss: 2.1365 (2.8064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0583 (1.1582)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [254]  [ 400/1251]  eta: 0:03:01  lr: 0.000258  min_lr: 0.000258  loss: 2.8092 (2.8110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1043 (1.1479)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [254]  [ 600/1251]  eta: 0:02:17  lr: 0.000256  min_lr: 0.000256  loss: 2.6337 (2.8069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0497 (1.1421)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [254]  [ 800/1251]  eta: 0:01:34  lr: 0.000254  min_lr: 0.000254  loss: 2.2513 (2.8170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0760 (1.1423)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [254]  [1000/1251]  eta: 0:00:52  lr: 0.000253  min_lr: 0.000253  loss: 2.3335 (2.8256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1781 (1.1499)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [254]  [1200/1251]  eta: 0:00:10  lr: 0.000251  min_lr: 0.000251  loss: 2.0692 (2.8286)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1888 (1.1516)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 3.3125 (2.8318)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1415 (1.1534)  time: 0.1706  data: 0.0007  max mem: 13273
Epoch: [254] Total time: 0:04:21 (0.2086 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 3.3125 (2.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1415 (1.1534)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6806 (0.6806)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.4714  data: 5.3647  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8266 (0.8360)  acc1: 85.6000 (84.2182)  acc5: 97.6000 (97.3818)  time: 0.7456  data: 0.6492  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0167 (1.0094)  acc1: 78.0000 (80.7048)  acc5: 95.2000 (95.5238)  time: 0.2229  data: 0.1293  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0956 (1.0190)  acc1: 78.0000 (80.4800)  acc5: 94.4000 (95.3600)  time: 0.2226  data: 0.1293  max mem: 13273
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 80.504 Acc@5 95.378 loss 1.014
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.50%
Epoch: [255]  [   0/1251]  eta: 0:57:28  lr: 0.000250  min_lr: 0.000250  loss: 3.6229 (3.6229)  weight_decay: 0.0500 (0.0500)  time: 2.7568  data: 2.5108  max mem: 13273
Epoch: [255]  [ 200/1251]  eta: 0:03:52  lr: 0.000249  min_lr: 0.000249  loss: 2.1352 (2.8178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1174 (1.1530)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [255]  [ 400/1251]  eta: 0:03:01  lr: 0.000247  min_lr: 0.000247  loss: 2.3616 (2.7545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0583 (1.1475)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [255]  [ 600/1251]  eta: 0:02:16  lr: 0.000245  min_lr: 0.000245  loss: 2.2414 (2.7525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1857 (nan)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [255]  [ 800/1251]  eta: 0:01:34  lr: 0.000244  min_lr: 0.000244  loss: 3.2976 (2.7630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1199 (nan)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [255]  [1000/1251]  eta: 0:00:52  lr: 0.000242  min_lr: 0.000242  loss: 2.4254 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (nan)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [255]  [1200/1251]  eta: 0:00:10  lr: 0.000240  min_lr: 0.000240  loss: 2.9796 (2.7581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2381 (nan)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.3874 (2.7535)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2503 (nan)  time: 0.1702  data: 0.0009  max mem: 13273
Epoch: [255] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.3874 (2.7791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2503 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5845 (0.5845)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5940  data: 5.4626  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7303 (0.7460)  acc1: 85.2000 (84.3636)  acc5: 98.0000 (97.4182)  time: 0.7851  data: 0.6824  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9816 (0.9089)  acc1: 78.8000 (80.4571)  acc5: 94.8000 (95.5048)  time: 0.2190  data: 0.1229  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9724 (0.9188)  acc1: 78.8000 (80.0480)  acc5: 94.8000 (95.3600)  time: 0.2172  data: 0.1228  max mem: 13273
Test: Total time: 0:00:10 (0.4168 s / it)
* Acc@1 80.534 Acc@5 95.438 loss 0.913
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.53%
Epoch: [256]  [   0/1251]  eta: 1:01:42  lr: 0.000240  min_lr: 0.000240  loss: 4.4308 (4.4308)  weight_decay: 0.0500 (0.0500)  time: 2.9592  data: 2.6895  max mem: 13273
Epoch: [256]  [ 200/1251]  eta: 0:03:51  lr: 0.000238  min_lr: 0.000238  loss: 2.9589 (2.8054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0297 (1.1641)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [256]  [ 400/1251]  eta: 0:03:01  lr: 0.000236  min_lr: 0.000236  loss: 2.6846 (2.8085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1781 (1.1629)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [256]  [ 600/1251]  eta: 0:02:16  lr: 0.000235  min_lr: 0.000235  loss: 3.0721 (2.8220)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1552 (1.1658)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [256]  [ 800/1251]  eta: 0:01:34  lr: 0.000233  min_lr: 0.000233  loss: 3.2013 (2.8344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1500 (1.1563)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [256]  [1000/1251]  eta: 0:00:52  lr: 0.000231  min_lr: 0.000231  loss: 2.3174 (2.8307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2205 (1.1662)  time: 0.2179  data: 0.0005  max mem: 13273
Epoch: [256]  [1200/1251]  eta: 0:00:10  lr: 0.000230  min_lr: 0.000230  loss: 2.2404 (2.8223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1258 (1.1752)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.7696 (2.8255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1392 (1.1734)  time: 0.1707  data: 0.0010  max mem: 13273
Epoch: [256] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.7696 (2.7917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1392 (1.1734)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6521 (0.6521)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.6098  data: 5.5012  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7804 (0.8078)  acc1: 84.8000 (83.3818)  acc5: 97.2000 (97.1273)  time: 0.7754  data: 0.6754  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0039 (0.9609)  acc1: 78.4000 (80.1714)  acc5: 94.8000 (95.4286)  time: 0.2163  data: 0.1204  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0017 (0.9677)  acc1: 78.4000 (80.0960)  acc5: 94.4000 (95.3600)  time: 0.2148  data: 0.1204  max mem: 13273
Test: Total time: 0:00:10 (0.4155 s / it)
* Acc@1 80.366 Acc@5 95.438 loss 0.964
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.53%
Epoch: [257]  [   0/1251]  eta: 1:03:07  lr: 0.000229  min_lr: 0.000229  loss: 3.0876 (3.0876)  weight_decay: 0.0500 (0.0500)  time: 3.0275  data: 2.2050  max mem: 13273
Epoch: [257]  [ 200/1251]  eta: 0:03:55  lr: 0.000228  min_lr: 0.000228  loss: 2.3966 (2.7810)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0505 (1.1505)  time: 0.2094  data: 0.0004  max mem: 13273
Epoch: [257]  [ 400/1251]  eta: 0:03:02  lr: 0.000226  min_lr: 0.000226  loss: 2.5924 (2.8057)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.1557)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [257]  [ 600/1251]  eta: 0:02:17  lr: 0.000224  min_lr: 0.000224  loss: 3.1218 (2.7880)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1035 (1.1547)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [257]  [ 800/1251]  eta: 0:01:34  lr: 0.000223  min_lr: 0.000223  loss: 3.1966 (2.8041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2517 (1.1732)  time: 0.2053  data: 0.0003  max mem: 13273
Epoch: [257]  [1000/1251]  eta: 0:00:52  lr: 0.000221  min_lr: 0.000221  loss: 3.2910 (2.7943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1668 (1.1691)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [257]  [1200/1251]  eta: 0:00:10  lr: 0.000219  min_lr: 0.000219  loss: 2.4429 (2.7881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2274 (1.1761)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.2350 (2.7856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2094 (1.1768)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [257] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.2350 (2.7729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2094 (1.1768)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5936 (0.5936)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.8638  data: 5.7585  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7440 (0.7714)  acc1: 86.4000 (84.2545)  acc5: 97.6000 (97.1273)  time: 0.6956  data: 0.5992  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9979 (0.9256)  acc1: 78.0000 (80.6476)  acc5: 94.8000 (95.5429)  time: 0.1697  data: 0.0767  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9935 (0.9354)  acc1: 78.0000 (80.2400)  acc5: 94.8000 (95.4400)  time: 0.1941  data: 0.1016  max mem: 13273
Test: Total time: 0:00:10 (0.4078 s / it)
* Acc@1 80.476 Acc@5 95.438 loss 0.932
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.53%
Epoch: [258]  [   0/1251]  eta: 1:04:48  lr: 0.000219  min_lr: 0.000219  loss: 3.3762 (3.3762)  weight_decay: 0.0500 (0.0500)  time: 3.1087  data: 2.4916  max mem: 13273
Epoch: [258]  [ 200/1251]  eta: 0:03:52  lr: 0.000217  min_lr: 0.000217  loss: 2.1670 (2.7457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1866 (1.1822)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [258]  [ 400/1251]  eta: 0:03:02  lr: 0.000216  min_lr: 0.000216  loss: 2.1507 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9980 (1.1516)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [258]  [ 600/1251]  eta: 0:02:17  lr: 0.000214  min_lr: 0.000214  loss: 2.4511 (2.7355)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1374 (1.1573)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [258]  [ 800/1251]  eta: 0:01:34  lr: 0.000212  min_lr: 0.000212  loss: 2.2630 (2.7486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0884 (1.1433)  time: 0.2089  data: 0.0005  max mem: 13273
Epoch: [258]  [1000/1251]  eta: 0:00:52  lr: 0.000211  min_lr: 0.000211  loss: 2.7267 (2.7721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0323 (1.1401)  time: 0.2065  data: 0.0004  max mem: 13273
Epoch: [258]  [1200/1251]  eta: 0:00:10  lr: 0.000209  min_lr: 0.000209  loss: 2.3513 (2.7726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1343 (1.1395)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.5807 (2.7735)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1855 (1.1459)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [258] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.5807 (2.7665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1855 (1.1459)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6177 (0.6177)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.3777  data: 5.2683  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7785 (0.7882)  acc1: 86.4000 (84.6182)  acc5: 97.2000 (97.2000)  time: 0.7263  data: 0.6311  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0116 (0.9534)  acc1: 79.2000 (80.7048)  acc5: 94.4000 (95.3524)  time: 0.2045  data: 0.1124  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0108 (0.9628)  acc1: 79.2000 (80.5600)  acc5: 94.4000 (95.2320)  time: 0.2052  data: 0.1140  max mem: 13273
Test: Total time: 0:00:09 (0.3988 s / it)
* Acc@1 80.572 Acc@5 95.400 loss 0.959
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.57%
Epoch: [259]  [   0/1251]  eta: 0:57:00  lr: 0.000209  min_lr: 0.000209  loss: 3.1601 (3.1601)  weight_decay: 0.0500 (0.0500)  time: 2.7345  data: 2.4982  max mem: 13273
Epoch: [259]  [ 200/1251]  eta: 0:03:50  lr: 0.000207  min_lr: 0.000207  loss: 3.1779 (2.8075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0706 (1.1764)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [259]  [ 400/1251]  eta: 0:03:00  lr: 0.000206  min_lr: 0.000206  loss: 2.7795 (2.8201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0758 (1.1609)  time: 0.2080  data: 0.0004  max mem: 13273
Epoch: [259]  [ 600/1251]  eta: 0:02:16  lr: 0.000204  min_lr: 0.000204  loss: 2.1351 (2.8256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1502 (1.1735)  time: 0.2042  data: 0.0004  max mem: 13273
Epoch: [259]  [ 800/1251]  eta: 0:01:34  lr: 0.000203  min_lr: 0.000203  loss: 2.3404 (2.8035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0187 (1.1484)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [259]  [1000/1251]  eta: 0:00:52  lr: 0.000201  min_lr: 0.000201  loss: 2.5857 (2.7930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1139 (1.1516)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [259]  [1200/1251]  eta: 0:00:10  lr: 0.000199  min_lr: 0.000199  loss: 2.9771 (2.7965)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2092 (1.1534)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.2836 (2.7949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0808 (1.1546)  time: 0.1748  data: 0.0009  max mem: 13273
Epoch: [259] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.2836 (2.7738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0808 (1.1546)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5857 (0.5857)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.6211  data: 5.5168  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7264 (0.7454)  acc1: 86.0000 (84.3273)  acc5: 96.8000 (97.0909)  time: 0.7274  data: 0.6298  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9585 (0.9083)  acc1: 78.8000 (80.7048)  acc5: 94.8000 (95.2952)  time: 0.1939  data: 0.1002  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9828 (0.9187)  acc1: 78.8000 (80.4160)  acc5: 94.8000 (95.2160)  time: 0.1928  data: 0.1001  max mem: 13273
Test: Total time: 0:00:09 (0.3997 s / it)
* Acc@1 80.688 Acc@5 95.462 loss 0.911
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.69%
Epoch: [260]  [   0/1251]  eta: 0:54:05  lr: 0.000199  min_lr: 0.000199  loss: 1.8344 (1.8344)  weight_decay: 0.0500 (0.0500)  time: 2.5941  data: 2.3221  max mem: 13273
Epoch: [260]  [ 200/1251]  eta: 0:03:52  lr: 0.000197  min_lr: 0.000197  loss: 3.2718 (2.7853)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1352 (1.1428)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [260]  [ 400/1251]  eta: 0:03:02  lr: 0.000196  min_lr: 0.000196  loss: 2.2675 (2.7299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3100 (1.1828)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [260]  [ 600/1251]  eta: 0:02:18  lr: 0.000194  min_lr: 0.000194  loss: 3.0512 (2.7665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1230 (1.1801)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [260]  [ 800/1251]  eta: 0:01:34  lr: 0.000193  min_lr: 0.000193  loss: 3.2967 (2.7888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1218 (1.1776)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [260]  [1000/1251]  eta: 0:00:52  lr: 0.000191  min_lr: 0.000191  loss: 3.3790 (2.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0913 (1.1832)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [260]  [1200/1251]  eta: 0:00:10  lr: 0.000190  min_lr: 0.000190  loss: 2.3136 (2.7729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2071 (1.1884)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.8764 (2.7725)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2924 (1.1911)  time: 0.1699  data: 0.0007  max mem: 13273
Epoch: [260] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.8764 (2.7719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2924 (1.1911)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6174 (0.6174)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.5755  data: 5.4702  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7766 (0.7889)  acc1: 84.8000 (84.3636)  acc5: 97.2000 (96.9091)  time: 0.7601  data: 0.6617  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0013 (0.9567)  acc1: 78.8000 (80.3238)  acc5: 94.4000 (95.1048)  time: 0.2091  data: 0.1143  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0151 (0.9663)  acc1: 78.8000 (80.0960)  acc5: 94.0000 (95.0240)  time: 0.2084  data: 0.1143  max mem: 13273
Test: Total time: 0:00:10 (0.4091 s / it)
* Acc@1 80.648 Acc@5 95.416 loss 0.959
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.69%
Epoch: [261]  [   0/1251]  eta: 1:10:07  lr: 0.000189  min_lr: 0.000189  loss: 2.3927 (2.3927)  weight_decay: 0.0500 (0.0500)  time: 3.3636  data: 2.9568  max mem: 13273
Epoch: [261]  [ 200/1251]  eta: 0:03:53  lr: 0.000188  min_lr: 0.000188  loss: 2.6770 (2.8121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0783 (1.1650)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [261]  [ 400/1251]  eta: 0:03:01  lr: 0.000186  min_lr: 0.000186  loss: 2.6982 (2.8307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (1.1743)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [261]  [ 600/1251]  eta: 0:02:17  lr: 0.000185  min_lr: 0.000185  loss: 3.0563 (2.8196)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (1.1820)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [261]  [ 800/1251]  eta: 0:01:34  lr: 0.000183  min_lr: 0.000183  loss: 2.1181 (2.8144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2504 (1.1808)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [261]  [1000/1251]  eta: 0:00:52  lr: 0.000182  min_lr: 0.000182  loss: 3.2195 (2.8123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2005 (1.1798)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [261]  [1200/1251]  eta: 0:00:10  lr: 0.000180  min_lr: 0.000180  loss: 2.2816 (2.8015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4104 (1.1906)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 3.2957 (2.8065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2409 (1.1918)  time: 0.1711  data: 0.0008  max mem: 13273
Epoch: [261] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 3.2957 (2.7683)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2409 (1.1918)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6481 (0.6481)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.8335  data: 5.6907  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8213 (0.8272)  acc1: 85.2000 (84.1091)  acc5: 97.2000 (97.1273)  time: 0.7854  data: 0.6819  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0158 (0.9859)  acc1: 79.2000 (80.4381)  acc5: 94.4000 (95.3905)  time: 0.2098  data: 0.1141  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0648 (0.9932)  acc1: 78.8000 (80.2240)  acc5: 94.4000 (95.2960)  time: 0.2080  data: 0.1140  max mem: 13273
Test: Total time: 0:00:10 (0.4188 s / it)
* Acc@1 80.510 Acc@5 95.472 loss 0.987
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.69%
Epoch: [262]  [   0/1251]  eta: 1:05:50  lr: 0.000180  min_lr: 0.000180  loss: 3.7249 (3.7249)  weight_decay: 0.0500 (0.0500)  time: 3.1576  data: 1.7018  max mem: 13273
Epoch: [262]  [ 200/1251]  eta: 0:03:52  lr: 0.000179  min_lr: 0.000179  loss: 2.7310 (2.8275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.2088)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [262]  [ 400/1251]  eta: 0:03:01  lr: 0.000177  min_lr: 0.000177  loss: 3.3399 (2.8330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1400 (1.1687)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [262]  [ 600/1251]  eta: 0:02:17  lr: 0.000176  min_lr: 0.000176  loss: 3.2491 (2.8168)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1380 (1.1856)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [262]  [ 800/1251]  eta: 0:01:34  lr: 0.000174  min_lr: 0.000174  loss: 2.8346 (2.8144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0833 (1.1704)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [262]  [1000/1251]  eta: 0:00:52  lr: 0.000173  min_lr: 0.000173  loss: 3.1998 (2.8181)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1272 (1.1760)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [262]  [1200/1251]  eta: 0:00:10  lr: 0.000171  min_lr: 0.000171  loss: 2.8377 (2.8046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1601 (1.1784)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.0567 (2.7924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0739 (1.1742)  time: 0.1706  data: 0.0008  max mem: 13273
Epoch: [262] Total time: 0:04:20 (0.2083 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.0567 (2.7751)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0739 (1.1742)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5761 (0.5761)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.7682  data: 5.6616  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7212 (0.7443)  acc1: 84.8000 (84.7273)  acc5: 97.2000 (97.1636)  time: 0.7112  data: 0.6131  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9609 (0.9027)  acc1: 79.2000 (80.9524)  acc5: 94.4000 (95.5429)  time: 0.1915  data: 0.0975  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9741 (0.9108)  acc1: 79.2000 (80.8160)  acc5: 94.4000 (95.4880)  time: 0.1998  data: 0.1056  max mem: 13273
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 80.770 Acc@5 95.510 loss 0.907
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.77%
Epoch: [263]  [   0/1251]  eta: 1:04:26  lr: 0.000171  min_lr: 0.000171  loss: 2.9089 (2.9089)  weight_decay: 0.0500 (0.0500)  time: 3.0906  data: 2.8543  max mem: 13273
Epoch: [263]  [ 200/1251]  eta: 0:03:52  lr: 0.000169  min_lr: 0.000169  loss: 2.8648 (2.7509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0988 (1.1377)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [263]  [ 400/1251]  eta: 0:03:02  lr: 0.000168  min_lr: 0.000168  loss: 2.3132 (2.7428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1685 (1.1753)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [263]  [ 600/1251]  eta: 0:02:18  lr: 0.000167  min_lr: 0.000167  loss: 2.8691 (2.7360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0779 (1.1518)  time: 0.2066  data: 0.0003  max mem: 13273
Epoch: [263]  [ 800/1251]  eta: 0:01:35  lr: 0.000165  min_lr: 0.000165  loss: 2.5817 (2.7277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1506 (1.1527)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [263]  [1000/1251]  eta: 0:00:52  lr: 0.000164  min_lr: 0.000164  loss: 2.9182 (2.7363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1861 (1.1590)  time: 0.2042  data: 0.0005  max mem: 13273
Epoch: [263]  [1200/1251]  eta: 0:00:10  lr: 0.000162  min_lr: 0.000162  loss: 2.1188 (2.7305)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1880 (1.1698)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.6187 (2.7298)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.1746)  time: 0.1707  data: 0.0005  max mem: 13273
Epoch: [263] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.6187 (2.7476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.1746)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5753 (0.5753)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.4666  data: 5.3584  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7300 (0.7548)  acc1: 84.8000 (84.3636)  acc5: 97.6000 (97.1636)  time: 0.7784  data: 0.6823  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9495 (0.9261)  acc1: 79.6000 (80.9333)  acc5: 94.8000 (95.2952)  time: 0.2206  data: 0.1280  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0207 (0.9336)  acc1: 78.8000 (80.7840)  acc5: 94.4000 (95.2000)  time: 0.2198  data: 0.1279  max mem: 13273
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 80.680 Acc@5 95.486 loss 0.930
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.77%
Epoch: [264]  [   0/1251]  eta: 1:05:13  lr: 0.000162  min_lr: 0.000162  loss: 3.7028 (3.7028)  weight_decay: 0.0500 (0.0500)  time: 3.1283  data: 2.2278  max mem: 13273
Epoch: [264]  [ 200/1251]  eta: 0:03:52  lr: 0.000160  min_lr: 0.000160  loss: 2.7740 (2.7257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0433 (1.1381)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [264]  [ 400/1251]  eta: 0:03:01  lr: 0.000159  min_lr: 0.000159  loss: 3.1383 (2.7710)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1110 (1.1627)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [264]  [ 600/1251]  eta: 0:02:17  lr: 0.000158  min_lr: 0.000158  loss: 2.7499 (2.7800)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0775 (1.1266)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [264]  [ 800/1251]  eta: 0:01:34  lr: 0.000156  min_lr: 0.000156  loss: 2.5102 (2.7875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1786 (1.1465)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [264]  [1000/1251]  eta: 0:00:52  lr: 0.000155  min_lr: 0.000155  loss: 2.7878 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1594 (1.1547)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [264]  [1200/1251]  eta: 0:00:10  lr: 0.000154  min_lr: 0.000154  loss: 2.3018 (2.7924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0934 (1.1568)  time: 0.2056  data: 0.0004  max mem: 13273
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.3839 (2.7837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1381 (1.1595)  time: 0.1706  data: 0.0010  max mem: 13273
Epoch: [264] Total time: 0:04:19 (0.2078 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.3839 (2.7603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1381 (1.1595)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5945 (0.5945)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.3732  data: 5.2671  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7448 (0.7582)  acc1: 85.6000 (84.6182)  acc5: 96.8000 (97.0909)  time: 0.7433  data: 0.6470  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9824 (0.9232)  acc1: 79.2000 (80.7238)  acc5: 94.4000 (95.4857)  time: 0.2065  data: 0.1131  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9841 (0.9301)  acc1: 79.2000 (80.5440)  acc5: 94.4000 (95.3920)  time: 0.2058  data: 0.1130  max mem: 13273
Test: Total time: 0:00:10 (0.4035 s / it)
* Acc@1 80.658 Acc@5 95.478 loss 0.927
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.77%
Epoch: [265]  [   0/1251]  eta: 1:04:26  lr: 0.000153  min_lr: 0.000153  loss: 3.4712 (3.4712)  weight_decay: 0.0500 (0.0500)  time: 3.0905  data: 2.5858  max mem: 13273
Epoch: [265]  [ 200/1251]  eta: 0:03:53  lr: 0.000152  min_lr: 0.000152  loss: 2.2539 (2.6997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1616 (1.1870)  time: 0.2067  data: 0.0006  max mem: 13273
Epoch: [265]  [ 400/1251]  eta: 0:03:01  lr: 0.000150  min_lr: 0.000150  loss: 2.2203 (2.7263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0476 (1.1950)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [265]  [ 600/1251]  eta: 0:02:17  lr: 0.000149  min_lr: 0.000149  loss: 2.5914 (2.7295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1703 (1.1787)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [265]  [ 800/1251]  eta: 0:01:34  lr: 0.000148  min_lr: 0.000148  loss: 3.1307 (2.7539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1219 (1.1769)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [265]  [1000/1251]  eta: 0:00:52  lr: 0.000146  min_lr: 0.000146  loss: 2.7796 (2.7430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1364 (1.1754)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [265]  [1200/1251]  eta: 0:00:10  lr: 0.000145  min_lr: 0.000145  loss: 2.7646 (2.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1484 (1.1758)  time: 0.2034  data: 0.0004  max mem: 13273
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.8185 (2.7496)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1439 (1.1782)  time: 0.1703  data: 0.0006  max mem: 13273
Epoch: [265] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.8185 (2.7540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1439 (1.1782)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6597 (0.6597)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.7179  data: 5.5827  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7849 (0.8117)  acc1: 85.2000 (84.6182)  acc5: 97.2000 (97.1636)  time: 0.7298  data: 0.6279  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0352 (0.9768)  acc1: 79.2000 (80.9143)  acc5: 94.4000 (95.3905)  time: 0.1975  data: 0.1030  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0414 (0.9830)  acc1: 78.8000 (80.7680)  acc5: 94.4000 (95.3120)  time: 0.1957  data: 0.1030  max mem: 13273
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 80.772 Acc@5 95.510 loss 0.980
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.77%
Epoch: [266]  [   0/1251]  eta: 1:03:45  lr: 0.000145  min_lr: 0.000145  loss: 3.4249 (3.4249)  weight_decay: 0.0500 (0.0500)  time: 3.0582  data: 2.8042  max mem: 13273
Epoch: [266]  [ 200/1251]  eta: 0:03:51  lr: 0.000143  min_lr: 0.000143  loss: 2.2628 (2.7703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (1.1471)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [266]  [ 400/1251]  eta: 0:03:01  lr: 0.000142  min_lr: 0.000142  loss: 3.0298 (2.7050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0974 (1.1443)  time: 0.2070  data: 0.0006  max mem: 13273
Epoch: [266]  [ 600/1251]  eta: 0:02:17  lr: 0.000141  min_lr: 0.000141  loss: 2.1487 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1043 (1.1486)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [266]  [ 800/1251]  eta: 0:01:34  lr: 0.000139  min_lr: 0.000139  loss: 2.9042 (2.7151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0866 (1.1434)  time: 0.2081  data: 0.0005  max mem: 13273
Epoch: [266]  [1000/1251]  eta: 0:00:52  lr: 0.000138  min_lr: 0.000138  loss: 2.8665 (2.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1495 (1.1528)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [266]  [1200/1251]  eta: 0:00:10  lr: 0.000137  min_lr: 0.000137  loss: 2.6601 (2.7072)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0740 (1.1634)  time: 0.2133  data: 0.0005  max mem: 13273
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.0896 (2.7047)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1021 (1.1625)  time: 0.1700  data: 0.0007  max mem: 13273
Epoch: [266] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.0896 (2.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1021 (1.1625)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.5801 (0.5801)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.9551  data: 5.8192  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7302 (0.7409)  acc1: 86.0000 (84.7273)  acc5: 97.2000 (97.3818)  time: 0.7515  data: 0.6495  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9581 (0.9113)  acc1: 79.2000 (80.9333)  acc5: 94.4000 (95.4476)  time: 0.2012  data: 0.1067  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9920 (0.9189)  acc1: 79.2000 (80.6880)  acc5: 94.4000 (95.3440)  time: 0.1994  data: 0.1067  max mem: 13273
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 80.788 Acc@5 95.506 loss 0.913
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.79%
Epoch: [267]  [   0/1251]  eta: 1:03:25  lr: 0.000136  min_lr: 0.000136  loss: 2.3283 (2.3283)  weight_decay: 0.0500 (0.0500)  time: 3.0416  data: 2.8168  max mem: 13273
Epoch: [267]  [ 200/1251]  eta: 0:03:50  lr: 0.000135  min_lr: 0.000135  loss: 3.2013 (2.8052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1526 (1.1994)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [267]  [ 400/1251]  eta: 0:03:01  lr: 0.000134  min_lr: 0.000134  loss: 2.8528 (2.7360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0719 (1.1563)  time: 0.2085  data: 0.0005  max mem: 13273
Epoch: [267]  [ 600/1251]  eta: 0:02:17  lr: 0.000133  min_lr: 0.000133  loss: 2.1934 (2.7552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1639 (1.1725)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [267]  [ 800/1251]  eta: 0:01:34  lr: 0.000131  min_lr: 0.000131  loss: 2.2862 (2.7739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0898 (1.1536)  time: 0.2035  data: 0.0004  max mem: 13273
Epoch: [267]  [1000/1251]  eta: 0:00:52  lr: 0.000130  min_lr: 0.000130  loss: 2.0623 (2.7603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1461 (1.1687)  time: 0.2057  data: 0.0004  max mem: 13273
Epoch: [267]  [1200/1251]  eta: 0:00:10  lr: 0.000129  min_lr: 0.000129  loss: 2.0168 (2.7391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1348 (1.1688)  time: 0.2070  data: 0.0005  max mem: 13273
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 3.2055 (2.7394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1341 (1.1690)  time: 0.1710  data: 0.0009  max mem: 13273
Epoch: [267] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 3.2055 (2.7453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1341 (1.1690)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6444 (0.6444)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.7606  data: 5.6544  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7796 (0.7964)  acc1: 85.2000 (84.2182)  acc5: 97.2000 (97.2364)  time: 0.7084  data: 0.6123  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0116 (0.9590)  acc1: 79.6000 (80.8000)  acc5: 94.4000 (95.3143)  time: 0.1870  data: 0.0910  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0045 (0.9638)  acc1: 79.6000 (80.5760)  acc5: 94.4000 (95.2160)  time: 0.1905  data: 0.0947  max mem: 13273
Test: Total time: 0:00:10 (0.4022 s / it)
* Acc@1 80.764 Acc@5 95.490 loss 0.960
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.79%
Epoch: [268]  [   0/1251]  eta: 1:04:15  lr: 0.000128  min_lr: 0.000128  loss: 3.4631 (3.4631)  weight_decay: 0.0500 (0.0500)  time: 3.0817  data: 1.5822  max mem: 13273
Epoch: [268]  [ 200/1251]  eta: 0:03:52  lr: 0.000127  min_lr: 0.000127  loss: 2.1876 (2.7906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1094 (1.1225)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [268]  [ 400/1251]  eta: 0:03:01  lr: 0.000126  min_lr: 0.000126  loss: 3.1931 (2.7720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2875 (1.1939)  time: 0.2065  data: 0.0006  max mem: 13273
Epoch: [268]  [ 600/1251]  eta: 0:02:17  lr: 0.000125  min_lr: 0.000125  loss: 2.9615 (2.7674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1578 (nan)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [268]  [ 800/1251]  eta: 0:01:34  lr: 0.000123  min_lr: 0.000123  loss: 2.6593 (2.7478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0023 (nan)  time: 0.2050  data: 0.0006  max mem: 13273
Epoch: [268]  [1000/1251]  eta: 0:00:52  lr: 0.000122  min_lr: 0.000122  loss: 2.1836 (2.7660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1461 (nan)  time: 0.2062  data: 0.0007  max mem: 13273
Epoch: [268]  [1200/1251]  eta: 0:00:10  lr: 0.000121  min_lr: 0.000121  loss: 2.5166 (2.7577)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1240 (nan)  time: 0.2052  data: 0.0005  max mem: 13273
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.5182 (2.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2163 (nan)  time: 0.1707  data: 0.0008  max mem: 13273
Epoch: [268] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.5182 (2.7437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2163 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5994 (0.5994)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.6150  data: 5.5081  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7591 (0.7716)  acc1: 84.8000 (84.2545)  acc5: 97.2000 (97.0909)  time: 0.7712  data: 0.6722  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9748 (0.9384)  acc1: 78.4000 (80.7429)  acc5: 94.8000 (95.4857)  time: 0.2158  data: 0.1200  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0192 (0.9459)  acc1: 78.4000 (80.4800)  acc5: 94.4000 (95.3280)  time: 0.2205  data: 0.1254  max mem: 13273
Test: Total time: 0:00:10 (0.4194 s / it)
* Acc@1 80.738 Acc@5 95.492 loss 0.942
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.79%
Epoch: [269]  [   0/1251]  eta: 1:08:17  lr: 0.000121  min_lr: 0.000121  loss: 3.6024 (3.6024)  weight_decay: 0.0500 (0.0500)  time: 3.2752  data: 3.0206  max mem: 13273
Epoch: [269]  [ 200/1251]  eta: 0:03:53  lr: 0.000120  min_lr: 0.000120  loss: 3.0062 (2.7798)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1394 (1.1971)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [269]  [ 400/1251]  eta: 0:03:02  lr: 0.000118  min_lr: 0.000118  loss: 2.9865 (2.7913)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1641 (1.2120)  time: 0.2071  data: 0.0004  max mem: 13273
Epoch: [269]  [ 600/1251]  eta: 0:02:17  lr: 0.000117  min_lr: 0.000117  loss: 2.9394 (2.7773)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1051 (1.2019)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [269]  [ 800/1251]  eta: 0:01:34  lr: 0.000116  min_lr: 0.000116  loss: 3.3796 (2.7802)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1182 (1.1967)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [269]  [1000/1251]  eta: 0:00:52  lr: 0.000115  min_lr: 0.000115  loss: 2.6841 (2.7771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2185 (1.1997)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [269]  [1200/1251]  eta: 0:00:10  lr: 0.000113  min_lr: 0.000113  loss: 3.1386 (2.7739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1732 (1.1984)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 3.0087 (2.7765)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2026 (1.1990)  time: 0.1709  data: 0.0011  max mem: 13273
Epoch: [269] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 3.0087 (2.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2026 (1.1990)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5818 (0.5818)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.5129  data: 5.4058  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.7486 (0.7602)  acc1: 85.2000 (84.5091)  acc5: 97.2000 (97.3091)  time: 0.6489  data: 0.5528  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9731 (0.9197)  acc1: 79.2000 (80.9905)  acc5: 94.4000 (95.4476)  time: 0.1708  data: 0.0781  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0031 (0.9277)  acc1: 79.2000 (80.8480)  acc5: 94.4000 (95.3280)  time: 0.2054  data: 0.1135  max mem: 13273
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 80.794 Acc@5 95.530 loss 0.923
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.79%
Epoch: [270]  [   0/1251]  eta: 1:08:51  lr: 0.000113  min_lr: 0.000113  loss: 1.9886 (1.9886)  weight_decay: 0.0500 (0.0500)  time: 3.3028  data: 3.0258  max mem: 13273
Epoch: [270]  [ 200/1251]  eta: 0:03:55  lr: 0.000112  min_lr: 0.000112  loss: 2.0229 (2.6824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2133 (1.1682)  time: 0.2064  data: 0.0004  max mem: 13273
Epoch: [270]  [ 400/1251]  eta: 0:03:02  lr: 0.000111  min_lr: 0.000111  loss: 3.1449 (2.7462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2459 (1.2153)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [270]  [ 600/1251]  eta: 0:02:17  lr: 0.000110  min_lr: 0.000110  loss: 3.1305 (2.7797)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0708 (1.2131)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [270]  [ 800/1251]  eta: 0:01:34  lr: 0.000109  min_lr: 0.000109  loss: 2.5855 (2.7499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1007 (1.1943)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [270]  [1000/1251]  eta: 0:00:52  lr: 0.000107  min_lr: 0.000107  loss: 2.6556 (2.7578)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1095 (1.1823)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [270]  [1200/1251]  eta: 0:00:10  lr: 0.000106  min_lr: 0.000106  loss: 2.6366 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1499 (1.1777)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.7904 (2.7443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1000 (1.1762)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [270] Total time: 0:04:21 (0.2086 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.7904 (2.7528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1000 (1.1762)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6445 (0.6445)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.6691  data: 5.5636  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8003 (0.8092)  acc1: 86.0000 (84.7636)  acc5: 97.2000 (97.2727)  time: 0.7723  data: 0.6750  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0227 (0.9683)  acc1: 78.8000 (80.8381)  acc5: 94.4000 (95.3333)  time: 0.2075  data: 0.1133  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0339 (0.9748)  acc1: 79.2000 (80.6560)  acc5: 94.0000 (95.2160)  time: 0.2066  data: 0.1133  max mem: 13273
Test: Total time: 0:00:10 (0.4109 s / it)
* Acc@1 80.820 Acc@5 95.480 loss 0.970
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.82%
Epoch: [271]  [   0/1251]  eta: 1:02:42  lr: 0.000106  min_lr: 0.000106  loss: 2.2739 (2.2739)  weight_decay: 0.0500 (0.0500)  time: 3.0075  data: 2.7674  max mem: 13273
Epoch: [271]  [ 200/1251]  eta: 0:03:51  lr: 0.000105  min_lr: 0.000105  loss: 2.8503 (2.8219)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1336 (1.1426)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [271]  [ 400/1251]  eta: 0:03:01  lr: 0.000104  min_lr: 0.000104  loss: 2.4012 (2.7839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1068 (1.1513)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [271]  [ 600/1251]  eta: 0:02:17  lr: 0.000102  min_lr: 0.000102  loss: 2.2022 (2.7726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1828 (1.1827)  time: 0.2075  data: 0.0004  max mem: 13273
Epoch: [271]  [ 800/1251]  eta: 0:01:34  lr: 0.000101  min_lr: 0.000101  loss: 2.7403 (2.7722)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1035 (1.1783)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [271]  [1000/1251]  eta: 0:00:52  lr: 0.000100  min_lr: 0.000100  loss: 3.0258 (2.7644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1524 (1.1793)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [271]  [1200/1251]  eta: 0:00:10  lr: 0.000099  min_lr: 0.000099  loss: 2.9233 (2.7630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1662 (1.1819)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.7281 (2.7605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1662 (1.1819)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [271] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.7281 (2.7313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1662 (1.1819)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6284 (0.6284)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.4405  data: 5.3317  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7886 (0.8015)  acc1: 84.8000 (84.1455)  acc5: 97.2000 (97.3091)  time: 0.7255  data: 0.6306  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0058 (0.9709)  acc1: 79.2000 (81.0095)  acc5: 94.8000 (95.5238)  time: 0.2096  data: 0.1177  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0477 (0.9771)  acc1: 79.2000 (80.7840)  acc5: 94.4000 (95.3440)  time: 0.2090  data: 0.1176  max mem: 13273
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 80.814 Acc@5 95.506 loss 0.972
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.82%
Epoch: [272]  [   0/1251]  eta: 1:02:29  lr: 0.000099  min_lr: 0.000099  loss: 3.6935 (3.6935)  weight_decay: 0.0500 (0.0500)  time: 2.9969  data: 1.6895  max mem: 13273
Epoch: [272]  [ 200/1251]  eta: 0:03:54  lr: 0.000098  min_lr: 0.000098  loss: 2.2738 (2.7417)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1785 (inf)  time: 0.2177  data: 0.0005  max mem: 13273
Epoch: [272]  [ 400/1251]  eta: 0:03:02  lr: 0.000097  min_lr: 0.000097  loss: 2.1740 (2.7680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2245 (inf)  time: 0.2060  data: 0.0007  max mem: 13273
Epoch: [272]  [ 600/1251]  eta: 0:02:17  lr: 0.000096  min_lr: 0.000096  loss: 3.1204 (2.7746)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1052 (inf)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [272]  [ 800/1251]  eta: 0:01:34  lr: 0.000094  min_lr: 0.000094  loss: 2.3570 (2.7528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1004 (inf)  time: 0.2081  data: 0.0006  max mem: 13273
Epoch: [272]  [1000/1251]  eta: 0:00:52  lr: 0.000093  min_lr: 0.000093  loss: 2.1628 (2.7413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0859 (inf)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [272]  [1200/1251]  eta: 0:00:10  lr: 0.000092  min_lr: 0.000092  loss: 2.1500 (2.7431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1708 (inf)  time: 0.2059  data: 0.0006  max mem: 13273
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.9502 (2.7495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0671 (inf)  time: 0.1702  data: 0.0008  max mem: 13273
Epoch: [272] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.9502 (2.7350)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0671 (inf)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6157 (0.6157)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.8626  data: 5.7572  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7648 (0.7789)  acc1: 84.4000 (84.2909)  acc5: 97.2000 (97.1636)  time: 0.7163  data: 0.6221  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9863 (0.9461)  acc1: 78.8000 (80.8571)  acc5: 94.4000 (95.3905)  time: 0.1803  data: 0.0885  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0066 (0.9530)  acc1: 78.8000 (80.6560)  acc5: 94.4000 (95.2960)  time: 0.2019  data: 0.1102  max mem: 13273
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 80.784 Acc@5 95.520 loss 0.948
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.82%
Epoch: [273]  [   0/1251]  eta: 1:08:03  lr: 0.000092  min_lr: 0.000092  loss: 3.3433 (3.3433)  weight_decay: 0.0500 (0.0500)  time: 3.2638  data: 2.3120  max mem: 13273
Epoch: [273]  [ 200/1251]  eta: 0:03:52  lr: 0.000091  min_lr: 0.000091  loss: 2.4437 (2.6927)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0499 (1.1635)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [273]  [ 400/1251]  eta: 0:03:01  lr: 0.000090  min_lr: 0.000090  loss: 2.1283 (2.6755)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2522 (1.2019)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [273]  [ 600/1251]  eta: 0:02:17  lr: 0.000089  min_lr: 0.000089  loss: 2.1019 (2.6791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1867 (1.2083)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [273]  [ 800/1251]  eta: 0:01:34  lr: 0.000088  min_lr: 0.000088  loss: 2.7662 (2.7079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1003 (1.2092)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [273]  [1000/1251]  eta: 0:00:52  lr: 0.000087  min_lr: 0.000087  loss: 3.3694 (2.7117)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1489 (1.2102)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [273]  [1200/1251]  eta: 0:00:10  lr: 0.000086  min_lr: 0.000086  loss: 3.2143 (2.7192)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1639 (1.2028)  time: 0.2129  data: 0.0005  max mem: 13273
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 3.0494 (2.7223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1892 (1.2021)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [273] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 3.0494 (2.7463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1892 (1.2021)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6701 (0.6701)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.7690  data: 5.6628  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.8025 (0.8266)  acc1: 84.4000 (83.7818)  acc5: 97.2000 (97.3455)  time: 0.7656  data: 0.6665  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0302 (0.9934)  acc1: 78.0000 (80.4762)  acc5: 94.4000 (95.4857)  time: 0.2090  data: 0.1145  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0645 (1.0003)  acc1: 78.0000 (80.4320)  acc5: 94.0000 (95.3280)  time: 0.2072  data: 0.1145  max mem: 13273
Test: Total time: 0:00:10 (0.4158 s / it)
* Acc@1 80.698 Acc@5 95.522 loss 0.994
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.82%
Epoch: [274]  [   0/1251]  eta: 1:04:50  lr: 0.000085  min_lr: 0.000085  loss: 2.2154 (2.2154)  weight_decay: 0.0500 (0.0500)  time: 3.1096  data: 2.5916  max mem: 13273
Epoch: [274]  [ 200/1251]  eta: 0:03:51  lr: 0.000084  min_lr: 0.000084  loss: 2.5889 (2.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0461 (1.1329)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [274]  [ 400/1251]  eta: 0:03:00  lr: 0.000083  min_lr: 0.000083  loss: 2.0872 (2.7446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1985 (1.1733)  time: 0.2050  data: 0.0003  max mem: 13273
Epoch: [274]  [ 600/1251]  eta: 0:02:16  lr: 0.000082  min_lr: 0.000082  loss: 2.5515 (2.7551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1698 (1.1824)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [274]  [ 800/1251]  eta: 0:01:34  lr: 0.000081  min_lr: 0.000081  loss: 3.2566 (2.7417)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.1691)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [274]  [1000/1251]  eta: 0:00:52  lr: 0.000080  min_lr: 0.000080  loss: 2.8702 (2.7367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0912 (1.1657)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [274]  [1200/1251]  eta: 0:00:10  lr: 0.000079  min_lr: 0.000079  loss: 3.0443 (2.7404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (1.1771)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.9806 (2.7397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.1808)  time: 0.1706  data: 0.0006  max mem: 13273
Epoch: [274] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.9806 (2.7420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.1808)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6265 (0.6265)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5846  data: 5.4681  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7761 (0.7993)  acc1: 84.0000 (83.9273)  acc5: 97.2000 (97.2000)  time: 0.7432  data: 0.6459  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0035 (0.9643)  acc1: 78.8000 (80.6667)  acc5: 94.8000 (95.4286)  time: 0.1987  data: 0.1059  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0293 (0.9701)  acc1: 79.2000 (80.4960)  acc5: 94.4000 (95.3280)  time: 0.2050  data: 0.1123  max mem: 13273
Test: Total time: 0:00:10 (0.4063 s / it)
* Acc@1 80.798 Acc@5 95.502 loss 0.965
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.82%
Epoch: [275]  [   0/1251]  eta: 1:03:09  lr: 0.000079  min_lr: 0.000079  loss: 3.4942 (3.4942)  weight_decay: 0.0500 (0.0500)  time: 3.0290  data: 2.7783  max mem: 13273
Epoch: [275]  [ 200/1251]  eta: 0:03:52  lr: 0.000078  min_lr: 0.000078  loss: 2.6490 (2.7455)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0868 (1.1513)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [275]  [ 400/1251]  eta: 0:03:01  lr: 0.000077  min_lr: 0.000077  loss: 2.9519 (2.7544)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0857 (1.1441)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [275]  [ 600/1251]  eta: 0:02:17  lr: 0.000076  min_lr: 0.000076  loss: 2.1052 (2.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1548 (1.1535)  time: 0.2041  data: 0.0004  max mem: 13273
Epoch: [275]  [ 800/1251]  eta: 0:01:34  lr: 0.000075  min_lr: 0.000075  loss: 2.9625 (2.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1646 (1.1647)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [275]  [1000/1251]  eta: 0:00:52  lr: 0.000074  min_lr: 0.000074  loss: 2.3603 (2.7187)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1075 (1.1687)  time: 0.2072  data: 0.0004  max mem: 13273
Epoch: [275]  [1200/1251]  eta: 0:00:10  lr: 0.000073  min_lr: 0.000073  loss: 2.2586 (2.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2878 (1.1764)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.6588 (2.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1861 (1.1789)  time: 0.1701  data: 0.0009  max mem: 13273
Epoch: [275] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.6588 (2.7325)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1861 (1.1789)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5534 (0.5534)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.5075  data: 5.4016  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7177 (0.7312)  acc1: 85.2000 (84.4000)  acc5: 97.6000 (97.3455)  time: 0.7542  data: 0.6553  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9458 (0.8922)  acc1: 79.2000 (81.2191)  acc5: 95.2000 (95.5810)  time: 0.2236  data: 0.1269  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9616 (0.9009)  acc1: 79.6000 (80.9760)  acc5: 94.8000 (95.4880)  time: 0.2232  data: 0.1274  max mem: 13273
Test: Total time: 0:00:10 (0.4173 s / it)
* Acc@1 81.000 Acc@5 95.558 loss 0.899
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [276]  [   0/1251]  eta: 0:59:25  lr: 0.000073  min_lr: 0.000073  loss: 2.0544 (2.0544)  weight_decay: 0.0500 (0.0500)  time: 2.8502  data: 2.5734  max mem: 13273
Epoch: [276]  [ 200/1251]  eta: 0:03:50  lr: 0.000072  min_lr: 0.000072  loss: 2.7901 (2.6784)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1368 (1.1851)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [276]  [ 400/1251]  eta: 0:03:00  lr: 0.000071  min_lr: 0.000071  loss: 2.4025 (2.7082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1747 (1.1831)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [276]  [ 600/1251]  eta: 0:02:16  lr: 0.000070  min_lr: 0.000070  loss: 2.9181 (2.7251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1629 (1.1847)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [276]  [ 800/1251]  eta: 0:01:34  lr: 0.000069  min_lr: 0.000069  loss: 2.1599 (2.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2022 (1.1828)  time: 0.2055  data: 0.0006  max mem: 13273
Epoch: [276]  [1000/1251]  eta: 0:00:52  lr: 0.000068  min_lr: 0.000068  loss: 2.5204 (2.6949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1695 (1.1845)  time: 0.2072  data: 0.0005  max mem: 13273
Epoch: [276]  [1200/1251]  eta: 0:00:10  lr: 0.000067  min_lr: 0.000067  loss: 2.5293 (2.7134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2266 (1.1809)  time: 0.2135  data: 0.0004  max mem: 13273
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.8353 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2321 (1.1819)  time: 0.1703  data: 0.0005  max mem: 13273
Epoch: [276] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.8353 (2.7307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2321 (1.1819)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.5903 (0.5903)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.0299  data: 4.9204  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.7518 (0.7685)  acc1: 85.2000 (84.3636)  acc5: 97.2000 (97.3455)  time: 0.6540  data: 0.5588  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9892 (0.9358)  acc1: 78.8000 (81.0286)  acc5: 94.4000 (95.4857)  time: 0.1910  data: 0.0989  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9951 (0.9431)  acc1: 79.6000 (80.8160)  acc5: 94.4000 (95.4080)  time: 0.2142  data: 0.1206  max mem: 13273
Test: Total time: 0:00:09 (0.3930 s / it)
* Acc@1 80.834 Acc@5 95.546 loss 0.938
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.00%
Epoch: [277]  [   0/1251]  eta: 1:10:36  lr: 0.000067  min_lr: 0.000067  loss: 2.0304 (2.0304)  weight_decay: 0.0500 (0.0500)  time: 3.3868  data: 2.6623  max mem: 13273
Epoch: [277]  [ 200/1251]  eta: 0:03:54  lr: 0.000066  min_lr: 0.000066  loss: 2.3697 (2.7454)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1302 (1.2037)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [277]  [ 400/1251]  eta: 0:03:02  lr: 0.000065  min_lr: 0.000065  loss: 2.2230 (2.7366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0871 (1.1589)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [277]  [ 600/1251]  eta: 0:02:17  lr: 0.000064  min_lr: 0.000064  loss: 2.8858 (2.7416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2301 (1.1679)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [277]  [ 800/1251]  eta: 0:01:34  lr: 0.000064  min_lr: 0.000064  loss: 2.8958 (2.7348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2904 (1.1727)  time: 0.2052  data: 0.0007  max mem: 13273
Epoch: [277]  [1000/1251]  eta: 0:00:52  lr: 0.000063  min_lr: 0.000063  loss: 2.9747 (2.7222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2085 (1.1883)  time: 0.2074  data: 0.0005  max mem: 13273
Epoch: [277]  [1200/1251]  eta: 0:00:10  lr: 0.000062  min_lr: 0.000062  loss: 2.3822 (2.7197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1701 (1.1837)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.5148 (2.7247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1701 (1.1820)  time: 0.1704  data: 0.0008  max mem: 13273
Epoch: [277] Total time: 0:04:21 (0.2089 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.5148 (2.7414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1701 (1.1820)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6390 (0.6390)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.7509  data: 5.6457  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7900 (0.8056)  acc1: 84.8000 (84.3636)  acc5: 97.2000 (97.3455)  time: 0.7460  data: 0.6473  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0319 (0.9651)  acc1: 78.8000 (80.9333)  acc5: 94.4000 (95.4857)  time: 0.2025  data: 0.1084  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0291 (0.9710)  acc1: 79.2000 (80.8000)  acc5: 94.0000 (95.3920)  time: 0.2010  data: 0.1083  max mem: 13273
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 80.808 Acc@5 95.564 loss 0.968
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.00%
Epoch: [278]  [   0/1251]  eta: 1:07:21  lr: 0.000062  min_lr: 0.000062  loss: 1.9116 (1.9116)  weight_decay: 0.0500 (0.0500)  time: 3.2303  data: 2.5117  max mem: 13273
Epoch: [278]  [ 200/1251]  eta: 0:03:53  lr: 0.000061  min_lr: 0.000061  loss: 3.0072 (2.7546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0729 (1.1654)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [278]  [ 400/1251]  eta: 0:03:02  lr: 0.000060  min_lr: 0.000060  loss: 2.5357 (2.7095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1589 (1.1651)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [278]  [ 600/1251]  eta: 0:02:17  lr: 0.000059  min_lr: 0.000059  loss: 2.4856 (2.7378)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (1.1737)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [278]  [ 800/1251]  eta: 0:01:34  lr: 0.000058  min_lr: 0.000058  loss: 3.1351 (2.7507)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2140 (1.1943)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [278]  [1000/1251]  eta: 0:00:52  lr: 0.000057  min_lr: 0.000057  loss: 2.8968 (2.7438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1469 (inf)  time: 0.2057  data: 0.0005  max mem: 13273
Epoch: [278]  [1200/1251]  eta: 0:00:10  lr: 0.000056  min_lr: 0.000056  loss: 2.7225 (2.7466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1578 (inf)  time: 0.2045  data: 0.0005  max mem: 13273
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.1334 (2.7456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (inf)  time: 0.1703  data: 0.0008  max mem: 13273
Epoch: [278] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.1334 (2.7230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (inf)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5810 (0.5810)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.6328  data: 5.5250  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7336 (0.7504)  acc1: 86.0000 (84.5455)  acc5: 97.2000 (97.1636)  time: 0.7723  data: 0.6726  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9656 (0.9139)  acc1: 78.8000 (81.0667)  acc5: 94.4000 (95.4095)  time: 0.2049  data: 0.1096  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9685 (0.9218)  acc1: 78.8000 (80.9440)  acc5: 94.0000 (95.3120)  time: 0.2035  data: 0.1096  max mem: 13273
Test: Total time: 0:00:10 (0.4073 s / it)
* Acc@1 80.904 Acc@5 95.522 loss 0.918
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [279]  [   0/1251]  eta: 1:09:32  lr: 0.000056  min_lr: 0.000056  loss: 2.0533 (2.0533)  weight_decay: 0.0500 (0.0500)  time: 3.3354  data: 3.0583  max mem: 13273
Epoch: [279]  [ 200/1251]  eta: 0:03:52  lr: 0.000055  min_lr: 0.000055  loss: 2.0172 (2.7163)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1423)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [279]  [ 400/1251]  eta: 0:03:01  lr: 0.000055  min_lr: 0.000055  loss: 2.2521 (2.7118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1223 (1.1618)  time: 0.2049  data: 0.0004  max mem: 13273
Epoch: [279]  [ 600/1251]  eta: 0:02:16  lr: 0.000054  min_lr: 0.000054  loss: 2.2799 (2.6999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1947 (1.1679)  time: 0.2070  data: 0.0004  max mem: 13273
Epoch: [279]  [ 800/1251]  eta: 0:01:34  lr: 0.000053  min_lr: 0.000053  loss: 2.1297 (2.7042)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2216 (1.1692)  time: 0.2062  data: 0.0004  max mem: 13273
Epoch: [279]  [1000/1251]  eta: 0:00:52  lr: 0.000052  min_lr: 0.000052  loss: 2.2633 (2.6890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1897 (1.1715)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [279]  [1200/1251]  eta: 0:00:10  lr: 0.000051  min_lr: 0.000051  loss: 3.1325 (2.7027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1215 (1.1748)  time: 0.2041  data: 0.0005  max mem: 13273
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.5281 (2.7010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0391 (1.1741)  time: 0.1700  data: 0.0007  max mem: 13273
Epoch: [279] Total time: 0:04:20 (0.2081 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.5281 (2.7257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0391 (1.1741)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6067 (0.6067)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.7375  data: 5.6313  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7528 (0.7764)  acc1: 85.2000 (84.3636)  acc5: 97.2000 (97.2727)  time: 0.7440  data: 0.6442  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9851 (0.9403)  acc1: 78.8000 (80.8381)  acc5: 94.4000 (95.4857)  time: 0.2093  data: 0.1145  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9904 (0.9470)  acc1: 79.2000 (80.7520)  acc5: 94.4000 (95.3760)  time: 0.2078  data: 0.1151  max mem: 13273
Test: Total time: 0:00:10 (0.4162 s / it)
* Acc@1 80.888 Acc@5 95.530 loss 0.944
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [280]  [   0/1251]  eta: 0:55:42  lr: 0.000051  min_lr: 0.000051  loss: 2.2845 (2.2845)  weight_decay: 0.0500 (0.0500)  time: 2.6717  data: 1.5294  max mem: 13273
Epoch: [280]  [ 200/1251]  eta: 0:03:54  lr: 0.000050  min_lr: 0.000050  loss: 2.9824 (2.7268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.1309)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [280]  [ 400/1251]  eta: 0:03:03  lr: 0.000050  min_lr: 0.000050  loss: 2.2704 (2.7805)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1020 (1.1531)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [280]  [ 600/1251]  eta: 0:02:18  lr: 0.000049  min_lr: 0.000049  loss: 2.4041 (2.7550)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1044 (1.1552)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [280]  [ 800/1251]  eta: 0:01:35  lr: 0.000048  min_lr: 0.000048  loss: 2.5480 (2.7230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1821 (1.1621)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [280]  [1000/1251]  eta: 0:00:52  lr: 0.000047  min_lr: 0.000047  loss: 2.2032 (2.7089)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1242 (1.1746)  time: 0.2066  data: 0.0005  max mem: 13273
Epoch: [280]  [1200/1251]  eta: 0:00:10  lr: 0.000046  min_lr: 0.000046  loss: 3.1079 (2.7110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0988 (1.1730)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.1721 (2.7085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1622 (1.1727)  time: 0.1704  data: 0.0007  max mem: 13273
Epoch: [280] Total time: 0:04:22 (0.2097 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.1721 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1622 (1.1727)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6145 (0.6145)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.3025  data: 5.1944  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.7528 (0.7723)  acc1: 84.8000 (84.3273)  acc5: 97.2000 (97.2727)  time: 0.6598  data: 0.5622  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9921 (0.9370)  acc1: 79.6000 (81.1048)  acc5: 94.4000 (95.4667)  time: 0.1780  data: 0.0845  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0049 (0.9445)  acc1: 80.0000 (80.8640)  acc5: 94.4000 (95.3440)  time: 0.2177  data: 0.1251  max mem: 13273
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 80.848 Acc@5 95.534 loss 0.942
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.00%
Epoch: [281]  [   0/1251]  eta: 1:06:35  lr: 0.000046  min_lr: 0.000046  loss: 2.7481 (2.7481)  weight_decay: 0.0500 (0.0500)  time: 3.1938  data: 2.9476  max mem: 13273
Epoch: [281]  [ 200/1251]  eta: 0:03:54  lr: 0.000046  min_lr: 0.000046  loss: 2.1299 (2.7356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1078 (1.1893)  time: 0.2043  data: 0.0005  max mem: 13273
Epoch: [281]  [ 400/1251]  eta: 0:03:02  lr: 0.000045  min_lr: 0.000045  loss: 2.2743 (2.6978)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1268 (1.1796)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [281]  [ 600/1251]  eta: 0:02:17  lr: 0.000044  min_lr: 0.000044  loss: 2.1516 (2.7068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1416 (1.1937)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [281]  [ 800/1251]  eta: 0:01:34  lr: 0.000043  min_lr: 0.000043  loss: 2.2362 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1506 (1.1919)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [281]  [1000/1251]  eta: 0:00:52  lr: 0.000043  min_lr: 0.000043  loss: 2.2171 (2.7213)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1229 (1.1903)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [281]  [1200/1251]  eta: 0:00:10  lr: 0.000042  min_lr: 0.000042  loss: 2.6493 (2.7175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (1.1851)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.1453 (2.7180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1275 (1.1824)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [281] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.1453 (2.7299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1275 (1.1824)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5884 (0.5884)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.7387  data: 5.6335  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7394 (0.7631)  acc1: 85.2000 (84.5091)  acc5: 97.2000 (97.1273)  time: 0.7513  data: 0.6569  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9779 (0.9233)  acc1: 78.8000 (80.8191)  acc5: 94.4000 (95.3905)  time: 0.2015  data: 0.1096  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9849 (0.9312)  acc1: 78.8000 (80.6240)  acc5: 94.4000 (95.2640)  time: 0.2010  data: 0.1096  max mem: 13273
Test: Total time: 0:00:10 (0.4082 s / it)
* Acc@1 80.792 Acc@5 95.552 loss 0.927
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.00%
Epoch: [282]  [   0/1251]  eta: 1:02:59  lr: 0.000042  min_lr: 0.000042  loss: 1.7316 (1.7316)  weight_decay: 0.0500 (0.0500)  time: 3.0213  data: 2.1791  max mem: 13273
Epoch: [282]  [ 200/1251]  eta: 0:03:53  lr: 0.000041  min_lr: 0.000041  loss: 2.2637 (2.6734)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0993 (1.1610)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [282]  [ 400/1251]  eta: 0:03:02  lr: 0.000040  min_lr: 0.000040  loss: 2.6126 (2.7202)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0897 (1.1715)  time: 0.2060  data: 0.0005  max mem: 13273
Epoch: [282]  [ 600/1251]  eta: 0:02:17  lr: 0.000040  min_lr: 0.000040  loss: 2.9797 (2.7587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1234 (1.1525)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [282]  [ 800/1251]  eta: 0:01:34  lr: 0.000039  min_lr: 0.000039  loss: 2.4113 (2.7212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1004 (1.1508)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [282]  [1000/1251]  eta: 0:00:52  lr: 0.000038  min_lr: 0.000038  loss: 2.3771 (2.7412)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1575 (1.1518)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [282]  [1200/1251]  eta: 0:00:10  lr: 0.000037  min_lr: 0.000037  loss: 2.2518 (2.7397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0896 (1.1537)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.6173 (2.7427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0678 (1.1531)  time: 0.1702  data: 0.0007  max mem: 13273
Epoch: [282] Total time: 0:04:21 (0.2088 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.6173 (2.7344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0678 (1.1531)
Test:  [ 0/25]  eta: 0:01:27  loss: 0.6287 (0.6287)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.4934  data: 3.3844  max mem: 13273
Test:  [10/25]  eta: 0:00:08  loss: 0.7687 (0.7901)  acc1: 85.2000 (84.6182)  acc5: 97.2000 (97.2364)  time: 0.5813  data: 0.4816  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0045 (0.9557)  acc1: 78.8000 (81.0667)  acc5: 94.4000 (95.4476)  time: 0.2569  data: 0.1623  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0259 (0.9628)  acc1: 79.2000 (80.9600)  acc5: 94.0000 (95.3120)  time: 0.2071  data: 0.1143  max mem: 13273
Test: Total time: 0:00:10 (0.4012 s / it)
* Acc@1 80.958 Acc@5 95.590 loss 0.960
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [283]  [   0/1251]  eta: 1:06:10  lr: 0.000037  min_lr: 0.000037  loss: 1.7302 (1.7302)  weight_decay: 0.0500 (0.0500)  time: 3.1740  data: 2.1451  max mem: 13273
Epoch: [283]  [ 200/1251]  eta: 0:03:52  lr: 0.000037  min_lr: 0.000037  loss: 2.4239 (2.7279)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1074 (1.1327)  time: 0.2055  data: 0.0004  max mem: 13273
Epoch: [283]  [ 400/1251]  eta: 0:03:01  lr: 0.000036  min_lr: 0.000036  loss: 2.9677 (2.7496)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1135 (1.1476)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [283]  [ 600/1251]  eta: 0:02:17  lr: 0.000035  min_lr: 0.000035  loss: 2.9081 (2.7590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (1.1375)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [283]  [ 800/1251]  eta: 0:01:34  lr: 0.000035  min_lr: 0.000035  loss: 2.2542 (2.7413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2720 (1.1675)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [283]  [1000/1251]  eta: 0:00:52  lr: 0.000034  min_lr: 0.000034  loss: 2.4263 (2.7597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1306 (1.1626)  time: 0.2167  data: 0.0005  max mem: 13273
Epoch: [283]  [1200/1251]  eta: 0:00:10  lr: 0.000033  min_lr: 0.000033  loss: 2.5298 (2.7452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1640 (1.1656)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.4271 (2.7433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0834 (1.1624)  time: 0.1713  data: 0.0008  max mem: 13273
Epoch: [283] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.4271 (2.7153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0834 (1.1624)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5781 (0.5781)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6226  data: 5.5172  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7207 (0.7414)  acc1: 84.4000 (84.0727)  acc5: 97.2000 (97.2727)  time: 0.7862  data: 0.6883  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9517 (0.9000)  acc1: 78.8000 (80.7619)  acc5: 94.4000 (95.4857)  time: 0.2232  data: 0.1280  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9469 (0.9067)  acc1: 78.8000 (80.6400)  acc5: 94.0000 (95.3760)  time: 0.2231  data: 0.1286  max mem: 13273
Test: Total time: 0:00:10 (0.4224 s / it)
* Acc@1 80.942 Acc@5 95.576 loss 0.902
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [284]  [   0/1251]  eta: 1:05:59  lr: 0.000033  min_lr: 0.000033  loss: 2.2877 (2.2877)  weight_decay: 0.0500 (0.0500)  time: 3.1654  data: 1.5041  max mem: 13273
Epoch: [284]  [ 200/1251]  eta: 0:03:53  lr: 0.000032  min_lr: 0.000032  loss: 2.0554 (2.6503)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1082 (1.1353)  time: 0.2046  data: 0.0007  max mem: 13273
Epoch: [284]  [ 400/1251]  eta: 0:03:02  lr: 0.000032  min_lr: 0.000032  loss: 2.8166 (2.6814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0915 (1.1537)  time: 0.2052  data: 0.0006  max mem: 13273
Epoch: [284]  [ 600/1251]  eta: 0:02:17  lr: 0.000031  min_lr: 0.000031  loss: 2.4711 (2.6858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1718 (1.1804)  time: 0.2069  data: 0.0004  max mem: 13273
Epoch: [284]  [ 800/1251]  eta: 0:01:34  lr: 0.000031  min_lr: 0.000031  loss: 2.9245 (2.6925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1093 (1.1906)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [284]  [1000/1251]  eta: 0:00:52  lr: 0.000030  min_lr: 0.000030  loss: 2.7889 (2.7114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1309 (1.1836)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [284]  [1200/1251]  eta: 0:00:10  lr: 0.000029  min_lr: 0.000029  loss: 2.2351 (2.7002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1912 (1.1866)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.2623 (2.6960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2264 (1.1902)  time: 0.1706  data: 0.0010  max mem: 13273
Epoch: [284] Total time: 0:04:21 (0.2087 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.2623 (2.7141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2264 (1.1902)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5740 (0.5740)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.4065  data: 5.2648  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7315 (0.7404)  acc1: 84.8000 (84.0727)  acc5: 97.2000 (97.2727)  time: 0.7136  data: 0.6112  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9516 (0.8989)  acc1: 78.4000 (80.9143)  acc5: 94.8000 (95.4857)  time: 0.1989  data: 0.1041  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9560 (0.9070)  acc1: 79.2000 (80.7200)  acc5: 94.4000 (95.3440)  time: 0.2061  data: 0.1129  max mem: 13273
Test: Total time: 0:00:10 (0.4004 s / it)
* Acc@1 80.896 Acc@5 95.582 loss 0.903
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [285]  [   0/1251]  eta: 1:06:30  lr: 0.000029  min_lr: 0.000029  loss: 3.4523 (3.4523)  weight_decay: 0.0500 (0.0500)  time: 3.1900  data: 2.9188  max mem: 13273
Epoch: [285]  [ 200/1251]  eta: 0:03:52  lr: 0.000029  min_lr: 0.000029  loss: 2.3606 (2.6789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0688 (1.1570)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [285]  [ 400/1251]  eta: 0:03:01  lr: 0.000028  min_lr: 0.000028  loss: 2.9348 (2.6969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0563 (1.1759)  time: 0.2048  data: 0.0005  max mem: 13273
Epoch: [285]  [ 600/1251]  eta: 0:02:16  lr: 0.000027  min_lr: 0.000027  loss: 2.7080 (2.6964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (1.1736)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [285]  [ 800/1251]  eta: 0:01:34  lr: 0.000027  min_lr: 0.000027  loss: 2.8222 (2.7039)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1229 (1.1771)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [285]  [1000/1251]  eta: 0:00:52  lr: 0.000026  min_lr: 0.000026  loss: 3.0485 (2.7150)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1523 (1.1795)  time: 0.2058  data: 0.0006  max mem: 13273
Epoch: [285]  [1200/1251]  eta: 0:00:10  lr: 0.000026  min_lr: 0.000026  loss: 3.0769 (2.7278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1331 (1.1747)  time: 0.2055  data: 0.0005  max mem: 13273
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.1033 (2.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1027 (1.1752)  time: 0.1722  data: 0.0008  max mem: 13273
Epoch: [285] Total time: 0:04:20 (0.2084 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.1033 (2.7282)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1027 (1.1752)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5568 (0.5568)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.7771  data: 5.6708  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7118 (0.7249)  acc1: 85.6000 (84.6182)  acc5: 97.2000 (97.2000)  time: 0.7497  data: 0.6552  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9389 (0.8839)  acc1: 78.4000 (81.1619)  acc5: 94.4000 (95.4667)  time: 0.1956  data: 0.1038  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9421 (0.8918)  acc1: 78.8000 (80.9920)  acc5: 94.4000 (95.3280)  time: 0.1990  data: 0.1077  max mem: 13273
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 80.964 Acc@5 95.596 loss 0.887
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [286]  [   0/1251]  eta: 1:06:06  lr: 0.000026  min_lr: 0.000026  loss: 1.9535 (1.9535)  weight_decay: 0.0500 (0.0500)  time: 3.1709  data: 2.3302  max mem: 13273
Epoch: [286]  [ 200/1251]  eta: 0:03:52  lr: 0.000025  min_lr: 0.000025  loss: 2.3053 (2.7539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1765 (1.1934)  time: 0.2063  data: 0.0006  max mem: 13273
Epoch: [286]  [ 400/1251]  eta: 0:03:01  lr: 0.000025  min_lr: 0.000025  loss: 2.2849 (2.7685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1510 (1.1713)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [286]  [ 600/1251]  eta: 0:02:17  lr: 0.000024  min_lr: 0.000024  loss: 2.2116 (2.7711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1460 (1.1676)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [286]  [ 800/1251]  eta: 0:01:34  lr: 0.000023  min_lr: 0.000023  loss: 2.8833 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0410 (1.1615)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [286]  [1000/1251]  eta: 0:00:52  lr: 0.000023  min_lr: 0.000023  loss: 2.7738 (2.7388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (1.1526)  time: 0.2046  data: 0.0005  max mem: 13273
Epoch: [286]  [1200/1251]  eta: 0:00:10  lr: 0.000022  min_lr: 0.000022  loss: 2.3352 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0800 (1.1534)  time: 0.2048  data: 0.0004  max mem: 13273
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.3931 (2.7352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1252 (1.1526)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [286] Total time: 0:04:20 (0.2086 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.3931 (2.7162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1252 (1.1526)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6254 (0.6254)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.7945  data: 5.6886  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7786 (0.7982)  acc1: 84.4000 (84.2909)  acc5: 97.2000 (97.2727)  time: 0.6924  data: 0.5954  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0008 (0.9572)  acc1: 78.8000 (80.8952)  acc5: 94.4000 (95.4857)  time: 0.1746  data: 0.0814  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0203 (0.9641)  acc1: 79.2000 (80.7040)  acc5: 94.0000 (95.3440)  time: 0.1968  data: 0.1042  max mem: 13273
Test: Total time: 0:00:10 (0.4080 s / it)
* Acc@1 80.930 Acc@5 95.582 loss 0.960
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [287]  [   0/1251]  eta: 1:03:48  lr: 0.000022  min_lr: 0.000022  loss: 2.0663 (2.0663)  weight_decay: 0.0500 (0.0500)  time: 3.0607  data: 1.5608  max mem: 13273
Epoch: [287]  [ 200/1251]  eta: 0:03:56  lr: 0.000022  min_lr: 0.000022  loss: 3.0712 (2.7126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1062 (1.1638)  time: 0.2138  data: 0.0004  max mem: 13273
Epoch: [287]  [ 400/1251]  eta: 0:03:03  lr: 0.000021  min_lr: 0.000021  loss: 2.7673 (2.6738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1216 (1.1480)  time: 0.2050  data: 0.0005  max mem: 13273
Epoch: [287]  [ 600/1251]  eta: 0:02:18  lr: 0.000021  min_lr: 0.000021  loss: 2.2649 (2.6953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1371 (1.1525)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [287]  [ 800/1251]  eta: 0:01:35  lr: 0.000020  min_lr: 0.000020  loss: 2.6576 (2.7012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0781 (1.1441)  time: 0.2064  data: 0.0005  max mem: 13273
Epoch: [287]  [1000/1251]  eta: 0:00:52  lr: 0.000020  min_lr: 0.000020  loss: 2.5644 (2.6866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1488 (1.1470)  time: 0.2060  data: 0.0004  max mem: 13273
Epoch: [287]  [1200/1251]  eta: 0:00:10  lr: 0.000019  min_lr: 0.000019  loss: 2.7047 (2.6863)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0951 (1.1453)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.5060 (2.6959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1588 (1.1480)  time: 0.1701  data: 0.0010  max mem: 13273
Epoch: [287] Total time: 0:04:22 (0.2096 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.5060 (2.7031)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1588 (1.1480)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6114 (0.6114)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.7374  data: 5.6321  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7649 (0.7776)  acc1: 85.2000 (84.2909)  acc5: 97.6000 (97.3455)  time: 0.7455  data: 0.6505  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9807 (0.9389)  acc1: 79.2000 (81.0667)  acc5: 94.8000 (95.6381)  time: 0.2103  data: 0.1169  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0065 (0.9456)  acc1: 79.2000 (80.8640)  acc5: 94.4000 (95.4720)  time: 0.2104  data: 0.1171  max mem: 13273
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 80.940 Acc@5 95.584 loss 0.942
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [288]  [   0/1251]  eta: 1:00:37  lr: 0.000019  min_lr: 0.000019  loss: 1.7718 (1.7718)  weight_decay: 0.0500 (0.0500)  time: 2.9078  data: 2.4084  max mem: 13273
Epoch: [288]  [ 200/1251]  eta: 0:03:53  lr: 0.000019  min_lr: 0.000019  loss: 2.4177 (2.7004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1318 (1.1944)  time: 0.2063  data: 0.0005  max mem: 13273
Epoch: [288]  [ 400/1251]  eta: 0:03:01  lr: 0.000018  min_lr: 0.000018  loss: 2.2448 (2.7252)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0371 (1.1754)  time: 0.2047  data: 0.0005  max mem: 13273
Epoch: [288]  [ 600/1251]  eta: 0:02:17  lr: 0.000018  min_lr: 0.000018  loss: 3.1162 (2.7323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0157 (1.1631)  time: 0.2064  data: 0.0006  max mem: 13273
Epoch: [288]  [ 800/1251]  eta: 0:01:34  lr: 0.000017  min_lr: 0.000017  loss: 2.5280 (2.7167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0500 (1.1639)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [288]  [1000/1251]  eta: 0:00:52  lr: 0.000017  min_lr: 0.000017  loss: 2.9782 (2.7142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1271 (1.1579)  time: 0.2049  data: 0.0006  max mem: 13273
Epoch: [288]  [1200/1251]  eta: 0:00:10  lr: 0.000016  min_lr: 0.000016  loss: 2.3153 (2.7209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.1592)  time: 0.2050  data: 0.0004  max mem: 13273
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.8063 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.1609)  time: 0.1705  data: 0.0007  max mem: 13273
Epoch: [288] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.8063 (2.7129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.1609)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6098 (0.6098)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.6139  data: 5.4835  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7666 (0.7814)  acc1: 85.2000 (84.4000)  acc5: 97.2000 (97.2727)  time: 0.7380  data: 0.6355  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9934 (0.9433)  acc1: 79.2000 (81.1619)  acc5: 94.4000 (95.4476)  time: 0.1995  data: 0.1038  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0087 (0.9499)  acc1: 79.6000 (80.9120)  acc5: 94.4000 (95.3280)  time: 0.1975  data: 0.1038  max mem: 13273
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 80.888 Acc@5 95.556 loss 0.945
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [289]  [   0/1251]  eta: 1:00:00  lr: 0.000016  min_lr: 0.000016  loss: 3.3460 (3.3460)  weight_decay: 0.0500 (0.0500)  time: 2.8779  data: 2.4698  max mem: 13273
Epoch: [289]  [ 200/1251]  eta: 0:03:53  lr: 0.000016  min_lr: 0.000016  loss: 3.0479 (2.7542)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0961 (1.1623)  time: 0.2049  data: 0.0005  max mem: 13273
Epoch: [289]  [ 400/1251]  eta: 0:03:01  lr: 0.000015  min_lr: 0.000015  loss: 3.1939 (2.7771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1228 (1.1532)  time: 0.2056  data: 0.0005  max mem: 13273
Epoch: [289]  [ 600/1251]  eta: 0:02:17  lr: 0.000015  min_lr: 0.000015  loss: 2.1565 (2.7530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1094 (1.1603)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [289]  [ 800/1251]  eta: 0:01:34  lr: 0.000014  min_lr: 0.000014  loss: 2.0365 (2.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0797 (1.1614)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [289]  [1000/1251]  eta: 0:00:52  lr: 0.000014  min_lr: 0.000014  loss: 2.3289 (2.7151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0968 (1.1567)  time: 0.2058  data: 0.0004  max mem: 13273
Epoch: [289]  [1200/1251]  eta: 0:00:10  lr: 0.000014  min_lr: 0.000014  loss: 2.6805 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0656 (1.1491)  time: 0.2053  data: 0.0005  max mem: 13273
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.6169 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0656 (1.1465)  time: 0.1701  data: 0.0008  max mem: 13273
Epoch: [289] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.6169 (2.7296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0656 (1.1465)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6033 (0.6033)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.7576  data: 5.6510  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7520 (0.7693)  acc1: 84.8000 (84.1818)  acc5: 97.6000 (97.3818)  time: 0.7144  data: 0.6179  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9839 (0.9308)  acc1: 78.8000 (80.9143)  acc5: 94.4000 (95.4857)  time: 0.1837  data: 0.0903  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9989 (0.9385)  acc1: 78.8000 (80.7520)  acc5: 94.4000 (95.3760)  time: 0.1926  data: 0.0995  max mem: 13273
Test: Total time: 0:00:10 (0.4036 s / it)
* Acc@1 80.930 Acc@5 95.570 loss 0.935
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [290]  [   0/1251]  eta: 1:07:32  lr: 0.000014  min_lr: 0.000014  loss: 2.6502 (2.6502)  weight_decay: 0.0500 (0.0500)  time: 3.2390  data: 2.8937  max mem: 13273
Epoch: [290]  [ 200/1251]  eta: 0:03:52  lr: 0.000013  min_lr: 0.000013  loss: 2.2419 (2.7336)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1460 (1.1353)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [290]  [ 400/1251]  eta: 0:03:01  lr: 0.000013  min_lr: 0.000013  loss: 3.0175 (2.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0606 (1.1358)  time: 0.2144  data: 0.0005  max mem: 13273
Epoch: [290]  [ 600/1251]  eta: 0:02:17  lr: 0.000012  min_lr: 0.000012  loss: 2.5784 (2.7155)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1704 (1.1484)  time: 0.2163  data: 0.0004  max mem: 13273
Epoch: [290]  [ 800/1251]  eta: 0:01:34  lr: 0.000012  min_lr: 0.000012  loss: 2.3913 (2.7159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1438 (1.1507)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [290]  [1000/1251]  eta: 0:00:52  lr: 0.000012  min_lr: 0.000012  loss: 2.4723 (2.7184)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1676 (1.1521)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [290]  [1200/1251]  eta: 0:00:10  lr: 0.000011  min_lr: 0.000011  loss: 2.6446 (2.7241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1089 (1.1490)  time: 0.2047  data: 0.0007  max mem: 13273
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.6255 (2.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1125 (1.1502)  time: 0.1701  data: 0.0010  max mem: 13273
Epoch: [290] Total time: 0:04:21 (0.2090 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.6255 (2.7052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1125 (1.1502)
Test:  [ 0/25]  eta: 0:01:19  loss: 0.5905 (0.5905)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 3.1750  data: 3.0685  max mem: 13273
Test:  [10/25]  eta: 0:00:09  loss: 0.7474 (0.7610)  acc1: 84.8000 (84.3273)  acc5: 97.2000 (97.2364)  time: 0.6011  data: 0.5047  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9756 (0.9234)  acc1: 78.8000 (80.9714)  acc5: 94.4000 (95.4857)  time: 0.3085  data: 0.2133  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9813 (0.9304)  acc1: 79.2000 (80.8480)  acc5: 94.4000 (95.3600)  time: 0.1950  data: 0.1005  max mem: 13273
Test: Total time: 0:00:09 (0.3968 s / it)
* Acc@1 80.962 Acc@5 95.592 loss 0.927
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [291]  [   0/1251]  eta: 1:06:45  lr: 0.000011  min_lr: 0.000011  loss: 1.8319 (1.8319)  weight_decay: 0.0500 (0.0500)  time: 3.2019  data: 2.4021  max mem: 13273
Epoch: [291]  [ 200/1251]  eta: 0:03:52  lr: 0.000011  min_lr: 0.000011  loss: 2.6423 (2.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1019 (1.1185)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [291]  [ 400/1251]  eta: 0:03:01  lr: 0.000010  min_lr: 0.000010  loss: 2.1521 (2.6847)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1610 (1.1482)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [291]  [ 600/1251]  eta: 0:02:17  lr: 0.000010  min_lr: 0.000010  loss: 2.6411 (2.6902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0970 (1.1557)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [291]  [ 800/1251]  eta: 0:01:34  lr: 0.000010  min_lr: 0.000010  loss: 2.8407 (2.7139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0368 (1.1457)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [291]  [1000/1251]  eta: 0:00:52  lr: 0.000009  min_lr: 0.000009  loss: 2.4260 (2.7250)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0855 (1.1516)  time: 0.2063  data: 0.0004  max mem: 13273
Epoch: [291]  [1200/1251]  eta: 0:00:10  lr: 0.000009  min_lr: 0.000009  loss: 2.0611 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0951 (1.1536)  time: 0.2054  data: 0.0004  max mem: 13273
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.1964 (2.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0888 (1.1541)  time: 0.1700  data: 0.0006  max mem: 13273
Epoch: [291] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.1964 (2.7249)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0888 (1.1541)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5871 (0.5871)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5685  data: 5.4633  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7367 (0.7516)  acc1: 85.6000 (84.2182)  acc5: 97.2000 (97.2000)  time: 0.7184  data: 0.6245  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9601 (0.9100)  acc1: 78.8000 (81.0286)  acc5: 94.4000 (95.5238)  time: 0.1924  data: 0.1008  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9642 (0.9172)  acc1: 79.2000 (80.8960)  acc5: 94.4000 (95.4080)  time: 0.1952  data: 0.1043  max mem: 13273
Test: Total time: 0:00:09 (0.3972 s / it)
* Acc@1 80.986 Acc@5 95.600 loss 0.914
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [292]  [   0/1251]  eta: 1:03:06  lr: 0.000009  min_lr: 0.000009  loss: 1.7984 (1.7984)  weight_decay: 0.0500 (0.0500)  time: 3.0270  data: 2.1703  max mem: 13273
Epoch: [292]  [ 200/1251]  eta: 0:03:52  lr: 0.000009  min_lr: 0.000009  loss: 2.6205 (2.7034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1573 (1.1612)  time: 0.2059  data: 0.0004  max mem: 13273
Epoch: [292]  [ 400/1251]  eta: 0:03:01  lr: 0.000008  min_lr: 0.000008  loss: 2.7395 (2.7522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2397 (inf)  time: 0.2039  data: 0.0005  max mem: 13273
Epoch: [292]  [ 600/1251]  eta: 0:02:16  lr: 0.000008  min_lr: 0.000008  loss: 2.2148 (2.7401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0921 (inf)  time: 0.2067  data: 0.0004  max mem: 13273
Epoch: [292]  [ 800/1251]  eta: 0:01:34  lr: 0.000008  min_lr: 0.000008  loss: 2.5986 (2.7545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1756 (inf)  time: 0.2069  data: 0.0005  max mem: 13273
Epoch: [292]  [1000/1251]  eta: 0:00:52  lr: 0.000008  min_lr: 0.000008  loss: 2.4183 (2.7438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1772 (inf)  time: 0.2061  data: 0.0004  max mem: 13273
Epoch: [292]  [1200/1251]  eta: 0:00:10  lr: 0.000007  min_lr: 0.000007  loss: 2.0463 (2.7269)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1433 (inf)  time: 0.2133  data: 0.0004  max mem: 13273
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 3.0227 (2.7311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1419 (inf)  time: 0.1705  data: 0.0008  max mem: 13273
Epoch: [292] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 3.0227 (2.7203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1419 (inf)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6432 (0.6432)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.8543  data: 5.7510  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7957 (0.8140)  acc1: 84.8000 (84.4364)  acc5: 97.6000 (97.3455)  time: 0.7349  data: 0.6372  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0286 (0.9775)  acc1: 78.8000 (81.1048)  acc5: 94.4000 (95.4857)  time: 0.1953  data: 0.1016  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0435 (0.9838)  acc1: 79.2000 (80.9920)  acc5: 94.4000 (95.3280)  time: 0.1959  data: 0.1032  max mem: 13273
Test: Total time: 0:00:10 (0.4091 s / it)
* Acc@1 80.942 Acc@5 95.574 loss 0.980
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [293]  [   0/1251]  eta: 1:06:13  lr: 0.000007  min_lr: 0.000007  loss: 1.8286 (1.8286)  weight_decay: 0.0500 (0.0500)  time: 3.1759  data: 2.6640  max mem: 13273
Epoch: [293]  [ 200/1251]  eta: 0:03:52  lr: 0.000007  min_lr: 0.000007  loss: 2.1481 (2.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0593 (1.1637)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [293]  [ 400/1251]  eta: 0:03:01  lr: 0.000007  min_lr: 0.000007  loss: 2.1359 (2.6382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1062 (1.1611)  time: 0.2043  data: 0.0004  max mem: 13273
Epoch: [293]  [ 600/1251]  eta: 0:02:17  lr: 0.000006  min_lr: 0.000006  loss: 3.2384 (2.6560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1100 (1.1591)  time: 0.2067  data: 0.0004  max mem: 13273
Epoch: [293]  [ 800/1251]  eta: 0:01:34  lr: 0.000006  min_lr: 0.000006  loss: 3.0698 (2.6972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1634 (1.1596)  time: 0.2061  data: 0.0006  max mem: 13273
Epoch: [293]  [1000/1251]  eta: 0:00:52  lr: 0.000006  min_lr: 0.000006  loss: 2.7162 (2.6928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1298 (1.1590)  time: 0.2071  data: 0.0005  max mem: 13273
Epoch: [293]  [1200/1251]  eta: 0:00:10  lr: 0.000006  min_lr: 0.000006  loss: 2.2286 (2.6864)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1789 (1.1668)  time: 0.2061  data: 0.0005  max mem: 13273
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.3257 (2.6809)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.1654)  time: 0.1709  data: 0.0008  max mem: 13273
Epoch: [293] Total time: 0:04:21 (0.2092 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.3257 (2.7038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.1654)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6005 (0.6005)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5756  data: 5.4301  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7508 (0.7681)  acc1: 85.6000 (84.6182)  acc5: 97.2000 (97.2364)  time: 0.7237  data: 0.6209  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9807 (0.9322)  acc1: 78.8000 (80.9143)  acc5: 94.4000 (95.4476)  time: 0.2007  data: 0.1048  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9965 (0.9394)  acc1: 78.8000 (80.7040)  acc5: 94.4000 (95.3120)  time: 0.2017  data: 0.1077  max mem: 13273
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 80.850 Acc@5 95.562 loss 0.934
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [294]  [   0/1251]  eta: 1:03:03  lr: 0.000006  min_lr: 0.000006  loss: 3.6131 (3.6131)  weight_decay: 0.0500 (0.0500)  time: 3.0241  data: 2.6652  max mem: 13273
Epoch: [294]  [ 200/1251]  eta: 0:03:52  lr: 0.000005  min_lr: 0.000005  loss: 2.5077 (2.7452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (1.1499)  time: 0.2045  data: 0.0004  max mem: 13273
Epoch: [294]  [ 400/1251]  eta: 0:03:01  lr: 0.000005  min_lr: 0.000005  loss: 2.9945 (2.7170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1262 (1.1606)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [294]  [ 600/1251]  eta: 0:02:17  lr: 0.000005  min_lr: 0.000005  loss: 2.1602 (2.7119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1729 (1.1664)  time: 0.2047  data: 0.0004  max mem: 13273
Epoch: [294]  [ 800/1251]  eta: 0:01:34  lr: 0.000005  min_lr: 0.000005  loss: 3.1846 (2.7242)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.1614)  time: 0.2036  data: 0.0005  max mem: 13273
Epoch: [294]  [1000/1251]  eta: 0:00:52  lr: 0.000004  min_lr: 0.000004  loss: 2.9155 (2.7182)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1046 (1.1601)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [294]  [1200/1251]  eta: 0:00:10  lr: 0.000004  min_lr: 0.000004  loss: 2.0771 (2.7084)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0856 (1.1534)  time: 0.2044  data: 0.0004  max mem: 13273
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.0851 (2.7092)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1195 (1.1537)  time: 0.1703  data: 0.0007  max mem: 13273
Epoch: [294] Total time: 0:04:20 (0.2082 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.0851 (2.7106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1195 (1.1537)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5690 (0.5690)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.5432  data: 5.4369  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7220 (0.7336)  acc1: 86.0000 (84.5091)  acc5: 97.2000 (97.2727)  time: 0.7084  data: 0.6127  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9437 (0.8891)  acc1: 78.4000 (80.9905)  acc5: 94.4000 (95.5429)  time: 0.2191  data: 0.1267  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9482 (0.8971)  acc1: 78.8000 (80.7360)  acc5: 94.4000 (95.4560)  time: 0.2182  data: 0.1266  max mem: 13273
Test: Total time: 0:00:10 (0.4144 s / it)
* Acc@1 80.988 Acc@5 95.592 loss 0.894
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [295]  [   0/1251]  eta: 1:01:18  lr: 0.000004  min_lr: 0.000004  loss: 1.9215 (1.9215)  weight_decay: 0.0500 (0.0500)  time: 2.9405  data: 2.6540  max mem: 13273
Epoch: [295]  [ 200/1251]  eta: 0:03:52  lr: 0.000004  min_lr: 0.000004  loss: 2.3288 (2.7373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1000 (1.1386)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [295]  [ 400/1251]  eta: 0:03:01  lr: 0.000004  min_lr: 0.000004  loss: 2.6020 (2.7103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0532 (1.1542)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [295]  [ 600/1251]  eta: 0:02:17  lr: 0.000004  min_lr: 0.000004  loss: 3.1433 (2.7098)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1808 (1.1495)  time: 0.2046  data: 0.0004  max mem: 13273
Epoch: [295]  [ 800/1251]  eta: 0:01:34  lr: 0.000003  min_lr: 0.000003  loss: 2.2023 (2.7289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1096 (1.1592)  time: 0.2065  data: 0.0006  max mem: 13273
Epoch: [295]  [1000/1251]  eta: 0:00:52  lr: 0.000003  min_lr: 0.000003  loss: 2.8272 (2.7276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0777 (1.1576)  time: 0.2073  data: 0.0005  max mem: 13273
Epoch: [295]  [1200/1251]  eta: 0:00:10  lr: 0.000003  min_lr: 0.000003  loss: 2.1700 (2.7002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0984 (1.1566)  time: 0.2065  data: 0.0005  max mem: 13273
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.0168 (2.6970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1322 (1.1560)  time: 0.1712  data: 0.0011  max mem: 13273
Epoch: [295] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.0168 (2.6989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1322 (1.1560)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.5690 (0.5690)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.9163  data: 5.7778  max mem: 13273
Test:  [10/25]  eta: 0:00:11  loss: 0.7200 (0.7393)  acc1: 84.8000 (84.2909)  acc5: 97.2000 (97.2364)  time: 0.7335  data: 0.6313  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9531 (0.8960)  acc1: 78.8000 (81.0095)  acc5: 94.8000 (95.5238)  time: 0.1951  data: 0.1006  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9553 (0.9033)  acc1: 79.2000 (80.8000)  acc5: 94.8000 (95.4560)  time: 0.2086  data: 0.1158  max mem: 13273
Test: Total time: 0:00:10 (0.4235 s / it)
* Acc@1 80.978 Acc@5 95.616 loss 0.899
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [296]  [   0/1251]  eta: 1:04:15  lr: 0.000003  min_lr: 0.000003  loss: 3.4459 (3.4459)  weight_decay: 0.0500 (0.0500)  time: 3.0819  data: 2.6895  max mem: 13273
Epoch: [296]  [ 200/1251]  eta: 0:03:53  lr: 0.000003  min_lr: 0.000003  loss: 3.0131 (2.7308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0807 (1.1487)  time: 0.2082  data: 0.0004  max mem: 13273
Epoch: [296]  [ 400/1251]  eta: 0:03:02  lr: 0.000003  min_lr: 0.000003  loss: 3.3560 (2.7937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.1659)  time: 0.2053  data: 0.0004  max mem: 13273
Epoch: [296]  [ 600/1251]  eta: 0:02:17  lr: 0.000003  min_lr: 0.000003  loss: 2.7299 (2.7808)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0842 (1.1513)  time: 0.2066  data: 0.0006  max mem: 13273
Epoch: [296]  [ 800/1251]  eta: 0:01:34  lr: 0.000002  min_lr: 0.000002  loss: 2.8247 (2.7925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0798 (1.1439)  time: 0.2044  data: 0.0005  max mem: 13273
Epoch: [296]  [1000/1251]  eta: 0:00:52  lr: 0.000002  min_lr: 0.000002  loss: 2.2249 (2.7702)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1414 (1.1529)  time: 0.2052  data: 0.0004  max mem: 13273
Epoch: [296]  [1200/1251]  eta: 0:00:10  lr: 0.000002  min_lr: 0.000002  loss: 2.6251 (2.7639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1426 (1.1484)  time: 0.2089  data: 0.0005  max mem: 13273
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.3644 (2.7592)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1426 (1.1476)  time: 0.1701  data: 0.0007  max mem: 13273
Epoch: [296] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.3644 (2.7186)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1426 (1.1476)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5759 (0.5759)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.2858  data: 5.1774  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7225 (0.7433)  acc1: 84.8000 (84.4364)  acc5: 97.2000 (97.2727)  time: 0.7003  data: 0.6013  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9615 (0.9025)  acc1: 78.8000 (81.1429)  acc5: 94.4000 (95.4286)  time: 0.1999  data: 0.1056  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9659 (0.9105)  acc1: 79.2000 (80.8960)  acc5: 94.4000 (95.3600)  time: 0.2228  data: 0.1292  max mem: 13273
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 80.990 Acc@5 95.596 loss 0.906
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Epoch: [297]  [   0/1251]  eta: 1:05:55  lr: 0.000002  min_lr: 0.000002  loss: 3.4825 (3.4825)  weight_decay: 0.0500 (0.0500)  time: 3.1619  data: 2.3510  max mem: 13273
Epoch: [297]  [ 200/1251]  eta: 0:03:52  lr: 0.000002  min_lr: 0.000002  loss: 3.0624 (2.7449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1052 (1.1184)  time: 0.2038  data: 0.0004  max mem: 13273
Epoch: [297]  [ 400/1251]  eta: 0:03:01  lr: 0.000002  min_lr: 0.000002  loss: 2.8797 (2.7669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0586 (1.1351)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [297]  [ 600/1251]  eta: 0:02:17  lr: 0.000002  min_lr: 0.000002  loss: 2.0620 (2.7372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0573 (1.1501)  time: 0.2189  data: 0.0006  max mem: 13273
Epoch: [297]  [ 800/1251]  eta: 0:01:34  lr: 0.000002  min_lr: 0.000002  loss: 2.5003 (2.7628)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1254 (1.1496)  time: 0.2062  data: 0.0005  max mem: 13273
Epoch: [297]  [1000/1251]  eta: 0:00:52  lr: 0.000002  min_lr: 0.000002  loss: 2.1566 (2.7361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0701 (1.1555)  time: 0.2054  data: 0.0005  max mem: 13273
Epoch: [297]  [1200/1251]  eta: 0:00:10  lr: 0.000002  min_lr: 0.000002  loss: 2.5435 (2.7420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1643 (1.1619)  time: 0.2069  data: 0.0003  max mem: 13273
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.8066 (2.7351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0988 (1.1634)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [297] Total time: 0:04:21 (0.2093 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.8066 (2.7309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0988 (1.1634)
Test:  [ 0/25]  eta: 0:01:39  loss: 0.6352 (0.6352)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 3.9979  data: 3.8853  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7757 (0.7976)  acc1: 84.4000 (84.2545)  acc5: 97.6000 (97.3091)  time: 0.6855  data: 0.5880  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0057 (0.9625)  acc1: 78.8000 (80.7238)  acc5: 94.8000 (95.6191)  time: 0.2572  data: 0.1634  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0250 (0.9696)  acc1: 78.8000 (80.5760)  acc5: 94.8000 (95.4720)  time: 0.2368  data: 0.1432  max mem: 13273
Test: Total time: 0:00:09 (0.3836 s / it)
* Acc@1 80.846 Acc@5 95.608 loss 0.965
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.00%
Epoch: [298]  [   0/1251]  eta: 1:02:07  lr: 0.000002  min_lr: 0.000002  loss: 4.0787 (4.0787)  weight_decay: 0.0500 (0.0500)  time: 2.9798  data: 2.3887  max mem: 13273
Epoch: [298]  [ 200/1251]  eta: 0:03:51  lr: 0.000001  min_lr: 0.000001  loss: 2.5091 (2.7176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2358 (1.1630)  time: 0.2051  data: 0.0004  max mem: 13273
Epoch: [298]  [ 400/1251]  eta: 0:03:01  lr: 0.000001  min_lr: 0.000001  loss: 2.1651 (2.7640)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0940 (1.1476)  time: 0.2058  data: 0.0005  max mem: 13273
Epoch: [298]  [ 600/1251]  eta: 0:02:16  lr: 0.000001  min_lr: 0.000001  loss: 2.0268 (2.7361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0708 (inf)  time: 0.2065  data: 0.0006  max mem: 13273
Epoch: [298]  [ 800/1251]  eta: 0:01:34  lr: 0.000001  min_lr: 0.000001  loss: 2.2510 (2.7361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1053 (inf)  time: 0.2047  data: 0.0006  max mem: 13273
Epoch: [298]  [1000/1251]  eta: 0:00:52  lr: 0.000001  min_lr: 0.000001  loss: 2.0286 (2.7297)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1022 (inf)  time: 0.2066  data: 0.0004  max mem: 13273
Epoch: [298]  [1200/1251]  eta: 0:00:10  lr: 0.000001  min_lr: 0.000001  loss: 2.7678 (2.7356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (inf)  time: 0.2051  data: 0.0005  max mem: 13273
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.5084 (2.7367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (inf)  time: 0.1702  data: 0.0006  max mem: 13273
Epoch: [298] Total time: 0:04:20 (0.2085 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.5084 (2.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (inf)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6278 (0.6278)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6481  data: 5.5407  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7741 (0.7952)  acc1: 85.2000 (84.2909)  acc5: 97.2000 (97.3818)  time: 0.7286  data: 0.6320  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 1.0014 (0.9559)  acc1: 79.2000 (80.9714)  acc5: 94.8000 (95.6000)  time: 0.2033  data: 0.1096  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 1.0244 (0.9629)  acc1: 79.2000 (80.7680)  acc5: 94.8000 (95.4560)  time: 0.2024  data: 0.1095  max mem: 13273
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 80.912 Acc@5 95.598 loss 0.960
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.00%
Epoch: [299]  [   0/1251]  eta: 1:06:07  lr: 0.000001  min_lr: 0.000001  loss: 3.0122 (3.0122)  weight_decay: 0.0500 (0.0500)  time: 3.1713  data: 2.4801  max mem: 13273
Epoch: [299]  [ 200/1251]  eta: 0:03:53  lr: 0.000001  min_lr: 0.000001  loss: 3.2208 (2.6999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1252 (1.1916)  time: 0.2067  data: 0.0005  max mem: 13273
Epoch: [299]  [ 400/1251]  eta: 0:03:02  lr: 0.000001  min_lr: 0.000001  loss: 2.6481 (2.6851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0763 (1.1704)  time: 0.2070  data: 0.0006  max mem: 13273
Epoch: [299]  [ 600/1251]  eta: 0:02:17  lr: 0.000001  min_lr: 0.000001  loss: 3.3144 (2.6885)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0822 (1.1574)  time: 0.2059  data: 0.0005  max mem: 13273
Epoch: [299]  [ 800/1251]  eta: 0:01:34  lr: 0.000001  min_lr: 0.000001  loss: 2.9133 (2.7004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1172 (1.1581)  time: 0.2071  data: 0.0004  max mem: 13273
Epoch: [299]  [1000/1251]  eta: 0:00:52  lr: 0.000001  min_lr: 0.000001  loss: 2.9090 (2.7166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1195 (1.1610)  time: 0.2056  data: 0.0006  max mem: 13273
Epoch: [299]  [1200/1251]  eta: 0:00:10  lr: 0.000001  min_lr: 0.000001  loss: 2.1172 (2.7116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1540 (1.1607)  time: 0.2068  data: 0.0005  max mem: 13273
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.0783 (2.7116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1540 (1.1647)  time: 0.1712  data: 0.0011  max mem: 13273
Epoch: [299] Total time: 0:04:21 (0.2091 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.0783 (2.7126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1540 (1.1647)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5994 (0.5994)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.6239  data: 5.5174  max mem: 13273
Test:  [10/25]  eta: 0:00:10  loss: 0.7478 (0.7639)  acc1: 85.2000 (84.4000)  acc5: 97.2000 (97.2727)  time: 0.6838  data: 0.5892  max mem: 13273
Test:  [20/25]  eta: 0:00:02  loss: 0.9767 (0.9235)  acc1: 78.4000 (80.9143)  acc5: 94.4000 (95.5238)  time: 0.1798  data: 0.0868  max mem: 13273
Test:  [24/25]  eta: 0:00:00  loss: 0.9806 (0.9309)  acc1: 78.8000 (80.7200)  acc5: 94.4000 (95.4400)  time: 0.1998  data: 0.1076  max mem: 13273
Test: Total time: 0:00:10 (0.4032 s / it)
* Acc@1 80.978 Acc@5 95.622 loss 0.927
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.00%
Training time 22:39:06
