| distributed init (rank 0): env://, gpu 0
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 3): env://, gpu 3
Namespace(batch_size=128, epochs=300, update_freq=4, model='tiny', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.0035, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.4, cutmix=0.5, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_tiny_2.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=4, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7fb2bc3a6350>
Mixup is activated!
Model = SFCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (layer1): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((24,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=24, bias=False)
        (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): Identity()
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.003)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.006)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.009)
    )
  )
  (layer2): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.011)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.014)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.017)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.020)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.026)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.029)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.031)
    )
  )
  (layer3): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.034)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.037)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.040)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.043)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.046)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.049)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.051)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.054)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.057)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.060)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.063)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.066)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.069)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.071)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.074)
    )
    (15): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.077)
    )
    (16): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.080)
    )
    (17): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.083)
    )
    (18): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.086)
    )
    (19): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.089)
    )
  )
  (layer4): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.091)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.094)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.097)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.100)
    )
  )
  (head): ConvX(
    (conv): Conv2d(384, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 15945832
LR = 0.00350000
Batch size = 2048
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 625
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.le.conv.weight",
      "layer1.0.conv_in.conv.weight",
      "layer1.0.conv.conv.weight",
      "layer1.0.conv_out.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.le.conv.weight",
      "layer1.1.conv_in.conv.weight",
      "layer1.1.conv.conv.weight",
      "layer1.1.conv_out.conv.weight",
      "layer1.2.le.conv.weight",
      "layer1.2.conv_in.conv.weight",
      "layer1.2.conv.conv.weight",
      "layer1.2.conv_out.conv.weight",
      "layer1.3.le.conv.weight",
      "layer1.3.conv_in.conv.weight",
      "layer1.3.conv.conv.weight",
      "layer1.3.conv_out.conv.weight",
      "layer2.0.le.conv.weight",
      "layer2.0.conv_in.conv.weight",
      "layer2.0.conv.conv.weight",
      "layer2.0.conv_out.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.le.conv.weight",
      "layer2.1.conv_in.conv.weight",
      "layer2.1.conv.conv.weight",
      "layer2.1.conv_out.conv.weight",
      "layer2.2.le.conv.weight",
      "layer2.2.conv_in.conv.weight",
      "layer2.2.conv.conv.weight",
      "layer2.2.conv_out.conv.weight",
      "layer2.3.le.conv.weight",
      "layer2.3.conv_in.conv.weight",
      "layer2.3.conv.conv.weight",
      "layer2.3.conv_out.conv.weight",
      "layer2.4.le.conv.weight",
      "layer2.4.conv_in.conv.weight",
      "layer2.4.conv.conv.weight",
      "layer2.4.conv_out.conv.weight",
      "layer2.5.le.conv.weight",
      "layer2.5.conv_in.conv.weight",
      "layer2.5.conv.conv.weight",
      "layer2.5.conv_out.conv.weight",
      "layer2.6.le.conv.weight",
      "layer2.6.conv_in.conv.weight",
      "layer2.6.conv.conv.weight",
      "layer2.6.conv_out.conv.weight",
      "layer2.7.le.conv.weight",
      "layer2.7.conv_in.conv.weight",
      "layer2.7.conv.conv.weight",
      "layer2.7.conv_out.conv.weight",
      "layer3.0.le.conv.weight",
      "layer3.0.conv_in.conv.weight",
      "layer3.0.conv.conv.weight",
      "layer3.0.conv_out.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.le.conv.weight",
      "layer3.1.conv_in.conv.weight",
      "layer3.1.conv.conv.weight",
      "layer3.1.conv_out.conv.weight",
      "layer3.2.le.conv.weight",
      "layer3.2.conv_in.conv.weight",
      "layer3.2.conv.conv.weight",
      "layer3.2.conv_out.conv.weight",
      "layer3.3.le.conv.weight",
      "layer3.3.conv_in.conv.weight",
      "layer3.3.conv.conv.weight",
      "layer3.3.conv_out.conv.weight",
      "layer3.4.le.conv.weight",
      "layer3.4.conv_in.conv.weight",
      "layer3.4.conv.conv.weight",
      "layer3.4.conv_out.conv.weight",
      "layer3.5.le.conv.weight",
      "layer3.5.conv_in.conv.weight",
      "layer3.5.conv.conv.weight",
      "layer3.5.conv_out.conv.weight",
      "layer3.6.le.conv.weight",
      "layer3.6.conv_in.conv.weight",
      "layer3.6.conv.conv.weight",
      "layer3.6.conv_out.conv.weight",
      "layer3.7.le.conv.weight",
      "layer3.7.conv_in.conv.weight",
      "layer3.7.conv.conv.weight",
      "layer3.7.conv_out.conv.weight",
      "layer3.8.le.conv.weight",
      "layer3.8.conv_in.conv.weight",
      "layer3.8.conv.conv.weight",
      "layer3.8.conv_out.conv.weight",
      "layer3.9.le.conv.weight",
      "layer3.9.conv_in.conv.weight",
      "layer3.9.conv.conv.weight",
      "layer3.9.conv_out.conv.weight",
      "layer3.10.le.conv.weight",
      "layer3.10.conv_in.conv.weight",
      "layer3.10.conv.conv.weight",
      "layer3.10.conv_out.conv.weight",
      "layer3.11.le.conv.weight",
      "layer3.11.conv_in.conv.weight",
      "layer3.11.conv.conv.weight",
      "layer3.11.conv_out.conv.weight",
      "layer3.12.le.conv.weight",
      "layer3.12.conv_in.conv.weight",
      "layer3.12.conv.conv.weight",
      "layer3.12.conv_out.conv.weight",
      "layer3.13.le.conv.weight",
      "layer3.13.conv_in.conv.weight",
      "layer3.13.conv.conv.weight",
      "layer3.13.conv_out.conv.weight",
      "layer3.14.le.conv.weight",
      "layer3.14.conv_in.conv.weight",
      "layer3.14.conv.conv.weight",
      "layer3.14.conv_out.conv.weight",
      "layer3.15.le.conv.weight",
      "layer3.15.conv_in.conv.weight",
      "layer3.15.conv.conv.weight",
      "layer3.15.conv_out.conv.weight",
      "layer3.16.le.conv.weight",
      "layer3.16.conv_in.conv.weight",
      "layer3.16.conv.conv.weight",
      "layer3.16.conv_out.conv.weight",
      "layer3.17.le.conv.weight",
      "layer3.17.conv_in.conv.weight",
      "layer3.17.conv.conv.weight",
      "layer3.17.conv_out.conv.weight",
      "layer3.18.le.conv.weight",
      "layer3.18.conv_in.conv.weight",
      "layer3.18.conv.conv.weight",
      "layer3.18.conv_out.conv.weight",
      "layer3.19.le.conv.weight",
      "layer3.19.conv_in.conv.weight",
      "layer3.19.conv.conv.weight",
      "layer3.19.conv_out.conv.weight",
      "layer4.0.le.conv.weight",
      "layer4.0.conv_in.conv.weight",
      "layer4.0.conv.conv.weight",
      "layer4.0.conv_out.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.le.conv.weight",
      "layer4.1.conv_in.conv.weight",
      "layer4.1.conv.conv.weight",
      "layer4.1.conv_out.conv.weight",
      "layer4.2.le.conv.weight",
      "layer4.2.conv_in.conv.weight",
      "layer4.2.conv.conv.weight",
      "layer4.2.conv_out.conv.weight",
      "layer4.3.le.conv.weight",
      "layer4.3.conv_in.conv.weight",
      "layer4.3.conv.conv.weight",
      "layer4.3.conv_out.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.ln.weight",
      "layer1.0.ln.bias",
      "layer1.0.le.norm.weight",
      "layer1.0.le.norm.bias",
      "layer1.0.conv_in.norm.weight",
      "layer1.0.conv_in.norm.bias",
      "layer1.0.conv.norm.weight",
      "layer1.0.conv.norm.bias",
      "layer1.0.conv_out.norm.weight",
      "layer1.0.conv_out.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.le.norm.weight",
      "layer1.1.le.norm.bias",
      "layer1.1.conv_in.norm.weight",
      "layer1.1.conv_in.norm.bias",
      "layer1.1.conv.norm.weight",
      "layer1.1.conv.norm.bias",
      "layer1.1.conv_out.norm.weight",
      "layer1.1.conv_out.norm.bias",
      "layer1.2.le.norm.weight",
      "layer1.2.le.norm.bias",
      "layer1.2.conv_in.norm.weight",
      "layer1.2.conv_in.norm.bias",
      "layer1.2.conv.norm.weight",
      "layer1.2.conv.norm.bias",
      "layer1.2.conv_out.norm.weight",
      "layer1.2.conv_out.norm.bias",
      "layer1.3.le.norm.weight",
      "layer1.3.le.norm.bias",
      "layer1.3.conv_in.norm.weight",
      "layer1.3.conv_in.norm.bias",
      "layer1.3.conv.norm.weight",
      "layer1.3.conv.norm.bias",
      "layer1.3.conv_out.norm.weight",
      "layer1.3.conv_out.norm.bias",
      "layer2.0.ln.weight",
      "layer2.0.ln.bias",
      "layer2.0.le.norm.weight",
      "layer2.0.le.norm.bias",
      "layer2.0.conv_in.norm.weight",
      "layer2.0.conv_in.norm.bias",
      "layer2.0.conv.norm.weight",
      "layer2.0.conv.norm.bias",
      "layer2.0.conv_out.norm.weight",
      "layer2.0.conv_out.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.le.norm.weight",
      "layer2.1.le.norm.bias",
      "layer2.1.conv_in.norm.weight",
      "layer2.1.conv_in.norm.bias",
      "layer2.1.conv.norm.weight",
      "layer2.1.conv.norm.bias",
      "layer2.1.conv_out.norm.weight",
      "layer2.1.conv_out.norm.bias",
      "layer2.2.le.norm.weight",
      "layer2.2.le.norm.bias",
      "layer2.2.conv_in.norm.weight",
      "layer2.2.conv_in.norm.bias",
      "layer2.2.conv.norm.weight",
      "layer2.2.conv.norm.bias",
      "layer2.2.conv_out.norm.weight",
      "layer2.2.conv_out.norm.bias",
      "layer2.3.le.norm.weight",
      "layer2.3.le.norm.bias",
      "layer2.3.conv_in.norm.weight",
      "layer2.3.conv_in.norm.bias",
      "layer2.3.conv.norm.weight",
      "layer2.3.conv.norm.bias",
      "layer2.3.conv_out.norm.weight",
      "layer2.3.conv_out.norm.bias",
      "layer2.4.le.norm.weight",
      "layer2.4.le.norm.bias",
      "layer2.4.conv_in.norm.weight",
      "layer2.4.conv_in.norm.bias",
      "layer2.4.conv.norm.weight",
      "layer2.4.conv.norm.bias",
      "layer2.4.conv_out.norm.weight",
      "layer2.4.conv_out.norm.bias",
      "layer2.5.le.norm.weight",
      "layer2.5.le.norm.bias",
      "layer2.5.conv_in.norm.weight",
      "layer2.5.conv_in.norm.bias",
      "layer2.5.conv.norm.weight",
      "layer2.5.conv.norm.bias",
      "layer2.5.conv_out.norm.weight",
      "layer2.5.conv_out.norm.bias",
      "layer2.6.le.norm.weight",
      "layer2.6.le.norm.bias",
      "layer2.6.conv_in.norm.weight",
      "layer2.6.conv_in.norm.bias",
      "layer2.6.conv.norm.weight",
      "layer2.6.conv.norm.bias",
      "layer2.6.conv_out.norm.weight",
      "layer2.6.conv_out.norm.bias",
      "layer2.7.le.norm.weight",
      "layer2.7.le.norm.bias",
      "layer2.7.conv_in.norm.weight",
      "layer2.7.conv_in.norm.bias",
      "layer2.7.conv.norm.weight",
      "layer2.7.conv.norm.bias",
      "layer2.7.conv_out.norm.weight",
      "layer2.7.conv_out.norm.bias",
      "layer3.0.ln.weight",
      "layer3.0.ln.bias",
      "layer3.0.le.norm.weight",
      "layer3.0.le.norm.bias",
      "layer3.0.conv_in.norm.weight",
      "layer3.0.conv_in.norm.bias",
      "layer3.0.conv.norm.weight",
      "layer3.0.conv.norm.bias",
      "layer3.0.conv_out.norm.weight",
      "layer3.0.conv_out.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.le.norm.weight",
      "layer3.1.le.norm.bias",
      "layer3.1.conv_in.norm.weight",
      "layer3.1.conv_in.norm.bias",
      "layer3.1.conv.norm.weight",
      "layer3.1.conv.norm.bias",
      "layer3.1.conv_out.norm.weight",
      "layer3.1.conv_out.norm.bias",
      "layer3.2.le.norm.weight",
      "layer3.2.le.norm.bias",
      "layer3.2.conv_in.norm.weight",
      "layer3.2.conv_in.norm.bias",
      "layer3.2.conv.norm.weight",
      "layer3.2.conv.norm.bias",
      "layer3.2.conv_out.norm.weight",
      "layer3.2.conv_out.norm.bias",
      "layer3.3.le.norm.weight",
      "layer3.3.le.norm.bias",
      "layer3.3.conv_in.norm.weight",
      "layer3.3.conv_in.norm.bias",
      "layer3.3.conv.norm.weight",
      "layer3.3.conv.norm.bias",
      "layer3.3.conv_out.norm.weight",
      "layer3.3.conv_out.norm.bias",
      "layer3.4.le.norm.weight",
      "layer3.4.le.norm.bias",
      "layer3.4.conv_in.norm.weight",
      "layer3.4.conv_in.norm.bias",
      "layer3.4.conv.norm.weight",
      "layer3.4.conv.norm.bias",
      "layer3.4.conv_out.norm.weight",
      "layer3.4.conv_out.norm.bias",
      "layer3.5.le.norm.weight",
      "layer3.5.le.norm.bias",
      "layer3.5.conv_in.norm.weight",
      "layer3.5.conv_in.norm.bias",
      "layer3.5.conv.norm.weight",
      "layer3.5.conv.norm.bias",
      "layer3.5.conv_out.norm.weight",
      "layer3.5.conv_out.norm.bias",
      "layer3.6.le.norm.weight",
      "layer3.6.le.norm.bias",
      "layer3.6.conv_in.norm.weight",
      "layer3.6.conv_in.norm.bias",
      "layer3.6.conv.norm.weight",
      "layer3.6.conv.norm.bias",
      "layer3.6.conv_out.norm.weight",
      "layer3.6.conv_out.norm.bias",
      "layer3.7.le.norm.weight",
      "layer3.7.le.norm.bias",
      "layer3.7.conv_in.norm.weight",
      "layer3.7.conv_in.norm.bias",
      "layer3.7.conv.norm.weight",
      "layer3.7.conv.norm.bias",
      "layer3.7.conv_out.norm.weight",
      "layer3.7.conv_out.norm.bias",
      "layer3.8.le.norm.weight",
      "layer3.8.le.norm.bias",
      "layer3.8.conv_in.norm.weight",
      "layer3.8.conv_in.norm.bias",
      "layer3.8.conv.norm.weight",
      "layer3.8.conv.norm.bias",
      "layer3.8.conv_out.norm.weight",
      "layer3.8.conv_out.norm.bias",
      "layer3.9.le.norm.weight",
      "layer3.9.le.norm.bias",
      "layer3.9.conv_in.norm.weight",
      "layer3.9.conv_in.norm.bias",
      "layer3.9.conv.norm.weight",
      "layer3.9.conv.norm.bias",
      "layer3.9.conv_out.norm.weight",
      "layer3.9.conv_out.norm.bias",
      "layer3.10.le.norm.weight",
      "layer3.10.le.norm.bias",
      "layer3.10.conv_in.norm.weight",
      "layer3.10.conv_in.norm.bias",
      "layer3.10.conv.norm.weight",
      "layer3.10.conv.norm.bias",
      "layer3.10.conv_out.norm.weight",
      "layer3.10.conv_out.norm.bias",
      "layer3.11.le.norm.weight",
      "layer3.11.le.norm.bias",
      "layer3.11.conv_in.norm.weight",
      "layer3.11.conv_in.norm.bias",
      "layer3.11.conv.norm.weight",
      "layer3.11.conv.norm.bias",
      "layer3.11.conv_out.norm.weight",
      "layer3.11.conv_out.norm.bias",
      "layer3.12.le.norm.weight",
      "layer3.12.le.norm.bias",
      "layer3.12.conv_in.norm.weight",
      "layer3.12.conv_in.norm.bias",
      "layer3.12.conv.norm.weight",
      "layer3.12.conv.norm.bias",
      "layer3.12.conv_out.norm.weight",
      "layer3.12.conv_out.norm.bias",
      "layer3.13.le.norm.weight",
      "layer3.13.le.norm.bias",
      "layer3.13.conv_in.norm.weight",
      "layer3.13.conv_in.norm.bias",
      "layer3.13.conv.norm.weight",
      "layer3.13.conv.norm.bias",
      "layer3.13.conv_out.norm.weight",
      "layer3.13.conv_out.norm.bias",
      "layer3.14.le.norm.weight",
      "layer3.14.le.norm.bias",
      "layer3.14.conv_in.norm.weight",
      "layer3.14.conv_in.norm.bias",
      "layer3.14.conv.norm.weight",
      "layer3.14.conv.norm.bias",
      "layer3.14.conv_out.norm.weight",
      "layer3.14.conv_out.norm.bias",
      "layer3.15.le.norm.weight",
      "layer3.15.le.norm.bias",
      "layer3.15.conv_in.norm.weight",
      "layer3.15.conv_in.norm.bias",
      "layer3.15.conv.norm.weight",
      "layer3.15.conv.norm.bias",
      "layer3.15.conv_out.norm.weight",
      "layer3.15.conv_out.norm.bias",
      "layer3.16.le.norm.weight",
      "layer3.16.le.norm.bias",
      "layer3.16.conv_in.norm.weight",
      "layer3.16.conv_in.norm.bias",
      "layer3.16.conv.norm.weight",
      "layer3.16.conv.norm.bias",
      "layer3.16.conv_out.norm.weight",
      "layer3.16.conv_out.norm.bias",
      "layer3.17.le.norm.weight",
      "layer3.17.le.norm.bias",
      "layer3.17.conv_in.norm.weight",
      "layer3.17.conv_in.norm.bias",
      "layer3.17.conv.norm.weight",
      "layer3.17.conv.norm.bias",
      "layer3.17.conv_out.norm.weight",
      "layer3.17.conv_out.norm.bias",
      "layer3.18.le.norm.weight",
      "layer3.18.le.norm.bias",
      "layer3.18.conv_in.norm.weight",
      "layer3.18.conv_in.norm.bias",
      "layer3.18.conv.norm.weight",
      "layer3.18.conv.norm.bias",
      "layer3.18.conv_out.norm.weight",
      "layer3.18.conv_out.norm.bias",
      "layer3.19.le.norm.weight",
      "layer3.19.le.norm.bias",
      "layer3.19.conv_in.norm.weight",
      "layer3.19.conv_in.norm.bias",
      "layer3.19.conv.norm.weight",
      "layer3.19.conv.norm.bias",
      "layer3.19.conv_out.norm.weight",
      "layer3.19.conv_out.norm.bias",
      "layer4.0.ln.weight",
      "layer4.0.ln.bias",
      "layer4.0.le.norm.weight",
      "layer4.0.le.norm.bias",
      "layer4.0.conv_in.norm.weight",
      "layer4.0.conv_in.norm.bias",
      "layer4.0.conv.norm.weight",
      "layer4.0.conv.norm.bias",
      "layer4.0.conv_out.norm.weight",
      "layer4.0.conv_out.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.le.norm.weight",
      "layer4.1.le.norm.bias",
      "layer4.1.conv_in.norm.weight",
      "layer4.1.conv_in.norm.bias",
      "layer4.1.conv.norm.weight",
      "layer4.1.conv.norm.bias",
      "layer4.1.conv_out.norm.weight",
      "layer4.1.conv_out.norm.bias",
      "layer4.2.le.norm.weight",
      "layer4.2.le.norm.bias",
      "layer4.2.conv_in.norm.weight",
      "layer4.2.conv_in.norm.bias",
      "layer4.2.conv.norm.weight",
      "layer4.2.conv.norm.bias",
      "layer4.2.conv_out.norm.weight",
      "layer4.2.conv_out.norm.bias",
      "layer4.3.le.norm.weight",
      "layer4.3.le.norm.bias",
      "layer4.3.conv_in.norm.weight",
      "layer4.3.conv_in.norm.bias",
      "layer4.3.conv.norm.weight",
      "layer4.3.conv.norm.bias",
      "layer4.3.conv_out.norm.weight",
      "layer4.3.conv_out.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 12500
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/2502]  eta: 8:32:18  lr: 0.000000  min_lr: 0.000000  loss: 6.9405 (6.9405)  weight_decay: 0.0500 (0.0500)  time: 12.2856  data: 1.8066  max mem: 18975
Epoch: [0]  [ 200/2502]  eta: 0:12:12  lr: 0.000014  min_lr: 0.000014  loss: 6.9172 (6.9393)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.2784 (12.5995)  time: 0.2569  data: 0.0005  max mem: 18975
Epoch: [0]  [ 400/2502]  eta: 0:10:04  lr: 0.000028  min_lr: 0.000028  loss: 6.8420 (6.9052)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9936 (8.8194)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [0]  [ 600/2502]  eta: 0:08:47  lr: 0.000042  min_lr: 0.000042  loss: 6.7482 (6.8678)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.0458 (7.3275)  time: 0.2560  data: 0.0005  max mem: 18975
Epoch: [0]  [ 800/2502]  eta: 0:07:43  lr: 0.000056  min_lr: 0.000056  loss: 6.6564 (6.8334)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6233 (inf)  time: 0.2582  data: 0.0006  max mem: 18975
Epoch: [0]  [1000/2502]  eta: 0:06:44  lr: 0.000070  min_lr: 0.000070  loss: 6.6939 (6.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.7496 (inf)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [0]  [1200/2502]  eta: 0:05:47  lr: 0.000084  min_lr: 0.000084  loss: 6.6283 (6.7695)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.3247 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [0]  [1400/2502]  eta: 0:04:52  lr: 0.000098  min_lr: 0.000098  loss: 6.5304 (6.7424)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.7182 (inf)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [0]  [1600/2502]  eta: 0:03:58  lr: 0.000112  min_lr: 0.000112  loss: 6.4370 (6.7171)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.7832 (inf)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [0]  [1800/2502]  eta: 0:03:05  lr: 0.000126  min_lr: 0.000126  loss: 6.4015 (6.6904)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.6160 (inf)  time: 0.2572  data: 0.0005  max mem: 18975
Epoch: [0]  [2000/2502]  eta: 0:02:12  lr: 0.000140  min_lr: 0.000140  loss: 6.4646 (6.6672)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.4221 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [0]  [2200/2502]  eta: 0:01:19  lr: 0.000154  min_lr: 0.000154  loss: 6.3713 (6.6458)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.9245 (inf)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [0]  [2400/2502]  eta: 0:00:26  lr: 0.000168  min_lr: 0.000168  loss: 6.2673 (6.6231)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.6318 (inf)  time: 0.2583  data: 0.0005  max mem: 18975
Epoch: [0]  [2501/2502]  eta: 0:00:00  lr: 0.000175  min_lr: 0.000175  loss: 6.2060 (6.6115)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.0498 (inf)  time: 0.2314  data: 0.0006  max mem: 18975
Epoch: [0] Total time: 0:10:55 (0.2621 s / it)
Averaged stats: lr: 0.000175  min_lr: 0.000175  loss: 6.2060 (6.6129)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.0498 (inf)
Test:  [ 0/50]  eta: 0:05:46  loss: 5.3231 (5.3231)  acc1: 6.4000 (6.4000)  acc5: 22.0000 (22.0000)  time: 6.9380  data: 2.4180  max mem: 18975
Test:  [10/50]  eta: 0:00:29  loss: 5.5474 (5.5434)  acc1: 4.4000 (4.8364)  acc5: 14.8000 (15.2000)  time: 0.7392  data: 0.2201  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 5.5474 (5.5204)  acc1: 4.0000 (4.7048)  acc5: 15.2000 (16.1333)  time: 0.1205  data: 0.0003  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 5.5433 (5.5399)  acc1: 4.4000 (4.8258)  acc5: 16.4000 (16.5419)  time: 0.1985  data: 0.0763  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 5.5767 (5.5467)  acc1: 5.2000 (5.2195)  acc5: 17.2000 (16.6537)  time: 0.3304  data: 0.2075  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 5.4734 (5.4981)  acc1: 6.0000 (5.6160)  acc5: 19.2000 (17.7600)  time: 0.2884  data: 0.1677  max mem: 18975
Test: Total time: 0:00:17 (0.3560 s / it)
* Acc@1 5.522 Acc@5 17.200 loss 5.503
Accuracy of the model on the 50000 test images: 5.5%
Max accuracy: 5.52%
Epoch: [1]  [   0/2502]  eta: 2:07:39  lr: 0.000175  min_lr: 0.000175  loss: 6.4180 (6.4180)  weight_decay: 0.0500 (0.0500)  time: 3.0613  data: 2.7140  max mem: 18975
Epoch: [1]  [ 200/2502]  eta: 0:10:27  lr: 0.000189  min_lr: 0.000189  loss: 6.3845 (6.3527)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.0029 (12.0162)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [1]  [ 400/2502]  eta: 0:09:16  lr: 0.000203  min_lr: 0.000203  loss: 6.1285 (6.3281)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.0564 (12.1125)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [1]  [ 600/2502]  eta: 0:08:18  lr: 0.000217  min_lr: 0.000217  loss: 6.3715 (6.3114)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.4336 (12.1261)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [1]  [ 800/2502]  eta: 0:07:24  lr: 0.000231  min_lr: 0.000231  loss: 6.1909 (6.2954)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.8196 (12.2482)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [1]  [1000/2502]  eta: 0:06:30  lr: 0.000245  min_lr: 0.000245  loss: 6.3602 (6.2779)  weight_decay: 0.0500 (0.0500)  grad_norm: 12.1274 (12.3603)  time: 0.2590  data: 0.0004  max mem: 18975
Epoch: [1]  [1200/2502]  eta: 0:05:37  lr: 0.000259  min_lr: 0.000259  loss: 6.3806 (6.2572)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.3250 (12.7246)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [1]  [1400/2502]  eta: 0:04:45  lr: 0.000273  min_lr: 0.000273  loss: 6.1817 (6.2414)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.9234 (12.5988)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [1]  [1600/2502]  eta: 0:03:53  lr: 0.000287  min_lr: 0.000287  loss: 6.0534 (6.2246)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.6331 (12.6279)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [1]  [1800/2502]  eta: 0:03:01  lr: 0.000301  min_lr: 0.000301  loss: 6.3537 (6.2113)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.4815 (12.5262)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [1]  [2000/2502]  eta: 0:02:09  lr: 0.000315  min_lr: 0.000315  loss: 6.2936 (6.1979)  weight_decay: 0.0500 (0.0500)  grad_norm: 11.7767 (12.3442)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [1]  [2200/2502]  eta: 0:01:17  lr: 0.000329  min_lr: 0.000329  loss: 6.3882 (6.1896)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.3242 (12.1801)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [1]  [2400/2502]  eta: 0:00:26  lr: 0.000343  min_lr: 0.000343  loss: 6.0301 (6.1715)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.3665 (12.0046)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [1]  [2501/2502]  eta: 0:00:00  lr: 0.000350  min_lr: 0.000350  loss: 6.0070 (6.1660)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.7695 (11.9666)  time: 0.2369  data: 0.0007  max mem: 18975
Epoch: [1] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.000350  min_lr: 0.000350  loss: 6.0070 (6.1683)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.7695 (11.9666)
Test:  [ 0/50]  eta: 0:03:11  loss: 4.7278 (4.7278)  acc1: 13.2000 (13.2000)  acc5: 34.0000 (34.0000)  time: 3.8331  data: 3.6864  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 4.6037 (4.6392)  acc1: 12.8000 (11.5636)  acc5: 31.6000 (30.7273)  time: 0.5172  data: 0.3955  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 4.5963 (4.6163)  acc1: 12.8000 (13.0476)  acc5: 32.8000 (31.6381)  time: 0.1768  data: 0.0576  max mem: 18975
Test:  [30/50]  eta: 0:00:05  loss: 4.7437 (4.6911)  acc1: 13.2000 (13.0323)  acc5: 30.0000 (31.0065)  time: 0.1722  data: 0.0521  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 4.8785 (4.7472)  acc1: 12.8000 (12.8585)  acc5: 27.6000 (30.0683)  time: 0.1796  data: 0.0595  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 4.8665 (4.7196)  acc1: 13.2000 (13.3280)  acc5: 27.6000 (30.7920)  time: 0.1780  data: 0.0589  max mem: 18975
Test: Total time: 0:00:12 (0.2538 s / it)
* Acc@1 12.908 Acc@5 30.680 loss 4.719
Accuracy of the model on the 50000 test images: 12.9%
Max accuracy: 12.91%
Epoch: [2]  [   0/2502]  eta: 2:09:35  lr: 0.000350  min_lr: 0.000350  loss: 5.8410 (5.8410)  weight_decay: 0.0500 (0.0500)  time: 3.1078  data: 2.8048  max mem: 18975
Epoch: [2]  [ 200/2502]  eta: 0:10:33  lr: 0.000364  min_lr: 0.000364  loss: 5.7591 (5.9551)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.3180 (11.1016)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [2]  [ 400/2502]  eta: 0:09:20  lr: 0.000378  min_lr: 0.000378  loss: 5.9150 (5.9383)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.6811 (10.2667)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [2]  [ 600/2502]  eta: 0:08:21  lr: 0.000392  min_lr: 0.000392  loss: 5.6175 (5.9321)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.5178 (10.2195)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [2]  [ 800/2502]  eta: 0:07:25  lr: 0.000406  min_lr: 0.000406  loss: 6.0822 (5.9278)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.3960 (10.0962)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [2]  [1000/2502]  eta: 0:06:32  lr: 0.000420  min_lr: 0.000420  loss: 6.0641 (5.9227)  weight_decay: 0.0500 (0.0500)  grad_norm: 9.0805 (10.0309)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [2]  [1200/2502]  eta: 0:05:38  lr: 0.000434  min_lr: 0.000434  loss: 6.0600 (5.9182)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.3640 (9.8372)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [2]  [1400/2502]  eta: 0:04:46  lr: 0.000448  min_lr: 0.000448  loss: 5.8536 (5.9075)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.0916 (9.7774)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [2]  [1600/2502]  eta: 0:03:54  lr: 0.000462  min_lr: 0.000462  loss: 6.0988 (5.8982)  weight_decay: 0.0500 (0.0500)  grad_norm: 10.0995 (9.7736)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [2]  [1800/2502]  eta: 0:03:02  lr: 0.000476  min_lr: 0.000476  loss: 5.5230 (5.8797)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.6693 (9.6145)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [2]  [2000/2502]  eta: 0:02:10  lr: 0.000490  min_lr: 0.000490  loss: 5.5846 (5.8686)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.1805 (9.4778)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [2]  [2200/2502]  eta: 0:01:18  lr: 0.000504  min_lr: 0.000504  loss: 5.5429 (5.8604)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.1253 (9.3744)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [2]  [2400/2502]  eta: 0:00:26  lr: 0.000518  min_lr: 0.000518  loss: 5.6059 (5.8485)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.6506 (9.2522)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [2]  [2501/2502]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 5.7175 (5.8430)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.4486 (9.1986)  time: 0.2310  data: 0.0007  max mem: 18975
Epoch: [2] Total time: 0:10:47 (0.2589 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 5.7175 (5.8451)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.4486 (9.1986)
Test:  [ 0/50]  eta: 0:02:09  loss: 3.5918 (3.5918)  acc1: 26.4000 (26.4000)  acc5: 51.6000 (51.6000)  time: 2.5875  data: 2.4439  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 3.6989 (3.7016)  acc1: 23.6000 (24.0000)  acc5: 51.6000 (50.7273)  time: 0.4276  data: 0.3036  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 3.6989 (3.7164)  acc1: 22.4000 (24.0190)  acc5: 50.8000 (50.9905)  time: 0.2590  data: 0.1381  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 3.9656 (3.8606)  acc1: 22.0000 (23.1742)  acc5: 46.8000 (48.4258)  time: 0.2929  data: 0.1708  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 4.1991 (3.9507)  acc1: 20.4000 (22.5463)  acc5: 41.6000 (46.6146)  time: 0.2302  data: 0.1084  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 4.0565 (3.9186)  acc1: 21.2000 (23.1920)  acc5: 43.2000 (47.2960)  time: 0.1766  data: 0.0575  max mem: 18975
Test: Total time: 0:00:14 (0.2824 s / it)
* Acc@1 23.384 Acc@5 47.012 loss 3.913
Accuracy of the model on the 50000 test images: 23.4%
Max accuracy: 23.38%
Epoch: [3]  [   0/2502]  eta: 2:11:32  lr: 0.000525  min_lr: 0.000525  loss: 4.9412 (4.9412)  weight_decay: 0.0500 (0.0500)  time: 3.1543  data: 2.8558  max mem: 18975
Epoch: [3]  [ 200/2502]  eta: 0:10:30  lr: 0.000539  min_lr: 0.000539  loss: 5.8752 (5.6888)  weight_decay: 0.0500 (0.0500)  grad_norm: 8.2259 (7.3598)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [3]  [ 400/2502]  eta: 0:09:18  lr: 0.000553  min_lr: 0.000553  loss: 5.6950 (5.6831)  weight_decay: 0.0500 (0.0500)  grad_norm: 7.2697 (7.5138)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [3]  [ 600/2502]  eta: 0:08:19  lr: 0.000567  min_lr: 0.000567  loss: 5.6658 (5.6485)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4919 (7.6088)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [3]  [ 800/2502]  eta: 0:07:24  lr: 0.000581  min_lr: 0.000581  loss: 5.5585 (5.6494)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.3169 (7.6477)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [3]  [1000/2502]  eta: 0:06:30  lr: 0.000595  min_lr: 0.000595  loss: 5.9302 (5.6317)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.7518 (7.4828)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [3]  [1200/2502]  eta: 0:05:37  lr: 0.000609  min_lr: 0.000609  loss: 5.7382 (5.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.7401 (7.4603)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [3]  [1400/2502]  eta: 0:04:45  lr: 0.000623  min_lr: 0.000623  loss: 5.6591 (5.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4282 (7.4635)  time: 0.2631  data: 0.0004  max mem: 18975
Epoch: [3]  [1600/2502]  eta: 0:03:53  lr: 0.000637  min_lr: 0.000637  loss: 5.5003 (5.5940)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.4016 (7.3864)  time: 0.2647  data: 0.0004  max mem: 18975
Epoch: [3]  [1800/2502]  eta: 0:03:01  lr: 0.000651  min_lr: 0.000651  loss: 5.7104 (5.5801)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.9206 (7.3049)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [3]  [2000/2502]  eta: 0:02:09  lr: 0.000665  min_lr: 0.000665  loss: 5.6757 (5.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.6050 (7.1611)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [3]  [2200/2502]  eta: 0:01:18  lr: 0.000679  min_lr: 0.000679  loss: 5.1027 (5.5619)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3004 (7.0629)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [3]  [2400/2502]  eta: 0:00:26  lr: 0.000693  min_lr: 0.000693  loss: 5.6175 (5.5487)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3155 (6.9981)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [3]  [2501/2502]  eta: 0:00:00  lr: 0.000700  min_lr: 0.000700  loss: 5.0400 (5.5416)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5183 (6.9614)  time: 0.2312  data: 0.0007  max mem: 18975
Epoch: [3] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000700  min_lr: 0.000700  loss: 5.0400 (5.5421)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5183 (6.9614)
Test:  [ 0/50]  eta: 0:02:12  loss: 2.7371 (2.7371)  acc1: 41.6000 (41.6000)  acc5: 72.4000 (72.4000)  time: 2.6431  data: 2.4937  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 3.0759 (3.0214)  acc1: 39.6000 (35.3091)  acc5: 62.8000 (63.0182)  time: 0.4654  data: 0.3430  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 2.9505 (2.9565)  acc1: 36.4000 (36.7619)  acc5: 64.4000 (64.8000)  time: 0.2589  data: 0.1396  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 3.0918 (3.1653)  acc1: 32.4000 (34.4645)  acc5: 59.2000 (60.9032)  time: 0.2415  data: 0.1226  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 3.6969 (3.3000)  acc1: 26.8000 (32.7122)  acc5: 50.0000 (58.1366)  time: 0.2059  data: 0.0868  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 3.6650 (3.2980)  acc1: 26.8000 (32.8320)  acc5: 50.4000 (58.3280)  time: 0.1714  data: 0.0524  max mem: 18975
Test: Total time: 0:00:13 (0.2680 s / it)
* Acc@1 32.990 Acc@5 58.286 loss 3.294
Accuracy of the model on the 50000 test images: 33.0%
Max accuracy: 32.99%
Epoch: [4]  [   0/2502]  eta: 1:53:54  lr: 0.000700  min_lr: 0.000700  loss: 5.9108 (5.9108)  weight_decay: 0.0500 (0.0500)  time: 2.7314  data: 2.4311  max mem: 18975
Epoch: [4]  [ 200/2502]  eta: 0:10:27  lr: 0.000714  min_lr: 0.000714  loss: 5.3766 (5.4677)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.8726 (6.3386)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [4]  [ 400/2502]  eta: 0:09:16  lr: 0.000728  min_lr: 0.000728  loss: 4.8795 (5.4021)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.2124 (6.3824)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [4]  [ 600/2502]  eta: 0:08:18  lr: 0.000742  min_lr: 0.000742  loss: 5.6404 (5.3732)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.9744 (6.1339)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [4]  [ 800/2502]  eta: 0:07:23  lr: 0.000756  min_lr: 0.000756  loss: 5.5817 (5.3713)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.5411 (5.9862)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [4]  [1000/2502]  eta: 0:06:30  lr: 0.000770  min_lr: 0.000770  loss: 5.6091 (5.3495)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.0223 (5.9228)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [4]  [1200/2502]  eta: 0:05:37  lr: 0.000784  min_lr: 0.000784  loss: 5.3945 (5.3499)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.5547 (5.7995)  time: 0.2577  data: 0.0005  max mem: 18975
Epoch: [4]  [1400/2502]  eta: 0:04:45  lr: 0.000798  min_lr: 0.000798  loss: 5.4880 (5.3478)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.3042 (5.6837)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [4]  [1600/2502]  eta: 0:03:53  lr: 0.000812  min_lr: 0.000812  loss: 5.4401 (5.3407)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.7101 (5.6268)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [4]  [1800/2502]  eta: 0:03:01  lr: 0.000826  min_lr: 0.000826  loss: 4.9131 (5.3276)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3948 (5.5508)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [4]  [2000/2502]  eta: 0:02:09  lr: 0.000840  min_lr: 0.000840  loss: 4.6614 (5.3178)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.6822 (5.5075)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [4]  [2200/2502]  eta: 0:01:18  lr: 0.000854  min_lr: 0.000854  loss: 4.8005 (5.3052)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.4514 (5.4687)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [4]  [2400/2502]  eta: 0:00:26  lr: 0.000868  min_lr: 0.000868  loss: 5.0971 (5.3055)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.5310 (5.4639)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [4]  [2501/2502]  eta: 0:00:00  lr: 0.000875  min_lr: 0.000875  loss: 4.7322 (5.2983)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0362 (5.4257)  time: 0.2304  data: 0.0006  max mem: 18975
Epoch: [4] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000875  min_lr: 0.000875  loss: 4.7322 (5.2905)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0362 (5.4257)
Test:  [ 0/50]  eta: 0:02:40  loss: 2.2736 (2.2736)  acc1: 52.4000 (52.4000)  acc5: 79.6000 (79.6000)  time: 3.2028  data: 3.0633  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 2.7120 (2.6153)  acc1: 46.0000 (43.5273)  acc5: 70.8000 (71.7818)  time: 0.5539  data: 0.4318  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 2.5647 (2.5366)  acc1: 42.4000 (44.8571)  acc5: 72.8000 (73.3905)  time: 0.2402  data: 0.1199  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 2.7159 (2.7781)  acc1: 40.4000 (41.9226)  acc5: 64.8000 (68.9677)  time: 0.1947  data: 0.0750  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 3.3537 (2.9221)  acc1: 33.6000 (39.6488)  acc5: 58.0000 (66.3220)  time: 0.2128  data: 0.0938  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 3.3115 (2.9370)  acc1: 33.6000 (39.6080)  acc5: 60.0000 (65.9600)  time: 0.1956  data: 0.0768  max mem: 18975
Test: Total time: 0:00:13 (0.2788 s / it)
* Acc@1 39.854 Acc@5 65.850 loss 2.933
Accuracy of the model on the 50000 test images: 39.9%
Max accuracy: 39.85%
Epoch: [5]  [   0/2502]  eta: 1:58:11  lr: 0.000875  min_lr: 0.000875  loss: 5.0442 (5.0442)  weight_decay: 0.0500 (0.0500)  time: 2.8345  data: 2.5347  max mem: 18975
Epoch: [5]  [ 200/2502]  eta: 0:10:26  lr: 0.000889  min_lr: 0.000889  loss: 4.9264 (5.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0229 (4.5494)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [5]  [ 400/2502]  eta: 0:09:15  lr: 0.000903  min_lr: 0.000903  loss: 4.7094 (5.1598)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.7153 (4.8812)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [5]  [ 600/2502]  eta: 0:08:18  lr: 0.000917  min_lr: 0.000917  loss: 5.5947 (5.1698)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0589 (4.6128)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [5]  [ 800/2502]  eta: 0:07:23  lr: 0.000931  min_lr: 0.000931  loss: 5.1120 (5.1612)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9055 (4.5568)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [5]  [1000/2502]  eta: 0:06:30  lr: 0.000945  min_lr: 0.000945  loss: 5.5540 (5.1527)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1511 (4.5032)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [5]  [1200/2502]  eta: 0:05:37  lr: 0.000959  min_lr: 0.000959  loss: 5.2469 (5.1374)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7610 (4.4257)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [5]  [1400/2502]  eta: 0:04:45  lr: 0.000973  min_lr: 0.000973  loss: 5.1530 (5.1273)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6162 (4.4093)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [5]  [1600/2502]  eta: 0:03:53  lr: 0.000987  min_lr: 0.000987  loss: 5.4846 (5.1278)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9879 (4.3767)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [5]  [1800/2502]  eta: 0:03:01  lr: 0.001001  min_lr: 0.001001  loss: 4.6839 (5.1196)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2956 (4.3468)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [5]  [2000/2502]  eta: 0:02:09  lr: 0.001015  min_lr: 0.001015  loss: 5.5011 (5.1133)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6659 (4.3155)  time: 0.2606  data: 0.0004  max mem: 18975
Epoch: [5]  [2200/2502]  eta: 0:01:18  lr: 0.001029  min_lr: 0.001029  loss: 5.5048 (5.1036)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5867 (4.2658)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [5]  [2400/2502]  eta: 0:00:26  lr: 0.001043  min_lr: 0.001043  loss: 4.3453 (5.0995)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0743 (4.2559)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [5]  [2501/2502]  eta: 0:00:00  lr: 0.001050  min_lr: 0.001050  loss: 4.3351 (5.0932)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4299 (4.2341)  time: 0.2306  data: 0.0011  max mem: 18975
Epoch: [5] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.001050  min_lr: 0.001050  loss: 4.3351 (5.0789)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4299 (4.2341)
Test:  [ 0/50]  eta: 0:03:15  loss: 1.8490 (1.8490)  acc1: 60.0000 (60.0000)  acc5: 84.0000 (84.0000)  time: 3.9135  data: 3.7723  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 2.2965 (2.3037)  acc1: 49.2000 (50.1091)  acc5: 77.2000 (77.2727)  time: 0.6249  data: 0.5037  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 2.2622 (2.2643)  acc1: 49.2000 (51.2190)  acc5: 78.4000 (78.1333)  time: 0.2670  data: 0.1475  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 2.4005 (2.4986)  acc1: 46.4000 (47.6000)  acc5: 72.4000 (73.5613)  time: 0.2093  data: 0.0899  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 3.0841 (2.6458)  acc1: 38.4000 (45.1610)  acc5: 62.0000 (70.7220)  time: 0.1767  data: 0.0575  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 3.0047 (2.6542)  acc1: 38.4000 (44.7440)  acc5: 62.8000 (70.4400)  time: 0.1766  data: 0.0574  max mem: 18975
Test: Total time: 0:00:14 (0.2801 s / it)
* Acc@1 44.646 Acc@5 70.396 loss 2.657
Accuracy of the model on the 50000 test images: 44.6%
Max accuracy: 44.65%
Epoch: [6]  [   0/2502]  eta: 2:27:25  lr: 0.001050  min_lr: 0.001050  loss: 4.1591 (4.1591)  weight_decay: 0.0500 (0.0500)  time: 3.5354  data: 3.2261  max mem: 18975
Epoch: [6]  [ 200/2502]  eta: 0:10:35  lr: 0.001064  min_lr: 0.001064  loss: 5.2713 (4.9621)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3845 (3.6034)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [6]  [ 400/2502]  eta: 0:09:20  lr: 0.001078  min_lr: 0.001078  loss: 4.5506 (4.9532)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2337 (3.5447)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [6]  [ 600/2502]  eta: 0:08:20  lr: 0.001092  min_lr: 0.001092  loss: 5.2370 (4.9362)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2841 (3.5698)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [6]  [ 800/2502]  eta: 0:07:25  lr: 0.001106  min_lr: 0.001106  loss: 4.9540 (4.9236)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1477 (3.5063)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [6]  [1000/2502]  eta: 0:06:31  lr: 0.001120  min_lr: 0.001120  loss: 5.1049 (4.9230)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4475 (3.5024)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [6]  [1200/2502]  eta: 0:05:38  lr: 0.001134  min_lr: 0.001134  loss: 4.9826 (4.9160)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7663 (3.5388)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [6]  [1400/2502]  eta: 0:04:46  lr: 0.001148  min_lr: 0.001148  loss: 5.2925 (4.9260)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1905 (3.5120)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [6]  [1600/2502]  eta: 0:03:54  lr: 0.001162  min_lr: 0.001162  loss: 4.6704 (4.9179)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9091 (3.5117)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [6]  [1800/2502]  eta: 0:03:01  lr: 0.001176  min_lr: 0.001176  loss: 4.2624 (4.9125)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2042 (3.4841)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [6]  [2000/2502]  eta: 0:02:10  lr: 0.001190  min_lr: 0.001190  loss: 4.4843 (4.9108)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0890 (3.4634)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [6]  [2200/2502]  eta: 0:01:18  lr: 0.001204  min_lr: 0.001204  loss: 5.3086 (4.9132)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9180 (3.4221)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [6]  [2400/2502]  eta: 0:00:26  lr: 0.001218  min_lr: 0.001218  loss: 4.9036 (4.9064)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7166 (3.3959)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [6]  [2501/2502]  eta: 0:00:00  lr: 0.001225  min_lr: 0.001225  loss: 5.1809 (4.8989)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3813 (3.3925)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [6] Total time: 0:10:46 (0.2586 s / it)
Averaged stats: lr: 0.001225  min_lr: 0.001225  loss: 5.1809 (4.9173)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3813 (3.3925)
Test:  [ 0/50]  eta: 0:02:24  loss: 1.9472 (1.9472)  acc1: 67.6000 (67.6000)  acc5: 86.4000 (86.4000)  time: 2.8918  data: 2.7471  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 2.2592 (2.2383)  acc1: 55.2000 (54.9818)  acc5: 78.4000 (80.8000)  time: 0.5556  data: 0.4336  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 2.1713 (2.1729)  acc1: 55.2000 (56.1714)  acc5: 81.2000 (81.1619)  time: 0.2964  data: 0.1770  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 2.2986 (2.3570)  acc1: 50.4000 (53.0065)  acc5: 74.4000 (77.3806)  time: 0.2190  data: 0.0997  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 2.8887 (2.4922)  acc1: 42.8000 (50.1463)  acc5: 66.0000 (74.6634)  time: 0.1710  data: 0.0519  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.8806 (2.5123)  acc1: 42.4000 (49.5520)  acc5: 66.8000 (74.3200)  time: 0.1709  data: 0.0518  max mem: 18975
Test: Total time: 0:00:13 (0.2691 s / it)
* Acc@1 49.184 Acc@5 74.398 loss 2.501
Accuracy of the model on the 50000 test images: 49.2%
Max accuracy: 49.18%
Epoch: [7]  [   0/2502]  eta: 2:06:42  lr: 0.001225  min_lr: 0.001225  loss: 4.2947 (4.2947)  weight_decay: 0.0500 (0.0500)  time: 3.0387  data: 2.7389  max mem: 18975
Epoch: [7]  [ 200/2502]  eta: 0:10:31  lr: 0.001239  min_lr: 0.001239  loss: 5.2387 (4.8473)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0442 (3.1862)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [7]  [ 400/2502]  eta: 0:09:18  lr: 0.001253  min_lr: 0.001253  loss: 4.8943 (4.8441)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7386 (3.1776)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [7]  [ 600/2502]  eta: 0:08:19  lr: 0.001267  min_lr: 0.001267  loss: 4.7441 (4.8278)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8131 (3.1667)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [7]  [ 800/2502]  eta: 0:07:24  lr: 0.001281  min_lr: 0.001281  loss: 4.7311 (4.8241)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4998 (3.0651)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [7]  [1000/2502]  eta: 0:06:31  lr: 0.001295  min_lr: 0.001295  loss: 4.6722 (4.8028)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9872 (3.0673)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [7]  [1200/2502]  eta: 0:05:38  lr: 0.001309  min_lr: 0.001309  loss: 5.1139 (4.7987)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9577 (3.0400)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [7]  [1400/2502]  eta: 0:04:45  lr: 0.001323  min_lr: 0.001323  loss: 4.6980 (4.7994)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7597 (3.0212)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [7]  [1600/2502]  eta: 0:03:53  lr: 0.001337  min_lr: 0.001337  loss: 4.0992 (4.7857)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7564 (3.0008)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [7]  [1800/2502]  eta: 0:03:01  lr: 0.001351  min_lr: 0.001351  loss: 5.3453 (4.7836)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7759 (2.9762)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [7]  [2000/2502]  eta: 0:02:09  lr: 0.001365  min_lr: 0.001365  loss: 4.1051 (4.7729)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2161 (2.9435)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [7]  [2200/2502]  eta: 0:01:18  lr: 0.001379  min_lr: 0.001379  loss: 4.6631 (4.7742)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4339 (2.9272)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [7]  [2400/2502]  eta: 0:00:26  lr: 0.001393  min_lr: 0.001393  loss: 4.8244 (4.7751)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3742 (2.8968)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [7]  [2501/2502]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 4.9145 (4.7739)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5518 (2.8961)  time: 0.2310  data: 0.0008  max mem: 18975
Epoch: [7] Total time: 0:10:47 (0.2586 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 4.9145 (4.7799)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5518 (2.8961)
Test:  [ 0/50]  eta: 0:02:50  loss: 1.6964 (1.6964)  acc1: 64.8000 (64.8000)  acc5: 86.4000 (86.4000)  time: 3.4007  data: 3.2561  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.9770 (1.9581)  acc1: 58.0000 (56.9455)  acc5: 81.6000 (82.6182)  time: 0.5285  data: 0.4042  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.9242 (1.9119)  acc1: 58.0000 (57.8857)  acc5: 82.4000 (83.6952)  time: 0.2270  data: 0.1061  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 2.0462 (2.1388)  acc1: 51.6000 (54.2710)  acc5: 77.2000 (79.6903)  time: 0.2005  data: 0.0807  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 2.7074 (2.2786)  acc1: 43.6000 (51.5805)  acc5: 69.2000 (77.1610)  time: 0.1821  data: 0.0619  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.6885 (2.2960)  acc1: 44.8000 (51.4560)  acc5: 69.6000 (76.7280)  time: 0.1669  data: 0.0467  max mem: 18975
Test: Total time: 0:00:13 (0.2613 s / it)
* Acc@1 51.462 Acc@5 76.462 loss 2.300
Accuracy of the model on the 50000 test images: 51.5%
Max accuracy: 51.46%
Epoch: [8]  [   0/2502]  eta: 2:04:33  lr: 0.001400  min_lr: 0.001400  loss: 4.1026 (4.1026)  weight_decay: 0.0500 (0.0500)  time: 2.9869  data: 2.7015  max mem: 18975
Epoch: [8]  [ 200/2502]  eta: 0:10:31  lr: 0.001414  min_lr: 0.001414  loss: 4.0652 (4.6207)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8730 (2.5872)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [8]  [ 400/2502]  eta: 0:09:19  lr: 0.001428  min_lr: 0.001428  loss: 4.4247 (4.6394)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6783 (2.6455)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [8]  [ 600/2502]  eta: 0:08:20  lr: 0.001442  min_lr: 0.001442  loss: 4.4765 (4.6619)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7481 (2.6638)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [8]  [ 800/2502]  eta: 0:07:25  lr: 0.001456  min_lr: 0.001456  loss: 4.1615 (4.6418)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1510 (2.5909)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [8]  [1000/2502]  eta: 0:06:31  lr: 0.001470  min_lr: 0.001470  loss: 4.5646 (4.6464)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3662 (2.5983)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [8]  [1200/2502]  eta: 0:05:38  lr: 0.001484  min_lr: 0.001484  loss: 4.9092 (4.6698)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2159 (2.5502)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [8]  [1400/2502]  eta: 0:04:46  lr: 0.001498  min_lr: 0.001498  loss: 4.0980 (4.6610)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2336 (inf)  time: 0.2611  data: 0.0004  max mem: 18975
Epoch: [8]  [1600/2502]  eta: 0:03:53  lr: 0.001512  min_lr: 0.001512  loss: 4.6171 (4.6633)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3957 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [8]  [1800/2502]  eta: 0:03:01  lr: 0.001526  min_lr: 0.001526  loss: 4.6693 (4.6498)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1628 (inf)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [8]  [2000/2502]  eta: 0:02:09  lr: 0.001540  min_lr: 0.001540  loss: 4.8173 (4.6491)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5910 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [8]  [2200/2502]  eta: 0:01:18  lr: 0.001554  min_lr: 0.001554  loss: 5.3417 (4.6557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8246 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [8]  [2400/2502]  eta: 0:00:26  lr: 0.001568  min_lr: 0.001568  loss: 4.6912 (4.6527)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2533 (inf)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [8]  [2501/2502]  eta: 0:00:00  lr: 0.001575  min_lr: 0.001575  loss: 4.9177 (4.6495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9645 (inf)  time: 0.2308  data: 0.0008  max mem: 18975
Epoch: [8] Total time: 0:10:47 (0.2586 s / it)
Averaged stats: lr: 0.001575  min_lr: 0.001575  loss: 4.9177 (4.6577)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9645 (inf)
Test:  [ 0/50]  eta: 0:02:13  loss: 1.6098 (1.6098)  acc1: 67.6000 (67.6000)  acc5: 86.0000 (86.0000)  time: 2.6690  data: 2.5179  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.8119 (1.8344)  acc1: 60.8000 (60.6182)  acc5: 84.8000 (83.6727)  time: 0.4889  data: 0.3667  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.7659 (1.7857)  acc1: 60.8000 (60.8000)  acc5: 85.2000 (84.8191)  time: 0.2381  data: 0.1190  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.9857 (2.0019)  acc1: 53.2000 (57.5097)  acc5: 79.6000 (81.2645)  time: 0.2566  data: 0.1375  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.5470 (2.1445)  acc1: 46.8000 (54.7512)  acc5: 71.2000 (78.5854)  time: 0.2488  data: 0.1297  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.5078 (2.1564)  acc1: 47.2000 (54.4720)  acc5: 71.6000 (78.4160)  time: 0.1720  data: 0.0531  max mem: 18975
Test: Total time: 0:00:13 (0.2781 s / it)
* Acc@1 54.248 Acc@5 78.554 loss 2.151
Accuracy of the model on the 50000 test images: 54.2%
Max accuracy: 54.25%
Epoch: [9]  [   0/2502]  eta: 1:49:33  lr: 0.001575  min_lr: 0.001575  loss: 5.0789 (5.0789)  weight_decay: 0.0500 (0.0500)  time: 2.6274  data: 2.3521  max mem: 18975
Epoch: [9]  [ 200/2502]  eta: 0:10:24  lr: 0.001589  min_lr: 0.001589  loss: 4.2025 (4.5900)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0468 (2.1946)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [9]  [ 400/2502]  eta: 0:09:16  lr: 0.001603  min_lr: 0.001603  loss: 4.3374 (4.6324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9240 (2.2036)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [9]  [ 600/2502]  eta: 0:08:19  lr: 0.001617  min_lr: 0.001617  loss: 4.1185 (4.6160)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0446 (2.2116)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [9]  [ 800/2502]  eta: 0:07:24  lr: 0.001631  min_lr: 0.001631  loss: 4.4060 (4.6033)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9082 (2.1891)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [9]  [1000/2502]  eta: 0:06:30  lr: 0.001645  min_lr: 0.001645  loss: 4.2922 (4.5767)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9120 (2.1453)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [9]  [1200/2502]  eta: 0:05:38  lr: 0.001659  min_lr: 0.001659  loss: 4.3411 (4.5737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8557 (2.1383)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [9]  [1400/2502]  eta: 0:04:45  lr: 0.001673  min_lr: 0.001673  loss: 4.7092 (4.5813)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9330 (2.1246)  time: 0.2582  data: 0.0005  max mem: 18975
Epoch: [9]  [1600/2502]  eta: 0:03:53  lr: 0.001687  min_lr: 0.001687  loss: 4.2120 (4.5834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6180 (2.0981)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [9]  [1800/2502]  eta: 0:03:01  lr: 0.001701  min_lr: 0.001701  loss: 3.9988 (4.5782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8928 (2.0972)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [9]  [2000/2502]  eta: 0:02:09  lr: 0.001715  min_lr: 0.001715  loss: 4.9538 (4.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8633 (2.0983)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [9]  [2200/2502]  eta: 0:01:18  lr: 0.001729  min_lr: 0.001729  loss: 4.0877 (4.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7723 (2.0730)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [9]  [2400/2502]  eta: 0:00:26  lr: 0.001743  min_lr: 0.001743  loss: 4.3705 (4.5859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7412 (2.0572)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [9]  [2501/2502]  eta: 0:00:00  lr: 0.001750  min_lr: 0.001750  loss: 4.6361 (4.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9860 (2.0542)  time: 0.2320  data: 0.0007  max mem: 18975
Epoch: [9] Total time: 0:10:46 (0.2586 s / it)
Averaged stats: lr: 0.001750  min_lr: 0.001750  loss: 4.6361 (4.5744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9860 (2.0542)
Test:  [ 0/50]  eta: 0:03:11  loss: 1.4717 (1.4717)  acc1: 68.4000 (68.4000)  acc5: 89.6000 (89.6000)  time: 3.8231  data: 3.6792  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.7110 (1.7482)  acc1: 64.4000 (62.2545)  acc5: 86.8000 (86.1455)  time: 0.5667  data: 0.4437  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.6870 (1.7142)  acc1: 60.8000 (61.9810)  acc5: 88.0000 (86.9333)  time: 0.2398  data: 0.1199  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.8055 (1.9396)  acc1: 54.4000 (58.8387)  acc5: 78.4000 (83.0452)  time: 0.2323  data: 0.1134  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.4889 (2.0777)  acc1: 49.6000 (56.3220)  acc5: 73.2000 (80.7122)  time: 0.2071  data: 0.0882  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.4791 (2.0990)  acc1: 48.8000 (55.9200)  acc5: 75.2000 (80.4240)  time: 0.1841  data: 0.0653  max mem: 18975
Test: Total time: 0:00:14 (0.2809 s / it)
* Acc@1 56.062 Acc@5 80.284 loss 2.093
Accuracy of the model on the 50000 test images: 56.1%
Max accuracy: 56.06%
Epoch: [10]  [   0/2502]  eta: 1:46:06  lr: 0.001750  min_lr: 0.001750  loss: 5.1704 (5.1704)  weight_decay: 0.0500 (0.0500)  time: 2.5445  data: 2.2781  max mem: 18975
Epoch: [10]  [ 200/2502]  eta: 0:10:22  lr: 0.001764  min_lr: 0.001764  loss: 4.4141 (4.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6376 (1.8077)  time: 0.2636  data: 0.0004  max mem: 18975
Epoch: [10]  [ 400/2502]  eta: 0:09:14  lr: 0.001778  min_lr: 0.001778  loss: 5.0234 (4.5681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7677 (1.8311)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [10]  [ 600/2502]  eta: 0:08:17  lr: 0.001792  min_lr: 0.001792  loss: 4.7828 (4.5589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7775 (1.7967)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [10]  [ 800/2502]  eta: 0:07:23  lr: 0.001806  min_lr: 0.001806  loss: 4.4866 (4.5610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6189 (1.7994)  time: 0.2580  data: 0.0005  max mem: 18975
Epoch: [10]  [1000/2502]  eta: 0:06:30  lr: 0.001820  min_lr: 0.001820  loss: 4.9748 (4.5442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6305 (1.8063)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [10]  [1200/2502]  eta: 0:05:38  lr: 0.001834  min_lr: 0.001834  loss: 5.0491 (4.5587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6635 (1.7908)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [10]  [1400/2502]  eta: 0:04:45  lr: 0.001848  min_lr: 0.001848  loss: 4.2051 (4.5472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5222 (1.7864)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [10]  [1600/2502]  eta: 0:03:53  lr: 0.001862  min_lr: 0.001862  loss: 4.4939 (4.5379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6538 (1.7773)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [10]  [1800/2502]  eta: 0:03:01  lr: 0.001876  min_lr: 0.001876  loss: 4.4638 (4.5359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5752 (1.7829)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [10]  [2000/2502]  eta: 0:02:09  lr: 0.001890  min_lr: 0.001890  loss: 5.0139 (4.5373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7186 (1.7795)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [10]  [2200/2502]  eta: 0:01:18  lr: 0.001904  min_lr: 0.001904  loss: 4.1344 (4.5309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5363 (1.7678)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [10]  [2400/2502]  eta: 0:00:26  lr: 0.001918  min_lr: 0.001918  loss: 4.0577 (4.5227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6486 (1.7523)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [10]  [2501/2502]  eta: 0:00:00  lr: 0.001925  min_lr: 0.001925  loss: 4.6912 (4.5260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5190 (1.7435)  time: 0.2311  data: 0.0012  max mem: 18975
Epoch: [10] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.001925  min_lr: 0.001925  loss: 4.6912 (4.5250)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5190 (1.7435)
Test:  [ 0/50]  eta: 0:02:37  loss: 1.4528 (1.4528)  acc1: 70.8000 (70.8000)  acc5: 89.2000 (89.2000)  time: 3.1566  data: 3.0095  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.6493 (1.7295)  acc1: 69.2000 (63.3091)  acc5: 85.6000 (86.0727)  time: 0.5285  data: 0.4063  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.6133 (1.6754)  acc1: 64.0000 (63.8667)  acc5: 86.4000 (87.0286)  time: 0.2336  data: 0.1130  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.8059 (1.8816)  acc1: 58.4000 (60.1806)  acc5: 80.0000 (83.1871)  time: 0.2178  data: 0.0965  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.4225 (2.0093)  acc1: 49.6000 (57.5512)  acc5: 73.6000 (81.0146)  time: 0.2484  data: 0.1282  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.4206 (2.0344)  acc1: 49.2000 (56.9920)  acc5: 73.6000 (80.5760)  time: 0.2073  data: 0.0878  max mem: 18975
Test: Total time: 0:00:14 (0.2855 s / it)
* Acc@1 56.990 Acc@5 80.780 loss 2.029
Accuracy of the model on the 50000 test images: 57.0%
Max accuracy: 56.99%
Epoch: [11]  [   0/2502]  eta: 1:25:18  lr: 0.001925  min_lr: 0.001925  loss: 5.3426 (5.3426)  weight_decay: 0.0500 (0.0500)  time: 2.0459  data: 1.7205  max mem: 18975
Epoch: [11]  [ 200/2502]  eta: 0:10:16  lr: 0.001939  min_lr: 0.001939  loss: 4.5467 (4.4961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5309 (1.5673)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [11]  [ 400/2502]  eta: 0:09:10  lr: 0.001953  min_lr: 0.001953  loss: 4.1687 (4.4445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4332 (1.5639)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [11]  [ 600/2502]  eta: 0:08:15  lr: 0.001967  min_lr: 0.001967  loss: 4.7627 (4.4579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4351 (1.5655)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [11]  [ 800/2502]  eta: 0:07:21  lr: 0.001981  min_lr: 0.001981  loss: 4.7681 (4.4681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3000 (1.5403)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [11]  [1000/2502]  eta: 0:06:28  lr: 0.001995  min_lr: 0.001995  loss: 4.2043 (4.4612)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3218 (1.5122)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [11]  [1200/2502]  eta: 0:05:36  lr: 0.002009  min_lr: 0.002009  loss: 4.8185 (4.4605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4010 (1.4911)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [11]  [1400/2502]  eta: 0:04:44  lr: 0.002023  min_lr: 0.002023  loss: 4.9035 (4.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2770 (1.4825)  time: 0.2576  data: 0.0003  max mem: 18975
Epoch: [11]  [1600/2502]  eta: 0:03:52  lr: 0.002037  min_lr: 0.002037  loss: 4.1372 (4.4651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2910 (1.4703)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [11]  [1800/2502]  eta: 0:03:01  lr: 0.002051  min_lr: 0.002051  loss: 4.9860 (4.4651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2092 (1.4507)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [11]  [2000/2502]  eta: 0:02:09  lr: 0.002065  min_lr: 0.002065  loss: 4.4746 (4.4530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1792 (1.4448)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [11]  [2200/2502]  eta: 0:01:17  lr: 0.002079  min_lr: 0.002079  loss: 3.9855 (4.4490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3021 (1.4318)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [11]  [2400/2502]  eta: 0:00:26  lr: 0.002093  min_lr: 0.002093  loss: 4.1456 (4.4506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4826 (1.4238)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [11]  [2501/2502]  eta: 0:00:00  lr: 0.002100  min_lr: 0.002100  loss: 3.8611 (4.4477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2712 (1.4199)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [11] Total time: 0:10:44 (0.2577 s / it)
Averaged stats: lr: 0.002100  min_lr: 0.002100  loss: 3.8611 (4.4470)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2712 (1.4199)
Test:  [ 0/50]  eta: 0:02:43  loss: 1.2728 (1.2728)  acc1: 74.0000 (74.0000)  acc5: 91.6000 (91.6000)  time: 3.2634  data: 3.1155  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.6235 (1.6430)  acc1: 67.2000 (64.6909)  acc5: 88.0000 (87.7818)  time: 0.5333  data: 0.4108  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.5222 (1.5955)  acc1: 66.4000 (65.3905)  acc5: 88.0000 (88.3810)  time: 0.2351  data: 0.1156  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.7987 (1.7850)  acc1: 58.8000 (62.1419)  acc5: 81.6000 (85.0323)  time: 0.2177  data: 0.0985  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.2455 (1.9022)  acc1: 51.6000 (59.5024)  acc5: 75.6000 (82.6732)  time: 0.2280  data: 0.1088  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.2392 (1.9320)  acc1: 51.6000 (58.8720)  acc5: 76.0000 (82.1360)  time: 0.1967  data: 0.0776  max mem: 18975
Test: Total time: 0:00:13 (0.2740 s / it)
* Acc@1 58.966 Acc@5 82.324 loss 1.927
Accuracy of the model on the 50000 test images: 59.0%
Max accuracy: 58.97%
Epoch: [12]  [   0/2502]  eta: 1:13:54  lr: 0.002100  min_lr: 0.002100  loss: 4.6086 (4.6086)  weight_decay: 0.0500 (0.0500)  time: 1.7723  data: 1.5077  max mem: 18975
Epoch: [12]  [ 200/2502]  eta: 0:10:14  lr: 0.002114  min_lr: 0.002114  loss: 4.6474 (4.4598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1110 (1.2101)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [12]  [ 400/2502]  eta: 0:09:10  lr: 0.002128  min_lr: 0.002128  loss: 4.0212 (4.4566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1930 (1.2086)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [12]  [ 600/2502]  eta: 0:08:14  lr: 0.002142  min_lr: 0.002142  loss: 4.6832 (4.4618)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2673 (1.2265)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [12]  [ 800/2502]  eta: 0:07:21  lr: 0.002156  min_lr: 0.002156  loss: 4.6319 (4.4389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0531 (1.2263)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [12]  [1000/2502]  eta: 0:06:29  lr: 0.002170  min_lr: 0.002170  loss: 4.5740 (4.4216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1697 (1.2136)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [12]  [1200/2502]  eta: 0:05:36  lr: 0.002184  min_lr: 0.002184  loss: 4.6179 (4.4168)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1826 (1.2103)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [12]  [1400/2502]  eta: 0:04:44  lr: 0.002198  min_lr: 0.002198  loss: 4.5663 (4.4187)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1753 (1.2034)  time: 0.2634  data: 0.0004  max mem: 18975
Epoch: [12]  [1600/2502]  eta: 0:03:52  lr: 0.002212  min_lr: 0.002212  loss: 3.7360 (4.4100)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0969 (1.2065)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [12]  [1800/2502]  eta: 0:03:01  lr: 0.002226  min_lr: 0.002226  loss: 4.8373 (4.4119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.1980)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [12]  [2000/2502]  eta: 0:02:09  lr: 0.002240  min_lr: 0.002240  loss: 4.0376 (4.4140)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1584 (1.2028)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [12]  [2200/2502]  eta: 0:01:17  lr: 0.002254  min_lr: 0.002254  loss: 4.4975 (4.4087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9966 (1.1912)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [12]  [2400/2502]  eta: 0:00:26  lr: 0.002268  min_lr: 0.002268  loss: 4.1410 (4.4045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0055 (1.1795)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [12]  [2501/2502]  eta: 0:00:00  lr: 0.002275  min_lr: 0.002275  loss: 4.6717 (4.4038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0726 (1.1765)  time: 0.2309  data: 0.0007  max mem: 18975
Epoch: [12] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002275  min_lr: 0.002275  loss: 4.6717 (4.4014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0726 (1.1765)
Test:  [ 0/50]  eta: 0:03:00  loss: 1.5699 (1.5699)  acc1: 72.8000 (72.8000)  acc5: 90.0000 (90.0000)  time: 3.6177  data: 3.4793  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.6277 (1.6661)  acc1: 66.8000 (65.0909)  acc5: 88.4000 (88.0727)  time: 0.5259  data: 0.4035  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.5614 (1.6294)  acc1: 63.6000 (65.3524)  acc5: 88.4000 (88.6095)  time: 0.2064  data: 0.0863  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.7453 (1.8247)  acc1: 59.6000 (62.1806)  acc5: 82.8000 (85.1226)  time: 0.2336  data: 0.1143  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.3301 (1.9593)  acc1: 52.0000 (59.6000)  acc5: 75.6000 (82.7512)  time: 0.2480  data: 0.1287  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.3384 (1.9848)  acc1: 52.0000 (59.1600)  acc5: 76.0000 (82.3680)  time: 0.2105  data: 0.0913  max mem: 18975
Test: Total time: 0:00:14 (0.2831 s / it)
* Acc@1 59.396 Acc@5 82.838 loss 1.973
Accuracy of the model on the 50000 test images: 59.4%
Max accuracy: 59.40%
Epoch: [13]  [   0/2502]  eta: 1:28:17  lr: 0.002275  min_lr: 0.002275  loss: 5.1857 (5.1857)  weight_decay: 0.0500 (0.0500)  time: 2.1171  data: 1.8386  max mem: 18975
Epoch: [13]  [ 200/2502]  eta: 0:10:18  lr: 0.002289  min_lr: 0.002289  loss: 4.0649 (4.3341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9978 (1.0539)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [13]  [ 400/2502]  eta: 0:09:11  lr: 0.002303  min_lr: 0.002303  loss: 3.8508 (4.3309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0707 (1.0605)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [13]  [ 600/2502]  eta: 0:08:15  lr: 0.002317  min_lr: 0.002317  loss: 4.7020 (4.3573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9691 (1.0638)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [13]  [ 800/2502]  eta: 0:07:22  lr: 0.002331  min_lr: 0.002331  loss: 4.5158 (4.3483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9929 (1.0548)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [13]  [1000/2502]  eta: 0:06:29  lr: 0.002345  min_lr: 0.002345  loss: 4.6729 (4.3473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9318 (1.0441)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [13]  [1200/2502]  eta: 0:05:37  lr: 0.002359  min_lr: 0.002359  loss: 4.8940 (4.3451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9484 (1.0342)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [13]  [1400/2502]  eta: 0:04:45  lr: 0.002373  min_lr: 0.002373  loss: 4.1305 (4.3433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9335 (1.0248)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [13]  [1600/2502]  eta: 0:03:53  lr: 0.002387  min_lr: 0.002387  loss: 4.7706 (4.3512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8893 (1.0129)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [13]  [1800/2502]  eta: 0:03:01  lr: 0.002401  min_lr: 0.002401  loss: 4.2937 (4.3546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8516 (1.0081)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [13]  [2000/2502]  eta: 0:02:09  lr: 0.002415  min_lr: 0.002415  loss: 4.6318 (4.3507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9343 (1.0012)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [13]  [2200/2502]  eta: 0:01:18  lr: 0.002429  min_lr: 0.002429  loss: 4.7989 (4.3537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9803 (1.0033)  time: 0.2578  data: 0.0003  max mem: 18975
Epoch: [13]  [2400/2502]  eta: 0:00:26  lr: 0.002443  min_lr: 0.002443  loss: 4.4039 (4.3529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9416 (0.9948)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [13]  [2501/2502]  eta: 0:00:00  lr: 0.002450  min_lr: 0.002450  loss: 4.4681 (4.3532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9544 (0.9922)  time: 0.2314  data: 0.0009  max mem: 18975
Epoch: [13] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002450  min_lr: 0.002450  loss: 4.4681 (4.3354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9544 (0.9922)
Test:  [ 0/50]  eta: 0:02:37  loss: 1.1463 (1.1463)  acc1: 77.2000 (77.2000)  acc5: 92.8000 (92.8000)  time: 3.1428  data: 2.9963  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.4393 (1.4545)  acc1: 72.4000 (67.7091)  acc5: 90.4000 (90.0364)  time: 0.5062  data: 0.3841  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.3852 (1.4413)  acc1: 66.8000 (67.3905)  acc5: 90.0000 (90.2857)  time: 0.2467  data: 0.1265  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.6401 (1.6569)  acc1: 60.4000 (64.4129)  acc5: 83.6000 (86.8903)  time: 0.2630  data: 0.1430  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.2216 (1.8036)  acc1: 54.8000 (61.5610)  acc5: 77.6000 (84.4781)  time: 0.2633  data: 0.1443  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.1819 (1.8376)  acc1: 54.0000 (61.0400)  acc5: 78.0000 (83.8240)  time: 0.2021  data: 0.0833  max mem: 18975
Test: Total time: 0:00:14 (0.2970 s / it)
* Acc@1 60.680 Acc@5 83.716 loss 1.843
Accuracy of the model on the 50000 test images: 60.7%
Max accuracy: 60.68%
Epoch: [14]  [   0/2502]  eta: 1:09:32  lr: 0.002450  min_lr: 0.002450  loss: 3.3112 (3.3112)  weight_decay: 0.0500 (0.0500)  time: 1.6675  data: 1.3891  max mem: 18975
Epoch: [14]  [ 200/2502]  eta: 0:10:14  lr: 0.002464  min_lr: 0.002464  loss: 4.7567 (4.3711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8576 (0.9420)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [14]  [ 400/2502]  eta: 0:09:10  lr: 0.002478  min_lr: 0.002478  loss: 4.9271 (4.3500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8175 (0.9372)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [14]  [ 600/2502]  eta: 0:08:14  lr: 0.002492  min_lr: 0.002492  loss: 4.4071 (4.3181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8836 (0.9171)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [14]  [ 800/2502]  eta: 0:07:21  lr: 0.002506  min_lr: 0.002506  loss: 4.3444 (4.3072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8298 (0.9279)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [14]  [1000/2502]  eta: 0:06:28  lr: 0.002520  min_lr: 0.002520  loss: 3.5012 (4.2581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8804 (0.9172)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [14]  [1200/2502]  eta: 0:05:36  lr: 0.002534  min_lr: 0.002534  loss: 3.8499 (4.2586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8769 (0.9155)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [14]  [1400/2502]  eta: 0:04:44  lr: 0.002548  min_lr: 0.002548  loss: 3.9786 (4.2527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8410 (0.9094)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [14]  [1600/2502]  eta: 0:03:52  lr: 0.002562  min_lr: 0.002562  loss: 3.6806 (4.2529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9186 (0.9130)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [14]  [1800/2502]  eta: 0:03:01  lr: 0.002576  min_lr: 0.002576  loss: 4.1293 (4.2519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8296 (0.9067)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [14]  [2000/2502]  eta: 0:02:09  lr: 0.002590  min_lr: 0.002590  loss: 3.6636 (4.2511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7678 (0.9009)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [14]  [2200/2502]  eta: 0:01:17  lr: 0.002604  min_lr: 0.002604  loss: 4.1761 (4.2522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8786 (0.8997)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [14]  [2400/2502]  eta: 0:00:26  lr: 0.002618  min_lr: 0.002618  loss: 4.0536 (4.2565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.8942)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [14]  [2501/2502]  eta: 0:00:00  lr: 0.002625  min_lr: 0.002625  loss: 4.5467 (4.2608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7919 (0.8931)  time: 0.2303  data: 0.0009  max mem: 18975
Epoch: [14] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.002625  min_lr: 0.002625  loss: 4.5467 (4.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7919 (0.8931)
Test:  [ 0/50]  eta: 0:03:23  loss: 1.1362 (1.1362)  acc1: 78.8000 (78.8000)  acc5: 92.8000 (92.8000)  time: 4.0744  data: 3.9303  max mem: 18975
Test:  [10/50]  eta: 0:00:25  loss: 1.4741 (1.4659)  acc1: 70.4000 (69.1273)  acc5: 88.4000 (90.0000)  time: 0.6307  data: 0.5076  max mem: 18975
Test:  [20/50]  eta: 0:00:14  loss: 1.4873 (1.4841)  acc1: 68.8000 (68.6857)  acc5: 89.2000 (89.7905)  time: 0.2948  data: 0.1729  max mem: 18975
Test:  [30/50]  eta: 0:00:08  loss: 1.6897 (1.6853)  acc1: 61.6000 (64.8516)  acc5: 83.6000 (86.5161)  time: 0.3009  data: 0.1767  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.1924 (1.8101)  acc1: 54.0000 (62.1561)  acc5: 78.0000 (84.6342)  time: 0.2335  data: 0.1105  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.1415 (1.8392)  acc1: 54.4000 (61.3520)  acc5: 78.4000 (84.0160)  time: 0.2306  data: 0.1103  max mem: 18975
Test: Total time: 0:00:15 (0.3162 s / it)
* Acc@1 60.900 Acc@5 84.130 loss 1.844
Accuracy of the model on the 50000 test images: 60.9%
Max accuracy: 60.90%
Epoch: [15]  [   0/2502]  eta: 1:08:39  lr: 0.002625  min_lr: 0.002625  loss: 4.5985 (4.5985)  weight_decay: 0.0500 (0.0500)  time: 1.6464  data: 1.3713  max mem: 18975
Epoch: [15]  [ 200/2502]  eta: 0:10:12  lr: 0.002639  min_lr: 0.002639  loss: 4.6022 (4.2772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8063 (0.8686)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [15]  [ 400/2502]  eta: 0:09:10  lr: 0.002653  min_lr: 0.002653  loss: 4.6277 (4.2693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.8379)  time: 0.2574  data: 0.0003  max mem: 18975
Epoch: [15]  [ 600/2502]  eta: 0:08:15  lr: 0.002667  min_lr: 0.002667  loss: 4.2832 (4.2595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8439 (0.8473)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [15]  [ 800/2502]  eta: 0:07:21  lr: 0.002681  min_lr: 0.002681  loss: 4.6528 (4.2538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7989 (0.8398)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [15]  [1000/2502]  eta: 0:06:29  lr: 0.002695  min_lr: 0.002695  loss: 4.3687 (4.2605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8817 (0.8412)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [15]  [1200/2502]  eta: 0:05:36  lr: 0.002709  min_lr: 0.002709  loss: 4.6563 (4.2459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7853 (0.8417)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [15]  [1400/2502]  eta: 0:04:44  lr: 0.002723  min_lr: 0.002723  loss: 3.9190 (4.2602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.8402)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [15]  [1600/2502]  eta: 0:03:52  lr: 0.002737  min_lr: 0.002737  loss: 4.2955 (4.2532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (0.8450)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [15]  [1800/2502]  eta: 0:03:01  lr: 0.002751  min_lr: 0.002751  loss: 4.0832 (4.2549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.8397)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [15]  [2000/2502]  eta: 0:02:09  lr: 0.002765  min_lr: 0.002765  loss: 3.8493 (4.2409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9295 (0.8404)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [15]  [2200/2502]  eta: 0:01:17  lr: 0.002779  min_lr: 0.002779  loss: 4.2968 (4.2445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8855 (0.8413)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [15]  [2400/2502]  eta: 0:00:26  lr: 0.002793  min_lr: 0.002793  loss: 4.3030 (4.2448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7844 (0.8386)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [15]  [2501/2502]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.2039 (4.2399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.8396)  time: 0.2308  data: 0.0011  max mem: 18975
Epoch: [15] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.2039 (4.2496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.8396)
Test:  [ 0/50]  eta: 0:02:54  loss: 1.0840 (1.0840)  acc1: 80.0000 (80.0000)  acc5: 95.2000 (95.2000)  time: 3.4943  data: 3.3517  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.3591 (1.4340)  acc1: 73.6000 (69.6727)  acc5: 91.6000 (90.5455)  time: 0.5894  data: 0.4655  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.3991 (1.4362)  acc1: 67.2000 (69.3905)  acc5: 90.0000 (90.7048)  time: 0.2983  data: 0.1761  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.5585 (1.6376)  acc1: 60.8000 (65.7161)  acc5: 83.6000 (87.3548)  time: 0.2657  data: 0.1441  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.1165 (1.7674)  acc1: 56.4000 (63.1805)  acc5: 78.8000 (85.2976)  time: 0.2035  data: 0.0822  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.1556 (1.7896)  acc1: 55.6000 (62.5120)  acc5: 79.6000 (84.9840)  time: 0.2175  data: 0.0971  max mem: 18975
Test: Total time: 0:00:14 (0.2998 s / it)
* Acc@1 62.286 Acc@5 84.720 loss 1.795
Accuracy of the model on the 50000 test images: 62.3%
Max accuracy: 62.29%
Epoch: [16]  [   0/2502]  eta: 1:23:48  lr: 0.002800  min_lr: 0.002800  loss: 3.1332 (3.1332)  weight_decay: 0.0500 (0.0500)  time: 2.0099  data: 1.7325  max mem: 18975
Epoch: [16]  [ 200/2502]  eta: 0:10:16  lr: 0.002814  min_lr: 0.002814  loss: 4.1364 (4.0615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7195 (0.7811)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [16]  [ 400/2502]  eta: 0:09:12  lr: 0.002828  min_lr: 0.002828  loss: 4.1094 (4.1225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7708 (0.8075)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [16]  [ 600/2502]  eta: 0:08:15  lr: 0.002842  min_lr: 0.002842  loss: 3.9137 (4.1110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7692 (0.7968)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [16]  [ 800/2502]  eta: 0:07:22  lr: 0.002856  min_lr: 0.002856  loss: 3.9622 (4.1447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7605 (0.7871)  time: 0.2626  data: 0.0005  max mem: 18975
Epoch: [16]  [1000/2502]  eta: 0:06:29  lr: 0.002870  min_lr: 0.002870  loss: 4.0714 (4.1446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7592 (0.7861)  time: 0.2565  data: 0.0005  max mem: 18975
Epoch: [16]  [1200/2502]  eta: 0:05:37  lr: 0.002884  min_lr: 0.002884  loss: 3.9884 (4.1438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7394 (0.7865)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [16]  [1400/2502]  eta: 0:04:45  lr: 0.002898  min_lr: 0.002898  loss: 4.6356 (4.1694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7574 (0.7832)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [16]  [1600/2502]  eta: 0:03:53  lr: 0.002912  min_lr: 0.002912  loss: 4.2775 (4.1707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7499 (0.7842)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [16]  [1800/2502]  eta: 0:03:01  lr: 0.002926  min_lr: 0.002926  loss: 4.4898 (4.1805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7235 (0.7805)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [16]  [2000/2502]  eta: 0:02:09  lr: 0.002940  min_lr: 0.002940  loss: 4.1491 (4.1795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7494 (0.7788)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [16]  [2200/2502]  eta: 0:01:17  lr: 0.002954  min_lr: 0.002954  loss: 4.7623 (4.1825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6528 (0.7774)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [16]  [2400/2502]  eta: 0:00:26  lr: 0.002968  min_lr: 0.002968  loss: 4.5107 (4.1906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7121 (0.7771)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [16]  [2501/2502]  eta: 0:00:00  lr: 0.002975  min_lr: 0.002975  loss: 4.1253 (4.1850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (0.7787)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [16] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002975  min_lr: 0.002975  loss: 4.1253 (4.1947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (0.7787)
Test:  [ 0/50]  eta: 0:02:33  loss: 1.1839 (1.1839)  acc1: 80.8000 (80.8000)  acc5: 92.4000 (92.4000)  time: 3.0647  data: 2.9253  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.3704 (1.4410)  acc1: 70.8000 (69.7091)  acc5: 91.2000 (91.2727)  time: 0.5042  data: 0.3811  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.4061 (1.4448)  acc1: 68.4000 (69.3143)  acc5: 91.2000 (91.1810)  time: 0.2261  data: 0.1056  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.6053 (1.6332)  acc1: 60.8000 (65.8839)  acc5: 88.4000 (88.3742)  time: 0.2190  data: 0.0997  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.1526 (1.7672)  acc1: 55.6000 (63.1902)  acc5: 79.2000 (85.9122)  time: 0.2440  data: 0.1251  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.1526 (1.7935)  acc1: 56.4000 (62.6160)  acc5: 79.2000 (85.4080)  time: 0.2273  data: 0.1085  max mem: 18975
Test: Total time: 0:00:14 (0.2853 s / it)
* Acc@1 62.308 Acc@5 85.060 loss 1.802
Accuracy of the model on the 50000 test images: 62.3%
Max accuracy: 62.31%
Epoch: [17]  [   0/2502]  eta: 1:09:56  lr: 0.002975  min_lr: 0.002975  loss: 3.2260 (3.2260)  weight_decay: 0.0500 (0.0500)  time: 1.6772  data: 1.3997  max mem: 18975
Epoch: [17]  [ 200/2502]  eta: 0:10:12  lr: 0.002989  min_lr: 0.002989  loss: 4.7474 (4.1350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (0.7641)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [17]  [ 400/2502]  eta: 0:09:09  lr: 0.003003  min_lr: 0.003003  loss: 4.2981 (4.1342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.7336)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [17]  [ 600/2502]  eta: 0:08:14  lr: 0.003017  min_lr: 0.003017  loss: 4.6230 (4.1611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8640 (0.7637)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [17]  [ 800/2502]  eta: 0:07:20  lr: 0.003031  min_lr: 0.003031  loss: 4.3791 (4.1663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6844 (0.7659)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [17]  [1000/2502]  eta: 0:06:28  lr: 0.003045  min_lr: 0.003045  loss: 4.0999 (4.1813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.7648)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [17]  [1200/2502]  eta: 0:05:36  lr: 0.003059  min_lr: 0.003059  loss: 4.3937 (4.1748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6788 (0.7660)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [17]  [1400/2502]  eta: 0:04:44  lr: 0.003073  min_lr: 0.003073  loss: 4.4692 (4.1721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7291 (0.7641)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [17]  [1600/2502]  eta: 0:03:52  lr: 0.003087  min_lr: 0.003087  loss: 3.9228 (4.1772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.7666)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [17]  [1800/2502]  eta: 0:03:01  lr: 0.003101  min_lr: 0.003101  loss: 4.4268 (4.1774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7025 (0.7596)  time: 0.2597  data: 0.0004  max mem: 18975
Epoch: [17]  [2000/2502]  eta: 0:02:09  lr: 0.003115  min_lr: 0.003115  loss: 4.1311 (4.1687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7495 (0.7602)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [17]  [2200/2502]  eta: 0:01:17  lr: 0.003129  min_lr: 0.003129  loss: 4.6735 (4.1685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7248 (0.7594)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [17]  [2400/2502]  eta: 0:00:26  lr: 0.003143  min_lr: 0.003143  loss: 4.1802 (4.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7548 (0.7608)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [17]  [2501/2502]  eta: 0:00:00  lr: 0.003150  min_lr: 0.003150  loss: 4.1241 (4.1678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.7597)  time: 0.2316  data: 0.0009  max mem: 18975
Epoch: [17] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.003150  min_lr: 0.003150  loss: 4.1241 (4.1597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.7597)
Test:  [ 0/50]  eta: 0:02:39  loss: 1.1827 (1.1827)  acc1: 81.6000 (81.6000)  acc5: 94.0000 (94.0000)  time: 3.1858  data: 3.0388  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.4964 (1.4576)  acc1: 72.4000 (71.0909)  acc5: 90.8000 (91.0909)  time: 0.5727  data: 0.4494  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.3904 (1.4567)  acc1: 70.4000 (70.5714)  acc5: 90.8000 (91.1810)  time: 0.2950  data: 0.1745  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.6359 (1.6343)  acc1: 62.4000 (67.1484)  acc5: 86.0000 (87.9226)  time: 0.2809  data: 0.1613  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.0732 (1.7531)  acc1: 56.8000 (64.3805)  acc5: 79.2000 (85.9610)  time: 0.2362  data: 0.1172  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.0660 (1.7825)  acc1: 56.8000 (63.7440)  acc5: 80.0000 (85.6000)  time: 0.2395  data: 0.1206  max mem: 18975
Test: Total time: 0:00:15 (0.3048 s / it)
* Acc@1 63.368 Acc@5 85.574 loss 1.785
Accuracy of the model on the 50000 test images: 63.4%
Max accuracy: 63.37%
Epoch: [18]  [   0/2502]  eta: 1:13:04  lr: 0.003150  min_lr: 0.003150  loss: 4.5613 (4.5613)  weight_decay: 0.0500 (0.0500)  time: 1.7524  data: 1.4780  max mem: 18975
Epoch: [18]  [ 200/2502]  eta: 0:10:14  lr: 0.003164  min_lr: 0.003164  loss: 4.1116 (4.1447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7061 (0.7074)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [18]  [ 400/2502]  eta: 0:09:10  lr: 0.003178  min_lr: 0.003178  loss: 3.6206 (4.1715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7355)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [18]  [ 600/2502]  eta: 0:08:15  lr: 0.003192  min_lr: 0.003192  loss: 4.1576 (4.1610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.7167)  time: 0.2632  data: 0.0004  max mem: 18975
Epoch: [18]  [ 800/2502]  eta: 0:07:22  lr: 0.003206  min_lr: 0.003206  loss: 3.3865 (4.1299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7495 (0.7217)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [18]  [1000/2502]  eta: 0:06:29  lr: 0.003220  min_lr: 0.003220  loss: 4.6112 (4.1488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6540 (0.7203)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [18]  [1200/2502]  eta: 0:05:37  lr: 0.003234  min_lr: 0.003234  loss: 4.1166 (4.1345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6961 (0.7234)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [18]  [1400/2502]  eta: 0:04:45  lr: 0.003248  min_lr: 0.003248  loss: 3.7222 (4.1472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (0.7251)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [18]  [1600/2502]  eta: 0:03:53  lr: 0.003262  min_lr: 0.003262  loss: 4.0061 (4.1543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7429 (0.7251)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [18]  [1800/2502]  eta: 0:03:01  lr: 0.003276  min_lr: 0.003276  loss: 3.7597 (4.1480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (0.7279)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [18]  [2000/2502]  eta: 0:02:09  lr: 0.003290  min_lr: 0.003290  loss: 3.5475 (4.1507)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [18]  [2200/2502]  eta: 0:01:17  lr: 0.003304  min_lr: 0.003304  loss: 3.8971 (4.1516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6451 (nan)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [18]  [2400/2502]  eta: 0:00:26  lr: 0.003318  min_lr: 0.003318  loss: 4.4426 (4.1518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7296 (nan)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [18]  [2501/2502]  eta: 0:00:00  lr: 0.003325  min_lr: 0.003325  loss: 4.2626 (4.1525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7247 (nan)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [18] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003325  min_lr: 0.003325  loss: 4.2626 (4.1455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7247 (nan)
Test:  [ 0/50]  eta: 0:03:11  loss: 1.0589 (1.0589)  acc1: 81.6000 (81.6000)  acc5: 93.6000 (93.6000)  time: 3.8214  data: 3.6685  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.4251 (1.4271)  acc1: 71.6000 (69.8182)  acc5: 90.8000 (90.8000)  time: 0.5794  data: 0.4551  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.3418 (1.4016)  acc1: 67.2000 (69.7143)  acc5: 90.8000 (91.3524)  time: 0.2380  data: 0.1147  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.6212 (1.6056)  acc1: 62.8000 (66.2839)  acc5: 86.0000 (88.0387)  time: 0.2429  data: 0.1205  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.0687 (1.7236)  acc1: 56.4000 (63.7268)  acc5: 78.4000 (86.1659)  time: 0.2468  data: 0.1274  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.0429 (1.7506)  acc1: 56.4000 (63.2160)  acc5: 80.8000 (85.6800)  time: 0.2307  data: 0.1118  max mem: 18975
Test: Total time: 0:00:14 (0.2933 s / it)
* Acc@1 63.060 Acc@5 85.534 loss 1.752
Accuracy of the model on the 50000 test images: 63.1%
Max accuracy: 63.37%
Epoch: [19]  [   0/2502]  eta: 1:24:16  lr: 0.003325  min_lr: 0.003325  loss: 3.7269 (3.7269)  weight_decay: 0.0500 (0.0500)  time: 2.0210  data: 1.4665  max mem: 18975
Epoch: [19]  [ 200/2502]  eta: 0:10:20  lr: 0.003339  min_lr: 0.003339  loss: 4.0235 (4.0981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6202 (0.7008)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [19]  [ 400/2502]  eta: 0:09:14  lr: 0.003353  min_lr: 0.003353  loss: 3.4788 (4.0817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7426 (0.7160)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [19]  [ 600/2502]  eta: 0:08:17  lr: 0.003367  min_lr: 0.003367  loss: 4.4911 (4.0990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7035 (0.7278)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [19]  [ 800/2502]  eta: 0:07:23  lr: 0.003381  min_lr: 0.003381  loss: 4.4389 (4.1066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6635 (0.7147)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [19]  [1000/2502]  eta: 0:06:30  lr: 0.003395  min_lr: 0.003395  loss: 4.4378 (4.1090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6859 (0.7089)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [19]  [1200/2502]  eta: 0:05:37  lr: 0.003409  min_lr: 0.003409  loss: 4.3550 (4.1111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.7197)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [19]  [1400/2502]  eta: 0:04:45  lr: 0.003423  min_lr: 0.003423  loss: 3.8250 (4.1090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6371 (0.7145)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [19]  [1600/2502]  eta: 0:03:53  lr: 0.003437  min_lr: 0.003437  loss: 4.4530 (4.1119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.7192)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [19]  [1800/2502]  eta: 0:03:01  lr: 0.003451  min_lr: 0.003451  loss: 4.4394 (4.1114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6757 (0.7166)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [19]  [2000/2502]  eta: 0:02:09  lr: 0.003465  min_lr: 0.003465  loss: 4.1006 (4.1055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6539 (0.7133)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [19]  [2200/2502]  eta: 0:01:18  lr: 0.003479  min_lr: 0.003479  loss: 4.1124 (4.0959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6520 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [19]  [2400/2502]  eta: 0:00:26  lr: 0.003493  min_lr: 0.003493  loss: 4.0231 (4.0984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7526 (inf)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [19]  [2501/2502]  eta: 0:00:00  lr: 0.003500  min_lr: 0.003500  loss: 4.3861 (4.0969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7007 (inf)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [19] Total time: 0:10:47 (0.2586 s / it)
Averaged stats: lr: 0.003500  min_lr: 0.003500  loss: 4.3861 (4.1062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7007 (inf)
Test:  [ 0/50]  eta: 0:03:02  loss: 1.1206 (1.1206)  acc1: 80.4000 (80.4000)  acc5: 95.6000 (95.6000)  time: 3.6486  data: 3.5024  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.4694 (1.4763)  acc1: 71.6000 (70.9455)  acc5: 92.4000 (91.4182)  time: 0.5442  data: 0.4210  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.4825 (1.4910)  acc1: 68.0000 (69.7143)  acc5: 91.6000 (91.3524)  time: 0.2239  data: 0.1028  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.6858 (1.6852)  acc1: 63.6000 (66.3742)  acc5: 86.0000 (88.3742)  time: 0.2219  data: 0.1012  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.1008 (1.7898)  acc1: 57.6000 (64.1073)  acc5: 81.2000 (86.8488)  time: 0.2299  data: 0.1095  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.1187 (1.8249)  acc1: 55.6000 (63.0640)  acc5: 81.2000 (86.2400)  time: 0.2015  data: 0.0814  max mem: 18975
Test: Total time: 0:00:13 (0.2798 s / it)
* Acc@1 63.370 Acc@5 85.814 loss 1.827
Accuracy of the model on the 50000 test images: 63.4%
Max accuracy: 63.37%
Epoch: [20]  [   0/2502]  eta: 1:24:37  lr: 0.003500  min_lr: 0.003500  loss: 3.4775 (3.4775)  weight_decay: 0.0500 (0.0500)  time: 2.0295  data: 1.7628  max mem: 18975
Epoch: [20]  [ 200/2502]  eta: 0:10:18  lr: 0.003500  min_lr: 0.003500  loss: 4.0988 (4.0770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5710 (0.6678)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [20]  [ 400/2502]  eta: 0:09:11  lr: 0.003500  min_lr: 0.003500  loss: 3.3898 (4.0170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6941 (0.6897)  time: 0.2555  data: 0.0005  max mem: 18975
Epoch: [20]  [ 600/2502]  eta: 0:08:15  lr: 0.003500  min_lr: 0.003500  loss: 3.9752 (4.0142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6688 (0.6975)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [20]  [ 800/2502]  eta: 0:07:21  lr: 0.003500  min_lr: 0.003500  loss: 3.8837 (4.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6448 (0.7070)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [20]  [1000/2502]  eta: 0:06:29  lr: 0.003500  min_lr: 0.003500  loss: 4.1787 (4.0363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (0.7011)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [20]  [1200/2502]  eta: 0:05:37  lr: 0.003500  min_lr: 0.003500  loss: 4.4239 (4.0388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (0.7016)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [20]  [1400/2502]  eta: 0:04:44  lr: 0.003500  min_lr: 0.003500  loss: 4.1428 (4.0564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6563 (0.6990)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [20]  [1600/2502]  eta: 0:03:53  lr: 0.003500  min_lr: 0.003500  loss: 4.1640 (4.0582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6346 (0.6972)  time: 0.2575  data: 0.0005  max mem: 18975
Epoch: [20]  [1800/2502]  eta: 0:03:01  lr: 0.003500  min_lr: 0.003500  loss: 4.1839 (4.0622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6443 (0.6975)  time: 0.2585  data: 0.0005  max mem: 18975
Epoch: [20]  [2000/2502]  eta: 0:02:09  lr: 0.003500  min_lr: 0.003500  loss: 4.4525 (4.0670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6011 (0.6933)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [20]  [2200/2502]  eta: 0:01:17  lr: 0.003500  min_lr: 0.003500  loss: 4.2989 (4.0730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6711 (0.6954)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [20]  [2400/2502]  eta: 0:00:26  lr: 0.003500  min_lr: 0.003500  loss: 4.0394 (4.0705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6303 (0.6917)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [20]  [2501/2502]  eta: 0:00:00  lr: 0.003500  min_lr: 0.003500  loss: 3.4422 (4.0739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6296 (0.6903)  time: 0.2306  data: 0.0011  max mem: 18975
Epoch: [20] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003500  min_lr: 0.003500  loss: 3.4422 (4.0757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6296 (0.6903)
Test:  [ 0/50]  eta: 0:02:31  loss: 1.1962 (1.1962)  acc1: 82.4000 (82.4000)  acc5: 93.2000 (93.2000)  time: 3.0248  data: 2.8832  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.3563 (1.3708)  acc1: 73.2000 (72.7273)  acc5: 92.8000 (91.7455)  time: 0.4813  data: 0.3583  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.3563 (1.3764)  acc1: 70.8000 (71.7714)  acc5: 92.8000 (92.0000)  time: 0.2237  data: 0.1025  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.5735 (1.5604)  acc1: 66.4000 (68.6323)  acc5: 88.0000 (89.0194)  time: 0.2550  data: 0.1345  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.0051 (1.6669)  acc1: 59.6000 (65.9122)  acc5: 80.8000 (87.2488)  time: 0.2944  data: 0.1751  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.9773 (1.7043)  acc1: 57.6000 (64.9200)  acc5: 81.2000 (86.5600)  time: 0.2218  data: 0.1029  max mem: 18975
Test: Total time: 0:00:14 (0.2966 s / it)
* Acc@1 64.574 Acc@5 86.390 loss 1.703
Accuracy of the model on the 50000 test images: 64.6%
Max accuracy: 64.57%
Epoch: [21]  [   0/2502]  eta: 1:12:30  lr: 0.003500  min_lr: 0.003500  loss: 5.2966 (5.2966)  weight_decay: 0.0500 (0.0500)  time: 1.7388  data: 1.4624  max mem: 18975
Epoch: [21]  [ 200/2502]  eta: 0:10:24  lr: 0.003500  min_lr: 0.003500  loss: 4.3348 (4.1105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.7245)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [21]  [ 400/2502]  eta: 0:09:14  lr: 0.003500  min_lr: 0.003500  loss: 4.1461 (4.1297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.7080)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [21]  [ 600/2502]  eta: 0:08:17  lr: 0.003500  min_lr: 0.003500  loss: 4.1983 (4.1024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6798 (0.7044)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [21]  [ 800/2502]  eta: 0:07:23  lr: 0.003500  min_lr: 0.003500  loss: 4.3457 (4.0903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6831 (0.6995)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [21]  [1000/2502]  eta: 0:06:30  lr: 0.003500  min_lr: 0.003500  loss: 3.6274 (4.0880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6577 (0.6953)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [21]  [1200/2502]  eta: 0:05:37  lr: 0.003500  min_lr: 0.003500  loss: 4.1395 (4.0875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.6970)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [21]  [1400/2502]  eta: 0:04:45  lr: 0.003500  min_lr: 0.003500  loss: 4.1697 (4.0743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.6974)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [21]  [1600/2502]  eta: 0:03:53  lr: 0.003500  min_lr: 0.003500  loss: 4.0175 (4.0601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.6968)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [21]  [1800/2502]  eta: 0:03:01  lr: 0.003500  min_lr: 0.003500  loss: 4.5238 (4.0607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6637 (0.6981)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [21]  [2000/2502]  eta: 0:02:09  lr: 0.003500  min_lr: 0.003500  loss: 4.0856 (4.0596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6440 (0.6935)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [21]  [2200/2502]  eta: 0:01:18  lr: 0.003500  min_lr: 0.003500  loss: 3.9128 (4.0562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.6956)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [21]  [2400/2502]  eta: 0:00:26  lr: 0.003500  min_lr: 0.003500  loss: 4.2396 (4.0500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6498 (0.6958)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [21]  [2501/2502]  eta: 0:00:00  lr: 0.003500  min_lr: 0.003500  loss: 4.3694 (4.0532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.6959)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [21] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003500  min_lr: 0.003500  loss: 4.3694 (4.0437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.6959)
Test:  [ 0/50]  eta: 0:03:17  loss: 1.0595 (1.0595)  acc1: 82.0000 (82.0000)  acc5: 94.8000 (94.8000)  time: 3.9586  data: 3.8129  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.4585 (1.4075)  acc1: 71.6000 (73.0182)  acc5: 91.6000 (91.7455)  time: 0.6014  data: 0.4786  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.4238 (1.4094)  acc1: 71.6000 (72.3429)  acc5: 92.0000 (91.9619)  time: 0.2176  data: 0.0973  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.6348 (1.5732)  acc1: 66.4000 (69.1226)  acc5: 86.8000 (89.4194)  time: 0.1937  data: 0.0736  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 2.0275 (1.6832)  acc1: 58.8000 (66.4000)  acc5: 81.6000 (87.4244)  time: 0.2182  data: 0.0971  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 2.0192 (1.7054)  acc1: 58.0000 (65.7520)  acc5: 81.2000 (87.0400)  time: 0.1913  data: 0.0702  max mem: 18975
Test: Total time: 0:00:14 (0.2856 s / it)
* Acc@1 65.408 Acc@5 86.986 loss 1.711
Accuracy of the model on the 50000 test images: 65.4%
Max accuracy: 65.41%
Epoch: [22]  [   0/2502]  eta: 1:17:06  lr: 0.003500  min_lr: 0.003500  loss: 3.6873 (3.6873)  weight_decay: 0.0500 (0.0500)  time: 1.8493  data: 1.5689  max mem: 18975
Epoch: [22]  [ 200/2502]  eta: 0:10:15  lr: 0.003500  min_lr: 0.003500  loss: 4.4917 (3.9883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6572 (0.7132)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [22]  [ 400/2502]  eta: 0:09:11  lr: 0.003499  min_lr: 0.003499  loss: 4.2684 (4.0429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6427 (0.7206)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [22]  [ 600/2502]  eta: 0:08:15  lr: 0.003499  min_lr: 0.003499  loss: 3.9238 (4.0044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6508 (0.7065)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [22]  [ 800/2502]  eta: 0:07:22  lr: 0.003499  min_lr: 0.003499  loss: 3.4033 (4.0094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6281 (0.7085)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [22]  [1000/2502]  eta: 0:06:30  lr: 0.003499  min_lr: 0.003499  loss: 4.5176 (4.0222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7746 (0.7098)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [22]  [1200/2502]  eta: 0:05:37  lr: 0.003499  min_lr: 0.003499  loss: 4.2116 (4.0276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5969 (0.6986)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [22]  [1400/2502]  eta: 0:04:45  lr: 0.003499  min_lr: 0.003499  loss: 3.5296 (4.0155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.6985)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [22]  [1600/2502]  eta: 0:03:53  lr: 0.003499  min_lr: 0.003499  loss: 3.6789 (4.0054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7392 (0.6982)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [22]  [1800/2502]  eta: 0:03:01  lr: 0.003499  min_lr: 0.003499  loss: 3.4556 (4.0046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6251 (0.6960)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [22]  [2000/2502]  eta: 0:02:09  lr: 0.003499  min_lr: 0.003499  loss: 3.8469 (3.9990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6649 (0.6949)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [22]  [2200/2502]  eta: 0:01:17  lr: 0.003499  min_lr: 0.003499  loss: 4.2236 (3.9999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6836 (0.6923)  time: 0.2613  data: 0.0004  max mem: 18975
Epoch: [22]  [2400/2502]  eta: 0:00:26  lr: 0.003499  min_lr: 0.003499  loss: 3.4570 (3.9986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.6895)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [22]  [2501/2502]  eta: 0:00:00  lr: 0.003499  min_lr: 0.003499  loss: 3.6344 (3.9976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5995 (0.6868)  time: 0.2310  data: 0.0007  max mem: 18975
Epoch: [22] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003499  min_lr: 0.003499  loss: 3.6344 (4.0092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5995 (0.6868)
Test:  [ 0/50]  eta: 0:02:47  loss: 1.1392 (1.1392)  acc1: 76.8000 (76.8000)  acc5: 92.4000 (92.4000)  time: 3.3494  data: 3.2110  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.2580 (1.2762)  acc1: 75.6000 (72.6182)  acc5: 92.4000 (91.7091)  time: 0.5212  data: 0.3962  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.2580 (1.2693)  acc1: 72.8000 (72.4952)  acc5: 92.4000 (92.0571)  time: 0.2370  data: 0.1157  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.4401 (1.4577)  acc1: 64.0000 (69.0968)  acc5: 88.0000 (89.4581)  time: 0.2318  data: 0.1130  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8990 (1.5580)  acc1: 60.4000 (67.2000)  acc5: 82.8000 (87.9805)  time: 0.2283  data: 0.1094  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8784 (1.5928)  acc1: 59.6000 (66.4000)  acc5: 84.0000 (87.6720)  time: 0.1964  data: 0.0777  max mem: 18975
Test: Total time: 0:00:14 (0.2853 s / it)
* Acc@1 66.148 Acc@5 87.848 loss 1.595
Accuracy of the model on the 50000 test images: 66.1%
Max accuracy: 66.15%
Epoch: [23]  [   0/2502]  eta: 1:11:54  lr: 0.003499  min_lr: 0.003499  loss: 4.5580 (4.5580)  weight_decay: 0.0500 (0.0500)  time: 1.7245  data: 1.4464  max mem: 18975
Epoch: [23]  [ 200/2502]  eta: 0:10:14  lr: 0.003499  min_lr: 0.003499  loss: 3.3673 (3.9508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5959 (0.6713)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [23]  [ 400/2502]  eta: 0:09:11  lr: 0.003499  min_lr: 0.003499  loss: 4.3153 (3.9908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7208 (0.6734)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [23]  [ 600/2502]  eta: 0:08:15  lr: 0.003499  min_lr: 0.003499  loss: 3.5254 (3.9617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6973 (0.6999)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [23]  [ 800/2502]  eta: 0:07:22  lr: 0.003499  min_lr: 0.003499  loss: 4.3877 (3.9781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.6937)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [23]  [1000/2502]  eta: 0:06:29  lr: 0.003499  min_lr: 0.003499  loss: 3.4531 (3.9964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6476 (0.6855)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [23]  [1200/2502]  eta: 0:05:36  lr: 0.003499  min_lr: 0.003499  loss: 4.1267 (4.0041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.6948)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [23]  [1400/2502]  eta: 0:04:45  lr: 0.003499  min_lr: 0.003499  loss: 4.3391 (4.0069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.6945)  time: 0.2579  data: 0.0005  max mem: 18975
Epoch: [23]  [1600/2502]  eta: 0:03:53  lr: 0.003499  min_lr: 0.003499  loss: 3.8910 (4.0036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.6916)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [23]  [1800/2502]  eta: 0:03:01  lr: 0.003498  min_lr: 0.003498  loss: 4.4092 (4.0108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6951 (0.6950)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [23]  [2000/2502]  eta: 0:02:09  lr: 0.003498  min_lr: 0.003498  loss: 4.2456 (4.0097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7355 (0.6977)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [23]  [2200/2502]  eta: 0:01:17  lr: 0.003498  min_lr: 0.003498  loss: 4.1478 (4.0043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7267 (0.6982)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [23]  [2400/2502]  eta: 0:00:26  lr: 0.003498  min_lr: 0.003498  loss: 3.5121 (3.9937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6255 (0.6946)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [23]  [2501/2502]  eta: 0:00:00  lr: 0.003498  min_lr: 0.003498  loss: 4.1844 (3.9930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6270 (0.6944)  time: 0.2302  data: 0.0009  max mem: 18975
Epoch: [23] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003498  min_lr: 0.003498  loss: 4.1844 (3.9865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6270 (0.6944)
Test:  [ 0/50]  eta: 0:03:04  loss: 1.1253 (1.1253)  acc1: 82.8000 (82.8000)  acc5: 94.8000 (94.8000)  time: 3.6833  data: 3.5412  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.3556 (1.3457)  acc1: 76.0000 (75.4545)  acc5: 92.4000 (92.7636)  time: 0.5402  data: 0.4185  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.3570 (1.3729)  acc1: 73.6000 (74.1333)  acc5: 92.4000 (92.8952)  time: 0.2401  data: 0.1208  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.6087 (1.5188)  acc1: 68.4000 (70.5419)  acc5: 88.0000 (90.3484)  time: 0.2800  data: 0.1610  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8848 (1.6196)  acc1: 60.8000 (67.9707)  acc5: 83.2000 (88.5268)  time: 0.2710  data: 0.1512  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8691 (1.6441)  acc1: 61.6000 (67.2640)  acc5: 83.6000 (88.2000)  time: 0.2184  data: 0.0989  max mem: 18975
Test: Total time: 0:00:15 (0.3014 s / it)
* Acc@1 66.762 Acc@5 88.276 loss 1.643
Accuracy of the model on the 50000 test images: 66.8%
Max accuracy: 66.76%
Epoch: [24]  [   0/2502]  eta: 1:19:18  lr: 0.003498  min_lr: 0.003498  loss: 4.1432 (4.1432)  weight_decay: 0.0500 (0.0500)  time: 1.9019  data: 1.6381  max mem: 18975
Epoch: [24]  [ 200/2502]  eta: 0:10:13  lr: 0.003498  min_lr: 0.003498  loss: 4.2413 (4.0308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6695 (0.6581)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [24]  [ 400/2502]  eta: 0:09:10  lr: 0.003498  min_lr: 0.003498  loss: 4.3330 (4.0189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6966 (0.6815)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [24]  [ 600/2502]  eta: 0:08:14  lr: 0.003498  min_lr: 0.003498  loss: 4.2895 (4.0280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (0.6976)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [24]  [ 800/2502]  eta: 0:07:21  lr: 0.003498  min_lr: 0.003498  loss: 3.5505 (4.0147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (0.6954)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [24]  [1000/2502]  eta: 0:06:28  lr: 0.003498  min_lr: 0.003498  loss: 3.3695 (3.9889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.6945)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [24]  [1200/2502]  eta: 0:05:36  lr: 0.003498  min_lr: 0.003498  loss: 3.4455 (3.9826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7327 (0.6975)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [24]  [1400/2502]  eta: 0:04:44  lr: 0.003498  min_lr: 0.003498  loss: 4.1529 (3.9880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.7000)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [24]  [1600/2502]  eta: 0:03:52  lr: 0.003498  min_lr: 0.003498  loss: 3.5422 (3.9813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7003 (0.7018)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [24]  [1800/2502]  eta: 0:03:01  lr: 0.003498  min_lr: 0.003498  loss: 4.0032 (3.9664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6766 (0.7023)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [24]  [2000/2502]  eta: 0:02:09  lr: 0.003497  min_lr: 0.003497  loss: 3.3171 (3.9573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6316 (0.7019)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [24]  [2200/2502]  eta: 0:01:17  lr: 0.003497  min_lr: 0.003497  loss: 4.3787 (3.9598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (0.6982)  time: 0.2578  data: 0.0005  max mem: 18975
Epoch: [24]  [2400/2502]  eta: 0:00:26  lr: 0.003497  min_lr: 0.003497  loss: 4.0079 (3.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.6995)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [24]  [2501/2502]  eta: 0:00:00  lr: 0.003497  min_lr: 0.003497  loss: 3.5548 (3.9604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (0.6981)  time: 0.2307  data: 0.0008  max mem: 18975
Epoch: [24] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003497  min_lr: 0.003497  loss: 3.5548 (3.9549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (0.6981)
Test:  [ 0/50]  eta: 0:02:39  loss: 1.0073 (1.0073)  acc1: 82.8000 (82.8000)  acc5: 94.4000 (94.4000)  time: 3.1834  data: 3.0449  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.3259 (1.2672)  acc1: 74.4000 (74.4364)  acc5: 92.8000 (92.8000)  time: 0.5576  data: 0.4358  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.2287 (1.2516)  acc1: 72.0000 (73.1810)  acc5: 92.8000 (93.0667)  time: 0.2901  data: 0.1705  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3946 (1.4202)  acc1: 67.6000 (70.3355)  acc5: 89.2000 (90.3613)  time: 0.2802  data: 0.1610  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.9042 (1.5424)  acc1: 61.2000 (67.7756)  acc5: 82.4000 (88.4390)  time: 0.2314  data: 0.1115  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.9272 (1.5699)  acc1: 60.8000 (67.2800)  acc5: 82.4000 (88.0800)  time: 0.2236  data: 0.1039  max mem: 18975
Test: Total time: 0:00:14 (0.2987 s / it)
* Acc@1 67.352 Acc@5 88.334 loss 1.561
Accuracy of the model on the 50000 test images: 67.4%
Max accuracy: 67.35%
Epoch: [25]  [   0/2502]  eta: 1:23:57  lr: 0.003497  min_lr: 0.003497  loss: 4.9669 (4.9669)  weight_decay: 0.0500 (0.0500)  time: 2.0135  data: 1.7437  max mem: 18975
Epoch: [25]  [ 200/2502]  eta: 0:10:16  lr: 0.003497  min_lr: 0.003497  loss: 4.4835 (4.0413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6699 (0.6916)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [25]  [ 400/2502]  eta: 0:09:11  lr: 0.003497  min_lr: 0.003497  loss: 4.1209 (3.9836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7914 (0.7061)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [25]  [ 600/2502]  eta: 0:08:16  lr: 0.003497  min_lr: 0.003497  loss: 3.9905 (3.9614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6228 (0.6939)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [25]  [ 800/2502]  eta: 0:07:23  lr: 0.003497  min_lr: 0.003497  loss: 4.3302 (3.9397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6783 (0.7019)  time: 0.2574  data: 0.0003  max mem: 18975
Epoch: [25]  [1000/2502]  eta: 0:06:30  lr: 0.003497  min_lr: 0.003497  loss: 4.3749 (3.9323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6543 (0.6956)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [25]  [1200/2502]  eta: 0:05:37  lr: 0.003497  min_lr: 0.003497  loss: 4.3231 (3.9391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6599 (0.6951)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [25]  [1400/2502]  eta: 0:04:45  lr: 0.003497  min_lr: 0.003497  loss: 3.5468 (3.9259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6669 (0.6964)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [25]  [1600/2502]  eta: 0:03:53  lr: 0.003496  min_lr: 0.003496  loss: 4.2861 (3.9382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6685 (0.6969)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [25]  [1800/2502]  eta: 0:03:01  lr: 0.003496  min_lr: 0.003496  loss: 4.0290 (3.9423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6243 (0.6961)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [25]  [2000/2502]  eta: 0:02:09  lr: 0.003496  min_lr: 0.003496  loss: 3.7275 (3.9350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6720 (0.6985)  time: 0.2570  data: 0.0005  max mem: 18975
Epoch: [25]  [2200/2502]  eta: 0:01:18  lr: 0.003496  min_lr: 0.003496  loss: 3.8009 (3.9330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6207 (0.6969)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [25]  [2400/2502]  eta: 0:00:26  lr: 0.003496  min_lr: 0.003496  loss: 3.4203 (3.9279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.6979)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [25]  [2501/2502]  eta: 0:00:00  lr: 0.003496  min_lr: 0.003496  loss: 3.5901 (3.9285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (0.6983)  time: 0.2310  data: 0.0010  max mem: 18975
Epoch: [25] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003496  min_lr: 0.003496  loss: 3.5901 (3.9246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (0.6983)
Test:  [ 0/50]  eta: 0:02:32  loss: 0.9106 (0.9106)  acc1: 87.2000 (87.2000)  acc5: 96.0000 (96.0000)  time: 3.0470  data: 2.9019  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 1.2100 (1.2649)  acc1: 71.6000 (72.5818)  acc5: 93.2000 (93.0909)  time: 0.4382  data: 0.3161  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.2837 (1.2680)  acc1: 72.4000 (72.8952)  acc5: 92.8000 (92.9905)  time: 0.2313  data: 0.1118  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.4146 (1.4304)  acc1: 68.4000 (70.1548)  acc5: 89.2000 (90.5161)  time: 0.2837  data: 0.1648  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.8618 (1.5374)  acc1: 61.2000 (67.8829)  acc5: 84.0000 (88.8683)  time: 0.2264  data: 0.1066  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8744 (1.5650)  acc1: 60.4000 (67.0160)  acc5: 84.0000 (88.3600)  time: 0.1672  data: 0.0476  max mem: 18975
Test: Total time: 0:00:13 (0.2770 s / it)
* Acc@1 67.010 Acc@5 88.340 loss 1.562
Accuracy of the model on the 50000 test images: 67.0%
Max accuracy: 67.35%
Epoch: [26]  [   0/2502]  eta: 1:14:40  lr: 0.003496  min_lr: 0.003496  loss: 3.2654 (3.2654)  weight_decay: 0.0500 (0.0500)  time: 1.7909  data: 1.5437  max mem: 18975
Epoch: [26]  [ 200/2502]  eta: 0:10:18  lr: 0.003496  min_lr: 0.003496  loss: 4.0755 (3.8998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6152 (0.6783)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [26]  [ 400/2502]  eta: 0:09:11  lr: 0.003496  min_lr: 0.003496  loss: 4.1369 (3.8714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7234 (0.6847)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [26]  [ 600/2502]  eta: 0:08:15  lr: 0.003496  min_lr: 0.003496  loss: 3.6794 (3.9032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7558 (0.6871)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [26]  [ 800/2502]  eta: 0:07:22  lr: 0.003496  min_lr: 0.003496  loss: 3.6120 (3.8768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7400 (0.6957)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [26]  [1000/2502]  eta: 0:06:29  lr: 0.003495  min_lr: 0.003495  loss: 3.7974 (3.8761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6172 (0.6866)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [26]  [1200/2502]  eta: 0:05:37  lr: 0.003495  min_lr: 0.003495  loss: 3.4398 (3.8992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6797 (0.6936)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [26]  [1400/2502]  eta: 0:04:45  lr: 0.003495  min_lr: 0.003495  loss: 4.3682 (3.9149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6192 (0.6901)  time: 0.2642  data: 0.0004  max mem: 18975
Epoch: [26]  [1600/2502]  eta: 0:03:53  lr: 0.003495  min_lr: 0.003495  loss: 3.8867 (3.8978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6729 (0.6940)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [26]  [1800/2502]  eta: 0:03:01  lr: 0.003495  min_lr: 0.003495  loss: 3.5889 (3.8976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (0.6972)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [26]  [2000/2502]  eta: 0:02:09  lr: 0.003495  min_lr: 0.003495  loss: 4.1927 (3.8976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6682 (0.6962)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [26]  [2200/2502]  eta: 0:01:17  lr: 0.003495  min_lr: 0.003495  loss: 4.1458 (3.9001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6426 (0.6963)  time: 0.2581  data: 0.0005  max mem: 18975
Epoch: [26]  [2400/2502]  eta: 0:00:26  lr: 0.003495  min_lr: 0.003495  loss: 3.7818 (3.9026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6365 (0.7004)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [26]  [2501/2502]  eta: 0:00:00  lr: 0.003495  min_lr: 0.003495  loss: 4.0118 (3.9030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (0.7018)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [26] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003495  min_lr: 0.003495  loss: 4.0118 (3.9086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (0.7018)
Test:  [ 0/50]  eta: 0:03:15  loss: 1.0387 (1.0387)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 3.9123  data: 3.7733  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.2678 (1.2738)  acc1: 76.8000 (75.4182)  acc5: 94.0000 (93.8909)  time: 0.5791  data: 0.4580  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.2952 (1.3026)  acc1: 72.8000 (73.7905)  acc5: 93.6000 (93.3905)  time: 0.2693  data: 0.1497  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.5064 (1.4756)  acc1: 67.6000 (70.4903)  acc5: 87.2000 (90.4129)  time: 0.2681  data: 0.1472  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8667 (1.5747)  acc1: 62.0000 (68.1659)  acc5: 84.4000 (88.8390)  time: 0.2231  data: 0.0999  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8605 (1.5993)  acc1: 62.0000 (67.7200)  acc5: 84.4000 (88.5360)  time: 0.2323  data: 0.1102  max mem: 18975
Test: Total time: 0:00:15 (0.3029 s / it)
* Acc@1 67.508 Acc@5 88.526 loss 1.603
Accuracy of the model on the 50000 test images: 67.5%
Max accuracy: 67.51%
Epoch: [27]  [   0/2502]  eta: 1:09:33  lr: 0.003495  min_lr: 0.003495  loss: 4.6264 (4.6264)  weight_decay: 0.0500 (0.0500)  time: 1.6681  data: 1.4041  max mem: 18975
Epoch: [27]  [ 200/2502]  eta: 0:10:15  lr: 0.003494  min_lr: 0.003494  loss: 3.6991 (3.8085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6785 (0.7112)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [27]  [ 400/2502]  eta: 0:09:10  lr: 0.003494  min_lr: 0.003494  loss: 4.1930 (3.8479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.7243)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [27]  [ 600/2502]  eta: 0:08:16  lr: 0.003494  min_lr: 0.003494  loss: 4.1557 (3.8876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6204 (0.7168)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [27]  [ 800/2502]  eta: 0:07:22  lr: 0.003494  min_lr: 0.003494  loss: 3.8089 (3.8715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.7201)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [27]  [1000/2502]  eta: 0:06:29  lr: 0.003494  min_lr: 0.003494  loss: 4.0828 (3.8597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6628 (0.7112)  time: 0.2577  data: 0.0003  max mem: 18975
Epoch: [27]  [1200/2502]  eta: 0:05:37  lr: 0.003494  min_lr: 0.003494  loss: 3.7322 (3.8598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7469 (0.7166)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [27]  [1400/2502]  eta: 0:04:45  lr: 0.003494  min_lr: 0.003494  loss: 3.6706 (3.8666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7085 (0.7126)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [27]  [1600/2502]  eta: 0:03:53  lr: 0.003494  min_lr: 0.003494  loss: 4.2136 (3.8698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6631 (inf)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [27]  [1800/2502]  eta: 0:03:01  lr: 0.003493  min_lr: 0.003493  loss: 3.6128 (3.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6942 (inf)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [27]  [2000/2502]  eta: 0:02:09  lr: 0.003493  min_lr: 0.003493  loss: 4.2893 (3.8696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7298 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [27]  [2200/2502]  eta: 0:01:17  lr: 0.003493  min_lr: 0.003493  loss: 3.6575 (3.8669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6435 (inf)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [27]  [2400/2502]  eta: 0:00:26  lr: 0.003493  min_lr: 0.003493  loss: 4.1682 (3.8716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6630 (inf)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [27]  [2501/2502]  eta: 0:00:00  lr: 0.003493  min_lr: 0.003493  loss: 4.1012 (3.8780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (inf)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [27] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.003493  min_lr: 0.003493  loss: 4.1012 (3.8909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (inf)
Test:  [ 0/50]  eta: 0:02:37  loss: 0.9090 (0.9090)  acc1: 79.2000 (79.2000)  acc5: 96.4000 (96.4000)  time: 3.1489  data: 2.9958  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.1724 (1.2029)  acc1: 74.8000 (74.2182)  acc5: 94.0000 (93.3091)  time: 0.5151  data: 0.3915  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1744 (1.2031)  acc1: 74.0000 (74.0191)  acc5: 93.6000 (93.4286)  time: 0.2348  data: 0.1139  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3919 (1.3816)  acc1: 68.4000 (70.4387)  acc5: 86.8000 (90.6065)  time: 0.2450  data: 0.1246  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8463 (1.4972)  acc1: 60.4000 (67.9805)  acc5: 82.4000 (88.8390)  time: 0.2698  data: 0.1504  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8313 (1.5186)  acc1: 61.6000 (67.5360)  acc5: 82.8000 (88.5600)  time: 0.2150  data: 0.0954  max mem: 18975
Test: Total time: 0:00:14 (0.2970 s / it)
* Acc@1 67.658 Acc@5 88.690 loss 1.518
Accuracy of the model on the 50000 test images: 67.7%
Max accuracy: 67.66%
Epoch: [28]  [   0/2502]  eta: 1:14:25  lr: 0.003493  min_lr: 0.003493  loss: 3.5341 (3.5341)  weight_decay: 0.0500 (0.0500)  time: 1.7847  data: 1.5143  max mem: 18975
Epoch: [28]  [ 200/2502]  eta: 0:10:13  lr: 0.003493  min_lr: 0.003493  loss: 3.9933 (3.8557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6347 (0.7418)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [28]  [ 400/2502]  eta: 0:09:10  lr: 0.003493  min_lr: 0.003493  loss: 3.8301 (3.8818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6056 (0.7313)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [28]  [ 600/2502]  eta: 0:08:15  lr: 0.003493  min_lr: 0.003493  loss: 3.7942 (3.8750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8049 (0.7415)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [28]  [ 800/2502]  eta: 0:07:21  lr: 0.003492  min_lr: 0.003492  loss: 3.5555 (3.8730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6963 (0.7324)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [28]  [1000/2502]  eta: 0:06:28  lr: 0.003492  min_lr: 0.003492  loss: 3.6458 (3.8762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7193 (0.7338)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [28]  [1200/2502]  eta: 0:05:36  lr: 0.003492  min_lr: 0.003492  loss: 4.4309 (3.8795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.7268)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [28]  [1400/2502]  eta: 0:04:44  lr: 0.003492  min_lr: 0.003492  loss: 3.4511 (3.8817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (0.7268)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [28]  [1600/2502]  eta: 0:03:52  lr: 0.003492  min_lr: 0.003492  loss: 3.9588 (3.8749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6729 (0.7278)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [28]  [1800/2502]  eta: 0:03:01  lr: 0.003492  min_lr: 0.003492  loss: 3.5936 (3.8772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6760 (0.7257)  time: 0.2605  data: 0.0004  max mem: 18975
Epoch: [28]  [2000/2502]  eta: 0:02:09  lr: 0.003491  min_lr: 0.003491  loss: 4.3891 (3.8784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7182 (0.7256)  time: 0.2618  data: 0.0004  max mem: 18975
Epoch: [28]  [2200/2502]  eta: 0:01:17  lr: 0.003491  min_lr: 0.003491  loss: 3.8207 (3.8743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6869 (0.7291)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [28]  [2400/2502]  eta: 0:00:26  lr: 0.003491  min_lr: 0.003491  loss: 3.7747 (3.8767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6738 (0.7309)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [28]  [2501/2502]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 4.0662 (3.8786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.7279)  time: 0.2310  data: 0.0008  max mem: 18975
Epoch: [28] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 4.0662 (3.8776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.7279)
Test:  [ 0/50]  eta: 0:03:12  loss: 0.8616 (0.8616)  acc1: 85.6000 (85.6000)  acc5: 95.6000 (95.6000)  time: 3.8523  data: 3.7194  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.1602 (1.1881)  acc1: 76.8000 (76.2545)  acc5: 92.8000 (93.8545)  time: 0.5935  data: 0.4725  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.1730 (1.2259)  acc1: 74.4000 (74.5524)  acc5: 93.6000 (93.7333)  time: 0.2475  data: 0.1273  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.4300 (1.3935)  acc1: 67.6000 (71.4839)  acc5: 89.6000 (91.3032)  time: 0.2304  data: 0.1094  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8402 (1.5050)  acc1: 62.8000 (69.1220)  acc5: 84.0000 (89.6195)  time: 0.2289  data: 0.1088  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8239 (1.5335)  acc1: 62.8000 (68.6720)  acc5: 84.4000 (89.4640)  time: 0.2044  data: 0.0854  max mem: 18975
Test: Total time: 0:00:14 (0.2986 s / it)
* Acc@1 68.860 Acc@5 89.318 loss 1.528
Accuracy of the model on the 50000 test images: 68.9%
Max accuracy: 68.86%
Epoch: [29]  [   0/2502]  eta: 1:08:39  lr: 0.003491  min_lr: 0.003491  loss: 4.7973 (4.7973)  weight_decay: 0.0500 (0.0500)  time: 1.6466  data: 1.3817  max mem: 18975
Epoch: [29]  [ 200/2502]  eta: 0:10:12  lr: 0.003491  min_lr: 0.003491  loss: 3.6546 (3.8854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.6945)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [29]  [ 400/2502]  eta: 0:09:09  lr: 0.003491  min_lr: 0.003491  loss: 4.1959 (3.9038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6575 (0.7144)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [29]  [ 600/2502]  eta: 0:08:14  lr: 0.003491  min_lr: 0.003491  loss: 3.6118 (3.9073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7623 (0.7200)  time: 0.2627  data: 0.0004  max mem: 18975
Epoch: [29]  [ 800/2502]  eta: 0:07:21  lr: 0.003490  min_lr: 0.003490  loss: 3.9650 (3.8954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7028 (0.7176)  time: 0.2664  data: 0.0003  max mem: 18975
Epoch: [29]  [1000/2502]  eta: 0:06:28  lr: 0.003490  min_lr: 0.003490  loss: 4.0351 (3.8833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7069 (0.7217)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [29]  [1200/2502]  eta: 0:05:36  lr: 0.003490  min_lr: 0.003490  loss: 3.4850 (3.8499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6374 (0.7182)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [29]  [1400/2502]  eta: 0:04:44  lr: 0.003490  min_lr: 0.003490  loss: 4.3900 (3.8606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.7163)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [29]  [1600/2502]  eta: 0:03:52  lr: 0.003490  min_lr: 0.003490  loss: 4.2324 (3.8562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7052 (0.7155)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [29]  [1800/2502]  eta: 0:03:00  lr: 0.003490  min_lr: 0.003490  loss: 3.9049 (3.8630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6582 (0.7159)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [29]  [2000/2502]  eta: 0:02:09  lr: 0.003489  min_lr: 0.003489  loss: 3.9719 (3.8567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.7197)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [29]  [2200/2502]  eta: 0:01:17  lr: 0.003489  min_lr: 0.003489  loss: 4.0526 (3.8560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.7193)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [29]  [2400/2502]  eta: 0:00:26  lr: 0.003489  min_lr: 0.003489  loss: 3.0974 (3.8577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7435 (0.7186)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [29]  [2501/2502]  eta: 0:00:00  lr: 0.003489  min_lr: 0.003489  loss: 4.1709 (3.8603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.7217)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [29] Total time: 0:10:44 (0.2575 s / it)
Averaged stats: lr: 0.003489  min_lr: 0.003489  loss: 4.1709 (3.8601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.7217)
Test:  [ 0/50]  eta: 0:02:07  loss: 0.9184 (0.9184)  acc1: 85.2000 (85.2000)  acc5: 96.4000 (96.4000)  time: 2.5533  data: 2.4120  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.1947 (1.2692)  acc1: 76.8000 (75.5636)  acc5: 94.0000 (94.0000)  time: 0.4637  data: 0.3405  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.1947 (1.2750)  acc1: 75.2000 (75.0286)  acc5: 94.4000 (94.2667)  time: 0.2329  data: 0.1114  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.4768 (1.4401)  acc1: 68.8000 (71.9613)  acc5: 90.0000 (91.6516)  time: 0.2336  data: 0.1131  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8697 (1.5525)  acc1: 61.6000 (69.5707)  acc5: 84.0000 (89.7951)  time: 0.2546  data: 0.1355  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.8697 (1.5809)  acc1: 61.6000 (68.9680)  acc5: 84.0000 (89.4560)  time: 0.2280  data: 0.1090  max mem: 18975
Test: Total time: 0:00:14 (0.2852 s / it)
* Acc@1 68.512 Acc@5 89.286 loss 1.584
Accuracy of the model on the 50000 test images: 68.5%
Max accuracy: 68.86%
Epoch: [30]  [   0/2502]  eta: 1:26:36  lr: 0.003489  min_lr: 0.003489  loss: 5.0249 (5.0249)  weight_decay: 0.0500 (0.0500)  time: 2.0769  data: 1.8079  max mem: 18975
Epoch: [30]  [ 200/2502]  eta: 0:10:17  lr: 0.003489  min_lr: 0.003489  loss: 4.1016 (3.8670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.6713)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [30]  [ 400/2502]  eta: 0:09:12  lr: 0.003489  min_lr: 0.003489  loss: 3.2242 (3.8430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7207)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [30]  [ 600/2502]  eta: 0:08:16  lr: 0.003488  min_lr: 0.003488  loss: 4.1676 (3.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7008 (0.7221)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [30]  [ 800/2502]  eta: 0:07:22  lr: 0.003488  min_lr: 0.003488  loss: 4.3388 (3.8282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7788 (0.7338)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [30]  [1000/2502]  eta: 0:06:29  lr: 0.003488  min_lr: 0.003488  loss: 3.9683 (3.8377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6817 (0.7300)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [30]  [1200/2502]  eta: 0:05:37  lr: 0.003488  min_lr: 0.003488  loss: 4.1011 (3.8446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7489 (0.7290)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [30]  [1400/2502]  eta: 0:04:44  lr: 0.003488  min_lr: 0.003488  loss: 3.9309 (3.8500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7115 (0.7223)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [30]  [1600/2502]  eta: 0:03:53  lr: 0.003488  min_lr: 0.003488  loss: 3.9895 (3.8534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6980 (0.7193)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [30]  [1800/2502]  eta: 0:03:01  lr: 0.003487  min_lr: 0.003487  loss: 3.9878 (3.8569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7426 (0.7229)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [30]  [2000/2502]  eta: 0:02:09  lr: 0.003487  min_lr: 0.003487  loss: 4.0441 (3.8642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7663 (0.7274)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [30]  [2200/2502]  eta: 0:01:17  lr: 0.003487  min_lr: 0.003487  loss: 3.5760 (3.8564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6717 (0.7249)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [30]  [2400/2502]  eta: 0:00:26  lr: 0.003487  min_lr: 0.003487  loss: 4.3914 (3.8575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7269 (0.7245)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [30]  [2501/2502]  eta: 0:00:00  lr: 0.003487  min_lr: 0.003487  loss: 4.0284 (3.8546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7247)  time: 0.2314  data: 0.0007  max mem: 18975
Epoch: [30] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003487  min_lr: 0.003487  loss: 4.0284 (3.8340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7098 (0.7247)
Test:  [ 0/50]  eta: 0:02:10  loss: 1.0508 (1.0508)  acc1: 83.2000 (83.2000)  acc5: 94.4000 (94.4000)  time: 2.6010  data: 2.4589  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.1307 (1.1850)  acc1: 79.2000 (76.5455)  acc5: 94.0000 (93.8182)  time: 0.5079  data: 0.3817  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1307 (1.1853)  acc1: 74.0000 (75.2381)  acc5: 94.0000 (93.8857)  time: 0.2876  data: 0.1655  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.4230 (1.3546)  acc1: 69.2000 (72.0000)  acc5: 89.2000 (91.3032)  time: 0.2625  data: 0.1432  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7305 (1.4503)  acc1: 62.8000 (69.5902)  acc5: 85.6000 (90.0683)  time: 0.2567  data: 0.1377  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7217 (1.4695)  acc1: 63.2000 (69.1520)  acc5: 86.0000 (89.8240)  time: 0.2092  data: 0.0903  max mem: 18975
Test: Total time: 0:00:15 (0.3001 s / it)
* Acc@1 68.974 Acc@5 89.548 loss 1.478
Accuracy of the model on the 50000 test images: 69.0%
Max accuracy: 68.97%
Epoch: [31]  [   0/2502]  eta: 1:29:52  lr: 0.003487  min_lr: 0.003487  loss: 2.6790 (2.6790)  weight_decay: 0.0500 (0.0500)  time: 2.1553  data: 1.8850  max mem: 18975
Epoch: [31]  [ 200/2502]  eta: 0:10:18  lr: 0.003486  min_lr: 0.003486  loss: 3.7088 (3.8208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6959 (0.7259)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [31]  [ 400/2502]  eta: 0:09:12  lr: 0.003486  min_lr: 0.003486  loss: 3.5889 (3.8404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (0.7151)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [31]  [ 600/2502]  eta: 0:08:16  lr: 0.003486  min_lr: 0.003486  loss: 3.5654 (3.8340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7596 (0.7266)  time: 0.2576  data: 0.0005  max mem: 18975
Epoch: [31]  [ 800/2502]  eta: 0:07:22  lr: 0.003486  min_lr: 0.003486  loss: 3.4513 (3.8315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6507 (0.7224)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [31]  [1000/2502]  eta: 0:06:29  lr: 0.003486  min_lr: 0.003486  loss: 4.0216 (3.8261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6532 (0.7217)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [31]  [1200/2502]  eta: 0:05:37  lr: 0.003486  min_lr: 0.003486  loss: 3.6750 (3.8168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (0.7276)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [31]  [1400/2502]  eta: 0:04:45  lr: 0.003485  min_lr: 0.003485  loss: 3.6969 (3.8243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6398 (0.7289)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [31]  [1600/2502]  eta: 0:03:53  lr: 0.003485  min_lr: 0.003485  loss: 3.8384 (3.8231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7591 (0.7320)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [31]  [1800/2502]  eta: 0:03:01  lr: 0.003485  min_lr: 0.003485  loss: 3.9519 (3.8209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.7327)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [31]  [2000/2502]  eta: 0:02:09  lr: 0.003485  min_lr: 0.003485  loss: 3.8397 (3.8168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7153 (0.7327)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [31]  [2200/2502]  eta: 0:01:18  lr: 0.003484  min_lr: 0.003484  loss: 4.1433 (3.8244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6066 (0.7356)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [31]  [2400/2502]  eta: 0:00:26  lr: 0.003484  min_lr: 0.003484  loss: 3.8595 (3.8300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.7352)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [31]  [2501/2502]  eta: 0:00:00  lr: 0.003484  min_lr: 0.003484  loss: 3.6913 (3.8291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6768 (0.7341)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [31] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003484  min_lr: 0.003484  loss: 3.6913 (3.8265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6768 (0.7341)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.8430 (0.8430)  acc1: 84.4000 (84.4000)  acc5: 94.4000 (94.4000)  time: 2.7046  data: 2.5521  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0928 (1.1532)  acc1: 78.0000 (76.1091)  acc5: 94.4000 (93.9273)  time: 0.5318  data: 0.4060  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0928 (1.1539)  acc1: 73.6000 (75.3714)  acc5: 94.0000 (94.0381)  time: 0.2858  data: 0.1643  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3043 (1.3324)  acc1: 70.0000 (72.3226)  acc5: 89.6000 (91.5355)  time: 0.2357  data: 0.1152  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6846 (1.4305)  acc1: 63.6000 (70.1463)  acc5: 85.6000 (90.0098)  time: 0.2334  data: 0.1134  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6784 (1.4560)  acc1: 64.4000 (69.6960)  acc5: 85.6000 (89.7360)  time: 0.2243  data: 0.1054  max mem: 18975
Test: Total time: 0:00:14 (0.2937 s / it)
* Acc@1 69.212 Acc@5 89.554 loss 1.464
Accuracy of the model on the 50000 test images: 69.2%
Max accuracy: 69.21%
Epoch: [32]  [   0/2502]  eta: 1:27:08  lr: 0.003484  min_lr: 0.003484  loss: 4.6514 (4.6514)  weight_decay: 0.0500 (0.0500)  time: 2.0896  data: 1.8077  max mem: 18975
Epoch: [32]  [ 200/2502]  eta: 0:10:17  lr: 0.003484  min_lr: 0.003484  loss: 3.9603 (3.7882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.7132)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [32]  [ 400/2502]  eta: 0:09:12  lr: 0.003484  min_lr: 0.003484  loss: 3.9612 (3.8036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6918 (0.7325)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [32]  [ 600/2502]  eta: 0:08:16  lr: 0.003484  min_lr: 0.003484  loss: 3.7358 (3.8133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7365 (0.7160)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [32]  [ 800/2502]  eta: 0:07:22  lr: 0.003483  min_lr: 0.003483  loss: 3.8289 (3.8077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6824 (0.7208)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [32]  [1000/2502]  eta: 0:06:29  lr: 0.003483  min_lr: 0.003483  loss: 4.2532 (3.8212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8040 (0.7299)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [32]  [1200/2502]  eta: 0:05:37  lr: 0.003483  min_lr: 0.003483  loss: 3.8186 (3.8179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7459 (0.7329)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [32]  [1400/2502]  eta: 0:04:45  lr: 0.003483  min_lr: 0.003483  loss: 4.1925 (3.8247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (0.7357)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [32]  [1600/2502]  eta: 0:03:53  lr: 0.003482  min_lr: 0.003482  loss: 4.1772 (3.8204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6249 (0.7316)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [32]  [1800/2502]  eta: 0:03:01  lr: 0.003482  min_lr: 0.003482  loss: 3.8987 (3.8081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7343 (0.7320)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [32]  [2000/2502]  eta: 0:02:09  lr: 0.003482  min_lr: 0.003482  loss: 3.9934 (3.8032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7090 (0.7360)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [32]  [2200/2502]  eta: 0:01:18  lr: 0.003482  min_lr: 0.003482  loss: 3.9326 (3.8029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (0.7368)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [32]  [2400/2502]  eta: 0:00:26  lr: 0.003482  min_lr: 0.003482  loss: 4.0848 (3.8056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7483 (0.7368)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [32]  [2501/2502]  eta: 0:00:00  lr: 0.003481  min_lr: 0.003481  loss: 4.1830 (3.8062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.7340)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [32] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003481  min_lr: 0.003481  loss: 4.1830 (3.7962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.7340)
Test:  [ 0/50]  eta: 0:03:05  loss: 0.8758 (0.8758)  acc1: 87.2000 (87.2000)  acc5: 95.6000 (95.6000)  time: 3.7045  data: 3.5659  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.2053 (1.1905)  acc1: 74.4000 (75.6000)  acc5: 94.0000 (93.6727)  time: 0.6203  data: 0.4968  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.2170 (1.2016)  acc1: 74.4000 (75.3333)  acc5: 94.0000 (93.9619)  time: 0.2847  data: 0.1640  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3641 (1.3572)  acc1: 69.2000 (72.5677)  acc5: 91.2000 (91.6129)  time: 0.2323  data: 0.1129  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8029 (1.4641)  acc1: 64.0000 (70.0293)  acc5: 85.6000 (90.0098)  time: 0.1916  data: 0.0694  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7603 (1.4853)  acc1: 64.0000 (69.5440)  acc5: 85.6000 (89.6960)  time: 0.2153  data: 0.0932  max mem: 18975
Test: Total time: 0:00:14 (0.2976 s / it)
* Acc@1 69.498 Acc@5 89.768 loss 1.488
Accuracy of the model on the 50000 test images: 69.5%
Max accuracy: 69.50%
Epoch: [33]  [   0/2502]  eta: 1:16:07  lr: 0.003481  min_lr: 0.003481  loss: 3.8514 (3.8514)  weight_decay: 0.0500 (0.0500)  time: 1.8256  data: 1.5575  max mem: 18975
Epoch: [33]  [ 200/2502]  eta: 0:10:14  lr: 0.003481  min_lr: 0.003481  loss: 3.3969 (3.8684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7560 (0.7438)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [33]  [ 400/2502]  eta: 0:09:10  lr: 0.003481  min_lr: 0.003481  loss: 3.8612 (3.8702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6577 (0.7241)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [33]  [ 600/2502]  eta: 0:08:14  lr: 0.003481  min_lr: 0.003481  loss: 4.1983 (3.8553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (0.7273)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [33]  [ 800/2502]  eta: 0:07:21  lr: 0.003480  min_lr: 0.003480  loss: 4.1572 (3.8339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6863 (0.7303)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [33]  [1000/2502]  eta: 0:06:28  lr: 0.003480  min_lr: 0.003480  loss: 3.9117 (3.8314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7010 (0.7269)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [33]  [1200/2502]  eta: 0:05:36  lr: 0.003480  min_lr: 0.003480  loss: 4.0107 (3.8339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6824 (0.7365)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [33]  [1400/2502]  eta: 0:04:44  lr: 0.003480  min_lr: 0.003480  loss: 4.0252 (3.8318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6771 (0.7322)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [33]  [1600/2502]  eta: 0:03:52  lr: 0.003480  min_lr: 0.003480  loss: 3.8019 (3.8276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7465 (0.7349)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [33]  [1800/2502]  eta: 0:03:01  lr: 0.003479  min_lr: 0.003479  loss: 3.8486 (3.8250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.7380)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [33]  [2000/2502]  eta: 0:02:09  lr: 0.003479  min_lr: 0.003479  loss: 3.4121 (3.8244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7319 (0.7390)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [33]  [2200/2502]  eta: 0:01:17  lr: 0.003479  min_lr: 0.003479  loss: 3.5736 (3.8129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.7383)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [33]  [2400/2502]  eta: 0:00:26  lr: 0.003479  min_lr: 0.003479  loss: 3.3535 (3.8113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7393)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [33]  [2501/2502]  eta: 0:00:00  lr: 0.003478  min_lr: 0.003478  loss: 3.5269 (3.8090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6624 (0.7384)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [33] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.003478  min_lr: 0.003478  loss: 3.5269 (3.8062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6624 (0.7384)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.8076 (0.8076)  acc1: 86.8000 (86.8000)  acc5: 96.4000 (96.4000)  time: 3.3177  data: 3.1745  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.1952 (1.1837)  acc1: 75.6000 (76.1455)  acc5: 94.0000 (93.8909)  time: 0.5449  data: 0.4228  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.2536 (1.2298)  acc1: 74.4000 (75.6191)  acc5: 94.0000 (94.1333)  time: 0.2576  data: 0.1371  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3922 (1.3895)  acc1: 71.6000 (72.1290)  acc5: 90.8000 (91.5613)  time: 0.2512  data: 0.1312  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8041 (1.4851)  acc1: 64.8000 (70.1171)  acc5: 84.8000 (90.0390)  time: 0.2261  data: 0.1071  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6795 (1.5066)  acc1: 65.6000 (69.7920)  acc5: 86.4000 (89.8320)  time: 0.2111  data: 0.0922  max mem: 18975
Test: Total time: 0:00:14 (0.2853 s / it)
* Acc@1 69.690 Acc@5 89.912 loss 1.506
Accuracy of the model on the 50000 test images: 69.7%
Max accuracy: 69.69%
Epoch: [34]  [   0/2502]  eta: 1:36:23  lr: 0.003478  min_lr: 0.003478  loss: 4.2459 (4.2459)  weight_decay: 0.0500 (0.0500)  time: 2.3116  data: 2.0442  max mem: 18975
Epoch: [34]  [ 200/2502]  eta: 0:10:18  lr: 0.003478  min_lr: 0.003478  loss: 3.3551 (3.7669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6857 (0.7379)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [34]  [ 400/2502]  eta: 0:09:12  lr: 0.003478  min_lr: 0.003478  loss: 3.2861 (3.7498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7151 (0.7557)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [34]  [ 600/2502]  eta: 0:08:15  lr: 0.003478  min_lr: 0.003478  loss: 3.4516 (3.7730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7489 (inf)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [34]  [ 800/2502]  eta: 0:07:22  lr: 0.003477  min_lr: 0.003477  loss: 3.5009 (3.7756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (inf)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [34]  [1000/2502]  eta: 0:06:29  lr: 0.003477  min_lr: 0.003477  loss: 4.1095 (3.7627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [34]  [1200/2502]  eta: 0:05:37  lr: 0.003477  min_lr: 0.003477  loss: 4.0429 (3.7738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7164 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [34]  [1400/2502]  eta: 0:04:45  lr: 0.003477  min_lr: 0.003477  loss: 3.5106 (3.7736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [34]  [1600/2502]  eta: 0:03:53  lr: 0.003476  min_lr: 0.003476  loss: 3.3926 (3.7663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7157 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [34]  [1800/2502]  eta: 0:03:01  lr: 0.003476  min_lr: 0.003476  loss: 3.5065 (3.7721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (inf)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [34]  [2000/2502]  eta: 0:02:09  lr: 0.003476  min_lr: 0.003476  loss: 3.4604 (3.7671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8050 (inf)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [34]  [2200/2502]  eta: 0:01:17  lr: 0.003476  min_lr: 0.003476  loss: 3.8524 (3.7709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6697 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [34]  [2400/2502]  eta: 0:00:26  lr: 0.003475  min_lr: 0.003475  loss: 3.9847 (3.7762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (inf)  time: 0.2606  data: 0.0004  max mem: 18975
Epoch: [34]  [2501/2502]  eta: 0:00:00  lr: 0.003475  min_lr: 0.003475  loss: 3.7760 (3.7769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7727 (inf)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [34] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003475  min_lr: 0.003475  loss: 3.7760 (3.7787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7727 (inf)
Test:  [ 0/50]  eta: 0:02:34  loss: 0.7198 (0.7198)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 3.0841  data: 2.9293  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.1060 (1.1324)  acc1: 77.2000 (76.6546)  acc5: 94.4000 (94.2545)  time: 0.5423  data: 0.4194  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.1218 (1.1439)  acc1: 74.8000 (76.1905)  acc5: 94.8000 (94.4571)  time: 0.2878  data: 0.1669  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3451 (1.3064)  acc1: 70.0000 (72.7484)  acc5: 90.8000 (92.0129)  time: 0.2479  data: 0.1243  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7090 (1.3997)  acc1: 63.2000 (70.5854)  acc5: 85.6000 (90.5951)  time: 0.2098  data: 0.0877  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6008 (1.4244)  acc1: 64.8000 (70.1120)  acc5: 86.0000 (90.3360)  time: 0.2066  data: 0.0876  max mem: 18975
Test: Total time: 0:00:14 (0.2840 s / it)
* Acc@1 69.916 Acc@5 89.978 loss 1.429
Accuracy of the model on the 50000 test images: 69.9%
Max accuracy: 69.92%
Epoch: [35]  [   0/2502]  eta: 1:13:25  lr: 0.003475  min_lr: 0.003475  loss: 4.6254 (4.6254)  weight_decay: 0.0500 (0.0500)  time: 1.7608  data: 1.5008  max mem: 18975
Epoch: [35]  [ 200/2502]  eta: 0:10:15  lr: 0.003475  min_lr: 0.003475  loss: 3.9189 (3.7828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7717 (0.7498)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [35]  [ 400/2502]  eta: 0:09:11  lr: 0.003475  min_lr: 0.003475  loss: 3.7183 (3.7999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7200 (0.7395)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [35]  [ 600/2502]  eta: 0:08:15  lr: 0.003474  min_lr: 0.003474  loss: 3.8043 (3.7807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6626 (0.7335)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [35]  [ 800/2502]  eta: 0:07:21  lr: 0.003474  min_lr: 0.003474  loss: 3.7839 (3.7875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6495 (0.7312)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [35]  [1000/2502]  eta: 0:06:28  lr: 0.003474  min_lr: 0.003474  loss: 4.1024 (3.8043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6943 (0.7270)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [35]  [1200/2502]  eta: 0:05:36  lr: 0.003474  min_lr: 0.003474  loss: 4.0824 (3.7966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7471 (0.7253)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [35]  [1400/2502]  eta: 0:04:44  lr: 0.003473  min_lr: 0.003473  loss: 4.0255 (3.7884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.7318)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [35]  [1600/2502]  eta: 0:03:52  lr: 0.003473  min_lr: 0.003473  loss: 3.4531 (3.7858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6614 (0.7276)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [35]  [1800/2502]  eta: 0:03:01  lr: 0.003473  min_lr: 0.003473  loss: 4.1083 (3.7901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7190 (0.7344)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [35]  [2000/2502]  eta: 0:02:09  lr: 0.003473  min_lr: 0.003473  loss: 3.6621 (3.7951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.7373)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [35]  [2200/2502]  eta: 0:01:17  lr: 0.003472  min_lr: 0.003472  loss: 4.1494 (3.7946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6993 (0.7351)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [35]  [2400/2502]  eta: 0:00:26  lr: 0.003472  min_lr: 0.003472  loss: 3.6281 (3.8008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.7360)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [35]  [2501/2502]  eta: 0:00:00  lr: 0.003472  min_lr: 0.003472  loss: 3.6138 (3.7949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7193 (0.7366)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [35] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.003472  min_lr: 0.003472  loss: 3.6138 (3.7771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7193 (0.7366)
Test:  [ 0/50]  eta: 0:02:31  loss: 0.8542 (0.8542)  acc1: 84.4000 (84.4000)  acc5: 96.0000 (96.0000)  time: 3.0290  data: 2.8782  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.1481 (1.1671)  acc1: 78.0000 (77.0909)  acc5: 94.8000 (94.1455)  time: 0.4756  data: 0.3535  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1464 (1.1524)  acc1: 74.8000 (76.2667)  acc5: 94.4000 (94.4762)  time: 0.2478  data: 0.1284  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3201 (1.3067)  acc1: 72.0000 (73.1742)  acc5: 92.0000 (91.9871)  time: 0.2466  data: 0.1262  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6908 (1.3984)  acc1: 64.4000 (70.7317)  acc5: 85.6000 (90.5366)  time: 0.2242  data: 0.1027  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6908 (1.4281)  acc1: 63.6000 (70.1600)  acc5: 85.2000 (90.1200)  time: 0.2177  data: 0.0955  max mem: 18975
Test: Total time: 0:00:14 (0.2807 s / it)
* Acc@1 69.960 Acc@5 90.106 loss 1.430
Accuracy of the model on the 50000 test images: 70.0%
Max accuracy: 69.96%
Epoch: [36]  [   0/2502]  eta: 1:27:52  lr: 0.003472  min_lr: 0.003472  loss: 2.9357 (2.9357)  weight_decay: 0.0500 (0.0500)  time: 2.1072  data: 1.8453  max mem: 18975
Epoch: [36]  [ 200/2502]  eta: 0:10:19  lr: 0.003472  min_lr: 0.003472  loss: 3.7493 (3.8078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7508 (0.7975)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [36]  [ 400/2502]  eta: 0:09:12  lr: 0.003471  min_lr: 0.003471  loss: 3.6716 (3.7658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7558 (0.7933)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [36]  [ 600/2502]  eta: 0:08:16  lr: 0.003471  min_lr: 0.003471  loss: 3.2510 (3.7485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7401 (0.7816)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [36]  [ 800/2502]  eta: 0:07:22  lr: 0.003471  min_lr: 0.003471  loss: 3.5922 (3.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.7719)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [36]  [1000/2502]  eta: 0:06:29  lr: 0.003470  min_lr: 0.003470  loss: 3.7792 (3.7531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.7745)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [36]  [1200/2502]  eta: 0:05:37  lr: 0.003470  min_lr: 0.003470  loss: 3.3236 (3.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7135 (0.7677)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [36]  [1400/2502]  eta: 0:04:45  lr: 0.003470  min_lr: 0.003470  loss: 3.5350 (3.7661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (0.7687)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [36]  [1600/2502]  eta: 0:03:53  lr: 0.003470  min_lr: 0.003470  loss: 3.3521 (3.7633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7594 (0.7703)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [36]  [1800/2502]  eta: 0:03:01  lr: 0.003469  min_lr: 0.003469  loss: 3.5708 (3.7658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7576 (0.7667)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [36]  [2000/2502]  eta: 0:02:09  lr: 0.003469  min_lr: 0.003469  loss: 3.7885 (3.7755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7587 (0.7721)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [36]  [2200/2502]  eta: 0:01:17  lr: 0.003469  min_lr: 0.003469  loss: 3.4767 (3.7764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6648 (0.7682)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [36]  [2400/2502]  eta: 0:00:26  lr: 0.003468  min_lr: 0.003468  loss: 3.9274 (3.7733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6919 (0.7643)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [36]  [2501/2502]  eta: 0:00:00  lr: 0.003468  min_lr: 0.003468  loss: 4.0408 (3.7783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.7637)  time: 0.2299  data: 0.0009  max mem: 18975
Epoch: [36] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003468  min_lr: 0.003468  loss: 4.0408 (3.7599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.7637)
Test:  [ 0/50]  eta: 0:02:50  loss: 0.8501 (0.8501)  acc1: 85.6000 (85.6000)  acc5: 96.4000 (96.4000)  time: 3.4072  data: 3.2689  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.0596 (1.1412)  acc1: 78.8000 (77.0182)  acc5: 94.4000 (94.2545)  time: 0.4993  data: 0.3777  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.2109 (1.1808)  acc1: 74.8000 (76.5333)  acc5: 94.4000 (94.4952)  time: 0.2150  data: 0.0954  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3820 (1.3712)  acc1: 71.6000 (73.0839)  acc5: 91.2000 (92.0645)  time: 0.2639  data: 0.1433  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8078 (1.4786)  acc1: 62.8000 (70.6244)  acc5: 85.6000 (90.3415)  time: 0.2792  data: 0.1586  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7895 (1.4937)  acc1: 63.6000 (70.1520)  acc5: 85.6000 (90.1680)  time: 0.2098  data: 0.0907  max mem: 18975
Test: Total time: 0:00:14 (0.2986 s / it)
* Acc@1 70.170 Acc@5 90.220 loss 1.491
Accuracy of the model on the 50000 test images: 70.2%
Max accuracy: 70.17%
Epoch: [37]  [   0/2502]  eta: 1:20:56  lr: 0.003468  min_lr: 0.003468  loss: 3.9528 (3.9528)  weight_decay: 0.0500 (0.0500)  time: 1.9410  data: 1.6681  max mem: 18975
Epoch: [37]  [ 200/2502]  eta: 0:10:16  lr: 0.003468  min_lr: 0.003468  loss: 3.7599 (3.7841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6629 (0.7219)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [37]  [ 400/2502]  eta: 0:09:12  lr: 0.003468  min_lr: 0.003468  loss: 3.2170 (3.7264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7165 (0.7503)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [37]  [ 600/2502]  eta: 0:08:16  lr: 0.003467  min_lr: 0.003467  loss: 3.4756 (3.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7109 (0.7404)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [37]  [ 800/2502]  eta: 0:07:22  lr: 0.003467  min_lr: 0.003467  loss: 3.9207 (3.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6145 (0.7298)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [37]  [1000/2502]  eta: 0:06:29  lr: 0.003467  min_lr: 0.003467  loss: 3.6662 (3.7566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8941 (0.7463)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [37]  [1200/2502]  eta: 0:05:37  lr: 0.003466  min_lr: 0.003466  loss: 3.4474 (3.7524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7006 (0.7409)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [37]  [1400/2502]  eta: 0:04:45  lr: 0.003466  min_lr: 0.003466  loss: 4.0811 (3.7488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7651 (0.7434)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [37]  [1600/2502]  eta: 0:03:53  lr: 0.003466  min_lr: 0.003466  loss: 3.7443 (3.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6739 (0.7439)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [37]  [1800/2502]  eta: 0:03:01  lr: 0.003466  min_lr: 0.003466  loss: 4.0848 (3.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [37]  [2000/2502]  eta: 0:02:09  lr: 0.003465  min_lr: 0.003465  loss: 3.3919 (3.7451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7260 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [37]  [2200/2502]  eta: 0:01:17  lr: 0.003465  min_lr: 0.003465  loss: 4.2117 (3.7531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6961 (inf)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [37]  [2400/2502]  eta: 0:00:26  lr: 0.003465  min_lr: 0.003465  loss: 3.6861 (3.7498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [37]  [2501/2502]  eta: 0:00:00  lr: 0.003464  min_lr: 0.003464  loss: 3.7821 (3.7569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (inf)  time: 0.2310  data: 0.0008  max mem: 18975
Epoch: [37] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.003464  min_lr: 0.003464  loss: 3.7821 (3.7463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (inf)
Test:  [ 0/50]  eta: 0:02:49  loss: 0.8108 (0.8108)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 3.3929  data: 3.2519  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.1217 (1.1704)  acc1: 78.0000 (77.7455)  acc5: 94.8000 (94.4727)  time: 0.5403  data: 0.4163  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1236 (1.1762)  acc1: 75.2000 (76.4191)  acc5: 94.4000 (94.3810)  time: 0.2486  data: 0.1275  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3686 (1.3376)  acc1: 69.6000 (72.9806)  acc5: 90.0000 (92.0387)  time: 0.2660  data: 0.1465  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7523 (1.4363)  acc1: 63.6000 (70.6732)  acc5: 85.6000 (90.6342)  time: 0.2338  data: 0.1148  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7339 (1.4609)  acc1: 63.6000 (70.1520)  acc5: 86.0000 (90.4400)  time: 0.2049  data: 0.0859  max mem: 18975
Test: Total time: 0:00:14 (0.2931 s / it)
* Acc@1 70.400 Acc@5 90.252 loss 1.459
Accuracy of the model on the 50000 test images: 70.4%
Max accuracy: 70.40%
Epoch: [38]  [   0/2502]  eta: 1:19:06  lr: 0.003464  min_lr: 0.003464  loss: 4.0179 (4.0179)  weight_decay: 0.0500 (0.0500)  time: 1.8971  data: 1.6343  max mem: 18975
Epoch: [38]  [ 200/2502]  eta: 0:10:23  lr: 0.003464  min_lr: 0.003464  loss: 3.5432 (3.6859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7214 (0.7393)  time: 0.2639  data: 0.0003  max mem: 18975
Epoch: [38]  [ 400/2502]  eta: 0:09:15  lr: 0.003464  min_lr: 0.003464  loss: 3.1335 (3.7110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.7264)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [38]  [ 600/2502]  eta: 0:08:18  lr: 0.003463  min_lr: 0.003463  loss: 3.5533 (3.7170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.7451)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [38]  [ 800/2502]  eta: 0:07:23  lr: 0.003463  min_lr: 0.003463  loss: 3.9487 (3.7372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6898 (0.7464)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [38]  [1000/2502]  eta: 0:06:30  lr: 0.003463  min_lr: 0.003463  loss: 4.0487 (3.7329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (0.7461)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [38]  [1200/2502]  eta: 0:05:37  lr: 0.003463  min_lr: 0.003463  loss: 4.3963 (3.7350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (0.7417)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [38]  [1400/2502]  eta: 0:04:45  lr: 0.003462  min_lr: 0.003462  loss: 3.8351 (3.7291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.7392)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [38]  [1600/2502]  eta: 0:03:53  lr: 0.003462  min_lr: 0.003462  loss: 4.2048 (3.7287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8050 (0.7433)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [38]  [1800/2502]  eta: 0:03:01  lr: 0.003462  min_lr: 0.003462  loss: 3.7439 (3.7212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (0.7415)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [38]  [2000/2502]  eta: 0:02:09  lr: 0.003461  min_lr: 0.003461  loss: 3.9261 (3.7182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7716 (0.7458)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [38]  [2200/2502]  eta: 0:01:18  lr: 0.003461  min_lr: 0.003461  loss: 4.2206 (3.7213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7568 (0.7501)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [38]  [2400/2502]  eta: 0:00:26  lr: 0.003461  min_lr: 0.003461  loss: 3.7560 (3.7220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7533 (0.7523)  time: 0.2569  data: 0.0005  max mem: 18975
Epoch: [38]  [2501/2502]  eta: 0:00:00  lr: 0.003460  min_lr: 0.003460  loss: 4.0287 (3.7208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.7510)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [38] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003460  min_lr: 0.003460  loss: 4.0287 (3.7416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.7510)
Test:  [ 0/50]  eta: 0:02:41  loss: 0.8472 (0.8472)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 3.2262  data: 3.0857  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.1939 (1.1729)  acc1: 78.0000 (77.1273)  acc5: 94.4000 (94.1091)  time: 0.5265  data: 0.4054  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1691 (1.1648)  acc1: 75.6000 (76.3619)  acc5: 94.4000 (94.6095)  time: 0.2421  data: 0.1204  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3962 (1.3413)  acc1: 70.8000 (72.9419)  acc5: 91.2000 (92.0258)  time: 0.2514  data: 0.1298  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.8323 (1.4572)  acc1: 64.0000 (70.4293)  acc5: 85.2000 (90.5756)  time: 0.2615  data: 0.1423  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7265 (1.4821)  acc1: 64.0000 (69.9280)  acc5: 86.0000 (90.3040)  time: 0.2008  data: 0.0813  max mem: 18975
Test: Total time: 0:00:14 (0.2963 s / it)
* Acc@1 69.946 Acc@5 90.232 loss 1.488
Accuracy of the model on the 50000 test images: 69.9%
Max accuracy: 70.40%
Epoch: [39]  [   0/2502]  eta: 1:33:33  lr: 0.003460  min_lr: 0.003460  loss: 4.8359 (4.8359)  weight_decay: 0.0500 (0.0500)  time: 2.2436  data: 1.9632  max mem: 18975
Epoch: [39]  [ 200/2502]  eta: 0:10:21  lr: 0.003460  min_lr: 0.003460  loss: 3.9583 (3.7363)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [39]  [ 400/2502]  eta: 0:09:15  lr: 0.003460  min_lr: 0.003460  loss: 4.0774 (3.7405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7023 (nan)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [39]  [ 600/2502]  eta: 0:08:18  lr: 0.003459  min_lr: 0.003459  loss: 3.5227 (3.7316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (nan)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [39]  [ 800/2502]  eta: 0:07:23  lr: 0.003459  min_lr: 0.003459  loss: 3.7590 (3.7368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (nan)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [39]  [1000/2502]  eta: 0:06:30  lr: 0.003459  min_lr: 0.003459  loss: 3.4395 (3.7288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7297 (nan)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [39]  [1200/2502]  eta: 0:05:38  lr: 0.003458  min_lr: 0.003458  loss: 3.5004 (3.7381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (nan)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [39]  [1400/2502]  eta: 0:04:45  lr: 0.003458  min_lr: 0.003458  loss: 3.5588 (3.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6867 (nan)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [39]  [1600/2502]  eta: 0:03:53  lr: 0.003458  min_lr: 0.003458  loss: 3.4923 (3.7322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6890 (nan)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [39]  [1800/2502]  eta: 0:03:01  lr: 0.003457  min_lr: 0.003457  loss: 3.8600 (3.7463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6317 (nan)  time: 0.2604  data: 0.0004  max mem: 18975
Epoch: [39]  [2000/2502]  eta: 0:02:10  lr: 0.003457  min_lr: 0.003457  loss: 4.0499 (3.7488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (nan)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [39]  [2200/2502]  eta: 0:01:18  lr: 0.003457  min_lr: 0.003457  loss: 3.2753 (3.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7177 (nan)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [39]  [2400/2502]  eta: 0:00:26  lr: 0.003456  min_lr: 0.003456  loss: 3.1814 (3.7269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7358 (nan)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [39]  [2501/2502]  eta: 0:00:00  lr: 0.003456  min_lr: 0.003456  loss: 4.1684 (3.7292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7771 (nan)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [39] Total time: 0:10:46 (0.2586 s / it)
Averaged stats: lr: 0.003456  min_lr: 0.003456  loss: 4.1684 (3.7286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7771 (nan)
Test:  [ 0/50]  eta: 0:02:16  loss: 0.8958 (0.8958)  acc1: 86.8000 (86.8000)  acc5: 96.0000 (96.0000)  time: 2.7356  data: 2.5879  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.2442 (1.1877)  acc1: 78.8000 (77.4909)  acc5: 95.2000 (94.5818)  time: 0.4652  data: 0.3414  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.1946 (1.1868)  acc1: 75.6000 (77.3524)  acc5: 95.2000 (94.8381)  time: 0.2319  data: 0.1116  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3125 (1.3503)  acc1: 70.4000 (73.6516)  acc5: 90.8000 (92.3097)  time: 0.2349  data: 0.1152  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7365 (1.4487)  acc1: 64.8000 (71.2585)  acc5: 86.0000 (90.8488)  time: 0.2572  data: 0.1376  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7317 (1.4583)  acc1: 64.8000 (70.7680)  acc5: 86.0000 (90.6240)  time: 0.2211  data: 0.1022  max mem: 18975
Test: Total time: 0:00:14 (0.2842 s / it)
* Acc@1 70.868 Acc@5 90.484 loss 1.462
Accuracy of the model on the 50000 test images: 70.9%
Max accuracy: 70.87%
Epoch: [40]  [   0/2502]  eta: 1:13:35  lr: 0.003456  min_lr: 0.003456  loss: 3.5005 (3.5005)  weight_decay: 0.0500 (0.0500)  time: 1.7649  data: 1.4904  max mem: 18975
Epoch: [40]  [ 200/2502]  eta: 0:10:11  lr: 0.003456  min_lr: 0.003456  loss: 3.2395 (3.7353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.7468)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [40]  [ 400/2502]  eta: 0:09:09  lr: 0.003455  min_lr: 0.003455  loss: 2.9850 (3.6900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7529 (0.7448)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [40]  [ 600/2502]  eta: 0:08:15  lr: 0.003455  min_lr: 0.003455  loss: 3.6200 (3.6927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8903 (0.7746)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [40]  [ 800/2502]  eta: 0:07:21  lr: 0.003455  min_lr: 0.003455  loss: 3.3457 (3.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7321 (0.7621)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [40]  [1000/2502]  eta: 0:06:29  lr: 0.003454  min_lr: 0.003454  loss: 3.3028 (3.6998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.7667)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [40]  [1200/2502]  eta: 0:05:36  lr: 0.003454  min_lr: 0.003454  loss: 3.8226 (3.7042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7028 (0.7607)  time: 0.2554  data: 0.0004  max mem: 18975
Epoch: [40]  [1400/2502]  eta: 0:04:44  lr: 0.003454  min_lr: 0.003454  loss: 3.8839 (3.7082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7001 (0.7584)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [40]  [1600/2502]  eta: 0:03:52  lr: 0.003453  min_lr: 0.003453  loss: 3.5665 (3.7167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7346 (0.7541)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [40]  [1800/2502]  eta: 0:03:01  lr: 0.003453  min_lr: 0.003453  loss: 3.5654 (3.7064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.7527)  time: 0.2583  data: 0.0005  max mem: 18975
Epoch: [40]  [2000/2502]  eta: 0:02:09  lr: 0.003453  min_lr: 0.003453  loss: 3.5371 (3.7048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (0.7495)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [40]  [2200/2502]  eta: 0:01:17  lr: 0.003452  min_lr: 0.003452  loss: 3.7414 (3.7142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6944 (0.7534)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [40]  [2400/2502]  eta: 0:00:26  lr: 0.003452  min_lr: 0.003452  loss: 4.3855 (3.7214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6298 (0.7487)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [40]  [2501/2502]  eta: 0:00:00  lr: 0.003452  min_lr: 0.003452  loss: 4.2329 (3.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7462 (0.7506)  time: 0.2304  data: 0.0010  max mem: 18975
Epoch: [40] Total time: 0:10:44 (0.2578 s / it)
Averaged stats: lr: 0.003452  min_lr: 0.003452  loss: 4.2329 (3.7257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7462 (0.7506)
Test:  [ 0/50]  eta: 0:02:52  loss: 0.8453 (0.8453)  acc1: 89.2000 (89.2000)  acc5: 96.8000 (96.8000)  time: 3.4481  data: 3.3084  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.2701 (1.2017)  acc1: 76.0000 (77.4182)  acc5: 94.0000 (94.6182)  time: 0.5504  data: 0.4284  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.2506 (1.1935)  acc1: 76.0000 (76.8000)  acc5: 93.6000 (94.5714)  time: 0.2319  data: 0.1121  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3877 (1.3475)  acc1: 72.0000 (73.6645)  acc5: 90.4000 (92.1290)  time: 0.2289  data: 0.1096  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7197 (1.4451)  acc1: 66.0000 (71.5902)  acc5: 86.4000 (90.6634)  time: 0.2332  data: 0.1142  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7197 (1.4783)  acc1: 64.8000 (70.6240)  acc5: 86.0000 (90.4080)  time: 0.1952  data: 0.0756  max mem: 18975
Test: Total time: 0:00:14 (0.2908 s / it)
* Acc@1 70.690 Acc@5 90.542 loss 1.475
Accuracy of the model on the 50000 test images: 70.7%
Max accuracy: 70.87%
Epoch: [41]  [   0/2502]  eta: 1:32:27  lr: 0.003452  min_lr: 0.003452  loss: 4.2867 (4.2867)  weight_decay: 0.0500 (0.0500)  time: 2.2173  data: 1.3777  max mem: 18975
Epoch: [41]  [ 200/2502]  eta: 0:10:19  lr: 0.003451  min_lr: 0.003451  loss: 3.7826 (3.6676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7090 (0.7639)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [41]  [ 400/2502]  eta: 0:09:12  lr: 0.003451  min_lr: 0.003451  loss: 3.5692 (3.6926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7039 (0.7652)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [41]  [ 600/2502]  eta: 0:08:16  lr: 0.003451  min_lr: 0.003451  loss: 3.5356 (3.6852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7026 (0.7644)  time: 0.2628  data: 0.0004  max mem: 18975
Epoch: [41]  [ 800/2502]  eta: 0:07:22  lr: 0.003450  min_lr: 0.003450  loss: 3.7172 (3.6864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7087 (0.7533)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [41]  [1000/2502]  eta: 0:06:29  lr: 0.003450  min_lr: 0.003450  loss: 3.7812 (3.6978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6771 (0.7492)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [41]  [1200/2502]  eta: 0:05:37  lr: 0.003449  min_lr: 0.003449  loss: 4.2005 (3.6973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.7508)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [41]  [1400/2502]  eta: 0:04:45  lr: 0.003449  min_lr: 0.003449  loss: 4.1237 (3.7037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.7573)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [41]  [1600/2502]  eta: 0:03:53  lr: 0.003449  min_lr: 0.003449  loss: 3.7980 (3.7069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.7505)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [41]  [1800/2502]  eta: 0:03:01  lr: 0.003448  min_lr: 0.003448  loss: 4.0996 (3.7116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8032 (0.7523)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [41]  [2000/2502]  eta: 0:02:09  lr: 0.003448  min_lr: 0.003448  loss: 3.9989 (3.7179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6661 (0.7484)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [41]  [2200/2502]  eta: 0:01:17  lr: 0.003448  min_lr: 0.003448  loss: 3.5319 (3.7195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.7512)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [41]  [2400/2502]  eta: 0:00:26  lr: 0.003447  min_lr: 0.003447  loss: 4.1606 (3.7200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7345 (0.7491)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [41]  [2501/2502]  eta: 0:00:00  lr: 0.003447  min_lr: 0.003447  loss: 3.8131 (3.7200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6957 (0.7493)  time: 0.2324  data: 0.0009  max mem: 18975
Epoch: [41] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003447  min_lr: 0.003447  loss: 3.8131 (3.7175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6957 (0.7493)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.8109 (0.8109)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 3.2459  data: 3.1061  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.1616 (1.1709)  acc1: 80.0000 (78.0727)  acc5: 94.8000 (94.4364)  time: 0.5367  data: 0.4134  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1629 (1.1911)  acc1: 74.4000 (76.4191)  acc5: 94.8000 (94.5524)  time: 0.2501  data: 0.1296  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.4581 (1.3517)  acc1: 70.8000 (73.2000)  acc5: 90.8000 (92.4903)  time: 0.2551  data: 0.1359  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7575 (1.4469)  acc1: 64.0000 (70.7902)  acc5: 86.0000 (91.1317)  time: 0.2577  data: 0.1386  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7575 (1.4600)  acc1: 64.8000 (70.4320)  acc5: 86.0000 (90.8000)  time: 0.2021  data: 0.0830  max mem: 18975
Test: Total time: 0:00:15 (0.3006 s / it)
* Acc@1 70.548 Acc@5 90.690 loss 1.463
Accuracy of the model on the 50000 test images: 70.5%
Max accuracy: 70.87%
Epoch: [42]  [   0/2502]  eta: 1:30:17  lr: 0.003447  min_lr: 0.003447  loss: 3.2548 (3.2548)  weight_decay: 0.0500 (0.0500)  time: 2.1651  data: 1.5355  max mem: 18975
Epoch: [42]  [ 200/2502]  eta: 0:10:16  lr: 0.003447  min_lr: 0.003447  loss: 3.5934 (3.6659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (0.7103)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [42]  [ 400/2502]  eta: 0:09:11  lr: 0.003446  min_lr: 0.003446  loss: 3.8758 (3.6696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.7351)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [42]  [ 600/2502]  eta: 0:08:15  lr: 0.003446  min_lr: 0.003446  loss: 3.7313 (3.6740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.7315)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [42]  [ 800/2502]  eta: 0:07:21  lr: 0.003445  min_lr: 0.003445  loss: 3.9844 (3.6816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8579 (0.7428)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [42]  [1000/2502]  eta: 0:06:29  lr: 0.003445  min_lr: 0.003445  loss: 3.4554 (3.6811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.7518)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [42]  [1200/2502]  eta: 0:05:36  lr: 0.003445  min_lr: 0.003445  loss: 3.4261 (3.6825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (0.7575)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [42]  [1400/2502]  eta: 0:04:44  lr: 0.003444  min_lr: 0.003444  loss: 3.4208 (3.6946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6483 (0.7556)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [42]  [1600/2502]  eta: 0:03:52  lr: 0.003444  min_lr: 0.003444  loss: 3.8790 (3.6948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.7568)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [42]  [1800/2502]  eta: 0:03:01  lr: 0.003443  min_lr: 0.003443  loss: 3.9051 (3.6935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6392 (0.7563)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [42]  [2000/2502]  eta: 0:02:09  lr: 0.003443  min_lr: 0.003443  loss: 3.1070 (3.6904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6876 (0.7546)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [42]  [2200/2502]  eta: 0:01:17  lr: 0.003443  min_lr: 0.003443  loss: 3.1295 (3.6925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6867 (0.7497)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [42]  [2400/2502]  eta: 0:00:26  lr: 0.003442  min_lr: 0.003442  loss: 3.9618 (3.6910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8030 (0.7527)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [42]  [2501/2502]  eta: 0:00:00  lr: 0.003442  min_lr: 0.003442  loss: 4.0947 (3.6912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7481 (0.7548)  time: 0.2317  data: 0.0009  max mem: 18975
Epoch: [42] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003442  min_lr: 0.003442  loss: 4.0947 (3.7111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7481 (0.7548)
Test:  [ 0/50]  eta: 0:02:34  loss: 0.8169 (0.8169)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 3.0868  data: 2.9418  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 1.0590 (1.1253)  acc1: 80.8000 (79.6727)  acc5: 94.4000 (94.6545)  time: 0.4392  data: 0.3166  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1812 (1.1720)  acc1: 76.4000 (78.0191)  acc5: 94.4000 (94.4952)  time: 0.2322  data: 0.1120  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.4198 (1.3340)  acc1: 72.4000 (74.4903)  acc5: 90.0000 (92.0903)  time: 0.2925  data: 0.1730  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7396 (1.4325)  acc1: 66.8000 (72.1951)  acc5: 85.6000 (90.6146)  time: 0.2755  data: 0.1559  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7041 (1.4513)  acc1: 66.4000 (71.6480)  acc5: 86.8000 (90.4720)  time: 0.2105  data: 0.0909  max mem: 18975
Test: Total time: 0:00:14 (0.2976 s / it)
* Acc@1 71.392 Acc@5 90.702 loss 1.447
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.39%
Epoch: [43]  [   0/2502]  eta: 1:18:59  lr: 0.003442  min_lr: 0.003442  loss: 4.1333 (4.1333)  weight_decay: 0.0500 (0.0500)  time: 1.8942  data: 1.4471  max mem: 18975
Epoch: [43]  [ 200/2502]  eta: 0:10:18  lr: 0.003442  min_lr: 0.003442  loss: 3.8342 (3.6920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (0.7763)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [43]  [ 400/2502]  eta: 0:09:12  lr: 0.003441  min_lr: 0.003441  loss: 3.6689 (3.7032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6820 (0.7477)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [43]  [ 600/2502]  eta: 0:08:16  lr: 0.003441  min_lr: 0.003441  loss: 3.8052 (3.6970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7018 (0.7423)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [43]  [ 800/2502]  eta: 0:07:22  lr: 0.003440  min_lr: 0.003440  loss: 2.9573 (3.6929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6819 (0.7465)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [43]  [1000/2502]  eta: 0:06:29  lr: 0.003440  min_lr: 0.003440  loss: 4.2344 (3.6966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (0.7575)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [43]  [1200/2502]  eta: 0:05:37  lr: 0.003440  min_lr: 0.003440  loss: 3.3122 (3.6919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (0.7594)  time: 0.2661  data: 0.0004  max mem: 18975
Epoch: [43]  [1400/2502]  eta: 0:04:45  lr: 0.003439  min_lr: 0.003439  loss: 3.4232 (3.6883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7654 (0.7588)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [43]  [1600/2502]  eta: 0:03:53  lr: 0.003439  min_lr: 0.003439  loss: 3.5697 (3.6834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.7552)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [43]  [1800/2502]  eta: 0:03:01  lr: 0.003438  min_lr: 0.003438  loss: 3.7835 (3.6926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.7487)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [43]  [2000/2502]  eta: 0:02:09  lr: 0.003438  min_lr: 0.003438  loss: 3.1594 (3.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6583 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [43]  [2200/2502]  eta: 0:01:18  lr: 0.003438  min_lr: 0.003438  loss: 3.0844 (3.6880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (nan)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [43]  [2400/2502]  eta: 0:00:26  lr: 0.003437  min_lr: 0.003437  loss: 3.7029 (3.6920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7148 (nan)  time: 0.2586  data: 0.0004  max mem: 18975
Epoch: [43]  [2501/2502]  eta: 0:00:00  lr: 0.003437  min_lr: 0.003437  loss: 3.7910 (3.6915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6804 (nan)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [43] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003437  min_lr: 0.003437  loss: 3.7910 (3.6978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6804 (nan)
Test:  [ 0/50]  eta: 0:02:30  loss: 0.7937 (0.7937)  acc1: 85.6000 (85.6000)  acc5: 95.2000 (95.2000)  time: 3.0161  data: 2.8628  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.9877 (1.0449)  acc1: 76.8000 (79.0182)  acc5: 95.2000 (94.8364)  time: 0.4660  data: 0.3437  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0570 (1.0752)  acc1: 75.2000 (77.8667)  acc5: 95.2000 (95.0286)  time: 0.2486  data: 0.1295  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2374 (1.2374)  acc1: 72.4000 (74.9161)  acc5: 91.6000 (92.5548)  time: 0.2803  data: 0.1612  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6674 (1.3339)  acc1: 65.2000 (72.3610)  acc5: 86.0000 (91.1415)  time: 0.2558  data: 0.1368  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6283 (1.3648)  acc1: 65.6000 (71.7920)  acc5: 86.4000 (90.8320)  time: 0.1984  data: 0.0795  max mem: 18975
Test: Total time: 0:00:14 (0.2934 s / it)
* Acc@1 71.406 Acc@5 90.764 loss 1.371
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.41%
Epoch: [44]  [   0/2502]  eta: 1:17:52  lr: 0.003437  min_lr: 0.003437  loss: 2.5779 (2.5779)  weight_decay: 0.0500 (0.0500)  time: 1.8674  data: 1.6060  max mem: 18975
Epoch: [44]  [ 200/2502]  eta: 0:10:16  lr: 0.003437  min_lr: 0.003437  loss: 3.4419 (3.6226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7859 (0.7624)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [44]  [ 400/2502]  eta: 0:09:11  lr: 0.003436  min_lr: 0.003436  loss: 3.0870 (3.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.7564)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [44]  [ 600/2502]  eta: 0:08:15  lr: 0.003436  min_lr: 0.003436  loss: 3.7599 (3.6174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6339 (0.7425)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [44]  [ 800/2502]  eta: 0:07:22  lr: 0.003435  min_lr: 0.003435  loss: 3.6924 (3.6330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7496 (0.7380)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [44]  [1000/2502]  eta: 0:06:29  lr: 0.003435  min_lr: 0.003435  loss: 3.7756 (3.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6340 (0.7418)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [44]  [1200/2502]  eta: 0:05:37  lr: 0.003434  min_lr: 0.003434  loss: 4.0554 (3.6572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.7414)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [44]  [1400/2502]  eta: 0:04:45  lr: 0.003434  min_lr: 0.003434  loss: 3.1064 (3.6499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6966 (0.7406)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [44]  [1600/2502]  eta: 0:03:53  lr: 0.003434  min_lr: 0.003434  loss: 4.0241 (3.6562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6765 (0.7372)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [44]  [1800/2502]  eta: 0:03:01  lr: 0.003433  min_lr: 0.003433  loss: 3.8272 (3.6650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.7398)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [44]  [2000/2502]  eta: 0:02:09  lr: 0.003433  min_lr: 0.003433  loss: 3.6348 (3.6733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.7420)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [44]  [2200/2502]  eta: 0:01:17  lr: 0.003432  min_lr: 0.003432  loss: 4.2331 (3.6750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8307 (0.7477)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [44]  [2400/2502]  eta: 0:00:26  lr: 0.003432  min_lr: 0.003432  loss: 3.8902 (3.6821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6916 (0.7518)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [44]  [2501/2502]  eta: 0:00:00  lr: 0.003432  min_lr: 0.003432  loss: 4.0260 (3.6899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7513)  time: 0.2309  data: 0.0008  max mem: 18975
Epoch: [44] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003432  min_lr: 0.003432  loss: 4.0260 (3.6989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7513)
Test:  [ 0/50]  eta: 0:03:02  loss: 0.9028 (0.9028)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 3.6452  data: 3.5066  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.1397 (1.1784)  acc1: 79.6000 (79.2727)  acc5: 96.0000 (95.0182)  time: 0.5968  data: 0.4743  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.1635 (1.2082)  acc1: 76.8000 (78.0571)  acc5: 95.2000 (94.9905)  time: 0.2872  data: 0.1672  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3639 (1.3583)  acc1: 72.8000 (74.7097)  acc5: 91.2000 (92.6323)  time: 0.2555  data: 0.1363  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7626 (1.4464)  acc1: 65.6000 (72.2927)  acc5: 86.8000 (91.3171)  time: 0.2057  data: 0.0859  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7589 (1.4695)  acc1: 64.8000 (71.5360)  acc5: 86.8000 (90.9680)  time: 0.2071  data: 0.0858  max mem: 18975
Test: Total time: 0:00:14 (0.2944 s / it)
* Acc@1 71.444 Acc@5 91.016 loss 1.470
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.44%
Epoch: [45]  [   0/2502]  eta: 1:34:57  lr: 0.003432  min_lr: 0.003432  loss: 3.6995 (3.6995)  weight_decay: 0.0500 (0.0500)  time: 2.2771  data: 1.9974  max mem: 18975
Epoch: [45]  [ 200/2502]  eta: 0:10:19  lr: 0.003431  min_lr: 0.003431  loss: 4.0977 (3.6496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7521 (0.7657)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [45]  [ 400/2502]  eta: 0:09:12  lr: 0.003431  min_lr: 0.003431  loss: 4.0686 (3.6569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7807)  time: 0.2581  data: 0.0003  max mem: 18975
Epoch: [45]  [ 600/2502]  eta: 0:08:16  lr: 0.003430  min_lr: 0.003430  loss: 3.1715 (3.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.7702)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [45]  [ 800/2502]  eta: 0:07:22  lr: 0.003430  min_lr: 0.003430  loss: 3.4965 (3.6668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7106 (0.7760)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [45]  [1000/2502]  eta: 0:06:29  lr: 0.003429  min_lr: 0.003429  loss: 4.0070 (3.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.7751)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [45]  [1200/2502]  eta: 0:05:37  lr: 0.003429  min_lr: 0.003429  loss: 4.0197 (3.6752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6898 (0.7722)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [45]  [1400/2502]  eta: 0:04:45  lr: 0.003429  min_lr: 0.003429  loss: 3.7992 (3.6858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6598 (0.7691)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [45]  [1600/2502]  eta: 0:03:53  lr: 0.003428  min_lr: 0.003428  loss: 3.6679 (3.6843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7024 (0.7642)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [45]  [1800/2502]  eta: 0:03:01  lr: 0.003428  min_lr: 0.003428  loss: 3.8823 (3.6832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7248 (0.7663)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [45]  [2000/2502]  eta: 0:02:09  lr: 0.003427  min_lr: 0.003427  loss: 3.2479 (3.6783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6339 (0.7617)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [45]  [2200/2502]  eta: 0:01:17  lr: 0.003427  min_lr: 0.003427  loss: 3.6305 (3.6799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6956 (0.7622)  time: 0.2616  data: 0.0004  max mem: 18975
Epoch: [45]  [2400/2502]  eta: 0:00:26  lr: 0.003426  min_lr: 0.003426  loss: 4.0003 (3.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.7627)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [45]  [2501/2502]  eta: 0:00:00  lr: 0.003426  min_lr: 0.003426  loss: 4.1063 (3.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.7622)  time: 0.2315  data: 0.0009  max mem: 18975
Epoch: [45] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003426  min_lr: 0.003426  loss: 4.1063 (3.6797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.7622)
Test:  [ 0/50]  eta: 0:02:29  loss: 0.8623 (0.8623)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 2.9829  data: 2.8404  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.1271 (1.1610)  acc1: 78.8000 (78.4364)  acc5: 94.4000 (94.1091)  time: 0.5784  data: 0.4564  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.1424 (1.1686)  acc1: 76.4000 (77.7333)  acc5: 94.4000 (94.5714)  time: 0.3069  data: 0.1873  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3070 (1.3151)  acc1: 73.2000 (74.1806)  acc5: 92.4000 (92.5290)  time: 0.2567  data: 0.1359  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7081 (1.4107)  acc1: 66.0000 (71.9122)  acc5: 86.8000 (91.2585)  time: 0.2296  data: 0.1091  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6900 (1.4360)  acc1: 66.0000 (71.2480)  acc5: 88.0000 (91.0320)  time: 0.2141  data: 0.0950  max mem: 18975
Test: Total time: 0:00:15 (0.3072 s / it)
* Acc@1 71.266 Acc@5 91.028 loss 1.435
Accuracy of the model on the 50000 test images: 71.3%
Max accuracy: 71.44%
Epoch: [46]  [   0/2502]  eta: 1:34:44  lr: 0.003426  min_lr: 0.003426  loss: 3.8338 (3.8338)  weight_decay: 0.0500 (0.0500)  time: 2.2720  data: 1.4365  max mem: 18975
Epoch: [46]  [ 200/2502]  eta: 0:10:22  lr: 0.003426  min_lr: 0.003426  loss: 3.5051 (3.6211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.7917)  time: 0.2579  data: 0.0005  max mem: 18975
Epoch: [46]  [ 400/2502]  eta: 0:09:14  lr: 0.003425  min_lr: 0.003425  loss: 3.3710 (3.6043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7185 (0.7624)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [46]  [ 600/2502]  eta: 0:08:17  lr: 0.003425  min_lr: 0.003425  loss: 3.6162 (3.6155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7975 (0.7705)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [46]  [ 800/2502]  eta: 0:07:23  lr: 0.003424  min_lr: 0.003424  loss: 3.7206 (3.6163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7407 (0.7750)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [46]  [1000/2502]  eta: 0:06:30  lr: 0.003424  min_lr: 0.003424  loss: 4.2175 (3.6415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.7746)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [46]  [1200/2502]  eta: 0:05:37  lr: 0.003423  min_lr: 0.003423  loss: 3.4688 (3.6599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7140 (0.7741)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [46]  [1400/2502]  eta: 0:04:45  lr: 0.003423  min_lr: 0.003423  loss: 3.4478 (3.6606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7638 (0.7781)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [46]  [1600/2502]  eta: 0:03:53  lr: 0.003422  min_lr: 0.003422  loss: 3.9059 (3.6605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8080 (0.7795)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [46]  [1800/2502]  eta: 0:03:01  lr: 0.003422  min_lr: 0.003422  loss: 3.0061 (3.6656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.7778)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [46]  [2000/2502]  eta: 0:02:09  lr: 0.003422  min_lr: 0.003422  loss: 3.3326 (3.6625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7993 (0.7801)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [46]  [2200/2502]  eta: 0:01:18  lr: 0.003421  min_lr: 0.003421  loss: 3.7505 (3.6692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.7777)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [46]  [2400/2502]  eta: 0:00:26  lr: 0.003421  min_lr: 0.003421  loss: 3.1869 (3.6733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6573 (0.7764)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [46]  [2501/2502]  eta: 0:00:00  lr: 0.003420  min_lr: 0.003420  loss: 3.8174 (3.6699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7754)  time: 0.2309  data: 0.0008  max mem: 18975
Epoch: [46] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003420  min_lr: 0.003420  loss: 3.8174 (3.6838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7754)
Test:  [ 0/50]  eta: 0:03:14  loss: 0.7712 (0.7712)  acc1: 88.4000 (88.4000)  acc5: 96.8000 (96.8000)  time: 3.8967  data: 3.7457  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.0888 (1.0915)  acc1: 76.8000 (77.8182)  acc5: 95.2000 (95.0909)  time: 0.6102  data: 0.4880  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.0888 (1.0971)  acc1: 76.8000 (77.5429)  acc5: 95.2000 (95.1238)  time: 0.2869  data: 0.1667  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2432 (1.2457)  acc1: 71.6000 (74.5290)  acc5: 91.2000 (92.9936)  time: 0.2782  data: 0.1573  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6419 (1.3508)  acc1: 65.6000 (72.2244)  acc5: 87.2000 (91.5610)  time: 0.2271  data: 0.1064  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6349 (1.3735)  acc1: 65.2000 (71.5280)  acc5: 87.6000 (91.3600)  time: 0.2262  data: 0.1063  max mem: 18975
Test: Total time: 0:00:15 (0.3073 s / it)
* Acc@1 71.702 Acc@5 91.320 loss 1.371
Accuracy of the model on the 50000 test images: 71.7%
Max accuracy: 71.70%
Epoch: [47]  [   0/2502]  eta: 1:40:38  lr: 0.003420  min_lr: 0.003420  loss: 2.4602 (2.4602)  weight_decay: 0.0500 (0.0500)  time: 2.4134  data: 2.1533  max mem: 18975
Epoch: [47]  [ 200/2502]  eta: 0:10:23  lr: 0.003420  min_lr: 0.003420  loss: 3.7859 (3.7130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (0.7851)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [47]  [ 400/2502]  eta: 0:09:15  lr: 0.003419  min_lr: 0.003419  loss: 3.9770 (3.6953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7037 (0.7706)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [47]  [ 600/2502]  eta: 0:08:17  lr: 0.003419  min_lr: 0.003419  loss: 3.5075 (3.6767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7429 (0.7732)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [47]  [ 800/2502]  eta: 0:07:23  lr: 0.003418  min_lr: 0.003418  loss: 3.7726 (3.6863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7530 (0.7799)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [47]  [1000/2502]  eta: 0:06:30  lr: 0.003418  min_lr: 0.003418  loss: 3.6392 (3.6915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (0.7669)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [47]  [1200/2502]  eta: 0:05:38  lr: 0.003417  min_lr: 0.003417  loss: 4.1219 (3.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (0.7646)  time: 0.2588  data: 0.0004  max mem: 18975
Epoch: [47]  [1400/2502]  eta: 0:04:45  lr: 0.003417  min_lr: 0.003417  loss: 3.3816 (3.7027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7400 (0.7633)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [47]  [1600/2502]  eta: 0:03:53  lr: 0.003417  min_lr: 0.003417  loss: 4.1163 (3.6938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.7572)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [47]  [1800/2502]  eta: 0:03:01  lr: 0.003416  min_lr: 0.003416  loss: 4.0330 (3.6935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7345 (0.7552)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [47]  [2000/2502]  eta: 0:02:09  lr: 0.003416  min_lr: 0.003416  loss: 3.7442 (3.6889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6935 (0.7523)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [47]  [2200/2502]  eta: 0:01:18  lr: 0.003415  min_lr: 0.003415  loss: 3.8510 (3.6881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6866 (0.7517)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [47]  [2400/2502]  eta: 0:00:26  lr: 0.003415  min_lr: 0.003415  loss: 4.0747 (3.6910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7437 (0.7579)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [47]  [2501/2502]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.2650 (3.6868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7505 (0.7580)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [47] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.2650 (3.6810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7505 (0.7580)
Test:  [ 0/50]  eta: 0:02:18  loss: 0.7659 (0.7659)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 2.7709  data: 2.6319  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0065 (1.0338)  acc1: 80.0000 (79.4545)  acc5: 95.2000 (94.8727)  time: 0.4532  data: 0.3310  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0089 (1.0435)  acc1: 76.0000 (78.0571)  acc5: 94.8000 (94.9714)  time: 0.2456  data: 0.1259  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1691 (1.2027)  acc1: 71.2000 (74.7742)  acc5: 92.0000 (92.8258)  time: 0.2815  data: 0.1624  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5756 (1.2941)  acc1: 66.4000 (72.7610)  acc5: 87.2000 (91.5707)  time: 0.2645  data: 0.1448  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5756 (1.3262)  acc1: 66.8000 (72.0240)  acc5: 86.8000 (91.1760)  time: 0.2124  data: 0.0928  max mem: 18975
Test: Total time: 0:00:14 (0.2966 s / it)
* Acc@1 71.964 Acc@5 91.108 loss 1.328
Accuracy of the model on the 50000 test images: 72.0%
Max accuracy: 71.96%
Epoch: [48]  [   0/2502]  eta: 1:36:08  lr: 0.003414  min_lr: 0.003414  loss: 4.5271 (4.5271)  weight_decay: 0.0500 (0.0500)  time: 2.3057  data: 2.0299  max mem: 18975
Epoch: [48]  [ 200/2502]  eta: 0:10:22  lr: 0.003414  min_lr: 0.003414  loss: 3.9770 (3.6755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.7526)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [48]  [ 400/2502]  eta: 0:09:14  lr: 0.003413  min_lr: 0.003413  loss: 4.1475 (3.7199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.7657)  time: 0.2582  data: 0.0005  max mem: 18975
Epoch: [48]  [ 600/2502]  eta: 0:08:17  lr: 0.003413  min_lr: 0.003413  loss: 3.6158 (3.7139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7546 (0.7736)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [48]  [ 800/2502]  eta: 0:07:23  lr: 0.003412  min_lr: 0.003412  loss: 3.6967 (3.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (0.7626)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [48]  [1000/2502]  eta: 0:06:30  lr: 0.003412  min_lr: 0.003412  loss: 3.9148 (3.7103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7031 (0.7608)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [48]  [1200/2502]  eta: 0:05:37  lr: 0.003411  min_lr: 0.003411  loss: 4.1772 (3.7060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.7715)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [48]  [1400/2502]  eta: 0:04:45  lr: 0.003411  min_lr: 0.003411  loss: 4.0038 (3.7159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8088 (0.7670)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [48]  [1600/2502]  eta: 0:03:53  lr: 0.003410  min_lr: 0.003410  loss: 3.4910 (3.7162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6703 (0.7631)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [48]  [1800/2502]  eta: 0:03:01  lr: 0.003410  min_lr: 0.003410  loss: 3.8895 (3.7203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.7607)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [48]  [2000/2502]  eta: 0:02:09  lr: 0.003409  min_lr: 0.003409  loss: 3.8138 (3.7201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7025 (0.7638)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [48]  [2200/2502]  eta: 0:01:18  lr: 0.003409  min_lr: 0.003409  loss: 3.8371 (3.7112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6885 (0.7610)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [48]  [2400/2502]  eta: 0:00:26  lr: 0.003408  min_lr: 0.003408  loss: 3.6459 (3.6996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7708 (0.7630)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [48]  [2501/2502]  eta: 0:00:00  lr: 0.003408  min_lr: 0.003408  loss: 4.0590 (3.7007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7622)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [48] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003408  min_lr: 0.003408  loss: 4.0590 (3.6735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7622)
Test:  [ 0/50]  eta: 0:03:08  loss: 0.8733 (0.8733)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 3.7697  data: 3.6310  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.2323 (1.1752)  acc1: 80.0000 (79.2727)  acc5: 95.2000 (95.0546)  time: 0.5909  data: 0.4692  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.2061 (1.1929)  acc1: 76.0000 (78.1333)  acc5: 95.2000 (95.0476)  time: 0.2809  data: 0.1614  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3808 (1.3401)  acc1: 69.6000 (74.8645)  acc5: 91.6000 (92.9677)  time: 0.2788  data: 0.1598  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7080 (1.4342)  acc1: 66.4000 (72.5073)  acc5: 87.2000 (91.6293)  time: 0.2209  data: 0.1020  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.7076 (1.4535)  acc1: 67.2000 (71.9280)  acc5: 87.2000 (91.4240)  time: 0.2207  data: 0.1018  max mem: 18975
Test: Total time: 0:00:14 (0.2997 s / it)
* Acc@1 71.842 Acc@5 91.256 loss 1.452
Accuracy of the model on the 50000 test images: 71.8%
Max accuracy: 71.96%
Epoch: [49]  [   0/2502]  eta: 1:31:10  lr: 0.003408  min_lr: 0.003408  loss: 4.1536 (4.1536)  weight_decay: 0.0500 (0.0500)  time: 2.1863  data: 1.5641  max mem: 18975
Epoch: [49]  [ 200/2502]  eta: 0:10:18  lr: 0.003408  min_lr: 0.003408  loss: 3.8750 (3.7166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7258)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [49]  [ 400/2502]  eta: 0:09:14  lr: 0.003407  min_lr: 0.003407  loss: 3.7391 (3.6901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6861 (0.7139)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [49]  [ 600/2502]  eta: 0:08:17  lr: 0.003407  min_lr: 0.003407  loss: 4.0459 (3.6856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (0.7333)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [49]  [ 800/2502]  eta: 0:07:23  lr: 0.003406  min_lr: 0.003406  loss: 3.0120 (3.6817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7560 (0.7415)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [49]  [1000/2502]  eta: 0:06:30  lr: 0.003406  min_lr: 0.003406  loss: 3.6253 (3.6842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7379 (0.7451)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [49]  [1200/2502]  eta: 0:05:37  lr: 0.003405  min_lr: 0.003405  loss: 4.1389 (3.6825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7699 (0.7454)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [49]  [1400/2502]  eta: 0:04:45  lr: 0.003405  min_lr: 0.003405  loss: 4.0575 (3.6754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.7455)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [49]  [1600/2502]  eta: 0:03:53  lr: 0.003404  min_lr: 0.003404  loss: 4.0955 (3.6794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.7492)  time: 0.2587  data: 0.0003  max mem: 18975
Epoch: [49]  [1800/2502]  eta: 0:03:01  lr: 0.003404  min_lr: 0.003404  loss: 3.8929 (3.6755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.7500)  time: 0.2574  data: 0.0003  max mem: 18975
Epoch: [49]  [2000/2502]  eta: 0:02:09  lr: 0.003403  min_lr: 0.003403  loss: 3.7192 (3.6766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.7512)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [49]  [2200/2502]  eta: 0:01:18  lr: 0.003403  min_lr: 0.003403  loss: 3.8264 (3.6811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7689 (0.7510)  time: 0.2577  data: 0.0005  max mem: 18975
Epoch: [49]  [2400/2502]  eta: 0:00:26  lr: 0.003402  min_lr: 0.003402  loss: 3.3454 (3.6767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.7523)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [49]  [2501/2502]  eta: 0:00:00  lr: 0.003402  min_lr: 0.003402  loss: 3.3477 (3.6774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7523)  time: 0.2312  data: 0.0010  max mem: 18975
Epoch: [49] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003402  min_lr: 0.003402  loss: 3.3477 (3.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.7523)
Test:  [ 0/50]  eta: 0:02:50  loss: 0.8634 (0.8634)  acc1: 87.2000 (87.2000)  acc5: 96.8000 (96.8000)  time: 3.4061  data: 3.2650  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.1340 (1.1064)  acc1: 79.2000 (78.0727)  acc5: 95.2000 (94.5091)  time: 0.5509  data: 0.4237  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0558 (1.1035)  acc1: 76.0000 (77.5619)  acc5: 95.2000 (94.9524)  time: 0.2567  data: 0.1336  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2465 (1.2517)  acc1: 71.2000 (74.4387)  acc5: 91.2000 (92.6452)  time: 0.2255  data: 0.1058  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6283 (1.3346)  acc1: 66.8000 (72.5268)  acc5: 87.2000 (91.5317)  time: 0.2051  data: 0.0859  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6018 (1.3556)  acc1: 67.6000 (72.1120)  acc5: 88.0000 (91.3520)  time: 0.2095  data: 0.0904  max mem: 18975
Test: Total time: 0:00:14 (0.2888 s / it)
* Acc@1 71.868 Acc@5 91.196 loss 1.357
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.96%
Epoch: [50]  [   0/2502]  eta: 1:16:39  lr: 0.003402  min_lr: 0.003402  loss: 4.1210 (4.1210)  weight_decay: 0.0500 (0.0500)  time: 1.8382  data: 1.4041  max mem: 18975
Epoch: [50]  [ 200/2502]  eta: 0:10:17  lr: 0.003401  min_lr: 0.003401  loss: 3.3582 (3.6339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7625 (0.7577)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [50]  [ 400/2502]  eta: 0:09:11  lr: 0.003401  min_lr: 0.003401  loss: 3.8175 (3.6456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.7536)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [50]  [ 600/2502]  eta: 0:08:15  lr: 0.003400  min_lr: 0.003400  loss: 3.6884 (3.6342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [50]  [ 800/2502]  eta: 0:07:21  lr: 0.003400  min_lr: 0.003400  loss: 3.5681 (3.6601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (inf)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [50]  [1000/2502]  eta: 0:06:29  lr: 0.003399  min_lr: 0.003399  loss: 3.6268 (3.6374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7744 (inf)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [50]  [1200/2502]  eta: 0:05:36  lr: 0.003399  min_lr: 0.003399  loss: 3.5341 (3.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6865 (inf)  time: 0.2636  data: 0.0004  max mem: 18975
Epoch: [50]  [1400/2502]  eta: 0:04:44  lr: 0.003398  min_lr: 0.003398  loss: 3.2545 (3.6416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [50]  [1600/2502]  eta: 0:03:53  lr: 0.003398  min_lr: 0.003398  loss: 3.4678 (3.6464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7375 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [50]  [1800/2502]  eta: 0:03:01  lr: 0.003397  min_lr: 0.003397  loss: 4.0704 (3.6410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [50]  [2000/2502]  eta: 0:02:09  lr: 0.003397  min_lr: 0.003397  loss: 3.8329 (3.6478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7981 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [50]  [2200/2502]  eta: 0:01:17  lr: 0.003396  min_lr: 0.003396  loss: 4.0761 (3.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7391 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [50]  [2400/2502]  eta: 0:00:26  lr: 0.003396  min_lr: 0.003396  loss: 4.0701 (3.6514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [50]  [2501/2502]  eta: 0:00:00  lr: 0.003395  min_lr: 0.003395  loss: 3.8498 (3.6502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (inf)  time: 0.2309  data: 0.0008  max mem: 18975
Epoch: [50] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003395  min_lr: 0.003395  loss: 3.8498 (3.6578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (inf)
Test:  [ 0/50]  eta: 0:03:09  loss: 0.7804 (0.7804)  acc1: 86.8000 (86.8000)  acc5: 97.2000 (97.2000)  time: 3.7835  data: 3.6451  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 1.1115 (1.1488)  acc1: 80.4000 (78.3273)  acc5: 95.2000 (95.0909)  time: 0.5874  data: 0.4649  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.1115 (1.1246)  acc1: 76.8000 (77.8667)  acc5: 95.6000 (95.2571)  time: 0.2417  data: 0.1210  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2806 (1.2608)  acc1: 73.2000 (74.7484)  acc5: 90.8000 (93.0839)  time: 0.2092  data: 0.0894  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6460 (1.3554)  acc1: 66.4000 (72.4683)  acc5: 87.2000 (91.6585)  time: 0.2010  data: 0.0820  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5726 (1.3604)  acc1: 66.4000 (72.2000)  acc5: 87.6000 (91.4240)  time: 0.2034  data: 0.0833  max mem: 18975
Test: Total time: 0:00:13 (0.2777 s / it)
* Acc@1 72.108 Acc@5 91.414 loss 1.360
Accuracy of the model on the 50000 test images: 72.1%
Max accuracy: 72.11%
Epoch: [51]  [   0/2502]  eta: 1:10:51  lr: 0.003395  min_lr: 0.003395  loss: 2.5029 (2.5029)  weight_decay: 0.0500 (0.0500)  time: 1.6993  data: 1.4141  max mem: 18975
Epoch: [51]  [ 200/2502]  eta: 0:10:17  lr: 0.003395  min_lr: 0.003395  loss: 3.9349 (3.6743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7118 (0.7351)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [51]  [ 400/2502]  eta: 0:09:12  lr: 0.003394  min_lr: 0.003394  loss: 3.8739 (3.6619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6805 (0.7594)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [51]  [ 600/2502]  eta: 0:08:16  lr: 0.003394  min_lr: 0.003394  loss: 4.0189 (3.6498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.7522)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [51]  [ 800/2502]  eta: 0:07:22  lr: 0.003393  min_lr: 0.003393  loss: 3.8746 (3.6406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8131 (0.7653)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [51]  [1000/2502]  eta: 0:06:29  lr: 0.003393  min_lr: 0.003393  loss: 3.4795 (3.6543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7715 (0.7694)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [51]  [1200/2502]  eta: 0:05:37  lr: 0.003392  min_lr: 0.003392  loss: 3.2838 (3.6651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.7631)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [51]  [1400/2502]  eta: 0:04:45  lr: 0.003391  min_lr: 0.003391  loss: 3.8482 (3.6546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8149 (0.7660)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [51]  [1600/2502]  eta: 0:03:53  lr: 0.003391  min_lr: 0.003391  loss: 3.6064 (3.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.7673)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [51]  [1800/2502]  eta: 0:03:01  lr: 0.003390  min_lr: 0.003390  loss: 3.4755 (3.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7251 (0.7655)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [51]  [2000/2502]  eta: 0:02:09  lr: 0.003390  min_lr: 0.003390  loss: 3.7828 (3.6675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6987 (0.7645)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [51]  [2200/2502]  eta: 0:01:17  lr: 0.003389  min_lr: 0.003389  loss: 3.0219 (3.6715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6814 (0.7605)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [51]  [2400/2502]  eta: 0:00:26  lr: 0.003389  min_lr: 0.003389  loss: 3.7760 (3.6676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7969 (0.7595)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [51]  [2501/2502]  eta: 0:00:00  lr: 0.003388  min_lr: 0.003388  loss: 3.4667 (3.6675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7673 (0.7607)  time: 0.2345  data: 0.0009  max mem: 18975
Epoch: [51] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003388  min_lr: 0.003388  loss: 3.4667 (3.6550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7673 (0.7607)
Test:  [ 0/50]  eta: 0:03:06  loss: 0.7781 (0.7781)  acc1: 89.2000 (89.2000)  acc5: 96.8000 (96.8000)  time: 3.7216  data: 3.5728  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0603 (1.0542)  acc1: 78.0000 (78.8000)  acc5: 95.2000 (94.7636)  time: 0.5273  data: 0.4046  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0385 (1.0450)  acc1: 76.8000 (78.0952)  acc5: 95.2000 (95.0286)  time: 0.2487  data: 0.1290  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2545 (1.1867)  acc1: 73.6000 (74.8645)  acc5: 91.2000 (92.9936)  time: 0.2911  data: 0.1718  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5746 (1.2818)  acc1: 66.8000 (72.7805)  acc5: 87.6000 (91.6585)  time: 0.2424  data: 0.1232  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5795 (1.3016)  acc1: 66.4000 (72.3840)  acc5: 87.6000 (91.4240)  time: 0.2315  data: 0.1125  max mem: 18975
Test: Total time: 0:00:14 (0.2943 s / it)
* Acc@1 72.222 Acc@5 91.444 loss 1.307
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.22%
Epoch: [52]  [   0/2502]  eta: 1:27:38  lr: 0.003388  min_lr: 0.003388  loss: 2.6051 (2.6051)  weight_decay: 0.0500 (0.0500)  time: 2.1019  data: 1.8317  max mem: 18975
Epoch: [52]  [ 200/2502]  eta: 0:10:17  lr: 0.003388  min_lr: 0.003388  loss: 4.0104 (3.6592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7489)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [52]  [ 400/2502]  eta: 0:09:12  lr: 0.003387  min_lr: 0.003387  loss: 3.7491 (3.6372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7120 (0.7368)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [52]  [ 600/2502]  eta: 0:08:15  lr: 0.003387  min_lr: 0.003387  loss: 3.4524 (3.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7334 (0.7339)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [52]  [ 800/2502]  eta: 0:07:22  lr: 0.003386  min_lr: 0.003386  loss: 3.4634 (3.6357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6714 (0.7438)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [52]  [1000/2502]  eta: 0:06:29  lr: 0.003386  min_lr: 0.003386  loss: 3.5670 (3.6287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8124 (0.7573)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [52]  [1200/2502]  eta: 0:05:37  lr: 0.003385  min_lr: 0.003385  loss: 3.6537 (3.6476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7792 (0.7629)  time: 0.2639  data: 0.0003  max mem: 18975
Epoch: [52]  [1400/2502]  eta: 0:04:44  lr: 0.003385  min_lr: 0.003385  loss: 3.2499 (3.6412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.7679)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [52]  [1600/2502]  eta: 0:03:53  lr: 0.003384  min_lr: 0.003384  loss: 4.1094 (3.6441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8248 (0.7753)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [52]  [1800/2502]  eta: 0:03:01  lr: 0.003383  min_lr: 0.003383  loss: 3.6974 (3.6405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.7747)  time: 0.2555  data: 0.0004  max mem: 18975
Epoch: [52]  [2000/2502]  eta: 0:02:09  lr: 0.003383  min_lr: 0.003383  loss: 3.7023 (3.6494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6892 (0.7679)  time: 0.2580  data: 0.0003  max mem: 18975
Epoch: [52]  [2200/2502]  eta: 0:01:17  lr: 0.003382  min_lr: 0.003382  loss: 3.3745 (3.6413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.7695)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [52]  [2400/2502]  eta: 0:00:26  lr: 0.003382  min_lr: 0.003382  loss: 3.9100 (3.6374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7273 (0.7694)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [52]  [2501/2502]  eta: 0:00:00  lr: 0.003381  min_lr: 0.003381  loss: 3.1446 (3.6341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.7684)  time: 0.2309  data: 0.0007  max mem: 18975
Epoch: [52] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.003381  min_lr: 0.003381  loss: 3.1446 (3.6368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.7684)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.7349 (0.7349)  acc1: 88.4000 (88.4000)  acc5: 97.2000 (97.2000)  time: 2.7021  data: 2.5359  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.0897 (1.0663)  acc1: 80.0000 (79.2000)  acc5: 94.8000 (94.5455)  time: 0.4780  data: 0.3529  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0601 (1.0793)  acc1: 76.8000 (78.1905)  acc5: 94.8000 (94.7810)  time: 0.2323  data: 0.1111  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2930 (1.2224)  acc1: 72.8000 (75.1613)  acc5: 91.6000 (92.8129)  time: 0.2456  data: 0.1253  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6038 (1.3208)  acc1: 66.0000 (72.7024)  acc5: 87.6000 (91.6000)  time: 0.2820  data: 0.1620  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6038 (1.3458)  acc1: 66.0000 (72.1840)  acc5: 88.0000 (91.3920)  time: 0.2191  data: 0.0991  max mem: 18975
Test: Total time: 0:00:14 (0.2905 s / it)
* Acc@1 72.218 Acc@5 91.416 loss 1.344
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.22%
Epoch: [53]  [   0/2502]  eta: 1:35:47  lr: 0.003381  min_lr: 0.003381  loss: 2.9462 (2.9462)  weight_decay: 0.0500 (0.0500)  time: 2.2973  data: 2.0315  max mem: 18975
Epoch: [53]  [ 200/2502]  eta: 0:10:17  lr: 0.003381  min_lr: 0.003381  loss: 3.8436 (3.6472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6906 (0.7939)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [53]  [ 400/2502]  eta: 0:09:12  lr: 0.003380  min_lr: 0.003380  loss: 3.4509 (3.6415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7353 (0.8050)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [53]  [ 600/2502]  eta: 0:08:16  lr: 0.003380  min_lr: 0.003380  loss: 3.9130 (3.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7207 (0.7799)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [53]  [ 800/2502]  eta: 0:07:22  lr: 0.003379  min_lr: 0.003379  loss: 3.4085 (3.6539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.7793)  time: 0.2566  data: 0.0005  max mem: 18975
Epoch: [53]  [1000/2502]  eta: 0:06:30  lr: 0.003379  min_lr: 0.003379  loss: 3.8381 (3.6435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7324 (0.7810)  time: 0.2649  data: 0.0004  max mem: 18975
Epoch: [53]  [1200/2502]  eta: 0:05:37  lr: 0.003378  min_lr: 0.003378  loss: 3.9615 (3.6353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6921 (0.7840)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [53]  [1400/2502]  eta: 0:04:45  lr: 0.003377  min_lr: 0.003377  loss: 3.8608 (3.6426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [53]  [1600/2502]  eta: 0:03:53  lr: 0.003377  min_lr: 0.003377  loss: 4.0100 (3.6533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [53]  [1800/2502]  eta: 0:03:01  lr: 0.003376  min_lr: 0.003376  loss: 3.9631 (3.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7692 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [53]  [2000/2502]  eta: 0:02:09  lr: 0.003376  min_lr: 0.003376  loss: 3.8681 (3.6491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7387 (inf)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [53]  [2200/2502]  eta: 0:01:18  lr: 0.003375  min_lr: 0.003375  loss: 3.2905 (3.6434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6972 (inf)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [53]  [2400/2502]  eta: 0:00:26  lr: 0.003375  min_lr: 0.003375  loss: 3.9560 (3.6507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [53]  [2501/2502]  eta: 0:00:00  lr: 0.003374  min_lr: 0.003374  loss: 3.7417 (3.6533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8200 (inf)  time: 0.2309  data: 0.0007  max mem: 18975
Epoch: [53] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003374  min_lr: 0.003374  loss: 3.7417 (3.6458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8200 (inf)
Test:  [ 0/50]  eta: 0:02:21  loss: 0.9105 (0.9105)  acc1: 88.8000 (88.8000)  acc5: 96.8000 (96.8000)  time: 2.8329  data: 2.6827  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 1.0960 (1.1443)  acc1: 79.2000 (79.5636)  acc5: 95.6000 (94.7273)  time: 0.4311  data: 0.3087  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.1316 (1.1736)  acc1: 76.8000 (78.1143)  acc5: 95.2000 (94.9714)  time: 0.2237  data: 0.1044  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.4371 (1.3237)  acc1: 70.8000 (75.0839)  acc5: 92.0000 (92.8258)  time: 0.2786  data: 0.1594  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.7050 (1.4129)  acc1: 66.8000 (72.9951)  acc5: 87.2000 (91.5902)  time: 0.2762  data: 0.1553  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6554 (1.4374)  acc1: 68.0000 (72.3200)  acc5: 88.0000 (91.4000)  time: 0.2047  data: 0.0837  max mem: 18975
Test: Total time: 0:00:14 (0.2880 s / it)
* Acc@1 72.022 Acc@5 91.442 loss 1.445
Accuracy of the model on the 50000 test images: 72.0%
Max accuracy: 72.22%
Epoch: [54]  [   0/2502]  eta: 1:34:44  lr: 0.003374  min_lr: 0.003374  loss: 4.0755 (4.0755)  weight_decay: 0.0500 (0.0500)  time: 2.2718  data: 1.9216  max mem: 18975
Epoch: [54]  [ 200/2502]  eta: 0:10:20  lr: 0.003374  min_lr: 0.003374  loss: 3.4766 (3.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7764 (0.7674)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [54]  [ 400/2502]  eta: 0:09:13  lr: 0.003373  min_lr: 0.003373  loss: 3.8072 (3.6316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7880 (0.7933)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [54]  [ 600/2502]  eta: 0:08:16  lr: 0.003372  min_lr: 0.003372  loss: 3.6929 (3.6379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6938 (0.7775)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [54]  [ 800/2502]  eta: 0:07:22  lr: 0.003372  min_lr: 0.003372  loss: 4.0090 (3.6429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6864 (0.7530)  time: 0.2646  data: 0.0004  max mem: 18975
Epoch: [54]  [1000/2502]  eta: 0:06:29  lr: 0.003371  min_lr: 0.003371  loss: 3.5733 (3.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6962 (0.7613)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [54]  [1200/2502]  eta: 0:05:37  lr: 0.003371  min_lr: 0.003371  loss: 3.8773 (3.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.7598)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [54]  [1400/2502]  eta: 0:04:45  lr: 0.003370  min_lr: 0.003370  loss: 3.6013 (3.6371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (0.7620)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [54]  [1600/2502]  eta: 0:03:53  lr: 0.003370  min_lr: 0.003370  loss: 3.7647 (3.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8132 (0.7684)  time: 0.2565  data: 0.0005  max mem: 18975
Epoch: [54]  [1800/2502]  eta: 0:03:01  lr: 0.003369  min_lr: 0.003369  loss: 3.8258 (3.6486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6885 (0.7633)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [54]  [2000/2502]  eta: 0:02:09  lr: 0.003368  min_lr: 0.003368  loss: 4.0614 (3.6456)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7594 (0.7660)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [54]  [2200/2502]  eta: 0:01:17  lr: 0.003368  min_lr: 0.003368  loss: 3.0147 (3.6468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7025 (0.7631)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [54]  [2400/2502]  eta: 0:00:26  lr: 0.003367  min_lr: 0.003367  loss: 4.1398 (3.6509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (0.7617)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [54]  [2501/2502]  eta: 0:00:00  lr: 0.003367  min_lr: 0.003367  loss: 4.0011 (3.6514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.7596)  time: 0.2307  data: 0.0008  max mem: 18975
Epoch: [54] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003367  min_lr: 0.003367  loss: 4.0011 (3.6433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.7596)
Test:  [ 0/50]  eta: 0:02:23  loss: 0.8315 (0.8315)  acc1: 87.2000 (87.2000)  acc5: 96.8000 (96.8000)  time: 2.8704  data: 2.7139  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.1659 (1.1579)  acc1: 79.6000 (78.5818)  acc5: 94.4000 (94.8000)  time: 0.4824  data: 0.3578  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.1659 (1.1665)  acc1: 78.8000 (77.4286)  acc5: 94.4000 (94.6286)  time: 0.2327  data: 0.1121  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3960 (1.2966)  acc1: 70.8000 (74.4129)  acc5: 91.6000 (92.8258)  time: 0.2677  data: 0.1465  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6523 (1.3877)  acc1: 67.6000 (72.0781)  acc5: 86.8000 (91.3756)  time: 0.2994  data: 0.1782  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6282 (1.3976)  acc1: 67.6000 (71.7760)  acc5: 86.4000 (91.2720)  time: 0.2293  data: 0.1097  max mem: 18975
Test: Total time: 0:00:15 (0.3048 s / it)
* Acc@1 71.922 Acc@5 91.396 loss 1.398
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 72.22%
Epoch: [55]  [   0/2502]  eta: 1:32:15  lr: 0.003367  min_lr: 0.003367  loss: 2.9603 (2.9603)  weight_decay: 0.0500 (0.0500)  time: 2.2124  data: 1.3841  max mem: 18975
Epoch: [55]  [ 200/2502]  eta: 0:10:17  lr: 0.003366  min_lr: 0.003366  loss: 3.5739 (3.5578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.7673)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [55]  [ 400/2502]  eta: 0:09:12  lr: 0.003366  min_lr: 0.003366  loss: 3.6909 (3.5449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (0.7725)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [55]  [ 600/2502]  eta: 0:08:16  lr: 0.003365  min_lr: 0.003365  loss: 3.6673 (3.5783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7350 (0.7554)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [55]  [ 800/2502]  eta: 0:07:23  lr: 0.003364  min_lr: 0.003364  loss: 3.8715 (3.6020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6722 (0.7547)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [55]  [1000/2502]  eta: 0:06:30  lr: 0.003364  min_lr: 0.003364  loss: 4.0673 (3.6025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.7605)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [55]  [1200/2502]  eta: 0:05:37  lr: 0.003363  min_lr: 0.003363  loss: 2.7813 (3.6053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7081 (0.7554)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [55]  [1400/2502]  eta: 0:04:45  lr: 0.003363  min_lr: 0.003363  loss: 3.7288 (3.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (0.7533)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [55]  [1600/2502]  eta: 0:03:53  lr: 0.003362  min_lr: 0.003362  loss: 4.1354 (3.6207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.7538)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [55]  [1800/2502]  eta: 0:03:01  lr: 0.003361  min_lr: 0.003361  loss: 3.8730 (3.6192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7085 (0.7533)  time: 0.2621  data: 0.0004  max mem: 18975
Epoch: [55]  [2000/2502]  eta: 0:02:09  lr: 0.003361  min_lr: 0.003361  loss: 3.5115 (3.6154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7364 (0.7553)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [55]  [2200/2502]  eta: 0:01:17  lr: 0.003360  min_lr: 0.003360  loss: 3.7341 (3.6105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (0.7542)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [55]  [2400/2502]  eta: 0:00:26  lr: 0.003360  min_lr: 0.003360  loss: 3.9275 (3.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.7577)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [55]  [2501/2502]  eta: 0:00:00  lr: 0.003359  min_lr: 0.003359  loss: 4.0790 (3.6104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.7579)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [55] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003359  min_lr: 0.003359  loss: 4.0790 (3.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.7579)
Test:  [ 0/50]  eta: 0:02:28  loss: 0.7997 (0.7997)  acc1: 88.8000 (88.8000)  acc5: 96.8000 (96.8000)  time: 2.9739  data: 2.8303  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.1778 (1.1356)  acc1: 80.0000 (78.6182)  acc5: 95.6000 (95.1636)  time: 0.4625  data: 0.3412  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1241 (1.1325)  acc1: 78.0000 (78.0000)  acc5: 95.6000 (95.2191)  time: 0.2540  data: 0.1335  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2870 (1.2772)  acc1: 73.2000 (74.9419)  acc5: 91.2000 (93.2258)  time: 0.2863  data: 0.1659  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6754 (1.3737)  acc1: 66.4000 (72.9171)  acc5: 88.4000 (91.7756)  time: 0.2716  data: 0.1509  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6447 (1.3909)  acc1: 67.6000 (72.4560)  acc5: 87.6000 (91.4800)  time: 0.2087  data: 0.0882  max mem: 18975
Test: Total time: 0:00:14 (0.2993 s / it)
* Acc@1 72.418 Acc@5 91.380 loss 1.394
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.42%
Epoch: [56]  [   0/2502]  eta: 1:12:20  lr: 0.003359  min_lr: 0.003359  loss: 4.0031 (4.0031)  weight_decay: 0.0500 (0.0500)  time: 1.7346  data: 1.4589  max mem: 18975
Epoch: [56]  [ 200/2502]  eta: 0:10:15  lr: 0.003359  min_lr: 0.003359  loss: 3.6225 (3.6361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7246 (0.7104)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [56]  [ 400/2502]  eta: 0:09:10  lr: 0.003358  min_lr: 0.003358  loss: 3.3490 (3.6073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7084 (0.7242)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [56]  [ 600/2502]  eta: 0:08:15  lr: 0.003357  min_lr: 0.003357  loss: 3.7250 (3.6025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7394)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [56]  [ 800/2502]  eta: 0:07:22  lr: 0.003357  min_lr: 0.003357  loss: 3.6955 (3.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6814 (0.7374)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [56]  [1000/2502]  eta: 0:06:29  lr: 0.003356  min_lr: 0.003356  loss: 3.1174 (3.5937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.7446)  time: 0.2584  data: 0.0005  max mem: 18975
Epoch: [56]  [1200/2502]  eta: 0:05:37  lr: 0.003355  min_lr: 0.003355  loss: 3.7381 (3.5996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7568 (0.7516)  time: 0.2569  data: 0.0005  max mem: 18975
Epoch: [56]  [1400/2502]  eta: 0:04:45  lr: 0.003355  min_lr: 0.003355  loss: 3.4076 (3.6009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6424 (0.7552)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [56]  [1600/2502]  eta: 0:03:53  lr: 0.003354  min_lr: 0.003354  loss: 3.3708 (3.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7034 (0.7600)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [56]  [1800/2502]  eta: 0:03:01  lr: 0.003354  min_lr: 0.003354  loss: 3.6224 (3.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7831 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [56]  [2000/2502]  eta: 0:02:09  lr: 0.003353  min_lr: 0.003353  loss: 2.6962 (3.6070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6780 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [56]  [2200/2502]  eta: 0:01:18  lr: 0.003352  min_lr: 0.003352  loss: 3.8297 (3.6069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [56]  [2400/2502]  eta: 0:00:26  lr: 0.003352  min_lr: 0.003352  loss: 3.8820 (3.6079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7616 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [56]  [2501/2502]  eta: 0:00:00  lr: 0.003351  min_lr: 0.003351  loss: 3.6906 (3.6074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7276 (inf)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [56] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003351  min_lr: 0.003351  loss: 3.6906 (3.6240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7276 (inf)
Test:  [ 0/50]  eta: 0:02:29  loss: 0.7325 (0.7325)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 2.9902  data: 2.8469  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.1335 (1.1036)  acc1: 78.4000 (79.4909)  acc5: 95.2000 (95.4545)  time: 0.4795  data: 0.3580  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1269 (1.0997)  acc1: 77.6000 (78.6286)  acc5: 95.2000 (95.4286)  time: 0.2587  data: 0.1391  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3014 (1.2474)  acc1: 73.6000 (75.4581)  acc5: 92.4000 (93.3548)  time: 0.2907  data: 0.1710  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6204 (1.3384)  acc1: 67.2000 (73.3463)  acc5: 88.0000 (92.1268)  time: 0.2633  data: 0.1431  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5322 (1.3601)  acc1: 68.0000 (72.7120)  acc5: 88.4000 (91.9040)  time: 0.1964  data: 0.0765  max mem: 18975
Test: Total time: 0:00:14 (0.3000 s / it)
* Acc@1 72.534 Acc@5 91.718 loss 1.361
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.53%
Epoch: [57]  [   0/2502]  eta: 1:13:00  lr: 0.003351  min_lr: 0.003351  loss: 4.1599 (4.1599)  weight_decay: 0.0500 (0.0500)  time: 1.7509  data: 1.4673  max mem: 18975
Epoch: [57]  [ 200/2502]  eta: 0:10:17  lr: 0.003351  min_lr: 0.003351  loss: 3.0217 (3.5678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7448 (0.7716)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [57]  [ 400/2502]  eta: 0:09:11  lr: 0.003350  min_lr: 0.003350  loss: 4.1539 (3.6007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7508 (0.7769)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [57]  [ 600/2502]  eta: 0:08:15  lr: 0.003349  min_lr: 0.003349  loss: 3.0643 (3.5935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7987 (0.7850)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [57]  [ 800/2502]  eta: 0:07:22  lr: 0.003349  min_lr: 0.003349  loss: 3.2842 (3.5811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7071 (0.7662)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [57]  [1000/2502]  eta: 0:06:29  lr: 0.003348  min_lr: 0.003348  loss: 3.9952 (3.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7353 (0.7637)  time: 0.2580  data: 0.0003  max mem: 18975
Epoch: [57]  [1200/2502]  eta: 0:05:37  lr: 0.003348  min_lr: 0.003348  loss: 3.5723 (3.5878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7495 (0.7654)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [57]  [1400/2502]  eta: 0:04:45  lr: 0.003347  min_lr: 0.003347  loss: 3.6415 (3.5895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7661)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [57]  [1600/2502]  eta: 0:03:53  lr: 0.003346  min_lr: 0.003346  loss: 3.9091 (3.5984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7451 (0.7696)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [57]  [1800/2502]  eta: 0:03:01  lr: 0.003346  min_lr: 0.003346  loss: 3.8918 (3.6067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6577 (0.7649)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [57]  [2000/2502]  eta: 0:02:09  lr: 0.003345  min_lr: 0.003345  loss: 3.9233 (3.6142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7694 (0.7652)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [57]  [2200/2502]  eta: 0:01:17  lr: 0.003344  min_lr: 0.003344  loss: 3.7346 (3.6233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7434 (0.7641)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [57]  [2400/2502]  eta: 0:00:26  lr: 0.003344  min_lr: 0.003344  loss: 3.7418 (3.6266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7480 (0.7633)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [57]  [2501/2502]  eta: 0:00:00  lr: 0.003343  min_lr: 0.003343  loss: 3.5447 (3.6295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.7622)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [57] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003343  min_lr: 0.003343  loss: 3.5447 (3.6181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.7622)
Test:  [ 0/50]  eta: 0:03:14  loss: 0.8280 (0.8280)  acc1: 87.2000 (87.2000)  acc5: 96.8000 (96.8000)  time: 3.8812  data: 3.7427  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.0358 (1.0729)  acc1: 80.8000 (78.6909)  acc5: 95.6000 (95.0909)  time: 0.6216  data: 0.4993  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.0415 (1.0718)  acc1: 77.2000 (77.6952)  acc5: 96.0000 (95.2191)  time: 0.2655  data: 0.1451  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1909 (1.2261)  acc1: 71.2000 (74.8516)  acc5: 91.2000 (93.0452)  time: 0.2471  data: 0.1276  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5877 (1.3188)  acc1: 67.6000 (72.8488)  acc5: 87.6000 (91.8829)  time: 0.2187  data: 0.0998  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5650 (1.3423)  acc1: 67.6000 (72.3600)  acc5: 88.0000 (91.7440)  time: 0.2223  data: 0.1035  max mem: 18975
Test: Total time: 0:00:14 (0.2962 s / it)
* Acc@1 72.502 Acc@5 91.766 loss 1.340
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.53%
Epoch: [58]  [   0/2502]  eta: 1:31:50  lr: 0.003343  min_lr: 0.003343  loss: 2.7325 (2.7325)  weight_decay: 0.0500 (0.0500)  time: 2.2024  data: 1.8298  max mem: 18975
Epoch: [58]  [ 200/2502]  eta: 0:10:20  lr: 0.003343  min_lr: 0.003343  loss: 2.8901 (3.5976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7293 (0.7388)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [58]  [ 400/2502]  eta: 0:09:13  lr: 0.003342  min_lr: 0.003342  loss: 3.9139 (3.6110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6902 (0.7430)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [58]  [ 600/2502]  eta: 0:08:16  lr: 0.003341  min_lr: 0.003341  loss: 3.7693 (3.6092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7424 (0.7375)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [58]  [ 800/2502]  eta: 0:07:23  lr: 0.003341  min_lr: 0.003341  loss: 3.7288 (3.6122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7334 (0.7432)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [58]  [1000/2502]  eta: 0:06:30  lr: 0.003340  min_lr: 0.003340  loss: 3.8589 (3.6206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7590 (0.7490)  time: 0.2644  data: 0.0004  max mem: 18975
Epoch: [58]  [1200/2502]  eta: 0:05:37  lr: 0.003339  min_lr: 0.003339  loss: 3.1246 (3.6167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6643 (0.7462)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [58]  [1400/2502]  eta: 0:04:45  lr: 0.003339  min_lr: 0.003339  loss: 4.0527 (3.6271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6799 (0.7476)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [58]  [1600/2502]  eta: 0:03:53  lr: 0.003338  min_lr: 0.003338  loss: 3.4322 (3.6130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.7507)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [58]  [1800/2502]  eta: 0:03:01  lr: 0.003337  min_lr: 0.003337  loss: 3.7521 (3.6143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7242 (0.7504)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [58]  [2000/2502]  eta: 0:02:09  lr: 0.003337  min_lr: 0.003337  loss: 4.0302 (3.6129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6608 (0.7457)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [58]  [2200/2502]  eta: 0:01:18  lr: 0.003336  min_lr: 0.003336  loss: 3.0609 (3.6165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8246 (0.7493)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [58]  [2400/2502]  eta: 0:00:26  lr: 0.003335  min_lr: 0.003335  loss: 3.0438 (3.6204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.7502)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [58]  [2501/2502]  eta: 0:00:00  lr: 0.003335  min_lr: 0.003335  loss: 3.5582 (3.6226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (0.7515)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [58] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003335  min_lr: 0.003335  loss: 3.5582 (3.6165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (0.7515)
Test:  [ 0/50]  eta: 0:02:39  loss: 0.7516 (0.7516)  acc1: 88.4000 (88.4000)  acc5: 96.8000 (96.8000)  time: 3.1821  data: 3.0383  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.0095 (1.0609)  acc1: 80.0000 (79.3091)  acc5: 96.0000 (95.2000)  time: 0.4977  data: 0.3746  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0470 (1.0770)  acc1: 78.0000 (78.6667)  acc5: 95.6000 (95.4095)  time: 0.2265  data: 0.1062  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3113 (1.2358)  acc1: 71.6000 (75.5613)  acc5: 91.6000 (93.1226)  time: 0.2096  data: 0.0902  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.6216 (1.3256)  acc1: 68.0000 (73.3268)  acc5: 87.2000 (92.0293)  time: 0.2103  data: 0.0908  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6216 (1.3454)  acc1: 66.4000 (72.5280)  acc5: 88.4000 (91.6960)  time: 0.2319  data: 0.1125  max mem: 18975
Test: Total time: 0:00:13 (0.2710 s / it)
* Acc@1 72.284 Acc@5 91.520 loss 1.352
Accuracy of the model on the 50000 test images: 72.3%
Max accuracy: 72.53%
Epoch: [59]  [   0/2502]  eta: 1:26:12  lr: 0.003335  min_lr: 0.003335  loss: 2.6933 (2.6933)  weight_decay: 0.0500 (0.0500)  time: 2.0673  data: 1.6218  max mem: 18975
Epoch: [59]  [ 200/2502]  eta: 0:10:18  lr: 0.003334  min_lr: 0.003334  loss: 3.9389 (3.5878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6919 (0.7285)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [59]  [ 400/2502]  eta: 0:09:12  lr: 0.003334  min_lr: 0.003334  loss: 3.3229 (3.5836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.7378)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [59]  [ 600/2502]  eta: 0:08:16  lr: 0.003333  min_lr: 0.003333  loss: 3.9931 (3.5718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7429 (0.7493)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [59]  [ 800/2502]  eta: 0:07:22  lr: 0.003332  min_lr: 0.003332  loss: 3.8007 (3.5676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7083 (0.7548)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [59]  [1000/2502]  eta: 0:06:29  lr: 0.003332  min_lr: 0.003332  loss: 3.8236 (3.5999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6633 (0.7558)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [59]  [1200/2502]  eta: 0:05:37  lr: 0.003331  min_lr: 0.003331  loss: 3.8605 (3.6118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.7581)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [59]  [1400/2502]  eta: 0:04:45  lr: 0.003330  min_lr: 0.003330  loss: 3.4062 (3.6116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7153 (0.7626)  time: 0.2629  data: 0.0004  max mem: 18975
Epoch: [59]  [1600/2502]  eta: 0:03:53  lr: 0.003330  min_lr: 0.003330  loss: 3.9612 (3.6206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7879 (0.7678)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [59]  [1800/2502]  eta: 0:03:01  lr: 0.003329  min_lr: 0.003329  loss: 3.8535 (3.6264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7087 (0.7658)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [59]  [2000/2502]  eta: 0:02:09  lr: 0.003328  min_lr: 0.003328  loss: 3.0299 (3.6198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.7646)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [59]  [2200/2502]  eta: 0:01:18  lr: 0.003328  min_lr: 0.003328  loss: 3.3054 (3.6217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7106 (0.7647)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [59]  [2400/2502]  eta: 0:00:26  lr: 0.003327  min_lr: 0.003327  loss: 3.2986 (3.6156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8239 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [59]  [2501/2502]  eta: 0:00:00  lr: 0.003327  min_lr: 0.003327  loss: 3.2684 (3.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6606 (inf)  time: 0.2317  data: 0.0009  max mem: 18975
Epoch: [59] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003327  min_lr: 0.003327  loss: 3.2684 (3.6053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6606 (inf)
Test:  [ 0/50]  eta: 0:02:27  loss: 0.7880 (0.7880)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 2.9404  data: 2.7986  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.0250 (1.0877)  acc1: 78.8000 (79.2364)  acc5: 95.2000 (95.1636)  time: 0.5142  data: 0.3920  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0591 (1.0852)  acc1: 76.0000 (78.4952)  acc5: 95.2000 (95.2381)  time: 0.2730  data: 0.1527  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2320 (1.2288)  acc1: 74.0000 (75.3548)  acc5: 91.2000 (93.1871)  time: 0.2581  data: 0.1375  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5879 (1.3166)  acc1: 67.6000 (73.2781)  acc5: 88.0000 (92.0781)  time: 0.2477  data: 0.1269  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5091 (1.3340)  acc1: 68.4000 (72.8000)  acc5: 88.4000 (91.8080)  time: 0.2209  data: 0.1005  max mem: 18975
Test: Total time: 0:00:15 (0.3013 s / it)
* Acc@1 72.618 Acc@5 91.734 loss 1.338
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.62%
Epoch: [60]  [   0/2502]  eta: 1:08:47  lr: 0.003327  min_lr: 0.003327  loss: 4.1735 (4.1735)  weight_decay: 0.0500 (0.0500)  time: 1.6497  data: 1.3833  max mem: 18975
Epoch: [60]  [ 200/2502]  eta: 0:10:12  lr: 0.003326  min_lr: 0.003326  loss: 3.5390 (3.5514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (0.7304)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [60]  [ 400/2502]  eta: 0:09:09  lr: 0.003325  min_lr: 0.003325  loss: 3.9834 (3.5971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7123 (inf)  time: 0.2625  data: 0.0003  max mem: 18975
Epoch: [60]  [ 600/2502]  eta: 0:08:14  lr: 0.003325  min_lr: 0.003325  loss: 3.6163 (3.5955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7294 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [60]  [ 800/2502]  eta: 0:07:21  lr: 0.003324  min_lr: 0.003324  loss: 3.5640 (3.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7553 (inf)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [60]  [1000/2502]  eta: 0:06:28  lr: 0.003323  min_lr: 0.003323  loss: 3.7614 (3.6251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7212 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [60]  [1200/2502]  eta: 0:05:36  lr: 0.003323  min_lr: 0.003323  loss: 3.9574 (3.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6752 (inf)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [60]  [1400/2502]  eta: 0:04:44  lr: 0.003322  min_lr: 0.003322  loss: 3.8331 (3.6321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (inf)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [60]  [1600/2502]  eta: 0:03:52  lr: 0.003321  min_lr: 0.003321  loss: 3.7753 (3.6400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (inf)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [60]  [1800/2502]  eta: 0:03:01  lr: 0.003321  min_lr: 0.003321  loss: 3.1265 (3.6358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7010 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [60]  [2000/2502]  eta: 0:02:09  lr: 0.003320  min_lr: 0.003320  loss: 4.1596 (3.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6797 (inf)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [60]  [2200/2502]  eta: 0:01:17  lr: 0.003319  min_lr: 0.003319  loss: 3.9887 (3.6278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (inf)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [60]  [2400/2502]  eta: 0:00:26  lr: 0.003318  min_lr: 0.003318  loss: 3.9562 (3.6282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6824 (inf)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [60]  [2501/2502]  eta: 0:00:00  lr: 0.003318  min_lr: 0.003318  loss: 3.7302 (3.6271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7277 (inf)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [60] Total time: 0:10:44 (0.2576 s / it)
Averaged stats: lr: 0.003318  min_lr: 0.003318  loss: 3.7302 (3.6169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7277 (inf)
Test:  [ 0/50]  eta: 0:02:25  loss: 0.8397 (0.8397)  acc1: 89.6000 (89.6000)  acc5: 96.4000 (96.4000)  time: 2.9086  data: 2.7670  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9902 (1.0532)  acc1: 79.6000 (79.2727)  acc5: 95.6000 (94.9818)  time: 0.4912  data: 0.3662  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0460 (1.0850)  acc1: 77.2000 (78.2095)  acc5: 95.2000 (95.0095)  time: 0.2487  data: 0.1273  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3613 (1.2263)  acc1: 73.2000 (75.4194)  acc5: 90.8000 (92.8258)  time: 0.2656  data: 0.1463  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6466 (1.3303)  acc1: 66.8000 (73.0342)  acc5: 86.8000 (91.4829)  time: 0.2714  data: 0.1524  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6394 (1.3467)  acc1: 66.8000 (72.4480)  acc5: 87.2000 (91.3280)  time: 0.2082  data: 0.0894  max mem: 18975
Test: Total time: 0:00:14 (0.2971 s / it)
* Acc@1 72.514 Acc@5 91.598 loss 1.348
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.62%
Epoch: [61]  [   0/2502]  eta: 1:22:51  lr: 0.003318  min_lr: 0.003318  loss: 4.1711 (4.1711)  weight_decay: 0.0500 (0.0500)  time: 1.9872  data: 1.7198  max mem: 18975
Epoch: [61]  [ 200/2502]  eta: 0:10:15  lr: 0.003317  min_lr: 0.003317  loss: 3.0418 (3.4990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7267 (0.7923)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [61]  [ 400/2502]  eta: 0:09:10  lr: 0.003317  min_lr: 0.003317  loss: 4.1091 (3.5967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6700 (0.7460)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [61]  [ 600/2502]  eta: 0:08:15  lr: 0.003316  min_lr: 0.003316  loss: 3.4809 (3.6118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.7422)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [61]  [ 800/2502]  eta: 0:07:22  lr: 0.003315  min_lr: 0.003315  loss: 3.5657 (3.6104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7072 (0.7432)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [61]  [1000/2502]  eta: 0:06:29  lr: 0.003315  min_lr: 0.003315  loss: 3.3971 (3.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7667 (0.7479)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [61]  [1200/2502]  eta: 0:05:36  lr: 0.003314  min_lr: 0.003314  loss: 3.3010 (3.5919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6564 (0.7501)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [61]  [1400/2502]  eta: 0:04:44  lr: 0.003313  min_lr: 0.003313  loss: 2.9013 (3.5684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.7507)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [61]  [1600/2502]  eta: 0:03:53  lr: 0.003313  min_lr: 0.003313  loss: 3.9431 (3.5754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7133 (0.7537)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [61]  [1800/2502]  eta: 0:03:01  lr: 0.003312  min_lr: 0.003312  loss: 3.3000 (3.5791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8394 (0.7561)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [61]  [2000/2502]  eta: 0:02:09  lr: 0.003311  min_lr: 0.003311  loss: 3.6564 (3.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7673 (0.7578)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [61]  [2200/2502]  eta: 0:01:17  lr: 0.003310  min_lr: 0.003310  loss: 3.8966 (3.5830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.7561)  time: 0.2613  data: 0.0003  max mem: 18975
Epoch: [61]  [2400/2502]  eta: 0:00:26  lr: 0.003310  min_lr: 0.003310  loss: 3.8344 (3.5846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7508 (0.7587)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [61]  [2501/2502]  eta: 0:00:00  lr: 0.003309  min_lr: 0.003309  loss: 3.3638 (3.5826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (0.7584)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [61] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003309  min_lr: 0.003309  loss: 3.3638 (3.5918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (0.7584)
Test:  [ 0/50]  eta: 0:03:11  loss: 0.7596 (0.7596)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 3.8206  data: 3.6850  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.0653 (1.0777)  acc1: 80.8000 (79.4545)  acc5: 95.6000 (94.8727)  time: 0.5659  data: 0.4452  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0653 (1.0995)  acc1: 76.8000 (78.4952)  acc5: 95.2000 (95.0095)  time: 0.2415  data: 0.1223  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3749 (1.2391)  acc1: 72.4000 (75.3161)  acc5: 90.8000 (93.0065)  time: 0.2461  data: 0.1266  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5842 (1.3372)  acc1: 67.2000 (73.2293)  acc5: 87.6000 (91.6098)  time: 0.2316  data: 0.1121  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5842 (1.3564)  acc1: 68.0000 (72.7040)  acc5: 87.6000 (91.4400)  time: 0.2310  data: 0.1120  max mem: 18975
Test: Total time: 0:00:14 (0.2891 s / it)
* Acc@1 72.430 Acc@5 91.558 loss 1.357
Accuracy of the model on the 50000 test images: 72.4%
Max accuracy: 72.62%
Epoch: [62]  [   0/2502]  eta: 1:43:10  lr: 0.003309  min_lr: 0.003309  loss: 4.4681 (4.4681)  weight_decay: 0.0500 (0.0500)  time: 2.4742  data: 2.1975  max mem: 18975
Epoch: [62]  [ 200/2502]  eta: 0:10:23  lr: 0.003309  min_lr: 0.003309  loss: 3.1554 (3.5842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7531 (0.7527)  time: 0.2576  data: 0.0003  max mem: 18975
Epoch: [62]  [ 400/2502]  eta: 0:09:14  lr: 0.003308  min_lr: 0.003308  loss: 3.5204 (3.5874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (0.7551)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [62]  [ 600/2502]  eta: 0:08:18  lr: 0.003307  min_lr: 0.003307  loss: 4.0810 (3.5970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7552 (0.7499)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [62]  [ 800/2502]  eta: 0:07:23  lr: 0.003306  min_lr: 0.003306  loss: 2.9949 (3.5842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7791 (0.7593)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [62]  [1000/2502]  eta: 0:06:30  lr: 0.003306  min_lr: 0.003306  loss: 3.7928 (3.5800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7565 (0.7577)  time: 0.2666  data: 0.0004  max mem: 18975
Epoch: [62]  [1200/2502]  eta: 0:05:38  lr: 0.003305  min_lr: 0.003305  loss: 3.5852 (3.5794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (0.7592)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [62]  [1400/2502]  eta: 0:04:45  lr: 0.003304  min_lr: 0.003304  loss: 3.6597 (3.5853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6982 (0.7527)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [62]  [1600/2502]  eta: 0:03:53  lr: 0.003304  min_lr: 0.003304  loss: 3.8658 (3.5902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7381 (0.7524)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [62]  [1800/2502]  eta: 0:03:01  lr: 0.003303  min_lr: 0.003303  loss: 3.3517 (3.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (0.7574)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [62]  [2000/2502]  eta: 0:02:09  lr: 0.003302  min_lr: 0.003302  loss: 3.6662 (3.5902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8390 (0.7591)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [62]  [2200/2502]  eta: 0:01:18  lr: 0.003301  min_lr: 0.003301  loss: 3.4931 (3.5990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7672 (0.7617)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [62]  [2400/2502]  eta: 0:00:26  lr: 0.003301  min_lr: 0.003301  loss: 3.8452 (3.5975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.7588)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [62]  [2501/2502]  eta: 0:00:00  lr: 0.003300  min_lr: 0.003300  loss: 3.5369 (3.5978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.7597)  time: 0.2310  data: 0.0007  max mem: 18975
Epoch: [62] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003300  min_lr: 0.003300  loss: 3.5369 (3.5869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.7597)
Test:  [ 0/50]  eta: 0:03:22  loss: 0.6574 (0.6574)  acc1: 92.4000 (92.4000)  acc5: 98.0000 (98.0000)  time: 4.0457  data: 3.9033  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.0586 (1.0473)  acc1: 80.4000 (80.5818)  acc5: 95.6000 (95.2727)  time: 0.5739  data: 0.4522  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0914 (1.0741)  acc1: 76.8000 (78.8571)  acc5: 94.8000 (95.3714)  time: 0.2402  data: 0.1202  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2921 (1.2144)  acc1: 72.4000 (75.5742)  acc5: 92.0000 (93.1871)  time: 0.2533  data: 0.1332  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5907 (1.3011)  acc1: 67.2000 (73.3659)  acc5: 87.2000 (91.9122)  time: 0.2337  data: 0.1140  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5524 (1.3179)  acc1: 66.8000 (72.6480)  acc5: 89.6000 (91.7760)  time: 0.2097  data: 0.0905  max mem: 18975
Test: Total time: 0:00:14 (0.2941 s / it)
* Acc@1 72.742 Acc@5 91.728 loss 1.323
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.74%
Epoch: [63]  [   0/2502]  eta: 1:22:01  lr: 0.003300  min_lr: 0.003300  loss: 2.7746 (2.7746)  weight_decay: 0.0500 (0.0500)  time: 1.9670  data: 1.6856  max mem: 18975
Epoch: [63]  [ 200/2502]  eta: 0:10:15  lr: 0.003300  min_lr: 0.003300  loss: 3.9808 (3.5838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7465)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [63]  [ 400/2502]  eta: 0:09:11  lr: 0.003299  min_lr: 0.003299  loss: 3.9888 (3.5795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.7721)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [63]  [ 600/2502]  eta: 0:08:15  lr: 0.003298  min_lr: 0.003298  loss: 3.9383 (3.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6935 (0.7570)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [63]  [ 800/2502]  eta: 0:07:22  lr: 0.003297  min_lr: 0.003297  loss: 3.1721 (3.5942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7346 (0.7616)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [63]  [1000/2502]  eta: 0:06:29  lr: 0.003297  min_lr: 0.003297  loss: 3.2893 (3.5946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7981 (0.7581)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [63]  [1200/2502]  eta: 0:05:37  lr: 0.003296  min_lr: 0.003296  loss: 3.2088 (3.5905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6969 (0.7552)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [63]  [1400/2502]  eta: 0:04:45  lr: 0.003295  min_lr: 0.003295  loss: 3.8425 (3.5891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7394 (0.7605)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [63]  [1600/2502]  eta: 0:03:53  lr: 0.003294  min_lr: 0.003294  loss: 3.7735 (3.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7768 (0.7606)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [63]  [1800/2502]  eta: 0:03:01  lr: 0.003294  min_lr: 0.003294  loss: 3.8451 (3.5826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7489 (0.7649)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [63]  [2000/2502]  eta: 0:02:09  lr: 0.003293  min_lr: 0.003293  loss: 3.6745 (3.5901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6873 (0.7616)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [63]  [2200/2502]  eta: 0:01:17  lr: 0.003292  min_lr: 0.003292  loss: 3.8979 (3.5906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (0.7632)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [63]  [2400/2502]  eta: 0:00:26  lr: 0.003291  min_lr: 0.003291  loss: 4.0656 (3.5916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6473 (0.7599)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [63]  [2501/2502]  eta: 0:00:00  lr: 0.003291  min_lr: 0.003291  loss: 3.6580 (3.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.7598)  time: 0.2311  data: 0.0007  max mem: 18975
Epoch: [63] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003291  min_lr: 0.003291  loss: 3.6580 (3.5889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.7598)
Test:  [ 0/50]  eta: 0:03:25  loss: 0.6892 (0.6892)  acc1: 88.0000 (88.0000)  acc5: 96.0000 (96.0000)  time: 4.1011  data: 3.9598  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.0431 (1.0681)  acc1: 80.4000 (79.7818)  acc5: 96.0000 (95.1636)  time: 0.5708  data: 0.4489  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.1228 (1.1077)  acc1: 76.8000 (78.2286)  acc5: 95.6000 (95.2381)  time: 0.2554  data: 0.1359  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2793 (1.2351)  acc1: 73.6000 (75.4839)  acc5: 92.4000 (93.3677)  time: 0.2896  data: 0.1660  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5858 (1.3237)  acc1: 68.0000 (73.5902)  acc5: 88.8000 (92.1659)  time: 0.2489  data: 0.1255  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5633 (1.3350)  acc1: 68.0000 (73.1760)  acc5: 89.6000 (92.0320)  time: 0.2442  data: 0.1253  max mem: 18975
Test: Total time: 0:00:15 (0.3073 s / it)
* Acc@1 72.992 Acc@5 91.898 loss 1.341
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 72.99%
Epoch: [64]  [   0/2502]  eta: 1:37:38  lr: 0.003291  min_lr: 0.003291  loss: 4.6026 (4.6026)  weight_decay: 0.0500 (0.0500)  time: 2.3414  data: 2.0592  max mem: 18975
Epoch: [64]  [ 200/2502]  eta: 0:10:20  lr: 0.003290  min_lr: 0.003290  loss: 3.7711 (3.5139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6682 (0.7638)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [64]  [ 400/2502]  eta: 0:09:12  lr: 0.003290  min_lr: 0.003290  loss: 3.9897 (3.5803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7642 (0.7765)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [64]  [ 600/2502]  eta: 0:08:16  lr: 0.003289  min_lr: 0.003289  loss: 3.5573 (3.5773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6861 (0.7709)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [64]  [ 800/2502]  eta: 0:07:23  lr: 0.003288  min_lr: 0.003288  loss: 3.2474 (3.5658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6501 (0.7635)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [64]  [1000/2502]  eta: 0:06:30  lr: 0.003287  min_lr: 0.003287  loss: 3.6177 (3.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8077 (0.7667)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [64]  [1200/2502]  eta: 0:05:37  lr: 0.003287  min_lr: 0.003287  loss: 3.5457 (3.5677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8766 (0.7714)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [64]  [1400/2502]  eta: 0:04:45  lr: 0.003286  min_lr: 0.003286  loss: 3.2038 (3.5730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8313 (0.7679)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [64]  [1600/2502]  eta: 0:03:53  lr: 0.003285  min_lr: 0.003285  loss: 4.0606 (3.5709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7731 (0.7726)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [64]  [1800/2502]  eta: 0:03:01  lr: 0.003284  min_lr: 0.003284  loss: 3.7370 (3.5683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8104 (0.7727)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [64]  [2000/2502]  eta: 0:02:09  lr: 0.003284  min_lr: 0.003284  loss: 3.7731 (3.5781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7341 (0.7693)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [64]  [2200/2502]  eta: 0:01:18  lr: 0.003283  min_lr: 0.003283  loss: 3.3528 (3.5835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.7659)  time: 0.2643  data: 0.0004  max mem: 18975
Epoch: [64]  [2400/2502]  eta: 0:00:26  lr: 0.003282  min_lr: 0.003282  loss: 3.8008 (3.5760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7627)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [64]  [2501/2502]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 4.0197 (3.5755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8137 (0.7651)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [64] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 4.0197 (3.5821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8137 (0.7651)
Test:  [ 0/50]  eta: 0:02:41  loss: 0.8227 (0.8227)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 3.2379  data: 3.0855  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.2100 (1.1471)  acc1: 80.0000 (79.6364)  acc5: 95.2000 (95.4546)  time: 0.5235  data: 0.4003  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1395 (1.1456)  acc1: 78.4000 (78.4952)  acc5: 95.2000 (95.5619)  time: 0.2487  data: 0.1267  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2982 (1.2883)  acc1: 72.8000 (75.8839)  acc5: 92.0000 (93.1742)  time: 0.2159  data: 0.0945  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6646 (1.3813)  acc1: 67.6000 (73.5415)  acc5: 87.2000 (91.8244)  time: 0.2214  data: 0.1024  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6285 (1.3978)  acc1: 65.6000 (72.9200)  acc5: 88.4000 (91.6720)  time: 0.2325  data: 0.1135  max mem: 18975
Test: Total time: 0:00:14 (0.2907 s / it)
* Acc@1 73.076 Acc@5 91.862 loss 1.399
Accuracy of the model on the 50000 test images: 73.1%
Max accuracy: 73.08%
Epoch: [65]  [   0/2502]  eta: 1:10:43  lr: 0.003282  min_lr: 0.003282  loss: 3.1256 (3.1256)  weight_decay: 0.0500 (0.0500)  time: 1.6960  data: 1.4138  max mem: 18975
Epoch: [65]  [ 200/2502]  eta: 0:10:19  lr: 0.003281  min_lr: 0.003281  loss: 3.7733 (3.5679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6867 (0.7243)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [65]  [ 400/2502]  eta: 0:09:14  lr: 0.003280  min_lr: 0.003280  loss: 3.6232 (3.5686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7772 (0.7435)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [65]  [ 600/2502]  eta: 0:08:16  lr: 0.003279  min_lr: 0.003279  loss: 3.6392 (3.5542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7736 (0.7521)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [65]  [ 800/2502]  eta: 0:07:23  lr: 0.003279  min_lr: 0.003279  loss: 3.4100 (3.5631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7346 (0.7554)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [65]  [1000/2502]  eta: 0:06:29  lr: 0.003278  min_lr: 0.003278  loss: 4.0183 (3.5791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.7558)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [65]  [1200/2502]  eta: 0:05:37  lr: 0.003277  min_lr: 0.003277  loss: 3.2571 (3.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7960 (0.7570)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [65]  [1400/2502]  eta: 0:04:45  lr: 0.003276  min_lr: 0.003276  loss: 3.5914 (3.5886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8177 (0.7606)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [65]  [1600/2502]  eta: 0:03:53  lr: 0.003276  min_lr: 0.003276  loss: 3.5368 (3.5921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.7584)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [65]  [1800/2502]  eta: 0:03:01  lr: 0.003275  min_lr: 0.003275  loss: 3.7693 (3.5883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7725 (0.7572)  time: 0.2589  data: 0.0004  max mem: 18975
Epoch: [65]  [2000/2502]  eta: 0:02:09  lr: 0.003274  min_lr: 0.003274  loss: 3.8037 (3.5967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.7550)  time: 0.2635  data: 0.0003  max mem: 18975
Epoch: [65]  [2200/2502]  eta: 0:01:17  lr: 0.003273  min_lr: 0.003273  loss: 3.7665 (3.5947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6612 (0.7515)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [65]  [2400/2502]  eta: 0:00:26  lr: 0.003272  min_lr: 0.003272  loss: 3.5626 (3.6036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7571 (0.7534)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [65]  [2501/2502]  eta: 0:00:00  lr: 0.003272  min_lr: 0.003272  loss: 3.5210 (3.6043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7278 (0.7539)  time: 0.2309  data: 0.0008  max mem: 18975
Epoch: [65] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003272  min_lr: 0.003272  loss: 3.5210 (3.5979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7278 (0.7539)
Test:  [ 0/50]  eta: 0:02:54  loss: 0.7499 (0.7499)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 3.4808  data: 3.3369  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9989 (1.0450)  acc1: 80.4000 (79.4909)  acc5: 95.6000 (95.3455)  time: 0.5148  data: 0.3915  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0118 (1.0497)  acc1: 78.0000 (78.8952)  acc5: 95.6000 (95.6381)  time: 0.2141  data: 0.0935  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2035 (1.1858)  acc1: 75.2000 (76.0645)  acc5: 92.8000 (93.4194)  time: 0.2267  data: 0.1072  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5538 (1.2782)  acc1: 67.2000 (73.6488)  acc5: 88.4000 (92.0585)  time: 0.2506  data: 0.1315  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5538 (1.2924)  acc1: 67.2000 (73.1040)  acc5: 88.4000 (91.8960)  time: 0.2157  data: 0.0968  max mem: 18975
Test: Total time: 0:00:14 (0.2896 s / it)
* Acc@1 72.970 Acc@5 91.864 loss 1.298
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.08%
Epoch: [66]  [   0/2502]  eta: 1:20:40  lr: 0.003272  min_lr: 0.003272  loss: 4.0052 (4.0052)  weight_decay: 0.0500 (0.0500)  time: 1.9347  data: 1.4798  max mem: 18975
Epoch: [66]  [ 200/2502]  eta: 0:10:15  lr: 0.003271  min_lr: 0.003271  loss: 3.6927 (3.5695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7291 (0.7475)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [66]  [ 400/2502]  eta: 0:09:10  lr: 0.003271  min_lr: 0.003271  loss: 4.0738 (3.6195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.7534)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [66]  [ 600/2502]  eta: 0:08:15  lr: 0.003270  min_lr: 0.003270  loss: 3.7781 (3.5916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6840 (0.7461)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [66]  [ 800/2502]  eta: 0:07:21  lr: 0.003269  min_lr: 0.003269  loss: 3.5599 (3.5916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7894 (0.7549)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [66]  [1000/2502]  eta: 0:06:29  lr: 0.003268  min_lr: 0.003268  loss: 3.7899 (3.5972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.7596)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [66]  [1200/2502]  eta: 0:05:37  lr: 0.003267  min_lr: 0.003267  loss: 3.0452 (3.5927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7560 (0.7598)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [66]  [1400/2502]  eta: 0:04:45  lr: 0.003267  min_lr: 0.003267  loss: 3.3561 (3.5826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7241 (0.7584)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [66]  [1600/2502]  eta: 0:03:53  lr: 0.003266  min_lr: 0.003266  loss: 3.1032 (3.5743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7499 (inf)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [66]  [1800/2502]  eta: 0:03:01  lr: 0.003265  min_lr: 0.003265  loss: 3.9725 (3.5782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7154 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [66]  [2000/2502]  eta: 0:02:09  lr: 0.003264  min_lr: 0.003264  loss: 3.6948 (3.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8141 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [66]  [2200/2502]  eta: 0:01:17  lr: 0.003264  min_lr: 0.003264  loss: 3.5468 (3.5796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7365 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [66]  [2400/2502]  eta: 0:00:26  lr: 0.003263  min_lr: 0.003263  loss: 3.9394 (3.5775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (inf)  time: 0.2639  data: 0.0004  max mem: 18975
Epoch: [66]  [2501/2502]  eta: 0:00:00  lr: 0.003262  min_lr: 0.003262  loss: 3.7349 (3.5830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8713 (inf)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [66] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003262  min_lr: 0.003262  loss: 3.7349 (3.5812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8713 (inf)
Test:  [ 0/50]  eta: 0:02:24  loss: 0.8251 (0.8251)  acc1: 87.6000 (87.6000)  acc5: 99.6000 (99.6000)  time: 2.8908  data: 2.7333  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.1202 (1.1411)  acc1: 78.4000 (78.4364)  acc5: 95.6000 (95.3818)  time: 0.4938  data: 0.3705  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1202 (1.1505)  acc1: 76.4000 (77.7333)  acc5: 95.2000 (95.2571)  time: 0.2625  data: 0.1429  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3055 (1.2851)  acc1: 73.6000 (75.2000)  acc5: 90.8000 (93.0323)  time: 0.2581  data: 0.1384  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6592 (1.3830)  acc1: 68.0000 (72.9268)  acc5: 87.2000 (91.7268)  time: 0.2360  data: 0.1147  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6789 (1.4007)  acc1: 67.2000 (72.2320)  acc5: 87.6000 (91.5200)  time: 0.2481  data: 0.1257  max mem: 18975
Test: Total time: 0:00:14 (0.2884 s / it)
* Acc@1 72.212 Acc@5 91.506 loss 1.400
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 73.08%
Epoch: [67]  [   0/2502]  eta: 1:31:35  lr: 0.003262  min_lr: 0.003262  loss: 4.4413 (4.4413)  weight_decay: 0.0500 (0.0500)  time: 2.1966  data: 1.8331  max mem: 18975
Epoch: [67]  [ 200/2502]  eta: 0:10:20  lr: 0.003262  min_lr: 0.003262  loss: 3.2531 (3.5842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.7411)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [67]  [ 400/2502]  eta: 0:09:13  lr: 0.003261  min_lr: 0.003261  loss: 3.6887 (3.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7371 (0.7451)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [67]  [ 600/2502]  eta: 0:08:16  lr: 0.003260  min_lr: 0.003260  loss: 3.1666 (3.6082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7374 (0.7530)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [67]  [ 800/2502]  eta: 0:07:22  lr: 0.003259  min_lr: 0.003259  loss: 3.8751 (3.6119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.7501)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [67]  [1000/2502]  eta: 0:06:29  lr: 0.003258  min_lr: 0.003258  loss: 3.5991 (3.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6497 (0.7414)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [67]  [1200/2502]  eta: 0:05:37  lr: 0.003258  min_lr: 0.003258  loss: 3.5532 (3.6027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (0.7478)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [67]  [1400/2502]  eta: 0:04:45  lr: 0.003257  min_lr: 0.003257  loss: 3.8230 (3.5963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7817 (0.7518)  time: 0.2651  data: 0.0004  max mem: 18975
Epoch: [67]  [1600/2502]  eta: 0:03:53  lr: 0.003256  min_lr: 0.003256  loss: 3.5204 (3.5939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7542)  time: 0.2639  data: 0.0004  max mem: 18975
Epoch: [67]  [1800/2502]  eta: 0:03:01  lr: 0.003255  min_lr: 0.003255  loss: 3.1936 (3.5921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6957 (inf)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [67]  [2000/2502]  eta: 0:02:09  lr: 0.003254  min_lr: 0.003254  loss: 3.7108 (3.5920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [67]  [2200/2502]  eta: 0:01:18  lr: 0.003254  min_lr: 0.003254  loss: 3.4954 (3.5885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8478 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [67]  [2400/2502]  eta: 0:00:26  lr: 0.003253  min_lr: 0.003253  loss: 3.3658 (3.5844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6859 (inf)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [67]  [2501/2502]  eta: 0:00:00  lr: 0.003252  min_lr: 0.003252  loss: 3.7080 (3.5877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (inf)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [67] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003252  min_lr: 0.003252  loss: 3.7080 (3.5920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (inf)
Test:  [ 0/50]  eta: 0:03:21  loss: 0.9294 (0.9294)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 4.0237  data: 3.8850  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.1827 (1.1787)  acc1: 80.0000 (79.1273)  acc5: 95.2000 (94.5091)  time: 0.5667  data: 0.4444  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1731 (1.1620)  acc1: 79.6000 (78.5524)  acc5: 95.6000 (95.2381)  time: 0.2161  data: 0.0962  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2472 (1.2960)  acc1: 72.8000 (75.8452)  acc5: 93.2000 (93.3806)  time: 0.2366  data: 0.1170  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6593 (1.3863)  acc1: 67.2000 (73.5317)  acc5: 88.4000 (92.1951)  time: 0.2690  data: 0.1497  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6593 (1.4010)  acc1: 68.0000 (73.1120)  acc5: 88.8000 (91.9760)  time: 0.2182  data: 0.0993  max mem: 18975
Test: Total time: 0:00:15 (0.3064 s / it)
* Acc@1 72.984 Acc@5 91.948 loss 1.408
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.08%
Epoch: [68]  [   0/2502]  eta: 1:32:55  lr: 0.003252  min_lr: 0.003252  loss: 2.6872 (2.6872)  weight_decay: 0.0500 (0.0500)  time: 2.2285  data: 1.8584  max mem: 18975
Epoch: [68]  [ 200/2502]  eta: 0:10:20  lr: 0.003252  min_lr: 0.003252  loss: 3.0041 (3.5892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7596 (0.7837)  time: 0.2644  data: 0.0005  max mem: 18975
Epoch: [68]  [ 400/2502]  eta: 0:09:15  lr: 0.003251  min_lr: 0.003251  loss: 4.0862 (3.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (0.7891)  time: 0.2593  data: 0.0004  max mem: 18975
Epoch: [68]  [ 600/2502]  eta: 0:08:17  lr: 0.003250  min_lr: 0.003250  loss: 3.1473 (3.5841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6834 (0.7615)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [68]  [ 800/2502]  eta: 0:07:23  lr: 0.003249  min_lr: 0.003249  loss: 3.5499 (3.5785)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7611 (0.7607)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [68]  [1000/2502]  eta: 0:06:30  lr: 0.003248  min_lr: 0.003248  loss: 3.9791 (3.5807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7274 (0.7546)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [68]  [1200/2502]  eta: 0:05:37  lr: 0.003248  min_lr: 0.003248  loss: 3.8375 (3.5683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (0.7490)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [68]  [1400/2502]  eta: 0:04:45  lr: 0.003247  min_lr: 0.003247  loss: 3.1406 (3.5710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7673 (0.7549)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [68]  [1600/2502]  eta: 0:03:53  lr: 0.003246  min_lr: 0.003246  loss: 3.7346 (3.5597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7580 (0.7580)  time: 0.2615  data: 0.0004  max mem: 18975
Epoch: [68]  [1800/2502]  eta: 0:03:01  lr: 0.003245  min_lr: 0.003245  loss: 3.8081 (3.5577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6814 (0.7531)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [68]  [2000/2502]  eta: 0:02:09  lr: 0.003244  min_lr: 0.003244  loss: 3.7088 (3.5547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7157 (0.7539)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [68]  [2200/2502]  eta: 0:01:18  lr: 0.003243  min_lr: 0.003243  loss: 3.9725 (3.5646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7905 (0.7584)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [68]  [2400/2502]  eta: 0:00:26  lr: 0.003243  min_lr: 0.003243  loss: 3.4232 (3.5628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7556)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [68]  [2501/2502]  eta: 0:00:00  lr: 0.003242  min_lr: 0.003242  loss: 3.1846 (3.5590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.7567)  time: 0.2316  data: 0.0013  max mem: 18975
Epoch: [68] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.003242  min_lr: 0.003242  loss: 3.1846 (3.5789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.7567)
Test:  [ 0/50]  eta: 0:03:09  loss: 0.6607 (0.6607)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 3.7926  data: 3.6457  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.0024 (1.0261)  acc1: 80.0000 (79.7091)  acc5: 95.2000 (95.2000)  time: 0.6067  data: 0.4841  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0094 (1.0435)  acc1: 77.6000 (78.8000)  acc5: 95.2000 (95.3143)  time: 0.2552  data: 0.1350  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2094 (1.1869)  acc1: 71.2000 (75.7290)  acc5: 91.2000 (93.2387)  time: 0.2267  data: 0.1070  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5293 (1.2760)  acc1: 67.6000 (73.5122)  acc5: 88.0000 (91.9415)  time: 0.2257  data: 0.1066  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5083 (1.2858)  acc1: 66.4000 (72.8560)  acc5: 88.4000 (91.9200)  time: 0.2299  data: 0.1110  max mem: 18975
Test: Total time: 0:00:15 (0.3020 s / it)
* Acc@1 72.962 Acc@5 92.032 loss 1.284
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.08%
Epoch: [69]  [   0/2502]  eta: 1:22:20  lr: 0.003242  min_lr: 0.003242  loss: 2.5748 (2.5748)  weight_decay: 0.0500 (0.0500)  time: 1.9746  data: 1.7258  max mem: 18975
Epoch: [69]  [ 200/2502]  eta: 0:10:16  lr: 0.003241  min_lr: 0.003241  loss: 3.8057 (3.5737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7627 (0.7777)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [69]  [ 400/2502]  eta: 0:09:11  lr: 0.003241  min_lr: 0.003241  loss: 3.7247 (3.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (0.7542)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [69]  [ 600/2502]  eta: 0:08:15  lr: 0.003240  min_lr: 0.003240  loss: 3.4461 (3.5707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.7696)  time: 0.2566  data: 0.0005  max mem: 18975
Epoch: [69]  [ 800/2502]  eta: 0:07:22  lr: 0.003239  min_lr: 0.003239  loss: 3.1815 (3.5995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6884 (0.7612)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [69]  [1000/2502]  eta: 0:06:29  lr: 0.003238  min_lr: 0.003238  loss: 3.3836 (3.5935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7300 (0.7636)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [69]  [1200/2502]  eta: 0:05:37  lr: 0.003237  min_lr: 0.003237  loss: 3.7878 (3.5838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7452 (0.7617)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [69]  [1400/2502]  eta: 0:04:45  lr: 0.003236  min_lr: 0.003236  loss: 3.0922 (3.5842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6387 (0.7613)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [69]  [1600/2502]  eta: 0:03:53  lr: 0.003236  min_lr: 0.003236  loss: 3.0446 (3.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8127 (0.7642)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [69]  [1800/2502]  eta: 0:03:01  lr: 0.003235  min_lr: 0.003235  loss: 3.3055 (3.5798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7101 (0.7617)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [69]  [2000/2502]  eta: 0:02:09  lr: 0.003234  min_lr: 0.003234  loss: 4.0406 (3.5802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6970 (0.7561)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [69]  [2200/2502]  eta: 0:01:18  lr: 0.003233  min_lr: 0.003233  loss: 3.2302 (3.5708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7225 (0.7555)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [69]  [2400/2502]  eta: 0:00:26  lr: 0.003232  min_lr: 0.003232  loss: 3.9673 (3.5788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.7555)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [69]  [2501/2502]  eta: 0:00:00  lr: 0.003232  min_lr: 0.003232  loss: 3.2542 (3.5766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7270 (0.7539)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [69] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003232  min_lr: 0.003232  loss: 3.2542 (3.5647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7270 (0.7539)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.6240 (0.6240)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 2.7881  data: 2.6494  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0130 (1.0166)  acc1: 80.4000 (80.0364)  acc5: 94.4000 (95.3091)  time: 0.4699  data: 0.3459  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0370 (1.0310)  acc1: 78.0000 (78.8571)  acc5: 94.4000 (95.3524)  time: 0.2543  data: 0.1334  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1776 (1.1786)  acc1: 72.4000 (75.8839)  acc5: 92.8000 (93.3548)  time: 0.2563  data: 0.1371  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.5600 (1.2658)  acc1: 67.6000 (73.7854)  acc5: 87.6000 (92.1951)  time: 0.2124  data: 0.0933  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5493 (1.2872)  acc1: 67.6000 (73.2320)  acc5: 88.4000 (92.0240)  time: 0.1832  data: 0.0636  max mem: 18975
Test: Total time: 0:00:13 (0.2791 s / it)
* Acc@1 73.162 Acc@5 91.942 loss 1.290
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.16%
Epoch: [70]  [   0/2502]  eta: 1:33:37  lr: 0.003232  min_lr: 0.003232  loss: 3.1159 (3.1159)  weight_decay: 0.0500 (0.0500)  time: 2.2451  data: 1.9725  max mem: 18975
Epoch: [70]  [ 200/2502]  eta: 0:10:20  lr: 0.003231  min_lr: 0.003231  loss: 3.5612 (3.5845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6980 (0.7574)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [70]  [ 400/2502]  eta: 0:09:14  lr: 0.003230  min_lr: 0.003230  loss: 3.2268 (3.5246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.7531)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [70]  [ 600/2502]  eta: 0:08:16  lr: 0.003229  min_lr: 0.003229  loss: 2.9148 (3.5260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.7591)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [70]  [ 800/2502]  eta: 0:07:23  lr: 0.003228  min_lr: 0.003228  loss: 3.4424 (3.5393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.7665)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [70]  [1000/2502]  eta: 0:06:30  lr: 0.003228  min_lr: 0.003228  loss: 3.4147 (3.5371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.7606)  time: 0.2624  data: 0.0004  max mem: 18975
Epoch: [70]  [1200/2502]  eta: 0:05:37  lr: 0.003227  min_lr: 0.003227  loss: 3.5455 (3.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7616)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [70]  [1400/2502]  eta: 0:04:45  lr: 0.003226  min_lr: 0.003226  loss: 3.0394 (3.5431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7435 (0.7621)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [70]  [1600/2502]  eta: 0:03:53  lr: 0.003225  min_lr: 0.003225  loss: 3.5747 (3.5364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7088 (0.7650)  time: 0.2603  data: 0.0004  max mem: 18975
Epoch: [70]  [1800/2502]  eta: 0:03:01  lr: 0.003224  min_lr: 0.003224  loss: 3.5451 (3.5425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6830 (0.7607)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [70]  [2000/2502]  eta: 0:02:09  lr: 0.003223  min_lr: 0.003223  loss: 3.2046 (3.5400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7345 (0.7613)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [70]  [2200/2502]  eta: 0:01:18  lr: 0.003223  min_lr: 0.003223  loss: 3.6192 (3.5434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6975 (0.7590)  time: 0.2603  data: 0.0004  max mem: 18975
Epoch: [70]  [2400/2502]  eta: 0:00:26  lr: 0.003222  min_lr: 0.003222  loss: 3.8571 (3.5507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7216 (0.7595)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [70]  [2501/2502]  eta: 0:00:00  lr: 0.003221  min_lr: 0.003221  loss: 3.1985 (3.5492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7026 (0.7585)  time: 0.2308  data: 0.0008  max mem: 18975
Epoch: [70] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003221  min_lr: 0.003221  loss: 3.1985 (3.5669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7026 (0.7585)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.8602 (0.8602)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 3.3123  data: 3.1576  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.1532 (1.1307)  acc1: 78.4000 (79.0909)  acc5: 95.6000 (95.3818)  time: 0.5223  data: 0.3996  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1532 (1.1378)  acc1: 76.8000 (78.6286)  acc5: 95.6000 (95.4667)  time: 0.2531  data: 0.1336  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2557 (1.2855)  acc1: 71.6000 (75.6903)  acc5: 92.8000 (93.3161)  time: 0.2428  data: 0.1234  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6193 (1.3665)  acc1: 68.8000 (73.9220)  acc5: 88.0000 (92.2829)  time: 0.2371  data: 0.1181  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.6056 (1.3842)  acc1: 68.4000 (73.4000)  acc5: 89.2000 (92.0000)  time: 0.2160  data: 0.0970  max mem: 18975
Test: Total time: 0:00:14 (0.2955 s / it)
* Acc@1 73.322 Acc@5 92.078 loss 1.390
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.32%
Epoch: [71]  [   0/2502]  eta: 1:16:31  lr: 0.003221  min_lr: 0.003221  loss: 2.2883 (2.2883)  weight_decay: 0.0500 (0.0500)  time: 1.8351  data: 1.5692  max mem: 18975
Epoch: [71]  [ 200/2502]  eta: 0:10:16  lr: 0.003220  min_lr: 0.003220  loss: 3.6062 (3.5232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7708 (0.7454)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [71]  [ 400/2502]  eta: 0:09:12  lr: 0.003220  min_lr: 0.003220  loss: 3.6226 (3.5619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (0.7472)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [71]  [ 600/2502]  eta: 0:08:16  lr: 0.003219  min_lr: 0.003219  loss: 3.7059 (3.5668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.7476)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [71]  [ 800/2502]  eta: 0:07:22  lr: 0.003218  min_lr: 0.003218  loss: 3.6048 (3.5546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.7639)  time: 0.2637  data: 0.0004  max mem: 18975
Epoch: [71]  [1000/2502]  eta: 0:06:29  lr: 0.003217  min_lr: 0.003217  loss: 3.1279 (3.5376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7131 (0.7674)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [71]  [1200/2502]  eta: 0:05:37  lr: 0.003216  min_lr: 0.003216  loss: 3.6959 (3.5265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (0.7642)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [71]  [1400/2502]  eta: 0:04:45  lr: 0.003215  min_lr: 0.003215  loss: 3.0327 (3.5292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7457 (0.7609)  time: 0.2562  data: 0.0005  max mem: 18975
Epoch: [71]  [1600/2502]  eta: 0:03:53  lr: 0.003214  min_lr: 0.003214  loss: 3.7693 (3.5292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.7604)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [71]  [1800/2502]  eta: 0:03:01  lr: 0.003214  min_lr: 0.003214  loss: 3.4346 (3.5435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8234 (0.7652)  time: 0.2578  data: 0.0003  max mem: 18975
Epoch: [71]  [2000/2502]  eta: 0:02:09  lr: 0.003213  min_lr: 0.003213  loss: 3.6094 (3.5454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6619 (0.7605)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [71]  [2200/2502]  eta: 0:01:17  lr: 0.003212  min_lr: 0.003212  loss: 3.3147 (3.5430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6724 (0.7574)  time: 0.2591  data: 0.0004  max mem: 18975
Epoch: [71]  [2400/2502]  eta: 0:00:26  lr: 0.003211  min_lr: 0.003211  loss: 4.1108 (3.5402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8311 (0.7613)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [71]  [2501/2502]  eta: 0:00:00  lr: 0.003211  min_lr: 0.003211  loss: 3.0316 (3.5411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7613 (0.7606)  time: 0.2304  data: 0.0008  max mem: 18975
Epoch: [71] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003211  min_lr: 0.003211  loss: 3.0316 (3.5598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7613 (0.7606)
Test:  [ 0/50]  eta: 0:02:18  loss: 0.6896 (0.6896)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 2.7756  data: 2.6291  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9330 (0.9639)  acc1: 80.8000 (79.8182)  acc5: 96.0000 (95.3818)  time: 0.4870  data: 0.3637  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9330 (0.9666)  acc1: 77.6000 (79.0667)  acc5: 96.0000 (95.4857)  time: 0.2624  data: 0.1421  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1651 (1.1179)  acc1: 72.4000 (75.9097)  acc5: 92.4000 (93.5742)  time: 0.2605  data: 0.1403  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4598 (1.2034)  acc1: 67.6000 (73.5902)  acc5: 88.4000 (92.5561)  time: 0.2482  data: 0.1283  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4411 (1.2166)  acc1: 67.6000 (73.2240)  acc5: 89.2000 (92.3440)  time: 0.2138  data: 0.0948  max mem: 18975
Test: Total time: 0:00:14 (0.2915 s / it)
* Acc@1 73.366 Acc@5 92.194 loss 1.219
Accuracy of the model on the 50000 test images: 73.4%
Max accuracy: 73.37%
Epoch: [72]  [   0/2502]  eta: 1:24:04  lr: 0.003211  min_lr: 0.003211  loss: 4.2169 (4.2169)  weight_decay: 0.0500 (0.0500)  time: 2.0161  data: 1.7388  max mem: 18975
Epoch: [72]  [ 200/2502]  eta: 0:10:23  lr: 0.003210  min_lr: 0.003210  loss: 3.9058 (3.5293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.7454)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [72]  [ 400/2502]  eta: 0:09:15  lr: 0.003209  min_lr: 0.003209  loss: 3.4431 (3.5253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7110 (0.7636)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [72]  [ 600/2502]  eta: 0:08:17  lr: 0.003208  min_lr: 0.003208  loss: 3.5839 (3.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7118 (0.7527)  time: 0.2566  data: 0.0009  max mem: 18975
Epoch: [72]  [ 800/2502]  eta: 0:07:23  lr: 0.003207  min_lr: 0.003207  loss: 3.2854 (3.5368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7480 (0.7613)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [72]  [1000/2502]  eta: 0:06:30  lr: 0.003206  min_lr: 0.003206  loss: 3.6338 (3.5339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7917 (0.7622)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [72]  [1200/2502]  eta: 0:05:37  lr: 0.003205  min_lr: 0.003205  loss: 3.7313 (3.5317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6927 (0.7620)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [72]  [1400/2502]  eta: 0:04:45  lr: 0.003205  min_lr: 0.003205  loss: 3.9923 (3.5379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7361 (0.7637)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [72]  [1600/2502]  eta: 0:03:53  lr: 0.003204  min_lr: 0.003204  loss: 3.3257 (3.5415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7133 (0.7604)  time: 0.2658  data: 0.0004  max mem: 18975
Epoch: [72]  [1800/2502]  eta: 0:03:01  lr: 0.003203  min_lr: 0.003203  loss: 3.6399 (3.5445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7900 (0.7607)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [72]  [2000/2502]  eta: 0:02:09  lr: 0.003202  min_lr: 0.003202  loss: 3.8399 (3.5454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.7652)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [72]  [2200/2502]  eta: 0:01:18  lr: 0.003201  min_lr: 0.003201  loss: 3.6043 (3.5519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (0.7652)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [72]  [2400/2502]  eta: 0:00:26  lr: 0.003200  min_lr: 0.003200  loss: 3.9913 (3.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7657)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [72]  [2501/2502]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 3.5726 (3.5572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.7643)  time: 0.2312  data: 0.0007  max mem: 18975
Epoch: [72] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 3.5726 (3.5610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7534 (0.7643)
Test:  [ 0/50]  eta: 0:02:25  loss: 0.7220 (0.7220)  acc1: 89.6000 (89.6000)  acc5: 97.6000 (97.6000)  time: 2.9179  data: 2.7791  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.0971 (1.0733)  acc1: 79.2000 (80.4000)  acc5: 96.4000 (95.6000)  time: 0.4934  data: 0.3713  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0982 (1.0992)  acc1: 78.8000 (79.6191)  acc5: 95.6000 (95.4476)  time: 0.2220  data: 0.0991  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3318 (1.2324)  acc1: 73.2000 (76.6710)  acc5: 92.0000 (93.4323)  time: 0.2544  data: 0.1320  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5409 (1.3093)  acc1: 69.6000 (74.5659)  acc5: 88.4000 (92.4488)  time: 0.2750  data: 0.1558  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5155 (1.3316)  acc1: 68.4000 (73.8880)  acc5: 89.2000 (92.1600)  time: 0.1990  data: 0.0800  max mem: 18975
Test: Total time: 0:00:14 (0.2897 s / it)
* Acc@1 73.344 Acc@5 92.038 loss 1.341
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.37%
Epoch: [73]  [   0/2502]  eta: 1:32:10  lr: 0.003200  min_lr: 0.003200  loss: 4.1669 (4.1669)  weight_decay: 0.0500 (0.0500)  time: 2.2105  data: 1.4719  max mem: 18975
Epoch: [73]  [ 200/2502]  eta: 0:10:18  lr: 0.003199  min_lr: 0.003199  loss: 3.6308 (3.6019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.7493)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [73]  [ 400/2502]  eta: 0:09:11  lr: 0.003198  min_lr: 0.003198  loss: 3.6681 (3.6165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7437 (0.7558)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [73]  [ 600/2502]  eta: 0:08:16  lr: 0.003197  min_lr: 0.003197  loss: 3.1902 (3.5729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.7559)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [73]  [ 800/2502]  eta: 0:07:21  lr: 0.003196  min_lr: 0.003196  loss: 3.7525 (3.5688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.7464)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [73]  [1000/2502]  eta: 0:06:28  lr: 0.003195  min_lr: 0.003195  loss: 3.9118 (3.5816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.7508)  time: 0.2556  data: 0.0004  max mem: 18975
Epoch: [73]  [1200/2502]  eta: 0:05:36  lr: 0.003194  min_lr: 0.003194  loss: 3.4475 (3.5873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6863 (0.7471)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [73]  [1400/2502]  eta: 0:04:44  lr: 0.003193  min_lr: 0.003193  loss: 3.2947 (3.5853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8366 (0.7564)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [73]  [1600/2502]  eta: 0:03:52  lr: 0.003193  min_lr: 0.003193  loss: 3.7548 (3.5879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.7598)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [73]  [1800/2502]  eta: 0:03:01  lr: 0.003192  min_lr: 0.003192  loss: 3.8414 (3.5771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6989 (0.7587)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [73]  [2000/2502]  eta: 0:02:09  lr: 0.003191  min_lr: 0.003191  loss: 3.8740 (3.5771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7653 (0.7596)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [73]  [2200/2502]  eta: 0:01:17  lr: 0.003190  min_lr: 0.003190  loss: 3.7699 (3.5728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7969 (0.7635)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [73]  [2400/2502]  eta: 0:00:26  lr: 0.003189  min_lr: 0.003189  loss: 3.8287 (3.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.7639)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [73]  [2501/2502]  eta: 0:00:00  lr: 0.003189  min_lr: 0.003189  loss: 3.5769 (3.5749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.7621)  time: 0.2308  data: 0.0010  max mem: 18975
Epoch: [73] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.003189  min_lr: 0.003189  loss: 3.5769 (3.5627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.7621)
Test:  [ 0/50]  eta: 0:02:11  loss: 0.6637 (0.6637)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 2.6353  data: 2.4937  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 1.0269 (1.0273)  acc1: 80.0000 (79.6000)  acc5: 96.0000 (95.1636)  time: 0.4266  data: 0.3047  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0626 (1.0337)  acc1: 78.0000 (78.3619)  acc5: 95.6000 (95.4476)  time: 0.2338  data: 0.1141  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2218 (1.1804)  acc1: 73.2000 (75.6129)  acc5: 92.8000 (93.6387)  time: 0.2594  data: 0.1382  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5245 (1.2609)  acc1: 69.2000 (73.8146)  acc5: 88.8000 (92.5268)  time: 0.2665  data: 0.1434  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5177 (1.2778)  acc1: 68.4000 (73.3280)  acc5: 89.6000 (92.3520)  time: 0.2210  data: 0.0988  max mem: 18975
Test: Total time: 0:00:14 (0.2859 s / it)
* Acc@1 73.296 Acc@5 92.322 loss 1.281
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.37%
Epoch: [74]  [   0/2502]  eta: 1:34:57  lr: 0.003189  min_lr: 0.003189  loss: 2.8284 (2.8284)  weight_decay: 0.0500 (0.0500)  time: 2.2772  data: 1.7539  max mem: 18975
Epoch: [74]  [ 200/2502]  eta: 0:10:17  lr: 0.003188  min_lr: 0.003188  loss: 2.9784 (3.4951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7412 (0.7322)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [74]  [ 400/2502]  eta: 0:09:12  lr: 0.003187  min_lr: 0.003187  loss: 3.8840 (3.5415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [74]  [ 600/2502]  eta: 0:08:16  lr: 0.003186  min_lr: 0.003186  loss: 3.6606 (3.5316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7214 (inf)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [74]  [ 800/2502]  eta: 0:07:23  lr: 0.003185  min_lr: 0.003185  loss: 3.7367 (3.5433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7383 (inf)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [74]  [1000/2502]  eta: 0:06:30  lr: 0.003184  min_lr: 0.003184  loss: 3.3767 (3.5541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7269 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [74]  [1200/2502]  eta: 0:05:37  lr: 0.003183  min_lr: 0.003183  loss: 3.7234 (3.5591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6658 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [74]  [1400/2502]  eta: 0:04:45  lr: 0.003182  min_lr: 0.003182  loss: 3.5761 (3.5635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [74]  [1600/2502]  eta: 0:03:53  lr: 0.003181  min_lr: 0.003181  loss: 3.0535 (3.5685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7638 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [74]  [1800/2502]  eta: 0:03:01  lr: 0.003180  min_lr: 0.003180  loss: 3.7209 (3.5640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [74]  [2000/2502]  eta: 0:02:09  lr: 0.003180  min_lr: 0.003180  loss: 3.8759 (3.5648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7296 (inf)  time: 0.2592  data: 0.0004  max mem: 18975
Epoch: [74]  [2200/2502]  eta: 0:01:18  lr: 0.003179  min_lr: 0.003179  loss: 3.3654 (3.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (inf)  time: 0.2578  data: 0.0005  max mem: 18975
Epoch: [74]  [2400/2502]  eta: 0:00:26  lr: 0.003178  min_lr: 0.003178  loss: 3.0502 (3.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (inf)  time: 0.2580  data: 0.0005  max mem: 18975
Epoch: [74]  [2501/2502]  eta: 0:00:00  lr: 0.003177  min_lr: 0.003177  loss: 3.1738 (3.5560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (inf)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [74] Total time: 0:10:46 (0.2586 s / it)
Averaged stats: lr: 0.003177  min_lr: 0.003177  loss: 3.1738 (3.5479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7136 (inf)
Test:  [ 0/50]  eta: 0:02:35  loss: 0.6227 (0.6227)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 3.1108  data: 2.9716  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9910 (1.0492)  acc1: 80.4000 (79.1273)  acc5: 94.8000 (94.9091)  time: 0.5212  data: 0.3985  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9907 (1.0533)  acc1: 78.4000 (78.7429)  acc5: 95.2000 (95.3714)  time: 0.2693  data: 0.1492  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2492 (1.1841)  acc1: 72.4000 (75.9613)  acc5: 92.8000 (93.4065)  time: 0.2842  data: 0.1649  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5084 (1.2688)  acc1: 68.8000 (73.9902)  acc5: 88.8000 (92.3707)  time: 0.2660  data: 0.1467  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5049 (1.2794)  acc1: 68.8000 (73.5440)  acc5: 89.6000 (92.2000)  time: 0.2394  data: 0.1204  max mem: 18975
Test: Total time: 0:00:15 (0.3083 s / it)
* Acc@1 73.496 Acc@5 92.312 loss 1.275
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.50%
Epoch: [75]  [   0/2502]  eta: 1:33:47  lr: 0.003177  min_lr: 0.003177  loss: 2.9018 (2.9018)  weight_decay: 0.0500 (0.0500)  time: 2.2492  data: 1.9764  max mem: 18975
Epoch: [75]  [ 200/2502]  eta: 0:10:20  lr: 0.003176  min_lr: 0.003176  loss: 3.1171 (3.5592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (0.7865)  time: 0.2587  data: 0.0004  max mem: 18975
Epoch: [75]  [ 400/2502]  eta: 0:09:15  lr: 0.003176  min_lr: 0.003176  loss: 3.8723 (3.5484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6990 (0.7618)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [75]  [ 600/2502]  eta: 0:08:17  lr: 0.003175  min_lr: 0.003175  loss: 3.7983 (3.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7146 (0.7599)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [75]  [ 800/2502]  eta: 0:07:23  lr: 0.003174  min_lr: 0.003174  loss: 3.2302 (3.5556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.7664)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [75]  [1000/2502]  eta: 0:06:29  lr: 0.003173  min_lr: 0.003173  loss: 3.5356 (3.5720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7508 (0.7626)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [75]  [1200/2502]  eta: 0:05:37  lr: 0.003172  min_lr: 0.003172  loss: 3.7241 (3.5613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7009 (0.7594)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [75]  [1400/2502]  eta: 0:04:45  lr: 0.003171  min_lr: 0.003171  loss: 3.3263 (3.5661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7789 (0.7594)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [75]  [1600/2502]  eta: 0:03:53  lr: 0.003170  min_lr: 0.003170  loss: 3.5115 (3.5620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6973 (0.7623)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [75]  [1800/2502]  eta: 0:03:01  lr: 0.003169  min_lr: 0.003169  loss: 3.7160 (3.5617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (0.7614)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [75]  [2000/2502]  eta: 0:02:09  lr: 0.003168  min_lr: 0.003168  loss: 3.1846 (3.5624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7843 (0.7632)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [75]  [2200/2502]  eta: 0:01:18  lr: 0.003167  min_lr: 0.003167  loss: 4.0063 (3.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.7638)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [75]  [2400/2502]  eta: 0:00:26  lr: 0.003166  min_lr: 0.003166  loss: 3.7722 (3.5627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.7641)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [75]  [2501/2502]  eta: 0:00:00  lr: 0.003166  min_lr: 0.003166  loss: 3.6699 (3.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6932 (0.7648)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [75] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003166  min_lr: 0.003166  loss: 3.6699 (3.5529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6932 (0.7648)
Test:  [ 0/50]  eta: 0:03:17  loss: 0.7795 (0.7795)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 3.9508  data: 3.8057  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.1224 (1.0631)  acc1: 79.6000 (80.2545)  acc5: 96.4000 (95.9273)  time: 0.5580  data: 0.4354  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.1230 (1.1003)  acc1: 76.8000 (78.9905)  acc5: 96.0000 (95.7333)  time: 0.2265  data: 0.1067  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2483 (1.2265)  acc1: 73.2000 (76.2194)  acc5: 91.6000 (93.8194)  time: 0.2438  data: 0.1235  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5860 (1.3121)  acc1: 68.0000 (74.0390)  acc5: 88.0000 (92.4098)  time: 0.2556  data: 0.1349  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5690 (1.3275)  acc1: 68.4000 (73.4400)  acc5: 88.0000 (92.2560)  time: 0.2356  data: 0.1161  max mem: 18975
Test: Total time: 0:00:15 (0.3060 s / it)
* Acc@1 73.642 Acc@5 92.348 loss 1.330
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.64%
Epoch: [76]  [   0/2502]  eta: 1:15:27  lr: 0.003166  min_lr: 0.003166  loss: 2.8203 (2.8203)  weight_decay: 0.0500 (0.0500)  time: 1.8096  data: 1.5458  max mem: 18975
Epoch: [76]  [ 200/2502]  eta: 0:10:15  lr: 0.003165  min_lr: 0.003165  loss: 3.5190 (3.4368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6921 (0.7245)  time: 0.2634  data: 0.0004  max mem: 18975
Epoch: [76]  [ 400/2502]  eta: 0:09:10  lr: 0.003164  min_lr: 0.003164  loss: 3.3661 (3.4609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7412 (0.7334)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [76]  [ 600/2502]  eta: 0:08:15  lr: 0.003163  min_lr: 0.003163  loss: 3.1554 (3.4734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7051 (0.7312)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [76]  [ 800/2502]  eta: 0:07:21  lr: 0.003162  min_lr: 0.003162  loss: 3.4088 (3.4830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7419 (0.7386)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [76]  [1000/2502]  eta: 0:06:28  lr: 0.003161  min_lr: 0.003161  loss: 3.3606 (3.4834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7101 (0.7411)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [76]  [1200/2502]  eta: 0:05:36  lr: 0.003160  min_lr: 0.003160  loss: 3.7055 (3.4970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7537 (0.7439)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [76]  [1400/2502]  eta: 0:04:45  lr: 0.003159  min_lr: 0.003159  loss: 3.4218 (3.5041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8124 (0.7429)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [76]  [1600/2502]  eta: 0:03:53  lr: 0.003158  min_lr: 0.003158  loss: 3.6711 (3.5114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7118 (0.7438)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [76]  [1800/2502]  eta: 0:03:01  lr: 0.003158  min_lr: 0.003158  loss: 3.5867 (3.5146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.7417)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [76]  [2000/2502]  eta: 0:02:09  lr: 0.003157  min_lr: 0.003157  loss: 3.9667 (3.5166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (0.7409)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [76]  [2200/2502]  eta: 0:01:17  lr: 0.003156  min_lr: 0.003156  loss: 3.4251 (3.5142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7707 (0.7464)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [76]  [2400/2502]  eta: 0:00:26  lr: 0.003155  min_lr: 0.003155  loss: 3.7269 (3.5161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7944 (0.7462)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [76]  [2501/2502]  eta: 0:00:00  lr: 0.003154  min_lr: 0.003154  loss: 3.5769 (3.5146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6966 (0.7455)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [76] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003154  min_lr: 0.003154  loss: 3.5769 (3.5416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6966 (0.7455)
Test:  [ 0/50]  eta: 0:02:17  loss: 0.7234 (0.7234)  acc1: 89.6000 (89.6000)  acc5: 97.6000 (97.6000)  time: 2.7572  data: 2.6113  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0383 (1.0435)  acc1: 80.8000 (79.0182)  acc5: 95.6000 (95.0182)  time: 0.4606  data: 0.3389  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0645 (1.0524)  acc1: 77.6000 (78.0191)  acc5: 95.6000 (95.2571)  time: 0.2671  data: 0.1477  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2054 (1.1877)  acc1: 72.4000 (75.6000)  acc5: 90.8000 (93.3419)  time: 0.2698  data: 0.1504  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5654 (1.2766)  acc1: 67.6000 (73.3951)  acc5: 88.8000 (92.2244)  time: 0.2436  data: 0.1236  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5548 (1.2942)  acc1: 67.6000 (72.8800)  acc5: 89.2000 (92.0240)  time: 0.2086  data: 0.0843  max mem: 18975
Test: Total time: 0:00:14 (0.2926 s / it)
* Acc@1 73.304 Acc@5 92.200 loss 1.294
Accuracy of the model on the 50000 test images: 73.3%
Max accuracy: 73.64%
Epoch: [77]  [   0/2502]  eta: 1:35:17  lr: 0.003154  min_lr: 0.003154  loss: 3.5840 (3.5840)  weight_decay: 0.0500 (0.0500)  time: 2.2852  data: 1.5529  max mem: 18975
Epoch: [77]  [ 200/2502]  eta: 0:10:20  lr: 0.003153  min_lr: 0.003153  loss: 3.5078 (3.5558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.7500)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [77]  [ 400/2502]  eta: 0:09:13  lr: 0.003152  min_lr: 0.003152  loss: 3.9281 (3.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7546 (0.7469)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [77]  [ 600/2502]  eta: 0:08:17  lr: 0.003151  min_lr: 0.003151  loss: 3.5543 (3.5482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.7585)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [77]  [ 800/2502]  eta: 0:07:23  lr: 0.003150  min_lr: 0.003150  loss: 3.8390 (3.5559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.7455)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [77]  [1000/2502]  eta: 0:06:29  lr: 0.003150  min_lr: 0.003150  loss: 3.6859 (3.5567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6879 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [77]  [1200/2502]  eta: 0:05:37  lr: 0.003149  min_lr: 0.003149  loss: 3.8996 (3.5576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7046 (inf)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [77]  [1400/2502]  eta: 0:04:45  lr: 0.003148  min_lr: 0.003148  loss: 3.0707 (3.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7535 (inf)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [77]  [1600/2502]  eta: 0:03:53  lr: 0.003147  min_lr: 0.003147  loss: 3.4727 (3.5328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7186 (inf)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [77]  [1800/2502]  eta: 0:03:01  lr: 0.003146  min_lr: 0.003146  loss: 2.9253 (3.5282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7491 (inf)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [77]  [2000/2502]  eta: 0:02:09  lr: 0.003145  min_lr: 0.003145  loss: 3.5748 (3.5289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [77]  [2200/2502]  eta: 0:01:17  lr: 0.003144  min_lr: 0.003144  loss: 2.9261 (3.5261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [77]  [2400/2502]  eta: 0:00:26  lr: 0.003143  min_lr: 0.003143  loss: 3.9890 (3.5318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6639 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [77]  [2501/2502]  eta: 0:00:00  lr: 0.003142  min_lr: 0.003142  loss: 3.3090 (3.5332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (inf)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [77] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.003142  min_lr: 0.003142  loss: 3.3090 (3.5430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (inf)
Test:  [ 0/50]  eta: 0:03:11  loss: 0.7040 (0.7040)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 3.8213  data: 3.6710  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.9977 (1.0082)  acc1: 81.2000 (79.5273)  acc5: 95.2000 (95.3455)  time: 0.5697  data: 0.4464  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9604 (1.0085)  acc1: 78.8000 (78.5524)  acc5: 96.0000 (95.5810)  time: 0.2672  data: 0.1465  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2279 (1.1494)  acc1: 72.4000 (75.7548)  acc5: 92.4000 (93.8194)  time: 0.2537  data: 0.1326  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4616 (1.2279)  acc1: 69.2000 (73.8537)  acc5: 90.0000 (92.7610)  time: 0.1971  data: 0.0767  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4453 (1.2551)  acc1: 68.0000 (73.4160)  acc5: 90.0000 (92.5360)  time: 0.1967  data: 0.0776  max mem: 18975
Test: Total time: 0:00:14 (0.2864 s / it)
* Acc@1 73.946 Acc@5 92.480 loss 1.254
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.95%
Epoch: [78]  [   0/2502]  eta: 1:19:54  lr: 0.003142  min_lr: 0.003142  loss: 2.6778 (2.6778)  weight_decay: 0.0500 (0.0500)  time: 1.9162  data: 1.6287  max mem: 18975
Epoch: [78]  [ 200/2502]  eta: 0:10:17  lr: 0.003141  min_lr: 0.003141  loss: 3.1356 (3.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6915 (0.7435)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [78]  [ 400/2502]  eta: 0:09:11  lr: 0.003141  min_lr: 0.003141  loss: 3.1044 (3.5582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7240 (0.7529)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [78]  [ 600/2502]  eta: 0:08:16  lr: 0.003140  min_lr: 0.003140  loss: 3.7568 (3.5627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6971 (0.7468)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [78]  [ 800/2502]  eta: 0:07:23  lr: 0.003139  min_lr: 0.003139  loss: 4.0138 (3.5907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7711 (0.7536)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [78]  [1000/2502]  eta: 0:06:29  lr: 0.003138  min_lr: 0.003138  loss: 3.5773 (3.5702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.7580)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [78]  [1200/2502]  eta: 0:05:37  lr: 0.003137  min_lr: 0.003137  loss: 3.8014 (3.5618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7513 (0.7593)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [78]  [1400/2502]  eta: 0:04:45  lr: 0.003136  min_lr: 0.003136  loss: 3.7410 (3.5704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.7578)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [78]  [1600/2502]  eta: 0:03:53  lr: 0.003135  min_lr: 0.003135  loss: 2.8880 (3.5628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7698 (0.7613)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [78]  [1800/2502]  eta: 0:03:01  lr: 0.003134  min_lr: 0.003134  loss: 3.7250 (3.5632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.7561)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [78]  [2000/2502]  eta: 0:02:09  lr: 0.003133  min_lr: 0.003133  loss: 3.2886 (3.5620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7834 (0.7603)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [78]  [2200/2502]  eta: 0:01:17  lr: 0.003132  min_lr: 0.003132  loss: 3.4140 (3.5550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6783 (0.7565)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [78]  [2400/2502]  eta: 0:00:26  lr: 0.003131  min_lr: 0.003131  loss: 3.7796 (3.5529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.7559)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [78]  [2501/2502]  eta: 0:00:00  lr: 0.003130  min_lr: 0.003130  loss: 3.2921 (3.5512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6423 (0.7520)  time: 0.2316  data: 0.0008  max mem: 18975
Epoch: [78] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003130  min_lr: 0.003130  loss: 3.2921 (3.5345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6423 (0.7520)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.7623 (0.7623)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 3.3137  data: 3.1705  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.1191 (1.0740)  acc1: 79.2000 (78.9455)  acc5: 95.6000 (94.8727)  time: 0.5711  data: 0.4461  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0566 (1.0737)  acc1: 78.4000 (78.5905)  acc5: 95.2000 (95.2191)  time: 0.2614  data: 0.1398  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2146 (1.1832)  acc1: 75.2000 (76.4516)  acc5: 92.4000 (93.3548)  time: 0.2446  data: 0.1253  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4754 (1.2644)  acc1: 69.6000 (74.4976)  acc5: 89.2000 (92.1951)  time: 0.2299  data: 0.1111  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4968 (1.2815)  acc1: 68.0000 (73.6640)  acc5: 89.2000 (92.1120)  time: 0.2198  data: 0.1011  max mem: 18975
Test: Total time: 0:00:14 (0.2945 s / it)
* Acc@1 73.786 Acc@5 92.272 loss 1.279
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 73.95%
Epoch: [79]  [   0/2502]  eta: 1:27:31  lr: 0.003130  min_lr: 0.003130  loss: 2.7865 (2.7865)  weight_decay: 0.0500 (0.0500)  time: 2.0990  data: 1.8259  max mem: 18975
Epoch: [79]  [ 200/2502]  eta: 0:10:21  lr: 0.003130  min_lr: 0.003130  loss: 3.4645 (3.4742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7535 (nan)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [79]  [ 400/2502]  eta: 0:09:15  lr: 0.003129  min_lr: 0.003129  loss: 4.0037 (3.5304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7676 (nan)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [79]  [ 600/2502]  eta: 0:08:18  lr: 0.003128  min_lr: 0.003128  loss: 3.4514 (3.5352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7783 (nan)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [79]  [ 800/2502]  eta: 0:07:23  lr: 0.003127  min_lr: 0.003127  loss: 4.0263 (3.5241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7115 (nan)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [79]  [1000/2502]  eta: 0:06:30  lr: 0.003126  min_lr: 0.003126  loss: 3.1993 (3.5155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7551 (nan)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [79]  [1200/2502]  eta: 0:05:37  lr: 0.003125  min_lr: 0.003125  loss: 3.2202 (3.5202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6848 (nan)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [79]  [1400/2502]  eta: 0:04:45  lr: 0.003124  min_lr: 0.003124  loss: 3.5627 (3.5273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7990 (nan)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [79]  [1600/2502]  eta: 0:03:53  lr: 0.003123  min_lr: 0.003123  loss: 3.4324 (3.5262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6819 (nan)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [79]  [1800/2502]  eta: 0:03:01  lr: 0.003122  min_lr: 0.003122  loss: 3.7687 (3.5334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8213 (nan)  time: 0.2642  data: 0.0005  max mem: 18975
Epoch: [79]  [2000/2502]  eta: 0:02:09  lr: 0.003121  min_lr: 0.003121  loss: 3.5819 (3.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [79]  [2200/2502]  eta: 0:01:18  lr: 0.003120  min_lr: 0.003120  loss: 3.9056 (3.5317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6979 (nan)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [79]  [2400/2502]  eta: 0:00:26  lr: 0.003119  min_lr: 0.003119  loss: 2.6398 (3.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (nan)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [79]  [2501/2502]  eta: 0:00:00  lr: 0.003118  min_lr: 0.003118  loss: 4.0045 (3.5248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8047 (nan)  time: 0.2348  data: 0.0009  max mem: 18975
Epoch: [79] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.003118  min_lr: 0.003118  loss: 4.0045 (3.5255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8047 (nan)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.7395 (0.7395)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 3.2403  data: 3.0934  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.0037 (1.0611)  acc1: 80.0000 (79.6364)  acc5: 95.6000 (95.3091)  time: 0.5207  data: 0.3972  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0730 (1.0898)  acc1: 77.6000 (78.5714)  acc5: 95.6000 (95.5429)  time: 0.2492  data: 0.1291  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2400 (1.2198)  acc1: 72.8000 (75.9742)  acc5: 93.2000 (93.5742)  time: 0.2529  data: 0.1323  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5760 (1.3115)  acc1: 69.6000 (74.1854)  acc5: 88.0000 (92.3024)  time: 0.2596  data: 0.1381  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5760 (1.3250)  acc1: 69.6000 (73.7360)  acc5: 88.0000 (92.0800)  time: 0.2080  data: 0.0867  max mem: 18975
Test: Total time: 0:00:14 (0.2984 s / it)
* Acc@1 73.912 Acc@5 92.166 loss 1.324
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.95%
Epoch: [80]  [   0/2502]  eta: 1:13:08  lr: 0.003118  min_lr: 0.003118  loss: 4.2238 (4.2238)  weight_decay: 0.0500 (0.0500)  time: 1.7539  data: 1.3925  max mem: 18975
Epoch: [80]  [ 200/2502]  eta: 0:10:15  lr: 0.003117  min_lr: 0.003117  loss: 3.7646 (3.5049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7065 (0.7258)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [80]  [ 400/2502]  eta: 0:09:12  lr: 0.003116  min_lr: 0.003116  loss: 3.8352 (3.5605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (0.7270)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [80]  [ 600/2502]  eta: 0:08:15  lr: 0.003115  min_lr: 0.003115  loss: 3.4018 (3.5488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6822 (0.7351)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [80]  [ 800/2502]  eta: 0:07:21  lr: 0.003114  min_lr: 0.003114  loss: 3.2492 (3.5466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7340)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [80]  [1000/2502]  eta: 0:06:29  lr: 0.003113  min_lr: 0.003113  loss: 3.3113 (3.5518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7273 (0.7333)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [80]  [1200/2502]  eta: 0:05:37  lr: 0.003112  min_lr: 0.003112  loss: 3.5648 (3.5517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7296 (0.7355)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [80]  [1400/2502]  eta: 0:04:44  lr: 0.003111  min_lr: 0.003111  loss: 3.7974 (3.5558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7404 (0.7355)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [80]  [1600/2502]  eta: 0:03:53  lr: 0.003110  min_lr: 0.003110  loss: 3.8933 (3.5604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7271 (0.7367)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [80]  [1800/2502]  eta: 0:03:01  lr: 0.003109  min_lr: 0.003109  loss: 3.7840 (3.5616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (0.7388)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [80]  [2000/2502]  eta: 0:02:09  lr: 0.003108  min_lr: 0.003108  loss: 3.3698 (3.5582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7813 (0.7405)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [80]  [2200/2502]  eta: 0:01:17  lr: 0.003107  min_lr: 0.003107  loss: 3.4286 (3.5558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.7428)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [80]  [2400/2502]  eta: 0:00:26  lr: 0.003106  min_lr: 0.003106  loss: 3.7885 (3.5588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7195 (0.7474)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [80]  [2501/2502]  eta: 0:00:00  lr: 0.003106  min_lr: 0.003106  loss: 3.5552 (3.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7607 (0.7499)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [80] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003106  min_lr: 0.003106  loss: 3.5552 (3.5332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7607 (0.7499)
Test:  [ 0/50]  eta: 0:02:40  loss: 0.6890 (0.6890)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 3.2055  data: 3.0662  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0231 (1.0380)  acc1: 78.8000 (79.4182)  acc5: 95.2000 (95.2000)  time: 0.5295  data: 0.4078  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0231 (1.0503)  acc1: 76.8000 (78.4762)  acc5: 96.4000 (95.5810)  time: 0.2684  data: 0.1485  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2889 (1.1715)  acc1: 72.8000 (76.0903)  acc5: 92.4000 (93.7936)  time: 0.2893  data: 0.1685  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4963 (1.2534)  acc1: 69.6000 (74.0000)  acc5: 88.8000 (92.6634)  time: 0.2457  data: 0.1238  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4992 (1.2701)  acc1: 69.6000 (73.6320)  acc5: 89.2000 (92.4480)  time: 0.2450  data: 0.1237  max mem: 18975
Test: Total time: 0:00:14 (0.2933 s / it)
* Acc@1 73.818 Acc@5 92.386 loss 1.273
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 73.95%
Epoch: [81]  [   0/2502]  eta: 1:38:23  lr: 0.003106  min_lr: 0.003106  loss: 3.9159 (3.9159)  weight_decay: 0.0500 (0.0500)  time: 2.3597  data: 2.0943  max mem: 18975
Epoch: [81]  [ 200/2502]  eta: 0:10:21  lr: 0.003105  min_lr: 0.003105  loss: 3.5281 (3.5369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (0.7190)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [81]  [ 400/2502]  eta: 0:09:14  lr: 0.003104  min_lr: 0.003104  loss: 3.0740 (3.5119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.7352)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [81]  [ 600/2502]  eta: 0:08:17  lr: 0.003103  min_lr: 0.003103  loss: 3.7671 (3.5247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7908 (0.7430)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [81]  [ 800/2502]  eta: 0:07:23  lr: 0.003102  min_lr: 0.003102  loss: 3.7338 (3.5138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7467 (0.7408)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [81]  [1000/2502]  eta: 0:06:30  lr: 0.003101  min_lr: 0.003101  loss: 3.0485 (3.5052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7520 (0.7499)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [81]  [1200/2502]  eta: 0:05:37  lr: 0.003100  min_lr: 0.003100  loss: 3.6575 (3.5254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7295 (0.7509)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [81]  [1400/2502]  eta: 0:04:45  lr: 0.003099  min_lr: 0.003099  loss: 3.7704 (3.5277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.7467)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [81]  [1600/2502]  eta: 0:03:53  lr: 0.003098  min_lr: 0.003098  loss: 3.4585 (3.5361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6873 (0.7477)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [81]  [1800/2502]  eta: 0:03:01  lr: 0.003097  min_lr: 0.003097  loss: 3.3164 (3.5373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7420 (0.7504)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [81]  [2000/2502]  eta: 0:02:09  lr: 0.003096  min_lr: 0.003096  loss: 3.6820 (3.5425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7064 (0.7506)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [81]  [2200/2502]  eta: 0:01:18  lr: 0.003095  min_lr: 0.003095  loss: 3.7097 (3.5457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.7503)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [81]  [2400/2502]  eta: 0:00:26  lr: 0.003094  min_lr: 0.003094  loss: 3.9490 (3.5525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7383 (0.7487)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [81]  [2501/2502]  eta: 0:00:00  lr: 0.003094  min_lr: 0.003094  loss: 3.7192 (3.5541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7406 (0.7498)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [81] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003094  min_lr: 0.003094  loss: 3.7192 (3.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7406 (0.7498)
Test:  [ 0/50]  eta: 0:02:11  loss: 0.7956 (0.7956)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 2.6202  data: 2.4767  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0883 (1.0916)  acc1: 80.0000 (79.8182)  acc5: 95.6000 (95.2727)  time: 0.4638  data: 0.3406  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0883 (1.0947)  acc1: 77.6000 (78.8571)  acc5: 95.2000 (95.4857)  time: 0.2485  data: 0.1262  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.3724 (1.2402)  acc1: 74.4000 (76.0258)  acc5: 91.6000 (93.3936)  time: 0.2469  data: 0.1255  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5742 (1.3376)  acc1: 68.4000 (74.0878)  acc5: 88.4000 (92.2049)  time: 0.2586  data: 0.1395  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5742 (1.3546)  acc1: 68.0000 (73.4960)  acc5: 88.4000 (92.0560)  time: 0.2205  data: 0.1014  max mem: 18975
Test: Total time: 0:00:14 (0.2870 s / it)
* Acc@1 73.460 Acc@5 92.100 loss 1.348
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.95%
Epoch: [82]  [   0/2502]  eta: 1:36:17  lr: 0.003093  min_lr: 0.003093  loss: 4.0264 (4.0264)  weight_decay: 0.0500 (0.0500)  time: 2.3091  data: 1.8914  max mem: 18975
Epoch: [82]  [ 200/2502]  eta: 0:10:23  lr: 0.003092  min_lr: 0.003092  loss: 3.7231 (3.4115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8063 (0.7748)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [82]  [ 400/2502]  eta: 0:09:13  lr: 0.003091  min_lr: 0.003091  loss: 3.4450 (3.4494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8240 (0.7652)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [82]  [ 600/2502]  eta: 0:08:16  lr: 0.003090  min_lr: 0.003090  loss: 3.3434 (3.4641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7092 (0.7644)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [82]  [ 800/2502]  eta: 0:07:22  lr: 0.003089  min_lr: 0.003089  loss: 3.6772 (3.4818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6667 (0.7567)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [82]  [1000/2502]  eta: 0:06:30  lr: 0.003088  min_lr: 0.003088  loss: 3.4976 (3.5017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6988 (0.7500)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [82]  [1200/2502]  eta: 0:05:37  lr: 0.003087  min_lr: 0.003087  loss: 3.2525 (3.4984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7966 (0.7555)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [82]  [1400/2502]  eta: 0:04:45  lr: 0.003086  min_lr: 0.003086  loss: 3.4223 (3.5129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.7477)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [82]  [1600/2502]  eta: 0:03:53  lr: 0.003085  min_lr: 0.003085  loss: 3.5476 (3.5105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7854 (0.7523)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [82]  [1800/2502]  eta: 0:03:01  lr: 0.003084  min_lr: 0.003084  loss: 3.8100 (3.5126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (0.7489)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [82]  [2000/2502]  eta: 0:02:09  lr: 0.003083  min_lr: 0.003083  loss: 3.5642 (3.5188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.7528)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [82]  [2200/2502]  eta: 0:01:17  lr: 0.003082  min_lr: 0.003082  loss: 3.5733 (3.5208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6547 (0.7538)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [82]  [2400/2502]  eta: 0:00:26  lr: 0.003081  min_lr: 0.003081  loss: 3.6206 (3.5272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.7564)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [82]  [2501/2502]  eta: 0:00:00  lr: 0.003081  min_lr: 0.003081  loss: 3.5051 (3.5284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (0.7541)  time: 0.2304  data: 0.0007  max mem: 18975
Epoch: [82] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.003081  min_lr: 0.003081  loss: 3.5051 (3.5204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (0.7541)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.6626 (0.6626)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 3.3605  data: 3.2198  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.0055 (1.0405)  acc1: 81.2000 (80.2909)  acc5: 95.2000 (95.0909)  time: 0.5167  data: 0.3939  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9956 (1.0320)  acc1: 78.0000 (79.7143)  acc5: 95.2000 (95.3524)  time: 0.2471  data: 0.1270  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2764 (1.1660)  acc1: 73.6000 (76.7226)  acc5: 91.6000 (93.7677)  time: 0.2679  data: 0.1488  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5636 (1.2674)  acc1: 69.6000 (74.6146)  acc5: 88.8000 (92.5463)  time: 0.2416  data: 0.1226  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5280 (1.2874)  acc1: 69.2000 (74.0480)  acc5: 88.8000 (92.3520)  time: 0.1789  data: 0.0600  max mem: 18975
Test: Total time: 0:00:14 (0.2925 s / it)
* Acc@1 74.094 Acc@5 92.362 loss 1.282
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.09%
Epoch: [83]  [   0/2502]  eta: 1:20:40  lr: 0.003081  min_lr: 0.003081  loss: 4.4913 (4.4913)  weight_decay: 0.0500 (0.0500)  time: 1.9345  data: 1.6613  max mem: 18975
Epoch: [83]  [ 200/2502]  eta: 0:10:14  lr: 0.003080  min_lr: 0.003080  loss: 3.6583 (3.5515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7603 (0.7651)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [83]  [ 400/2502]  eta: 0:09:10  lr: 0.003079  min_lr: 0.003079  loss: 3.7023 (3.5334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7715)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [83]  [ 600/2502]  eta: 0:08:14  lr: 0.003078  min_lr: 0.003078  loss: 3.4637 (3.5245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6866 (0.7607)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [83]  [ 800/2502]  eta: 0:07:21  lr: 0.003077  min_lr: 0.003077  loss: 3.8805 (3.5098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7593 (0.7649)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [83]  [1000/2502]  eta: 0:06:28  lr: 0.003076  min_lr: 0.003076  loss: 3.5696 (3.5067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7282 (0.7655)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [83]  [1200/2502]  eta: 0:05:36  lr: 0.003075  min_lr: 0.003075  loss: 3.4460 (3.5134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7726 (0.7628)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [83]  [1400/2502]  eta: 0:04:44  lr: 0.003074  min_lr: 0.003074  loss: 3.8947 (3.5167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7090 (0.7618)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [83]  [1600/2502]  eta: 0:03:52  lr: 0.003073  min_lr: 0.003073  loss: 3.6732 (3.5191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7291 (0.7593)  time: 0.2580  data: 0.0005  max mem: 18975
Epoch: [83]  [1800/2502]  eta: 0:03:01  lr: 0.003072  min_lr: 0.003072  loss: 3.8118 (3.5152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (0.7562)  time: 0.2577  data: 0.0005  max mem: 18975
Epoch: [83]  [2000/2502]  eta: 0:02:09  lr: 0.003071  min_lr: 0.003071  loss: 3.9544 (3.5189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6926 (0.7557)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [83]  [2200/2502]  eta: 0:01:17  lr: 0.003070  min_lr: 0.003070  loss: 3.9148 (3.5242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8079 (0.7549)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [83]  [2400/2502]  eta: 0:00:26  lr: 0.003069  min_lr: 0.003069  loss: 3.6510 (3.5202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7629 (0.7554)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [83]  [2501/2502]  eta: 0:00:00  lr: 0.003068  min_lr: 0.003068  loss: 3.2413 (3.5241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7313 (0.7557)  time: 0.2306  data: 0.0008  max mem: 18975
Epoch: [83] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.003068  min_lr: 0.003068  loss: 3.2413 (3.5170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7313 (0.7557)
Test:  [ 0/50]  eta: 0:02:11  loss: 0.7200 (0.7200)  acc1: 89.2000 (89.2000)  acc5: 97.6000 (97.6000)  time: 2.6368  data: 2.4926  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 1.0810 (1.0549)  acc1: 80.4000 (80.3273)  acc5: 95.2000 (94.9818)  time: 0.4269  data: 0.3052  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0223 (1.0499)  acc1: 78.4000 (79.6762)  acc5: 95.2000 (95.3524)  time: 0.2738  data: 0.1541  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2374 (1.1755)  acc1: 74.0000 (76.9032)  acc5: 93.2000 (93.5742)  time: 0.3061  data: 0.1867  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4722 (1.2509)  acc1: 69.6000 (74.9073)  acc5: 90.0000 (92.7512)  time: 0.2459  data: 0.1260  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4722 (1.2619)  acc1: 69.2000 (74.4320)  acc5: 90.4000 (92.6480)  time: 0.1908  data: 0.0708  max mem: 18975
Test: Total time: 0:00:14 (0.2923 s / it)
* Acc@1 74.126 Acc@5 92.476 loss 1.265
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.13%
Epoch: [84]  [   0/2502]  eta: 1:12:05  lr: 0.003068  min_lr: 0.003068  loss: 3.8580 (3.8580)  weight_decay: 0.0500 (0.0500)  time: 1.7288  data: 1.4672  max mem: 18975
Epoch: [84]  [ 200/2502]  eta: 0:10:15  lr: 0.003067  min_lr: 0.003067  loss: 3.8596 (3.6011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7727 (0.7458)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [84]  [ 400/2502]  eta: 0:09:12  lr: 0.003066  min_lr: 0.003066  loss: 3.2630 (3.5197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7787 (0.7654)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [84]  [ 600/2502]  eta: 0:08:16  lr: 0.003065  min_lr: 0.003065  loss: 3.6874 (3.5069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8425 (0.7688)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [84]  [ 800/2502]  eta: 0:07:22  lr: 0.003064  min_lr: 0.003064  loss: 3.0320 (3.5217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7594 (0.7739)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [84]  [1000/2502]  eta: 0:06:29  lr: 0.003063  min_lr: 0.003063  loss: 3.7156 (3.5217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8068 (0.7674)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [84]  [1200/2502]  eta: 0:05:37  lr: 0.003062  min_lr: 0.003062  loss: 3.5773 (3.5128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (0.7690)  time: 0.2639  data: 0.0004  max mem: 18975
Epoch: [84]  [1400/2502]  eta: 0:04:45  lr: 0.003061  min_lr: 0.003061  loss: 3.5040 (3.5112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6977 (0.7662)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [84]  [1600/2502]  eta: 0:03:53  lr: 0.003060  min_lr: 0.003060  loss: 3.7672 (3.5098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7902 (0.7731)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [84]  [1800/2502]  eta: 0:03:01  lr: 0.003059  min_lr: 0.003059  loss: 4.0971 (3.5122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7335 (0.7682)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [84]  [2000/2502]  eta: 0:02:09  lr: 0.003058  min_lr: 0.003058  loss: 3.3397 (3.5197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.7707)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [84]  [2200/2502]  eta: 0:01:18  lr: 0.003057  min_lr: 0.003057  loss: 3.6291 (3.5242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7034 (0.7661)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [84]  [2400/2502]  eta: 0:00:26  lr: 0.003056  min_lr: 0.003056  loss: 3.5748 (3.5261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7206 (0.7639)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [84]  [2501/2502]  eta: 0:00:00  lr: 0.003055  min_lr: 0.003055  loss: 3.6223 (3.5256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7279 (0.7645)  time: 0.2309  data: 0.0008  max mem: 18975
Epoch: [84] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003055  min_lr: 0.003055  loss: 3.6223 (3.5091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7279 (0.7645)
Test:  [ 0/50]  eta: 0:03:22  loss: 0.7402 (0.7402)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 4.0592  data: 3.9137  max mem: 18975
Test:  [10/50]  eta: 0:00:25  loss: 1.1354 (1.1195)  acc1: 80.4000 (80.0364)  acc5: 96.0000 (95.4546)  time: 0.6316  data: 0.5077  max mem: 18975
Test:  [20/50]  eta: 0:00:14  loss: 1.1354 (1.1231)  acc1: 79.2000 (79.4667)  acc5: 96.0000 (95.4857)  time: 0.2951  data: 0.1736  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3332 (1.2423)  acc1: 74.0000 (76.8516)  acc5: 91.6000 (93.7161)  time: 0.2658  data: 0.1453  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5415 (1.3228)  acc1: 68.8000 (74.5463)  acc5: 89.2000 (92.6439)  time: 0.2023  data: 0.0828  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5253 (1.3318)  acc1: 69.2000 (74.0640)  acc5: 90.0000 (92.5680)  time: 0.2016  data: 0.0827  max mem: 18975
Test: Total time: 0:00:15 (0.3037 s / it)
* Acc@1 74.014 Acc@5 92.496 loss 1.333
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.13%
Epoch: [85]  [   0/2502]  eta: 1:43:21  lr: 0.003055  min_lr: 0.003055  loss: 4.0251 (4.0251)  weight_decay: 0.0500 (0.0500)  time: 2.4786  data: 1.8611  max mem: 18975
Epoch: [85]  [ 200/2502]  eta: 0:10:21  lr: 0.003054  min_lr: 0.003054  loss: 3.3661 (3.4272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6896 (0.7493)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [85]  [ 400/2502]  eta: 0:09:15  lr: 0.003053  min_lr: 0.003053  loss: 3.8796 (3.5041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.7743)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [85]  [ 600/2502]  eta: 0:08:18  lr: 0.003052  min_lr: 0.003052  loss: 3.7484 (3.5045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7079 (0.7567)  time: 0.2636  data: 0.0004  max mem: 18975
Epoch: [85]  [ 800/2502]  eta: 0:07:24  lr: 0.003051  min_lr: 0.003051  loss: 3.5361 (3.4947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7430 (0.7653)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [85]  [1000/2502]  eta: 0:06:30  lr: 0.003050  min_lr: 0.003050  loss: 3.3307 (3.4992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7041 (0.7599)  time: 0.2565  data: 0.0005  max mem: 18975
Epoch: [85]  [1200/2502]  eta: 0:05:37  lr: 0.003049  min_lr: 0.003049  loss: 3.2481 (3.4788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7043 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [85]  [1400/2502]  eta: 0:04:45  lr: 0.003048  min_lr: 0.003048  loss: 3.1941 (3.4908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (inf)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [85]  [1600/2502]  eta: 0:03:53  lr: 0.003047  min_lr: 0.003047  loss: 3.2665 (3.4919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (inf)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [85]  [1800/2502]  eta: 0:03:01  lr: 0.003046  min_lr: 0.003046  loss: 3.6768 (3.4896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7765 (inf)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [85]  [2000/2502]  eta: 0:02:09  lr: 0.003044  min_lr: 0.003044  loss: 3.3336 (3.4913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (inf)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [85]  [2200/2502]  eta: 0:01:18  lr: 0.003043  min_lr: 0.003043  loss: 3.9617 (3.4924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6968 (inf)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [85]  [2400/2502]  eta: 0:00:26  lr: 0.003042  min_lr: 0.003042  loss: 3.2067 (3.4923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7771 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [85]  [2501/2502]  eta: 0:00:00  lr: 0.003042  min_lr: 0.003042  loss: 3.7045 (3.4947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (inf)  time: 0.2314  data: 0.0009  max mem: 18975
Epoch: [85] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003042  min_lr: 0.003042  loss: 3.7045 (3.5109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (inf)
Test:  [ 0/50]  eta: 0:02:33  loss: 0.6925 (0.6925)  acc1: 91.2000 (91.2000)  acc5: 98.0000 (98.0000)  time: 3.0718  data: 2.9334  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0292 (1.0654)  acc1: 80.4000 (80.1818)  acc5: 96.0000 (95.6000)  time: 0.4532  data: 0.3321  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0292 (1.0534)  acc1: 77.2000 (79.3333)  acc5: 95.2000 (96.0000)  time: 0.2379  data: 0.1185  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2038 (1.1903)  acc1: 74.4000 (76.4903)  acc5: 93.6000 (94.0258)  time: 0.2807  data: 0.1606  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5340 (1.2788)  acc1: 69.6000 (74.5756)  acc5: 88.8000 (92.5854)  time: 0.2579  data: 0.1363  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5280 (1.2945)  acc1: 69.6000 (73.9840)  acc5: 89.2000 (92.4800)  time: 0.2018  data: 0.0811  max mem: 18975
Test: Total time: 0:00:14 (0.2922 s / it)
* Acc@1 74.114 Acc@5 92.644 loss 1.290
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.13%
Epoch: [86]  [   0/2502]  eta: 1:29:39  lr: 0.003042  min_lr: 0.003042  loss: 2.3883 (2.3883)  weight_decay: 0.0500 (0.0500)  time: 2.1500  data: 1.8541  max mem: 18975
Epoch: [86]  [ 200/2502]  eta: 0:10:18  lr: 0.003041  min_lr: 0.003041  loss: 3.8477 (3.4960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (0.7715)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [86]  [ 400/2502]  eta: 0:09:12  lr: 0.003040  min_lr: 0.003040  loss: 3.2118 (3.5161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8326 (0.7822)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [86]  [ 600/2502]  eta: 0:08:16  lr: 0.003039  min_lr: 0.003039  loss: 3.5669 (3.5186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6806 (0.7713)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [86]  [ 800/2502]  eta: 0:07:22  lr: 0.003038  min_lr: 0.003038  loss: 3.6749 (3.5272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (nan)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [86]  [1000/2502]  eta: 0:06:30  lr: 0.003037  min_lr: 0.003037  loss: 3.6834 (3.5338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7533 (nan)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [86]  [1200/2502]  eta: 0:05:37  lr: 0.003035  min_lr: 0.003035  loss: 2.9524 (3.5206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7980 (nan)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [86]  [1400/2502]  eta: 0:04:45  lr: 0.003034  min_lr: 0.003034  loss: 3.7177 (3.5137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7866 (nan)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [86]  [1600/2502]  eta: 0:03:53  lr: 0.003033  min_lr: 0.003033  loss: 3.1818 (3.5149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (nan)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [86]  [1800/2502]  eta: 0:03:01  lr: 0.003032  min_lr: 0.003032  loss: 3.0596 (3.5162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (nan)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [86]  [2000/2502]  eta: 0:02:09  lr: 0.003031  min_lr: 0.003031  loss: 3.2999 (3.5140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6725 (nan)  time: 0.2566  data: 0.0005  max mem: 18975
Epoch: [86]  [2200/2502]  eta: 0:01:18  lr: 0.003030  min_lr: 0.003030  loss: 3.0613 (3.5190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7383 (nan)  time: 0.2642  data: 0.0005  max mem: 18975
Epoch: [86]  [2400/2502]  eta: 0:00:26  lr: 0.003029  min_lr: 0.003029  loss: 3.1230 (3.5212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [86]  [2501/2502]  eta: 0:00:00  lr: 0.003029  min_lr: 0.003029  loss: 3.1739 (3.5189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (nan)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [86] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.003029  min_lr: 0.003029  loss: 3.1739 (3.5081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (nan)
Test:  [ 0/50]  eta: 0:02:39  loss: 0.6610 (0.6610)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 3.1932  data: 3.0540  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9298 (0.9661)  acc1: 80.8000 (80.0727)  acc5: 96.4000 (95.5636)  time: 0.5217  data: 0.4003  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9350 (0.9855)  acc1: 78.0000 (78.6667)  acc5: 96.4000 (95.8095)  time: 0.2517  data: 0.1321  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2031 (1.1206)  acc1: 73.6000 (76.3484)  acc5: 91.2000 (93.6129)  time: 0.2672  data: 0.1479  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4663 (1.2088)  acc1: 70.0000 (74.5171)  acc5: 88.4000 (92.5366)  time: 0.2599  data: 0.1409  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4663 (1.2284)  acc1: 69.6000 (73.8560)  acc5: 89.2000 (92.4640)  time: 0.2057  data: 0.0868  max mem: 18975
Test: Total time: 0:00:14 (0.2994 s / it)
* Acc@1 74.044 Acc@5 92.638 loss 1.227
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.13%
Epoch: [87]  [   0/2502]  eta: 1:39:23  lr: 0.003029  min_lr: 0.003029  loss: 2.4920 (2.4920)  weight_decay: 0.0500 (0.0500)  time: 2.3836  data: 1.4595  max mem: 18975
Epoch: [87]  [ 200/2502]  eta: 0:10:24  lr: 0.003027  min_lr: 0.003027  loss: 3.4746 (3.4335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7476 (0.7586)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [87]  [ 400/2502]  eta: 0:09:14  lr: 0.003026  min_lr: 0.003026  loss: 3.1922 (3.4768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6735 (0.7287)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [87]  [ 600/2502]  eta: 0:08:16  lr: 0.003025  min_lr: 0.003025  loss: 3.2563 (3.4787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.7452)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [87]  [ 800/2502]  eta: 0:07:22  lr: 0.003024  min_lr: 0.003024  loss: 3.2683 (3.4660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7521 (0.7444)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [87]  [1000/2502]  eta: 0:06:29  lr: 0.003023  min_lr: 0.003023  loss: 2.9897 (3.4822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7147 (0.7496)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [87]  [1200/2502]  eta: 0:05:37  lr: 0.003022  min_lr: 0.003022  loss: 3.1662 (3.4895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7796 (0.7533)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [87]  [1400/2502]  eta: 0:04:45  lr: 0.003021  min_lr: 0.003021  loss: 3.3921 (3.4762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6928 (0.7561)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [87]  [1600/2502]  eta: 0:03:53  lr: 0.003020  min_lr: 0.003020  loss: 3.4709 (3.4774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6892 (0.7513)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [87]  [1800/2502]  eta: 0:03:01  lr: 0.003019  min_lr: 0.003019  loss: 3.2991 (3.4870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6871 (0.7469)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [87]  [2000/2502]  eta: 0:02:09  lr: 0.003018  min_lr: 0.003018  loss: 3.4940 (3.4954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6790 (0.7479)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [87]  [2200/2502]  eta: 0:01:17  lr: 0.003017  min_lr: 0.003017  loss: 3.7147 (3.5045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6921 (0.7450)  time: 0.2585  data: 0.0003  max mem: 18975
Epoch: [87]  [2400/2502]  eta: 0:00:26  lr: 0.003016  min_lr: 0.003016  loss: 3.4232 (3.5007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7200 (0.7458)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [87]  [2501/2502]  eta: 0:00:00  lr: 0.003015  min_lr: 0.003015  loss: 3.5410 (3.5037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8210 (0.7477)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [87] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.003015  min_lr: 0.003015  loss: 3.5410 (3.5079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8210 (0.7477)
Test:  [ 0/50]  eta: 0:03:22  loss: 0.5772 (0.5772)  acc1: 91.6000 (91.6000)  acc5: 100.0000 (100.0000)  time: 4.0556  data: 3.9104  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9540 (0.9675)  acc1: 80.0000 (79.8545)  acc5: 95.6000 (95.8182)  time: 0.5151  data: 0.3921  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9845 (0.9941)  acc1: 78.0000 (78.7810)  acc5: 96.0000 (95.8667)  time: 0.1705  data: 0.0504  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2361 (1.1317)  acc1: 74.4000 (76.6194)  acc5: 92.4000 (94.0645)  time: 0.2374  data: 0.1179  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5216 (1.2300)  acc1: 70.8000 (74.5951)  acc5: 89.2000 (92.7512)  time: 0.2974  data: 0.1768  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4928 (1.2452)  acc1: 70.8000 (74.0720)  acc5: 89.2000 (92.5920)  time: 0.2303  data: 0.1100  max mem: 18975
Test: Total time: 0:00:14 (0.3000 s / it)
* Acc@1 74.056 Acc@5 92.560 loss 1.244
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.13%
Epoch: [88]  [   0/2502]  eta: 1:19:58  lr: 0.003015  min_lr: 0.003015  loss: 3.3505 (3.3505)  weight_decay: 0.0500 (0.0500)  time: 1.9178  data: 1.6424  max mem: 18975
Epoch: [88]  [ 200/2502]  eta: 0:10:14  lr: 0.003014  min_lr: 0.003014  loss: 3.2421 (3.4906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7086 (0.7538)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [88]  [ 400/2502]  eta: 0:09:10  lr: 0.003013  min_lr: 0.003013  loss: 2.8332 (3.5112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.8036)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [88]  [ 600/2502]  eta: 0:08:15  lr: 0.003012  min_lr: 0.003012  loss: 3.9928 (3.5376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.7915)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [88]  [ 800/2502]  eta: 0:07:22  lr: 0.003011  min_lr: 0.003011  loss: 3.7335 (3.5381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7674 (0.7808)  time: 0.2569  data: 0.0005  max mem: 18975
Epoch: [88]  [1000/2502]  eta: 0:06:29  lr: 0.003010  min_lr: 0.003010  loss: 3.9486 (3.5436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (0.7768)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [88]  [1200/2502]  eta: 0:05:37  lr: 0.003009  min_lr: 0.003009  loss: 3.4779 (3.5458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.7786)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [88]  [1400/2502]  eta: 0:04:45  lr: 0.003007  min_lr: 0.003007  loss: 3.7114 (3.5407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.7775)  time: 0.2589  data: 0.0003  max mem: 18975
Epoch: [88]  [1600/2502]  eta: 0:03:53  lr: 0.003006  min_lr: 0.003006  loss: 3.5971 (3.5331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7680 (0.7778)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [88]  [1800/2502]  eta: 0:03:01  lr: 0.003005  min_lr: 0.003005  loss: 3.4762 (3.5402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.7734)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [88]  [2000/2502]  eta: 0:02:09  lr: 0.003004  min_lr: 0.003004  loss: 3.5693 (3.5272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6754 (0.7698)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [88]  [2200/2502]  eta: 0:01:18  lr: 0.003003  min_lr: 0.003003  loss: 3.1996 (3.5222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7599 (0.7693)  time: 0.2583  data: 0.0003  max mem: 18975
Epoch: [88]  [2400/2502]  eta: 0:00:26  lr: 0.003002  min_lr: 0.003002  loss: 3.6598 (3.5115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7709)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [88]  [2501/2502]  eta: 0:00:00  lr: 0.003001  min_lr: 0.003001  loss: 3.2587 (3.5128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7352 (0.7706)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [88] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.003001  min_lr: 0.003001  loss: 3.2587 (3.5095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7352 (0.7706)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.7254 (0.7254)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 3.2519  data: 3.1065  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9951 (1.0202)  acc1: 81.2000 (80.1455)  acc5: 96.8000 (96.2182)  time: 0.5412  data: 0.4188  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9975 (1.0373)  acc1: 78.8000 (79.2381)  acc5: 96.8000 (96.0571)  time: 0.2654  data: 0.1459  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2920 (1.1737)  acc1: 73.2000 (76.4387)  acc5: 92.0000 (94.2065)  time: 0.2461  data: 0.1266  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5365 (1.2640)  acc1: 68.0000 (74.2829)  acc5: 90.4000 (93.0829)  time: 0.2495  data: 0.1300  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5240 (1.2825)  acc1: 68.0000 (73.7120)  acc5: 90.4000 (92.8720)  time: 0.2213  data: 0.1026  max mem: 18975
Test: Total time: 0:00:15 (0.3058 s / it)
* Acc@1 74.034 Acc@5 92.746 loss 1.281
Accuracy of the model on the 50000 test images: 74.0%
Max accuracy: 74.13%
Epoch: [89]  [   0/2502]  eta: 1:27:46  lr: 0.003001  min_lr: 0.003001  loss: 3.7681 (3.7681)  weight_decay: 0.0500 (0.0500)  time: 2.1048  data: 1.4559  max mem: 18975
Epoch: [89]  [ 200/2502]  eta: 0:10:17  lr: 0.003000  min_lr: 0.003000  loss: 3.2393 (3.3775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7472 (0.7710)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [89]  [ 400/2502]  eta: 0:09:12  lr: 0.002999  min_lr: 0.002999  loss: 3.1107 (3.4136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7182 (0.7921)  time: 0.2634  data: 0.0003  max mem: 18975
Epoch: [89]  [ 600/2502]  eta: 0:08:16  lr: 0.002998  min_lr: 0.002998  loss: 3.6105 (3.4653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7569 (0.7831)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [89]  [ 800/2502]  eta: 0:07:22  lr: 0.002997  min_lr: 0.002997  loss: 3.1631 (3.4564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7755)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [89]  [1000/2502]  eta: 0:06:29  lr: 0.002996  min_lr: 0.002996  loss: 2.9882 (3.4599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.7762)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [89]  [1200/2502]  eta: 0:05:37  lr: 0.002995  min_lr: 0.002995  loss: 3.5016 (3.4694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.7722)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [89]  [1400/2502]  eta: 0:04:45  lr: 0.002994  min_lr: 0.002994  loss: 3.5198 (3.4682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7415 (0.7691)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [89]  [1600/2502]  eta: 0:03:53  lr: 0.002993  min_lr: 0.002993  loss: 3.0976 (3.4596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7249 (0.7658)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [89]  [1800/2502]  eta: 0:03:01  lr: 0.002991  min_lr: 0.002991  loss: 3.6837 (3.4696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7653)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [89]  [2000/2502]  eta: 0:02:09  lr: 0.002990  min_lr: 0.002990  loss: 3.2589 (3.4705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7792 (0.7628)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [89]  [2200/2502]  eta: 0:01:17  lr: 0.002989  min_lr: 0.002989  loss: 3.1645 (3.4706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7473 (0.7635)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [89]  [2400/2502]  eta: 0:00:26  lr: 0.002988  min_lr: 0.002988  loss: 3.7715 (3.4709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7359 (0.7612)  time: 0.2670  data: 0.0004  max mem: 18975
Epoch: [89]  [2501/2502]  eta: 0:00:00  lr: 0.002988  min_lr: 0.002988  loss: 3.6870 (3.4732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7308 (0.7611)  time: 0.2382  data: 0.0009  max mem: 18975
Epoch: [89] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002988  min_lr: 0.002988  loss: 3.6870 (3.4884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7308 (0.7611)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.6212 (0.6212)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 2.7865  data: 2.6503  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 1.0384 (1.0798)  acc1: 78.8000 (80.3273)  acc5: 96.0000 (95.4182)  time: 0.5097  data: 0.3890  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0870 (1.0945)  acc1: 77.2000 (79.2952)  acc5: 96.0000 (95.4476)  time: 0.2875  data: 0.1682  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2746 (1.2179)  acc1: 73.2000 (76.5161)  acc5: 92.4000 (93.6516)  time: 0.3030  data: 0.1822  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5066 (1.2948)  acc1: 69.2000 (74.6049)  acc5: 88.4000 (92.7122)  time: 0.2692  data: 0.1474  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4939 (1.3136)  acc1: 69.6000 (74.1440)  acc5: 88.8000 (92.4080)  time: 0.2338  data: 0.1135  max mem: 18975
Test: Total time: 0:00:15 (0.3108 s / it)
* Acc@1 74.082 Acc@5 92.530 loss 1.312
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.13%
Epoch: [90]  [   0/2502]  eta: 1:37:59  lr: 0.002988  min_lr: 0.002988  loss: 2.3527 (2.3527)  weight_decay: 0.0500 (0.0500)  time: 2.3498  data: 2.0660  max mem: 18975
Epoch: [90]  [ 200/2502]  eta: 0:10:20  lr: 0.002986  min_lr: 0.002986  loss: 3.1813 (3.4554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.7638)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [90]  [ 400/2502]  eta: 0:09:13  lr: 0.002985  min_lr: 0.002985  loss: 3.7695 (3.4643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.7701)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [90]  [ 600/2502]  eta: 0:08:16  lr: 0.002984  min_lr: 0.002984  loss: 3.7046 (3.4324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7842 (0.7680)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [90]  [ 800/2502]  eta: 0:07:22  lr: 0.002983  min_lr: 0.002983  loss: 3.8544 (3.4375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6944 (0.7604)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [90]  [1000/2502]  eta: 0:06:29  lr: 0.002982  min_lr: 0.002982  loss: 3.2196 (3.4507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.7570)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [90]  [1200/2502]  eta: 0:05:37  lr: 0.002981  min_lr: 0.002981  loss: 2.7319 (3.4523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7438 (0.7571)  time: 0.2632  data: 0.0004  max mem: 18975
Epoch: [90]  [1400/2502]  eta: 0:04:45  lr: 0.002980  min_lr: 0.002980  loss: 3.7937 (3.4644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7121 (0.7567)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [90]  [1600/2502]  eta: 0:03:53  lr: 0.002979  min_lr: 0.002979  loss: 3.7041 (3.4665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7817 (0.7595)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [90]  [1800/2502]  eta: 0:03:01  lr: 0.002978  min_lr: 0.002978  loss: 3.3135 (3.4658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7264 (0.7596)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [90]  [2000/2502]  eta: 0:02:09  lr: 0.002976  min_lr: 0.002976  loss: 3.8560 (3.4688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7278 (0.7599)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [90]  [2200/2502]  eta: 0:01:18  lr: 0.002975  min_lr: 0.002975  loss: 3.5414 (3.4707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.7605)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [90]  [2400/2502]  eta: 0:00:26  lr: 0.002974  min_lr: 0.002974  loss: 3.7790 (3.4745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (0.7604)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [90]  [2501/2502]  eta: 0:00:00  lr: 0.002974  min_lr: 0.002974  loss: 3.6956 (3.4733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7529 (0.7616)  time: 0.2302  data: 0.0007  max mem: 18975
Epoch: [90] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002974  min_lr: 0.002974  loss: 3.6956 (3.4910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7529 (0.7616)
Test:  [ 0/50]  eta: 0:03:20  loss: 0.6775 (0.6775)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 4.0050  data: 3.8613  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.0414 (1.0519)  acc1: 81.6000 (80.2545)  acc5: 95.6000 (95.6727)  time: 0.6068  data: 0.4840  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0094 (1.0502)  acc1: 78.0000 (79.6191)  acc5: 95.6000 (95.8095)  time: 0.2481  data: 0.1282  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1761 (1.1770)  acc1: 73.6000 (77.1484)  acc5: 92.4000 (93.9226)  time: 0.2280  data: 0.1089  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5511 (1.2640)  acc1: 68.8000 (75.1415)  acc5: 89.2000 (92.8585)  time: 0.2042  data: 0.0842  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5480 (1.2801)  acc1: 68.8000 (74.6320)  acc5: 89.6000 (92.7040)  time: 0.2258  data: 0.1059  max mem: 18975
Test: Total time: 0:00:14 (0.2943 s / it)
* Acc@1 74.332 Acc@5 92.644 loss 1.282
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.33%
Epoch: [91]  [   0/2502]  eta: 1:16:34  lr: 0.002974  min_lr: 0.002974  loss: 3.9217 (3.9217)  weight_decay: 0.0500 (0.0500)  time: 1.8362  data: 1.5661  max mem: 18975
Epoch: [91]  [ 200/2502]  eta: 0:10:22  lr: 0.002973  min_lr: 0.002973  loss: 2.9810 (3.5357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7341 (0.7249)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [91]  [ 400/2502]  eta: 0:09:13  lr: 0.002971  min_lr: 0.002971  loss: 3.8912 (3.4878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6989 (0.7441)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [91]  [ 600/2502]  eta: 0:08:16  lr: 0.002970  min_lr: 0.002970  loss: 3.2894 (3.4805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7324 (0.7531)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [91]  [ 800/2502]  eta: 0:07:22  lr: 0.002969  min_lr: 0.002969  loss: 3.5047 (3.4834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (0.7582)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [91]  [1000/2502]  eta: 0:06:29  lr: 0.002968  min_lr: 0.002968  loss: 3.5528 (3.4858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7626)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [91]  [1200/2502]  eta: 0:05:37  lr: 0.002967  min_lr: 0.002967  loss: 3.3353 (3.4772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7894 (0.7634)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [91]  [1400/2502]  eta: 0:04:44  lr: 0.002966  min_lr: 0.002966  loss: 3.6319 (3.4762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.7616)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [91]  [1600/2502]  eta: 0:03:53  lr: 0.002965  min_lr: 0.002965  loss: 3.7180 (3.4801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7470 (0.7618)  time: 0.2634  data: 0.0004  max mem: 18975
Epoch: [91]  [1800/2502]  eta: 0:03:01  lr: 0.002963  min_lr: 0.002963  loss: 3.0427 (3.4796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.7622)  time: 0.2570  data: 0.0005  max mem: 18975
Epoch: [91]  [2000/2502]  eta: 0:02:09  lr: 0.002962  min_lr: 0.002962  loss: 3.3230 (3.4856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7555 (0.7648)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [91]  [2200/2502]  eta: 0:01:17  lr: 0.002961  min_lr: 0.002961  loss: 3.4397 (3.4912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7456 (0.7640)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [91]  [2400/2502]  eta: 0:00:26  lr: 0.002960  min_lr: 0.002960  loss: 3.1934 (3.4911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7043 (0.7621)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [91]  [2501/2502]  eta: 0:00:00  lr: 0.002960  min_lr: 0.002960  loss: 3.6176 (3.4891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7571 (0.7644)  time: 0.2311  data: 0.0010  max mem: 18975
Epoch: [91] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002960  min_lr: 0.002960  loss: 3.6176 (3.4947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7571 (0.7644)
Test:  [ 0/50]  eta: 0:03:16  loss: 0.6808 (0.6808)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 3.9365  data: 3.7884  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0034 (1.0130)  acc1: 81.2000 (80.2182)  acc5: 95.2000 (95.2364)  time: 0.5284  data: 0.4063  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0034 (1.0230)  acc1: 79.2000 (79.6000)  acc5: 95.6000 (95.4857)  time: 0.2031  data: 0.0832  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2795 (1.1520)  acc1: 74.4000 (77.1226)  acc5: 92.8000 (93.9097)  time: 0.2301  data: 0.1094  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4693 (1.2339)  acc1: 69.6000 (75.1805)  acc5: 89.6000 (92.8293)  time: 0.2459  data: 0.1259  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4539 (1.2563)  acc1: 68.8000 (74.2240)  acc5: 89.2000 (92.5040)  time: 0.2093  data: 0.0905  max mem: 18975
Test: Total time: 0:00:14 (0.2913 s / it)
* Acc@1 74.174 Acc@5 92.604 loss 1.255
Accuracy of the model on the 50000 test images: 74.2%
Max accuracy: 74.33%
Epoch: [92]  [   0/2502]  eta: 1:36:24  lr: 0.002960  min_lr: 0.002960  loss: 3.9824 (3.9824)  weight_decay: 0.0500 (0.0500)  time: 2.3118  data: 1.7738  max mem: 18975
Epoch: [92]  [ 200/2502]  eta: 0:10:19  lr: 0.002958  min_lr: 0.002958  loss: 3.7487 (3.4823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6850 (0.7062)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [92]  [ 400/2502]  eta: 0:09:13  lr: 0.002957  min_lr: 0.002957  loss: 3.3321 (3.5082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7363 (0.7235)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [92]  [ 600/2502]  eta: 0:08:16  lr: 0.002956  min_lr: 0.002956  loss: 3.8036 (3.5290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7371)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [92]  [ 800/2502]  eta: 0:07:22  lr: 0.002955  min_lr: 0.002955  loss: 3.8488 (3.5242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8124 (0.7550)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [92]  [1000/2502]  eta: 0:06:29  lr: 0.002954  min_lr: 0.002954  loss: 3.8849 (3.5218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.7571)  time: 0.2575  data: 0.0005  max mem: 18975
Epoch: [92]  [1200/2502]  eta: 0:05:37  lr: 0.002953  min_lr: 0.002953  loss: 3.7724 (3.5314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7006 (0.7545)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [92]  [1400/2502]  eta: 0:04:45  lr: 0.002952  min_lr: 0.002952  loss: 3.7156 (3.5277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7825 (0.7578)  time: 0.2632  data: 0.0004  max mem: 18975
Epoch: [92]  [1600/2502]  eta: 0:03:53  lr: 0.002950  min_lr: 0.002950  loss: 3.5889 (3.5187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8000 (0.7602)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [92]  [1800/2502]  eta: 0:03:01  lr: 0.002949  min_lr: 0.002949  loss: 3.6902 (3.5219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [92]  [2000/2502]  eta: 0:02:09  lr: 0.002948  min_lr: 0.002948  loss: 3.3934 (3.5234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7415 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [92]  [2200/2502]  eta: 0:01:17  lr: 0.002947  min_lr: 0.002947  loss: 3.8139 (3.5141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [92]  [2400/2502]  eta: 0:00:26  lr: 0.002946  min_lr: 0.002946  loss: 3.6510 (3.5219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7448 (inf)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [92]  [2501/2502]  eta: 0:00:00  lr: 0.002945  min_lr: 0.002945  loss: 3.5372 (3.5201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8073 (inf)  time: 0.2317  data: 0.0009  max mem: 18975
Epoch: [92] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.002945  min_lr: 0.002945  loss: 3.5372 (3.4889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8073 (inf)
Test:  [ 0/50]  eta: 0:02:57  loss: 0.6143 (0.6143)  acc1: 90.4000 (90.4000)  acc5: 97.2000 (97.2000)  time: 3.5494  data: 3.4112  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0265 (1.0211)  acc1: 80.4000 (81.1636)  acc5: 96.4000 (95.4545)  time: 0.5392  data: 0.4170  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0265 (1.0319)  acc1: 78.4000 (79.9429)  acc5: 96.0000 (95.4667)  time: 0.2306  data: 0.1106  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2679 (1.1494)  acc1: 74.4000 (77.3161)  acc5: 92.0000 (93.8968)  time: 0.2056  data: 0.0863  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.4919 (1.2317)  acc1: 68.4000 (74.9951)  acc5: 88.8000 (92.8098)  time: 0.1901  data: 0.0711  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4700 (1.2485)  acc1: 68.4000 (74.3200)  acc5: 88.8000 (92.4960)  time: 0.1785  data: 0.0595  max mem: 18975
Test: Total time: 0:00:13 (0.2724 s / it)
* Acc@1 74.300 Acc@5 92.578 loss 1.258
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.33%
Epoch: [93]  [   0/2502]  eta: 1:40:07  lr: 0.002945  min_lr: 0.002945  loss: 4.1058 (4.1058)  weight_decay: 0.0500 (0.0500)  time: 2.4011  data: 2.1303  max mem: 18975
Epoch: [93]  [ 200/2502]  eta: 0:10:19  lr: 0.002944  min_lr: 0.002944  loss: 3.8593 (3.4161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6624 (0.7097)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [93]  [ 400/2502]  eta: 0:09:12  lr: 0.002943  min_lr: 0.002943  loss: 3.2635 (3.4240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8034 (0.7589)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [93]  [ 600/2502]  eta: 0:08:16  lr: 0.002942  min_lr: 0.002942  loss: 3.0134 (3.4498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7666 (0.7634)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [93]  [ 800/2502]  eta: 0:07:22  lr: 0.002941  min_lr: 0.002941  loss: 3.5367 (3.4495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7064 (0.7559)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [93]  [1000/2502]  eta: 0:06:29  lr: 0.002940  min_lr: 0.002940  loss: 3.3211 (3.4396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.7620)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [93]  [1200/2502]  eta: 0:05:37  lr: 0.002938  min_lr: 0.002938  loss: 3.6014 (3.4404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7341 (0.7593)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [93]  [1400/2502]  eta: 0:04:45  lr: 0.002937  min_lr: 0.002937  loss: 3.3403 (3.4507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.7619)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [93]  [1600/2502]  eta: 0:03:53  lr: 0.002936  min_lr: 0.002936  loss: 3.8062 (3.4632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7551 (0.7639)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [93]  [1800/2502]  eta: 0:03:01  lr: 0.002935  min_lr: 0.002935  loss: 2.9966 (3.4681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (0.7660)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [93]  [2000/2502]  eta: 0:02:09  lr: 0.002934  min_lr: 0.002934  loss: 3.9528 (3.4692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7678 (0.7654)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [93]  [2200/2502]  eta: 0:01:17  lr: 0.002933  min_lr: 0.002933  loss: 3.6212 (3.4770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7435 (0.7638)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [93]  [2400/2502]  eta: 0:00:26  lr: 0.002931  min_lr: 0.002931  loss: 3.6749 (3.4840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (nan)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [93]  [2501/2502]  eta: 0:00:00  lr: 0.002931  min_lr: 0.002931  loss: 3.6827 (3.4883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (nan)  time: 0.2303  data: 0.0009  max mem: 18975
Epoch: [93] Total time: 0:10:44 (0.2577 s / it)
Averaged stats: lr: 0.002931  min_lr: 0.002931  loss: 3.6827 (3.4914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (nan)
Test:  [ 0/50]  eta: 0:02:57  loss: 0.7346 (0.7346)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 3.5554  data: 3.4143  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.9492 (1.0192)  acc1: 82.4000 (80.3273)  acc5: 96.4000 (95.5636)  time: 0.4723  data: 0.3501  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9983 (1.0323)  acc1: 78.4000 (79.7143)  acc5: 95.6000 (95.3905)  time: 0.2004  data: 0.0801  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2074 (1.1606)  acc1: 75.6000 (76.9548)  acc5: 93.2000 (93.5484)  time: 0.2734  data: 0.1533  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4623 (1.2384)  acc1: 70.0000 (75.0146)  acc5: 89.6000 (92.7805)  time: 0.2789  data: 0.1590  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4000 (1.2420)  acc1: 69.6000 (74.6160)  acc5: 90.4000 (92.7040)  time: 0.2074  data: 0.0878  max mem: 18975
Test: Total time: 0:00:14 (0.2958 s / it)
* Acc@1 74.482 Acc@5 92.708 loss 1.243
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.48%
Epoch: [94]  [   0/2502]  eta: 1:13:48  lr: 0.002931  min_lr: 0.002931  loss: 4.1562 (4.1562)  weight_decay: 0.0500 (0.0500)  time: 1.7701  data: 1.4897  max mem: 18975
Epoch: [94]  [ 200/2502]  eta: 0:10:16  lr: 0.002930  min_lr: 0.002930  loss: 3.8626 (3.4693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (0.7563)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [94]  [ 400/2502]  eta: 0:09:11  lr: 0.002929  min_lr: 0.002929  loss: 3.7979 (3.4943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8109 (0.7750)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [94]  [ 600/2502]  eta: 0:08:16  lr: 0.002927  min_lr: 0.002927  loss: 3.7006 (3.4775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.7678)  time: 0.2624  data: 0.0004  max mem: 18975
Epoch: [94]  [ 800/2502]  eta: 0:07:22  lr: 0.002926  min_lr: 0.002926  loss: 2.8990 (3.4747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7274 (0.7667)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [94]  [1000/2502]  eta: 0:06:29  lr: 0.002925  min_lr: 0.002925  loss: 3.2389 (3.4664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7471 (0.7747)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [94]  [1200/2502]  eta: 0:05:37  lr: 0.002924  min_lr: 0.002924  loss: 3.5245 (3.4725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7351 (0.7671)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [94]  [1400/2502]  eta: 0:04:44  lr: 0.002923  min_lr: 0.002923  loss: 3.8612 (3.4790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7395 (0.7643)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [94]  [1600/2502]  eta: 0:03:53  lr: 0.002922  min_lr: 0.002922  loss: 3.7073 (3.4848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7440 (0.7644)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [94]  [1800/2502]  eta: 0:03:01  lr: 0.002920  min_lr: 0.002920  loss: 3.6860 (3.4824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7187 (0.7621)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [94]  [2000/2502]  eta: 0:02:09  lr: 0.002919  min_lr: 0.002919  loss: 2.9419 (3.4751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.7613)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [94]  [2200/2502]  eta: 0:01:17  lr: 0.002918  min_lr: 0.002918  loss: 3.7338 (3.4715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7815 (0.7602)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [94]  [2400/2502]  eta: 0:00:26  lr: 0.002917  min_lr: 0.002917  loss: 3.8000 (3.4731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7492 (0.7602)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [94]  [2501/2502]  eta: 0:00:00  lr: 0.002916  min_lr: 0.002916  loss: 3.8332 (3.4719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.7595)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [94] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002916  min_lr: 0.002916  loss: 3.8332 (3.4892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.7595)
Test:  [ 0/50]  eta: 0:03:15  loss: 0.7794 (0.7794)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.9023  data: 3.7637  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.1672 (1.1386)  acc1: 80.4000 (79.5636)  acc5: 95.6000 (95.4182)  time: 0.5645  data: 0.4421  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1387 (1.1416)  acc1: 76.8000 (78.8571)  acc5: 96.0000 (95.6571)  time: 0.2086  data: 0.0886  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2397 (1.2668)  acc1: 75.2000 (76.2839)  acc5: 94.0000 (93.7290)  time: 0.2399  data: 0.1207  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.6056 (1.3449)  acc1: 69.2000 (74.2634)  acc5: 89.2000 (92.8098)  time: 0.2642  data: 0.1452  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5538 (1.3568)  acc1: 69.2000 (73.7920)  acc5: 89.6000 (92.7280)  time: 0.1967  data: 0.0777  max mem: 18975
Test: Total time: 0:00:14 (0.2983 s / it)
* Acc@1 73.880 Acc@5 92.502 loss 1.359
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 74.48%
Epoch: [95]  [   0/2502]  eta: 1:33:11  lr: 0.002916  min_lr: 0.002916  loss: 3.3992 (3.3992)  weight_decay: 0.0500 (0.0500)  time: 2.2350  data: 1.9667  max mem: 18975
Epoch: [95]  [ 200/2502]  eta: 0:10:18  lr: 0.002915  min_lr: 0.002915  loss: 3.5786 (3.4037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (0.7369)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [95]  [ 400/2502]  eta: 0:09:13  lr: 0.002914  min_lr: 0.002914  loss: 3.2982 (3.4032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (0.7326)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [95]  [ 600/2502]  eta: 0:08:18  lr: 0.002913  min_lr: 0.002913  loss: 3.7043 (3.4295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8187 (0.7653)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [95]  [ 800/2502]  eta: 0:07:23  lr: 0.002912  min_lr: 0.002912  loss: 3.5119 (3.4302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7438 (0.7629)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [95]  [1000/2502]  eta: 0:06:30  lr: 0.002910  min_lr: 0.002910  loss: 3.5876 (3.4290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8130 (0.7726)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [95]  [1200/2502]  eta: 0:05:38  lr: 0.002909  min_lr: 0.002909  loss: 3.7602 (3.4348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7331 (0.7746)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [95]  [1400/2502]  eta: 0:04:45  lr: 0.002908  min_lr: 0.002908  loss: 3.8069 (3.4472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7955 (0.7747)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [95]  [1600/2502]  eta: 0:03:53  lr: 0.002907  min_lr: 0.002907  loss: 3.9951 (3.4550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7679 (0.7711)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [95]  [1800/2502]  eta: 0:03:01  lr: 0.002906  min_lr: 0.002906  loss: 2.9576 (3.4502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7939 (0.7711)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [95]  [2000/2502]  eta: 0:02:09  lr: 0.002905  min_lr: 0.002905  loss: 3.2009 (3.4552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.7705)  time: 0.2601  data: 0.0004  max mem: 18975
Epoch: [95]  [2200/2502]  eta: 0:01:18  lr: 0.002903  min_lr: 0.002903  loss: 3.4680 (3.4539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (0.7676)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [95]  [2400/2502]  eta: 0:00:26  lr: 0.002902  min_lr: 0.002902  loss: 3.2505 (3.4560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7283 (0.7636)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [95]  [2501/2502]  eta: 0:00:00  lr: 0.002902  min_lr: 0.002902  loss: 3.3408 (3.4554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7945 (0.7643)  time: 0.2317  data: 0.0012  max mem: 18975
Epoch: [95] Total time: 0:10:46 (0.2586 s / it)
Averaged stats: lr: 0.002902  min_lr: 0.002902  loss: 3.3408 (3.4684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7945 (0.7643)
Test:  [ 0/50]  eta: 0:02:36  loss: 0.7113 (0.7113)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 3.1229  data: 2.9763  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9100 (0.9723)  acc1: 82.4000 (81.1636)  acc5: 96.0000 (95.4909)  time: 0.5228  data: 0.3998  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9373 (0.9745)  acc1: 80.4000 (80.5143)  acc5: 96.0000 (95.8286)  time: 0.2629  data: 0.1430  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1361 (1.1043)  acc1: 76.0000 (77.8710)  acc5: 92.4000 (93.9871)  time: 0.2625  data: 0.1427  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4506 (1.1902)  acc1: 69.6000 (75.6585)  acc5: 89.2000 (92.8878)  time: 0.2304  data: 0.1107  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4383 (1.2109)  acc1: 68.4000 (74.7440)  acc5: 89.6000 (92.7280)  time: 0.2082  data: 0.0891  max mem: 18975
Test: Total time: 0:00:14 (0.2941 s / it)
* Acc@1 74.624 Acc@5 92.816 loss 1.214
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.62%
Epoch: [96]  [   0/2502]  eta: 1:32:24  lr: 0.002902  min_lr: 0.002902  loss: 2.7830 (2.7830)  weight_decay: 0.0500 (0.0500)  time: 2.2160  data: 1.9354  max mem: 18975
Epoch: [96]  [ 200/2502]  eta: 0:10:19  lr: 0.002900  min_lr: 0.002900  loss: 3.8279 (3.4855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7372 (0.7672)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [96]  [ 400/2502]  eta: 0:09:13  lr: 0.002899  min_lr: 0.002899  loss: 3.6055 (3.4804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7941 (0.7780)  time: 0.2637  data: 0.0004  max mem: 18975
Epoch: [96]  [ 600/2502]  eta: 0:08:17  lr: 0.002898  min_lr: 0.002898  loss: 3.6028 (3.4747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7447 (0.7779)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [96]  [ 800/2502]  eta: 0:07:23  lr: 0.002897  min_lr: 0.002897  loss: 3.6256 (3.4802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7926 (0.7742)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [96]  [1000/2502]  eta: 0:06:30  lr: 0.002896  min_lr: 0.002896  loss: 3.5821 (3.4720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8276 (0.7751)  time: 0.2594  data: 0.0005  max mem: 18975
Epoch: [96]  [1200/2502]  eta: 0:05:37  lr: 0.002894  min_lr: 0.002894  loss: 4.0083 (3.4692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.7681)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [96]  [1400/2502]  eta: 0:04:45  lr: 0.002893  min_lr: 0.002893  loss: 3.7957 (3.4614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8136 (0.7668)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [96]  [1600/2502]  eta: 0:03:53  lr: 0.002892  min_lr: 0.002892  loss: 3.4504 (3.4682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7596 (0.7682)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [96]  [1800/2502]  eta: 0:03:01  lr: 0.002891  min_lr: 0.002891  loss: 3.6710 (3.4687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.7653)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [96]  [2000/2502]  eta: 0:02:09  lr: 0.002890  min_lr: 0.002890  loss: 3.1635 (3.4709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7655)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [96]  [2200/2502]  eta: 0:01:18  lr: 0.002888  min_lr: 0.002888  loss: 3.3344 (3.4693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7490 (0.7630)  time: 0.2624  data: 0.0004  max mem: 18975
Epoch: [96]  [2400/2502]  eta: 0:00:26  lr: 0.002887  min_lr: 0.002887  loss: 3.4478 (3.4701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8448 (0.7642)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [96]  [2501/2502]  eta: 0:00:00  lr: 0.002887  min_lr: 0.002887  loss: 3.4716 (3.4757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6688 (0.7622)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [96] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.002887  min_lr: 0.002887  loss: 3.4716 (3.4711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6688 (0.7622)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.6709 (0.6709)  acc1: 92.0000 (92.0000)  acc5: 98.0000 (98.0000)  time: 2.7128  data: 2.5677  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.9403 (0.9916)  acc1: 82.8000 (81.1273)  acc5: 96.8000 (95.6364)  time: 0.4284  data: 0.3064  max mem: 18975
Test:  [20/50]  eta: 0:00:09  loss: 0.9637 (1.0019)  acc1: 79.2000 (80.0000)  acc5: 96.0000 (95.7905)  time: 0.2048  data: 0.0848  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1907 (1.1395)  acc1: 73.2000 (77.1226)  acc5: 92.4000 (93.9484)  time: 0.2357  data: 0.1161  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.4797 (1.2214)  acc1: 68.4000 (74.9463)  acc5: 89.6000 (92.9659)  time: 0.2396  data: 0.1204  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4522 (1.2438)  acc1: 68.4000 (74.4320)  acc5: 89.6000 (92.6720)  time: 0.2128  data: 0.0938  max mem: 18975
Test: Total time: 0:00:13 (0.2733 s / it)
* Acc@1 74.542 Acc@5 92.842 loss 1.241
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.62%
Epoch: [97]  [   0/2502]  eta: 1:21:58  lr: 0.002887  min_lr: 0.002887  loss: 4.2208 (4.2208)  weight_decay: 0.0500 (0.0500)  time: 1.9656  data: 1.5901  max mem: 18975
Epoch: [97]  [ 200/2502]  eta: 0:10:17  lr: 0.002886  min_lr: 0.002886  loss: 3.5486 (3.4295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8167 (0.7752)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [97]  [ 400/2502]  eta: 0:09:11  lr: 0.002884  min_lr: 0.002884  loss: 3.6956 (3.4392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7638 (0.7743)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [97]  [ 600/2502]  eta: 0:08:16  lr: 0.002883  min_lr: 0.002883  loss: 3.8237 (3.4616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (0.7743)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [97]  [ 800/2502]  eta: 0:07:22  lr: 0.002882  min_lr: 0.002882  loss: 3.4552 (3.4520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7750)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [97]  [1000/2502]  eta: 0:06:29  lr: 0.002881  min_lr: 0.002881  loss: 3.6599 (3.4709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (0.7718)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [97]  [1200/2502]  eta: 0:05:37  lr: 0.002880  min_lr: 0.002880  loss: 3.7090 (3.4720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7543 (0.7709)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [97]  [1400/2502]  eta: 0:04:45  lr: 0.002878  min_lr: 0.002878  loss: 3.2387 (3.4732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7205 (0.7719)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [97]  [1600/2502]  eta: 0:03:53  lr: 0.002877  min_lr: 0.002877  loss: 3.0765 (3.4725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6957 (0.7688)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [97]  [1800/2502]  eta: 0:03:01  lr: 0.002876  min_lr: 0.002876  loss: 3.8280 (3.4771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.7677)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [97]  [2000/2502]  eta: 0:02:09  lr: 0.002875  min_lr: 0.002875  loss: 3.6949 (3.4767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8352 (0.7702)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [97]  [2200/2502]  eta: 0:01:17  lr: 0.002874  min_lr: 0.002874  loss: 3.8325 (3.4840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (0.7719)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [97]  [2400/2502]  eta: 0:00:26  lr: 0.002872  min_lr: 0.002872  loss: 3.2445 (3.4823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.7714)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [97]  [2501/2502]  eta: 0:00:00  lr: 0.002872  min_lr: 0.002872  loss: 3.5139 (3.4818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.7706)  time: 0.2307  data: 0.0008  max mem: 18975
Epoch: [97] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.002872  min_lr: 0.002872  loss: 3.5139 (3.4690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.7706)
Test:  [ 0/50]  eta: 0:02:53  loss: 0.7867 (0.7867)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 3.4727  data: 3.3283  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.9519 (1.0277)  acc1: 82.4000 (81.2000)  acc5: 95.2000 (95.4909)  time: 0.5602  data: 0.4373  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0158 (1.0387)  acc1: 79.6000 (80.0952)  acc5: 96.0000 (95.6571)  time: 0.2509  data: 0.1307  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1889 (1.1509)  acc1: 74.0000 (77.3419)  acc5: 92.8000 (93.9097)  time: 0.2390  data: 0.1189  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4728 (1.2316)  acc1: 69.6000 (75.2488)  acc5: 89.6000 (93.0634)  time: 0.2382  data: 0.1182  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4803 (1.2458)  acc1: 69.2000 (74.7280)  acc5: 90.0000 (92.8480)  time: 0.2371  data: 0.1180  max mem: 18975
Test: Total time: 0:00:14 (0.2885 s / it)
* Acc@1 74.832 Acc@5 92.794 loss 1.244
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.83%
Epoch: [98]  [   0/2502]  eta: 1:34:09  lr: 0.002872  min_lr: 0.002872  loss: 4.3316 (4.3316)  weight_decay: 0.0500 (0.0500)  time: 2.2580  data: 1.9798  max mem: 18975
Epoch: [98]  [ 200/2502]  eta: 0:10:19  lr: 0.002871  min_lr: 0.002871  loss: 3.7297 (3.4731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7680 (0.7741)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [98]  [ 400/2502]  eta: 0:09:12  lr: 0.002869  min_lr: 0.002869  loss: 3.5218 (3.4749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7622 (inf)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [98]  [ 600/2502]  eta: 0:08:17  lr: 0.002868  min_lr: 0.002868  loss: 3.6424 (3.4798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (inf)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [98]  [ 800/2502]  eta: 0:07:23  lr: 0.002867  min_lr: 0.002867  loss: 3.6142 (3.4724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (inf)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [98]  [1000/2502]  eta: 0:06:29  lr: 0.002866  min_lr: 0.002866  loss: 3.3256 (3.4653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8082 (inf)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [98]  [1200/2502]  eta: 0:05:37  lr: 0.002864  min_lr: 0.002864  loss: 3.8877 (3.4607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7215 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [98]  [1400/2502]  eta: 0:04:45  lr: 0.002863  min_lr: 0.002863  loss: 3.4592 (3.4596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7175 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [98]  [1600/2502]  eta: 0:03:53  lr: 0.002862  min_lr: 0.002862  loss: 3.6108 (3.4625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6683 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [98]  [1800/2502]  eta: 0:03:01  lr: 0.002861  min_lr: 0.002861  loss: 3.3182 (3.4543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7778 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [98]  [2000/2502]  eta: 0:02:09  lr: 0.002860  min_lr: 0.002860  loss: 3.7160 (3.4529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [98]  [2200/2502]  eta: 0:01:17  lr: 0.002858  min_lr: 0.002858  loss: 3.4946 (3.4540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (inf)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [98]  [2400/2502]  eta: 0:00:26  lr: 0.002857  min_lr: 0.002857  loss: 3.5788 (3.4537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7639 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [98]  [2501/2502]  eta: 0:00:00  lr: 0.002857  min_lr: 0.002857  loss: 3.5956 (3.4615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7443 (inf)  time: 0.2304  data: 0.0009  max mem: 18975
Epoch: [98] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002857  min_lr: 0.002857  loss: 3.5956 (3.4681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7443 (inf)
Test:  [ 0/50]  eta: 0:03:19  loss: 0.6374 (0.6374)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 3.9980  data: 3.8595  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.0218 (1.0501)  acc1: 80.8000 (80.6545)  acc5: 94.8000 (95.6000)  time: 0.6009  data: 0.4786  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.0254 (1.0531)  acc1: 78.8000 (79.9429)  acc5: 96.4000 (96.1143)  time: 0.2274  data: 0.1072  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2052 (1.1840)  acc1: 74.0000 (77.1613)  acc5: 94.0000 (93.9097)  time: 0.2041  data: 0.0846  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5073 (1.2601)  acc1: 69.6000 (75.0732)  acc5: 89.2000 (93.0439)  time: 0.2276  data: 0.1085  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4939 (1.2718)  acc1: 69.6000 (74.5440)  acc5: 90.4000 (92.9440)  time: 0.2060  data: 0.0871  max mem: 18975
Test: Total time: 0:00:14 (0.2960 s / it)
* Acc@1 74.492 Acc@5 92.792 loss 1.276
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.83%
Epoch: [99]  [   0/2502]  eta: 1:28:17  lr: 0.002857  min_lr: 0.002857  loss: 2.3112 (2.3112)  weight_decay: 0.0500 (0.0500)  time: 2.1174  data: 1.7125  max mem: 18975
Epoch: [99]  [ 200/2502]  eta: 0:10:17  lr: 0.002855  min_lr: 0.002855  loss: 3.3108 (3.4423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (0.7613)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [99]  [ 400/2502]  eta: 0:09:13  lr: 0.002854  min_lr: 0.002854  loss: 3.8180 (3.4744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7345 (0.7500)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [99]  [ 600/2502]  eta: 0:08:16  lr: 0.002853  min_lr: 0.002853  loss: 3.3785 (3.4725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7597 (0.7618)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [99]  [ 800/2502]  eta: 0:07:22  lr: 0.002852  min_lr: 0.002852  loss: 2.9161 (3.4634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7615)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [99]  [1000/2502]  eta: 0:06:29  lr: 0.002850  min_lr: 0.002850  loss: 3.9206 (3.4697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6760 (0.7525)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [99]  [1200/2502]  eta: 0:05:37  lr: 0.002849  min_lr: 0.002849  loss: 3.3278 (3.4705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7702 (0.7614)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [99]  [1400/2502]  eta: 0:04:44  lr: 0.002848  min_lr: 0.002848  loss: 3.2860 (3.4678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7166 (0.7571)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [99]  [1600/2502]  eta: 0:03:52  lr: 0.002847  min_lr: 0.002847  loss: 3.4549 (3.4692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.7584)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [99]  [1800/2502]  eta: 0:03:01  lr: 0.002846  min_lr: 0.002846  loss: 2.9803 (3.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7581)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [99]  [2000/2502]  eta: 0:02:09  lr: 0.002844  min_lr: 0.002844  loss: 3.8788 (3.4638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7561 (0.7606)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [99]  [2200/2502]  eta: 0:01:17  lr: 0.002843  min_lr: 0.002843  loss: 3.5345 (3.4621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6830 (0.7578)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [99]  [2400/2502]  eta: 0:00:26  lr: 0.002842  min_lr: 0.002842  loss: 2.9115 (3.4538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (0.7592)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [99]  [2501/2502]  eta: 0:00:00  lr: 0.002841  min_lr: 0.002841  loss: 3.5063 (3.4562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7378 (0.7583)  time: 0.2314  data: 0.0013  max mem: 18975
Epoch: [99] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002841  min_lr: 0.002841  loss: 3.5063 (3.4674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7378 (0.7583)
Test:  [ 0/50]  eta: 0:02:54  loss: 0.5442 (0.5442)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 3.4920  data: 3.3536  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9794 (0.9604)  acc1: 80.4000 (80.7636)  acc5: 96.4000 (96.2182)  time: 0.5471  data: 0.4260  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9726 (0.9740)  acc1: 78.4000 (79.6762)  acc5: 96.0000 (96.2667)  time: 0.2347  data: 0.1153  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1861 (1.1005)  acc1: 74.4000 (77.1613)  acc5: 93.2000 (94.2968)  time: 0.2340  data: 0.1144  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4324 (1.1887)  acc1: 70.0000 (75.1707)  acc5: 89.2000 (93.1220)  time: 0.2367  data: 0.1169  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4324 (1.2037)  acc1: 69.2000 (74.5760)  acc5: 89.6000 (92.8560)  time: 0.2371  data: 0.1177  max mem: 18975
Test: Total time: 0:00:14 (0.2822 s / it)
* Acc@1 74.648 Acc@5 92.658 loss 1.212
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.83%
Epoch: [100]  [   0/2502]  eta: 1:23:23  lr: 0.002841  min_lr: 0.002841  loss: 4.0819 (4.0819)  weight_decay: 0.0500 (0.0500)  time: 1.9997  data: 1.4311  max mem: 18975
Epoch: [100]  [ 200/2502]  eta: 0:10:17  lr: 0.002840  min_lr: 0.002840  loss: 3.1660 (3.4345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7451 (0.7858)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [100]  [ 400/2502]  eta: 0:09:12  lr: 0.002839  min_lr: 0.002839  loss: 3.5577 (3.3955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7563 (0.7755)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [100]  [ 600/2502]  eta: 0:08:15  lr: 0.002838  min_lr: 0.002838  loss: 4.0115 (3.4336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7760 (0.7823)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [100]  [ 800/2502]  eta: 0:07:22  lr: 0.002836  min_lr: 0.002836  loss: 3.2006 (3.4321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.7912)  time: 0.2576  data: 0.0005  max mem: 18975
Epoch: [100]  [1000/2502]  eta: 0:06:29  lr: 0.002835  min_lr: 0.002835  loss: 3.6168 (3.4200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7091 (0.7819)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [100]  [1200/2502]  eta: 0:05:37  lr: 0.002834  min_lr: 0.002834  loss: 3.3603 (3.4236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.7733)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [100]  [1400/2502]  eta: 0:04:45  lr: 0.002833  min_lr: 0.002833  loss: 3.7041 (3.4254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7242 (0.7751)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [100]  [1600/2502]  eta: 0:03:53  lr: 0.002831  min_lr: 0.002831  loss: 4.0193 (3.4327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7403 (0.7743)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [100]  [1800/2502]  eta: 0:03:01  lr: 0.002830  min_lr: 0.002830  loss: 3.4614 (3.4422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.7739)  time: 0.2646  data: 0.0004  max mem: 18975
Epoch: [100]  [2000/2502]  eta: 0:02:09  lr: 0.002829  min_lr: 0.002829  loss: 2.9962 (3.4399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7463 (0.7708)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [100]  [2200/2502]  eta: 0:01:18  lr: 0.002828  min_lr: 0.002828  loss: 3.5339 (3.4497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7922 (0.7728)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [100]  [2400/2502]  eta: 0:00:26  lr: 0.002826  min_lr: 0.002826  loss: 3.1236 (3.4516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7091 (0.7706)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [100]  [2501/2502]  eta: 0:00:00  lr: 0.002826  min_lr: 0.002826  loss: 3.3731 (3.4528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7039 (0.7686)  time: 0.2367  data: 0.0009  max mem: 18975
Epoch: [100] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.002826  min_lr: 0.002826  loss: 3.3731 (3.4540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7039 (0.7686)
Test:  [ 0/50]  eta: 0:03:13  loss: 0.5964 (0.5964)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 3.8772  data: 3.7354  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9387 (0.9733)  acc1: 82.4000 (81.3455)  acc5: 96.4000 (95.7818)  time: 0.5197  data: 0.3970  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9799 (0.9913)  acc1: 78.0000 (80.1143)  acc5: 96.0000 (95.9619)  time: 0.1986  data: 0.0786  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1842 (1.1326)  acc1: 74.0000 (77.5226)  acc5: 93.6000 (94.1936)  time: 0.2166  data: 0.0947  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4869 (1.2170)  acc1: 70.8000 (75.6000)  acc5: 89.6000 (93.1512)  time: 0.2226  data: 0.1008  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4748 (1.2361)  acc1: 70.8000 (74.9920)  acc5: 89.2000 (92.8640)  time: 0.1990  data: 0.0800  max mem: 18975
Test: Total time: 0:00:13 (0.2796 s / it)
* Acc@1 74.606 Acc@5 92.830 loss 1.239
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.83%
Epoch: [101]  [   0/2502]  eta: 1:34:30  lr: 0.002826  min_lr: 0.002826  loss: 2.0804 (2.0804)  weight_decay: 0.0500 (0.0500)  time: 2.2666  data: 1.5047  max mem: 18975
Epoch: [101]  [ 200/2502]  eta: 0:10:19  lr: 0.002825  min_lr: 0.002825  loss: 3.8311 (3.4502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.7598)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [101]  [ 400/2502]  eta: 0:09:13  lr: 0.002823  min_lr: 0.002823  loss: 3.5005 (3.4601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8191 (0.7706)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [101]  [ 600/2502]  eta: 0:08:16  lr: 0.002822  min_lr: 0.002822  loss: 3.5029 (3.4587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7303 (0.7749)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [101]  [ 800/2502]  eta: 0:07:23  lr: 0.002821  min_lr: 0.002821  loss: 3.2346 (3.4481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7102 (0.7777)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [101]  [1000/2502]  eta: 0:06:30  lr: 0.002820  min_lr: 0.002820  loss: 3.7439 (3.4459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7044 (0.7780)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [101]  [1200/2502]  eta: 0:05:37  lr: 0.002818  min_lr: 0.002818  loss: 3.1467 (3.4376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7698 (0.7792)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [101]  [1400/2502]  eta: 0:04:45  lr: 0.002817  min_lr: 0.002817  loss: 3.5296 (3.4305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6891 (0.7758)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [101]  [1600/2502]  eta: 0:03:53  lr: 0.002816  min_lr: 0.002816  loss: 3.0922 (3.4298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7422 (0.7747)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [101]  [1800/2502]  eta: 0:03:01  lr: 0.002815  min_lr: 0.002815  loss: 3.2488 (3.4338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7420 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [101]  [2000/2502]  eta: 0:02:09  lr: 0.002813  min_lr: 0.002813  loss: 3.7590 (3.4430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8119 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [101]  [2200/2502]  eta: 0:01:17  lr: 0.002812  min_lr: 0.002812  loss: 3.6462 (3.4375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8680 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [101]  [2400/2502]  eta: 0:00:26  lr: 0.002811  min_lr: 0.002811  loss: 3.1296 (3.4435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7358 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [101]  [2501/2502]  eta: 0:00:00  lr: 0.002810  min_lr: 0.002810  loss: 3.4272 (3.4462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (inf)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [101] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002810  min_lr: 0.002810  loss: 3.4272 (3.4523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (inf)
Test:  [ 0/50]  eta: 0:02:41  loss: 0.8244 (0.8244)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 3.2339  data: 3.0899  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0337 (1.0977)  acc1: 80.8000 (80.7273)  acc5: 94.8000 (95.3455)  time: 0.5346  data: 0.4120  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0337 (1.0841)  acc1: 78.8000 (79.6952)  acc5: 95.6000 (95.7143)  time: 0.2469  data: 0.1257  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2253 (1.2009)  acc1: 73.6000 (77.0968)  acc5: 92.8000 (93.9355)  time: 0.2422  data: 0.1170  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4977 (1.2717)  acc1: 70.0000 (75.3561)  acc5: 89.6000 (92.8878)  time: 0.2424  data: 0.1188  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4796 (1.2938)  acc1: 70.0000 (74.6080)  acc5: 89.6000 (92.5120)  time: 0.2079  data: 0.0892  max mem: 18975
Test: Total time: 0:00:14 (0.2913 s / it)
* Acc@1 74.816 Acc@5 92.766 loss 1.291
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.83%
Epoch: [102]  [   0/2502]  eta: 1:26:52  lr: 0.002810  min_lr: 0.002810  loss: 2.8481 (2.8481)  weight_decay: 0.0500 (0.0500)  time: 2.0832  data: 1.4854  max mem: 18975
Epoch: [102]  [ 200/2502]  eta: 0:10:17  lr: 0.002809  min_lr: 0.002809  loss: 3.7956 (3.4040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7276 (0.7583)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [102]  [ 400/2502]  eta: 0:09:12  lr: 0.002808  min_lr: 0.002808  loss: 3.7450 (3.3862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.7768)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [102]  [ 600/2502]  eta: 0:08:16  lr: 0.002807  min_lr: 0.002807  loss: 3.3868 (3.4011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7397 (0.7692)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [102]  [ 800/2502]  eta: 0:07:22  lr: 0.002805  min_lr: 0.002805  loss: 3.6211 (3.3982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7081 (0.7657)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [102]  [1000/2502]  eta: 0:06:29  lr: 0.002804  min_lr: 0.002804  loss: 3.7511 (3.4141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.7647)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [102]  [1200/2502]  eta: 0:05:37  lr: 0.002803  min_lr: 0.002803  loss: 3.7258 (3.4179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7667 (0.7662)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [102]  [1400/2502]  eta: 0:04:45  lr: 0.002802  min_lr: 0.002802  loss: 3.3069 (3.4232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7826 (0.7685)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [102]  [1600/2502]  eta: 0:03:53  lr: 0.002800  min_lr: 0.002800  loss: 3.8218 (3.4222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7879 (0.7742)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [102]  [1800/2502]  eta: 0:03:01  lr: 0.002799  min_lr: 0.002799  loss: 3.4152 (3.4298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7192 (0.7705)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [102]  [2000/2502]  eta: 0:02:09  lr: 0.002798  min_lr: 0.002798  loss: 3.9306 (3.4343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7465 (0.7718)  time: 0.2611  data: 0.0003  max mem: 18975
Epoch: [102]  [2200/2502]  eta: 0:01:18  lr: 0.002797  min_lr: 0.002797  loss: 3.0431 (3.4403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7837 (0.7736)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [102]  [2400/2502]  eta: 0:00:26  lr: 0.002795  min_lr: 0.002795  loss: 3.4700 (3.4406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.7697)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [102]  [2501/2502]  eta: 0:00:00  lr: 0.002795  min_lr: 0.002795  loss: 3.3926 (3.4403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.7709)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [102] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002795  min_lr: 0.002795  loss: 3.3926 (3.4630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.7709)
Test:  [ 0/50]  eta: 0:02:16  loss: 0.7569 (0.7569)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 2.7270  data: 2.5843  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0485 (1.0227)  acc1: 80.4000 (80.3273)  acc5: 96.0000 (95.8545)  time: 0.4686  data: 0.3451  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0607 (1.0544)  acc1: 78.0000 (79.7143)  acc5: 96.0000 (95.8476)  time: 0.2438  data: 0.1226  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2065 (1.1700)  acc1: 74.0000 (77.0968)  acc5: 92.0000 (94.0645)  time: 0.2681  data: 0.1465  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5030 (1.2511)  acc1: 69.6000 (75.1024)  acc5: 89.2000 (92.9268)  time: 0.2834  data: 0.1626  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4736 (1.2660)  acc1: 69.6000 (74.6400)  acc5: 89.6000 (92.7920)  time: 0.2204  data: 0.1010  max mem: 18975
Test: Total time: 0:00:14 (0.2979 s / it)
* Acc@1 74.828 Acc@5 92.812 loss 1.265
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.83%
Epoch: [103]  [   0/2502]  eta: 1:26:11  lr: 0.002795  min_lr: 0.002795  loss: 2.3572 (2.3572)  weight_decay: 0.0500 (0.0500)  time: 2.0670  data: 1.6204  max mem: 18975
Epoch: [103]  [ 200/2502]  eta: 0:10:17  lr: 0.002793  min_lr: 0.002793  loss: 3.5092 (3.4620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6942 (0.7609)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [103]  [ 400/2502]  eta: 0:09:11  lr: 0.002792  min_lr: 0.002792  loss: 3.0744 (3.4845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7430 (0.7611)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [103]  [ 600/2502]  eta: 0:08:15  lr: 0.002791  min_lr: 0.002791  loss: 3.0879 (3.4548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.7740)  time: 0.2570  data: 0.0005  max mem: 18975
Epoch: [103]  [ 800/2502]  eta: 0:07:22  lr: 0.002790  min_lr: 0.002790  loss: 3.3935 (3.4377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8165 (0.7748)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [103]  [1000/2502]  eta: 0:06:29  lr: 0.002788  min_lr: 0.002788  loss: 3.3236 (3.4608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6917 (0.7711)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [103]  [1200/2502]  eta: 0:05:36  lr: 0.002787  min_lr: 0.002787  loss: 3.5269 (3.4716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7980 (0.7751)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [103]  [1400/2502]  eta: 0:04:44  lr: 0.002786  min_lr: 0.002786  loss: 2.6768 (3.4614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7932 (0.7741)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [103]  [1600/2502]  eta: 0:03:53  lr: 0.002785  min_lr: 0.002785  loss: 2.9341 (3.4542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7915 (0.7753)  time: 0.2634  data: 0.0004  max mem: 18975
Epoch: [103]  [1800/2502]  eta: 0:03:01  lr: 0.002783  min_lr: 0.002783  loss: 3.2722 (3.4562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (0.7761)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [103]  [2000/2502]  eta: 0:02:09  lr: 0.002782  min_lr: 0.002782  loss: 3.8413 (3.4523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7613 (0.7737)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [103]  [2200/2502]  eta: 0:01:17  lr: 0.002781  min_lr: 0.002781  loss: 3.4430 (3.4448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6993 (0.7734)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [103]  [2400/2502]  eta: 0:00:26  lr: 0.002779  min_lr: 0.002779  loss: 3.5755 (3.4418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6937 (0.7721)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [103]  [2501/2502]  eta: 0:00:00  lr: 0.002779  min_lr: 0.002779  loss: 3.3354 (3.4423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (0.7709)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [103] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.002779  min_lr: 0.002779  loss: 3.3354 (3.4485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7226 (0.7709)
Test:  [ 0/50]  eta: 0:02:21  loss: 0.6693 (0.6693)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 2.8285  data: 2.6769  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9972 (0.9975)  acc1: 81.2000 (80.6182)  acc5: 96.0000 (95.6727)  time: 0.5236  data: 0.3971  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9792 (1.0070)  acc1: 79.2000 (79.7714)  acc5: 96.0000 (95.6571)  time: 0.2610  data: 0.1392  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1644 (1.1276)  acc1: 75.2000 (76.9806)  acc5: 92.4000 (93.9871)  time: 0.2434  data: 0.1232  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4335 (1.2042)  acc1: 69.6000 (75.1805)  acc5: 89.6000 (93.0634)  time: 0.2473  data: 0.1275  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4335 (1.2165)  acc1: 69.6000 (74.5680)  acc5: 90.4000 (92.9920)  time: 0.2233  data: 0.1043  max mem: 18975
Test: Total time: 0:00:14 (0.2890 s / it)
* Acc@1 74.706 Acc@5 92.924 loss 1.217
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.83%
Epoch: [104]  [   0/2502]  eta: 1:30:21  lr: 0.002779  min_lr: 0.002779  loss: 3.7326 (3.7326)  weight_decay: 0.0500 (0.0500)  time: 2.1668  data: 1.8755  max mem: 18975
Epoch: [104]  [ 200/2502]  eta: 0:10:19  lr: 0.002778  min_lr: 0.002778  loss: 3.3771 (3.4425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6734 (0.7484)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [104]  [ 400/2502]  eta: 0:09:12  lr: 0.002776  min_lr: 0.002776  loss: 3.0447 (3.4159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7635 (0.7565)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [104]  [ 600/2502]  eta: 0:08:17  lr: 0.002775  min_lr: 0.002775  loss: 3.0117 (3.4174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7692 (0.7650)  time: 0.2559  data: 0.0005  max mem: 18975
Epoch: [104]  [ 800/2502]  eta: 0:07:23  lr: 0.002774  min_lr: 0.002774  loss: 3.5501 (3.4264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7522 (0.7642)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [104]  [1000/2502]  eta: 0:06:30  lr: 0.002772  min_lr: 0.002772  loss: 3.3979 (3.4224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.7619)  time: 0.2586  data: 0.0004  max mem: 18975
Epoch: [104]  [1200/2502]  eta: 0:05:37  lr: 0.002771  min_lr: 0.002771  loss: 3.6602 (3.4169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7968 (0.7663)  time: 0.2574  data: 0.0003  max mem: 18975
Epoch: [104]  [1400/2502]  eta: 0:04:45  lr: 0.002770  min_lr: 0.002770  loss: 3.2313 (3.4222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7198 (0.7664)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [104]  [1600/2502]  eta: 0:03:53  lr: 0.002769  min_lr: 0.002769  loss: 3.4717 (3.4357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (0.7698)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [104]  [1800/2502]  eta: 0:03:01  lr: 0.002767  min_lr: 0.002767  loss: 3.4198 (3.4351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8009 (0.7746)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [104]  [2000/2502]  eta: 0:02:09  lr: 0.002766  min_lr: 0.002766  loss: 3.9514 (3.4374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.7744)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [104]  [2200/2502]  eta: 0:01:17  lr: 0.002765  min_lr: 0.002765  loss: 3.6239 (3.4417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7632 (0.7726)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [104]  [2400/2502]  eta: 0:00:26  lr: 0.002764  min_lr: 0.002764  loss: 3.0358 (3.4395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.7708)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [104]  [2501/2502]  eta: 0:00:00  lr: 0.002763  min_lr: 0.002763  loss: 3.3845 (3.4389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8131 (0.7734)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [104] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002763  min_lr: 0.002763  loss: 3.3845 (3.4414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8131 (0.7734)
Test:  [ 0/50]  eta: 0:02:28  loss: 0.6089 (0.6089)  acc1: 91.6000 (91.6000)  acc5: 98.8000 (98.8000)  time: 2.9799  data: 2.8369  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9690 (0.9578)  acc1: 79.6000 (81.4545)  acc5: 96.4000 (95.7091)  time: 0.4814  data: 0.3590  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9690 (0.9843)  acc1: 78.4000 (79.8667)  acc5: 96.0000 (95.5619)  time: 0.2608  data: 0.1393  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1778 (1.1150)  acc1: 74.0000 (77.0710)  acc5: 92.0000 (93.8452)  time: 0.2762  data: 0.1544  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4503 (1.1989)  acc1: 69.6000 (75.2000)  acc5: 89.6000 (92.8976)  time: 0.2391  data: 0.1192  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4464 (1.2198)  acc1: 70.4000 (74.6160)  acc5: 90.0000 (92.6400)  time: 0.1952  data: 0.0762  max mem: 18975
Test: Total time: 0:00:14 (0.2946 s / it)
* Acc@1 74.782 Acc@5 92.714 loss 1.212
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.83%
Epoch: [105]  [   0/2502]  eta: 1:30:33  lr: 0.002763  min_lr: 0.002763  loss: 4.0193 (4.0193)  weight_decay: 0.0500 (0.0500)  time: 2.1716  data: 1.6004  max mem: 18975
Epoch: [105]  [ 200/2502]  eta: 0:10:21  lr: 0.002762  min_lr: 0.002762  loss: 3.5928 (3.3835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.7627)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [105]  [ 400/2502]  eta: 0:09:13  lr: 0.002760  min_lr: 0.002760  loss: 3.4960 (3.4155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8571 (0.7765)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [105]  [ 600/2502]  eta: 0:08:16  lr: 0.002759  min_lr: 0.002759  loss: 3.0656 (3.4261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.7768)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [105]  [ 800/2502]  eta: 0:07:22  lr: 0.002758  min_lr: 0.002758  loss: 3.9367 (3.4427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7625 (0.7774)  time: 0.2572  data: 0.0005  max mem: 18975
Epoch: [105]  [1000/2502]  eta: 0:06:29  lr: 0.002756  min_lr: 0.002756  loss: 3.9414 (3.4371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7767 (0.7770)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [105]  [1200/2502]  eta: 0:05:37  lr: 0.002755  min_lr: 0.002755  loss: 3.1486 (3.4342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7174 (0.7766)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [105]  [1400/2502]  eta: 0:04:45  lr: 0.002754  min_lr: 0.002754  loss: 3.3063 (3.4324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7496 (0.7764)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [105]  [1600/2502]  eta: 0:03:53  lr: 0.002753  min_lr: 0.002753  loss: 3.5660 (3.4383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (0.7749)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [105]  [1800/2502]  eta: 0:03:01  lr: 0.002751  min_lr: 0.002751  loss: 3.6621 (3.4395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7933 (0.7795)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [105]  [2000/2502]  eta: 0:02:09  lr: 0.002750  min_lr: 0.002750  loss: 2.9631 (3.4440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8169 (0.7821)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [105]  [2200/2502]  eta: 0:01:18  lr: 0.002749  min_lr: 0.002749  loss: 3.5744 (3.4497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.7790)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [105]  [2400/2502]  eta: 0:00:26  lr: 0.002747  min_lr: 0.002747  loss: 3.6731 (3.4450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.7819)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [105]  [2501/2502]  eta: 0:00:00  lr: 0.002747  min_lr: 0.002747  loss: 2.8733 (3.4449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (0.7797)  time: 0.2307  data: 0.0008  max mem: 18975
Epoch: [105] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002747  min_lr: 0.002747  loss: 2.8733 (3.4472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (0.7797)
Test:  [ 0/50]  eta: 0:02:26  loss: 0.7716 (0.7716)  acc1: 90.0000 (90.0000)  acc5: 96.4000 (96.4000)  time: 2.9218  data: 2.7773  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.9312 (0.9644)  acc1: 81.6000 (80.5455)  acc5: 96.4000 (95.3455)  time: 0.4558  data: 0.3337  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9611 (0.9640)  acc1: 77.6000 (79.8857)  acc5: 96.4000 (95.9048)  time: 0.2465  data: 0.1244  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0919 (1.1028)  acc1: 74.4000 (77.0839)  acc5: 92.8000 (94.0516)  time: 0.2744  data: 0.1528  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4318 (1.1838)  acc1: 71.2000 (75.2976)  acc5: 89.6000 (93.0537)  time: 0.2337  data: 0.1148  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4023 (1.1993)  acc1: 69.2000 (74.7520)  acc5: 89.6000 (92.9440)  time: 0.1962  data: 0.0774  max mem: 18975
Test: Total time: 0:00:14 (0.2885 s / it)
* Acc@1 74.642 Acc@5 92.938 loss 1.201
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.83%
Epoch: [106]  [   0/2502]  eta: 1:28:07  lr: 0.002747  min_lr: 0.002747  loss: 3.9557 (3.9557)  weight_decay: 0.0500 (0.0500)  time: 2.1134  data: 1.8417  max mem: 18975
Epoch: [106]  [ 200/2502]  eta: 0:10:17  lr: 0.002746  min_lr: 0.002746  loss: 3.5821 (3.4726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.7886)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [106]  [ 400/2502]  eta: 0:09:12  lr: 0.002744  min_lr: 0.002744  loss: 3.1492 (3.4400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8130 (0.7807)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [106]  [ 600/2502]  eta: 0:08:15  lr: 0.002743  min_lr: 0.002743  loss: 3.8299 (3.4206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8492 (0.7823)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [106]  [ 800/2502]  eta: 0:07:22  lr: 0.002742  min_lr: 0.002742  loss: 3.0468 (3.4179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7678 (0.7823)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [106]  [1000/2502]  eta: 0:06:29  lr: 0.002740  min_lr: 0.002740  loss: 3.0414 (3.4167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8503 (0.7892)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [106]  [1200/2502]  eta: 0:05:37  lr: 0.002739  min_lr: 0.002739  loss: 3.0779 (3.4073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8221 (0.7904)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [106]  [1400/2502]  eta: 0:04:45  lr: 0.002738  min_lr: 0.002738  loss: 3.7717 (3.4071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.7848)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [106]  [1600/2502]  eta: 0:03:53  lr: 0.002736  min_lr: 0.002736  loss: 3.6229 (3.4141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.7854)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [106]  [1800/2502]  eta: 0:03:01  lr: 0.002735  min_lr: 0.002735  loss: 3.9760 (3.4233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.7849)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [106]  [2000/2502]  eta: 0:02:09  lr: 0.002734  min_lr: 0.002734  loss: 3.6081 (3.4296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.7896)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [106]  [2200/2502]  eta: 0:01:17  lr: 0.002733  min_lr: 0.002733  loss: 3.2260 (3.4237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.7856)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [106]  [2400/2502]  eta: 0:00:26  lr: 0.002731  min_lr: 0.002731  loss: 3.2067 (3.4245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.7869)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [106]  [2501/2502]  eta: 0:00:00  lr: 0.002731  min_lr: 0.002731  loss: 3.8724 (3.4280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.7848)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [106] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.002731  min_lr: 0.002731  loss: 3.8724 (3.4418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.7848)
Test:  [ 0/50]  eta: 0:02:12  loss: 0.6590 (0.6590)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 2.6572  data: 2.5185  max mem: 18975
Test:  [10/50]  eta: 0:00:16  loss: 0.9465 (0.9587)  acc1: 82.0000 (80.5818)  acc5: 97.2000 (95.6364)  time: 0.4129  data: 0.2910  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9465 (0.9701)  acc1: 80.0000 (80.0762)  acc5: 96.4000 (96.0191)  time: 0.2496  data: 0.1298  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1374 (1.1009)  acc1: 74.4000 (77.2774)  acc5: 93.2000 (94.3355)  time: 0.3118  data: 0.1917  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4302 (1.1915)  acc1: 69.2000 (75.1707)  acc5: 90.0000 (93.2000)  time: 0.3019  data: 0.1820  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4302 (1.2060)  acc1: 68.8000 (74.6320)  acc5: 90.4000 (93.0960)  time: 0.2391  data: 0.1204  max mem: 18975
Test: Total time: 0:00:15 (0.3111 s / it)
* Acc@1 74.928 Acc@5 92.992 loss 1.209
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.93%
Epoch: [107]  [   0/2502]  eta: 1:24:44  lr: 0.002731  min_lr: 0.002731  loss: 4.5745 (4.5745)  weight_decay: 0.0500 (0.0500)  time: 2.0323  data: 1.7520  max mem: 18975
Epoch: [107]  [ 200/2502]  eta: 0:10:17  lr: 0.002729  min_lr: 0.002729  loss: 3.0791 (3.3708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7596 (nan)  time: 0.2633  data: 0.0004  max mem: 18975
Epoch: [107]  [ 400/2502]  eta: 0:09:11  lr: 0.002728  min_lr: 0.002728  loss: 3.2472 (3.3950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7825 (nan)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [107]  [ 600/2502]  eta: 0:08:15  lr: 0.002727  min_lr: 0.002727  loss: 3.3967 (3.4232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (nan)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [107]  [ 800/2502]  eta: 0:07:22  lr: 0.002725  min_lr: 0.002725  loss: 3.9052 (3.4305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (nan)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [107]  [1000/2502]  eta: 0:06:29  lr: 0.002724  min_lr: 0.002724  loss: 3.6664 (3.4185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [107]  [1200/2502]  eta: 0:05:37  lr: 0.002723  min_lr: 0.002723  loss: 2.9637 (3.4154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8335 (nan)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [107]  [1400/2502]  eta: 0:04:45  lr: 0.002721  min_lr: 0.002721  loss: 3.4637 (3.4242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7830 (nan)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [107]  [1600/2502]  eta: 0:03:53  lr: 0.002720  min_lr: 0.002720  loss: 3.4963 (3.4311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (nan)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [107]  [1800/2502]  eta: 0:03:01  lr: 0.002719  min_lr: 0.002719  loss: 3.8518 (3.4392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (nan)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [107]  [2000/2502]  eta: 0:02:09  lr: 0.002718  min_lr: 0.002718  loss: 3.6102 (3.4395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (nan)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [107]  [2200/2502]  eta: 0:01:17  lr: 0.002716  min_lr: 0.002716  loss: 3.3721 (3.4394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (nan)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [107]  [2400/2502]  eta: 0:00:26  lr: 0.002715  min_lr: 0.002715  loss: 3.4769 (3.4352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (nan)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [107]  [2501/2502]  eta: 0:00:00  lr: 0.002714  min_lr: 0.002714  loss: 3.7864 (3.4364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (nan)  time: 0.2318  data: 0.0012  max mem: 18975
Epoch: [107] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002714  min_lr: 0.002714  loss: 3.7864 (3.4366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (nan)
Test:  [ 0/50]  eta: 0:02:37  loss: 0.7438 (0.7438)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 3.1442  data: 2.9870  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 1.1348 (1.1130)  acc1: 82.0000 (80.9455)  acc5: 95.6000 (95.8182)  time: 0.4873  data: 0.3629  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1238 (1.1139)  acc1: 78.8000 (79.9238)  acc5: 95.6000 (95.9238)  time: 0.2341  data: 0.1138  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2603 (1.2320)  acc1: 74.8000 (77.3677)  acc5: 94.0000 (94.3355)  time: 0.2599  data: 0.1399  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5469 (1.3129)  acc1: 69.6000 (75.4829)  acc5: 89.6000 (93.1415)  time: 0.2463  data: 0.1263  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5268 (1.3250)  acc1: 69.2000 (74.7680)  acc5: 89.6000 (92.8880)  time: 0.1921  data: 0.0728  max mem: 18975
Test: Total time: 0:00:14 (0.2875 s / it)
* Acc@1 74.888 Acc@5 92.936 loss 1.327
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.93%
Epoch: [108]  [   0/2502]  eta: 1:36:40  lr: 0.002714  min_lr: 0.002714  loss: 3.9247 (3.9247)  weight_decay: 0.0500 (0.0500)  time: 2.3184  data: 1.5110  max mem: 18975
Epoch: [108]  [ 200/2502]  eta: 0:10:20  lr: 0.002713  min_lr: 0.002713  loss: 3.8445 (3.4001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7549 (0.7736)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [108]  [ 400/2502]  eta: 0:09:12  lr: 0.002712  min_lr: 0.002712  loss: 3.8584 (3.4410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.7905)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [108]  [ 600/2502]  eta: 0:08:16  lr: 0.002710  min_lr: 0.002710  loss: 3.7368 (3.4427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8118 (0.7892)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [108]  [ 800/2502]  eta: 0:07:22  lr: 0.002709  min_lr: 0.002709  loss: 3.8697 (3.4387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8346 (0.7889)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [108]  [1000/2502]  eta: 0:06:29  lr: 0.002708  min_lr: 0.002708  loss: 3.5325 (3.4364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.7870)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [108]  [1200/2502]  eta: 0:05:37  lr: 0.002706  min_lr: 0.002706  loss: 3.7335 (3.4286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7735 (0.7861)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [108]  [1400/2502]  eta: 0:04:45  lr: 0.002705  min_lr: 0.002705  loss: 3.3063 (3.4359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7859 (0.7906)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [108]  [1600/2502]  eta: 0:03:53  lr: 0.002704  min_lr: 0.002704  loss: 3.2518 (3.4379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7287 (0.7884)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [108]  [1800/2502]  eta: 0:03:01  lr: 0.002702  min_lr: 0.002702  loss: 3.5286 (3.4363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7127 (0.7844)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [108]  [2000/2502]  eta: 0:02:09  lr: 0.002701  min_lr: 0.002701  loss: 3.7080 (3.4338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7081 (0.7788)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [108]  [2200/2502]  eta: 0:01:17  lr: 0.002700  min_lr: 0.002700  loss: 3.3857 (3.4346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7799)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [108]  [2400/2502]  eta: 0:00:26  lr: 0.002699  min_lr: 0.002699  loss: 3.8259 (3.4392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8008 (0.7780)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [108]  [2501/2502]  eta: 0:00:00  lr: 0.002698  min_lr: 0.002698  loss: 3.7245 (3.4391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7955 (0.7790)  time: 0.2309  data: 0.0007  max mem: 18975
Epoch: [108] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002698  min_lr: 0.002698  loss: 3.7245 (3.4299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7955 (0.7790)
Test:  [ 0/50]  eta: 0:02:16  loss: 0.7102 (0.7102)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 2.7293  data: 2.5889  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 1.0937 (1.0810)  acc1: 80.0000 (81.1636)  acc5: 95.2000 (95.7455)  time: 0.4646  data: 0.3437  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.1062 (1.1131)  acc1: 79.6000 (80.5143)  acc5: 95.6000 (95.8286)  time: 0.2642  data: 0.1447  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.3394 (1.2460)  acc1: 74.0000 (77.7032)  acc5: 92.4000 (93.9742)  time: 0.2968  data: 0.1759  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5754 (1.3241)  acc1: 70.8000 (75.6293)  acc5: 88.4000 (92.8195)  time: 0.2430  data: 0.1228  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5754 (1.3448)  acc1: 70.0000 (74.9680)  acc5: 88.8000 (92.6400)  time: 0.1908  data: 0.0720  max mem: 18975
Test: Total time: 0:00:14 (0.2835 s / it)
* Acc@1 74.872 Acc@5 92.898 loss 1.350
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.93%
Epoch: [109]  [   0/2502]  eta: 1:28:01  lr: 0.002698  min_lr: 0.002698  loss: 3.1597 (3.1597)  weight_decay: 0.0500 (0.0500)  time: 2.1107  data: 1.6585  max mem: 18975
Epoch: [109]  [ 200/2502]  eta: 0:10:17  lr: 0.002697  min_lr: 0.002697  loss: 2.8247 (3.3774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9102 (0.8215)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [109]  [ 400/2502]  eta: 0:09:12  lr: 0.002695  min_lr: 0.002695  loss: 3.0985 (3.4237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7169 (0.7890)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [109]  [ 600/2502]  eta: 0:08:16  lr: 0.002694  min_lr: 0.002694  loss: 3.1603 (3.4191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7404 (0.7889)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [109]  [ 800/2502]  eta: 0:07:22  lr: 0.002693  min_lr: 0.002693  loss: 3.6691 (3.4536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7541 (0.7817)  time: 0.2556  data: 0.0004  max mem: 18975
Epoch: [109]  [1000/2502]  eta: 0:06:29  lr: 0.002691  min_lr: 0.002691  loss: 3.8398 (3.4594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7941 (0.7812)  time: 0.2555  data: 0.0004  max mem: 18975
Epoch: [109]  [1200/2502]  eta: 0:05:37  lr: 0.002690  min_lr: 0.002690  loss: 2.9834 (3.4584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7944 (0.7846)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [109]  [1400/2502]  eta: 0:04:44  lr: 0.002689  min_lr: 0.002689  loss: 3.6119 (3.4738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7487 (0.7809)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [109]  [1600/2502]  eta: 0:03:53  lr: 0.002687  min_lr: 0.002687  loss: 3.3989 (3.4762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (0.7828)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [109]  [1800/2502]  eta: 0:03:01  lr: 0.002686  min_lr: 0.002686  loss: 3.8305 (3.4674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7727 (0.7859)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [109]  [2000/2502]  eta: 0:02:09  lr: 0.002685  min_lr: 0.002685  loss: 3.5437 (3.4582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (0.7860)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [109]  [2200/2502]  eta: 0:01:17  lr: 0.002683  min_lr: 0.002683  loss: 3.8561 (3.4640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7925 (0.7864)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [109]  [2400/2502]  eta: 0:00:26  lr: 0.002682  min_lr: 0.002682  loss: 3.5920 (3.4626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7865 (0.7868)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [109]  [2501/2502]  eta: 0:00:00  lr: 0.002681  min_lr: 0.002681  loss: 3.7992 (3.4634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.7856)  time: 0.2313  data: 0.0007  max mem: 18975
Epoch: [109] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002681  min_lr: 0.002681  loss: 3.7992 (3.4534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.7856)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.7420 (0.7420)  acc1: 92.4000 (92.4000)  acc5: 99.6000 (99.6000)  time: 3.3723  data: 3.2225  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0632 (1.0946)  acc1: 82.8000 (82.4000)  acc5: 95.6000 (96.1455)  time: 0.5462  data: 0.4145  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0855 (1.1011)  acc1: 79.6000 (80.7429)  acc5: 95.6000 (95.8476)  time: 0.2382  data: 0.1134  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2374 (1.2023)  acc1: 74.0000 (78.2194)  acc5: 93.2000 (94.3097)  time: 0.2298  data: 0.1098  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4920 (1.2842)  acc1: 71.2000 (76.0098)  acc5: 90.0000 (93.1902)  time: 0.2507  data: 0.1309  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4867 (1.2943)  acc1: 71.6000 (75.6080)  acc5: 89.6000 (92.9520)  time: 0.2272  data: 0.1081  max mem: 18975
Test: Total time: 0:00:15 (0.3003 s / it)
* Acc@1 75.274 Acc@5 93.082 loss 1.299
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.27%
Epoch: [110]  [   0/2502]  eta: 1:35:59  lr: 0.002681  min_lr: 0.002681  loss: 2.6712 (2.6712)  weight_decay: 0.0500 (0.0500)  time: 2.3020  data: 2.0242  max mem: 18975
Epoch: [110]  [ 200/2502]  eta: 0:10:20  lr: 0.002680  min_lr: 0.002680  loss: 3.2659 (3.3987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8627 (0.8305)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [110]  [ 400/2502]  eta: 0:09:13  lr: 0.002679  min_lr: 0.002679  loss: 3.3782 (3.4272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8259 (0.8162)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [110]  [ 600/2502]  eta: 0:08:17  lr: 0.002677  min_lr: 0.002677  loss: 3.3818 (3.4103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8110 (0.8200)  time: 0.2670  data: 0.0004  max mem: 18975
Epoch: [110]  [ 800/2502]  eta: 0:07:22  lr: 0.002676  min_lr: 0.002676  loss: 3.7454 (3.4186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7461 (0.8061)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [110]  [1000/2502]  eta: 0:06:29  lr: 0.002675  min_lr: 0.002675  loss: 3.2094 (3.4110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8830 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [110]  [1200/2502]  eta: 0:05:37  lr: 0.002673  min_lr: 0.002673  loss: 3.8789 (3.4317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7690 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [110]  [1400/2502]  eta: 0:04:45  lr: 0.002672  min_lr: 0.002672  loss: 3.4505 (3.4311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (inf)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [110]  [1600/2502]  eta: 0:03:53  lr: 0.002671  min_lr: 0.002671  loss: 3.2549 (3.4379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7859 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [110]  [1800/2502]  eta: 0:03:01  lr: 0.002669  min_lr: 0.002669  loss: 3.6005 (3.4422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7145 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [110]  [2000/2502]  eta: 0:02:09  lr: 0.002668  min_lr: 0.002668  loss: 3.6371 (3.4421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (inf)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [110]  [2200/2502]  eta: 0:01:18  lr: 0.002667  min_lr: 0.002667  loss: 3.5610 (3.4382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [110]  [2400/2502]  eta: 0:00:26  lr: 0.002665  min_lr: 0.002665  loss: 3.6520 (3.4420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7504 (inf)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [110]  [2501/2502]  eta: 0:00:00  lr: 0.002665  min_lr: 0.002665  loss: 3.8617 (3.4404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (inf)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [110] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.002665  min_lr: 0.002665  loss: 3.8617 (3.4409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (inf)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.6702 (0.6702)  acc1: 91.6000 (91.6000)  acc5: 98.8000 (98.8000)  time: 3.3739  data: 3.2353  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 0.9690 (1.0031)  acc1: 82.4000 (81.6364)  acc5: 96.8000 (95.7818)  time: 0.5789  data: 0.4578  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9971 (1.0142)  acc1: 78.0000 (80.0381)  acc5: 95.6000 (95.7333)  time: 0.2947  data: 0.1755  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1967 (1.1205)  acc1: 74.4000 (77.7161)  acc5: 93.2000 (94.1677)  time: 0.2411  data: 0.1219  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4162 (1.2052)  acc1: 70.4000 (75.6585)  acc5: 90.0000 (93.1415)  time: 0.2115  data: 0.0924  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3977 (1.2189)  acc1: 70.8000 (75.2000)  acc5: 90.4000 (92.9760)  time: 0.2106  data: 0.0917  max mem: 18975
Test: Total time: 0:00:15 (0.3004 s / it)
* Acc@1 75.288 Acc@5 93.168 loss 1.216
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.29%
Epoch: [111]  [   0/2502]  eta: 1:45:45  lr: 0.002665  min_lr: 0.002665  loss: 3.4669 (3.4669)  weight_decay: 0.0500 (0.0500)  time: 2.5361  data: 2.2696  max mem: 18975
Epoch: [111]  [ 200/2502]  eta: 0:10:23  lr: 0.002663  min_lr: 0.002663  loss: 3.7696 (3.4657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.7693)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [111]  [ 400/2502]  eta: 0:09:15  lr: 0.002662  min_lr: 0.002662  loss: 3.2530 (3.4130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.7943)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [111]  [ 600/2502]  eta: 0:08:18  lr: 0.002661  min_lr: 0.002661  loss: 3.3254 (3.4197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7841)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [111]  [ 800/2502]  eta: 0:07:24  lr: 0.002659  min_lr: 0.002659  loss: 3.4655 (3.4139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (0.7785)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [111]  [1000/2502]  eta: 0:06:31  lr: 0.002658  min_lr: 0.002658  loss: 3.0611 (3.4160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7975 (0.7843)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [111]  [1200/2502]  eta: 0:05:38  lr: 0.002657  min_lr: 0.002657  loss: 3.8494 (3.4241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7634 (0.7835)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [111]  [1400/2502]  eta: 0:04:45  lr: 0.002655  min_lr: 0.002655  loss: 3.8433 (3.4347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.7876)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [111]  [1600/2502]  eta: 0:03:53  lr: 0.002654  min_lr: 0.002654  loss: 3.2442 (3.4331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.7866)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [111]  [1800/2502]  eta: 0:03:01  lr: 0.002653  min_lr: 0.002653  loss: 3.5844 (3.4350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.7860)  time: 0.2659  data: 0.0003  max mem: 18975
Epoch: [111]  [2000/2502]  eta: 0:02:09  lr: 0.002651  min_lr: 0.002651  loss: 3.3847 (3.4351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8202 (0.7852)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [111]  [2200/2502]  eta: 0:01:18  lr: 0.002650  min_lr: 0.002650  loss: 3.7687 (3.4399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.7871)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [111]  [2400/2502]  eta: 0:00:26  lr: 0.002648  min_lr: 0.002648  loss: 3.4705 (3.4386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (0.7866)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [111]  [2501/2502]  eta: 0:00:00  lr: 0.002648  min_lr: 0.002648  loss: 3.1946 (3.4389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7794 (0.7869)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [111] Total time: 0:10:47 (0.2586 s / it)
Averaged stats: lr: 0.002648  min_lr: 0.002648  loss: 3.1946 (3.4276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7794 (0.7869)
Test:  [ 0/50]  eta: 0:02:31  loss: 0.5578 (0.5578)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 3.0323  data: 2.8897  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.9695 (0.9582)  acc1: 81.2000 (81.2364)  acc5: 96.8000 (96.0000)  time: 0.4512  data: 0.3282  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9588 (0.9609)  acc1: 78.8000 (80.5143)  acc5: 96.4000 (95.9429)  time: 0.2561  data: 0.1354  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0901 (1.0861)  acc1: 76.8000 (78.0000)  acc5: 92.8000 (94.3871)  time: 0.2971  data: 0.1772  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4361 (1.1726)  acc1: 70.4000 (75.9220)  acc5: 90.0000 (93.3659)  time: 0.2600  data: 0.1409  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4277 (1.1918)  acc1: 70.0000 (75.3760)  acc5: 90.0000 (93.0240)  time: 0.2266  data: 0.1077  max mem: 18975
Test: Total time: 0:00:15 (0.3084 s / it)
* Acc@1 75.190 Acc@5 93.096 loss 1.189
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.29%
Epoch: [112]  [   0/2502]  eta: 1:45:19  lr: 0.002648  min_lr: 0.002648  loss: 2.3793 (2.3793)  weight_decay: 0.0500 (0.0500)  time: 2.5257  data: 2.2502  max mem: 18975
Epoch: [112]  [ 200/2502]  eta: 0:10:23  lr: 0.002646  min_lr: 0.002646  loss: 3.7635 (3.4304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (0.7946)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [112]  [ 400/2502]  eta: 0:09:15  lr: 0.002645  min_lr: 0.002645  loss: 3.1806 (3.4137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7758 (0.7972)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [112]  [ 600/2502]  eta: 0:08:18  lr: 0.002644  min_lr: 0.002644  loss: 3.6601 (3.4382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7631 (0.8029)  time: 0.2640  data: 0.0003  max mem: 18975
Epoch: [112]  [ 800/2502]  eta: 0:07:23  lr: 0.002642  min_lr: 0.002642  loss: 3.5715 (3.4314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7421 (0.7927)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [112]  [1000/2502]  eta: 0:06:30  lr: 0.002641  min_lr: 0.002641  loss: 3.0642 (3.4360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8073 (0.7885)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [112]  [1200/2502]  eta: 0:05:37  lr: 0.002640  min_lr: 0.002640  loss: 3.6725 (3.4400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.7882)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [112]  [1400/2502]  eta: 0:04:45  lr: 0.002638  min_lr: 0.002638  loss: 3.3172 (3.4327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.7848)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [112]  [1600/2502]  eta: 0:03:53  lr: 0.002637  min_lr: 0.002637  loss: 3.5871 (3.4359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7149 (0.7913)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [112]  [1800/2502]  eta: 0:03:01  lr: 0.002636  min_lr: 0.002636  loss: 3.3553 (3.4326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8131 (0.7963)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [112]  [2000/2502]  eta: 0:02:09  lr: 0.002634  min_lr: 0.002634  loss: 3.0173 (3.4342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8056 (0.7933)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [112]  [2200/2502]  eta: 0:01:18  lr: 0.002633  min_lr: 0.002633  loss: 3.0454 (3.4459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8150 (0.7934)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [112]  [2400/2502]  eta: 0:00:26  lr: 0.002632  min_lr: 0.002632  loss: 2.7964 (3.4383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7922)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [112]  [2501/2502]  eta: 0:00:00  lr: 0.002631  min_lr: 0.002631  loss: 3.6204 (3.4375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7726 (0.7915)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [112] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.002631  min_lr: 0.002631  loss: 3.6204 (3.4207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7726 (0.7915)
Test:  [ 0/50]  eta: 0:03:10  loss: 0.6499 (0.6499)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.8109  data: 3.6668  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9839 (0.9818)  acc1: 81.6000 (81.8545)  acc5: 96.0000 (95.7818)  time: 0.5019  data: 0.3791  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9949 (1.0179)  acc1: 80.8000 (80.9333)  acc5: 96.0000 (96.0381)  time: 0.2033  data: 0.0823  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1649 (1.1495)  acc1: 75.6000 (78.2452)  acc5: 93.2000 (94.1936)  time: 0.2608  data: 0.1405  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4695 (1.2286)  acc1: 70.0000 (76.0976)  acc5: 89.6000 (93.1610)  time: 0.2707  data: 0.1516  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4592 (1.2429)  acc1: 70.4000 (75.6320)  acc5: 90.0000 (92.9680)  time: 0.2074  data: 0.0865  max mem: 18975
Test: Total time: 0:00:14 (0.2977 s / it)
* Acc@1 75.356 Acc@5 93.128 loss 1.246
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.36%
Epoch: [113]  [   0/2502]  eta: 1:21:35  lr: 0.002631  min_lr: 0.002631  loss: 4.2295 (4.2295)  weight_decay: 0.0500 (0.0500)  time: 1.9565  data: 1.6901  max mem: 18975
Epoch: [113]  [ 200/2502]  eta: 0:10:15  lr: 0.002630  min_lr: 0.002630  loss: 3.7749 (3.3505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.8034)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [113]  [ 400/2502]  eta: 0:09:12  lr: 0.002628  min_lr: 0.002628  loss: 3.5622 (3.3416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7747 (0.8039)  time: 0.2631  data: 0.0004  max mem: 18975
Epoch: [113]  [ 600/2502]  eta: 0:08:16  lr: 0.002627  min_lr: 0.002627  loss: 2.9582 (3.3791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8025 (0.8038)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [113]  [ 800/2502]  eta: 0:07:22  lr: 0.002625  min_lr: 0.002625  loss: 3.1463 (3.3924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8148 (0.8011)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [113]  [1000/2502]  eta: 0:06:29  lr: 0.002624  min_lr: 0.002624  loss: 3.2948 (3.4207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8787 (0.8093)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [113]  [1200/2502]  eta: 0:05:37  lr: 0.002623  min_lr: 0.002623  loss: 3.6172 (3.4184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.8042)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [113]  [1400/2502]  eta: 0:04:45  lr: 0.002621  min_lr: 0.002621  loss: 3.0732 (3.4102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.8003)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [113]  [1600/2502]  eta: 0:03:53  lr: 0.002620  min_lr: 0.002620  loss: 3.4339 (3.4065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8029)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [113]  [1800/2502]  eta: 0:03:01  lr: 0.002619  min_lr: 0.002619  loss: 3.3296 (3.4104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.8009)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [113]  [2000/2502]  eta: 0:02:09  lr: 0.002617  min_lr: 0.002617  loss: 3.5545 (3.4091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7839 (0.8005)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [113]  [2200/2502]  eta: 0:01:18  lr: 0.002616  min_lr: 0.002616  loss: 3.2036 (3.4088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7426 (0.7981)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [113]  [2400/2502]  eta: 0:00:26  lr: 0.002615  min_lr: 0.002615  loss: 3.0546 (3.4050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7702 (0.7969)  time: 0.2621  data: 0.0004  max mem: 18975
Epoch: [113]  [2501/2502]  eta: 0:00:00  lr: 0.002614  min_lr: 0.002614  loss: 3.4882 (3.4064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (0.7954)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [113] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002614  min_lr: 0.002614  loss: 3.4882 (3.4165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (0.7954)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.5832 (0.5832)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 3.3679  data: 3.2303  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 1.0054 (0.9633)  acc1: 79.6000 (81.0546)  acc5: 96.0000 (95.6727)  time: 0.5359  data: 0.4150  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9992 (0.9660)  acc1: 79.2000 (80.4191)  acc5: 96.0000 (95.9810)  time: 0.2313  data: 0.1117  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1840 (1.0822)  acc1: 75.2000 (77.8968)  acc5: 92.4000 (94.4129)  time: 0.2292  data: 0.1087  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3734 (1.1543)  acc1: 71.2000 (75.9805)  acc5: 90.4000 (93.5512)  time: 0.2485  data: 0.1285  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3356 (1.1682)  acc1: 69.6000 (75.2880)  acc5: 91.2000 (93.3680)  time: 0.2508  data: 0.1321  max mem: 18975
Test: Total time: 0:00:15 (0.3018 s / it)
* Acc@1 75.348 Acc@5 93.302 loss 1.177
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.36%
Epoch: [114]  [   0/2502]  eta: 1:33:35  lr: 0.002614  min_lr: 0.002614  loss: 4.0766 (4.0766)  weight_decay: 0.0500 (0.0500)  time: 2.2444  data: 1.9724  max mem: 18975
Epoch: [114]  [ 200/2502]  eta: 0:10:17  lr: 0.002613  min_lr: 0.002613  loss: 3.3087 (3.4040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7540 (0.7855)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [114]  [ 400/2502]  eta: 0:09:11  lr: 0.002611  min_lr: 0.002611  loss: 3.4580 (3.4279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7913 (0.8027)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [114]  [ 600/2502]  eta: 0:08:15  lr: 0.002610  min_lr: 0.002610  loss: 3.9327 (3.4307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8022 (0.7992)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [114]  [ 800/2502]  eta: 0:07:22  lr: 0.002608  min_lr: 0.002608  loss: 3.7604 (3.4259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8250 (0.8034)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [114]  [1000/2502]  eta: 0:06:29  lr: 0.002607  min_lr: 0.002607  loss: 3.1668 (3.4118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.8024)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [114]  [1200/2502]  eta: 0:05:37  lr: 0.002606  min_lr: 0.002606  loss: 3.6617 (3.4134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8126 (0.8038)  time: 0.2628  data: 0.0004  max mem: 18975
Epoch: [114]  [1400/2502]  eta: 0:04:45  lr: 0.002604  min_lr: 0.002604  loss: 3.8608 (3.4309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7002 (0.7958)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [114]  [1600/2502]  eta: 0:03:53  lr: 0.002603  min_lr: 0.002603  loss: 3.5364 (3.4296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7977)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [114]  [1800/2502]  eta: 0:03:01  lr: 0.002602  min_lr: 0.002602  loss: 3.8575 (3.4282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8030 (0.7996)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [114]  [2000/2502]  eta: 0:02:09  lr: 0.002600  min_lr: 0.002600  loss: 3.3760 (3.4268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.8025)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [114]  [2200/2502]  eta: 0:01:17  lr: 0.002599  min_lr: 0.002599  loss: 3.1731 (3.4316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.8040)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [114]  [2400/2502]  eta: 0:00:26  lr: 0.002597  min_lr: 0.002597  loss: 3.5587 (3.4343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7343 (0.8015)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [114]  [2501/2502]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.4916 (3.4341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7650 (0.8014)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [114] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.4916 (3.4184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7650 (0.8014)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.6694 (0.6694)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.2459  data: 3.1024  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9076 (0.9652)  acc1: 84.0000 (81.4182)  acc5: 96.4000 (96.0727)  time: 0.5260  data: 0.4034  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9695 (0.9922)  acc1: 80.0000 (80.3810)  acc5: 96.4000 (96.0952)  time: 0.2240  data: 0.1041  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1663 (1.1144)  acc1: 74.4000 (77.7032)  acc5: 93.2000 (94.3613)  time: 0.2087  data: 0.0881  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.4165 (1.1884)  acc1: 71.6000 (75.9024)  acc5: 89.2000 (93.4049)  time: 0.2190  data: 0.0985  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3521 (1.1960)  acc1: 70.8000 (75.3600)  acc5: 92.0000 (93.3440)  time: 0.2097  data: 0.0907  max mem: 18975
Test: Total time: 0:00:13 (0.2725 s / it)
* Acc@1 75.216 Acc@5 93.282 loss 1.203
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.36%
Epoch: [115]  [   0/2502]  eta: 1:20:26  lr: 0.002597  min_lr: 0.002597  loss: 3.9195 (3.9195)  weight_decay: 0.0500 (0.0500)  time: 1.9289  data: 1.3458  max mem: 18975
Epoch: [115]  [ 200/2502]  eta: 0:10:19  lr: 0.002595  min_lr: 0.002595  loss: 3.5287 (3.4026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.7787)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [115]  [ 400/2502]  eta: 0:09:14  lr: 0.002594  min_lr: 0.002594  loss: 3.0112 (3.4046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7061 (0.7867)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [115]  [ 600/2502]  eta: 0:08:17  lr: 0.002593  min_lr: 0.002593  loss: 3.1572 (3.4046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7981 (0.7773)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [115]  [ 800/2502]  eta: 0:07:23  lr: 0.002591  min_lr: 0.002591  loss: 3.1838 (3.3954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7743)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [115]  [1000/2502]  eta: 0:06:30  lr: 0.002590  min_lr: 0.002590  loss: 3.2186 (3.4038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.7709)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [115]  [1200/2502]  eta: 0:05:37  lr: 0.002589  min_lr: 0.002589  loss: 2.9722 (3.4090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.7779)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [115]  [1400/2502]  eta: 0:04:45  lr: 0.002587  min_lr: 0.002587  loss: 3.2061 (3.4061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7675 (0.7793)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [115]  [1600/2502]  eta: 0:03:53  lr: 0.002586  min_lr: 0.002586  loss: 3.3076 (3.3992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8146 (0.7835)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [115]  [1800/2502]  eta: 0:03:01  lr: 0.002584  min_lr: 0.002584  loss: 2.7674 (3.3949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7173 (0.7803)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [115]  [2000/2502]  eta: 0:02:09  lr: 0.002583  min_lr: 0.002583  loss: 3.4698 (3.3896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7579 (0.7831)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [115]  [2200/2502]  eta: 0:01:17  lr: 0.002582  min_lr: 0.002582  loss: 3.2653 (3.3919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7603 (0.7837)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [115]  [2400/2502]  eta: 0:00:26  lr: 0.002580  min_lr: 0.002580  loss: 3.0218 (3.3947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.7852)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [115]  [2501/2502]  eta: 0:00:00  lr: 0.002580  min_lr: 0.002580  loss: 3.2409 (3.3924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7953 (0.7853)  time: 0.2311  data: 0.0008  max mem: 18975
Epoch: [115] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002580  min_lr: 0.002580  loss: 3.2409 (3.4169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7953 (0.7853)
Test:  [ 0/50]  eta: 0:02:50  loss: 0.7236 (0.7236)  acc1: 92.8000 (92.8000)  acc5: 98.8000 (98.8000)  time: 3.4056  data: 3.2547  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9747 (1.0004)  acc1: 79.6000 (80.9818)  acc5: 96.4000 (96.1091)  time: 0.5185  data: 0.3962  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0108 (1.0171)  acc1: 79.6000 (79.9238)  acc5: 96.4000 (96.0191)  time: 0.2374  data: 0.1165  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1536 (1.1174)  acc1: 75.2000 (77.5613)  acc5: 93.6000 (94.4645)  time: 0.2439  data: 0.1214  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4037 (1.1934)  acc1: 70.4000 (75.3854)  acc5: 90.4000 (93.3366)  time: 0.2427  data: 0.1221  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3821 (1.2055)  acc1: 71.2000 (74.9200)  acc5: 90.4000 (93.1920)  time: 0.2026  data: 0.0828  max mem: 18975
Test: Total time: 0:00:14 (0.2925 s / it)
* Acc@1 75.216 Acc@5 93.140 loss 1.211
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.36%
Epoch: [116]  [   0/2502]  eta: 1:29:55  lr: 0.002580  min_lr: 0.002580  loss: 3.6557 (3.6557)  weight_decay: 0.0500 (0.0500)  time: 2.1565  data: 1.4326  max mem: 18975
Epoch: [116]  [ 200/2502]  eta: 0:10:20  lr: 0.002578  min_lr: 0.002578  loss: 3.0944 (3.4221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7428 (0.7927)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [116]  [ 400/2502]  eta: 0:09:13  lr: 0.002577  min_lr: 0.002577  loss: 3.4595 (3.4169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9575 (0.8361)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [116]  [ 600/2502]  eta: 0:08:17  lr: 0.002575  min_lr: 0.002575  loss: 2.7698 (3.3745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7947 (0.8172)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [116]  [ 800/2502]  eta: 0:07:23  lr: 0.002574  min_lr: 0.002574  loss: 3.5701 (3.3912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7872 (0.8077)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [116]  [1000/2502]  eta: 0:06:30  lr: 0.002573  min_lr: 0.002573  loss: 3.6835 (3.4029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7660 (0.8060)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [116]  [1200/2502]  eta: 0:05:37  lr: 0.002571  min_lr: 0.002571  loss: 3.7371 (3.3967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.8049)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [116]  [1400/2502]  eta: 0:04:45  lr: 0.002570  min_lr: 0.002570  loss: 3.6069 (3.4041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7902 (0.7989)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [116]  [1600/2502]  eta: 0:03:53  lr: 0.002568  min_lr: 0.002568  loss: 3.1999 (3.4088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (0.7985)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [116]  [1800/2502]  eta: 0:03:01  lr: 0.002567  min_lr: 0.002567  loss: 3.3118 (3.4058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7785 (0.8016)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [116]  [2000/2502]  eta: 0:02:09  lr: 0.002566  min_lr: 0.002566  loss: 3.0638 (3.4037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7461 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [116]  [2200/2502]  eta: 0:01:17  lr: 0.002564  min_lr: 0.002564  loss: 3.0937 (3.4059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8395 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [116]  [2400/2502]  eta: 0:00:26  lr: 0.002563  min_lr: 0.002563  loss: 3.5359 (3.4044)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [116]  [2501/2502]  eta: 0:00:00  lr: 0.002562  min_lr: 0.002562  loss: 3.7347 (3.4049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7902 (nan)  time: 0.2311  data: 0.0010  max mem: 18975
Epoch: [116] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.002562  min_lr: 0.002562  loss: 3.7347 (3.4163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7902 (nan)
Test:  [ 0/50]  eta: 0:03:17  loss: 0.6934 (0.6934)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.9506  data: 3.8112  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 1.0283 (1.0223)  acc1: 79.6000 (81.1636)  acc5: 96.0000 (96.0000)  time: 0.5566  data: 0.4347  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0283 (1.0500)  acc1: 79.2000 (80.4191)  acc5: 95.6000 (95.9429)  time: 0.2161  data: 0.0961  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2027 (1.1693)  acc1: 75.6000 (77.6516)  acc5: 92.4000 (94.2839)  time: 0.2279  data: 0.1082  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4676 (1.2421)  acc1: 69.6000 (75.6585)  acc5: 90.0000 (93.3268)  time: 0.2298  data: 0.1106  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4725 (1.2574)  acc1: 69.6000 (74.9760)  acc5: 90.0000 (93.0640)  time: 0.1931  data: 0.0736  max mem: 18975
Test: Total time: 0:00:14 (0.2906 s / it)
* Acc@1 75.072 Acc@5 93.154 loss 1.257
Accuracy of the model on the 50000 test images: 75.1%
Max accuracy: 75.36%
Epoch: [117]  [   0/2502]  eta: 1:35:12  lr: 0.002562  min_lr: 0.002562  loss: 2.5597 (2.5597)  weight_decay: 0.0500 (0.0500)  time: 2.2830  data: 2.0034  max mem: 18975
Epoch: [117]  [ 200/2502]  eta: 0:10:20  lr: 0.002561  min_lr: 0.002561  loss: 3.3405 (3.3372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.7697)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [117]  [ 400/2502]  eta: 0:09:13  lr: 0.002559  min_lr: 0.002559  loss: 3.2932 (3.3368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6734 (0.7623)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [117]  [ 600/2502]  eta: 0:08:16  lr: 0.002558  min_lr: 0.002558  loss: 3.2564 (3.3628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.7721)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [117]  [ 800/2502]  eta: 0:07:22  lr: 0.002557  min_lr: 0.002557  loss: 3.5903 (3.3851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7827 (0.7677)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [117]  [1000/2502]  eta: 0:06:30  lr: 0.002555  min_lr: 0.002555  loss: 3.8468 (3.3934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7479 (0.7666)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [117]  [1200/2502]  eta: 0:05:37  lr: 0.002554  min_lr: 0.002554  loss: 3.1553 (3.3956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8256 (0.7782)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [117]  [1400/2502]  eta: 0:04:45  lr: 0.002552  min_lr: 0.002552  loss: 3.7481 (3.3920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.7828)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [117]  [1600/2502]  eta: 0:03:53  lr: 0.002551  min_lr: 0.002551  loss: 2.9633 (3.3882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8562 (0.7885)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [117]  [1800/2502]  eta: 0:03:01  lr: 0.002550  min_lr: 0.002550  loss: 3.6971 (3.3879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7595 (0.7853)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [117]  [2000/2502]  eta: 0:02:09  lr: 0.002548  min_lr: 0.002548  loss: 3.5467 (3.3915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.7844)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [117]  [2200/2502]  eta: 0:01:18  lr: 0.002547  min_lr: 0.002547  loss: 3.0591 (3.3931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7401 (0.7846)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [117]  [2400/2502]  eta: 0:00:26  lr: 0.002545  min_lr: 0.002545  loss: 3.7551 (3.3914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7606 (0.7839)  time: 0.2567  data: 0.0005  max mem: 18975
Epoch: [117]  [2501/2502]  eta: 0:00:00  lr: 0.002545  min_lr: 0.002545  loss: 3.5172 (3.3837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7261 (0.7848)  time: 0.2306  data: 0.0009  max mem: 18975
Epoch: [117] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002545  min_lr: 0.002545  loss: 3.5172 (3.4034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7261 (0.7848)
Test:  [ 0/50]  eta: 0:03:15  loss: 0.6323 (0.6323)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 3.9178  data: 3.7793  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.9605 (0.9695)  acc1: 81.6000 (80.7273)  acc5: 96.4000 (96.0364)  time: 0.6194  data: 0.4979  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9887 (0.9979)  acc1: 78.0000 (80.0571)  acc5: 96.4000 (96.1143)  time: 0.2909  data: 0.1708  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1822 (1.1099)  acc1: 72.8000 (77.4710)  acc5: 92.0000 (94.2194)  time: 0.2648  data: 0.1449  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4073 (1.1822)  acc1: 70.8000 (75.5902)  acc5: 89.2000 (93.2000)  time: 0.2026  data: 0.0836  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3877 (1.1928)  acc1: 70.0000 (75.0480)  acc5: 90.0000 (93.1120)  time: 0.2022  data: 0.0834  max mem: 18975
Test: Total time: 0:00:14 (0.2995 s / it)
* Acc@1 75.264 Acc@5 93.140 loss 1.191
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.36%
Epoch: [118]  [   0/2502]  eta: 1:36:12  lr: 0.002545  min_lr: 0.002545  loss: 3.3652 (3.3652)  weight_decay: 0.0500 (0.0500)  time: 2.3072  data: 1.9600  max mem: 18975
Epoch: [118]  [ 200/2502]  eta: 0:10:20  lr: 0.002543  min_lr: 0.002543  loss: 3.3921 (3.4214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7976 (0.7928)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [118]  [ 400/2502]  eta: 0:09:12  lr: 0.002542  min_lr: 0.002542  loss: 3.0689 (3.4035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7494 (0.7763)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [118]  [ 600/2502]  eta: 0:08:15  lr: 0.002541  min_lr: 0.002541  loss: 3.3370 (3.3745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7897 (0.8011)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [118]  [ 800/2502]  eta: 0:07:22  lr: 0.002539  min_lr: 0.002539  loss: 3.3999 (3.3668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.7969)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [118]  [1000/2502]  eta: 0:06:29  lr: 0.002538  min_lr: 0.002538  loss: 3.7406 (3.3854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7798 (0.7973)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [118]  [1200/2502]  eta: 0:05:37  lr: 0.002536  min_lr: 0.002536  loss: 3.5608 (3.3782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8503 (0.7976)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [118]  [1400/2502]  eta: 0:04:45  lr: 0.002535  min_lr: 0.002535  loss: 3.7147 (3.3751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7381 (0.7945)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [118]  [1600/2502]  eta: 0:03:53  lr: 0.002534  min_lr: 0.002534  loss: 3.8120 (3.3713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7831 (0.7950)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [118]  [1800/2502]  eta: 0:03:01  lr: 0.002532  min_lr: 0.002532  loss: 2.7828 (3.3781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7782 (0.8002)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [118]  [2000/2502]  eta: 0:02:09  lr: 0.002531  min_lr: 0.002531  loss: 3.3418 (3.3806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8187 (0.8022)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [118]  [2200/2502]  eta: 0:01:17  lr: 0.002529  min_lr: 0.002529  loss: 2.8240 (3.3763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8722 (0.8032)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [118]  [2400/2502]  eta: 0:00:26  lr: 0.002528  min_lr: 0.002528  loss: 2.8773 (3.3796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8483 (0.8052)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [118]  [2501/2502]  eta: 0:00:00  lr: 0.002527  min_lr: 0.002527  loss: 3.8114 (3.3821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.8046)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [118] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002527  min_lr: 0.002527  loss: 3.8114 (3.4057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.8046)
Test:  [ 0/50]  eta: 0:03:14  loss: 0.7866 (0.7866)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 3.8971  data: 3.7522  max mem: 18975
Test:  [10/50]  eta: 0:00:25  loss: 0.9629 (0.9813)  acc1: 83.6000 (81.1636)  acc5: 96.4000 (95.9273)  time: 0.6312  data: 0.5083  max mem: 18975
Test:  [20/50]  eta: 0:00:14  loss: 0.9702 (0.9922)  acc1: 79.2000 (80.5333)  acc5: 96.4000 (95.8476)  time: 0.3042  data: 0.1841  max mem: 18975
Test:  [30/50]  eta: 0:00:08  loss: 1.1324 (1.1159)  acc1: 77.2000 (77.9871)  acc5: 93.6000 (94.2194)  time: 0.2852  data: 0.1655  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4364 (1.1963)  acc1: 70.4000 (75.8537)  acc5: 90.4000 (93.2878)  time: 0.2136  data: 0.0942  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4282 (1.2154)  acc1: 68.4000 (75.1680)  acc5: 90.4000 (93.1600)  time: 0.2129  data: 0.0939  max mem: 18975
Test: Total time: 0:00:15 (0.3087 s / it)
* Acc@1 75.156 Acc@5 93.176 loss 1.220
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.36%
Epoch: [119]  [   0/2502]  eta: 1:33:31  lr: 0.002527  min_lr: 0.002527  loss: 2.5130 (2.5130)  weight_decay: 0.0500 (0.0500)  time: 2.2430  data: 1.9214  max mem: 18975
Epoch: [119]  [ 200/2502]  eta: 0:10:24  lr: 0.002526  min_lr: 0.002526  loss: 3.5785 (3.3698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8148 (0.7971)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [119]  [ 400/2502]  eta: 0:09:15  lr: 0.002524  min_lr: 0.002524  loss: 3.1510 (3.4033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (0.7954)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [119]  [ 600/2502]  eta: 0:08:17  lr: 0.002523  min_lr: 0.002523  loss: 3.1659 (3.3948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8397 (0.8158)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [119]  [ 800/2502]  eta: 0:07:23  lr: 0.002522  min_lr: 0.002522  loss: 3.2613 (3.3830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7349 (0.7976)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [119]  [1000/2502]  eta: 0:06:30  lr: 0.002520  min_lr: 0.002520  loss: 3.3558 (3.3968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7185 (0.7875)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [119]  [1200/2502]  eta: 0:05:37  lr: 0.002519  min_lr: 0.002519  loss: 3.6503 (3.3911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.7869)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [119]  [1400/2502]  eta: 0:04:45  lr: 0.002517  min_lr: 0.002517  loss: 3.4090 (3.3940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (0.7935)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [119]  [1600/2502]  eta: 0:03:53  lr: 0.002516  min_lr: 0.002516  loss: 3.7251 (3.3930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.7941)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [119]  [1800/2502]  eta: 0:03:01  lr: 0.002515  min_lr: 0.002515  loss: 3.5470 (3.3933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8089 (0.7965)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [119]  [2000/2502]  eta: 0:02:09  lr: 0.002513  min_lr: 0.002513  loss: 3.6689 (3.3950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7672 (0.7956)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [119]  [2200/2502]  eta: 0:01:18  lr: 0.002512  min_lr: 0.002512  loss: 3.3868 (3.3962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7857 (0.7949)  time: 0.2609  data: 0.0004  max mem: 18975
Epoch: [119]  [2400/2502]  eta: 0:00:26  lr: 0.002510  min_lr: 0.002510  loss: 3.7323 (3.3992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.7932)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [119]  [2501/2502]  eta: 0:00:00  lr: 0.002510  min_lr: 0.002510  loss: 3.4199 (3.4003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8181 (0.7939)  time: 0.2310  data: 0.0008  max mem: 18975
Epoch: [119] Total time: 0:10:47 (0.2587 s / it)
Averaged stats: lr: 0.002510  min_lr: 0.002510  loss: 3.4199 (3.4062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8181 (0.7939)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.6072 (0.6072)  acc1: 92.4000 (92.4000)  acc5: 98.4000 (98.4000)  time: 2.8117  data: 2.6664  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9964 (1.0054)  acc1: 81.6000 (81.1273)  acc5: 96.4000 (96.0000)  time: 0.4855  data: 0.3634  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0004 (1.0140)  acc1: 80.0000 (80.2476)  acc5: 96.0000 (96.0191)  time: 0.2449  data: 0.1255  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2085 (1.1410)  acc1: 74.8000 (77.6516)  acc5: 93.2000 (94.4258)  time: 0.2479  data: 0.1289  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4744 (1.2284)  acc1: 70.4000 (75.8049)  acc5: 90.4000 (93.3659)  time: 0.2677  data: 0.1481  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4354 (1.2377)  acc1: 70.4000 (75.3920)  acc5: 90.4000 (93.1920)  time: 0.2162  data: 0.0968  max mem: 18975
Test: Total time: 0:00:14 (0.2920 s / it)
* Acc@1 75.396 Acc@5 93.152 loss 1.233
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.40%
Epoch: [120]  [   0/2502]  eta: 1:25:31  lr: 0.002510  min_lr: 0.002510  loss: 3.6100 (3.6100)  weight_decay: 0.0500 (0.0500)  time: 2.0509  data: 1.7664  max mem: 18975
Epoch: [120]  [ 200/2502]  eta: 0:10:16  lr: 0.002508  min_lr: 0.002508  loss: 3.4071 (3.3646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8570 (0.8404)  time: 0.2636  data: 0.0004  max mem: 18975
Epoch: [120]  [ 400/2502]  eta: 0:09:11  lr: 0.002507  min_lr: 0.002507  loss: 3.5994 (3.3174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7581 (0.8362)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [120]  [ 600/2502]  eta: 0:08:15  lr: 0.002505  min_lr: 0.002505  loss: 3.5492 (3.3434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7953 (0.8253)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [120]  [ 800/2502]  eta: 0:07:22  lr: 0.002504  min_lr: 0.002504  loss: 3.7823 (3.3706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8501 (0.8202)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [120]  [1000/2502]  eta: 0:06:29  lr: 0.002502  min_lr: 0.002502  loss: 2.9976 (3.3648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7589 (0.8199)  time: 0.2587  data: 0.0004  max mem: 18975
Epoch: [120]  [1200/2502]  eta: 0:05:37  lr: 0.002501  min_lr: 0.002501  loss: 3.7217 (3.3747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7423 (0.8139)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [120]  [1400/2502]  eta: 0:04:45  lr: 0.002500  min_lr: 0.002500  loss: 3.3672 (3.3784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8021 (0.8183)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [120]  [1600/2502]  eta: 0:03:53  lr: 0.002498  min_lr: 0.002498  loss: 3.7408 (3.3799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8567 (0.8178)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [120]  [1800/2502]  eta: 0:03:01  lr: 0.002497  min_lr: 0.002497  loss: 3.6568 (3.3807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8521 (0.8204)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [120]  [2000/2502]  eta: 0:02:09  lr: 0.002495  min_lr: 0.002495  loss: 3.3384 (3.3777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7741 (0.8177)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [120]  [2200/2502]  eta: 0:01:17  lr: 0.002494  min_lr: 0.002494  loss: 2.9169 (3.3789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (0.8160)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [120]  [2400/2502]  eta: 0:00:26  lr: 0.002493  min_lr: 0.002493  loss: 3.0018 (3.3788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8219 (0.8144)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [120]  [2501/2502]  eta: 0:00:00  lr: 0.002492  min_lr: 0.002492  loss: 2.9522 (3.3805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.8119)  time: 0.2304  data: 0.0007  max mem: 18975
Epoch: [120] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002492  min_lr: 0.002492  loss: 2.9522 (3.3993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.8119)
Test:  [ 0/50]  eta: 0:02:46  loss: 0.5871 (0.5871)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.3259  data: 3.1910  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9141 (0.8947)  acc1: 81.2000 (81.2364)  acc5: 95.6000 (96.0727)  time: 0.5360  data: 0.4154  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9054 (0.9038)  acc1: 80.8000 (80.8952)  acc5: 95.6000 (96.2095)  time: 0.2451  data: 0.1256  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0959 (1.0267)  acc1: 75.6000 (78.4516)  acc5: 92.4000 (94.6581)  time: 0.2357  data: 0.1156  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3255 (1.1063)  acc1: 70.0000 (76.3902)  acc5: 90.8000 (93.7366)  time: 0.2370  data: 0.1172  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3255 (1.1235)  acc1: 70.0000 (75.8000)  acc5: 91.2000 (93.5440)  time: 0.2138  data: 0.0947  max mem: 18975
Test: Total time: 0:00:14 (0.2912 s / it)
* Acc@1 75.964 Acc@5 93.470 loss 1.123
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 75.96%
Epoch: [121]  [   0/2502]  eta: 1:34:55  lr: 0.002492  min_lr: 0.002492  loss: 3.1526 (3.1526)  weight_decay: 0.0500 (0.0500)  time: 2.2763  data: 2.0135  max mem: 18975
Epoch: [121]  [ 200/2502]  eta: 0:10:21  lr: 0.002490  min_lr: 0.002490  loss: 3.7178 (3.4063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7757 (0.7747)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [121]  [ 400/2502]  eta: 0:09:15  lr: 0.002489  min_lr: 0.002489  loss: 3.3176 (3.3873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (0.7851)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [121]  [ 600/2502]  eta: 0:08:17  lr: 0.002488  min_lr: 0.002488  loss: 3.2356 (3.4042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7707 (0.7872)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [121]  [ 800/2502]  eta: 0:07:23  lr: 0.002486  min_lr: 0.002486  loss: 3.0278 (3.3932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7423 (0.7826)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [121]  [1000/2502]  eta: 0:06:30  lr: 0.002485  min_lr: 0.002485  loss: 3.2159 (3.3884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (0.7819)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [121]  [1200/2502]  eta: 0:05:37  lr: 0.002483  min_lr: 0.002483  loss: 3.2287 (3.3714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8035 (0.7937)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [121]  [1400/2502]  eta: 0:04:45  lr: 0.002482  min_lr: 0.002482  loss: 3.6057 (3.3808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.7953)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [121]  [1600/2502]  eta: 0:03:53  lr: 0.002480  min_lr: 0.002480  loss: 3.0961 (3.3799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.7957)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [121]  [1800/2502]  eta: 0:03:01  lr: 0.002479  min_lr: 0.002479  loss: 3.4890 (3.3812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8890 (0.8008)  time: 0.2572  data: 0.0005  max mem: 18975
Epoch: [121]  [2000/2502]  eta: 0:02:09  lr: 0.002478  min_lr: 0.002478  loss: 2.7487 (3.3818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7908 (0.8009)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [121]  [2200/2502]  eta: 0:01:18  lr: 0.002476  min_lr: 0.002476  loss: 3.8047 (3.3893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7898 (0.8024)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [121]  [2400/2502]  eta: 0:00:26  lr: 0.002475  min_lr: 0.002475  loss: 3.5257 (3.3900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.8039)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [121]  [2501/2502]  eta: 0:00:00  lr: 0.002474  min_lr: 0.002474  loss: 3.7439 (3.3918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7633 (0.8032)  time: 0.2310  data: 0.0010  max mem: 18975
Epoch: [121] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.002474  min_lr: 0.002474  loss: 3.7439 (3.3932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7633 (0.8032)
Test:  [ 0/50]  eta: 0:03:12  loss: 0.8746 (0.8746)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 3.8557  data: 3.7172  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 1.1608 (1.1292)  acc1: 78.8000 (80.6182)  acc5: 96.0000 (95.4909)  time: 0.6078  data: 0.4852  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 1.1510 (1.1457)  acc1: 78.8000 (80.0571)  acc5: 96.0000 (95.8667)  time: 0.2503  data: 0.1303  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2625 (1.2529)  acc1: 75.2000 (77.6774)  acc5: 92.8000 (94.2194)  time: 0.2168  data: 0.0977  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.5337 (1.3249)  acc1: 70.4000 (75.6781)  acc5: 90.4000 (93.2585)  time: 0.2210  data: 0.1021  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.5228 (1.3343)  acc1: 70.0000 (75.1520)  acc5: 90.4000 (93.1040)  time: 0.2062  data: 0.0875  max mem: 18975
Test: Total time: 0:00:15 (0.3023 s / it)
* Acc@1 75.320 Acc@5 93.288 loss 1.332
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.96%
Epoch: [122]  [   0/2502]  eta: 1:37:13  lr: 0.002474  min_lr: 0.002474  loss: 3.9994 (3.9994)  weight_decay: 0.0500 (0.0500)  time: 2.3317  data: 1.4902  max mem: 18975
Epoch: [122]  [ 200/2502]  eta: 0:10:18  lr: 0.002473  min_lr: 0.002473  loss: 3.9659 (3.3771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.8038)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [122]  [ 400/2502]  eta: 0:09:11  lr: 0.002471  min_lr: 0.002471  loss: 3.8736 (3.3894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8376 (0.8055)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [122]  [ 600/2502]  eta: 0:08:15  lr: 0.002470  min_lr: 0.002470  loss: 3.4926 (3.3946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8573 (0.8169)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [122]  [ 800/2502]  eta: 0:07:21  lr: 0.002468  min_lr: 0.002468  loss: 3.3233 (3.3922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.8173)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [122]  [1000/2502]  eta: 0:06:28  lr: 0.002467  min_lr: 0.002467  loss: 3.2591 (3.3823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7929 (0.8140)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [122]  [1200/2502]  eta: 0:05:36  lr: 0.002465  min_lr: 0.002465  loss: 3.8527 (3.3808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.8100)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [122]  [1400/2502]  eta: 0:04:44  lr: 0.002464  min_lr: 0.002464  loss: 3.1378 (3.3927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8602 (0.8090)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [122]  [1600/2502]  eta: 0:03:53  lr: 0.002463  min_lr: 0.002463  loss: 3.6347 (3.3839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.8093)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [122]  [1800/2502]  eta: 0:03:01  lr: 0.002461  min_lr: 0.002461  loss: 3.7125 (3.3993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.8059)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [122]  [2000/2502]  eta: 0:02:09  lr: 0.002460  min_lr: 0.002460  loss: 3.7577 (3.3917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7760 (0.8081)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [122]  [2200/2502]  eta: 0:01:17  lr: 0.002458  min_lr: 0.002458  loss: 3.6231 (3.4010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8461 (0.8101)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [122]  [2400/2502]  eta: 0:00:26  lr: 0.002457  min_lr: 0.002457  loss: 3.7230 (3.4052)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [122]  [2501/2502]  eta: 0:00:00  lr: 0.002456  min_lr: 0.002456  loss: 3.1438 (3.4048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7914 (nan)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [122] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002456  min_lr: 0.002456  loss: 3.1438 (3.3923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7914 (nan)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.5434 (0.5434)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.3617  data: 3.2159  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9045 (0.9064)  acc1: 80.8000 (81.6000)  acc5: 96.4000 (96.1091)  time: 0.5198  data: 0.3955  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9387 (0.9432)  acc1: 78.0000 (80.0762)  acc5: 96.4000 (96.2857)  time: 0.2321  data: 0.1115  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1625 (1.0855)  acc1: 73.6000 (77.5871)  acc5: 93.2000 (94.4129)  time: 0.2392  data: 0.1199  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4249 (1.1676)  acc1: 70.8000 (75.7073)  acc5: 90.0000 (93.4537)  time: 0.2330  data: 0.1137  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4193 (1.1923)  acc1: 70.0000 (75.0320)  acc5: 90.4000 (93.2880)  time: 0.2047  data: 0.0857  max mem: 18975
Test: Total time: 0:00:14 (0.2807 s / it)
* Acc@1 75.438 Acc@5 93.284 loss 1.192
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.96%
Epoch: [123]  [   0/2502]  eta: 1:38:07  lr: 0.002456  min_lr: 0.002456  loss: 3.7447 (3.7447)  weight_decay: 0.0500 (0.0500)  time: 2.3533  data: 1.3864  max mem: 18975
Epoch: [123]  [ 200/2502]  eta: 0:10:23  lr: 0.002455  min_lr: 0.002455  loss: 2.7312 (3.3616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8261 (0.7919)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [123]  [ 400/2502]  eta: 0:09:14  lr: 0.002453  min_lr: 0.002453  loss: 3.3679 (3.3628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8032 (0.7898)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [123]  [ 600/2502]  eta: 0:08:17  lr: 0.002452  min_lr: 0.002452  loss: 2.8673 (3.3594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7484 (0.7903)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [123]  [ 800/2502]  eta: 0:07:23  lr: 0.002450  min_lr: 0.002450  loss: 3.2438 (3.3708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9115 (0.8049)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [123]  [1000/2502]  eta: 0:06:30  lr: 0.002449  min_lr: 0.002449  loss: 3.5425 (3.3764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7646 (0.8033)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [123]  [1200/2502]  eta: 0:05:37  lr: 0.002447  min_lr: 0.002447  loss: 3.1503 (3.3711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7614 (0.8009)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [123]  [1400/2502]  eta: 0:04:45  lr: 0.002446  min_lr: 0.002446  loss: 3.7999 (3.3747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (0.8038)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [123]  [1600/2502]  eta: 0:03:53  lr: 0.002445  min_lr: 0.002445  loss: 3.3776 (3.3778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8067)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [123]  [1800/2502]  eta: 0:03:01  lr: 0.002443  min_lr: 0.002443  loss: 3.7601 (3.3767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7629 (0.8084)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [123]  [2000/2502]  eta: 0:02:09  lr: 0.002442  min_lr: 0.002442  loss: 3.3477 (3.3749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7732 (0.8116)  time: 0.2636  data: 0.0004  max mem: 18975
Epoch: [123]  [2200/2502]  eta: 0:01:17  lr: 0.002440  min_lr: 0.002440  loss: 3.7826 (3.3752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8508 (0.8108)  time: 0.2580  data: 0.0005  max mem: 18975
Epoch: [123]  [2400/2502]  eta: 0:00:26  lr: 0.002439  min_lr: 0.002439  loss: 3.8903 (3.3703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7874 (0.8113)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [123]  [2501/2502]  eta: 0:00:00  lr: 0.002438  min_lr: 0.002438  loss: 3.3005 (3.3750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7729 (0.8102)  time: 0.2364  data: 0.0009  max mem: 18975
Epoch: [123] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002438  min_lr: 0.002438  loss: 3.3005 (3.3874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7729 (0.8102)
Test:  [ 0/50]  eta: 0:03:12  loss: 0.6428 (0.6428)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 3.8505  data: 3.7072  max mem: 18975
Test:  [10/50]  eta: 0:00:25  loss: 0.9553 (0.9815)  acc1: 80.4000 (81.3818)  acc5: 95.2000 (95.6000)  time: 0.6281  data: 0.5051  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9704 (0.9920)  acc1: 79.6000 (80.9524)  acc5: 96.0000 (95.7905)  time: 0.2565  data: 0.1340  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2013 (1.1049)  acc1: 76.0000 (78.2968)  acc5: 93.2000 (94.1290)  time: 0.2214  data: 0.0997  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3819 (1.1776)  acc1: 70.4000 (76.1171)  acc5: 90.4000 (93.2390)  time: 0.2226  data: 0.1008  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3817 (1.1897)  acc1: 69.6000 (75.6080)  acc5: 90.4000 (93.1280)  time: 0.2223  data: 0.1007  max mem: 18975
Test: Total time: 0:00:14 (0.2921 s / it)
* Acc@1 75.642 Acc@5 93.178 loss 1.186
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.96%
Epoch: [124]  [   0/2502]  eta: 1:33:51  lr: 0.002438  min_lr: 0.002438  loss: 3.8134 (3.8134)  weight_decay: 0.0500 (0.0500)  time: 2.2508  data: 1.3898  max mem: 18975
Epoch: [124]  [ 200/2502]  eta: 0:10:19  lr: 0.002437  min_lr: 0.002437  loss: 3.7770 (3.3302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7659 (0.8117)  time: 0.2646  data: 0.0004  max mem: 18975
Epoch: [124]  [ 400/2502]  eta: 0:09:13  lr: 0.002435  min_lr: 0.002435  loss: 3.5311 (3.3240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.8053)  time: 0.2576  data: 0.0003  max mem: 18975
Epoch: [124]  [ 600/2502]  eta: 0:08:16  lr: 0.002434  min_lr: 0.002434  loss: 2.8229 (3.3305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7844 (0.8089)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [124]  [ 800/2502]  eta: 0:07:22  lr: 0.002432  min_lr: 0.002432  loss: 3.1987 (3.3590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8046 (0.8062)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [124]  [1000/2502]  eta: 0:06:29  lr: 0.002431  min_lr: 0.002431  loss: 2.9210 (3.3611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8306 (0.8064)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [124]  [1200/2502]  eta: 0:05:37  lr: 0.002429  min_lr: 0.002429  loss: 3.6442 (3.3796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8049)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [124]  [1400/2502]  eta: 0:04:45  lr: 0.002428  min_lr: 0.002428  loss: 3.4695 (3.3836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7500 (0.8022)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [124]  [1600/2502]  eta: 0:03:53  lr: 0.002427  min_lr: 0.002427  loss: 3.6551 (3.3786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8646 (0.8062)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [124]  [1800/2502]  eta: 0:03:01  lr: 0.002425  min_lr: 0.002425  loss: 3.6398 (3.3780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7705 (0.8032)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [124]  [2000/2502]  eta: 0:02:09  lr: 0.002424  min_lr: 0.002424  loss: 3.6453 (3.3748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7783 (0.8000)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [124]  [2200/2502]  eta: 0:01:17  lr: 0.002422  min_lr: 0.002422  loss: 3.6770 (3.3840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8078 (0.8015)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [124]  [2400/2502]  eta: 0:00:26  lr: 0.002421  min_lr: 0.002421  loss: 3.4930 (3.3851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (0.8030)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [124]  [2501/2502]  eta: 0:00:00  lr: 0.002420  min_lr: 0.002420  loss: 3.4221 (3.3823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.8043)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [124] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002420  min_lr: 0.002420  loss: 3.4221 (3.3916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.8043)
Test:  [ 0/50]  eta: 0:02:33  loss: 0.5737 (0.5737)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 3.0629  data: 2.9192  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.9041 (0.9268)  acc1: 83.2000 (81.9273)  acc5: 96.4000 (95.7455)  time: 0.5590  data: 0.4341  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9237 (0.9406)  acc1: 80.0000 (81.0667)  acc5: 96.4000 (96.0762)  time: 0.3006  data: 0.1794  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1346 (1.0782)  acc1: 74.0000 (78.0516)  acc5: 94.4000 (94.3097)  time: 0.2855  data: 0.1661  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4132 (1.1561)  acc1: 70.4000 (76.0781)  acc5: 89.6000 (93.3268)  time: 0.2335  data: 0.1129  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3804 (1.1714)  acc1: 70.8000 (75.6000)  acc5: 90.8000 (93.2400)  time: 0.2215  data: 0.1012  max mem: 18975
Test: Total time: 0:00:14 (0.2983 s / it)
* Acc@1 75.710 Acc@5 93.236 loss 1.167
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.96%
Epoch: [125]  [   0/2502]  eta: 1:35:37  lr: 0.002420  min_lr: 0.002420  loss: 3.1881 (3.1881)  weight_decay: 0.0500 (0.0500)  time: 2.2932  data: 1.9653  max mem: 18975
Epoch: [125]  [ 200/2502]  eta: 0:10:21  lr: 0.002419  min_lr: 0.002419  loss: 3.1614 (3.4299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8337 (0.8192)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [125]  [ 400/2502]  eta: 0:09:14  lr: 0.002417  min_lr: 0.002417  loss: 3.4840 (3.4245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8130 (0.8152)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [125]  [ 600/2502]  eta: 0:08:17  lr: 0.002416  min_lr: 0.002416  loss: 2.7516 (3.3893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8423 (0.8196)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [125]  [ 800/2502]  eta: 0:07:22  lr: 0.002414  min_lr: 0.002414  loss: 3.6352 (3.3943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.8212)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [125]  [1000/2502]  eta: 0:06:30  lr: 0.002413  min_lr: 0.002413  loss: 3.6617 (3.3805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7478 (0.8190)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [125]  [1200/2502]  eta: 0:05:37  lr: 0.002411  min_lr: 0.002411  loss: 3.3616 (3.3724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8573 (0.8154)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [125]  [1400/2502]  eta: 0:04:45  lr: 0.002410  min_lr: 0.002410  loss: 3.0329 (3.3677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.8176)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [125]  [1600/2502]  eta: 0:03:53  lr: 0.002408  min_lr: 0.002408  loss: 3.3140 (3.3709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7885 (0.8203)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [125]  [1800/2502]  eta: 0:03:01  lr: 0.002407  min_lr: 0.002407  loss: 3.6890 (3.3660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7904 (0.8165)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [125]  [2000/2502]  eta: 0:02:09  lr: 0.002405  min_lr: 0.002405  loss: 3.7416 (3.3757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8224 (0.8209)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [125]  [2200/2502]  eta: 0:01:18  lr: 0.002404  min_lr: 0.002404  loss: 3.1315 (3.3740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7795 (0.8188)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [125]  [2400/2502]  eta: 0:00:26  lr: 0.002403  min_lr: 0.002403  loss: 3.7132 (3.3764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8197 (0.8204)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [125]  [2501/2502]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 3.6131 (3.3789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (0.8187)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [125] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 3.6131 (3.3858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (0.8187)
Test:  [ 0/50]  eta: 0:02:08  loss: 0.6430 (0.6430)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 2.5645  data: 2.4137  max mem: 18975
Test:  [10/50]  eta: 0:00:16  loss: 0.9796 (0.9990)  acc1: 83.6000 (82.1091)  acc5: 96.4000 (96.1091)  time: 0.4044  data: 0.2821  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9929 (1.0087)  acc1: 78.8000 (81.0857)  acc5: 96.4000 (96.3238)  time: 0.2341  data: 0.1128  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1891 (1.1125)  acc1: 76.0000 (78.4516)  acc5: 93.2000 (94.7742)  time: 0.2876  data: 0.1666  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4326 (1.1946)  acc1: 71.2000 (76.2439)  acc5: 90.8000 (93.7854)  time: 0.2867  data: 0.1660  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4521 (1.2095)  acc1: 70.0000 (75.8160)  acc5: 91.2000 (93.6160)  time: 0.2116  data: 0.0910  max mem: 18975
Test: Total time: 0:00:14 (0.2886 s / it)
* Acc@1 75.764 Acc@5 93.486 loss 1.214
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.96%
Epoch: [126]  [   0/2502]  eta: 1:39:58  lr: 0.002402  min_lr: 0.002402  loss: 2.2918 (2.2918)  weight_decay: 0.0500 (0.0500)  time: 2.3973  data: 2.1258  max mem: 18975
Epoch: [126]  [ 200/2502]  eta: 0:10:21  lr: 0.002400  min_lr: 0.002400  loss: 3.7057 (3.4006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8094 (0.8064)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [126]  [ 400/2502]  eta: 0:09:13  lr: 0.002399  min_lr: 0.002399  loss: 3.2134 (3.3789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (0.8169)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [126]  [ 600/2502]  eta: 0:08:16  lr: 0.002397  min_lr: 0.002397  loss: 3.5089 (3.3567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7792 (0.8087)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [126]  [ 800/2502]  eta: 0:07:22  lr: 0.002396  min_lr: 0.002396  loss: 3.4528 (3.3468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (0.8046)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [126]  [1000/2502]  eta: 0:06:29  lr: 0.002395  min_lr: 0.002395  loss: 3.5148 (3.3614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8075 (0.8026)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [126]  [1200/2502]  eta: 0:05:37  lr: 0.002393  min_lr: 0.002393  loss: 3.3659 (3.3771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (0.7992)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [126]  [1400/2502]  eta: 0:04:45  lr: 0.002392  min_lr: 0.002392  loss: 3.3219 (3.3842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7839 (0.7977)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [126]  [1600/2502]  eta: 0:03:53  lr: 0.002390  min_lr: 0.002390  loss: 3.3065 (3.3925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.7970)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [126]  [1800/2502]  eta: 0:03:01  lr: 0.002389  min_lr: 0.002389  loss: 3.4782 (3.3879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7256 (0.7970)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [126]  [2000/2502]  eta: 0:02:09  lr: 0.002387  min_lr: 0.002387  loss: 3.4889 (3.3941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8587 (0.8016)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [126]  [2200/2502]  eta: 0:01:17  lr: 0.002386  min_lr: 0.002386  loss: 3.5683 (3.3953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8042)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [126]  [2400/2502]  eta: 0:00:26  lr: 0.002384  min_lr: 0.002384  loss: 3.7948 (3.3931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8327 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [126]  [2501/2502]  eta: 0:00:00  lr: 0.002384  min_lr: 0.002384  loss: 3.5029 (3.3939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (inf)  time: 0.2312  data: 0.0011  max mem: 18975
Epoch: [126] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002384  min_lr: 0.002384  loss: 3.5029 (3.3807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (inf)
Test:  [ 0/50]  eta: 0:03:14  loss: 0.6681 (0.6681)  acc1: 92.0000 (92.0000)  acc5: 98.4000 (98.4000)  time: 3.8970  data: 3.7554  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 0.9684 (0.9537)  acc1: 81.2000 (81.7818)  acc5: 96.8000 (96.1091)  time: 0.5776  data: 0.4556  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9617 (0.9631)  acc1: 80.0000 (80.7810)  acc5: 96.4000 (96.0191)  time: 0.2674  data: 0.1477  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1725 (1.0840)  acc1: 74.8000 (78.0774)  acc5: 92.8000 (94.4000)  time: 0.2834  data: 0.1633  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3713 (1.1613)  acc1: 70.8000 (76.1659)  acc5: 90.0000 (93.3951)  time: 0.2242  data: 0.1043  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3713 (1.1798)  acc1: 70.8000 (75.5040)  acc5: 90.4000 (93.3040)  time: 0.2221  data: 0.1032  max mem: 18975
Test: Total time: 0:00:14 (0.2981 s / it)
* Acc@1 75.592 Acc@5 93.388 loss 1.178
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.96%
Epoch: [127]  [   0/2502]  eta: 1:28:51  lr: 0.002384  min_lr: 0.002384  loss: 3.5783 (3.5783)  weight_decay: 0.0500 (0.0500)  time: 2.1308  data: 1.3768  max mem: 18975
Epoch: [127]  [ 200/2502]  eta: 0:10:20  lr: 0.002382  min_lr: 0.002382  loss: 3.5861 (3.3129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8066 (0.8383)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [127]  [ 400/2502]  eta: 0:09:14  lr: 0.002381  min_lr: 0.002381  loss: 3.8866 (3.3951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7896 (0.8253)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [127]  [ 600/2502]  eta: 0:08:17  lr: 0.002379  min_lr: 0.002379  loss: 3.5505 (3.4086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.8218)  time: 0.2577  data: 0.0005  max mem: 18975
Epoch: [127]  [ 800/2502]  eta: 0:07:23  lr: 0.002378  min_lr: 0.002378  loss: 3.6574 (3.4046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8147)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [127]  [1000/2502]  eta: 0:06:30  lr: 0.002376  min_lr: 0.002376  loss: 3.0212 (3.3765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8243 (0.8202)  time: 0.2578  data: 0.0003  max mem: 18975
Epoch: [127]  [1200/2502]  eta: 0:05:38  lr: 0.002375  min_lr: 0.002375  loss: 3.6691 (3.3832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7795 (0.8140)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [127]  [1400/2502]  eta: 0:04:45  lr: 0.002373  min_lr: 0.002373  loss: 3.3095 (3.3823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8658 (0.8145)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [127]  [1600/2502]  eta: 0:03:53  lr: 0.002372  min_lr: 0.002372  loss: 3.5711 (3.3815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7839 (0.8161)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [127]  [1800/2502]  eta: 0:03:01  lr: 0.002370  min_lr: 0.002370  loss: 2.8880 (3.3804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8149)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [127]  [2000/2502]  eta: 0:02:09  lr: 0.002369  min_lr: 0.002369  loss: 3.6662 (3.3828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7925 (0.8134)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [127]  [2200/2502]  eta: 0:01:18  lr: 0.002367  min_lr: 0.002367  loss: 3.4766 (3.3831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8157)  time: 0.2610  data: 0.0004  max mem: 18975
Epoch: [127]  [2400/2502]  eta: 0:00:26  lr: 0.002366  min_lr: 0.002366  loss: 3.7868 (3.3816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8268 (0.8156)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [127]  [2501/2502]  eta: 0:00:00  lr: 0.002365  min_lr: 0.002365  loss: 3.7396 (3.3880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (0.8143)  time: 0.2310  data: 0.0012  max mem: 18975
Epoch: [127] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.002365  min_lr: 0.002365  loss: 3.7396 (3.3809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (0.8143)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.6428 (0.6428)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.3043  data: 3.1609  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9885 (0.9989)  acc1: 81.6000 (81.8909)  acc5: 96.8000 (96.3273)  time: 0.5328  data: 0.4041  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9885 (0.9957)  acc1: 81.6000 (81.4667)  acc5: 96.0000 (96.1714)  time: 0.2499  data: 0.1260  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2323 (1.1198)  acc1: 75.2000 (78.6452)  acc5: 93.2000 (94.7742)  time: 0.2708  data: 0.1493  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4301 (1.2048)  acc1: 71.2000 (76.5171)  acc5: 91.2000 (93.7268)  time: 0.2808  data: 0.1600  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4301 (1.2226)  acc1: 70.8000 (75.9840)  acc5: 91.2000 (93.5040)  time: 0.2149  data: 0.0960  max mem: 18975
Test: Total time: 0:00:15 (0.3113 s / it)
* Acc@1 75.882 Acc@5 93.362 loss 1.226
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.96%
Epoch: [128]  [   0/2502]  eta: 1:23:27  lr: 0.002365  min_lr: 0.002365  loss: 2.5042 (2.5042)  weight_decay: 0.0500 (0.0500)  time: 2.0012  data: 1.5924  max mem: 18975
Epoch: [128]  [ 200/2502]  eta: 0:10:19  lr: 0.002364  min_lr: 0.002364  loss: 3.5448 (3.3709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7779 (0.8022)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [128]  [ 400/2502]  eta: 0:09:12  lr: 0.002362  min_lr: 0.002362  loss: 3.6389 (3.3889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.8056)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [128]  [ 600/2502]  eta: 0:08:16  lr: 0.002361  min_lr: 0.002361  loss: 2.9167 (3.3481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7945 (0.8130)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [128]  [ 800/2502]  eta: 0:07:22  lr: 0.002359  min_lr: 0.002359  loss: 3.3982 (3.3688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7851 (0.8153)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [128]  [1000/2502]  eta: 0:06:29  lr: 0.002358  min_lr: 0.002358  loss: 3.0877 (3.3594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8351 (0.8163)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [128]  [1200/2502]  eta: 0:05:37  lr: 0.002356  min_lr: 0.002356  loss: 3.3373 (3.3632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8913 (0.8231)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [128]  [1400/2502]  eta: 0:04:45  lr: 0.002355  min_lr: 0.002355  loss: 3.4058 (3.3625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.8198)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [128]  [1600/2502]  eta: 0:03:53  lr: 0.002353  min_lr: 0.002353  loss: 3.0135 (3.3697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7894 (0.8184)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [128]  [1800/2502]  eta: 0:03:01  lr: 0.002352  min_lr: 0.002352  loss: 3.4706 (3.3782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7891 (0.8206)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [128]  [2000/2502]  eta: 0:02:09  lr: 0.002351  min_lr: 0.002351  loss: 3.6176 (3.3767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7998 (0.8247)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [128]  [2200/2502]  eta: 0:01:17  lr: 0.002349  min_lr: 0.002349  loss: 3.3160 (3.3806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7612 (0.8211)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [128]  [2400/2502]  eta: 0:00:26  lr: 0.002348  min_lr: 0.002348  loss: 3.3436 (3.3808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9074 (0.8225)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [128]  [2501/2502]  eta: 0:00:00  lr: 0.002347  min_lr: 0.002347  loss: 3.7788 (3.3842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8415 (0.8228)  time: 0.2307  data: 0.0008  max mem: 18975
Epoch: [128] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.002347  min_lr: 0.002347  loss: 3.7788 (3.3832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8415 (0.8228)
Test:  [ 0/50]  eta: 0:03:00  loss: 0.8190 (0.8190)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 3.6163  data: 3.4668  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9893 (1.0113)  acc1: 81.6000 (80.6182)  acc5: 96.8000 (96.2182)  time: 0.5122  data: 0.3891  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9893 (1.0138)  acc1: 80.0000 (80.7429)  acc5: 96.4000 (96.1905)  time: 0.2463  data: 0.1264  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.2212 (1.1357)  acc1: 75.6000 (78.1161)  acc5: 93.6000 (94.5032)  time: 0.2694  data: 0.1501  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4717 (1.2230)  acc1: 70.8000 (76.2146)  acc5: 90.0000 (93.4732)  time: 0.2643  data: 0.1452  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.4698 (1.2413)  acc1: 70.8000 (75.6960)  acc5: 90.8000 (93.2720)  time: 0.2179  data: 0.0990  max mem: 18975
Test: Total time: 0:00:15 (0.3071 s / it)
* Acc@1 75.932 Acc@5 93.258 loss 1.238
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.96%
Epoch: [129]  [   0/2502]  eta: 1:39:53  lr: 0.002347  min_lr: 0.002347  loss: 3.5160 (3.5160)  weight_decay: 0.0500 (0.0500)  time: 2.3955  data: 2.1116  max mem: 18975
Epoch: [129]  [ 200/2502]  eta: 0:10:19  lr: 0.002345  min_lr: 0.002345  loss: 3.4414 (3.4384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8183)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [129]  [ 400/2502]  eta: 0:09:13  lr: 0.002344  min_lr: 0.002344  loss: 3.5621 (3.4466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7623 (0.7941)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [129]  [ 600/2502]  eta: 0:08:18  lr: 0.002342  min_lr: 0.002342  loss: 3.0149 (3.4343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7638 (0.7889)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [129]  [ 800/2502]  eta: 0:07:24  lr: 0.002341  min_lr: 0.002341  loss: 3.7484 (3.4287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7874 (0.7944)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [129]  [1000/2502]  eta: 0:06:30  lr: 0.002339  min_lr: 0.002339  loss: 3.7735 (3.4225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8616 (0.8013)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [129]  [1200/2502]  eta: 0:05:37  lr: 0.002338  min_lr: 0.002338  loss: 3.6919 (3.4235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8098 (0.8100)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [129]  [1400/2502]  eta: 0:04:45  lr: 0.002336  min_lr: 0.002336  loss: 3.1300 (3.4134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8074)  time: 0.2570  data: 0.0005  max mem: 18975
Epoch: [129]  [1600/2502]  eta: 0:03:53  lr: 0.002335  min_lr: 0.002335  loss: 3.4754 (3.4133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8319 (0.8049)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [129]  [1800/2502]  eta: 0:03:01  lr: 0.002334  min_lr: 0.002334  loss: 3.5052 (3.3992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.8127)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [129]  [2000/2502]  eta: 0:02:09  lr: 0.002332  min_lr: 0.002332  loss: 3.7027 (3.4009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7520 (0.8126)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [129]  [2200/2502]  eta: 0:01:18  lr: 0.002331  min_lr: 0.002331  loss: 3.4365 (3.3983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7966 (0.8115)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [129]  [2400/2502]  eta: 0:00:26  lr: 0.002329  min_lr: 0.002329  loss: 3.2745 (3.4010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7479 (0.8115)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [129]  [2501/2502]  eta: 0:00:00  lr: 0.002328  min_lr: 0.002328  loss: 3.3560 (3.4029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7799 (0.8111)  time: 0.2309  data: 0.0007  max mem: 18975
Epoch: [129] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.002328  min_lr: 0.002328  loss: 3.3560 (3.3739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7799 (0.8111)
Test:  [ 0/50]  eta: 0:02:36  loss: 0.6193 (0.6193)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 3.1242  data: 2.9774  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9016 (0.9482)  acc1: 82.4000 (81.0182)  acc5: 96.8000 (95.9636)  time: 0.4848  data: 0.3631  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9260 (0.9477)  acc1: 78.4000 (80.2476)  acc5: 96.4000 (96.0571)  time: 0.2264  data: 0.1068  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1244 (1.0718)  acc1: 74.4000 (78.0000)  acc5: 93.6000 (94.4387)  time: 0.2479  data: 0.1284  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3528 (1.1409)  acc1: 72.0000 (76.1756)  acc5: 90.4000 (93.6000)  time: 0.2433  data: 0.1243  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3491 (1.1597)  acc1: 70.8000 (75.7600)  acc5: 90.0000 (93.3600)  time: 0.1921  data: 0.0731  max mem: 18975
Test: Total time: 0:00:14 (0.2825 s / it)
* Acc@1 75.924 Acc@5 93.422 loss 1.151
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.96%
Epoch: [130]  [   0/2502]  eta: 1:45:52  lr: 0.002328  min_lr: 0.002328  loss: 3.8559 (3.8559)  weight_decay: 0.0500 (0.0500)  time: 2.5390  data: 1.3830  max mem: 18975
Epoch: [130]  [ 200/2502]  eta: 0:10:27  lr: 0.002327  min_lr: 0.002327  loss: 3.7680 (3.3060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8640 (0.8479)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [130]  [ 400/2502]  eta: 0:09:18  lr: 0.002325  min_lr: 0.002325  loss: 3.4298 (3.3175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.8355)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [130]  [ 600/2502]  eta: 0:08:19  lr: 0.002324  min_lr: 0.002324  loss: 3.4499 (3.3501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8433 (0.8353)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [130]  [ 800/2502]  eta: 0:07:24  lr: 0.002322  min_lr: 0.002322  loss: 3.5903 (3.3531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.8406)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [130]  [1000/2502]  eta: 0:06:31  lr: 0.002321  min_lr: 0.002321  loss: 3.8731 (3.3560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8063 (0.8401)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [130]  [1200/2502]  eta: 0:05:38  lr: 0.002319  min_lr: 0.002319  loss: 3.0307 (3.3548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7499 (0.8310)  time: 0.2581  data: 0.0005  max mem: 18975
Epoch: [130]  [1400/2502]  eta: 0:04:46  lr: 0.002318  min_lr: 0.002318  loss: 3.3048 (3.3721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8986 (0.8352)  time: 0.2563  data: 0.0005  max mem: 18975
Epoch: [130]  [1600/2502]  eta: 0:03:53  lr: 0.002316  min_lr: 0.002316  loss: 3.2800 (3.3795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8191 (0.8343)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [130]  [1800/2502]  eta: 0:03:01  lr: 0.002315  min_lr: 0.002315  loss: 3.3800 (3.3752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8558 (0.8371)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [130]  [2000/2502]  eta: 0:02:09  lr: 0.002313  min_lr: 0.002313  loss: 2.8277 (3.3778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8269 (inf)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [130]  [2200/2502]  eta: 0:01:18  lr: 0.002312  min_lr: 0.002312  loss: 3.7093 (3.3725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7768 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [130]  [2400/2502]  eta: 0:00:26  lr: 0.002311  min_lr: 0.002311  loss: 3.0683 (3.3732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7353 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [130]  [2501/2502]  eta: 0:00:00  lr: 0.002310  min_lr: 0.002310  loss: 3.3707 (3.3707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9053 (inf)  time: 0.2337  data: 0.0007  max mem: 18975
Epoch: [130] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.002310  min_lr: 0.002310  loss: 3.3707 (3.3656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9053 (inf)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.6839 (0.6839)  acc1: 92.4000 (92.4000)  acc5: 98.0000 (98.0000)  time: 3.2426  data: 3.1026  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9509 (0.9487)  acc1: 81.6000 (81.8182)  acc5: 96.0000 (95.7818)  time: 0.4833  data: 0.3613  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9453 (0.9469)  acc1: 80.0000 (81.2952)  acc5: 96.0000 (96.0762)  time: 0.2058  data: 0.0858  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1136 (1.0773)  acc1: 76.0000 (78.8258)  acc5: 93.6000 (94.3484)  time: 0.2403  data: 0.1196  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3908 (1.1442)  acc1: 73.2000 (76.8878)  acc5: 90.4000 (93.6390)  time: 0.2488  data: 0.1284  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3415 (1.1652)  acc1: 69.6000 (76.2960)  acc5: 91.6000 (93.4640)  time: 0.2011  data: 0.0819  max mem: 18975
Test: Total time: 0:00:14 (0.2823 s / it)
* Acc@1 76.156 Acc@5 93.414 loss 1.163
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.16%
Epoch: [131]  [   0/2502]  eta: 1:35:20  lr: 0.002310  min_lr: 0.002310  loss: 2.6510 (2.6510)  weight_decay: 0.0500 (0.0500)  time: 2.2865  data: 2.0092  max mem: 18975
Epoch: [131]  [ 200/2502]  eta: 0:10:19  lr: 0.002308  min_lr: 0.002308  loss: 3.3397 (3.3678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.8473)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [131]  [ 400/2502]  eta: 0:09:12  lr: 0.002307  min_lr: 0.002307  loss: 3.5254 (3.3934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7822 (0.8213)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [131]  [ 600/2502]  eta: 0:08:16  lr: 0.002305  min_lr: 0.002305  loss: 3.2130 (3.4031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (0.8247)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [131]  [ 800/2502]  eta: 0:07:22  lr: 0.002304  min_lr: 0.002304  loss: 3.3648 (3.3842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7566 (0.8242)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [131]  [1000/2502]  eta: 0:06:29  lr: 0.002302  min_lr: 0.002302  loss: 3.4477 (3.3969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8068 (0.8331)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [131]  [1200/2502]  eta: 0:05:37  lr: 0.002301  min_lr: 0.002301  loss: 3.7252 (3.3902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7824 (0.8284)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [131]  [1400/2502]  eta: 0:04:45  lr: 0.002299  min_lr: 0.002299  loss: 3.1028 (3.3804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8350 (0.8257)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [131]  [1600/2502]  eta: 0:03:53  lr: 0.002298  min_lr: 0.002298  loss: 3.5141 (3.3732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.8219)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [131]  [1800/2502]  eta: 0:03:01  lr: 0.002296  min_lr: 0.002296  loss: 3.3837 (3.3674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7779 (0.8255)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [131]  [2000/2502]  eta: 0:02:09  lr: 0.002295  min_lr: 0.002295  loss: 3.4387 (3.3733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8111 (0.8276)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [131]  [2200/2502]  eta: 0:01:17  lr: 0.002293  min_lr: 0.002293  loss: 3.5286 (3.3671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (0.8259)  time: 0.2579  data: 0.0005  max mem: 18975
Epoch: [131]  [2400/2502]  eta: 0:00:26  lr: 0.002292  min_lr: 0.002292  loss: 3.2674 (3.3681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7937 (0.8246)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [131]  [2501/2502]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 3.2378 (3.3672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.8246)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [131] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 3.2378 (3.3727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.8246)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.5310 (0.5310)  acc1: 92.4000 (92.4000)  acc5: 98.4000 (98.4000)  time: 2.7100  data: 2.5720  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.8790 (0.9227)  acc1: 81.2000 (81.7091)  acc5: 96.4000 (95.9273)  time: 0.4705  data: 0.3497  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9125 (0.9270)  acc1: 80.4000 (81.2191)  acc5: 96.0000 (96.0000)  time: 0.2575  data: 0.1384  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0024 (1.0458)  acc1: 75.2000 (78.7226)  acc5: 94.4000 (94.6323)  time: 0.2752  data: 0.1561  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3716 (1.1241)  acc1: 71.6000 (76.8098)  acc5: 90.4000 (93.6390)  time: 0.2694  data: 0.1488  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3338 (1.1426)  acc1: 71.2000 (76.2240)  acc5: 91.2000 (93.5520)  time: 0.2066  data: 0.0856  max mem: 18975
Test: Total time: 0:00:14 (0.2962 s / it)
* Acc@1 76.210 Acc@5 93.526 loss 1.143
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.21%
Epoch: [132]  [   0/2502]  eta: 1:30:31  lr: 0.002291  min_lr: 0.002291  loss: 4.6257 (4.6257)  weight_decay: 0.0500 (0.0500)  time: 2.1708  data: 1.9022  max mem: 18975
Epoch: [132]  [ 200/2502]  eta: 0:10:20  lr: 0.002290  min_lr: 0.002290  loss: 3.6087 (3.3884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8128)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [132]  [ 400/2502]  eta: 0:09:13  lr: 0.002288  min_lr: 0.002288  loss: 3.4034 (3.3633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9072 (0.8462)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [132]  [ 600/2502]  eta: 0:08:16  lr: 0.002287  min_lr: 0.002287  loss: 3.7351 (3.3593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7334 (0.8310)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [132]  [ 800/2502]  eta: 0:07:22  lr: 0.002285  min_lr: 0.002285  loss: 2.8479 (3.3498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8901 (0.8378)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [132]  [1000/2502]  eta: 0:06:29  lr: 0.002284  min_lr: 0.002284  loss: 3.2855 (3.3539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (0.8444)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [132]  [1200/2502]  eta: 0:05:37  lr: 0.002282  min_lr: 0.002282  loss: 3.7583 (3.3633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7492 (0.8378)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [132]  [1400/2502]  eta: 0:04:45  lr: 0.002281  min_lr: 0.002281  loss: 3.3428 (3.3586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8300 (0.8335)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [132]  [1600/2502]  eta: 0:03:53  lr: 0.002279  min_lr: 0.002279  loss: 3.5776 (3.3603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8046 (0.8296)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [132]  [1800/2502]  eta: 0:03:01  lr: 0.002278  min_lr: 0.002278  loss: 3.5763 (3.3599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8624 (0.8315)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [132]  [2000/2502]  eta: 0:02:09  lr: 0.002276  min_lr: 0.002276  loss: 3.6589 (3.3665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8852 (0.8324)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [132]  [2200/2502]  eta: 0:01:17  lr: 0.002275  min_lr: 0.002275  loss: 2.9696 (3.3706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8051 (0.8330)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [132]  [2400/2502]  eta: 0:00:26  lr: 0.002273  min_lr: 0.002273  loss: 3.5285 (3.3737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7450 (0.8312)  time: 0.2616  data: 0.0004  max mem: 18975
Epoch: [132]  [2501/2502]  eta: 0:00:00  lr: 0.002272  min_lr: 0.002272  loss: 3.4745 (3.3754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8094 (0.8297)  time: 0.2303  data: 0.0008  max mem: 18975
Epoch: [132] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002272  min_lr: 0.002272  loss: 3.4745 (3.3738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8094 (0.8297)
Test:  [ 0/50]  eta: 0:03:11  loss: 0.6289 (0.6289)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 3.8312  data: 3.6873  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.9163 (0.9380)  acc1: 82.8000 (82.2909)  acc5: 96.4000 (96.5091)  time: 0.6070  data: 0.4847  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9389 (0.9605)  acc1: 80.0000 (81.2000)  acc5: 96.4000 (96.4381)  time: 0.2706  data: 0.1487  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1422 (1.0759)  acc1: 74.8000 (78.8129)  acc5: 94.0000 (94.9419)  time: 0.2627  data: 0.1414  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3649 (1.1506)  acc1: 72.4000 (77.0439)  acc5: 90.8000 (94.0000)  time: 0.2247  data: 0.1058  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3512 (1.1662)  acc1: 71.6000 (76.2960)  acc5: 91.2000 (93.8480)  time: 0.2244  data: 0.1057  max mem: 18975
Test: Total time: 0:00:14 (0.2980 s / it)
* Acc@1 76.212 Acc@5 93.714 loss 1.166
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.21%
Epoch: [133]  [   0/2502]  eta: 1:22:20  lr: 0.002272  min_lr: 0.002272  loss: 2.7785 (2.7785)  weight_decay: 0.0500 (0.0500)  time: 1.9747  data: 1.7136  max mem: 18975
Epoch: [133]  [ 200/2502]  eta: 0:10:18  lr: 0.002271  min_lr: 0.002271  loss: 3.1853 (3.3057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8003 (0.8478)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [133]  [ 400/2502]  eta: 0:09:12  lr: 0.002269  min_lr: 0.002269  loss: 2.9264 (3.3270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8342 (0.8518)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [133]  [ 600/2502]  eta: 0:08:15  lr: 0.002268  min_lr: 0.002268  loss: 3.6836 (3.3473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8278 (0.8437)  time: 0.2629  data: 0.0003  max mem: 18975
Epoch: [133]  [ 800/2502]  eta: 0:07:21  lr: 0.002266  min_lr: 0.002266  loss: 2.7339 (3.3494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7989 (0.8431)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [133]  [1000/2502]  eta: 0:06:29  lr: 0.002265  min_lr: 0.002265  loss: 3.5366 (3.3402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.8337)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [133]  [1200/2502]  eta: 0:05:36  lr: 0.002263  min_lr: 0.002263  loss: 3.7332 (3.3542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8938 (0.8331)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [133]  [1400/2502]  eta: 0:04:44  lr: 0.002262  min_lr: 0.002262  loss: 3.4941 (3.3574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8956 (0.8352)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [133]  [1600/2502]  eta: 0:03:52  lr: 0.002260  min_lr: 0.002260  loss: 3.0888 (3.3540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8488 (0.8356)  time: 0.2583  data: 0.0003  max mem: 18975
Epoch: [133]  [1800/2502]  eta: 0:03:01  lr: 0.002259  min_lr: 0.002259  loss: 3.4048 (3.3443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8464 (0.8384)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [133]  [2000/2502]  eta: 0:02:09  lr: 0.002257  min_lr: 0.002257  loss: 3.3278 (3.3457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7771 (0.8378)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [133]  [2200/2502]  eta: 0:01:17  lr: 0.002256  min_lr: 0.002256  loss: 3.4512 (3.3440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7319 (0.8346)  time: 0.2575  data: 0.0005  max mem: 18975
Epoch: [133]  [2400/2502]  eta: 0:00:26  lr: 0.002254  min_lr: 0.002254  loss: 3.1114 (3.3442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8175 (0.8348)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [133]  [2501/2502]  eta: 0:00:00  lr: 0.002254  min_lr: 0.002254  loss: 3.7596 (3.3483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7977 (0.8341)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [133] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002254  min_lr: 0.002254  loss: 3.7596 (3.3606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7977 (0.8341)
Test:  [ 0/50]  eta: 0:02:14  loss: 0.6332 (0.6332)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 2.6907  data: 2.5438  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.9702 (1.0141)  acc1: 82.8000 (81.6000)  acc5: 97.2000 (96.4727)  time: 0.4629  data: 0.3394  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9975 (1.0315)  acc1: 80.4000 (80.9333)  acc5: 96.8000 (96.4952)  time: 0.2635  data: 0.1420  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2694 (1.1432)  acc1: 74.8000 (78.3871)  acc5: 93.6000 (94.8387)  time: 0.2876  data: 0.1665  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3960 (1.2113)  acc1: 70.8000 (76.5854)  acc5: 90.8000 (93.7854)  time: 0.2514  data: 0.1318  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3785 (1.2164)  acc1: 70.8000 (76.1920)  acc5: 90.8000 (93.6640)  time: 0.1913  data: 0.0723  max mem: 18975
Test: Total time: 0:00:14 (0.2927 s / it)
* Acc@1 76.262 Acc@5 93.520 loss 1.225
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.26%
Epoch: [134]  [   0/2502]  eta: 1:11:13  lr: 0.002254  min_lr: 0.002254  loss: 2.6213 (2.6213)  weight_decay: 0.0500 (0.0500)  time: 1.7082  data: 1.4389  max mem: 18975
Epoch: [134]  [ 200/2502]  eta: 0:10:17  lr: 0.002252  min_lr: 0.002252  loss: 3.7585 (3.3449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7964 (0.8235)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [134]  [ 400/2502]  eta: 0:09:12  lr: 0.002251  min_lr: 0.002251  loss: 3.7428 (3.3994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8178 (0.8164)  time: 0.2559  data: 0.0005  max mem: 18975
Epoch: [134]  [ 600/2502]  eta: 0:08:16  lr: 0.002249  min_lr: 0.002249  loss: 3.4041 (3.3718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8403 (0.8192)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [134]  [ 800/2502]  eta: 0:07:22  lr: 0.002248  min_lr: 0.002248  loss: 3.9137 (3.3926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8153 (0.8284)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [134]  [1000/2502]  eta: 0:06:29  lr: 0.002246  min_lr: 0.002246  loss: 3.4215 (3.3823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (0.8207)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [134]  [1200/2502]  eta: 0:05:37  lr: 0.002245  min_lr: 0.002245  loss: 3.7525 (3.3792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7877 (0.8207)  time: 0.2639  data: 0.0004  max mem: 18975
Epoch: [134]  [1400/2502]  eta: 0:04:45  lr: 0.002243  min_lr: 0.002243  loss: 3.6786 (3.3773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.8219)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [134]  [1600/2502]  eta: 0:03:53  lr: 0.002242  min_lr: 0.002242  loss: 3.5040 (3.3722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9165 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [134]  [1800/2502]  eta: 0:03:01  lr: 0.002240  min_lr: 0.002240  loss: 3.3233 (3.3728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [134]  [2000/2502]  eta: 0:02:09  lr: 0.002239  min_lr: 0.002239  loss: 3.3298 (3.3691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7973 (inf)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [134]  [2200/2502]  eta: 0:01:18  lr: 0.002237  min_lr: 0.002237  loss: 3.5216 (3.3695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8417 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [134]  [2400/2502]  eta: 0:00:26  lr: 0.002236  min_lr: 0.002236  loss: 3.1342 (3.3659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (inf)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [134]  [2501/2502]  eta: 0:00:00  lr: 0.002235  min_lr: 0.002235  loss: 3.3300 (3.3609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7824 (inf)  time: 0.2334  data: 0.0009  max mem: 18975
Epoch: [134] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002235  min_lr: 0.002235  loss: 3.3300 (3.3562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7824 (inf)
Test:  [ 0/50]  eta: 0:02:11  loss: 0.7574 (0.7574)  acc1: 91.2000 (91.2000)  acc5: 98.4000 (98.4000)  time: 2.6355  data: 2.4970  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 1.0043 (0.9980)  acc1: 82.4000 (82.1455)  acc5: 96.4000 (96.0364)  time: 0.4492  data: 0.3278  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 1.0065 (1.0110)  acc1: 80.4000 (81.0667)  acc5: 96.4000 (96.1143)  time: 0.2492  data: 0.1287  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1602 (1.1248)  acc1: 76.4000 (78.1548)  acc5: 93.2000 (94.5161)  time: 0.2734  data: 0.1531  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3765 (1.1865)  acc1: 70.4000 (76.5366)  acc5: 90.8000 (93.7366)  time: 0.2731  data: 0.1521  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3455 (1.2024)  acc1: 71.6000 (75.7760)  acc5: 91.2000 (93.4960)  time: 0.2104  data: 0.0895  max mem: 18975
Test: Total time: 0:00:14 (0.2926 s / it)
* Acc@1 76.048 Acc@5 93.568 loss 1.197
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.26%
Epoch: [135]  [   0/2502]  eta: 1:36:08  lr: 0.002235  min_lr: 0.002235  loss: 3.8185 (3.8185)  weight_decay: 0.0500 (0.0500)  time: 2.3055  data: 2.0326  max mem: 18975
Epoch: [135]  [ 200/2502]  eta: 0:10:22  lr: 0.002233  min_lr: 0.002233  loss: 3.3853 (3.3335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9578 (0.9033)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [135]  [ 400/2502]  eta: 0:09:14  lr: 0.002232  min_lr: 0.002232  loss: 3.0611 (3.2708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8848 (0.8799)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [135]  [ 600/2502]  eta: 0:08:17  lr: 0.002230  min_lr: 0.002230  loss: 3.5198 (3.3043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8030 (0.8556)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [135]  [ 800/2502]  eta: 0:07:22  lr: 0.002229  min_lr: 0.002229  loss: 3.2647 (3.3129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8395 (0.8556)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [135]  [1000/2502]  eta: 0:06:29  lr: 0.002227  min_lr: 0.002227  loss: 3.7024 (3.3258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8319 (0.8532)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [135]  [1200/2502]  eta: 0:05:37  lr: 0.002226  min_lr: 0.002226  loss: 3.3796 (3.3336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8166 (0.8494)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [135]  [1400/2502]  eta: 0:04:44  lr: 0.002224  min_lr: 0.002224  loss: 3.0841 (3.3193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.8481)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [135]  [1600/2502]  eta: 0:03:53  lr: 0.002223  min_lr: 0.002223  loss: 3.3916 (3.3296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.8422)  time: 0.2638  data: 0.0004  max mem: 18975
Epoch: [135]  [1800/2502]  eta: 0:03:01  lr: 0.002221  min_lr: 0.002221  loss: 3.5638 (3.3341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8250 (0.8461)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [135]  [2000/2502]  eta: 0:02:09  lr: 0.002220  min_lr: 0.002220  loss: 3.7282 (3.3387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.8458)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [135]  [2200/2502]  eta: 0:01:17  lr: 0.002218  min_lr: 0.002218  loss: 3.0800 (3.3388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8815 (0.8511)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [135]  [2400/2502]  eta: 0:00:26  lr: 0.002217  min_lr: 0.002217  loss: 3.3109 (3.3378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8876 (0.8523)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [135]  [2501/2502]  eta: 0:00:00  lr: 0.002216  min_lr: 0.002216  loss: 3.4317 (3.3403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7906 (0.8528)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [135] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.002216  min_lr: 0.002216  loss: 3.4317 (3.3479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7906 (0.8528)
Test:  [ 0/50]  eta: 0:02:44  loss: 0.5392 (0.5392)  acc1: 92.4000 (92.4000)  acc5: 99.2000 (99.2000)  time: 3.2811  data: 3.1376  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9747 (0.9305)  acc1: 80.8000 (81.5636)  acc5: 96.0000 (96.3273)  time: 0.5214  data: 0.4000  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9882 (0.9670)  acc1: 80.4000 (80.6857)  acc5: 96.0000 (96.1333)  time: 0.2203  data: 0.1005  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1172 (1.0701)  acc1: 74.8000 (78.2452)  acc5: 93.6000 (94.7226)  time: 0.2376  data: 0.1180  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3383 (1.1390)  acc1: 70.8000 (76.4878)  acc5: 90.8000 (93.7951)  time: 0.2862  data: 0.1673  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3281 (1.1552)  acc1: 70.4000 (75.9200)  acc5: 91.6000 (93.6160)  time: 0.2305  data: 0.1118  max mem: 18975
Test: Total time: 0:00:15 (0.3017 s / it)
* Acc@1 76.108 Acc@5 93.602 loss 1.154
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.26%
Epoch: [136]  [   0/2502]  eta: 1:34:08  lr: 0.002216  min_lr: 0.002216  loss: 2.8448 (2.8448)  weight_decay: 0.0500 (0.0500)  time: 2.2576  data: 1.4153  max mem: 18975
Epoch: [136]  [ 200/2502]  eta: 0:10:20  lr: 0.002214  min_lr: 0.002214  loss: 2.9803 (3.3161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7628 (0.8229)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [136]  [ 400/2502]  eta: 0:09:13  lr: 0.002213  min_lr: 0.002213  loss: 3.1007 (3.3248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8379 (0.8149)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [136]  [ 600/2502]  eta: 0:08:16  lr: 0.002211  min_lr: 0.002211  loss: 3.8049 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8552 (0.8277)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [136]  [ 800/2502]  eta: 0:07:22  lr: 0.002210  min_lr: 0.002210  loss: 3.1248 (3.3399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.8285)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [136]  [1000/2502]  eta: 0:06:29  lr: 0.002208  min_lr: 0.002208  loss: 3.2156 (3.3605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8533 (0.8293)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [136]  [1200/2502]  eta: 0:05:37  lr: 0.002207  min_lr: 0.002207  loss: 3.3940 (3.3593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7952 (0.8399)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [136]  [1400/2502]  eta: 0:04:45  lr: 0.002205  min_lr: 0.002205  loss: 3.4710 (3.3573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7544 (0.8395)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [136]  [1600/2502]  eta: 0:03:53  lr: 0.002204  min_lr: 0.002204  loss: 3.4294 (3.3624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8007 (0.8375)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [136]  [1800/2502]  eta: 0:03:01  lr: 0.002202  min_lr: 0.002202  loss: 3.5231 (3.3587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7666 (0.8417)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [136]  [2000/2502]  eta: 0:02:09  lr: 0.002201  min_lr: 0.002201  loss: 3.1401 (3.3626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8286 (0.8418)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [136]  [2200/2502]  eta: 0:01:18  lr: 0.002199  min_lr: 0.002199  loss: 3.4954 (3.3635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8242 (0.8423)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [136]  [2400/2502]  eta: 0:00:26  lr: 0.002198  min_lr: 0.002198  loss: 3.3521 (3.3600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8511 (0.8443)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [136]  [2501/2502]  eta: 0:00:00  lr: 0.002197  min_lr: 0.002197  loss: 3.7782 (3.3628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8115 (0.8447)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [136] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.002197  min_lr: 0.002197  loss: 3.7782 (3.3473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8115 (0.8447)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.6322 (0.6322)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 3.3172  data: 3.1691  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9873 (1.0038)  acc1: 81.6000 (82.0000)  acc5: 96.4000 (96.1455)  time: 0.5395  data: 0.4166  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0083 (1.0241)  acc1: 80.0000 (81.3905)  acc5: 96.0000 (96.2095)  time: 0.2460  data: 0.1263  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1827 (1.1338)  acc1: 76.4000 (78.8129)  acc5: 94.4000 (94.6968)  time: 0.2331  data: 0.1136  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.4106 (1.2054)  acc1: 73.2000 (77.0049)  acc5: 90.4000 (93.6781)  time: 0.2208  data: 0.1014  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3657 (1.2130)  acc1: 72.8000 (76.5120)  acc5: 90.8000 (93.6000)  time: 0.1819  data: 0.0608  max mem: 18975
Test: Total time: 0:00:14 (0.2848 s / it)
* Acc@1 76.248 Acc@5 93.438 loss 1.218
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.26%
Epoch: [137]  [   0/2502]  eta: 1:43:38  lr: 0.002197  min_lr: 0.002197  loss: 3.5093 (3.5093)  weight_decay: 0.0500 (0.0500)  time: 2.4856  data: 2.2229  max mem: 18975
Epoch: [137]  [ 200/2502]  eta: 0:10:22  lr: 0.002195  min_lr: 0.002195  loss: 3.0713 (3.3117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7898 (0.8247)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [137]  [ 400/2502]  eta: 0:09:15  lr: 0.002194  min_lr: 0.002194  loss: 3.6004 (3.3057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8189 (0.8399)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [137]  [ 600/2502]  eta: 0:08:17  lr: 0.002192  min_lr: 0.002192  loss: 3.6745 (3.3374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.8542)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [137]  [ 800/2502]  eta: 0:07:23  lr: 0.002191  min_lr: 0.002191  loss: 3.6286 (3.3293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7802 (0.8572)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [137]  [1000/2502]  eta: 0:06:30  lr: 0.002189  min_lr: 0.002189  loss: 2.9964 (3.3334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8515 (0.8480)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [137]  [1200/2502]  eta: 0:05:37  lr: 0.002188  min_lr: 0.002188  loss: 3.7189 (3.3445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8923 (0.8582)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [137]  [1400/2502]  eta: 0:04:45  lr: 0.002186  min_lr: 0.002186  loss: 2.8766 (3.3376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (0.8531)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [137]  [1600/2502]  eta: 0:03:53  lr: 0.002185  min_lr: 0.002185  loss: 3.6326 (3.3356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8741 (0.8520)  time: 0.2625  data: 0.0004  max mem: 18975
Epoch: [137]  [1800/2502]  eta: 0:03:01  lr: 0.002183  min_lr: 0.002183  loss: 3.4158 (3.3406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.8482)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [137]  [2000/2502]  eta: 0:02:09  lr: 0.002182  min_lr: 0.002182  loss: 2.9957 (3.3345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8820 (0.8492)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [137]  [2200/2502]  eta: 0:01:18  lr: 0.002180  min_lr: 0.002180  loss: 2.9634 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8067 (0.8470)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [137]  [2400/2502]  eta: 0:00:26  lr: 0.002179  min_lr: 0.002179  loss: 3.0071 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7603 (0.8454)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [137]  [2501/2502]  eta: 0:00:00  lr: 0.002178  min_lr: 0.002178  loss: 2.7426 (3.3307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8150 (0.8460)  time: 0.2306  data: 0.0007  max mem: 18975
Epoch: [137] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002178  min_lr: 0.002178  loss: 2.7426 (3.3418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8150 (0.8460)
Test:  [ 0/50]  eta: 0:02:40  loss: 0.5731 (0.5731)  acc1: 91.2000 (91.2000)  acc5: 98.4000 (98.4000)  time: 3.2164  data: 3.0588  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8838 (0.9006)  acc1: 83.2000 (81.8182)  acc5: 96.0000 (95.9273)  time: 0.4926  data: 0.3675  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9071 (0.9001)  acc1: 80.4000 (81.3714)  acc5: 96.4000 (96.2476)  time: 0.2506  data: 0.1289  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0213 (1.0146)  acc1: 76.4000 (78.5290)  acc5: 94.0000 (94.7742)  time: 0.2738  data: 0.1534  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3062 (1.0874)  acc1: 72.0000 (76.5951)  acc5: 91.2000 (93.8634)  time: 0.2265  data: 0.1068  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2893 (1.0996)  acc1: 72.0000 (76.1040)  acc5: 91.6000 (93.7840)  time: 0.2052  data: 0.0858  max mem: 18975
Test: Total time: 0:00:14 (0.2856 s / it)
* Acc@1 76.198 Acc@5 93.668 loss 1.096
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.26%
Epoch: [138]  [   0/2502]  eta: 1:35:24  lr: 0.002178  min_lr: 0.002178  loss: 2.5536 (2.5536)  weight_decay: 0.0500 (0.0500)  time: 2.2878  data: 1.5773  max mem: 18975
Epoch: [138]  [ 200/2502]  eta: 0:10:20  lr: 0.002176  min_lr: 0.002176  loss: 3.1795 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8895 (0.8414)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [138]  [ 400/2502]  eta: 0:09:13  lr: 0.002175  min_lr: 0.002175  loss: 3.3679 (3.3097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7822 (0.8304)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [138]  [ 600/2502]  eta: 0:08:16  lr: 0.002173  min_lr: 0.002173  loss: 2.7670 (3.3210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8414 (0.8440)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [138]  [ 800/2502]  eta: 0:07:23  lr: 0.002172  min_lr: 0.002172  loss: 3.3116 (3.3227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9179 (0.8512)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [138]  [1000/2502]  eta: 0:06:30  lr: 0.002170  min_lr: 0.002170  loss: 3.5120 (3.3363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.8543)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [138]  [1200/2502]  eta: 0:05:37  lr: 0.002169  min_lr: 0.002169  loss: 3.2426 (3.3346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8934 (0.8617)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [138]  [1400/2502]  eta: 0:04:45  lr: 0.002167  min_lr: 0.002167  loss: 3.2585 (3.3313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7599 (0.8564)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [138]  [1600/2502]  eta: 0:03:53  lr: 0.002166  min_lr: 0.002166  loss: 3.7517 (3.3266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7653 (0.8476)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [138]  [1800/2502]  eta: 0:03:01  lr: 0.002164  min_lr: 0.002164  loss: 3.1802 (3.3193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9211 (0.8515)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [138]  [2000/2502]  eta: 0:02:09  lr: 0.002163  min_lr: 0.002163  loss: 2.8775 (3.3314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8941 (0.8538)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [138]  [2200/2502]  eta: 0:01:17  lr: 0.002161  min_lr: 0.002161  loss: 3.4728 (3.3289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8508 (0.8536)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [138]  [2400/2502]  eta: 0:00:26  lr: 0.002160  min_lr: 0.002160  loss: 3.1814 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8898 (0.8544)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [138]  [2501/2502]  eta: 0:00:00  lr: 0.002159  min_lr: 0.002159  loss: 3.2460 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8152 (0.8566)  time: 0.2309  data: 0.0007  max mem: 18975
Epoch: [138] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002159  min_lr: 0.002159  loss: 3.2460 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8152 (0.8566)
Test:  [ 0/50]  eta: 0:02:32  loss: 0.5617 (0.5617)  acc1: 91.6000 (91.6000)  acc5: 98.4000 (98.4000)  time: 3.0558  data: 2.9145  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8606 (0.9112)  acc1: 82.4000 (82.4364)  acc5: 96.8000 (96.0364)  time: 0.4777  data: 0.3559  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9320 (0.9247)  acc1: 80.4000 (81.6571)  acc5: 96.4000 (96.1524)  time: 0.2563  data: 0.1361  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0900 (1.0373)  acc1: 77.6000 (79.2129)  acc5: 94.0000 (94.5419)  time: 0.2719  data: 0.1518  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3316 (1.1143)  acc1: 72.0000 (77.0439)  acc5: 90.8000 (93.6195)  time: 0.2703  data: 0.1504  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3075 (1.1320)  acc1: 72.0000 (76.4320)  acc5: 91.2000 (93.5040)  time: 0.2341  data: 0.1146  max mem: 18975
Test: Total time: 0:00:15 (0.3072 s / it)
* Acc@1 76.330 Acc@5 93.574 loss 1.128
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.33%
Epoch: [139]  [   0/2502]  eta: 1:27:28  lr: 0.002159  min_lr: 0.002159  loss: 3.0370 (3.0370)  weight_decay: 0.0500 (0.0500)  time: 2.0975  data: 1.8178  max mem: 18975
Epoch: [139]  [ 200/2502]  eta: 0:10:17  lr: 0.002157  min_lr: 0.002157  loss: 3.3889 (3.3759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (0.8471)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [139]  [ 400/2502]  eta: 0:09:12  lr: 0.002156  min_lr: 0.002156  loss: 3.1457 (3.3102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8534)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [139]  [ 600/2502]  eta: 0:08:16  lr: 0.002154  min_lr: 0.002154  loss: 3.0523 (3.3283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8109 (0.8636)  time: 0.2579  data: 0.0005  max mem: 18975
Epoch: [139]  [ 800/2502]  eta: 0:07:22  lr: 0.002153  min_lr: 0.002153  loss: 2.9870 (3.3351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8168 (0.8561)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [139]  [1000/2502]  eta: 0:06:29  lr: 0.002151  min_lr: 0.002151  loss: 3.5930 (3.3335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7645 (0.8576)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [139]  [1200/2502]  eta: 0:05:37  lr: 0.002150  min_lr: 0.002150  loss: 3.4683 (3.3352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9394 (0.8641)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [139]  [1400/2502]  eta: 0:04:45  lr: 0.002148  min_lr: 0.002148  loss: 3.3905 (3.3201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8301 (0.8635)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [139]  [1600/2502]  eta: 0:03:53  lr: 0.002147  min_lr: 0.002147  loss: 3.2857 (3.3304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8399 (0.8614)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [139]  [1800/2502]  eta: 0:03:01  lr: 0.002145  min_lr: 0.002145  loss: 3.3852 (3.3337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8036 (inf)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [139]  [2000/2502]  eta: 0:02:09  lr: 0.002144  min_lr: 0.002144  loss: 3.3405 (3.3353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9278 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [139]  [2200/2502]  eta: 0:01:17  lr: 0.002142  min_lr: 0.002142  loss: 3.4326 (3.3395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8496 (inf)  time: 0.2565  data: 0.0007  max mem: 18975
Epoch: [139]  [2400/2502]  eta: 0:00:26  lr: 0.002141  min_lr: 0.002141  loss: 3.3643 (3.3437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8053 (inf)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [139]  [2501/2502]  eta: 0:00:00  lr: 0.002140  min_lr: 0.002140  loss: 2.8309 (3.3407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (inf)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [139] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002140  min_lr: 0.002140  loss: 2.8309 (3.3376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (inf)
Test:  [ 0/50]  eta: 0:03:12  loss: 0.5917 (0.5917)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 3.8447  data: 3.7077  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 0.8955 (0.9394)  acc1: 82.4000 (81.5273)  acc5: 96.8000 (96.4000)  time: 0.5789  data: 0.4582  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9184 (0.9413)  acc1: 78.4000 (80.8571)  acc5: 96.4000 (96.2476)  time: 0.2480  data: 0.1289  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0825 (1.0532)  acc1: 76.0000 (78.4645)  acc5: 94.0000 (94.6452)  time: 0.2525  data: 0.1335  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3115 (1.1214)  acc1: 72.0000 (76.7610)  acc5: 90.8000 (93.9122)  time: 0.2365  data: 0.1156  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3114 (1.1331)  acc1: 72.0000 (76.2560)  acc5: 91.6000 (93.8000)  time: 0.2252  data: 0.1044  max mem: 18975
Test: Total time: 0:00:14 (0.2994 s / it)
* Acc@1 76.266 Acc@5 93.762 loss 1.128
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.33%
Epoch: [140]  [   0/2502]  eta: 1:13:48  lr: 0.002140  min_lr: 0.002140  loss: 2.5164 (2.5164)  weight_decay: 0.0500 (0.0500)  time: 1.7699  data: 1.4695  max mem: 18975
Epoch: [140]  [ 200/2502]  eta: 0:10:14  lr: 0.002138  min_lr: 0.002138  loss: 2.7354 (3.3247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8273 (0.8893)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [140]  [ 400/2502]  eta: 0:09:10  lr: 0.002137  min_lr: 0.002137  loss: 3.5423 (3.3614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7982 (0.8569)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [140]  [ 600/2502]  eta: 0:08:15  lr: 0.002135  min_lr: 0.002135  loss: 3.0838 (3.3507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8314 (0.8428)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [140]  [ 800/2502]  eta: 0:07:21  lr: 0.002134  min_lr: 0.002134  loss: 3.1900 (3.3461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8397 (0.8474)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [140]  [1000/2502]  eta: 0:06:28  lr: 0.002132  min_lr: 0.002132  loss: 3.4151 (3.3591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8927 (0.8533)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [140]  [1200/2502]  eta: 0:05:36  lr: 0.002131  min_lr: 0.002131  loss: 3.4447 (3.3617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8737 (0.8582)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [140]  [1400/2502]  eta: 0:04:44  lr: 0.002129  min_lr: 0.002129  loss: 3.4628 (3.3557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.8542)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [140]  [1600/2502]  eta: 0:03:52  lr: 0.002128  min_lr: 0.002128  loss: 2.8690 (3.3494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9684 (0.8644)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [140]  [1800/2502]  eta: 0:03:01  lr: 0.002126  min_lr: 0.002126  loss: 3.3885 (3.3489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8256 (0.8604)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [140]  [2000/2502]  eta: 0:02:09  lr: 0.002124  min_lr: 0.002124  loss: 3.6014 (3.3438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8206 (0.8575)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [140]  [2200/2502]  eta: 0:01:17  lr: 0.002123  min_lr: 0.002123  loss: 3.5886 (3.3421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7943 (0.8555)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [140]  [2400/2502]  eta: 0:00:26  lr: 0.002121  min_lr: 0.002121  loss: 3.5794 (3.3449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8281 (0.8561)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [140]  [2501/2502]  eta: 0:00:00  lr: 0.002121  min_lr: 0.002121  loss: 3.7136 (3.3452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7787 (0.8547)  time: 0.2304  data: 0.0007  max mem: 18975
Epoch: [140] Total time: 0:10:44 (0.2577 s / it)
Averaged stats: lr: 0.002121  min_lr: 0.002121  loss: 3.7136 (3.3484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7787 (0.8547)
Test:  [ 0/50]  eta: 0:03:18  loss: 0.7267 (0.7267)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.9778  data: 3.8305  max mem: 18975
Test:  [10/50]  eta: 0:00:25  loss: 0.9947 (1.0230)  acc1: 80.4000 (81.3091)  acc5: 96.0000 (95.9273)  time: 0.6444  data: 0.5199  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 1.0067 (1.0420)  acc1: 80.4000 (80.4191)  acc5: 96.0000 (96.2095)  time: 0.2660  data: 0.1451  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1489 (1.1343)  acc1: 76.4000 (78.3097)  acc5: 94.8000 (94.7226)  time: 0.2261  data: 0.1068  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3836 (1.1998)  acc1: 72.0000 (76.6829)  acc5: 90.4000 (93.8342)  time: 0.2075  data: 0.0883  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3836 (1.2193)  acc1: 71.6000 (75.9760)  acc5: 90.8000 (93.5840)  time: 0.2074  data: 0.0882  max mem: 18975
Test: Total time: 0:00:14 (0.2926 s / it)
* Acc@1 75.972 Acc@5 93.618 loss 1.224
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.33%
Epoch: [141]  [   0/2502]  eta: 1:18:19  lr: 0.002121  min_lr: 0.002121  loss: 2.3357 (2.3357)  weight_decay: 0.0500 (0.0500)  time: 1.8781  data: 1.4324  max mem: 18975
Epoch: [141]  [ 200/2502]  eta: 0:10:15  lr: 0.002119  min_lr: 0.002119  loss: 3.0852 (3.2963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8480 (0.8665)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [141]  [ 400/2502]  eta: 0:09:11  lr: 0.002118  min_lr: 0.002118  loss: 3.5458 (3.2948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (0.8605)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [141]  [ 600/2502]  eta: 0:08:16  lr: 0.002116  min_lr: 0.002116  loss: 3.3401 (3.3063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8007 (0.8461)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [141]  [ 800/2502]  eta: 0:07:22  lr: 0.002114  min_lr: 0.002114  loss: 2.7711 (3.3044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (0.8448)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [141]  [1000/2502]  eta: 0:06:29  lr: 0.002113  min_lr: 0.002113  loss: 3.2372 (3.3183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8400 (0.8493)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [141]  [1200/2502]  eta: 0:05:36  lr: 0.002111  min_lr: 0.002111  loss: 3.4883 (3.3246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8645 (0.8498)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [141]  [1400/2502]  eta: 0:04:44  lr: 0.002110  min_lr: 0.002110  loss: 3.1027 (3.3270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.8491)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [141]  [1600/2502]  eta: 0:03:53  lr: 0.002108  min_lr: 0.002108  loss: 3.2712 (3.3243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8766 (0.8531)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [141]  [1800/2502]  eta: 0:03:01  lr: 0.002107  min_lr: 0.002107  loss: 3.2093 (3.3192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.8491)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [141]  [2000/2502]  eta: 0:02:09  lr: 0.002105  min_lr: 0.002105  loss: 3.3880 (3.3233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.8510)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [141]  [2200/2502]  eta: 0:01:17  lr: 0.002104  min_lr: 0.002104  loss: 3.5715 (3.3235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7968 (0.8507)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [141]  [2400/2502]  eta: 0:00:26  lr: 0.002102  min_lr: 0.002102  loss: 2.7843 (3.3276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8202 (0.8523)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [141]  [2501/2502]  eta: 0:00:00  lr: 0.002101  min_lr: 0.002101  loss: 3.2982 (3.3296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8175 (0.8521)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [141] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002101  min_lr: 0.002101  loss: 3.2982 (3.3275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8175 (0.8521)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.6199 (0.6199)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 3.2446  data: 3.1075  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.9267 (0.9282)  acc1: 82.8000 (82.7636)  acc5: 96.4000 (96.2182)  time: 0.5112  data: 0.3881  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9465 (0.9502)  acc1: 81.6000 (81.9048)  acc5: 96.0000 (96.2286)  time: 0.2433  data: 0.1229  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0984 (1.0688)  acc1: 76.8000 (79.1484)  acc5: 93.2000 (94.6968)  time: 0.2715  data: 0.1523  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3283 (1.1392)  acc1: 72.4000 (77.4244)  acc5: 90.8000 (93.7854)  time: 0.2976  data: 0.1781  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3288 (1.1563)  acc1: 72.4000 (76.8960)  acc5: 91.2000 (93.6160)  time: 0.2291  data: 0.1100  max mem: 18975
Test: Total time: 0:00:15 (0.3123 s / it)
* Acc@1 76.710 Acc@5 93.760 loss 1.157
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.71%
Epoch: [142]  [   0/2502]  eta: 1:29:48  lr: 0.002101  min_lr: 0.002101  loss: 3.9878 (3.9878)  weight_decay: 0.0500 (0.0500)  time: 2.1536  data: 1.8704  max mem: 18975
Epoch: [142]  [ 200/2502]  eta: 0:10:21  lr: 0.002100  min_lr: 0.002100  loss: 3.2701 (3.2974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7793 (0.8162)  time: 0.2645  data: 0.0004  max mem: 18975
Epoch: [142]  [ 400/2502]  eta: 0:09:14  lr: 0.002098  min_lr: 0.002098  loss: 3.3921 (3.3221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8598 (0.8408)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [142]  [ 600/2502]  eta: 0:08:17  lr: 0.002097  min_lr: 0.002097  loss: 3.3292 (3.3151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8777 (0.8467)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [142]  [ 800/2502]  eta: 0:07:23  lr: 0.002095  min_lr: 0.002095  loss: 3.4277 (3.3191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.8503)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [142]  [1000/2502]  eta: 0:06:30  lr: 0.002094  min_lr: 0.002094  loss: 2.9788 (3.3157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7821 (0.8391)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [142]  [1200/2502]  eta: 0:05:37  lr: 0.002092  min_lr: 0.002092  loss: 3.0641 (3.3143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8399 (0.8423)  time: 0.2575  data: 0.0005  max mem: 18975
Epoch: [142]  [1400/2502]  eta: 0:04:45  lr: 0.002091  min_lr: 0.002091  loss: 3.7762 (3.3204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9124 (0.8515)  time: 0.2609  data: 0.0004  max mem: 18975
Epoch: [142]  [1600/2502]  eta: 0:03:53  lr: 0.002089  min_lr: 0.002089  loss: 3.5047 (3.3187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8061 (0.8536)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [142]  [1800/2502]  eta: 0:03:01  lr: 0.002088  min_lr: 0.002088  loss: 3.6582 (3.3215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8178 (0.8494)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [142]  [2000/2502]  eta: 0:02:09  lr: 0.002086  min_lr: 0.002086  loss: 3.0774 (3.3243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8315 (0.8523)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [142]  [2200/2502]  eta: 0:01:18  lr: 0.002084  min_lr: 0.002084  loss: 3.2049 (3.3230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.8568)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [142]  [2400/2502]  eta: 0:00:26  lr: 0.002083  min_lr: 0.002083  loss: 3.7913 (3.3259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.8549)  time: 0.2616  data: 0.0004  max mem: 18975
Epoch: [142]  [2501/2502]  eta: 0:00:00  lr: 0.002082  min_lr: 0.002082  loss: 2.7970 (3.3229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8761 (0.8552)  time: 0.2312  data: 0.0007  max mem: 18975
Epoch: [142] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.002082  min_lr: 0.002082  loss: 2.7970 (3.3284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8761 (0.8552)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.5805 (0.5805)  acc1: 92.0000 (92.0000)  acc5: 98.0000 (98.0000)  time: 3.3160  data: 3.1648  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.8556 (0.8924)  acc1: 84.4000 (82.5091)  acc5: 96.4000 (96.1818)  time: 0.5147  data: 0.3925  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8686 (0.8981)  acc1: 80.8000 (81.7905)  acc5: 96.4000 (96.3048)  time: 0.2430  data: 0.1238  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0192 (1.0218)  acc1: 76.8000 (79.2516)  acc5: 93.2000 (94.7613)  time: 0.2692  data: 0.1492  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3817 (1.1017)  acc1: 72.4000 (77.2195)  acc5: 90.4000 (93.8342)  time: 0.2493  data: 0.1294  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3526 (1.1190)  acc1: 72.0000 (76.6720)  acc5: 91.2000 (93.6080)  time: 0.2500  data: 0.1311  max mem: 18975
Test: Total time: 0:00:15 (0.3025 s / it)
* Acc@1 76.476 Acc@5 93.680 loss 1.126
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.71%
Epoch: [143]  [   0/2502]  eta: 1:33:38  lr: 0.002082  min_lr: 0.002082  loss: 3.1512 (3.1512)  weight_decay: 0.0500 (0.0500)  time: 2.2456  data: 1.9565  max mem: 18975
Epoch: [143]  [ 200/2502]  eta: 0:10:18  lr: 0.002081  min_lr: 0.002081  loss: 3.7721 (3.3512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9556 (inf)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [143]  [ 400/2502]  eta: 0:09:12  lr: 0.002079  min_lr: 0.002079  loss: 3.6269 (3.3102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8237 (inf)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [143]  [ 600/2502]  eta: 0:08:16  lr: 0.002078  min_lr: 0.002078  loss: 3.0988 (3.3139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8031 (inf)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [143]  [ 800/2502]  eta: 0:07:22  lr: 0.002076  min_lr: 0.002076  loss: 3.3256 (3.3049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8188 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [143]  [1000/2502]  eta: 0:06:29  lr: 0.002074  min_lr: 0.002074  loss: 3.4917 (3.3232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8309 (inf)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [143]  [1200/2502]  eta: 0:05:37  lr: 0.002073  min_lr: 0.002073  loss: 2.9627 (3.3208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8314 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [143]  [1400/2502]  eta: 0:04:45  lr: 0.002071  min_lr: 0.002071  loss: 2.9558 (3.3191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8380 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [143]  [1600/2502]  eta: 0:03:53  lr: 0.002070  min_lr: 0.002070  loss: 3.6992 (3.3220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8887 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [143]  [1800/2502]  eta: 0:03:01  lr: 0.002068  min_lr: 0.002068  loss: 3.1450 (3.3168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (inf)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [143]  [2000/2502]  eta: 0:02:09  lr: 0.002067  min_lr: 0.002067  loss: 3.3277 (3.3200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8431 (inf)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [143]  [2200/2502]  eta: 0:01:17  lr: 0.002065  min_lr: 0.002065  loss: 3.4123 (3.3186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8566 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [143]  [2400/2502]  eta: 0:00:26  lr: 0.002064  min_lr: 0.002064  loss: 3.5274 (3.3124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8535 (inf)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [143]  [2501/2502]  eta: 0:00:00  lr: 0.002063  min_lr: 0.002063  loss: 3.3494 (3.3150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7960 (inf)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [143] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.002063  min_lr: 0.002063  loss: 3.3494 (3.3250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7960 (inf)
Test:  [ 0/50]  eta: 0:02:30  loss: 0.6108 (0.6108)  acc1: 92.4000 (92.4000)  acc5: 98.4000 (98.4000)  time: 3.0182  data: 2.8656  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8604 (0.9218)  acc1: 82.0000 (82.2545)  acc5: 96.8000 (96.2909)  time: 0.5283  data: 0.4050  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9328 (0.9410)  acc1: 80.8000 (81.4476)  acc5: 96.8000 (96.3048)  time: 0.2779  data: 0.1572  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0858 (1.0519)  acc1: 76.4000 (78.5806)  acc5: 92.8000 (94.8000)  time: 0.2831  data: 0.1625  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3031 (1.1189)  acc1: 72.4000 (76.8878)  acc5: 91.6000 (93.9610)  time: 0.2321  data: 0.1125  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2816 (1.1384)  acc1: 72.4000 (76.2160)  acc5: 92.0000 (93.7600)  time: 0.2174  data: 0.0985  max mem: 18975
Test: Total time: 0:00:14 (0.2970 s / it)
* Acc@1 76.628 Acc@5 93.836 loss 1.138
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.71%
Epoch: [144]  [   0/2502]  eta: 1:32:45  lr: 0.002063  min_lr: 0.002063  loss: 2.9968 (2.9968)  weight_decay: 0.0500 (0.0500)  time: 2.2246  data: 1.9451  max mem: 18975
Epoch: [144]  [ 200/2502]  eta: 0:10:19  lr: 0.002061  min_lr: 0.002061  loss: 3.6139 (3.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8729 (0.8573)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [144]  [ 400/2502]  eta: 0:09:13  lr: 0.002060  min_lr: 0.002060  loss: 3.0997 (3.3086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8122 (0.8518)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [144]  [ 600/2502]  eta: 0:08:17  lr: 0.002058  min_lr: 0.002058  loss: 3.4364 (3.3212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8361 (0.8539)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [144]  [ 800/2502]  eta: 0:07:22  lr: 0.002057  min_lr: 0.002057  loss: 3.7042 (3.3175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (0.8561)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [144]  [1000/2502]  eta: 0:06:29  lr: 0.002055  min_lr: 0.002055  loss: 3.4119 (3.3189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9248 (0.8603)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [144]  [1200/2502]  eta: 0:05:37  lr: 0.002054  min_lr: 0.002054  loss: 3.4219 (3.3198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8381 (0.8594)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [144]  [1400/2502]  eta: 0:04:45  lr: 0.002052  min_lr: 0.002052  loss: 3.3570 (3.3348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8599 (0.8573)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [144]  [1600/2502]  eta: 0:03:53  lr: 0.002051  min_lr: 0.002051  loss: 3.3136 (3.3359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8868 (0.8589)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [144]  [1800/2502]  eta: 0:03:01  lr: 0.002049  min_lr: 0.002049  loss: 3.2613 (3.3365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (0.8657)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [144]  [2000/2502]  eta: 0:02:09  lr: 0.002047  min_lr: 0.002047  loss: 3.4931 (3.3416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8724 (0.8646)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [144]  [2200/2502]  eta: 0:01:17  lr: 0.002046  min_lr: 0.002046  loss: 3.1729 (3.3419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7867 (0.8671)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [144]  [2400/2502]  eta: 0:00:26  lr: 0.002044  min_lr: 0.002044  loss: 3.0500 (3.3383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8752 (0.8659)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [144]  [2501/2502]  eta: 0:00:00  lr: 0.002044  min_lr: 0.002044  loss: 3.6366 (3.3435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8644)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [144] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.002044  min_lr: 0.002044  loss: 3.6366 (3.3243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8644)
Test:  [ 0/50]  eta: 0:02:11  loss: 0.6198 (0.6198)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 2.6281  data: 2.4837  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.9292 (0.9642)  acc1: 83.6000 (82.4000)  acc5: 95.6000 (96.3636)  time: 0.4340  data: 0.3121  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9601 (0.9841)  acc1: 80.8000 (81.6191)  acc5: 96.8000 (96.4381)  time: 0.2444  data: 0.1242  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2129 (1.1042)  acc1: 76.4000 (78.8645)  acc5: 92.8000 (94.6710)  time: 0.2928  data: 0.1714  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3925 (1.1762)  acc1: 72.0000 (77.0927)  acc5: 90.8000 (93.8439)  time: 0.2875  data: 0.1671  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3883 (1.1895)  acc1: 72.0000 (76.5440)  acc5: 91.2000 (93.6960)  time: 0.2190  data: 0.1001  max mem: 18975
Test: Total time: 0:00:14 (0.2997 s / it)
* Acc@1 76.586 Acc@5 93.726 loss 1.191
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.71%
Epoch: [145]  [   0/2502]  eta: 1:39:06  lr: 0.002044  min_lr: 0.002044  loss: 4.1643 (4.1643)  weight_decay: 0.0500 (0.0500)  time: 2.3768  data: 2.1070  max mem: 18975
Epoch: [145]  [ 200/2502]  eta: 0:10:20  lr: 0.002042  min_lr: 0.002042  loss: 3.2409 (3.2795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8205 (0.8770)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [145]  [ 400/2502]  eta: 0:09:13  lr: 0.002040  min_lr: 0.002040  loss: 2.9479 (3.2718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8622 (0.8875)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [145]  [ 600/2502]  eta: 0:08:16  lr: 0.002039  min_lr: 0.002039  loss: 3.6972 (3.3033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8793 (0.8783)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [145]  [ 800/2502]  eta: 0:07:23  lr: 0.002037  min_lr: 0.002037  loss: 2.9759 (3.2800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8024 (0.8688)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [145]  [1000/2502]  eta: 0:06:30  lr: 0.002036  min_lr: 0.002036  loss: 3.2227 (3.2820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7799 (0.8653)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [145]  [1200/2502]  eta: 0:05:37  lr: 0.002034  min_lr: 0.002034  loss: 3.3271 (3.2978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.8682)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [145]  [1400/2502]  eta: 0:04:45  lr: 0.002033  min_lr: 0.002033  loss: 3.2083 (3.2973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7991 (0.8697)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [145]  [1600/2502]  eta: 0:03:53  lr: 0.002031  min_lr: 0.002031  loss: 3.3263 (3.2877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8438 (0.8696)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [145]  [1800/2502]  eta: 0:03:01  lr: 0.002030  min_lr: 0.002030  loss: 3.5774 (3.2945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8605 (0.8693)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [145]  [2000/2502]  eta: 0:02:09  lr: 0.002028  min_lr: 0.002028  loss: 3.7584 (3.2979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8753 (0.8722)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [145]  [2200/2502]  eta: 0:01:18  lr: 0.002027  min_lr: 0.002027  loss: 3.6383 (3.3053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8205 (0.8693)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [145]  [2400/2502]  eta: 0:00:26  lr: 0.002025  min_lr: 0.002025  loss: 2.8419 (3.3022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9346 (0.8709)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [145]  [2501/2502]  eta: 0:00:00  lr: 0.002024  min_lr: 0.002024  loss: 3.5135 (3.3056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8697 (0.8712)  time: 0.2312  data: 0.0008  max mem: 18975
Epoch: [145] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.002024  min_lr: 0.002024  loss: 3.5135 (3.3180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8697 (0.8712)
Test:  [ 0/50]  eta: 0:02:50  loss: 0.6190 (0.6190)  acc1: 92.8000 (92.8000)  acc5: 99.2000 (99.2000)  time: 3.4145  data: 3.2762  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9868 (0.9739)  acc1: 82.8000 (81.9636)  acc5: 96.0000 (95.8182)  time: 0.5315  data: 0.4095  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9868 (0.9990)  acc1: 80.8000 (81.1238)  acc5: 96.0000 (96.0952)  time: 0.2365  data: 0.1164  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1709 (1.1112)  acc1: 75.6000 (78.7355)  acc5: 93.6000 (94.5161)  time: 0.2244  data: 0.1049  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3911 (1.1769)  acc1: 72.4000 (76.9463)  acc5: 91.2000 (93.8927)  time: 0.2312  data: 0.1121  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3695 (1.1915)  acc1: 72.8000 (76.5280)  acc5: 91.6000 (93.5760)  time: 0.2183  data: 0.0993  max mem: 18975
Test: Total time: 0:00:14 (0.2896 s / it)
* Acc@1 76.590 Acc@5 93.598 loss 1.195
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.71%
Epoch: [146]  [   0/2502]  eta: 1:26:11  lr: 0.002024  min_lr: 0.002024  loss: 2.6251 (2.6251)  weight_decay: 0.0500 (0.0500)  time: 2.0669  data: 1.6361  max mem: 18975
Epoch: [146]  [ 200/2502]  eta: 0:10:19  lr: 0.002023  min_lr: 0.002023  loss: 2.5731 (3.3340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9514 (0.8922)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [146]  [ 400/2502]  eta: 0:09:13  lr: 0.002021  min_lr: 0.002021  loss: 3.4159 (3.3545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7782 (0.8703)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [146]  [ 600/2502]  eta: 0:08:16  lr: 0.002020  min_lr: 0.002020  loss: 3.8491 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8790 (0.8738)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [146]  [ 800/2502]  eta: 0:07:22  lr: 0.002018  min_lr: 0.002018  loss: 3.6972 (3.3639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8077 (0.8624)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [146]  [1000/2502]  eta: 0:06:29  lr: 0.002016  min_lr: 0.002016  loss: 2.6989 (3.3352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8785 (inf)  time: 0.2583  data: 0.0004  max mem: 18975
Epoch: [146]  [1200/2502]  eta: 0:05:37  lr: 0.002015  min_lr: 0.002015  loss: 3.3846 (3.3299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8378 (inf)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [146]  [1400/2502]  eta: 0:04:44  lr: 0.002013  min_lr: 0.002013  loss: 3.2614 (3.3227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8352 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [146]  [1600/2502]  eta: 0:03:53  lr: 0.002012  min_lr: 0.002012  loss: 3.5653 (3.3228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (inf)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [146]  [1800/2502]  eta: 0:03:01  lr: 0.002010  min_lr: 0.002010  loss: 3.5142 (3.3176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8996 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [146]  [2000/2502]  eta: 0:02:09  lr: 0.002009  min_lr: 0.002009  loss: 3.1791 (3.3249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8504 (inf)  time: 0.2587  data: 0.0004  max mem: 18975
Epoch: [146]  [2200/2502]  eta: 0:01:17  lr: 0.002007  min_lr: 0.002007  loss: 3.3668 (3.3209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8733 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [146]  [2400/2502]  eta: 0:00:26  lr: 0.002006  min_lr: 0.002006  loss: 3.2315 (3.3134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7997 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [146]  [2501/2502]  eta: 0:00:00  lr: 0.002005  min_lr: 0.002005  loss: 3.3877 (3.3107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8282 (inf)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [146] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.002005  min_lr: 0.002005  loss: 3.3877 (3.3131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8282 (inf)
Test:  [ 0/50]  eta: 0:02:39  loss: 0.4760 (0.4760)  acc1: 92.0000 (92.0000)  acc5: 98.4000 (98.4000)  time: 3.1923  data: 3.0498  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8720 (0.8692)  acc1: 83.2000 (82.3636)  acc5: 96.0000 (96.1091)  time: 0.4848  data: 0.3574  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8720 (0.8772)  acc1: 80.8000 (81.6000)  acc5: 96.4000 (96.2857)  time: 0.2338  data: 0.1108  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0687 (0.9929)  acc1: 76.8000 (78.9419)  acc5: 94.0000 (94.9936)  time: 0.2817  data: 0.1612  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3158 (1.0685)  acc1: 71.6000 (76.9756)  acc5: 90.8000 (94.0683)  time: 0.2996  data: 0.1796  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2690 (1.0835)  acc1: 71.6000 (76.5920)  acc5: 90.8000 (93.9280)  time: 0.2298  data: 0.1109  max mem: 18975
Test: Total time: 0:00:15 (0.3107 s / it)
* Acc@1 76.436 Acc@5 93.844 loss 1.086
Accuracy of the model on the 50000 test images: 76.4%
Max accuracy: 76.71%
Epoch: [147]  [   0/2502]  eta: 1:39:37  lr: 0.002005  min_lr: 0.002005  loss: 3.7141 (3.7141)  weight_decay: 0.0500 (0.0500)  time: 2.3889  data: 1.5329  max mem: 18975
Epoch: [147]  [ 200/2502]  eta: 0:10:21  lr: 0.002003  min_lr: 0.002003  loss: 3.1716 (3.2355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (0.8498)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [147]  [ 400/2502]  eta: 0:09:14  lr: 0.002002  min_lr: 0.002002  loss: 3.2718 (3.2697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7974 (0.8519)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [147]  [ 600/2502]  eta: 0:08:17  lr: 0.002000  min_lr: 0.002000  loss: 3.4798 (3.2928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8851 (0.8571)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [147]  [ 800/2502]  eta: 0:07:23  lr: 0.001999  min_lr: 0.001999  loss: 3.4180 (3.3011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9081 (0.8693)  time: 0.2635  data: 0.0004  max mem: 18975
Epoch: [147]  [1000/2502]  eta: 0:06:30  lr: 0.001997  min_lr: 0.001997  loss: 3.4034 (3.3014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8553 (0.8707)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [147]  [1200/2502]  eta: 0:05:37  lr: 0.001995  min_lr: 0.001995  loss: 3.2195 (3.3108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (0.8689)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [147]  [1400/2502]  eta: 0:04:45  lr: 0.001994  min_lr: 0.001994  loss: 3.0637 (3.3077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8134 (0.8684)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [147]  [1600/2502]  eta: 0:03:53  lr: 0.001992  min_lr: 0.001992  loss: 3.7360 (3.3070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8335 (0.8712)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [147]  [1800/2502]  eta: 0:03:01  lr: 0.001991  min_lr: 0.001991  loss: 3.6297 (3.3084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8823 (0.8727)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [147]  [2000/2502]  eta: 0:02:09  lr: 0.001989  min_lr: 0.001989  loss: 3.1943 (3.3032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8727 (0.8725)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [147]  [2200/2502]  eta: 0:01:18  lr: 0.001988  min_lr: 0.001988  loss: 3.5480 (3.3073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8701)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [147]  [2400/2502]  eta: 0:00:26  lr: 0.001986  min_lr: 0.001986  loss: 3.5076 (3.3066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8824 (0.8684)  time: 0.2605  data: 0.0003  max mem: 18975
Epoch: [147]  [2501/2502]  eta: 0:00:00  lr: 0.001985  min_lr: 0.001985  loss: 3.6546 (3.3088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9330 (0.8702)  time: 0.2311  data: 0.0007  max mem: 18975
Epoch: [147] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001985  min_lr: 0.001985  loss: 3.6546 (3.3215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9330 (0.8702)
Test:  [ 0/50]  eta: 0:03:01  loss: 0.5957 (0.5957)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 3.6399  data: 3.4817  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9209 (0.9540)  acc1: 81.6000 (82.4000)  acc5: 96.8000 (96.2182)  time: 0.5407  data: 0.4172  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9848 (0.9886)  acc1: 80.8000 (81.4095)  acc5: 96.4000 (96.2095)  time: 0.2343  data: 0.1145  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1556 (1.0892)  acc1: 75.6000 (78.9032)  acc5: 94.0000 (94.6581)  time: 0.2404  data: 0.1210  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3485 (1.1589)  acc1: 71.6000 (77.0244)  acc5: 91.2000 (93.7854)  time: 0.2413  data: 0.1223  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3427 (1.1741)  acc1: 71.2000 (76.4560)  acc5: 91.2000 (93.6000)  time: 0.2099  data: 0.0910  max mem: 18975
Test: Total time: 0:00:14 (0.2989 s / it)
* Acc@1 76.674 Acc@5 93.802 loss 1.169
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.71%
Epoch: [148]  [   0/2502]  eta: 1:41:36  lr: 0.001985  min_lr: 0.001985  loss: 3.9233 (3.9233)  weight_decay: 0.0500 (0.0500)  time: 2.4365  data: 2.1671  max mem: 18975
Epoch: [148]  [ 200/2502]  eta: 0:10:22  lr: 0.001984  min_lr: 0.001984  loss: 3.6314 (3.3310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8678 (0.9078)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [148]  [ 400/2502]  eta: 0:09:14  lr: 0.001982  min_lr: 0.001982  loss: 3.7979 (3.3443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9044)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [148]  [ 600/2502]  eta: 0:08:17  lr: 0.001981  min_lr: 0.001981  loss: 3.2848 (3.3450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9093 (0.8983)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [148]  [ 800/2502]  eta: 0:07:23  lr: 0.001979  min_lr: 0.001979  loss: 3.4629 (3.3550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8578 (0.8888)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [148]  [1000/2502]  eta: 0:06:30  lr: 0.001978  min_lr: 0.001978  loss: 3.2989 (3.3213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8688 (0.8961)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [148]  [1200/2502]  eta: 0:05:37  lr: 0.001976  min_lr: 0.001976  loss: 3.4094 (3.3133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8007 (0.8855)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [148]  [1400/2502]  eta: 0:04:45  lr: 0.001974  min_lr: 0.001974  loss: 3.0968 (3.3131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (0.8787)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [148]  [1600/2502]  eta: 0:03:53  lr: 0.001973  min_lr: 0.001973  loss: 3.2195 (3.3198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8468 (0.8780)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [148]  [1800/2502]  eta: 0:03:01  lr: 0.001971  min_lr: 0.001971  loss: 3.2988 (3.3223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8292 (0.8778)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [148]  [2000/2502]  eta: 0:02:09  lr: 0.001970  min_lr: 0.001970  loss: 3.1278 (3.3220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8302 (0.8794)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [148]  [2200/2502]  eta: 0:01:18  lr: 0.001968  min_lr: 0.001968  loss: 3.5464 (3.3130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9179 (0.8781)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [148]  [2400/2502]  eta: 0:00:26  lr: 0.001967  min_lr: 0.001967  loss: 2.7847 (3.3126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8739 (0.8791)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [148]  [2501/2502]  eta: 0:00:00  lr: 0.001966  min_lr: 0.001966  loss: 3.7383 (3.3191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8453 (0.8793)  time: 0.2316  data: 0.0008  max mem: 18975
Epoch: [148] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001966  min_lr: 0.001966  loss: 3.7383 (3.3106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8453 (0.8793)
Test:  [ 0/50]  eta: 0:02:58  loss: 0.6835 (0.6835)  acc1: 90.4000 (90.4000)  acc5: 97.6000 (97.6000)  time: 3.5777  data: 3.4349  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9483 (0.9811)  acc1: 82.8000 (81.7818)  acc5: 97.2000 (96.0727)  time: 0.5435  data: 0.4116  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 1.0100 (1.0111)  acc1: 80.0000 (81.1429)  acc5: 96.8000 (96.2095)  time: 0.2356  data: 0.1098  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1605 (1.1161)  acc1: 76.4000 (78.8645)  acc5: 93.6000 (94.6839)  time: 0.2115  data: 0.0902  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3730 (1.1719)  acc1: 72.8000 (77.2878)  acc5: 90.8000 (93.9610)  time: 0.2077  data: 0.0873  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2978 (1.1843)  acc1: 71.2000 (76.7840)  acc5: 92.0000 (93.7600)  time: 0.1956  data: 0.0766  max mem: 18975
Test: Total time: 0:00:14 (0.2823 s / it)
* Acc@1 76.688 Acc@5 93.894 loss 1.187
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.71%
Epoch: [149]  [   0/2502]  eta: 1:41:02  lr: 0.001966  min_lr: 0.001966  loss: 4.2997 (4.2997)  weight_decay: 0.0500 (0.0500)  time: 2.4230  data: 2.1416  max mem: 18975
Epoch: [149]  [ 200/2502]  eta: 0:10:21  lr: 0.001964  min_lr: 0.001964  loss: 3.4417 (3.3467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7978 (0.8931)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [149]  [ 400/2502]  eta: 0:09:14  lr: 0.001963  min_lr: 0.001963  loss: 3.4003 (3.3176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9113 (0.8879)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [149]  [ 600/2502]  eta: 0:08:18  lr: 0.001961  min_lr: 0.001961  loss: 3.4933 (3.3077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8590 (0.8905)  time: 0.2629  data: 0.0004  max mem: 18975
Epoch: [149]  [ 800/2502]  eta: 0:07:23  lr: 0.001960  min_lr: 0.001960  loss: 3.3719 (3.3201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8299 (0.8897)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [149]  [1000/2502]  eta: 0:06:30  lr: 0.001958  min_lr: 0.001958  loss: 3.0914 (3.3121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8322 (0.8823)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [149]  [1200/2502]  eta: 0:05:37  lr: 0.001957  min_lr: 0.001957  loss: 2.8038 (3.2987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8559 (0.8901)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [149]  [1400/2502]  eta: 0:04:45  lr: 0.001955  min_lr: 0.001955  loss: 3.3836 (3.2979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8730 (0.8882)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [149]  [1600/2502]  eta: 0:03:53  lr: 0.001953  min_lr: 0.001953  loss: 2.9502 (3.2920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9013 (0.8901)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [149]  [1800/2502]  eta: 0:03:01  lr: 0.001952  min_lr: 0.001952  loss: 3.6392 (3.3032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8833 (0.8903)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [149]  [2000/2502]  eta: 0:02:09  lr: 0.001950  min_lr: 0.001950  loss: 3.2675 (3.3088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8434 (0.8937)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [149]  [2200/2502]  eta: 0:01:18  lr: 0.001949  min_lr: 0.001949  loss: 3.1114 (3.2971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8566 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [149]  [2400/2502]  eta: 0:00:26  lr: 0.001947  min_lr: 0.001947  loss: 3.6661 (3.3073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8155 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [149]  [2501/2502]  eta: 0:00:00  lr: 0.001946  min_lr: 0.001946  loss: 3.2865 (3.3072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8714 (inf)  time: 0.2306  data: 0.0009  max mem: 18975
Epoch: [149] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.001946  min_lr: 0.001946  loss: 3.2865 (3.3101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8714 (inf)
Test:  [ 0/50]  eta: 0:02:31  loss: 0.5677 (0.5677)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 3.0286  data: 2.8848  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.9329 (0.9227)  acc1: 82.4000 (82.6909)  acc5: 95.6000 (95.9273)  time: 0.5725  data: 0.4501  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9534 (0.9472)  acc1: 80.4000 (81.5810)  acc5: 96.0000 (96.0381)  time: 0.2899  data: 0.1690  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1187 (1.0483)  acc1: 76.4000 (79.1742)  acc5: 93.2000 (94.5677)  time: 0.2348  data: 0.1130  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3183 (1.1210)  acc1: 72.0000 (77.3463)  acc5: 90.4000 (93.7951)  time: 0.2012  data: 0.0807  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3114 (1.1328)  acc1: 72.0000 (77.0080)  acc5: 90.8000 (93.6000)  time: 0.2005  data: 0.0806  max mem: 18975
Test: Total time: 0:00:14 (0.2810 s / it)
* Acc@1 76.778 Acc@5 93.828 loss 1.133
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.78%
Epoch: [150]  [   0/2502]  eta: 1:21:03  lr: 0.001946  min_lr: 0.001946  loss: 2.3327 (2.3327)  weight_decay: 0.0500 (0.0500)  time: 1.9438  data: 1.6845  max mem: 18975
Epoch: [150]  [ 200/2502]  eta: 0:10:20  lr: 0.001945  min_lr: 0.001945  loss: 3.4989 (3.3046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8874 (0.8416)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [150]  [ 400/2502]  eta: 0:09:15  lr: 0.001943  min_lr: 0.001943  loss: 3.4262 (3.3045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (0.8585)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [150]  [ 600/2502]  eta: 0:08:18  lr: 0.001942  min_lr: 0.001942  loss: 2.9716 (3.3241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (0.8815)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [150]  [ 800/2502]  eta: 0:07:23  lr: 0.001940  min_lr: 0.001940  loss: 2.7261 (3.2932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8161 (0.8807)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [150]  [1000/2502]  eta: 0:06:30  lr: 0.001939  min_lr: 0.001939  loss: 2.5741 (3.2872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8598 (0.8805)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [150]  [1200/2502]  eta: 0:05:37  lr: 0.001937  min_lr: 0.001937  loss: 3.5792 (3.2953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9554 (0.8828)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [150]  [1400/2502]  eta: 0:04:45  lr: 0.001935  min_lr: 0.001935  loss: 3.0216 (3.3075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8158 (0.8811)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [150]  [1600/2502]  eta: 0:03:53  lr: 0.001934  min_lr: 0.001934  loss: 3.3495 (3.3079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9260 (0.8839)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [150]  [1800/2502]  eta: 0:03:01  lr: 0.001932  min_lr: 0.001932  loss: 3.6062 (3.2991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8376 (0.8795)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [150]  [2000/2502]  eta: 0:02:09  lr: 0.001931  min_lr: 0.001931  loss: 3.2478 (3.2962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8973 (0.8790)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [150]  [2200/2502]  eta: 0:01:18  lr: 0.001929  min_lr: 0.001929  loss: 3.7151 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8480 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [150]  [2400/2502]  eta: 0:00:26  lr: 0.001928  min_lr: 0.001928  loss: 3.8558 (3.3039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9054 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [150]  [2501/2502]  eta: 0:00:00  lr: 0.001927  min_lr: 0.001927  loss: 3.2589 (3.2999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (inf)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [150] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001927  min_lr: 0.001927  loss: 3.2589 (3.3018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (inf)
Test:  [ 0/50]  eta: 0:02:22  loss: 0.5934 (0.5934)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 2.8406  data: 2.6981  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8627 (0.9131)  acc1: 82.0000 (82.1818)  acc5: 96.0000 (96.2182)  time: 0.4792  data: 0.3557  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9473 (0.9336)  acc1: 80.0000 (81.3714)  acc5: 96.4000 (96.1714)  time: 0.2615  data: 0.1397  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0665 (1.0399)  acc1: 76.8000 (78.9419)  acc5: 93.6000 (94.7097)  time: 0.2904  data: 0.1690  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3072 (1.1094)  acc1: 72.4000 (77.1902)  acc5: 90.4000 (93.8342)  time: 0.2813  data: 0.1608  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2884 (1.1225)  acc1: 72.4000 (76.8240)  acc5: 91.2000 (93.7760)  time: 0.2087  data: 0.0892  max mem: 18975
Test: Total time: 0:00:15 (0.3047 s / it)
* Acc@1 76.884 Acc@5 93.950 loss 1.124
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.88%
Epoch: [151]  [   0/2502]  eta: 1:30:46  lr: 0.001927  min_lr: 0.001927  loss: 3.2148 (3.2148)  weight_decay: 0.0500 (0.0500)  time: 2.1768  data: 1.8958  max mem: 18975
Epoch: [151]  [ 200/2502]  eta: 0:10:24  lr: 0.001925  min_lr: 0.001925  loss: 3.6416 (3.2769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.8827)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [151]  [ 400/2502]  eta: 0:09:15  lr: 0.001924  min_lr: 0.001924  loss: 3.6713 (3.2630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9392 (0.8790)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [151]  [ 600/2502]  eta: 0:08:18  lr: 0.001922  min_lr: 0.001922  loss: 3.4682 (3.2947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8742 (0.8850)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [151]  [ 800/2502]  eta: 0:07:23  lr: 0.001921  min_lr: 0.001921  loss: 3.1678 (3.3057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9051 (0.8882)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [151]  [1000/2502]  eta: 0:06:30  lr: 0.001919  min_lr: 0.001919  loss: 3.3144 (3.2967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8897 (0.8907)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [151]  [1200/2502]  eta: 0:05:37  lr: 0.001917  min_lr: 0.001917  loss: 3.0901 (3.2944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8653 (0.8931)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [151]  [1400/2502]  eta: 0:04:45  lr: 0.001916  min_lr: 0.001916  loss: 3.1610 (3.2913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8907 (0.8931)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [151]  [1600/2502]  eta: 0:03:53  lr: 0.001914  min_lr: 0.001914  loss: 3.8271 (3.2912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9208 (0.8956)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [151]  [1800/2502]  eta: 0:03:01  lr: 0.001913  min_lr: 0.001913  loss: 3.7392 (3.2868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8262 (0.8910)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [151]  [2000/2502]  eta: 0:02:09  lr: 0.001911  min_lr: 0.001911  loss: 3.6564 (3.2939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8712 (0.8907)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [151]  [2200/2502]  eta: 0:01:18  lr: 0.001910  min_lr: 0.001910  loss: 3.6338 (3.2975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8797 (0.8908)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [151]  [2400/2502]  eta: 0:00:26  lr: 0.001908  min_lr: 0.001908  loss: 3.1633 (3.2955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8591 (0.8892)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [151]  [2501/2502]  eta: 0:00:00  lr: 0.001907  min_lr: 0.001907  loss: 3.6451 (3.2993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9385 (0.8906)  time: 0.2307  data: 0.0008  max mem: 18975
Epoch: [151] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.001907  min_lr: 0.001907  loss: 3.6451 (3.3061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9385 (0.8906)
Test:  [ 0/50]  eta: 0:02:49  loss: 0.6374 (0.6374)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.3879  data: 3.2502  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.8997 (0.9327)  acc1: 81.2000 (81.6727)  acc5: 96.8000 (96.3636)  time: 0.5719  data: 0.4499  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9328 (0.9503)  acc1: 79.2000 (80.8762)  acc5: 96.8000 (96.5333)  time: 0.2918  data: 0.1718  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0880 (1.0595)  acc1: 78.4000 (78.9806)  acc5: 94.4000 (95.0710)  time: 0.2740  data: 0.1547  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3461 (1.1369)  acc1: 74.8000 (77.3171)  acc5: 91.2000 (94.1171)  time: 0.2155  data: 0.0965  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3394 (1.1523)  acc1: 72.0000 (76.7280)  acc5: 91.2000 (93.9600)  time: 0.2152  data: 0.0963  max mem: 18975
Test: Total time: 0:00:14 (0.2942 s / it)
* Acc@1 76.708 Acc@5 93.932 loss 1.148
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.88%
Epoch: [152]  [   0/2502]  eta: 1:31:17  lr: 0.001907  min_lr: 0.001907  loss: 2.6013 (2.6013)  weight_decay: 0.0500 (0.0500)  time: 2.1893  data: 1.8575  max mem: 18975
Epoch: [152]  [ 200/2502]  eta: 0:10:19  lr: 0.001906  min_lr: 0.001906  loss: 3.5029 (3.2359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8997 (0.9445)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [152]  [ 400/2502]  eta: 0:09:14  lr: 0.001904  min_lr: 0.001904  loss: 3.3326 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8596 (0.9128)  time: 0.2646  data: 0.0004  max mem: 18975
Epoch: [152]  [ 600/2502]  eta: 0:08:17  lr: 0.001903  min_lr: 0.001903  loss: 3.2349 (3.2605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.9142)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [152]  [ 800/2502]  eta: 0:07:23  lr: 0.001901  min_lr: 0.001901  loss: 3.5393 (3.2598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8510 (0.9000)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [152]  [1000/2502]  eta: 0:06:30  lr: 0.001900  min_lr: 0.001900  loss: 3.3903 (3.2599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8967)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [152]  [1200/2502]  eta: 0:05:37  lr: 0.001898  min_lr: 0.001898  loss: 3.3492 (3.2672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.9000)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [152]  [1400/2502]  eta: 0:04:45  lr: 0.001896  min_lr: 0.001896  loss: 2.9246 (3.2770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9295 (0.8964)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [152]  [1600/2502]  eta: 0:03:53  lr: 0.001895  min_lr: 0.001895  loss: 3.0686 (3.2709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8450 (0.8953)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [152]  [1800/2502]  eta: 0:03:01  lr: 0.001893  min_lr: 0.001893  loss: 3.2988 (3.2723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9084 (0.8961)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [152]  [2000/2502]  eta: 0:02:09  lr: 0.001892  min_lr: 0.001892  loss: 3.4613 (3.2748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8863 (0.8966)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [152]  [2200/2502]  eta: 0:01:18  lr: 0.001890  min_lr: 0.001890  loss: 3.3354 (3.2746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8442 (0.8987)  time: 0.2628  data: 0.0003  max mem: 18975
Epoch: [152]  [2400/2502]  eta: 0:00:26  lr: 0.001889  min_lr: 0.001889  loss: 3.4444 (3.2830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8156 (0.8961)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [152]  [2501/2502]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 2.5941 (3.2822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8945)  time: 0.2304  data: 0.0007  max mem: 18975
Epoch: [152] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 2.5941 (3.2857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8945)
Test:  [ 0/50]  eta: 0:02:09  loss: 0.5593 (0.5593)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 2.5980  data: 2.4639  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.8608 (0.8627)  acc1: 83.2000 (82.9091)  acc5: 96.8000 (96.5455)  time: 0.5036  data: 0.3831  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8608 (0.8860)  acc1: 81.2000 (81.7714)  acc5: 96.8000 (96.3810)  time: 0.2883  data: 0.1680  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0398 (0.9995)  acc1: 76.0000 (79.0581)  acc5: 94.0000 (94.9032)  time: 0.2425  data: 0.1223  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2720 (1.0741)  acc1: 72.4000 (77.2000)  acc5: 90.8000 (94.0585)  time: 0.2221  data: 0.1026  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2692 (1.0884)  acc1: 72.8000 (76.7440)  acc5: 90.8000 (93.8640)  time: 0.2102  data: 0.0908  max mem: 18975
Test: Total time: 0:00:14 (0.2891 s / it)
* Acc@1 77.066 Acc@5 94.052 loss 1.087
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.07%
Epoch: [153]  [   0/2502]  eta: 1:34:20  lr: 0.001888  min_lr: 0.001888  loss: 3.7119 (3.7119)  weight_decay: 0.0500 (0.0500)  time: 2.2625  data: 2.0015  max mem: 18975
Epoch: [153]  [ 200/2502]  eta: 0:10:18  lr: 0.001886  min_lr: 0.001886  loss: 3.5153 (3.3211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8192 (0.8457)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [153]  [ 400/2502]  eta: 0:09:11  lr: 0.001885  min_lr: 0.001885  loss: 3.5469 (3.3069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8643 (0.8991)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [153]  [ 600/2502]  eta: 0:08:16  lr: 0.001883  min_lr: 0.001883  loss: 3.2808 (3.2946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9205 (0.8964)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [153]  [ 800/2502]  eta: 0:07:22  lr: 0.001882  min_lr: 0.001882  loss: 3.1993 (3.2903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8577 (0.9000)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [153]  [1000/2502]  eta: 0:06:29  lr: 0.001880  min_lr: 0.001880  loss: 3.6210 (3.2966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8741 (0.8982)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [153]  [1200/2502]  eta: 0:05:37  lr: 0.001878  min_lr: 0.001878  loss: 3.4923 (3.3012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8746 (0.9040)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [153]  [1400/2502]  eta: 0:04:45  lr: 0.001877  min_lr: 0.001877  loss: 3.6776 (3.3283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8646 (0.9035)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [153]  [1600/2502]  eta: 0:03:53  lr: 0.001875  min_lr: 0.001875  loss: 2.6650 (3.3212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9238 (0.9089)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [153]  [1800/2502]  eta: 0:03:01  lr: 0.001874  min_lr: 0.001874  loss: 3.4336 (3.3170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8530 (0.9035)  time: 0.2638  data: 0.0004  max mem: 18975
Epoch: [153]  [2000/2502]  eta: 0:02:09  lr: 0.001872  min_lr: 0.001872  loss: 3.5706 (3.3225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8131 (0.9045)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [153]  [2200/2502]  eta: 0:01:17  lr: 0.001871  min_lr: 0.001871  loss: 3.5890 (3.3230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8614 (0.9015)  time: 0.2651  data: 0.0004  max mem: 18975
Epoch: [153]  [2400/2502]  eta: 0:00:26  lr: 0.001869  min_lr: 0.001869  loss: 3.5668 (3.3252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8804 (0.8994)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [153]  [2501/2502]  eta: 0:00:00  lr: 0.001868  min_lr: 0.001868  loss: 3.0049 (3.3256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8473 (0.8983)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [153] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.001868  min_lr: 0.001868  loss: 3.0049 (3.3051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8473 (0.8983)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.5561 (0.5561)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 2.7013  data: 2.5570  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8284 (0.8627)  acc1: 80.8000 (82.0000)  acc5: 97.2000 (96.5818)  time: 0.4858  data: 0.3626  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8782 (0.8831)  acc1: 80.0000 (81.3524)  acc5: 96.8000 (96.5143)  time: 0.2596  data: 0.1395  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0427 (0.9964)  acc1: 78.4000 (79.2774)  acc5: 94.8000 (95.0323)  time: 0.2642  data: 0.1450  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3000 (1.0736)  acc1: 71.6000 (77.2293)  acc5: 91.6000 (94.1561)  time: 0.2621  data: 0.1430  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2751 (1.0883)  acc1: 72.0000 (76.7120)  acc5: 91.6000 (93.9840)  time: 0.2097  data: 0.0908  max mem: 18975
Test: Total time: 0:00:14 (0.2958 s / it)
* Acc@1 76.844 Acc@5 94.024 loss 1.086
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 77.07%
Epoch: [154]  [   0/2502]  eta: 1:33:57  lr: 0.001868  min_lr: 0.001868  loss: 3.4316 (3.4316)  weight_decay: 0.0500 (0.0500)  time: 2.2532  data: 1.9687  max mem: 18975
Epoch: [154]  [ 200/2502]  eta: 0:10:19  lr: 0.001867  min_lr: 0.001867  loss: 3.3111 (3.2353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (0.8849)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [154]  [ 400/2502]  eta: 0:09:13  lr: 0.001865  min_lr: 0.001865  loss: 3.1906 (3.2227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9224 (0.9102)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [154]  [ 600/2502]  eta: 0:08:16  lr: 0.001863  min_lr: 0.001863  loss: 3.5256 (3.2323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (0.9134)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [154]  [ 800/2502]  eta: 0:07:22  lr: 0.001862  min_lr: 0.001862  loss: 2.9524 (3.2206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8025 (0.9024)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [154]  [1000/2502]  eta: 0:06:29  lr: 0.001860  min_lr: 0.001860  loss: 2.8561 (3.2371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9257 (0.8999)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [154]  [1200/2502]  eta: 0:05:37  lr: 0.001859  min_lr: 0.001859  loss: 3.0288 (3.2393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8943 (0.9048)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [154]  [1400/2502]  eta: 0:04:45  lr: 0.001857  min_lr: 0.001857  loss: 3.7744 (3.2466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8939 (0.9047)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [154]  [1600/2502]  eta: 0:03:53  lr: 0.001856  min_lr: 0.001856  loss: 3.8570 (3.2621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8551 (0.9015)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [154]  [1800/2502]  eta: 0:03:01  lr: 0.001854  min_lr: 0.001854  loss: 3.3226 (3.2619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8447 (0.9047)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [154]  [2000/2502]  eta: 0:02:09  lr: 0.001853  min_lr: 0.001853  loss: 3.7504 (3.2667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9071 (0.9067)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [154]  [2200/2502]  eta: 0:01:17  lr: 0.001851  min_lr: 0.001851  loss: 3.1902 (3.2696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8474 (0.9048)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [154]  [2400/2502]  eta: 0:00:26  lr: 0.001849  min_lr: 0.001849  loss: 3.2643 (3.2744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8346 (0.9036)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [154]  [2501/2502]  eta: 0:00:00  lr: 0.001849  min_lr: 0.001849  loss: 3.5467 (3.2775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9372 (0.9050)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [154] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001849  min_lr: 0.001849  loss: 3.5467 (3.2899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9372 (0.9050)
Test:  [ 0/50]  eta: 0:03:01  loss: 0.5799 (0.5799)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 3.6212  data: 3.4827  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9253 (0.9241)  acc1: 82.4000 (82.4000)  acc5: 97.6000 (96.6546)  time: 0.4977  data: 0.3761  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9089 (0.9245)  acc1: 80.0000 (81.7524)  acc5: 96.4000 (96.5714)  time: 0.2075  data: 0.0879  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1189 (1.0459)  acc1: 76.8000 (79.3161)  acc5: 92.8000 (95.0581)  time: 0.2248  data: 0.1055  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3914 (1.1303)  acc1: 71.2000 (77.2098)  acc5: 90.8000 (94.1463)  time: 0.2596  data: 0.1405  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3683 (1.1531)  acc1: 70.4000 (76.3840)  acc5: 91.2000 (93.9200)  time: 0.2314  data: 0.1126  max mem: 18975
Test: Total time: 0:00:14 (0.2923 s / it)
* Acc@1 76.598 Acc@5 93.774 loss 1.161
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 77.07%
Epoch: [155]  [   0/2502]  eta: 1:33:32  lr: 0.001849  min_lr: 0.001849  loss: 3.7742 (3.7742)  weight_decay: 0.0500 (0.0500)  time: 2.2433  data: 1.9475  max mem: 18975
Epoch: [155]  [ 200/2502]  eta: 0:10:19  lr: 0.001847  min_lr: 0.001847  loss: 3.6157 (3.2568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9459 (0.9308)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [155]  [ 400/2502]  eta: 0:09:15  lr: 0.001845  min_lr: 0.001845  loss: 3.5101 (3.2753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8859 (0.9185)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [155]  [ 600/2502]  eta: 0:08:18  lr: 0.001844  min_lr: 0.001844  loss: 2.7996 (3.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8805 (0.9230)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [155]  [ 800/2502]  eta: 0:07:23  lr: 0.001842  min_lr: 0.001842  loss: 3.2274 (3.2715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8617 (0.9140)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [155]  [1000/2502]  eta: 0:06:30  lr: 0.001841  min_lr: 0.001841  loss: 3.2906 (3.2761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8751 (0.9080)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [155]  [1200/2502]  eta: 0:05:38  lr: 0.001839  min_lr: 0.001839  loss: 3.6168 (3.2808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8541 (0.9071)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [155]  [1400/2502]  eta: 0:04:45  lr: 0.001838  min_lr: 0.001838  loss: 3.1300 (3.2789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.9060)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [155]  [1600/2502]  eta: 0:03:53  lr: 0.001836  min_lr: 0.001836  loss: 3.5408 (3.2850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0392 (0.9102)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [155]  [1800/2502]  eta: 0:03:01  lr: 0.001834  min_lr: 0.001834  loss: 3.3721 (3.2960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9387 (0.9110)  time: 0.2581  data: 0.0003  max mem: 18975
Epoch: [155]  [2000/2502]  eta: 0:02:09  lr: 0.001833  min_lr: 0.001833  loss: 3.4932 (3.2981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9173 (0.9122)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [155]  [2200/2502]  eta: 0:01:18  lr: 0.001831  min_lr: 0.001831  loss: 3.2649 (3.2952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8978 (0.9111)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [155]  [2400/2502]  eta: 0:00:26  lr: 0.001830  min_lr: 0.001830  loss: 3.3917 (3.2926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8708 (0.9066)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [155]  [2501/2502]  eta: 0:00:00  lr: 0.001829  min_lr: 0.001829  loss: 3.7167 (3.2926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8652 (0.9065)  time: 0.2311  data: 0.0008  max mem: 18975
Epoch: [155] Total time: 0:10:46 (0.2586 s / it)
Averaged stats: lr: 0.001829  min_lr: 0.001829  loss: 3.7167 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8652 (0.9065)
Test:  [ 0/50]  eta: 0:03:20  loss: 0.6530 (0.6530)  acc1: 92.4000 (92.4000)  acc5: 97.6000 (97.6000)  time: 4.0029  data: 3.8691  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9197 (0.9464)  acc1: 82.8000 (83.0182)  acc5: 96.8000 (96.3636)  time: 0.5261  data: 0.4054  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9278 (0.9638)  acc1: 81.6000 (82.0952)  acc5: 96.4000 (96.4191)  time: 0.1994  data: 0.0800  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1877 (1.0836)  acc1: 77.2000 (79.5484)  acc5: 94.0000 (94.9677)  time: 0.2421  data: 0.1229  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3625 (1.1502)  acc1: 72.0000 (77.6878)  acc5: 91.6000 (94.1854)  time: 0.2424  data: 0.1233  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3202 (1.1677)  acc1: 72.0000 (77.1040)  acc5: 91.6000 (93.9360)  time: 0.1972  data: 0.0782  max mem: 18975
Test: Total time: 0:00:14 (0.2910 s / it)
* Acc@1 77.040 Acc@5 93.920 loss 1.171
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.07%
Epoch: [156]  [   0/2502]  eta: 1:31:54  lr: 0.001829  min_lr: 0.001829  loss: 4.1135 (4.1135)  weight_decay: 0.0500 (0.0500)  time: 2.2039  data: 1.4192  max mem: 18975
Epoch: [156]  [ 200/2502]  eta: 0:10:21  lr: 0.001827  min_lr: 0.001827  loss: 3.6911 (3.2655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9778 (0.9200)  time: 0.2578  data: 0.0005  max mem: 18975
Epoch: [156]  [ 400/2502]  eta: 0:09:13  lr: 0.001826  min_lr: 0.001826  loss: 2.8180 (3.2260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8682 (0.9155)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [156]  [ 600/2502]  eta: 0:08:16  lr: 0.001824  min_lr: 0.001824  loss: 3.6240 (3.2370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8305 (0.9144)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [156]  [ 800/2502]  eta: 0:07:23  lr: 0.001823  min_lr: 0.001823  loss: 2.7702 (3.2454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8738 (0.9274)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [156]  [1000/2502]  eta: 0:06:29  lr: 0.001821  min_lr: 0.001821  loss: 3.3108 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9068 (0.9184)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [156]  [1200/2502]  eta: 0:05:37  lr: 0.001820  min_lr: 0.001820  loss: 2.8489 (3.2571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9050 (0.9227)  time: 0.2645  data: 0.0004  max mem: 18975
Epoch: [156]  [1400/2502]  eta: 0:04:45  lr: 0.001818  min_lr: 0.001818  loss: 3.5434 (3.2706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9028 (0.9229)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [156]  [1600/2502]  eta: 0:03:53  lr: 0.001816  min_lr: 0.001816  loss: 3.3972 (3.2755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8868 (0.9220)  time: 0.2588  data: 0.0004  max mem: 18975
Epoch: [156]  [1800/2502]  eta: 0:03:01  lr: 0.001815  min_lr: 0.001815  loss: 3.2192 (3.2846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8855 (0.9202)  time: 0.2580  data: 0.0005  max mem: 18975
Epoch: [156]  [2000/2502]  eta: 0:02:09  lr: 0.001813  min_lr: 0.001813  loss: 3.3677 (3.2807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8120 (0.9156)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [156]  [2200/2502]  eta: 0:01:18  lr: 0.001812  min_lr: 0.001812  loss: 3.3567 (3.2836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8467 (0.9139)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [156]  [2400/2502]  eta: 0:00:26  lr: 0.001810  min_lr: 0.001810  loss: 3.0833 (3.2846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8902 (0.9170)  time: 0.2657  data: 0.0004  max mem: 18975
Epoch: [156]  [2501/2502]  eta: 0:00:00  lr: 0.001809  min_lr: 0.001809  loss: 3.7685 (3.2885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.9178)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [156] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.001809  min_lr: 0.001809  loss: 3.7685 (3.2698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.9178)
Test:  [ 0/50]  eta: 0:03:23  loss: 0.6801 (0.6801)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 4.0632  data: 3.9275  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9124 (0.9634)  acc1: 84.8000 (83.1636)  acc5: 97.2000 (96.4727)  time: 0.5287  data: 0.4069  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9705 (0.9771)  acc1: 81.6000 (82.4952)  acc5: 96.4000 (96.2857)  time: 0.1819  data: 0.0619  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1966 (1.0920)  acc1: 78.0000 (79.7290)  acc5: 93.6000 (94.9161)  time: 0.2431  data: 0.1230  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3761 (1.1673)  acc1: 72.8000 (77.7854)  acc5: 91.2000 (94.1073)  time: 0.2930  data: 0.1704  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3766 (1.1811)  acc1: 72.0000 (77.1360)  acc5: 92.0000 (93.9600)  time: 0.2393  data: 0.1175  max mem: 18975
Test: Total time: 0:00:15 (0.3090 s / it)
* Acc@1 76.990 Acc@5 93.996 loss 1.180
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.07%
Epoch: [157]  [   0/2502]  eta: 1:35:57  lr: 0.001809  min_lr: 0.001809  loss: 3.6893 (3.6893)  weight_decay: 0.0500 (0.0500)  time: 2.3013  data: 1.6357  max mem: 18975
Epoch: [157]  [ 200/2502]  eta: 0:10:19  lr: 0.001808  min_lr: 0.001808  loss: 3.6349 (3.3013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8612 (0.9217)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [157]  [ 400/2502]  eta: 0:09:13  lr: 0.001806  min_lr: 0.001806  loss: 3.4367 (3.3072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8786 (0.9165)  time: 0.2631  data: 0.0004  max mem: 18975
Epoch: [157]  [ 600/2502]  eta: 0:08:16  lr: 0.001805  min_lr: 0.001805  loss: 3.5257 (3.3188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9025 (0.9117)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [157]  [ 800/2502]  eta: 0:07:22  lr: 0.001803  min_lr: 0.001803  loss: 3.1202 (3.3061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8874 (0.9130)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [157]  [1000/2502]  eta: 0:06:29  lr: 0.001802  min_lr: 0.001802  loss: 3.6050 (3.3081)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [157]  [1200/2502]  eta: 0:05:37  lr: 0.001800  min_lr: 0.001800  loss: 2.8489 (3.3001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9370 (nan)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [157]  [1400/2502]  eta: 0:04:45  lr: 0.001798  min_lr: 0.001798  loss: 3.7612 (3.3066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9756 (nan)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [157]  [1600/2502]  eta: 0:03:53  lr: 0.001797  min_lr: 0.001797  loss: 3.2408 (3.2953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9013 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [157]  [1800/2502]  eta: 0:03:01  lr: 0.001795  min_lr: 0.001795  loss: 3.3312 (3.2913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [157]  [2000/2502]  eta: 0:02:09  lr: 0.001794  min_lr: 0.001794  loss: 3.3933 (3.2833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8594 (nan)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [157]  [2200/2502]  eta: 0:01:18  lr: 0.001792  min_lr: 0.001792  loss: 3.4725 (3.2870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8997 (nan)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [157]  [2400/2502]  eta: 0:00:26  lr: 0.001791  min_lr: 0.001791  loss: 3.2356 (3.2901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8581 (nan)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [157]  [2501/2502]  eta: 0:00:00  lr: 0.001790  min_lr: 0.001790  loss: 3.4790 (3.2904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9970 (nan)  time: 0.2316  data: 0.0009  max mem: 18975
Epoch: [157] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.001790  min_lr: 0.001790  loss: 3.4790 (3.2746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9970 (nan)
Test:  [ 0/50]  eta: 0:03:14  loss: 0.5771 (0.5771)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 3.8845  data: 3.7451  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8816 (0.8770)  acc1: 82.0000 (82.5818)  acc5: 96.8000 (96.6909)  time: 0.5447  data: 0.4237  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9051 (0.9031)  acc1: 81.6000 (81.7905)  acc5: 96.8000 (96.6857)  time: 0.2186  data: 0.0994  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0898 (1.0234)  acc1: 76.8000 (79.5484)  acc5: 95.2000 (95.0452)  time: 0.2352  data: 0.1149  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3529 (1.1048)  acc1: 72.4000 (77.5317)  acc5: 90.8000 (94.1659)  time: 0.2311  data: 0.1108  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3451 (1.1198)  acc1: 70.8000 (76.8720)  acc5: 92.0000 (94.0240)  time: 0.2164  data: 0.0974  max mem: 18975
Test: Total time: 0:00:14 (0.2883 s / it)
* Acc@1 77.054 Acc@5 94.080 loss 1.113
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.07%
Epoch: [158]  [   0/2502]  eta: 1:42:39  lr: 0.001790  min_lr: 0.001790  loss: 3.6158 (3.6158)  weight_decay: 0.0500 (0.0500)  time: 2.4620  data: 2.1950  max mem: 18975
Epoch: [158]  [ 200/2502]  eta: 0:10:24  lr: 0.001788  min_lr: 0.001788  loss: 3.5432 (3.3228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8912 (0.9152)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [158]  [ 400/2502]  eta: 0:09:15  lr: 0.001787  min_lr: 0.001787  loss: 3.2154 (3.2775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9257 (0.9237)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [158]  [ 600/2502]  eta: 0:08:17  lr: 0.001785  min_lr: 0.001785  loss: 3.0552 (3.2322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8561 (0.9236)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [158]  [ 800/2502]  eta: 0:07:23  lr: 0.001783  min_lr: 0.001783  loss: 2.7962 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8885 (0.9196)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [158]  [1000/2502]  eta: 0:06:30  lr: 0.001782  min_lr: 0.001782  loss: 3.4138 (3.2567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8263 (0.9151)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [158]  [1200/2502]  eta: 0:05:37  lr: 0.001780  min_lr: 0.001780  loss: 3.5186 (3.2565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9108 (0.9149)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [158]  [1400/2502]  eta: 0:04:45  lr: 0.001779  min_lr: 0.001779  loss: 3.5496 (3.2555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8896 (0.9183)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [158]  [1600/2502]  eta: 0:03:53  lr: 0.001777  min_lr: 0.001777  loss: 2.9020 (3.2559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (0.9163)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [158]  [1800/2502]  eta: 0:03:01  lr: 0.001776  min_lr: 0.001776  loss: 3.7926 (3.2703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8714 (0.9152)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [158]  [2000/2502]  eta: 0:02:09  lr: 0.001774  min_lr: 0.001774  loss: 3.4671 (3.2711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9512 (0.9141)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [158]  [2200/2502]  eta: 0:01:18  lr: 0.001772  min_lr: 0.001772  loss: 3.1200 (3.2727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8452 (0.9140)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [158]  [2400/2502]  eta: 0:00:26  lr: 0.001771  min_lr: 0.001771  loss: 3.2609 (3.2697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (0.9147)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [158]  [2501/2502]  eta: 0:00:00  lr: 0.001770  min_lr: 0.001770  loss: 2.9227 (3.2655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8855 (0.9173)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [158] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.001770  min_lr: 0.001770  loss: 2.9227 (3.2738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8855 (0.9173)
Test:  [ 0/50]  eta: 0:02:39  loss: 0.6101 (0.6101)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 3.1847  data: 3.0413  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.8367 (0.8940)  acc1: 82.4000 (81.9636)  acc5: 96.8000 (96.5818)  time: 0.4654  data: 0.3415  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8736 (0.9039)  acc1: 81.6000 (81.7905)  acc5: 96.4000 (96.4762)  time: 0.2291  data: 0.1078  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0895 (1.0141)  acc1: 77.6000 (79.0323)  acc5: 93.6000 (94.9936)  time: 0.2662  data: 0.1449  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2905 (1.0823)  acc1: 73.6000 (77.4927)  acc5: 91.2000 (94.1463)  time: 0.2585  data: 0.1380  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2565 (1.0879)  acc1: 73.6000 (77.2480)  acc5: 92.0000 (94.0960)  time: 0.1950  data: 0.0758  max mem: 18975
Test: Total time: 0:00:14 (0.2853 s / it)
* Acc@1 77.198 Acc@5 94.150 loss 1.085
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.20%
Epoch: [159]  [   0/2502]  eta: 1:14:52  lr: 0.001770  min_lr: 0.001770  loss: 3.8323 (3.8323)  weight_decay: 0.0500 (0.0500)  time: 1.7955  data: 1.5215  max mem: 18975
Epoch: [159]  [ 200/2502]  eta: 0:10:15  lr: 0.001769  min_lr: 0.001769  loss: 3.6153 (3.2788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8670 (0.8626)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [159]  [ 400/2502]  eta: 0:09:11  lr: 0.001767  min_lr: 0.001767  loss: 3.3807 (3.3122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (0.8993)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [159]  [ 600/2502]  eta: 0:08:15  lr: 0.001765  min_lr: 0.001765  loss: 3.4303 (3.2992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.9020)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [159]  [ 800/2502]  eta: 0:07:21  lr: 0.001764  min_lr: 0.001764  loss: 3.1861 (3.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8836 (0.9080)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [159]  [1000/2502]  eta: 0:06:28  lr: 0.001762  min_lr: 0.001762  loss: 2.6848 (3.2867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8968 (0.9048)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [159]  [1200/2502]  eta: 0:05:36  lr: 0.001761  min_lr: 0.001761  loss: 3.6823 (3.2930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9155 (0.9063)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [159]  [1400/2502]  eta: 0:04:44  lr: 0.001759  min_lr: 0.001759  loss: 3.5176 (3.2981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8637 (0.9040)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [159]  [1600/2502]  eta: 0:03:52  lr: 0.001758  min_lr: 0.001758  loss: 3.2243 (3.2937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0005 (0.9115)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [159]  [1800/2502]  eta: 0:03:01  lr: 0.001756  min_lr: 0.001756  loss: 3.1824 (3.2925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8379 (0.9096)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [159]  [2000/2502]  eta: 0:02:09  lr: 0.001754  min_lr: 0.001754  loss: 3.4885 (3.2888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9363 (0.9102)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [159]  [2200/2502]  eta: 0:01:17  lr: 0.001753  min_lr: 0.001753  loss: 2.7481 (3.2917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8338 (0.9068)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [159]  [2400/2502]  eta: 0:00:26  lr: 0.001751  min_lr: 0.001751  loss: 2.9103 (3.2858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8586 (0.9040)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [159]  [2501/2502]  eta: 0:00:00  lr: 0.001751  min_lr: 0.001751  loss: 3.2701 (3.2812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9140 (0.9061)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [159] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.001751  min_lr: 0.001751  loss: 3.2701 (3.2696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9140 (0.9061)
Test:  [ 0/50]  eta: 0:02:59  loss: 0.5398 (0.5398)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 3.5934  data: 3.4512  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.7858 (0.8403)  acc1: 85.2000 (83.0182)  acc5: 96.4000 (96.8727)  time: 0.5718  data: 0.4506  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8179 (0.8612)  acc1: 81.6000 (82.5143)  acc5: 96.4000 (96.7238)  time: 0.2476  data: 0.1281  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0626 (0.9816)  acc1: 76.8000 (79.9226)  acc5: 93.2000 (95.1097)  time: 0.2376  data: 0.1181  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3049 (1.0516)  acc1: 73.2000 (78.0195)  acc5: 91.6000 (94.2927)  time: 0.2478  data: 0.1288  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2830 (1.0706)  acc1: 72.8000 (77.4320)  acc5: 91.6000 (94.0320)  time: 0.2021  data: 0.0832  max mem: 18975
Test: Total time: 0:00:15 (0.3003 s / it)
* Acc@1 77.240 Acc@5 94.096 loss 1.072
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.24%
Epoch: [160]  [   0/2502]  eta: 1:25:12  lr: 0.001751  min_lr: 0.001751  loss: 2.2623 (2.2623)  weight_decay: 0.0500 (0.0500)  time: 2.0433  data: 1.7733  max mem: 18975
Epoch: [160]  [ 200/2502]  eta: 0:10:18  lr: 0.001749  min_lr: 0.001749  loss: 2.8941 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8815 (0.9240)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [160]  [ 400/2502]  eta: 0:09:12  lr: 0.001747  min_lr: 0.001747  loss: 3.4433 (3.2273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9124 (0.9380)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [160]  [ 600/2502]  eta: 0:08:16  lr: 0.001746  min_lr: 0.001746  loss: 3.6311 (3.2479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9471 (0.9261)  time: 0.2575  data: 0.0005  max mem: 18975
Epoch: [160]  [ 800/2502]  eta: 0:07:22  lr: 0.001744  min_lr: 0.001744  loss: 3.4818 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8970 (0.9224)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [160]  [1000/2502]  eta: 0:06:29  lr: 0.001743  min_lr: 0.001743  loss: 3.4295 (3.2576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9191 (0.9213)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [160]  [1200/2502]  eta: 0:05:37  lr: 0.001741  min_lr: 0.001741  loss: 3.3658 (3.2695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8395 (0.9205)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [160]  [1400/2502]  eta: 0:04:45  lr: 0.001740  min_lr: 0.001740  loss: 3.4129 (3.2746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.9200)  time: 0.2643  data: 0.0004  max mem: 18975
Epoch: [160]  [1600/2502]  eta: 0:03:53  lr: 0.001738  min_lr: 0.001738  loss: 2.8917 (3.2735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9060 (0.9183)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [160]  [1800/2502]  eta: 0:03:01  lr: 0.001736  min_lr: 0.001736  loss: 3.8610 (3.2834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8804 (0.9217)  time: 0.2589  data: 0.0004  max mem: 18975
Epoch: [160]  [2000/2502]  eta: 0:02:09  lr: 0.001735  min_lr: 0.001735  loss: 3.5274 (3.2771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8742 (0.9196)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [160]  [2200/2502]  eta: 0:01:17  lr: 0.001733  min_lr: 0.001733  loss: 3.3982 (3.2686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (0.9206)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [160]  [2400/2502]  eta: 0:00:26  lr: 0.001732  min_lr: 0.001732  loss: 3.3331 (3.2677)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [160]  [2501/2502]  eta: 0:00:00  lr: 0.001731  min_lr: 0.001731  loss: 2.7546 (3.2715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9158 (nan)  time: 0.2313  data: 0.0007  max mem: 18975
Epoch: [160] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.001731  min_lr: 0.001731  loss: 2.7546 (3.2660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9158 (nan)
Test:  [ 0/50]  eta: 0:03:23  loss: 0.5348 (0.5348)  acc1: 93.2000 (93.2000)  acc5: 98.8000 (98.8000)  time: 4.0632  data: 3.9207  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8229 (0.8497)  acc1: 84.0000 (83.5636)  acc5: 96.4000 (96.4000)  time: 0.5351  data: 0.4130  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8683 (0.8947)  acc1: 80.8000 (82.1143)  acc5: 96.4000 (96.3810)  time: 0.2123  data: 0.0921  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0721 (1.0143)  acc1: 76.0000 (79.6129)  acc5: 93.2000 (94.6710)  time: 0.2466  data: 0.1254  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2756 (1.0776)  acc1: 72.8000 (77.8634)  acc5: 91.2000 (93.9415)  time: 0.2368  data: 0.1137  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2491 (1.0941)  acc1: 72.4000 (77.2800)  acc5: 91.6000 (93.8560)  time: 0.2172  data: 0.0953  max mem: 18975
Test: Total time: 0:00:14 (0.2963 s / it)
* Acc@1 77.028 Acc@5 94.012 loss 1.098
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.24%
Epoch: [161]  [   0/2502]  eta: 1:28:27  lr: 0.001731  min_lr: 0.001731  loss: 2.4629 (2.4629)  weight_decay: 0.0500 (0.0500)  time: 2.1213  data: 1.5774  max mem: 18975
Epoch: [161]  [ 200/2502]  eta: 0:10:19  lr: 0.001729  min_lr: 0.001729  loss: 2.9223 (3.1906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8541 (0.9107)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [161]  [ 400/2502]  eta: 0:09:13  lr: 0.001728  min_lr: 0.001728  loss: 3.4951 (3.2311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9530 (0.9127)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [161]  [ 600/2502]  eta: 0:08:16  lr: 0.001726  min_lr: 0.001726  loss: 3.5076 (3.2344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0073 (0.9078)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [161]  [ 800/2502]  eta: 0:07:23  lr: 0.001725  min_lr: 0.001725  loss: 3.0969 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9798 (0.9139)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [161]  [1000/2502]  eta: 0:06:30  lr: 0.001723  min_lr: 0.001723  loss: 3.2180 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9492 (0.9141)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [161]  [1200/2502]  eta: 0:05:37  lr: 0.001721  min_lr: 0.001721  loss: 3.6884 (3.2643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.9053)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [161]  [1400/2502]  eta: 0:04:45  lr: 0.001720  min_lr: 0.001720  loss: 3.6113 (3.2578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9323 (0.9076)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [161]  [1600/2502]  eta: 0:03:53  lr: 0.001718  min_lr: 0.001718  loss: 3.5709 (3.2423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.9087)  time: 0.2605  data: 0.0003  max mem: 18975
Epoch: [161]  [1800/2502]  eta: 0:03:01  lr: 0.001717  min_lr: 0.001717  loss: 3.6464 (3.2430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (0.9151)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [161]  [2000/2502]  eta: 0:02:09  lr: 0.001715  min_lr: 0.001715  loss: 3.6813 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9366 (0.9159)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [161]  [2200/2502]  eta: 0:01:17  lr: 0.001714  min_lr: 0.001714  loss: 3.2222 (3.2374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8990 (0.9166)  time: 0.2615  data: 0.0004  max mem: 18975
Epoch: [161]  [2400/2502]  eta: 0:00:26  lr: 0.001712  min_lr: 0.001712  loss: 3.0222 (3.2380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9242 (0.9199)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [161]  [2501/2502]  eta: 0:00:00  lr: 0.001711  min_lr: 0.001711  loss: 2.8225 (3.2389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9071 (0.9209)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [161] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001711  min_lr: 0.001711  loss: 2.8225 (3.2567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9071 (0.9209)
Test:  [ 0/50]  eta: 0:02:45  loss: 0.5401 (0.5401)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 3.3058  data: 3.1699  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8219 (0.8639)  acc1: 84.8000 (83.1636)  acc5: 96.0000 (96.4727)  time: 0.5340  data: 0.4131  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8468 (0.8864)  acc1: 80.8000 (81.7333)  acc5: 96.0000 (96.3810)  time: 0.2357  data: 0.1160  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0008 (0.9953)  acc1: 77.6000 (79.3936)  acc5: 94.0000 (95.0968)  time: 0.2536  data: 0.1341  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2882 (1.0745)  acc1: 71.6000 (77.2781)  acc5: 91.2000 (94.1561)  time: 0.2640  data: 0.1434  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2614 (1.0831)  acc1: 72.0000 (77.0080)  acc5: 92.0000 (94.0800)  time: 0.2053  data: 0.0848  max mem: 18975
Test: Total time: 0:00:15 (0.3007 s / it)
* Acc@1 77.158 Acc@5 94.166 loss 1.078
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.24%
Epoch: [162]  [   0/2502]  eta: 1:27:24  lr: 0.001711  min_lr: 0.001711  loss: 2.2590 (2.2590)  weight_decay: 0.0500 (0.0500)  time: 2.0961  data: 1.5471  max mem: 18975
Epoch: [162]  [ 200/2502]  eta: 0:10:20  lr: 0.001710  min_lr: 0.001710  loss: 2.7596 (3.1931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9458 (0.9046)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [162]  [ 400/2502]  eta: 0:09:14  lr: 0.001708  min_lr: 0.001708  loss: 3.2907 (3.2353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8264 (0.8982)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [162]  [ 600/2502]  eta: 0:08:17  lr: 0.001707  min_lr: 0.001707  loss: 3.5558 (3.2559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9607 (0.9075)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [162]  [ 800/2502]  eta: 0:07:23  lr: 0.001705  min_lr: 0.001705  loss: 2.9033 (3.2631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (0.9151)  time: 0.2579  data: 0.0003  max mem: 18975
Epoch: [162]  [1000/2502]  eta: 0:06:30  lr: 0.001703  min_lr: 0.001703  loss: 3.6038 (3.2665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9742 (0.9289)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [162]  [1200/2502]  eta: 0:05:37  lr: 0.001702  min_lr: 0.001702  loss: 3.1437 (3.2515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8952 (0.9310)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [162]  [1400/2502]  eta: 0:04:45  lr: 0.001700  min_lr: 0.001700  loss: 3.2301 (3.2451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9284 (0.9315)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [162]  [1600/2502]  eta: 0:03:53  lr: 0.001699  min_lr: 0.001699  loss: 3.1532 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9219 (0.9304)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [162]  [1800/2502]  eta: 0:03:01  lr: 0.001697  min_lr: 0.001697  loss: 3.5347 (3.2477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8746 (0.9264)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [162]  [2000/2502]  eta: 0:02:09  lr: 0.001696  min_lr: 0.001696  loss: 2.9944 (3.2480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7957 (0.9283)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [162]  [2200/2502]  eta: 0:01:18  lr: 0.001694  min_lr: 0.001694  loss: 2.7960 (3.2470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9180 (0.9262)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [162]  [2400/2502]  eta: 0:00:26  lr: 0.001692  min_lr: 0.001692  loss: 2.8457 (3.2490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9609 (0.9284)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [162]  [2501/2502]  eta: 0:00:00  lr: 0.001692  min_lr: 0.001692  loss: 3.0793 (3.2491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8463 (0.9268)  time: 0.2316  data: 0.0009  max mem: 18975
Epoch: [162] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.001692  min_lr: 0.001692  loss: 3.0793 (3.2549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8463 (0.9268)
Test:  [ 0/50]  eta: 0:03:11  loss: 0.6224 (0.6224)  acc1: 92.0000 (92.0000)  acc5: 98.4000 (98.4000)  time: 3.8290  data: 3.6847  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.8555 (0.8976)  acc1: 83.6000 (83.2727)  acc5: 97.2000 (96.8727)  time: 0.6160  data: 0.4921  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.8934 (0.9370)  acc1: 81.2000 (82.3238)  acc5: 97.2000 (96.7429)  time: 0.2894  data: 0.1684  max mem: 18975
Test:  [30/50]  eta: 0:00:08  loss: 1.0978 (1.0342)  acc1: 77.6000 (80.0387)  acc5: 94.0000 (95.1871)  time: 0.2843  data: 0.1610  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2819 (1.1053)  acc1: 72.4000 (77.9024)  acc5: 91.6000 (94.2927)  time: 0.2392  data: 0.1142  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2819 (1.1222)  acc1: 71.6000 (77.2480)  acc5: 92.0000 (94.2080)  time: 0.2357  data: 0.1140  max mem: 18975
Test: Total time: 0:00:15 (0.3118 s / it)
* Acc@1 77.180 Acc@5 94.146 loss 1.125
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.24%
Epoch: [163]  [   0/2502]  eta: 1:16:17  lr: 0.001692  min_lr: 0.001692  loss: 3.8716 (3.8716)  weight_decay: 0.0500 (0.0500)  time: 1.8297  data: 1.4705  max mem: 18975
Epoch: [163]  [ 200/2502]  eta: 0:10:15  lr: 0.001690  min_lr: 0.001690  loss: 3.4548 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9144 (0.9193)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [163]  [ 400/2502]  eta: 0:09:11  lr: 0.001688  min_lr: 0.001688  loss: 3.5727 (3.2173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8511 (0.9204)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [163]  [ 600/2502]  eta: 0:08:15  lr: 0.001687  min_lr: 0.001687  loss: 3.5448 (3.2508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9119 (0.9103)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [163]  [ 800/2502]  eta: 0:07:22  lr: 0.001685  min_lr: 0.001685  loss: 3.5066 (3.2613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8452 (0.9226)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [163]  [1000/2502]  eta: 0:06:29  lr: 0.001684  min_lr: 0.001684  loss: 3.6308 (3.2638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9314 (0.9303)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [163]  [1200/2502]  eta: 0:05:36  lr: 0.001682  min_lr: 0.001682  loss: 3.1544 (3.2714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8438 (0.9197)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [163]  [1400/2502]  eta: 0:04:45  lr: 0.001681  min_lr: 0.001681  loss: 3.3102 (3.2643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9378 (0.9275)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [163]  [1600/2502]  eta: 0:03:53  lr: 0.001679  min_lr: 0.001679  loss: 3.3921 (3.2646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9252 (0.9280)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [163]  [1800/2502]  eta: 0:03:01  lr: 0.001678  min_lr: 0.001678  loss: 3.8306 (3.2753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9550 (0.9298)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [163]  [2000/2502]  eta: 0:02:09  lr: 0.001676  min_lr: 0.001676  loss: 3.6389 (3.2683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9822 (0.9281)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [163]  [2200/2502]  eta: 0:01:17  lr: 0.001674  min_lr: 0.001674  loss: 3.0870 (3.2634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9540 (0.9358)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [163]  [2400/2502]  eta: 0:00:26  lr: 0.001673  min_lr: 0.001673  loss: 3.2293 (3.2671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9691 (0.9332)  time: 0.2637  data: 0.0004  max mem: 18975
Epoch: [163]  [2501/2502]  eta: 0:00:00  lr: 0.001672  min_lr: 0.001672  loss: 3.5218 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9839 (0.9335)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [163] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001672  min_lr: 0.001672  loss: 3.5218 (3.2564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9839 (0.9335)
Test:  [ 0/50]  eta: 0:03:04  loss: 0.6427 (0.6427)  acc1: 93.2000 (93.2000)  acc5: 98.4000 (98.4000)  time: 3.6830  data: 3.5447  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.8908 (0.9499)  acc1: 85.2000 (83.4545)  acc5: 96.8000 (96.4727)  time: 0.5534  data: 0.4312  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9001 (0.9506)  acc1: 81.2000 (82.6286)  acc5: 96.8000 (96.5333)  time: 0.2373  data: 0.1172  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0827 (1.0594)  acc1: 78.0000 (80.1161)  acc5: 94.0000 (95.2129)  time: 0.2276  data: 0.1083  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3454 (1.1296)  acc1: 74.0000 (78.1951)  acc5: 91.6000 (94.3122)  time: 0.2287  data: 0.1097  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3258 (1.1468)  acc1: 71.2000 (77.5040)  acc5: 91.6000 (94.2480)  time: 0.2370  data: 0.1181  max mem: 18975
Test: Total time: 0:00:14 (0.2915 s / it)
* Acc@1 77.206 Acc@5 94.170 loss 1.152
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.24%
Epoch: [164]  [   0/2502]  eta: 1:31:55  lr: 0.001672  min_lr: 0.001672  loss: 4.1435 (4.1435)  weight_decay: 0.0500 (0.0500)  time: 2.2045  data: 1.6637  max mem: 18975
Epoch: [164]  [ 200/2502]  eta: 0:10:21  lr: 0.001670  min_lr: 0.001670  loss: 3.5904 (3.2164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9377 (0.9262)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [164]  [ 400/2502]  eta: 0:09:14  lr: 0.001669  min_lr: 0.001669  loss: 3.2707 (3.2591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9351 (0.9365)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [164]  [ 600/2502]  eta: 0:08:17  lr: 0.001667  min_lr: 0.001667  loss: 2.8732 (3.2768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9551 (0.9559)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [164]  [ 800/2502]  eta: 0:07:22  lr: 0.001666  min_lr: 0.001666  loss: 3.6900 (3.2747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8668 (0.9446)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [164]  [1000/2502]  eta: 0:06:29  lr: 0.001664  min_lr: 0.001664  loss: 3.2624 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9267 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [164]  [1200/2502]  eta: 0:05:37  lr: 0.001663  min_lr: 0.001663  loss: 2.9556 (3.2605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [164]  [1400/2502]  eta: 0:04:45  lr: 0.001661  min_lr: 0.001661  loss: 3.5140 (3.2528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9940 (inf)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [164]  [1600/2502]  eta: 0:03:53  lr: 0.001659  min_lr: 0.001659  loss: 3.3824 (3.2684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9882 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [164]  [1800/2502]  eta: 0:03:01  lr: 0.001658  min_lr: 0.001658  loss: 3.6897 (3.2693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9274 (inf)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [164]  [2000/2502]  eta: 0:02:09  lr: 0.001656  min_lr: 0.001656  loss: 3.1530 (3.2732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8646 (inf)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [164]  [2200/2502]  eta: 0:01:17  lr: 0.001655  min_lr: 0.001655  loss: 2.7461 (3.2693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8747 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [164]  [2400/2502]  eta: 0:00:26  lr: 0.001653  min_lr: 0.001653  loss: 3.2165 (3.2644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9241 (inf)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [164]  [2501/2502]  eta: 0:00:00  lr: 0.001652  min_lr: 0.001652  loss: 3.5455 (3.2659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9520 (inf)  time: 0.2343  data: 0.0007  max mem: 18975
Epoch: [164] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001652  min_lr: 0.001652  loss: 3.5455 (3.2544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9520 (inf)
Test:  [ 0/50]  eta: 0:03:05  loss: 0.6848 (0.6848)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 3.7157  data: 3.5726  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.9368 (1.0048)  acc1: 82.8000 (82.4364)  acc5: 96.8000 (96.4727)  time: 0.5573  data: 0.4341  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9520 (1.0073)  acc1: 80.8000 (81.6762)  acc5: 96.8000 (96.5905)  time: 0.2444  data: 0.1242  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.2290 (1.1164)  acc1: 76.4000 (79.3936)  acc5: 93.6000 (95.0323)  time: 0.2355  data: 0.1164  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3657 (1.1844)  acc1: 74.4000 (77.8049)  acc5: 90.8000 (94.1463)  time: 0.2508  data: 0.1284  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3551 (1.1985)  acc1: 75.6000 (77.2960)  acc5: 91.2000 (93.9600)  time: 0.2354  data: 0.1126  max mem: 18975
Test: Total time: 0:00:15 (0.3052 s / it)
* Acc@1 77.308 Acc@5 94.132 loss 1.204
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.31%
Epoch: [165]  [   0/2502]  eta: 1:33:20  lr: 0.001652  min_lr: 0.001652  loss: 2.6610 (2.6610)  weight_decay: 0.0500 (0.0500)  time: 2.2385  data: 1.9756  max mem: 18975
Epoch: [165]  [ 200/2502]  eta: 0:10:22  lr: 0.001651  min_lr: 0.001651  loss: 3.6439 (3.3243)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8785 (0.9006)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [165]  [ 400/2502]  eta: 0:09:13  lr: 0.001649  min_lr: 0.001649  loss: 2.7735 (3.3027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9270 (0.9210)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [165]  [ 600/2502]  eta: 0:08:16  lr: 0.001648  min_lr: 0.001648  loss: 3.4609 (3.2864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9609 (0.9292)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [165]  [ 800/2502]  eta: 0:07:23  lr: 0.001646  min_lr: 0.001646  loss: 3.4929 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8967 (0.9290)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [165]  [1000/2502]  eta: 0:06:30  lr: 0.001645  min_lr: 0.001645  loss: 3.2844 (3.2579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8825 (0.9298)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [165]  [1200/2502]  eta: 0:05:37  lr: 0.001643  min_lr: 0.001643  loss: 2.6983 (3.2438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9327 (0.9362)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [165]  [1400/2502]  eta: 0:04:45  lr: 0.001641  min_lr: 0.001641  loss: 3.0377 (3.2430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9047 (0.9338)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [165]  [1600/2502]  eta: 0:03:53  lr: 0.001640  min_lr: 0.001640  loss: 3.3608 (3.2468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8876 (0.9417)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [165]  [1800/2502]  eta: 0:03:01  lr: 0.001638  min_lr: 0.001638  loss: 3.2465 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0583 (0.9510)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [165]  [2000/2502]  eta: 0:02:09  lr: 0.001637  min_lr: 0.001637  loss: 3.4229 (3.2570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8398 (0.9450)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [165]  [2200/2502]  eta: 0:01:17  lr: 0.001635  min_lr: 0.001635  loss: 2.9975 (3.2552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9300 (0.9430)  time: 0.2587  data: 0.0003  max mem: 18975
Epoch: [165]  [2400/2502]  eta: 0:00:26  lr: 0.001634  min_lr: 0.001634  loss: 3.3302 (3.2554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9066 (0.9427)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [165]  [2501/2502]  eta: 0:00:00  lr: 0.001633  min_lr: 0.001633  loss: 3.2490 (3.2571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9968 (0.9442)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [165] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001633  min_lr: 0.001633  loss: 3.2490 (3.2466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9968 (0.9442)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.6043 (0.6043)  acc1: 91.6000 (91.6000)  acc5: 98.8000 (98.8000)  time: 2.7138  data: 2.5746  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.8683 (0.8835)  acc1: 84.4000 (83.2000)  acc5: 96.8000 (96.4727)  time: 0.4396  data: 0.3184  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8768 (0.8974)  acc1: 81.2000 (82.4381)  acc5: 96.8000 (96.6095)  time: 0.2280  data: 0.1081  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1685 (1.0234)  acc1: 75.6000 (79.9613)  acc5: 93.6000 (95.0710)  time: 0.2607  data: 0.1406  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3284 (1.0948)  acc1: 73.2000 (78.0878)  acc5: 91.6000 (94.3122)  time: 0.2771  data: 0.1577  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2836 (1.1108)  acc1: 72.0000 (77.4880)  acc5: 92.0000 (94.1840)  time: 0.2252  data: 0.1063  max mem: 18975
Test: Total time: 0:00:14 (0.2907 s / it)
* Acc@1 77.364 Acc@5 94.268 loss 1.111
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.36%
Epoch: [166]  [   0/2502]  eta: 1:44:00  lr: 0.001633  min_lr: 0.001633  loss: 2.2585 (2.2585)  weight_decay: 0.0500 (0.0500)  time: 2.4943  data: 2.2322  max mem: 18975
Epoch: [166]  [ 200/2502]  eta: 0:10:21  lr: 0.001631  min_lr: 0.001631  loss: 3.2669 (3.2033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9127 (0.9409)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [166]  [ 400/2502]  eta: 0:09:14  lr: 0.001630  min_lr: 0.001630  loss: 3.4829 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0766 (0.9804)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [166]  [ 600/2502]  eta: 0:08:17  lr: 0.001628  min_lr: 0.001628  loss: 3.2165 (3.2221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8449 (0.9494)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [166]  [ 800/2502]  eta: 0:07:23  lr: 0.001627  min_lr: 0.001627  loss: 3.2041 (3.2196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8587 (0.9390)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [166]  [1000/2502]  eta: 0:06:30  lr: 0.001625  min_lr: 0.001625  loss: 3.1222 (3.2270)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9375 (0.9443)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [166]  [1200/2502]  eta: 0:05:37  lr: 0.001623  min_lr: 0.001623  loss: 3.4602 (3.2391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9816 (0.9433)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [166]  [1400/2502]  eta: 0:04:45  lr: 0.001622  min_lr: 0.001622  loss: 3.0435 (3.2373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8970 (0.9432)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [166]  [1600/2502]  eta: 0:03:53  lr: 0.001620  min_lr: 0.001620  loss: 3.1473 (3.2431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (0.9424)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [166]  [1800/2502]  eta: 0:03:01  lr: 0.001619  min_lr: 0.001619  loss: 3.7951 (3.2444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8678 (0.9405)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [166]  [2000/2502]  eta: 0:02:09  lr: 0.001617  min_lr: 0.001617  loss: 2.9539 (3.2371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8569 (0.9416)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [166]  [2200/2502]  eta: 0:01:18  lr: 0.001616  min_lr: 0.001616  loss: 2.8981 (3.2372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8515 (0.9406)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [166]  [2400/2502]  eta: 0:00:26  lr: 0.001614  min_lr: 0.001614  loss: 3.6435 (3.2434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9430 (0.9378)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [166]  [2501/2502]  eta: 0:00:00  lr: 0.001613  min_lr: 0.001613  loss: 3.4870 (3.2454)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0208 (0.9409)  time: 0.2313  data: 0.0008  max mem: 18975
Epoch: [166] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001613  min_lr: 0.001613  loss: 3.4870 (3.2342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0208 (0.9409)
Test:  [ 0/50]  eta: 0:03:15  loss: 0.6008 (0.6008)  acc1: 91.2000 (91.2000)  acc5: 98.4000 (98.4000)  time: 3.9023  data: 3.7638  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.8438 (0.8758)  acc1: 82.8000 (83.0182)  acc5: 96.8000 (96.4000)  time: 0.6001  data: 0.4788  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.8882 (0.9009)  acc1: 80.8000 (82.1524)  acc5: 96.4000 (96.4571)  time: 0.2809  data: 0.1615  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1103 (1.0115)  acc1: 76.4000 (79.6516)  acc5: 93.6000 (94.8387)  time: 0.2597  data: 0.1406  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2785 (1.0779)  acc1: 73.6000 (77.9512)  acc5: 91.2000 (94.1171)  time: 0.2018  data: 0.0827  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2353 (1.0903)  acc1: 74.0000 (77.5120)  acc5: 92.0000 (94.0720)  time: 0.2015  data: 0.0826  max mem: 18975
Test: Total time: 0:00:14 (0.2948 s / it)
* Acc@1 77.440 Acc@5 94.170 loss 1.092
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.44%
Epoch: [167]  [   0/2502]  eta: 1:29:27  lr: 0.001613  min_lr: 0.001613  loss: 2.6286 (2.6286)  weight_decay: 0.0500 (0.0500)  time: 2.1452  data: 1.8737  max mem: 18975
Epoch: [167]  [ 200/2502]  eta: 0:10:17  lr: 0.001612  min_lr: 0.001612  loss: 3.2905 (3.2715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9460 (0.9389)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [167]  [ 400/2502]  eta: 0:09:11  lr: 0.001610  min_lr: 0.001610  loss: 3.4956 (3.2673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9602 (0.9580)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [167]  [ 600/2502]  eta: 0:08:16  lr: 0.001609  min_lr: 0.001609  loss: 3.3845 (3.2557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9351 (0.9449)  time: 0.2627  data: 0.0004  max mem: 18975
Epoch: [167]  [ 800/2502]  eta: 0:07:21  lr: 0.001607  min_lr: 0.001607  loss: 3.0730 (3.2595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8808 (0.9396)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [167]  [1000/2502]  eta: 0:06:29  lr: 0.001605  min_lr: 0.001605  loss: 3.1024 (3.2449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8808 (0.9379)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [167]  [1200/2502]  eta: 0:05:36  lr: 0.001604  min_lr: 0.001604  loss: 3.5891 (3.2422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8979 (0.9367)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [167]  [1400/2502]  eta: 0:04:44  lr: 0.001602  min_lr: 0.001602  loss: 3.3527 (3.2368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9060 (0.9384)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [167]  [1600/2502]  eta: 0:03:52  lr: 0.001601  min_lr: 0.001601  loss: 3.6313 (3.2439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0330 (0.9460)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [167]  [1800/2502]  eta: 0:03:01  lr: 0.001599  min_lr: 0.001599  loss: 3.5512 (3.2476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9060 (nan)  time: 0.2641  data: 0.0004  max mem: 18975
Epoch: [167]  [2000/2502]  eta: 0:02:09  lr: 0.001598  min_lr: 0.001598  loss: 3.7034 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9316 (nan)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [167]  [2200/2502]  eta: 0:01:17  lr: 0.001596  min_lr: 0.001596  loss: 3.3299 (3.2485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (nan)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [167]  [2400/2502]  eta: 0:00:26  lr: 0.001594  min_lr: 0.001594  loss: 3.0548 (3.2493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8993 (nan)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [167]  [2501/2502]  eta: 0:00:00  lr: 0.001594  min_lr: 0.001594  loss: 3.2295 (3.2467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8230 (nan)  time: 0.2309  data: 0.0012  max mem: 18975
Epoch: [167] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.001594  min_lr: 0.001594  loss: 3.2295 (3.2414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8230 (nan)
Test:  [ 0/50]  eta: 0:03:12  loss: 0.5637 (0.5637)  acc1: 90.8000 (90.8000)  acc5: 98.0000 (98.0000)  time: 3.8466  data: 3.7070  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.8534 (0.8883)  acc1: 82.0000 (83.1273)  acc5: 96.4000 (96.7273)  time: 0.6155  data: 0.4935  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.9232 (0.9124)  acc1: 80.0000 (82.0000)  acc5: 96.4000 (96.8191)  time: 0.2881  data: 0.1684  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1278 (1.0274)  acc1: 76.4000 (79.7806)  acc5: 94.0000 (95.2645)  time: 0.2542  data: 0.1350  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2987 (1.0969)  acc1: 74.0000 (78.1854)  acc5: 91.2000 (94.5073)  time: 0.1987  data: 0.0789  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2801 (1.1117)  acc1: 74.4000 (77.6320)  acc5: 92.0000 (94.3680)  time: 0.1984  data: 0.0787  max mem: 18975
Test: Total time: 0:00:14 (0.2953 s / it)
* Acc@1 77.716 Acc@5 94.368 loss 1.107
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.72%
Epoch: [168]  [   0/2502]  eta: 1:21:36  lr: 0.001594  min_lr: 0.001594  loss: 3.3552 (3.3552)  weight_decay: 0.0500 (0.0500)  time: 1.9571  data: 1.6755  max mem: 18975
Epoch: [168]  [ 200/2502]  eta: 0:10:16  lr: 0.001592  min_lr: 0.001592  loss: 3.1660 (3.2329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8893 (0.9802)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [168]  [ 400/2502]  eta: 0:09:11  lr: 0.001591  min_lr: 0.001591  loss: 3.4076 (3.2605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8817 (0.9755)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [168]  [ 600/2502]  eta: 0:08:15  lr: 0.001589  min_lr: 0.001589  loss: 3.1318 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9080 (0.9727)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [168]  [ 800/2502]  eta: 0:07:21  lr: 0.001587  min_lr: 0.001587  loss: 3.2534 (3.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8980 (0.9595)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [168]  [1000/2502]  eta: 0:06:29  lr: 0.001586  min_lr: 0.001586  loss: 3.4536 (3.2366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8468 (0.9511)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [168]  [1200/2502]  eta: 0:05:37  lr: 0.001584  min_lr: 0.001584  loss: 3.3585 (3.2403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0344 (0.9662)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [168]  [1400/2502]  eta: 0:04:45  lr: 0.001583  min_lr: 0.001583  loss: 3.1613 (3.2398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9337 (0.9660)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [168]  [1600/2502]  eta: 0:03:53  lr: 0.001581  min_lr: 0.001581  loss: 3.2366 (3.2413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9941 (0.9669)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [168]  [1800/2502]  eta: 0:03:01  lr: 0.001580  min_lr: 0.001580  loss: 3.0235 (3.2363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9099 (0.9596)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [168]  [2000/2502]  eta: 0:02:09  lr: 0.001578  min_lr: 0.001578  loss: 2.8799 (3.2393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9791 (0.9599)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [168]  [2200/2502]  eta: 0:01:17  lr: 0.001576  min_lr: 0.001576  loss: 3.3905 (3.2405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8885 (0.9586)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [168]  [2400/2502]  eta: 0:00:26  lr: 0.001575  min_lr: 0.001575  loss: 3.5374 (3.2340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (0.9584)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [168]  [2501/2502]  eta: 0:00:00  lr: 0.001574  min_lr: 0.001574  loss: 2.9243 (3.2334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9008 (0.9571)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [168] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001574  min_lr: 0.001574  loss: 2.9243 (3.2285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9008 (0.9571)
Test:  [ 0/50]  eta: 0:02:38  loss: 0.5607 (0.5607)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 3.1688  data: 3.0241  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8351 (0.8490)  acc1: 82.4000 (83.1273)  acc5: 96.8000 (96.8000)  time: 0.5491  data: 0.4272  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8743 (0.8682)  acc1: 81.2000 (82.3429)  acc5: 96.8000 (96.8000)  time: 0.2917  data: 0.1714  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 0.9552 (0.9823)  acc1: 77.6000 (79.9097)  acc5: 94.8000 (95.2258)  time: 0.2919  data: 0.1714  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2468 (1.0499)  acc1: 71.6000 (78.1171)  acc5: 92.0000 (94.5073)  time: 0.2271  data: 0.1068  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1977 (1.0649)  acc1: 72.4000 (77.6080)  acc5: 92.8000 (94.3680)  time: 0.2269  data: 0.1066  max mem: 18975
Test: Total time: 0:00:14 (0.2945 s / it)
* Acc@1 77.660 Acc@5 94.338 loss 1.064
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.72%
Epoch: [169]  [   0/2502]  eta: 1:36:46  lr: 0.001574  min_lr: 0.001574  loss: 3.9437 (3.9437)  weight_decay: 0.0500 (0.0500)  time: 2.3208  data: 1.5096  max mem: 18975
Epoch: [169]  [ 200/2502]  eta: 0:10:19  lr: 0.001573  min_lr: 0.001573  loss: 2.7781 (3.1957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8929 (0.9268)  time: 0.2565  data: 0.0005  max mem: 18975
Epoch: [169]  [ 400/2502]  eta: 0:09:12  lr: 0.001571  min_lr: 0.001571  loss: 3.4005 (3.2058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (0.9551)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [169]  [ 600/2502]  eta: 0:08:16  lr: 0.001569  min_lr: 0.001569  loss: 3.3430 (3.2154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9352 (0.9743)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [169]  [ 800/2502]  eta: 0:07:22  lr: 0.001568  min_lr: 0.001568  loss: 3.3122 (3.2144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9086 (inf)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [169]  [1000/2502]  eta: 0:06:29  lr: 0.001566  min_lr: 0.001566  loss: 2.6481 (3.2085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0548 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [169]  [1200/2502]  eta: 0:05:37  lr: 0.001565  min_lr: 0.001565  loss: 3.5759 (3.2052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9666 (inf)  time: 0.2619  data: 0.0003  max mem: 18975
Epoch: [169]  [1400/2502]  eta: 0:04:45  lr: 0.001563  min_lr: 0.001563  loss: 3.3911 (3.1999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (inf)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [169]  [1600/2502]  eta: 0:03:53  lr: 0.001562  min_lr: 0.001562  loss: 3.1483 (3.1977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9781 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [169]  [1800/2502]  eta: 0:03:01  lr: 0.001560  min_lr: 0.001560  loss: 3.5720 (3.2000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9539 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [169]  [2000/2502]  eta: 0:02:09  lr: 0.001559  min_lr: 0.001559  loss: 2.9343 (3.2013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9982 (inf)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [169]  [2200/2502]  eta: 0:01:17  lr: 0.001557  min_lr: 0.001557  loss: 2.9774 (3.2078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9023 (inf)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [169]  [2400/2502]  eta: 0:00:26  lr: 0.001555  min_lr: 0.001555  loss: 3.1948 (3.2153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [169]  [2501/2502]  eta: 0:00:00  lr: 0.001555  min_lr: 0.001555  loss: 3.3492 (3.2155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9381 (inf)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [169] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001555  min_lr: 0.001555  loss: 3.3492 (3.2303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9381 (inf)
Test:  [ 0/50]  eta: 0:03:25  loss: 0.7049 (0.7049)  acc1: 92.4000 (92.4000)  acc5: 98.4000 (98.4000)  time: 4.1119  data: 3.9734  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8600 (0.9156)  acc1: 83.2000 (83.2364)  acc5: 97.6000 (96.8727)  time: 0.5372  data: 0.4162  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9303 (0.9508)  acc1: 82.4000 (82.0381)  acc5: 96.8000 (96.8952)  time: 0.1841  data: 0.0641  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0588 (1.0599)  acc1: 76.0000 (79.7161)  acc5: 94.4000 (95.5484)  time: 0.2187  data: 0.0987  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3348 (1.1312)  acc1: 73.2000 (77.9902)  acc5: 92.0000 (94.5659)  time: 0.2547  data: 0.1347  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3277 (1.1459)  acc1: 74.4000 (77.6480)  acc5: 92.0000 (94.4160)  time: 0.2131  data: 0.0929  max mem: 18975
Test: Total time: 0:00:14 (0.2907 s / it)
* Acc@1 77.806 Acc@5 94.352 loss 1.146
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.81%
Epoch: [170]  [   0/2502]  eta: 1:35:06  lr: 0.001555  min_lr: 0.001555  loss: 4.1479 (4.1479)  weight_decay: 0.0500 (0.0500)  time: 2.2806  data: 2.0125  max mem: 18975
Epoch: [170]  [ 200/2502]  eta: 0:10:21  lr: 0.001553  min_lr: 0.001553  loss: 2.9691 (3.2314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9816 (0.9575)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [170]  [ 400/2502]  eta: 0:09:14  lr: 0.001551  min_lr: 0.001551  loss: 3.7629 (3.2393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9989 (0.9747)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [170]  [ 600/2502]  eta: 0:08:17  lr: 0.001550  min_lr: 0.001550  loss: 2.7282 (3.2121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9228 (0.9685)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [170]  [ 800/2502]  eta: 0:07:23  lr: 0.001548  min_lr: 0.001548  loss: 3.4037 (3.2374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9351 (0.9751)  time: 0.2589  data: 0.0003  max mem: 18975
Epoch: [170]  [1000/2502]  eta: 0:06:30  lr: 0.001547  min_lr: 0.001547  loss: 3.2987 (3.2418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9137 (0.9695)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [170]  [1200/2502]  eta: 0:05:37  lr: 0.001545  min_lr: 0.001545  loss: 3.0857 (3.2383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (0.9659)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [170]  [1400/2502]  eta: 0:04:45  lr: 0.001544  min_lr: 0.001544  loss: 3.2912 (3.2314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8773 (0.9632)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [170]  [1600/2502]  eta: 0:03:53  lr: 0.001542  min_lr: 0.001542  loss: 3.2979 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9337 (0.9649)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [170]  [1800/2502]  eta: 0:03:01  lr: 0.001541  min_lr: 0.001541  loss: 2.9869 (3.2416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9619 (0.9654)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [170]  [2000/2502]  eta: 0:02:09  lr: 0.001539  min_lr: 0.001539  loss: 3.2930 (3.2505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9718 (0.9639)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [170]  [2200/2502]  eta: 0:01:18  lr: 0.001537  min_lr: 0.001537  loss: 3.3809 (3.2491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9239 (0.9665)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [170]  [2400/2502]  eta: 0:00:26  lr: 0.001536  min_lr: 0.001536  loss: 3.6084 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0446 (0.9668)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [170]  [2501/2502]  eta: 0:00:00  lr: 0.001535  min_lr: 0.001535  loss: 3.4584 (3.2491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9341 (0.9666)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [170] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001535  min_lr: 0.001535  loss: 3.4584 (3.2375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9341 (0.9666)
Test:  [ 0/50]  eta: 0:03:08  loss: 0.5990 (0.5990)  acc1: 93.6000 (93.6000)  acc5: 98.4000 (98.4000)  time: 3.7784  data: 3.6428  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.8732 (0.9197)  acc1: 82.8000 (83.0545)  acc5: 98.0000 (96.9818)  time: 0.5584  data: 0.4376  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9020 (0.9314)  acc1: 80.0000 (81.9810)  acc5: 96.8000 (96.7810)  time: 0.2357  data: 0.1163  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0900 (1.0337)  acc1: 77.2000 (79.7161)  acc5: 93.6000 (95.4839)  time: 0.2679  data: 0.1465  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3051 (1.1101)  acc1: 72.4000 (77.6781)  acc5: 91.6000 (94.5854)  time: 0.2515  data: 0.1298  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2945 (1.1241)  acc1: 72.4000 (77.1840)  acc5: 92.4000 (94.4320)  time: 0.2498  data: 0.1296  max mem: 18975
Test: Total time: 0:00:14 (0.2937 s / it)
* Acc@1 77.530 Acc@5 94.240 loss 1.119
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.81%
Epoch: [171]  [   0/2502]  eta: 1:33:02  lr: 0.001535  min_lr: 0.001535  loss: 3.9736 (3.9736)  weight_decay: 0.0500 (0.0500)  time: 2.2312  data: 1.8227  max mem: 18975
Epoch: [171]  [ 200/2502]  eta: 0:10:20  lr: 0.001534  min_lr: 0.001534  loss: 2.9876 (3.1185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9118 (0.9356)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [171]  [ 400/2502]  eta: 0:09:12  lr: 0.001532  min_lr: 0.001532  loss: 3.2134 (3.1381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9329 (0.9533)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [171]  [ 600/2502]  eta: 0:08:16  lr: 0.001530  min_lr: 0.001530  loss: 3.0995 (3.1671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8948 (0.9628)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [171]  [ 800/2502]  eta: 0:07:22  lr: 0.001529  min_lr: 0.001529  loss: 3.6807 (3.1842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0001 (0.9582)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [171]  [1000/2502]  eta: 0:06:29  lr: 0.001527  min_lr: 0.001527  loss: 2.9016 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0556 (0.9689)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [171]  [1200/2502]  eta: 0:05:37  lr: 0.001526  min_lr: 0.001526  loss: 3.1945 (3.2055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9734 (0.9686)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [171]  [1400/2502]  eta: 0:04:45  lr: 0.001524  min_lr: 0.001524  loss: 3.4190 (3.2104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8955 (0.9700)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [171]  [1600/2502]  eta: 0:03:53  lr: 0.001523  min_lr: 0.001523  loss: 3.6375 (3.2157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9115 (0.9697)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [171]  [1800/2502]  eta: 0:03:01  lr: 0.001521  min_lr: 0.001521  loss: 3.0056 (3.2170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9680 (0.9679)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [171]  [2000/2502]  eta: 0:02:09  lr: 0.001520  min_lr: 0.001520  loss: 3.3547 (3.2160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8518 (0.9651)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [171]  [2200/2502]  eta: 0:01:18  lr: 0.001518  min_lr: 0.001518  loss: 3.4087 (3.2178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0449 (0.9684)  time: 0.2647  data: 0.0004  max mem: 18975
Epoch: [171]  [2400/2502]  eta: 0:00:26  lr: 0.001516  min_lr: 0.001516  loss: 3.5787 (3.2202)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0006 (0.9670)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [171]  [2501/2502]  eta: 0:00:00  lr: 0.001516  min_lr: 0.001516  loss: 3.4122 (3.2200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9458 (0.9686)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [171] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.001516  min_lr: 0.001516  loss: 3.4122 (3.2225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9458 (0.9686)
Test:  [ 0/50]  eta: 0:02:43  loss: 0.6245 (0.6245)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 3.2765  data: 3.1195  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9286 (0.9036)  acc1: 83.6000 (83.5273)  acc5: 96.0000 (96.6182)  time: 0.4975  data: 0.3743  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9361 (0.9384)  acc1: 80.8000 (82.7619)  acc5: 96.0000 (96.4191)  time: 0.2361  data: 0.1156  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0979 (1.0374)  acc1: 78.0000 (80.2839)  acc5: 94.0000 (95.1742)  time: 0.2730  data: 0.1522  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2730 (1.1006)  acc1: 72.8000 (78.5366)  acc5: 92.0000 (94.4781)  time: 0.2516  data: 0.1319  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2498 (1.1143)  acc1: 72.8000 (77.7520)  acc5: 92.4000 (94.3920)  time: 0.1774  data: 0.0587  max mem: 18975
Test: Total time: 0:00:14 (0.2893 s / it)
* Acc@1 77.884 Acc@5 94.402 loss 1.111
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.88%
Epoch: [172]  [   0/2502]  eta: 1:20:25  lr: 0.001516  min_lr: 0.001516  loss: 3.7388 (3.7388)  weight_decay: 0.0500 (0.0500)  time: 1.9285  data: 1.6541  max mem: 18975
Epoch: [172]  [ 200/2502]  eta: 0:10:15  lr: 0.001514  min_lr: 0.001514  loss: 3.3751 (3.2523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (0.9843)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [172]  [ 400/2502]  eta: 0:09:10  lr: 0.001513  min_lr: 0.001513  loss: 3.4682 (3.2381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0430 (0.9948)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [172]  [ 600/2502]  eta: 0:08:14  lr: 0.001511  min_lr: 0.001511  loss: 3.0586 (3.2384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9209 (0.9922)  time: 0.2556  data: 0.0003  max mem: 18975
Epoch: [172]  [ 800/2502]  eta: 0:07:21  lr: 0.001509  min_lr: 0.001509  loss: 3.2569 (3.2193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0052 (0.9846)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [172]  [1000/2502]  eta: 0:06:28  lr: 0.001508  min_lr: 0.001508  loss: 3.2383 (3.2179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9302 (0.9781)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [172]  [1200/2502]  eta: 0:05:36  lr: 0.001506  min_lr: 0.001506  loss: 2.6805 (3.2066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9967 (0.9775)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [172]  [1400/2502]  eta: 0:04:44  lr: 0.001505  min_lr: 0.001505  loss: 3.4400 (3.2236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9862 (0.9735)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [172]  [1600/2502]  eta: 0:03:52  lr: 0.001503  min_lr: 0.001503  loss: 3.0488 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9080 (0.9683)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [172]  [1800/2502]  eta: 0:03:01  lr: 0.001502  min_lr: 0.001502  loss: 3.2226 (3.2378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9873 (0.9731)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [172]  [2000/2502]  eta: 0:02:09  lr: 0.001500  min_lr: 0.001500  loss: 3.3429 (3.2391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9489 (0.9748)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [172]  [2200/2502]  eta: 0:01:17  lr: 0.001499  min_lr: 0.001499  loss: 3.0972 (3.2426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0185 (0.9801)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [172]  [2400/2502]  eta: 0:00:26  lr: 0.001497  min_lr: 0.001497  loss: 3.5196 (3.2508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8633 (0.9799)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [172]  [2501/2502]  eta: 0:00:00  lr: 0.001496  min_lr: 0.001496  loss: 3.5319 (3.2528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8788 (0.9776)  time: 0.2302  data: 0.0007  max mem: 18975
Epoch: [172] Total time: 0:10:44 (0.2576 s / it)
Averaged stats: lr: 0.001496  min_lr: 0.001496  loss: 3.5319 (3.2337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8788 (0.9776)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.6258 (0.6258)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 2.8192  data: 2.6638  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9320 (0.9506)  acc1: 84.4000 (83.6364)  acc5: 96.4000 (96.4727)  time: 0.5401  data: 0.4161  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9443 (0.9656)  acc1: 80.4000 (82.6476)  acc5: 96.4000 (96.4571)  time: 0.2780  data: 0.1574  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1133 (1.0617)  acc1: 78.0000 (80.5032)  acc5: 94.0000 (95.0581)  time: 0.2494  data: 0.1292  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3070 (1.1273)  acc1: 74.0000 (78.6829)  acc5: 91.6000 (94.3610)  time: 0.2330  data: 0.1135  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3065 (1.1424)  acc1: 73.2000 (77.9680)  acc5: 92.0000 (94.0960)  time: 0.2278  data: 0.1090  max mem: 18975
Test: Total time: 0:00:15 (0.3000 s / it)
* Acc@1 77.974 Acc@5 94.334 loss 1.141
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 77.97%
Epoch: [173]  [   0/2502]  eta: 1:42:10  lr: 0.001496  min_lr: 0.001496  loss: 3.7377 (3.7377)  weight_decay: 0.0500 (0.0500)  time: 2.4503  data: 2.1817  max mem: 18975
Epoch: [173]  [ 200/2502]  eta: 0:10:25  lr: 0.001495  min_lr: 0.001495  loss: 3.3824 (3.1388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0302 (1.0476)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [173]  [ 400/2502]  eta: 0:09:15  lr: 0.001493  min_lr: 0.001493  loss: 3.4326 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.0291)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [173]  [ 600/2502]  eta: 0:08:18  lr: 0.001492  min_lr: 0.001492  loss: 3.4555 (3.1903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0201 (1.0177)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [173]  [ 800/2502]  eta: 0:07:23  lr: 0.001490  min_lr: 0.001490  loss: 3.5125 (3.2023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0211 (1.0117)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [173]  [1000/2502]  eta: 0:06:30  lr: 0.001488  min_lr: 0.001488  loss: 3.0319 (3.2080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9110 (0.9981)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [173]  [1200/2502]  eta: 0:05:37  lr: 0.001487  min_lr: 0.001487  loss: 3.4677 (3.2107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9445 (0.9924)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [173]  [1400/2502]  eta: 0:04:45  lr: 0.001485  min_lr: 0.001485  loss: 2.9012 (3.1995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9906 (0.9880)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [173]  [1600/2502]  eta: 0:03:53  lr: 0.001484  min_lr: 0.001484  loss: 2.9907 (3.2120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9954 (0.9911)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [173]  [1800/2502]  eta: 0:03:01  lr: 0.001482  min_lr: 0.001482  loss: 2.9093 (3.2146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9656 (0.9892)  time: 0.2573  data: 0.0003  max mem: 18975
Epoch: [173]  [2000/2502]  eta: 0:02:09  lr: 0.001481  min_lr: 0.001481  loss: 3.0839 (3.2122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8693 (0.9911)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [173]  [2200/2502]  eta: 0:01:18  lr: 0.001479  min_lr: 0.001479  loss: 3.6009 (3.2134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9154 (0.9883)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [173]  [2400/2502]  eta: 0:00:26  lr: 0.001478  min_lr: 0.001478  loss: 2.9528 (3.2144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0172 (0.9864)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [173]  [2501/2502]  eta: 0:00:00  lr: 0.001477  min_lr: 0.001477  loss: 2.9701 (3.2147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (0.9852)  time: 0.2305  data: 0.0007  max mem: 18975
Epoch: [173] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001477  min_lr: 0.001477  loss: 2.9701 (3.2132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (0.9852)
Test:  [ 0/50]  eta: 0:03:12  loss: 0.4581 (0.4581)  acc1: 93.6000 (93.6000)  acc5: 98.8000 (98.8000)  time: 3.8590  data: 3.7226  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.8760 (0.8306)  acc1: 83.2000 (82.8727)  acc5: 96.4000 (96.6182)  time: 0.6034  data: 0.4812  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.8660 (0.8539)  acc1: 81.2000 (82.4762)  acc5: 96.4000 (96.6857)  time: 0.2846  data: 0.1640  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0813 (0.9659)  acc1: 78.0000 (80.1290)  acc5: 94.4000 (95.1484)  time: 0.2506  data: 0.1302  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2194 (1.0238)  acc1: 74.8000 (78.5073)  acc5: 92.0000 (94.4585)  time: 0.1978  data: 0.0782  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2183 (1.0440)  acc1: 74.4000 (77.9280)  acc5: 92.4000 (94.2720)  time: 0.1968  data: 0.0781  max mem: 18975
Test: Total time: 0:00:14 (0.2939 s / it)
* Acc@1 77.910 Acc@5 94.282 loss 1.041
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.97%
Epoch: [174]  [   0/2502]  eta: 1:21:33  lr: 0.001477  min_lr: 0.001477  loss: 3.3596 (3.3596)  weight_decay: 0.0500 (0.0500)  time: 1.9560  data: 1.7091  max mem: 18975
Epoch: [174]  [ 200/2502]  eta: 0:10:16  lr: 0.001475  min_lr: 0.001475  loss: 3.0834 (3.1575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0154 (0.9651)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [174]  [ 400/2502]  eta: 0:09:12  lr: 0.001474  min_lr: 0.001474  loss: 3.3226 (3.1585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9457 (0.9682)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [174]  [ 600/2502]  eta: 0:08:16  lr: 0.001472  min_lr: 0.001472  loss: 2.8568 (3.1631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9446 (0.9734)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [174]  [ 800/2502]  eta: 0:07:22  lr: 0.001471  min_lr: 0.001471  loss: 3.0428 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9675 (0.9681)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [174]  [1000/2502]  eta: 0:06:30  lr: 0.001469  min_lr: 0.001469  loss: 2.7872 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9765 (0.9711)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [174]  [1200/2502]  eta: 0:05:37  lr: 0.001468  min_lr: 0.001468  loss: 2.9849 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0020 (0.9785)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [174]  [1400/2502]  eta: 0:04:45  lr: 0.001466  min_lr: 0.001466  loss: 3.3867 (3.1858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9707 (0.9812)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [174]  [1600/2502]  eta: 0:03:53  lr: 0.001464  min_lr: 0.001464  loss: 3.5029 (3.1908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9577 (0.9835)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [174]  [1800/2502]  eta: 0:03:01  lr: 0.001463  min_lr: 0.001463  loss: 3.3123 (3.1905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9154 (0.9796)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [174]  [2000/2502]  eta: 0:02:09  lr: 0.001461  min_lr: 0.001461  loss: 3.5345 (3.1925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0174 (0.9837)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [174]  [2200/2502]  eta: 0:01:17  lr: 0.001460  min_lr: 0.001460  loss: 3.1397 (3.1939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9474 (0.9848)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [174]  [2400/2502]  eta: 0:00:26  lr: 0.001458  min_lr: 0.001458  loss: 3.0674 (3.1906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0261 (0.9898)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [174]  [2501/2502]  eta: 0:00:00  lr: 0.001457  min_lr: 0.001457  loss: 3.6651 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9343 (0.9898)  time: 0.2312  data: 0.0007  max mem: 18975
Epoch: [174] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001457  min_lr: 0.001457  loss: 3.6651 (3.2027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9343 (0.9898)
Test:  [ 0/50]  eta: 0:02:26  loss: 0.6095 (0.6095)  acc1: 92.8000 (92.8000)  acc5: 99.2000 (99.2000)  time: 2.9343  data: 2.7967  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.8429 (0.8898)  acc1: 84.0000 (83.4546)  acc5: 96.8000 (96.8727)  time: 0.4343  data: 0.3122  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9210 (0.9162)  acc1: 80.8000 (82.3619)  acc5: 97.2000 (96.8000)  time: 0.2057  data: 0.0849  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1065 (1.0257)  acc1: 78.0000 (80.0258)  acc5: 94.8000 (95.4323)  time: 0.2304  data: 0.1102  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.2747 (1.0942)  acc1: 74.4000 (78.3317)  acc5: 91.6000 (94.6244)  time: 0.2408  data: 0.1205  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2704 (1.1140)  acc1: 74.0000 (77.7440)  acc5: 91.6000 (94.4160)  time: 0.2310  data: 0.1109  max mem: 18975
Test: Total time: 0:00:13 (0.2797 s / it)
* Acc@1 77.862 Acc@5 94.458 loss 1.113
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.97%
Epoch: [175]  [   0/2502]  eta: 1:35:55  lr: 0.001457  min_lr: 0.001457  loss: 3.5286 (3.5286)  weight_decay: 0.0500 (0.0500)  time: 2.3003  data: 1.3816  max mem: 18975
Epoch: [175]  [ 200/2502]  eta: 0:10:18  lr: 0.001456  min_lr: 0.001456  loss: 3.0850 (3.2155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8697 (0.9188)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [175]  [ 400/2502]  eta: 0:09:12  lr: 0.001454  min_lr: 0.001454  loss: 3.3692 (3.2504)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0142 (0.9547)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [175]  [ 600/2502]  eta: 0:08:16  lr: 0.001453  min_lr: 0.001453  loss: 3.5874 (3.2380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (0.9555)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [175]  [ 800/2502]  eta: 0:07:22  lr: 0.001451  min_lr: 0.001451  loss: 3.3358 (3.2388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9651 (0.9606)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [175]  [1000/2502]  eta: 0:06:29  lr: 0.001450  min_lr: 0.001450  loss: 3.0322 (3.2185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9794 (0.9668)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [175]  [1200/2502]  eta: 0:05:37  lr: 0.001448  min_lr: 0.001448  loss: 3.5249 (3.2229)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0042 (0.9772)  time: 0.2569  data: 0.0005  max mem: 18975
Epoch: [175]  [1400/2502]  eta: 0:04:45  lr: 0.001447  min_lr: 0.001447  loss: 3.3233 (3.2232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0679 (0.9916)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [175]  [1600/2502]  eta: 0:03:53  lr: 0.001445  min_lr: 0.001445  loss: 3.2081 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (0.9889)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [175]  [1800/2502]  eta: 0:03:01  lr: 0.001444  min_lr: 0.001444  loss: 3.4094 (3.2328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9561 (0.9909)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [175]  [2000/2502]  eta: 0:02:09  lr: 0.001442  min_lr: 0.001442  loss: 3.0573 (3.2206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9217 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [175]  [2200/2502]  eta: 0:01:18  lr: 0.001440  min_lr: 0.001440  loss: 3.5082 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (inf)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [175]  [2400/2502]  eta: 0:00:26  lr: 0.001439  min_lr: 0.001439  loss: 3.0849 (3.2107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [175]  [2501/2502]  eta: 0:00:00  lr: 0.001438  min_lr: 0.001438  loss: 2.8719 (3.2131)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0107 (inf)  time: 0.2306  data: 0.0009  max mem: 18975
Epoch: [175] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001438  min_lr: 0.001438  loss: 2.8719 (3.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0107 (inf)
Test:  [ 0/50]  eta: 0:02:12  loss: 0.5903 (0.5903)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 2.6538  data: 2.5067  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.8865 (0.8969)  acc1: 83.6000 (83.4182)  acc5: 96.4000 (96.8364)  time: 0.4649  data: 0.3423  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9189 (0.9232)  acc1: 81.2000 (82.3810)  acc5: 96.4000 (96.6095)  time: 0.2309  data: 0.1111  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0790 (1.0214)  acc1: 77.2000 (80.0516)  acc5: 94.4000 (95.2000)  time: 0.2608  data: 0.1414  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2658 (1.0884)  acc1: 73.2000 (78.2146)  acc5: 91.6000 (94.4000)  time: 0.2678  data: 0.1486  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2135 (1.1001)  acc1: 74.0000 (77.7600)  acc5: 92.4000 (94.2720)  time: 0.1993  data: 0.0803  max mem: 18975
Test: Total time: 0:00:14 (0.2868 s / it)
* Acc@1 77.856 Acc@5 94.376 loss 1.096
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.97%
Epoch: [176]  [   0/2502]  eta: 1:32:39  lr: 0.001438  min_lr: 0.001438  loss: 2.2934 (2.2934)  weight_decay: 0.0500 (0.0500)  time: 2.2222  data: 1.7973  max mem: 18975
Epoch: [176]  [ 200/2502]  eta: 0:10:17  lr: 0.001437  min_lr: 0.001437  loss: 3.5054 (3.2233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9508 (0.9951)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [176]  [ 400/2502]  eta: 0:09:12  lr: 0.001435  min_lr: 0.001435  loss: 3.4971 (3.1944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9785 (0.9745)  time: 0.2589  data: 0.0004  max mem: 18975
Epoch: [176]  [ 600/2502]  eta: 0:08:16  lr: 0.001433  min_lr: 0.001433  loss: 2.9088 (3.1864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9687 (0.9849)  time: 0.2648  data: 0.0004  max mem: 18975
Epoch: [176]  [ 800/2502]  eta: 0:07:22  lr: 0.001432  min_lr: 0.001432  loss: 3.0060 (3.2066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9902 (0.9886)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [176]  [1000/2502]  eta: 0:06:30  lr: 0.001430  min_lr: 0.001430  loss: 3.5352 (3.2130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0521 (1.0024)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [176]  [1200/2502]  eta: 0:05:37  lr: 0.001429  min_lr: 0.001429  loss: 3.1625 (3.2100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9113 (0.9930)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [176]  [1400/2502]  eta: 0:04:45  lr: 0.001427  min_lr: 0.001427  loss: 3.0302 (3.2128)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0399 (0.9978)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [176]  [1600/2502]  eta: 0:03:53  lr: 0.001426  min_lr: 0.001426  loss: 3.2857 (3.2101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0076 (0.9968)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [176]  [1800/2502]  eta: 0:03:01  lr: 0.001424  min_lr: 0.001424  loss: 3.1776 (3.2058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9520 (0.9916)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [176]  [2000/2502]  eta: 0:02:09  lr: 0.001423  min_lr: 0.001423  loss: 3.3942 (3.2104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9907 (0.9976)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [176]  [2200/2502]  eta: 0:01:17  lr: 0.001421  min_lr: 0.001421  loss: 3.3852 (3.2022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9726 (0.9966)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [176]  [2400/2502]  eta: 0:00:26  lr: 0.001420  min_lr: 0.001420  loss: 2.8753 (3.2032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9132 (0.9967)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [176]  [2501/2502]  eta: 0:00:00  lr: 0.001419  min_lr: 0.001419  loss: 3.4283 (3.2030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9734 (0.9952)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [176] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.001419  min_lr: 0.001419  loss: 3.4283 (3.2108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9734 (0.9952)
Test:  [ 0/50]  eta: 0:02:53  loss: 0.5448 (0.5448)  acc1: 93.2000 (93.2000)  acc5: 98.4000 (98.4000)  time: 3.4680  data: 3.3238  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.8797 (0.8902)  acc1: 86.0000 (84.0000)  acc5: 96.8000 (96.5091)  time: 0.5522  data: 0.4298  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9096 (0.9193)  acc1: 81.2000 (82.5905)  acc5: 96.8000 (96.7048)  time: 0.2667  data: 0.1467  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0252 (1.0297)  acc1: 76.8000 (79.9613)  acc5: 94.8000 (95.2258)  time: 0.2793  data: 0.1580  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2948 (1.0975)  acc1: 73.2000 (78.0293)  acc5: 92.0000 (94.4976)  time: 0.2503  data: 0.1295  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2934 (1.1080)  acc1: 73.2000 (77.7200)  acc5: 92.8000 (94.4480)  time: 0.2137  data: 0.0947  max mem: 18975
Test: Total time: 0:00:14 (0.2996 s / it)
* Acc@1 78.162 Acc@5 94.522 loss 1.101
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.16%
Epoch: [177]  [   0/2502]  eta: 1:10:32  lr: 0.001419  min_lr: 0.001419  loss: 2.4620 (2.4620)  weight_decay: 0.0500 (0.0500)  time: 1.6916  data: 1.4195  max mem: 18975
Epoch: [177]  [ 200/2502]  eta: 0:10:14  lr: 0.001417  min_lr: 0.001417  loss: 2.8222 (3.1075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0108 (0.9717)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [177]  [ 400/2502]  eta: 0:09:10  lr: 0.001416  min_lr: 0.001416  loss: 2.8673 (3.1282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9604 (0.9895)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [177]  [ 600/2502]  eta: 0:08:15  lr: 0.001414  min_lr: 0.001414  loss: 2.8919 (3.1572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9502 (0.9881)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [177]  [ 800/2502]  eta: 0:07:22  lr: 0.001413  min_lr: 0.001413  loss: 2.6777 (3.1643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9720 (0.9815)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [177]  [1000/2502]  eta: 0:06:29  lr: 0.001411  min_lr: 0.001411  loss: 3.4004 (3.1815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9681 (0.9870)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [177]  [1200/2502]  eta: 0:05:37  lr: 0.001410  min_lr: 0.001410  loss: 2.9852 (3.1830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (0.9861)  time: 0.2572  data: 0.0005  max mem: 18975
Epoch: [177]  [1400/2502]  eta: 0:04:45  lr: 0.001408  min_lr: 0.001408  loss: 3.4151 (3.1836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9016 (0.9831)  time: 0.2635  data: 0.0004  max mem: 18975
Epoch: [177]  [1600/2502]  eta: 0:03:53  lr: 0.001406  min_lr: 0.001406  loss: 3.5165 (3.1885)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0297 (0.9904)  time: 0.2587  data: 0.0004  max mem: 18975
Epoch: [177]  [1800/2502]  eta: 0:03:01  lr: 0.001405  min_lr: 0.001405  loss: 3.5685 (3.1916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8962 (0.9905)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [177]  [2000/2502]  eta: 0:02:09  lr: 0.001403  min_lr: 0.001403  loss: 3.0685 (3.1890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9772 (0.9929)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [177]  [2200/2502]  eta: 0:01:17  lr: 0.001402  min_lr: 0.001402  loss: 3.0374 (3.1880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9592 (0.9920)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [177]  [2400/2502]  eta: 0:00:26  lr: 0.001400  min_lr: 0.001400  loss: 3.5056 (3.1939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9523 (0.9949)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [177]  [2501/2502]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 3.1214 (3.1959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9304 (0.9927)  time: 0.2312  data: 0.0009  max mem: 18975
Epoch: [177] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 3.1214 (3.1919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9304 (0.9927)
Test:  [ 0/50]  eta: 0:02:36  loss: 0.6015 (0.6015)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 3.1251  data: 2.9785  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8732 (0.8769)  acc1: 84.4000 (83.4182)  acc5: 96.8000 (96.9091)  time: 0.5490  data: 0.4266  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8968 (0.8986)  acc1: 81.6000 (82.6095)  acc5: 96.8000 (96.8571)  time: 0.2676  data: 0.1474  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0298 (0.9995)  acc1: 76.8000 (80.0516)  acc5: 94.8000 (95.4065)  time: 0.2467  data: 0.1260  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2204 (1.0603)  acc1: 73.6000 (78.3512)  acc5: 92.4000 (94.7805)  time: 0.2309  data: 0.1110  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2204 (1.0720)  acc1: 73.6000 (77.9280)  acc5: 92.8000 (94.7520)  time: 0.2041  data: 0.0853  max mem: 18975
Test: Total time: 0:00:14 (0.2882 s / it)
* Acc@1 78.106 Acc@5 94.614 loss 1.072
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.16%
Epoch: [178]  [   0/2502]  eta: 1:32:40  lr: 0.001400  min_lr: 0.001400  loss: 2.1206 (2.1206)  weight_decay: 0.0500 (0.0500)  time: 2.2225  data: 1.8655  max mem: 18975
Epoch: [178]  [ 200/2502]  eta: 0:10:21  lr: 0.001398  min_lr: 0.001398  loss: 2.7076 (3.2331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (1.0058)  time: 0.2644  data: 0.0004  max mem: 18975
Epoch: [178]  [ 400/2502]  eta: 0:09:13  lr: 0.001396  min_lr: 0.001396  loss: 3.5049 (3.2366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0036 (1.0310)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [178]  [ 600/2502]  eta: 0:08:17  lr: 0.001395  min_lr: 0.001395  loss: 3.6627 (3.2202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9170 (1.0141)  time: 0.2646  data: 0.0004  max mem: 18975
Epoch: [178]  [ 800/2502]  eta: 0:07:23  lr: 0.001393  min_lr: 0.001393  loss: 3.4068 (3.2102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9194 (1.0141)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [178]  [1000/2502]  eta: 0:06:30  lr: 0.001392  min_lr: 0.001392  loss: 3.5108 (3.2247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9825 (1.0099)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [178]  [1200/2502]  eta: 0:05:37  lr: 0.001390  min_lr: 0.001390  loss: 2.7921 (3.2227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0207 (1.0132)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [178]  [1400/2502]  eta: 0:04:45  lr: 0.001389  min_lr: 0.001389  loss: 3.4756 (3.2147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9841 (1.0191)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [178]  [1600/2502]  eta: 0:03:53  lr: 0.001387  min_lr: 0.001387  loss: 3.5404 (3.2101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9289 (1.0137)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [178]  [1800/2502]  eta: 0:03:01  lr: 0.001386  min_lr: 0.001386  loss: 3.3296 (3.2050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0043 (1.0150)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [178]  [2000/2502]  eta: 0:02:09  lr: 0.001384  min_lr: 0.001384  loss: 3.1321 (3.2093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9540 (1.0146)  time: 0.2570  data: 0.0005  max mem: 18975
Epoch: [178]  [2200/2502]  eta: 0:01:18  lr: 0.001383  min_lr: 0.001383  loss: 3.1143 (3.2068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0344 (1.0147)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [178]  [2400/2502]  eta: 0:00:26  lr: 0.001381  min_lr: 0.001381  loss: 3.6796 (3.2045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0385 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [178]  [2501/2502]  eta: 0:00:00  lr: 0.001380  min_lr: 0.001380  loss: 3.1484 (3.2017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9748 (inf)  time: 0.2305  data: 0.0008  max mem: 18975
Epoch: [178] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001380  min_lr: 0.001380  loss: 3.1484 (3.1956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9748 (inf)
Test:  [ 0/50]  eta: 0:02:46  loss: 0.5938 (0.5938)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.3237  data: 3.1789  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8036 (0.8604)  acc1: 86.0000 (83.7455)  acc5: 96.4000 (96.6546)  time: 0.5297  data: 0.4072  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8576 (0.8892)  acc1: 82.0000 (82.7810)  acc5: 96.4000 (96.6095)  time: 0.2485  data: 0.1277  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0279 (0.9976)  acc1: 78.0000 (80.5032)  acc5: 93.6000 (95.2000)  time: 0.2444  data: 0.1240  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2716 (1.0698)  acc1: 74.8000 (78.7024)  acc5: 91.6000 (94.4390)  time: 0.2151  data: 0.0959  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2450 (1.0874)  acc1: 74.4000 (78.0720)  acc5: 92.4000 (94.2960)  time: 0.1770  data: 0.0570  max mem: 18975
Test: Total time: 0:00:14 (0.2830 s / it)
* Acc@1 78.272 Acc@5 94.402 loss 1.083
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.27%
Epoch: [179]  [   0/2502]  eta: 1:30:03  lr: 0.001380  min_lr: 0.001380  loss: 3.6843 (3.6843)  weight_decay: 0.0500 (0.0500)  time: 2.1598  data: 1.8953  max mem: 18975
Epoch: [179]  [ 200/2502]  eta: 0:10:20  lr: 0.001379  min_lr: 0.001379  loss: 3.0203 (3.2129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9076 (0.9414)  time: 0.2661  data: 0.0004  max mem: 18975
Epoch: [179]  [ 400/2502]  eta: 0:09:14  lr: 0.001377  min_lr: 0.001377  loss: 3.1929 (3.2295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9531 (0.9849)  time: 0.2590  data: 0.0004  max mem: 18975
Epoch: [179]  [ 600/2502]  eta: 0:08:17  lr: 0.001376  min_lr: 0.001376  loss: 2.8627 (3.2212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0291 (0.9943)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [179]  [ 800/2502]  eta: 0:07:23  lr: 0.001374  min_lr: 0.001374  loss: 3.2248 (3.2165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9550 (1.0007)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [179]  [1000/2502]  eta: 0:06:29  lr: 0.001373  min_lr: 0.001373  loss: 3.2843 (3.2190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9399 (1.0027)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [179]  [1200/2502]  eta: 0:05:37  lr: 0.001371  min_lr: 0.001371  loss: 3.2087 (3.2095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0100 (1.0033)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [179]  [1400/2502]  eta: 0:04:45  lr: 0.001370  min_lr: 0.001370  loss: 3.2656 (3.2013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9978 (1.0090)  time: 0.2647  data: 0.0004  max mem: 18975
Epoch: [179]  [1600/2502]  eta: 0:03:53  lr: 0.001368  min_lr: 0.001368  loss: 3.5894 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9766 (1.0076)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [179]  [1800/2502]  eta: 0:03:01  lr: 0.001367  min_lr: 0.001367  loss: 3.6191 (3.2112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9594 (1.0064)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [179]  [2000/2502]  eta: 0:02:09  lr: 0.001365  min_lr: 0.001365  loss: 3.3809 (3.2088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0088)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [179]  [2200/2502]  eta: 0:01:18  lr: 0.001363  min_lr: 0.001363  loss: 2.6871 (3.2114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9566 (1.0093)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [179]  [2400/2502]  eta: 0:00:26  lr: 0.001362  min_lr: 0.001362  loss: 3.4444 (3.2119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9867 (1.0065)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [179]  [2501/2502]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 2.8860 (3.2054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0247 (1.0064)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [179] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 2.8860 (3.1956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0247 (1.0064)
Test:  [ 0/50]  eta: 0:02:59  loss: 0.5287 (0.5287)  acc1: 92.8000 (92.8000)  acc5: 98.8000 (98.8000)  time: 3.5856  data: 3.4418  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.8150 (0.8379)  acc1: 85.2000 (83.7455)  acc5: 97.6000 (96.5818)  time: 0.5552  data: 0.4320  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8458 (0.8498)  acc1: 82.4000 (83.5048)  acc5: 96.8000 (96.8000)  time: 0.2491  data: 0.1285  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 0.9859 (0.9511)  acc1: 79.2000 (80.9161)  acc5: 94.0000 (95.3290)  time: 0.2415  data: 0.1201  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2428 (1.0207)  acc1: 73.6000 (79.1220)  acc5: 92.0000 (94.7024)  time: 0.2330  data: 0.1111  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2428 (1.0340)  acc1: 73.2000 (78.4320)  acc5: 93.6000 (94.5360)  time: 0.2095  data: 0.0890  max mem: 18975
Test: Total time: 0:00:14 (0.2968 s / it)
* Acc@1 78.186 Acc@5 94.564 loss 1.033
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.27%
Epoch: [180]  [   0/2502]  eta: 1:37:23  lr: 0.001361  min_lr: 0.001361  loss: 2.0377 (2.0377)  weight_decay: 0.0500 (0.0500)  time: 2.3354  data: 1.4162  max mem: 18975
Epoch: [180]  [ 200/2502]  eta: 0:10:21  lr: 0.001360  min_lr: 0.001360  loss: 3.4175 (3.1616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0394 (1.0783)  time: 0.2633  data: 0.0003  max mem: 18975
Epoch: [180]  [ 400/2502]  eta: 0:09:14  lr: 0.001358  min_lr: 0.001358  loss: 3.2358 (3.1910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0595 (1.0581)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [180]  [ 600/2502]  eta: 0:08:17  lr: 0.001357  min_lr: 0.001357  loss: 3.4448 (3.2101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9657 (1.0404)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [180]  [ 800/2502]  eta: 0:07:23  lr: 0.001355  min_lr: 0.001355  loss: 3.3196 (3.2169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9162 (1.0207)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [180]  [1000/2502]  eta: 0:06:30  lr: 0.001354  min_lr: 0.001354  loss: 3.5824 (3.2197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0248 (1.0163)  time: 0.2558  data: 0.0003  max mem: 18975
Epoch: [180]  [1200/2502]  eta: 0:05:37  lr: 0.001352  min_lr: 0.001352  loss: 3.2861 (3.2020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9551 (1.0177)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [180]  [1400/2502]  eta: 0:04:45  lr: 0.001350  min_lr: 0.001350  loss: 3.0432 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9923 (1.0243)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [180]  [1600/2502]  eta: 0:03:53  lr: 0.001349  min_lr: 0.001349  loss: 3.4731 (3.1952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9053 (1.0213)  time: 0.2556  data: 0.0004  max mem: 18975
Epoch: [180]  [1800/2502]  eta: 0:03:01  lr: 0.001347  min_lr: 0.001347  loss: 3.5424 (3.1977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0089 (1.0206)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [180]  [2000/2502]  eta: 0:02:09  lr: 0.001346  min_lr: 0.001346  loss: 2.5282 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0262 (1.0240)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [180]  [2200/2502]  eta: 0:01:17  lr: 0.001344  min_lr: 0.001344  loss: 3.1586 (3.1976)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0377 (1.0269)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [180]  [2400/2502]  eta: 0:00:26  lr: 0.001343  min_lr: 0.001343  loss: 3.2070 (3.2002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9216 (1.0220)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [180]  [2501/2502]  eta: 0:00:00  lr: 0.001342  min_lr: 0.001342  loss: 3.5591 (3.2030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9789 (1.0207)  time: 0.2369  data: 0.0008  max mem: 18975
Epoch: [180] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001342  min_lr: 0.001342  loss: 3.5591 (3.1874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9789 (1.0207)
Test:  [ 0/50]  eta: 0:02:55  loss: 0.6128 (0.6128)  acc1: 92.8000 (92.8000)  acc5: 98.4000 (98.4000)  time: 3.5157  data: 3.3698  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9390 (0.9539)  acc1: 84.4000 (82.7273)  acc5: 96.8000 (96.5818)  time: 0.5300  data: 0.4082  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9737 (0.9655)  acc1: 82.0000 (82.3238)  acc5: 96.8000 (96.7619)  time: 0.2276  data: 0.1079  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0923 (1.0686)  acc1: 76.8000 (80.0387)  acc5: 95.2000 (95.4065)  time: 0.2120  data: 0.0923  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3538 (1.1398)  acc1: 73.2000 (78.3902)  acc5: 92.4000 (94.6634)  time: 0.2132  data: 0.0936  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3211 (1.1573)  acc1: 74.0000 (78.0640)  acc5: 93.2000 (94.5760)  time: 0.2114  data: 0.0920  max mem: 18975
Test: Total time: 0:00:14 (0.2844 s / it)
* Acc@1 78.346 Acc@5 94.556 loss 1.154
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.35%
Epoch: [181]  [   0/2502]  eta: 1:30:34  lr: 0.001342  min_lr: 0.001342  loss: 2.8877 (2.8877)  weight_decay: 0.0500 (0.0500)  time: 2.1719  data: 1.9008  max mem: 18975
Epoch: [181]  [ 200/2502]  eta: 0:10:17  lr: 0.001341  min_lr: 0.001341  loss: 3.2191 (3.2674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0257 (1.0980)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [181]  [ 400/2502]  eta: 0:09:13  lr: 0.001339  min_lr: 0.001339  loss: 2.8693 (3.2176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9743 (1.0473)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [181]  [ 600/2502]  eta: 0:08:17  lr: 0.001338  min_lr: 0.001338  loss: 3.3910 (3.1712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0390 (1.0354)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [181]  [ 800/2502]  eta: 0:07:23  lr: 0.001336  min_lr: 0.001336  loss: 2.5815 (3.1455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9859 (1.0401)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [181]  [1000/2502]  eta: 0:06:30  lr: 0.001334  min_lr: 0.001334  loss: 3.3265 (3.1533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9468 (1.0382)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [181]  [1200/2502]  eta: 0:05:37  lr: 0.001333  min_lr: 0.001333  loss: 3.2484 (3.1655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9497 (1.0386)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [181]  [1400/2502]  eta: 0:04:45  lr: 0.001331  min_lr: 0.001331  loss: 3.4126 (3.1631)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0177 (1.0361)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [181]  [1600/2502]  eta: 0:03:53  lr: 0.001330  min_lr: 0.001330  loss: 2.6308 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9899 (1.0413)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [181]  [1800/2502]  eta: 0:03:01  lr: 0.001328  min_lr: 0.001328  loss: 3.2433 (3.1778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9814 (1.0411)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [181]  [2000/2502]  eta: 0:02:09  lr: 0.001327  min_lr: 0.001327  loss: 3.5386 (3.1831)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.0379)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [181]  [2200/2502]  eta: 0:01:17  lr: 0.001325  min_lr: 0.001325  loss: 3.3897 (3.1845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9803 (1.0360)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [181]  [2400/2502]  eta: 0:00:26  lr: 0.001324  min_lr: 0.001324  loss: 3.2760 (3.1776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9663 (1.0345)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [181]  [2501/2502]  eta: 0:00:00  lr: 0.001323  min_lr: 0.001323  loss: 3.5203 (3.1749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9750 (1.0328)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [181] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001323  min_lr: 0.001323  loss: 3.5203 (3.1820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9750 (1.0328)
Test:  [ 0/50]  eta: 0:03:04  loss: 0.5158 (0.5158)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 3.6806  data: 3.5371  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.7907 (0.8206)  acc1: 84.0000 (83.6364)  acc5: 97.6000 (96.9818)  time: 0.5552  data: 0.4313  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8411 (0.8572)  acc1: 82.0000 (82.7238)  acc5: 97.2000 (96.8381)  time: 0.2451  data: 0.1240  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0599 (0.9700)  acc1: 78.4000 (80.4516)  acc5: 93.6000 (95.3548)  time: 0.2506  data: 0.1285  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2396 (1.0375)  acc1: 74.0000 (78.8585)  acc5: 91.2000 (94.3415)  time: 0.2144  data: 0.0929  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2129 (1.0511)  acc1: 73.6000 (78.1760)  acc5: 92.0000 (94.2880)  time: 0.2021  data: 0.0831  max mem: 18975
Test: Total time: 0:00:14 (0.2812 s / it)
* Acc@1 78.370 Acc@5 94.460 loss 1.053
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.37%
Epoch: [182]  [   0/2502]  eta: 1:18:54  lr: 0.001323  min_lr: 0.001323  loss: 2.2536 (2.2536)  weight_decay: 0.0500 (0.0500)  time: 1.8923  data: 1.6143  max mem: 18975
Epoch: [182]  [ 200/2502]  eta: 0:10:13  lr: 0.001322  min_lr: 0.001322  loss: 2.7560 (3.1339)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0506 (0.9750)  time: 0.2574  data: 0.0003  max mem: 18975
Epoch: [182]  [ 400/2502]  eta: 0:09:10  lr: 0.001320  min_lr: 0.001320  loss: 2.9377 (3.1562)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [182]  [ 600/2502]  eta: 0:08:14  lr: 0.001318  min_lr: 0.001318  loss: 3.0114 (3.1361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0075 (nan)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [182]  [ 800/2502]  eta: 0:07:21  lr: 0.001317  min_lr: 0.001317  loss: 3.1277 (3.1194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0185 (nan)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [182]  [1000/2502]  eta: 0:06:28  lr: 0.001315  min_lr: 0.001315  loss: 2.7249 (3.1253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9942 (nan)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [182]  [1200/2502]  eta: 0:05:36  lr: 0.001314  min_lr: 0.001314  loss: 2.8077 (3.1230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9567 (nan)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [182]  [1400/2502]  eta: 0:04:44  lr: 0.001312  min_lr: 0.001312  loss: 2.9717 (3.1411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0658 (nan)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [182]  [1600/2502]  eta: 0:03:52  lr: 0.001311  min_lr: 0.001311  loss: 3.3788 (3.1467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9618 (nan)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [182]  [1800/2502]  eta: 0:03:01  lr: 0.001309  min_lr: 0.001309  loss: 3.1791 (3.1435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0475 (nan)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [182]  [2000/2502]  eta: 0:02:09  lr: 0.001308  min_lr: 0.001308  loss: 3.3184 (3.1393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9764 (nan)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [182]  [2200/2502]  eta: 0:01:17  lr: 0.001306  min_lr: 0.001306  loss: 3.5439 (3.1392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0784 (nan)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [182]  [2400/2502]  eta: 0:00:26  lr: 0.001305  min_lr: 0.001305  loss: 3.4106 (3.1397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1131 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [182]  [2501/2502]  eta: 0:00:00  lr: 0.001304  min_lr: 0.001304  loss: 3.1007 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9762 (nan)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [182] Total time: 0:10:45 (0.2578 s / it)
Averaged stats: lr: 0.001304  min_lr: 0.001304  loss: 3.1007 (3.1743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9762 (nan)
Test:  [ 0/50]  eta: 0:02:51  loss: 0.5533 (0.5533)  acc1: 91.6000 (91.6000)  acc5: 98.8000 (98.8000)  time: 3.4344  data: 3.2902  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8322 (0.8684)  acc1: 86.0000 (83.3818)  acc5: 97.6000 (97.0182)  time: 0.5431  data: 0.4166  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9010 (0.8978)  acc1: 81.6000 (82.5524)  acc5: 97.6000 (96.9905)  time: 0.2619  data: 0.1400  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 0.9904 (0.9855)  acc1: 78.4000 (80.5290)  acc5: 94.4000 (95.6000)  time: 0.2697  data: 0.1506  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2486 (1.0525)  acc1: 75.2000 (78.6049)  acc5: 92.0000 (94.7415)  time: 0.2476  data: 0.1285  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2364 (1.0617)  acc1: 74.4000 (78.1280)  acc5: 92.0000 (94.6640)  time: 0.2583  data: 0.1395  max mem: 18975
Test: Total time: 0:00:15 (0.3069 s / it)
* Acc@1 78.286 Acc@5 94.596 loss 1.064
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.37%
Epoch: [183]  [   0/2502]  eta: 1:28:32  lr: 0.001304  min_lr: 0.001304  loss: 3.7484 (3.7484)  weight_decay: 0.0500 (0.0500)  time: 2.1233  data: 1.8123  max mem: 18975
Epoch: [183]  [ 200/2502]  eta: 0:10:19  lr: 0.001303  min_lr: 0.001303  loss: 3.0254 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0126 (0.9893)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [183]  [ 400/2502]  eta: 0:09:12  lr: 0.001301  min_lr: 0.001301  loss: 3.1528 (3.1260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0972 (1.0503)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [183]  [ 600/2502]  eta: 0:08:15  lr: 0.001299  min_lr: 0.001299  loss: 3.2991 (3.1648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9584 (1.0336)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [183]  [ 800/2502]  eta: 0:07:22  lr: 0.001298  min_lr: 0.001298  loss: 2.6677 (3.1648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9644 (1.0377)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [183]  [1000/2502]  eta: 0:06:29  lr: 0.001296  min_lr: 0.001296  loss: 3.4151 (3.1586)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1034 (1.0508)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [183]  [1200/2502]  eta: 0:05:36  lr: 0.001295  min_lr: 0.001295  loss: 3.0866 (3.1614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9512 (1.0475)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [183]  [1400/2502]  eta: 0:04:44  lr: 0.001293  min_lr: 0.001293  loss: 3.4294 (3.1539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9971 (1.0389)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [183]  [1600/2502]  eta: 0:03:52  lr: 0.001292  min_lr: 0.001292  loss: 2.8552 (3.1571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0334 (1.0374)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [183]  [1800/2502]  eta: 0:03:01  lr: 0.001290  min_lr: 0.001290  loss: 3.2946 (3.1579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0170 (1.0359)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [183]  [2000/2502]  eta: 0:02:09  lr: 0.001289  min_lr: 0.001289  loss: 3.5058 (3.1535)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0589 (1.0411)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [183]  [2200/2502]  eta: 0:01:17  lr: 0.001287  min_lr: 0.001287  loss: 3.1296 (3.1555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0402)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [183]  [2400/2502]  eta: 0:00:26  lr: 0.001286  min_lr: 0.001286  loss: 3.2989 (3.1602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0010 (1.0396)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [183]  [2501/2502]  eta: 0:00:00  lr: 0.001285  min_lr: 0.001285  loss: 3.3375 (3.1632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0387)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [183] Total time: 0:10:44 (0.2578 s / it)
Averaged stats: lr: 0.001285  min_lr: 0.001285  loss: 3.3375 (3.1743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0387)
Test:  [ 0/50]  eta: 0:02:39  loss: 0.4624 (0.4624)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 3.1854  data: 3.0273  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.7408 (0.7986)  acc1: 85.2000 (83.6364)  acc5: 96.4000 (96.8364)  time: 0.5077  data: 0.3812  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8267 (0.8255)  acc1: 82.0000 (82.7619)  acc5: 96.4000 (96.9143)  time: 0.2329  data: 0.1115  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 0.9781 (0.9362)  acc1: 78.8000 (80.5290)  acc5: 94.4000 (95.5355)  time: 0.2600  data: 0.1401  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2275 (1.0068)  acc1: 73.2000 (78.8781)  acc5: 92.0000 (94.7805)  time: 0.2832  data: 0.1635  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2275 (1.0265)  acc1: 73.2000 (78.3280)  acc5: 91.6000 (94.6160)  time: 0.2165  data: 0.0976  max mem: 18975
Test: Total time: 0:00:15 (0.3023 s / it)
* Acc@1 78.458 Acc@5 94.548 loss 1.025
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.46%
Epoch: [184]  [   0/2502]  eta: 1:29:06  lr: 0.001285  min_lr: 0.001285  loss: 4.2498 (4.2498)  weight_decay: 0.0500 (0.0500)  time: 2.1370  data: 1.8691  max mem: 18975
Epoch: [184]  [ 200/2502]  eta: 0:10:18  lr: 0.001284  min_lr: 0.001284  loss: 3.2503 (3.1967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9336 (1.0494)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [184]  [ 400/2502]  eta: 0:09:12  lr: 0.001282  min_lr: 0.001282  loss: 3.2822 (3.1518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.0682)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [184]  [ 600/2502]  eta: 0:08:16  lr: 0.001281  min_lr: 0.001281  loss: 3.0674 (3.1698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (1.0574)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [184]  [ 800/2502]  eta: 0:07:22  lr: 0.001279  min_lr: 0.001279  loss: 2.9087 (3.1568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0089 (1.0577)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [184]  [1000/2502]  eta: 0:06:29  lr: 0.001278  min_lr: 0.001278  loss: 3.2099 (3.1455)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0348 (1.0525)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [184]  [1200/2502]  eta: 0:05:37  lr: 0.001276  min_lr: 0.001276  loss: 3.3522 (3.1401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9754 (1.0433)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [184]  [1400/2502]  eta: 0:04:45  lr: 0.001274  min_lr: 0.001274  loss: 3.0535 (3.1380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0607 (1.0433)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [184]  [1600/2502]  eta: 0:03:53  lr: 0.001273  min_lr: 0.001273  loss: 3.3522 (3.1394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0177 (1.0452)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [184]  [1800/2502]  eta: 0:03:01  lr: 0.001271  min_lr: 0.001271  loss: 2.9514 (3.1454)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0344 (1.0463)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [184]  [2000/2502]  eta: 0:02:09  lr: 0.001270  min_lr: 0.001270  loss: 2.7357 (3.1472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0221 (1.0481)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [184]  [2200/2502]  eta: 0:01:17  lr: 0.001268  min_lr: 0.001268  loss: 3.2508 (3.1514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0931 (1.0545)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [184]  [2400/2502]  eta: 0:00:26  lr: 0.001267  min_lr: 0.001267  loss: 3.2492 (3.1587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9809 (1.0499)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [184]  [2501/2502]  eta: 0:00:00  lr: 0.001266  min_lr: 0.001266  loss: 3.5467 (3.1618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (1.0498)  time: 0.2305  data: 0.0008  max mem: 18975
Epoch: [184] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001266  min_lr: 0.001266  loss: 3.5467 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (1.0498)
Test:  [ 0/50]  eta: 0:02:35  loss: 0.6127 (0.6127)  acc1: 91.2000 (91.2000)  acc5: 98.4000 (98.4000)  time: 3.1129  data: 2.9713  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.9255 (0.9457)  acc1: 84.0000 (83.9273)  acc5: 98.0000 (97.0545)  time: 0.4870  data: 0.3657  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9961 (0.9803)  acc1: 81.2000 (82.7810)  acc5: 96.8000 (96.8762)  time: 0.2307  data: 0.1110  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0588 (1.0759)  acc1: 78.8000 (80.3226)  acc5: 94.8000 (95.4581)  time: 0.2618  data: 0.1422  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2968 (1.1316)  acc1: 74.8000 (78.9171)  acc5: 92.4000 (94.8390)  time: 0.2441  data: 0.1250  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2948 (1.1481)  acc1: 74.4000 (78.1280)  acc5: 92.8000 (94.7760)  time: 0.1947  data: 0.0717  max mem: 18975
Test: Total time: 0:00:14 (0.2906 s / it)
* Acc@1 78.350 Acc@5 94.618 loss 1.144
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.46%
Epoch: [185]  [   0/2502]  eta: 1:37:13  lr: 0.001266  min_lr: 0.001266  loss: 3.6380 (3.6380)  weight_decay: 0.0500 (0.0500)  time: 2.3316  data: 2.0512  max mem: 18975
Epoch: [185]  [ 200/2502]  eta: 0:10:20  lr: 0.001265  min_lr: 0.001265  loss: 3.2984 (3.1820)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0767 (1.0761)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [185]  [ 400/2502]  eta: 0:09:14  lr: 0.001263  min_lr: 0.001263  loss: 2.7469 (3.1875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0184 (1.0668)  time: 0.2581  data: 0.0005  max mem: 18975
Epoch: [185]  [ 600/2502]  eta: 0:08:16  lr: 0.001262  min_lr: 0.001262  loss: 2.7844 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0108 (1.0557)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [185]  [ 800/2502]  eta: 0:07:22  lr: 0.001260  min_lr: 0.001260  loss: 3.2820 (3.1564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9665 (1.0613)  time: 0.2639  data: 0.0004  max mem: 18975
Epoch: [185]  [1000/2502]  eta: 0:06:29  lr: 0.001259  min_lr: 0.001259  loss: 3.0858 (3.1622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9576 (1.0524)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [185]  [1200/2502]  eta: 0:05:37  lr: 0.001257  min_lr: 0.001257  loss: 3.3227 (3.1615)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0580 (inf)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [185]  [1400/2502]  eta: 0:04:45  lr: 0.001256  min_lr: 0.001256  loss: 3.4657 (3.1617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9953 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [185]  [1600/2502]  eta: 0:03:53  lr: 0.001254  min_lr: 0.001254  loss: 2.9971 (3.1554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0073 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [185]  [1800/2502]  eta: 0:03:01  lr: 0.001253  min_lr: 0.001253  loss: 3.6221 (3.1561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0528 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [185]  [2000/2502]  eta: 0:02:09  lr: 0.001251  min_lr: 0.001251  loss: 3.4176 (3.1521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0190 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [185]  [2200/2502]  eta: 0:01:18  lr: 0.001250  min_lr: 0.001250  loss: 3.2347 (3.1567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9935 (inf)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [185]  [2400/2502]  eta: 0:00:26  lr: 0.001248  min_lr: 0.001248  loss: 3.3019 (3.1638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9641 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [185]  [2501/2502]  eta: 0:00:00  lr: 0.001247  min_lr: 0.001247  loss: 3.5804 (3.1667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9947 (inf)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [185] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001247  min_lr: 0.001247  loss: 3.5804 (3.1617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9947 (inf)
Test:  [ 0/50]  eta: 0:02:38  loss: 0.6705 (0.6705)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 3.1789  data: 3.0354  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9352 (0.9581)  acc1: 84.8000 (83.9273)  acc5: 96.8000 (96.6909)  time: 0.5407  data: 0.4185  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9691 (0.9820)  acc1: 81.2000 (83.0476)  acc5: 96.8000 (96.5714)  time: 0.2585  data: 0.1390  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1240 (1.0858)  acc1: 78.4000 (80.6323)  acc5: 93.2000 (95.1613)  time: 0.2579  data: 0.1391  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3535 (1.1519)  acc1: 74.0000 (78.7707)  acc5: 92.4000 (94.5659)  time: 0.2820  data: 0.1633  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3135 (1.1577)  acc1: 74.0000 (78.2960)  acc5: 92.4000 (94.5040)  time: 0.2181  data: 0.0994  max mem: 18975
Test: Total time: 0:00:15 (0.3092 s / it)
* Acc@1 78.402 Acc@5 94.586 loss 1.155
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.46%
Epoch: [186]  [   0/2502]  eta: 1:21:42  lr: 0.001247  min_lr: 0.001247  loss: 4.0930 (4.0930)  weight_decay: 0.0500 (0.0500)  time: 1.9596  data: 1.3721  max mem: 18975
Epoch: [186]  [ 200/2502]  eta: 0:10:16  lr: 0.001246  min_lr: 0.001246  loss: 2.6276 (3.1657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0290 (1.0426)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [186]  [ 400/2502]  eta: 0:09:11  lr: 0.001244  min_lr: 0.001244  loss: 2.7900 (3.1410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (1.0670)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [186]  [ 600/2502]  eta: 0:08:16  lr: 0.001243  min_lr: 0.001243  loss: 3.5375 (3.1525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (1.0580)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [186]  [ 800/2502]  eta: 0:07:22  lr: 0.001241  min_lr: 0.001241  loss: 3.4837 (3.1451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9837 (1.0528)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [186]  [1000/2502]  eta: 0:06:29  lr: 0.001240  min_lr: 0.001240  loss: 3.6076 (3.1608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9398 (1.0461)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [186]  [1200/2502]  eta: 0:05:37  lr: 0.001238  min_lr: 0.001238  loss: 3.3897 (3.1487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9743 (1.0432)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [186]  [1400/2502]  eta: 0:04:45  lr: 0.001237  min_lr: 0.001237  loss: 3.0813 (3.1461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0242 (1.0475)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [186]  [1600/2502]  eta: 0:03:53  lr: 0.001235  min_lr: 0.001235  loss: 3.2835 (3.1527)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0119 (1.0509)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [186]  [1800/2502]  eta: 0:03:01  lr: 0.001234  min_lr: 0.001234  loss: 3.3245 (3.1478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0350 (1.0477)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [186]  [2000/2502]  eta: 0:02:09  lr: 0.001232  min_lr: 0.001232  loss: 2.7804 (3.1506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0085 (1.0494)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [186]  [2200/2502]  eta: 0:01:17  lr: 0.001231  min_lr: 0.001231  loss: 3.4838 (3.1542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9600 (1.0481)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [186]  [2400/2502]  eta: 0:00:26  lr: 0.001229  min_lr: 0.001229  loss: 3.1517 (3.1571)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0249 (1.0476)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [186]  [2501/2502]  eta: 0:00:00  lr: 0.001229  min_lr: 0.001229  loss: 3.3496 (3.1554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0417 (1.0473)  time: 0.2312  data: 0.0010  max mem: 18975
Epoch: [186] Total time: 0:10:45 (0.2581 s / it)
Averaged stats: lr: 0.001229  min_lr: 0.001229  loss: 3.3496 (3.1664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0417 (1.0473)
Test:  [ 0/50]  eta: 0:03:09  loss: 0.5814 (0.5814)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 3.7902  data: 3.6458  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.8252 (0.8767)  acc1: 85.2000 (83.7091)  acc5: 97.2000 (97.0545)  time: 0.6155  data: 0.4920  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.8724 (0.8934)  acc1: 81.2000 (82.8571)  acc5: 96.8000 (96.9333)  time: 0.2955  data: 0.1745  max mem: 18975
Test:  [30/50]  eta: 0:00:08  loss: 1.0311 (1.0046)  acc1: 78.8000 (80.6452)  acc5: 94.4000 (95.6516)  time: 0.2818  data: 0.1620  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2986 (1.0815)  acc1: 74.8000 (78.7220)  acc5: 92.4000 (94.8098)  time: 0.2218  data: 0.1027  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2725 (1.1001)  acc1: 73.6000 (78.0400)  acc5: 92.4000 (94.5920)  time: 0.2213  data: 0.1026  max mem: 18975
Test: Total time: 0:00:15 (0.3057 s / it)
* Acc@1 78.364 Acc@5 94.556 loss 1.098
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.46%
Epoch: [187]  [   0/2502]  eta: 1:22:15  lr: 0.001229  min_lr: 0.001229  loss: 2.1006 (2.1006)  weight_decay: 0.0500 (0.0500)  time: 1.9727  data: 1.5150  max mem: 18975
Epoch: [187]  [ 200/2502]  eta: 0:10:16  lr: 0.001227  min_lr: 0.001227  loss: 3.5376 (3.1040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0029 (1.0518)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [187]  [ 400/2502]  eta: 0:09:12  lr: 0.001226  min_lr: 0.001226  loss: 2.8982 (3.1056)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.0672)  time: 0.2638  data: 0.0004  max mem: 18975
Epoch: [187]  [ 600/2502]  eta: 0:08:16  lr: 0.001224  min_lr: 0.001224  loss: 3.3921 (3.1335)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0601 (1.0818)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [187]  [ 800/2502]  eta: 0:07:22  lr: 0.001223  min_lr: 0.001223  loss: 3.3043 (3.1508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0051 (1.0717)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [187]  [1000/2502]  eta: 0:06:29  lr: 0.001221  min_lr: 0.001221  loss: 2.8532 (3.1512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0515 (1.0688)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [187]  [1200/2502]  eta: 0:05:37  lr: 0.001220  min_lr: 0.001220  loss: 3.0205 (3.1491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9938 (1.0664)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [187]  [1400/2502]  eta: 0:04:45  lr: 0.001218  min_lr: 0.001218  loss: 3.4362 (3.1582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0612)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [187]  [1600/2502]  eta: 0:03:53  lr: 0.001217  min_lr: 0.001217  loss: 2.3819 (3.1489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9796 (1.0609)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [187]  [1800/2502]  eta: 0:03:01  lr: 0.001215  min_lr: 0.001215  loss: 3.0474 (3.1505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0632 (1.0624)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [187]  [2000/2502]  eta: 0:02:09  lr: 0.001214  min_lr: 0.001214  loss: 3.0319 (3.1529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9659 (1.0611)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [187]  [2200/2502]  eta: 0:01:17  lr: 0.001212  min_lr: 0.001212  loss: 3.0208 (3.1511)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0080 (inf)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [187]  [2400/2502]  eta: 0:00:26  lr: 0.001211  min_lr: 0.001211  loss: 3.5699 (3.1590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9474 (inf)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [187]  [2501/2502]  eta: 0:00:00  lr: 0.001210  min_lr: 0.001210  loss: 3.4190 (3.1573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (inf)  time: 0.2315  data: 0.0009  max mem: 18975
Epoch: [187] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.001210  min_lr: 0.001210  loss: 3.4190 (3.1574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (inf)
Test:  [ 0/50]  eta: 0:02:28  loss: 0.5574 (0.5574)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 2.9742  data: 2.8157  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8662 (0.8956)  acc1: 83.6000 (84.6909)  acc5: 96.8000 (96.8364)  time: 0.4952  data: 0.3725  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.9220 (0.9220)  acc1: 81.6000 (83.0286)  acc5: 96.8000 (96.8952)  time: 0.2434  data: 0.1237  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1123 (1.0319)  acc1: 77.2000 (80.5806)  acc5: 95.2000 (95.4581)  time: 0.2418  data: 0.1221  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2870 (1.0996)  acc1: 73.6000 (78.8195)  acc5: 91.6000 (94.7415)  time: 0.2384  data: 0.1183  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2618 (1.1221)  acc1: 73.2000 (78.0720)  acc5: 92.8000 (94.5920)  time: 0.2064  data: 0.0857  max mem: 18975
Test: Total time: 0:00:14 (0.2879 s / it)
* Acc@1 78.452 Acc@5 94.612 loss 1.116
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.46%
Epoch: [188]  [   0/2502]  eta: 1:14:58  lr: 0.001210  min_lr: 0.001210  loss: 2.1167 (2.1167)  weight_decay: 0.0500 (0.0500)  time: 1.7979  data: 1.5398  max mem: 18975
Epoch: [188]  [ 200/2502]  eta: 0:10:16  lr: 0.001208  min_lr: 0.001208  loss: 2.6804 (3.1323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0938 (1.1346)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [188]  [ 400/2502]  eta: 0:09:12  lr: 0.001207  min_lr: 0.001207  loss: 3.0139 (3.1411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0220 (1.0726)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [188]  [ 600/2502]  eta: 0:08:16  lr: 0.001205  min_lr: 0.001205  loss: 3.1277 (3.1452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0162 (1.0668)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [188]  [ 800/2502]  eta: 0:07:22  lr: 0.001204  min_lr: 0.001204  loss: 3.2683 (3.1340)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0631 (1.0648)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [188]  [1000/2502]  eta: 0:06:29  lr: 0.001202  min_lr: 0.001202  loss: 2.8234 (3.1333)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0622 (1.0721)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [188]  [1200/2502]  eta: 0:05:37  lr: 0.001201  min_lr: 0.001201  loss: 3.1095 (3.1283)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0147 (1.0665)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [188]  [1400/2502]  eta: 0:04:45  lr: 0.001199  min_lr: 0.001199  loss: 3.5393 (3.1265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.0715)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [188]  [1600/2502]  eta: 0:03:53  lr: 0.001198  min_lr: 0.001198  loss: 3.3426 (3.1384)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0947 (1.0744)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [188]  [1800/2502]  eta: 0:03:01  lr: 0.001196  min_lr: 0.001196  loss: 3.3197 (3.1457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0264 (1.0746)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [188]  [2000/2502]  eta: 0:02:09  lr: 0.001195  min_lr: 0.001195  loss: 3.1331 (3.1423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0090 (1.0728)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [188]  [2200/2502]  eta: 0:01:17  lr: 0.001193  min_lr: 0.001193  loss: 3.0215 (3.1547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9920 (1.0699)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [188]  [2400/2502]  eta: 0:00:26  lr: 0.001192  min_lr: 0.001192  loss: 3.2751 (3.1537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0888 (1.0665)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [188]  [2501/2502]  eta: 0:00:00  lr: 0.001191  min_lr: 0.001191  loss: 3.6401 (3.1589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (1.0646)  time: 0.2376  data: 0.0007  max mem: 18975
Epoch: [188] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001191  min_lr: 0.001191  loss: 3.6401 (3.1574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (1.0646)
Test:  [ 0/50]  eta: 0:03:04  loss: 0.5876 (0.5876)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 3.6837  data: 3.5425  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.9326 (0.9474)  acc1: 84.4000 (83.1273)  acc5: 97.6000 (97.2727)  time: 0.5400  data: 0.4113  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.9733 (0.9790)  acc1: 82.0000 (82.5714)  acc5: 97.2000 (97.1810)  time: 0.2535  data: 0.1302  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.1215 (1.0897)  acc1: 78.4000 (80.3871)  acc5: 94.8000 (95.8710)  time: 0.2857  data: 0.1664  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3773 (1.1574)  acc1: 74.4000 (78.7024)  acc5: 92.8000 (95.0244)  time: 0.2400  data: 0.1209  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3302 (1.1788)  acc1: 73.6000 (78.1360)  acc5: 93.2000 (94.8640)  time: 0.2183  data: 0.0953  max mem: 18975
Test: Total time: 0:00:14 (0.2963 s / it)
* Acc@1 78.208 Acc@5 94.726 loss 1.183
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.46%
Epoch: [189]  [   0/2502]  eta: 1:21:59  lr: 0.001191  min_lr: 0.001191  loss: 2.8470 (2.8470)  weight_decay: 0.0500 (0.0500)  time: 1.9662  data: 1.5252  max mem: 18975
Epoch: [189]  [ 200/2502]  eta: 0:10:20  lr: 0.001190  min_lr: 0.001190  loss: 3.5365 (3.1548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0968 (1.0953)  time: 0.2597  data: 0.0005  max mem: 18975
Epoch: [189]  [ 400/2502]  eta: 0:09:14  lr: 0.001188  min_lr: 0.001188  loss: 3.2899 (3.0869)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0311 (1.0757)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [189]  [ 600/2502]  eta: 0:08:18  lr: 0.001187  min_lr: 0.001187  loss: 3.4093 (3.1119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0489 (1.0641)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [189]  [ 800/2502]  eta: 0:07:23  lr: 0.001185  min_lr: 0.001185  loss: 3.2300 (3.1061)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0169 (1.0628)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [189]  [1000/2502]  eta: 0:06:30  lr: 0.001184  min_lr: 0.001184  loss: 3.4613 (3.1165)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0547 (1.0641)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [189]  [1200/2502]  eta: 0:05:37  lr: 0.001182  min_lr: 0.001182  loss: 3.1933 (3.1256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0693 (1.0711)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [189]  [1400/2502]  eta: 0:04:45  lr: 0.001181  min_lr: 0.001181  loss: 3.2741 (3.1184)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1868 (1.0792)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [189]  [1600/2502]  eta: 0:03:53  lr: 0.001179  min_lr: 0.001179  loss: 2.9870 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0608 (1.0777)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [189]  [1800/2502]  eta: 0:03:01  lr: 0.001178  min_lr: 0.001178  loss: 3.4139 (3.1476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (1.0750)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [189]  [2000/2502]  eta: 0:02:09  lr: 0.001176  min_lr: 0.001176  loss: 3.0362 (3.1596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1265 (1.0745)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [189]  [2200/2502]  eta: 0:01:18  lr: 0.001175  min_lr: 0.001175  loss: 3.5053 (3.1605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0228 (1.0750)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [189]  [2400/2502]  eta: 0:00:26  lr: 0.001173  min_lr: 0.001173  loss: 3.0644 (3.1596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9980 (1.0757)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [189]  [2501/2502]  eta: 0:00:00  lr: 0.001173  min_lr: 0.001173  loss: 3.4693 (3.1615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0736)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [189] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001173  min_lr: 0.001173  loss: 3.4693 (3.1494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0736)
Test:  [ 0/50]  eta: 0:03:15  loss: 0.5647 (0.5647)  acc1: 91.6000 (91.6000)  acc5: 98.8000 (98.8000)  time: 3.9137  data: 3.7767  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8808 (0.8827)  acc1: 83.6000 (82.9818)  acc5: 97.2000 (96.9818)  time: 0.5394  data: 0.4141  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8697 (0.8954)  acc1: 81.6000 (82.7238)  acc5: 96.8000 (96.8571)  time: 0.2079  data: 0.0849  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0648 (1.0042)  acc1: 78.4000 (80.4129)  acc5: 94.8000 (95.5871)  time: 0.2578  data: 0.1360  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2675 (1.0714)  acc1: 74.0000 (78.6927)  acc5: 92.4000 (95.0244)  time: 0.2646  data: 0.1442  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2702 (1.0906)  acc1: 73.2000 (78.1200)  acc5: 92.8000 (94.8800)  time: 0.2008  data: 0.0819  max mem: 18975
Test: Total time: 0:00:15 (0.3019 s / it)
* Acc@1 78.542 Acc@5 94.724 loss 1.091
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.54%
Epoch: [190]  [   0/2502]  eta: 1:13:44  lr: 0.001173  min_lr: 0.001173  loss: 3.8671 (3.8671)  weight_decay: 0.0500 (0.0500)  time: 1.7685  data: 1.4970  max mem: 18975
Epoch: [190]  [ 200/2502]  eta: 0:10:18  lr: 0.001171  min_lr: 0.001171  loss: 3.2541 (3.1052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0650 (1.1083)  time: 0.2590  data: 0.0004  max mem: 18975
Epoch: [190]  [ 400/2502]  eta: 0:09:11  lr: 0.001170  min_lr: 0.001170  loss: 2.9766 (3.0832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0306 (1.0688)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [190]  [ 600/2502]  eta: 0:08:16  lr: 0.001168  min_lr: 0.001168  loss: 3.5232 (3.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (1.0663)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [190]  [ 800/2502]  eta: 0:07:22  lr: 0.001167  min_lr: 0.001167  loss: 3.1589 (3.1192)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1168 (1.0758)  time: 0.2572  data: 0.0003  max mem: 18975
Epoch: [190]  [1000/2502]  eta: 0:06:29  lr: 0.001165  min_lr: 0.001165  loss: 2.6909 (3.1086)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1589 (1.0836)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [190]  [1200/2502]  eta: 0:05:37  lr: 0.001164  min_lr: 0.001164  loss: 3.1449 (3.1080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0743 (1.0820)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [190]  [1400/2502]  eta: 0:04:44  lr: 0.001162  min_lr: 0.001162  loss: 3.3448 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9980 (1.0819)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [190]  [1600/2502]  eta: 0:03:52  lr: 0.001161  min_lr: 0.001161  loss: 3.5180 (3.1225)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1136 (1.0844)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [190]  [1800/2502]  eta: 0:03:01  lr: 0.001159  min_lr: 0.001159  loss: 3.0349 (3.1313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0887 (1.0867)  time: 0.2614  data: 0.0004  max mem: 18975
Epoch: [190]  [2000/2502]  eta: 0:02:09  lr: 0.001158  min_lr: 0.001158  loss: 3.2865 (3.1330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9523 (1.0861)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [190]  [2200/2502]  eta: 0:01:17  lr: 0.001156  min_lr: 0.001156  loss: 3.4599 (3.1346)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1545 (1.0907)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [190]  [2400/2502]  eta: 0:00:26  lr: 0.001155  min_lr: 0.001155  loss: 3.4569 (3.1371)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (1.0916)  time: 0.2557  data: 0.0003  max mem: 18975
Epoch: [190]  [2501/2502]  eta: 0:00:00  lr: 0.001154  min_lr: 0.001154  loss: 3.6052 (3.1372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0173 (1.0904)  time: 0.2308  data: 0.0007  max mem: 18975
Epoch: [190] Total time: 0:10:44 (0.2578 s / it)
Averaged stats: lr: 0.001154  min_lr: 0.001154  loss: 3.6052 (3.1390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0173 (1.0904)
Test:  [ 0/50]  eta: 0:03:18  loss: 0.6560 (0.6560)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 3.9610  data: 3.8150  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.8779 (0.9173)  acc1: 84.8000 (83.6000)  acc5: 96.8000 (96.7273)  time: 0.5047  data: 0.3830  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9310 (0.9503)  acc1: 81.6000 (82.7810)  acc5: 97.2000 (96.8000)  time: 0.1800  data: 0.0594  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1225 (1.0549)  acc1: 78.4000 (80.4387)  acc5: 94.8000 (95.4065)  time: 0.2572  data: 0.1343  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2975 (1.1145)  acc1: 74.8000 (78.7902)  acc5: 92.0000 (94.6829)  time: 0.2753  data: 0.1538  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2791 (1.1267)  acc1: 74.4000 (78.2560)  acc5: 92.8000 (94.5440)  time: 0.2036  data: 0.0847  max mem: 18975
Test: Total time: 0:00:14 (0.2953 s / it)
* Acc@1 78.426 Acc@5 94.756 loss 1.122
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.54%
Epoch: [191]  [   0/2502]  eta: 1:37:08  lr: 0.001154  min_lr: 0.001154  loss: 3.3544 (3.3544)  weight_decay: 0.0500 (0.0500)  time: 2.3294  data: 1.8207  max mem: 18975
Epoch: [191]  [ 200/2502]  eta: 0:10:20  lr: 0.001153  min_lr: 0.001153  loss: 2.4585 (3.0450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0624 (1.1121)  time: 0.2623  data: 0.0004  max mem: 18975
Epoch: [191]  [ 400/2502]  eta: 0:09:13  lr: 0.001151  min_lr: 0.001151  loss: 2.9289 (3.0717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0925 (1.0954)  time: 0.2562  data: 0.0003  max mem: 18975
Epoch: [191]  [ 600/2502]  eta: 0:08:17  lr: 0.001150  min_lr: 0.001150  loss: 2.6040 (3.0879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1054 (1.1018)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [191]  [ 800/2502]  eta: 0:07:22  lr: 0.001148  min_lr: 0.001148  loss: 3.4742 (3.1249)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0563 (1.0891)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [191]  [1000/2502]  eta: 0:06:30  lr: 0.001147  min_lr: 0.001147  loss: 3.4318 (3.1201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0623 (1.0940)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [191]  [1200/2502]  eta: 0:05:37  lr: 0.001145  min_lr: 0.001145  loss: 3.4106 (3.1319)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0830 (1.0907)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [191]  [1400/2502]  eta: 0:04:45  lr: 0.001144  min_lr: 0.001144  loss: 3.0207 (3.1310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0851 (1.0861)  time: 0.2579  data: 0.0003  max mem: 18975
Epoch: [191]  [1600/2502]  eta: 0:03:53  lr: 0.001142  min_lr: 0.001142  loss: 3.4518 (3.1428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0984 (1.0896)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [191]  [1800/2502]  eta: 0:03:01  lr: 0.001141  min_lr: 0.001141  loss: 3.1224 (3.1510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9792 (1.0849)  time: 0.2574  data: 0.0003  max mem: 18975
Epoch: [191]  [2000/2502]  eta: 0:02:09  lr: 0.001139  min_lr: 0.001139  loss: 3.1420 (3.1442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1012 (1.0849)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [191]  [2200/2502]  eta: 0:01:18  lr: 0.001138  min_lr: 0.001138  loss: 2.9354 (3.1449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0261 (1.0868)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [191]  [2400/2502]  eta: 0:00:26  lr: 0.001137  min_lr: 0.001137  loss: 3.3517 (3.1446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0840 (1.0877)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [191]  [2501/2502]  eta: 0:00:00  lr: 0.001136  min_lr: 0.001136  loss: 2.4320 (3.1466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0343 (1.0866)  time: 0.2303  data: 0.0008  max mem: 18975
Epoch: [191] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.001136  min_lr: 0.001136  loss: 2.4320 (3.1456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0343 (1.0866)
Test:  [ 0/50]  eta: 0:03:28  loss: 0.4568 (0.4568)  acc1: 92.8000 (92.8000)  acc5: 98.4000 (98.4000)  time: 4.1762  data: 4.0310  max mem: 18975
Test:  [10/50]  eta: 0:00:24  loss: 0.7719 (0.7907)  acc1: 84.0000 (83.6727)  acc5: 97.2000 (96.7273)  time: 0.6165  data: 0.4947  max mem: 18975
Test:  [20/50]  eta: 0:00:13  loss: 0.7880 (0.8130)  acc1: 81.6000 (83.1238)  acc5: 97.2000 (96.7429)  time: 0.2549  data: 0.1351  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 0.9700 (0.9185)  acc1: 79.2000 (80.9806)  acc5: 95.2000 (95.4710)  time: 0.2556  data: 0.1353  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1926 (0.9825)  acc1: 75.2000 (79.3561)  acc5: 92.4000 (94.8683)  time: 0.2230  data: 0.1013  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1729 (0.9998)  acc1: 74.4000 (78.7040)  acc5: 93.2000 (94.7600)  time: 0.2225  data: 0.1011  max mem: 18975
Test: Total time: 0:00:14 (0.2979 s / it)
* Acc@1 78.520 Acc@5 94.716 loss 1.003
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.54%
Epoch: [192]  [   0/2502]  eta: 1:28:44  lr: 0.001136  min_lr: 0.001136  loss: 3.3143 (3.3143)  weight_decay: 0.0500 (0.0500)  time: 2.1280  data: 1.5642  max mem: 18975
Epoch: [192]  [ 200/2502]  eta: 0:10:19  lr: 0.001134  min_lr: 0.001134  loss: 3.2192 (3.1531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0274 (1.0708)  time: 0.2656  data: 0.0004  max mem: 18975
Epoch: [192]  [ 400/2502]  eta: 0:09:12  lr: 0.001133  min_lr: 0.001133  loss: 3.1698 (3.1388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0754 (1.0907)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [192]  [ 600/2502]  eta: 0:08:16  lr: 0.001131  min_lr: 0.001131  loss: 3.1243 (3.1161)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.1042)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [192]  [ 800/2502]  eta: 0:07:22  lr: 0.001130  min_lr: 0.001130  loss: 3.3114 (3.1101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0799 (1.1098)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [192]  [1000/2502]  eta: 0:06:29  lr: 0.001128  min_lr: 0.001128  loss: 3.3625 (3.1052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0567 (1.1063)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [192]  [1200/2502]  eta: 0:05:37  lr: 0.001127  min_lr: 0.001127  loss: 2.8037 (3.1208)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0447 (1.1020)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [192]  [1400/2502]  eta: 0:04:45  lr: 0.001125  min_lr: 0.001125  loss: 2.8135 (3.1262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0224 (1.0961)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [192]  [1600/2502]  eta: 0:03:53  lr: 0.001124  min_lr: 0.001124  loss: 3.1344 (3.1262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1706 (1.1042)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [192]  [1800/2502]  eta: 0:03:01  lr: 0.001123  min_lr: 0.001123  loss: 3.1832 (3.1221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0782 (1.1006)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [192]  [2000/2502]  eta: 0:02:09  lr: 0.001121  min_lr: 0.001121  loss: 3.4372 (3.1245)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0159 (1.1017)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [192]  [2200/2502]  eta: 0:01:18  lr: 0.001120  min_lr: 0.001120  loss: 2.8008 (3.1312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1060 (1.1042)  time: 0.2578  data: 0.0003  max mem: 18975
Epoch: [192]  [2400/2502]  eta: 0:00:26  lr: 0.001118  min_lr: 0.001118  loss: 2.9987 (3.1266)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1060 (1.1079)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [192]  [2501/2502]  eta: 0:00:00  lr: 0.001117  min_lr: 0.001117  loss: 3.4627 (3.1293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9874 (1.1050)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [192] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001117  min_lr: 0.001117  loss: 3.4627 (3.1281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9874 (1.1050)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.5646 (0.5646)  acc1: 94.0000 (94.0000)  acc5: 98.4000 (98.4000)  time: 3.3606  data: 3.2076  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8376 (0.8539)  acc1: 84.8000 (84.6909)  acc5: 96.8000 (96.8364)  time: 0.4998  data: 0.3763  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8737 (0.8844)  acc1: 82.4000 (83.4857)  acc5: 96.8000 (96.8762)  time: 0.2078  data: 0.0879  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0864 (0.9922)  acc1: 79.6000 (80.9032)  acc5: 94.8000 (95.4581)  time: 0.2346  data: 0.1154  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2322 (1.0519)  acc1: 74.8000 (79.1415)  acc5: 92.0000 (94.8195)  time: 0.2610  data: 0.1414  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2054 (1.0642)  acc1: 74.8000 (78.7120)  acc5: 92.4000 (94.6960)  time: 0.2164  data: 0.0967  max mem: 18975
Test: Total time: 0:00:14 (0.2911 s / it)
* Acc@1 78.684 Acc@5 94.816 loss 1.064
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.68%
Epoch: [193]  [   0/2502]  eta: 1:30:17  lr: 0.001117  min_lr: 0.001117  loss: 3.9919 (3.9919)  weight_decay: 0.0500 (0.0500)  time: 2.1654  data: 1.9025  max mem: 18975
Epoch: [193]  [ 200/2502]  eta: 0:10:18  lr: 0.001116  min_lr: 0.001116  loss: 3.1275 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1811 (1.1069)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [193]  [ 400/2502]  eta: 0:09:12  lr: 0.001115  min_lr: 0.001115  loss: 2.8719 (3.1253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1037 (1.1388)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [193]  [ 600/2502]  eta: 0:08:16  lr: 0.001113  min_lr: 0.001113  loss: 2.7845 (3.1148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0420 (1.1201)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [193]  [ 800/2502]  eta: 0:07:22  lr: 0.001112  min_lr: 0.001112  loss: 2.9750 (3.1138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0485 (1.1131)  time: 0.2578  data: 0.0005  max mem: 18975
Epoch: [193]  [1000/2502]  eta: 0:06:29  lr: 0.001110  min_lr: 0.001110  loss: 2.9687 (3.1043)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1155 (1.1102)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [193]  [1200/2502]  eta: 0:05:37  lr: 0.001109  min_lr: 0.001109  loss: 3.2696 (3.0993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1036 (1.1097)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [193]  [1400/2502]  eta: 0:04:45  lr: 0.001107  min_lr: 0.001107  loss: 2.9423 (3.1086)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0342 (1.1042)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [193]  [1600/2502]  eta: 0:03:53  lr: 0.001106  min_lr: 0.001106  loss: 3.4862 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0921 (1.1106)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [193]  [1800/2502]  eta: 0:03:01  lr: 0.001104  min_lr: 0.001104  loss: 3.3603 (3.1133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0822 (1.1118)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [193]  [2000/2502]  eta: 0:02:09  lr: 0.001103  min_lr: 0.001103  loss: 3.1488 (3.1122)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0370 (1.1078)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [193]  [2200/2502]  eta: 0:01:18  lr: 0.001101  min_lr: 0.001101  loss: 2.4139 (3.1121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1951 (1.1143)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [193]  [2400/2502]  eta: 0:00:26  lr: 0.001100  min_lr: 0.001100  loss: 2.5691 (3.1114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1879 (1.1170)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [193]  [2501/2502]  eta: 0:00:00  lr: 0.001099  min_lr: 0.001099  loss: 2.8568 (3.1116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (1.1154)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [193] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.001099  min_lr: 0.001099  loss: 2.8568 (3.1238)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (1.1154)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.4961 (0.4961)  acc1: 91.2000 (91.2000)  acc5: 98.0000 (98.0000)  time: 2.7090  data: 2.5725  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.7391 (0.7747)  acc1: 84.8000 (84.1818)  acc5: 96.8000 (96.6545)  time: 0.4756  data: 0.3536  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.7397 (0.7975)  acc1: 82.8000 (83.1619)  acc5: 96.8000 (96.6476)  time: 0.2485  data: 0.1278  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 0.9323 (0.8993)  acc1: 78.4000 (81.3032)  acc5: 94.8000 (95.2774)  time: 0.2577  data: 0.1375  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1476 (0.9671)  acc1: 75.6000 (79.6000)  acc5: 92.8000 (94.7024)  time: 0.2769  data: 0.1551  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1494 (0.9844)  acc1: 74.8000 (79.0480)  acc5: 93.2000 (94.5840)  time: 0.2152  data: 0.0921  max mem: 18975
Test: Total time: 0:00:14 (0.2935 s / it)
* Acc@1 78.948 Acc@5 94.770 loss 0.989
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.95%
Epoch: [194]  [   0/2502]  eta: 1:28:58  lr: 0.001099  min_lr: 0.001099  loss: 3.8361 (3.8361)  weight_decay: 0.0500 (0.0500)  time: 2.1337  data: 1.8681  max mem: 18975
Epoch: [194]  [ 200/2502]  eta: 0:10:18  lr: 0.001098  min_lr: 0.001098  loss: 3.3725 (3.1309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0998 (1.1316)  time: 0.2582  data: 0.0003  max mem: 18975
Epoch: [194]  [ 400/2502]  eta: 0:09:12  lr: 0.001096  min_lr: 0.001096  loss: 3.5757 (3.0994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0813 (1.1117)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [194]  [ 600/2502]  eta: 0:08:16  lr: 0.001095  min_lr: 0.001095  loss: 3.4420 (3.1057)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0134 (1.1135)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [194]  [ 800/2502]  eta: 0:07:22  lr: 0.001093  min_lr: 0.001093  loss: 3.6291 (3.1419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0628 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [194]  [1000/2502]  eta: 0:06:29  lr: 0.001092  min_lr: 0.001092  loss: 2.5488 (3.1396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0366 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [194]  [1200/2502]  eta: 0:05:37  lr: 0.001090  min_lr: 0.001090  loss: 3.2360 (3.1416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0853 (inf)  time: 0.2574  data: 0.0005  max mem: 18975
Epoch: [194]  [1400/2502]  eta: 0:04:45  lr: 0.001089  min_lr: 0.001089  loss: 2.9006 (3.1373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0252 (inf)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [194]  [1600/2502]  eta: 0:03:53  lr: 0.001088  min_lr: 0.001088  loss: 3.2856 (3.1310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1282 (inf)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [194]  [1800/2502]  eta: 0:03:01  lr: 0.001086  min_lr: 0.001086  loss: 3.0647 (3.1347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0432 (inf)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [194]  [2000/2502]  eta: 0:02:09  lr: 0.001085  min_lr: 0.001085  loss: 2.8706 (3.1343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1227 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [194]  [2200/2502]  eta: 0:01:17  lr: 0.001083  min_lr: 0.001083  loss: 3.4412 (3.1351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0606 (inf)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [194]  [2400/2502]  eta: 0:00:26  lr: 0.001082  min_lr: 0.001082  loss: 3.0428 (3.1305)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2407 (inf)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [194]  [2501/2502]  eta: 0:00:00  lr: 0.001081  min_lr: 0.001081  loss: 3.3933 (3.1314)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (inf)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [194] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001081  min_lr: 0.001081  loss: 3.3933 (3.1239)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (inf)
Test:  [ 0/50]  eta: 0:03:15  loss: 0.5571 (0.5571)  acc1: 93.6000 (93.6000)  acc5: 98.4000 (98.4000)  time: 3.9171  data: 3.7740  max mem: 18975
Test:  [10/50]  eta: 0:00:23  loss: 0.8006 (0.8449)  acc1: 84.0000 (84.6546)  acc5: 97.2000 (97.2000)  time: 0.5894  data: 0.4673  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8428 (0.8630)  acc1: 81.6000 (83.9048)  acc5: 97.2000 (97.1810)  time: 0.2318  data: 0.1121  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0589 (0.9721)  acc1: 79.6000 (81.2516)  acc5: 95.2000 (95.8968)  time: 0.2101  data: 0.0904  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2229 (1.0395)  acc1: 74.0000 (79.5415)  acc5: 92.8000 (95.1610)  time: 0.2223  data: 0.1026  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1951 (1.0508)  acc1: 74.4000 (78.9440)  acc5: 93.2000 (94.9920)  time: 0.2242  data: 0.1050  max mem: 18975
Test: Total time: 0:00:14 (0.2934 s / it)
* Acc@1 79.018 Acc@5 94.888 loss 1.055
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.02%
Epoch: [195]  [   0/2502]  eta: 1:26:07  lr: 0.001081  min_lr: 0.001081  loss: 2.4514 (2.4514)  weight_decay: 0.0500 (0.0500)  time: 2.0655  data: 1.7983  max mem: 18975
Epoch: [195]  [ 200/2502]  eta: 0:10:16  lr: 0.001080  min_lr: 0.001080  loss: 3.2718 (3.0711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1579 (1.1228)  time: 0.2598  data: 0.0004  max mem: 18975
Epoch: [195]  [ 400/2502]  eta: 0:09:12  lr: 0.001078  min_lr: 0.001078  loss: 3.3767 (3.0801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0632 (1.1103)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [195]  [ 600/2502]  eta: 0:08:16  lr: 0.001077  min_lr: 0.001077  loss: 3.1235 (3.0889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0402 (1.0995)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [195]  [ 800/2502]  eta: 0:07:22  lr: 0.001075  min_lr: 0.001075  loss: 3.1719 (3.0965)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1661 (1.1283)  time: 0.2565  data: 0.0005  max mem: 18975
Epoch: [195]  [1000/2502]  eta: 0:06:29  lr: 0.001074  min_lr: 0.001074  loss: 3.0969 (3.1172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0472 (1.1260)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [195]  [1200/2502]  eta: 0:05:37  lr: 0.001072  min_lr: 0.001072  loss: 2.6598 (3.1069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1026 (1.1229)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [195]  [1400/2502]  eta: 0:04:45  lr: 0.001071  min_lr: 0.001071  loss: 3.3178 (3.1048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0165 (1.1175)  time: 0.2572  data: 0.0005  max mem: 18975
Epoch: [195]  [1600/2502]  eta: 0:03:53  lr: 0.001069  min_lr: 0.001069  loss: 3.2463 (3.0898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0957 (1.1204)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [195]  [1800/2502]  eta: 0:03:01  lr: 0.001068  min_lr: 0.001068  loss: 2.6813 (3.0875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0773 (1.1227)  time: 0.2580  data: 0.0005  max mem: 18975
Epoch: [195]  [2000/2502]  eta: 0:02:09  lr: 0.001067  min_lr: 0.001067  loss: 2.7827 (3.0928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0852 (1.1208)  time: 0.2558  data: 0.0004  max mem: 18975
Epoch: [195]  [2200/2502]  eta: 0:01:18  lr: 0.001065  min_lr: 0.001065  loss: 3.4716 (3.0999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0298 (1.1192)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [195]  [2400/2502]  eta: 0:00:26  lr: 0.001064  min_lr: 0.001064  loss: 3.3083 (3.1077)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0337 (1.1178)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [195]  [2501/2502]  eta: 0:00:00  lr: 0.001063  min_lr: 0.001063  loss: 2.9686 (3.1076)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.1188)  time: 0.2313  data: 0.0009  max mem: 18975
Epoch: [195] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.001063  min_lr: 0.001063  loss: 2.9686 (3.1256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.1188)
Test:  [ 0/50]  eta: 0:02:56  loss: 0.5253 (0.5253)  acc1: 93.2000 (93.2000)  acc5: 97.6000 (97.6000)  time: 3.5271  data: 3.3747  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.8160 (0.8315)  acc1: 83.2000 (84.4364)  acc5: 97.2000 (97.1636)  time: 0.4720  data: 0.3493  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8540 (0.8566)  acc1: 82.4000 (83.4857)  acc5: 97.2000 (96.9905)  time: 0.2082  data: 0.0880  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0826 (0.9699)  acc1: 79.6000 (81.1226)  acc5: 93.6000 (95.5355)  time: 0.2413  data: 0.1199  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2162 (1.0260)  acc1: 74.8000 (79.6781)  acc5: 92.0000 (95.0146)  time: 0.2509  data: 0.1293  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1765 (1.0449)  acc1: 74.8000 (79.1520)  acc5: 92.8000 (94.8320)  time: 0.2249  data: 0.1030  max mem: 18975
Test: Total time: 0:00:14 (0.2889 s / it)
* Acc@1 79.036 Acc@5 94.894 loss 1.045
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.04%
Epoch: [196]  [   0/2502]  eta: 1:18:37  lr: 0.001063  min_lr: 0.001063  loss: 2.2404 (2.2404)  weight_decay: 0.0500 (0.0500)  time: 1.8854  data: 1.6105  max mem: 18975
Epoch: [196]  [ 200/2502]  eta: 0:10:14  lr: 0.001061  min_lr: 0.001061  loss: 3.3300 (3.0760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9654 (1.0830)  time: 0.2588  data: 0.0004  max mem: 18975
Epoch: [196]  [ 400/2502]  eta: 0:09:11  lr: 0.001060  min_lr: 0.001060  loss: 3.4389 (3.1152)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1051 (1.0966)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [196]  [ 600/2502]  eta: 0:08:15  lr: 0.001059  min_lr: 0.001059  loss: 2.4022 (3.1097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0971 (1.1193)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [196]  [ 800/2502]  eta: 0:07:21  lr: 0.001057  min_lr: 0.001057  loss: 3.4537 (3.1157)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0805 (1.1227)  time: 0.2568  data: 0.0005  max mem: 18975
Epoch: [196]  [1000/2502]  eta: 0:06:29  lr: 0.001056  min_lr: 0.001056  loss: 2.9920 (3.1148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0766 (1.1229)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [196]  [1200/2502]  eta: 0:05:37  lr: 0.001054  min_lr: 0.001054  loss: 2.9233 (3.1270)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0777 (1.1164)  time: 0.2640  data: 0.0004  max mem: 18975
Epoch: [196]  [1400/2502]  eta: 0:04:45  lr: 0.001053  min_lr: 0.001053  loss: 3.0380 (3.1335)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3123 (1.1303)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [196]  [1600/2502]  eta: 0:03:53  lr: 0.001051  min_lr: 0.001051  loss: 3.1969 (3.1397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1170 (1.1309)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [196]  [1800/2502]  eta: 0:03:01  lr: 0.001050  min_lr: 0.001050  loss: 3.1773 (3.1389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0908 (1.1285)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [196]  [2000/2502]  eta: 0:02:09  lr: 0.001048  min_lr: 0.001048  loss: 2.9702 (3.1402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1301 (1.1333)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [196]  [2200/2502]  eta: 0:01:17  lr: 0.001047  min_lr: 0.001047  loss: 3.2584 (3.1391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0717 (1.1296)  time: 0.2611  data: 0.0004  max mem: 18975
Epoch: [196]  [2400/2502]  eta: 0:00:26  lr: 0.001046  min_lr: 0.001046  loss: 3.0377 (3.1365)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1131 (1.1314)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [196]  [2501/2502]  eta: 0:00:00  lr: 0.001045  min_lr: 0.001045  loss: 3.4877 (3.1397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (inf)  time: 0.2320  data: 0.0012  max mem: 18975
Epoch: [196] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.001045  min_lr: 0.001045  loss: 3.4877 (3.1311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (inf)
Test:  [ 0/50]  eta: 0:02:44  loss: 0.5621 (0.5621)  acc1: 92.4000 (92.4000)  acc5: 97.6000 (97.6000)  time: 3.2830  data: 3.1467  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.8370 (0.8404)  acc1: 84.4000 (84.2545)  acc5: 97.2000 (96.9455)  time: 0.5099  data: 0.3886  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8371 (0.8476)  acc1: 82.4000 (83.3524)  acc5: 96.8000 (96.8000)  time: 0.2356  data: 0.1161  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0464 (0.9534)  acc1: 78.4000 (80.9419)  acc5: 94.4000 (95.5742)  time: 0.2603  data: 0.1391  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1820 (1.0168)  acc1: 75.6000 (79.4439)  acc5: 93.2000 (94.9268)  time: 0.2313  data: 0.1080  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1735 (1.0325)  acc1: 74.8000 (78.8320)  acc5: 93.2000 (94.7920)  time: 0.2144  data: 0.0911  max mem: 18975
Test: Total time: 0:00:13 (0.2768 s / it)
* Acc@1 78.924 Acc@5 94.886 loss 1.025
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.04%
Epoch: [197]  [   0/2502]  eta: 1:20:26  lr: 0.001045  min_lr: 0.001045  loss: 3.4313 (3.4313)  weight_decay: 0.0500 (0.0500)  time: 1.9290  data: 1.6790  max mem: 18975
Epoch: [197]  [ 200/2502]  eta: 0:10:20  lr: 0.001043  min_lr: 0.001043  loss: 3.5233 (3.1108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1032 (1.0694)  time: 0.2582  data: 0.0004  max mem: 18975
Epoch: [197]  [ 400/2502]  eta: 0:09:13  lr: 0.001042  min_lr: 0.001042  loss: 2.9860 (3.1362)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0179 (1.0681)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [197]  [ 600/2502]  eta: 0:08:16  lr: 0.001041  min_lr: 0.001041  loss: 3.1873 (3.1232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0635 (1.0873)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [197]  [ 800/2502]  eta: 0:07:22  lr: 0.001039  min_lr: 0.001039  loss: 2.9065 (3.1212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0689 (1.0921)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [197]  [1000/2502]  eta: 0:06:29  lr: 0.001038  min_lr: 0.001038  loss: 2.9380 (3.1072)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0771 (1.1020)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [197]  [1200/2502]  eta: 0:05:37  lr: 0.001036  min_lr: 0.001036  loss: 2.9096 (3.1024)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1375 (1.1099)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [197]  [1400/2502]  eta: 0:04:45  lr: 0.001035  min_lr: 0.001035  loss: 3.1563 (3.1056)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0385 (1.1163)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [197]  [1600/2502]  eta: 0:03:53  lr: 0.001033  min_lr: 0.001033  loss: 3.1718 (3.1106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1612 (1.1236)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [197]  [1800/2502]  eta: 0:03:01  lr: 0.001032  min_lr: 0.001032  loss: 3.1754 (3.1132)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0072 (1.1289)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [197]  [2000/2502]  eta: 0:02:09  lr: 0.001031  min_lr: 0.001031  loss: 2.9798 (3.1168)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0145 (1.1250)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [197]  [2200/2502]  eta: 0:01:18  lr: 0.001029  min_lr: 0.001029  loss: 3.3313 (3.1145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1574 (1.1283)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [197]  [2400/2502]  eta: 0:00:26  lr: 0.001028  min_lr: 0.001028  loss: 3.4368 (3.1123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1221 (1.1274)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [197]  [2501/2502]  eta: 0:00:00  lr: 0.001027  min_lr: 0.001027  loss: 3.2292 (3.1146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0368 (1.1256)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [197] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.001027  min_lr: 0.001027  loss: 3.2292 (3.1180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0368 (1.1256)
Test:  [ 0/50]  eta: 0:02:09  loss: 0.5453 (0.5453)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 2.5915  data: 2.4451  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.8592 (0.8715)  acc1: 86.4000 (84.2545)  acc5: 97.6000 (97.0909)  time: 0.4640  data: 0.3424  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9061 (0.8944)  acc1: 82.0000 (82.9905)  acc5: 97.6000 (97.0667)  time: 0.2541  data: 0.1349  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0753 (0.9930)  acc1: 78.0000 (80.7871)  acc5: 94.4000 (95.7548)  time: 0.2807  data: 0.1611  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2359 (1.0532)  acc1: 75.6000 (79.2098)  acc5: 92.4000 (95.0732)  time: 0.2992  data: 0.1797  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1854 (1.0607)  acc1: 75.2000 (78.9120)  acc5: 93.6000 (94.9120)  time: 0.2362  data: 0.1172  max mem: 18975
Test: Total time: 0:00:15 (0.3085 s / it)
* Acc@1 78.936 Acc@5 94.926 loss 1.060
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.04%
Epoch: [198]  [   0/2502]  eta: 1:39:41  lr: 0.001027  min_lr: 0.001027  loss: 3.6644 (3.6644)  weight_decay: 0.0500 (0.0500)  time: 2.3909  data: 2.1032  max mem: 18975
Epoch: [198]  [ 200/2502]  eta: 0:10:20  lr: 0.001026  min_lr: 0.001026  loss: 3.0638 (3.0066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1111 (1.1227)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [198]  [ 400/2502]  eta: 0:09:13  lr: 0.001024  min_lr: 0.001024  loss: 2.8504 (3.0144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0373 (1.1265)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [198]  [ 600/2502]  eta: 0:08:17  lr: 0.001023  min_lr: 0.001023  loss: 2.7245 (3.0477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1034 (1.1339)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [198]  [ 800/2502]  eta: 0:07:22  lr: 0.001021  min_lr: 0.001021  loss: 2.9026 (3.0516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1407 (1.1344)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [198]  [1000/2502]  eta: 0:06:29  lr: 0.001020  min_lr: 0.001020  loss: 3.1435 (3.0639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0948 (1.1271)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [198]  [1200/2502]  eta: 0:05:37  lr: 0.001018  min_lr: 0.001018  loss: 3.3504 (3.0707)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1365 (1.1270)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [198]  [1400/2502]  eta: 0:04:45  lr: 0.001017  min_lr: 0.001017  loss: 3.1011 (3.0752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1122 (1.1279)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [198]  [1600/2502]  eta: 0:03:53  lr: 0.001016  min_lr: 0.001016  loss: 3.3174 (3.0889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1352 (1.1314)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [198]  [1800/2502]  eta: 0:03:01  lr: 0.001014  min_lr: 0.001014  loss: 2.6996 (3.0933)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (1.1394)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [198]  [2000/2502]  eta: 0:02:09  lr: 0.001013  min_lr: 0.001013  loss: 3.2433 (3.0996)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0845 (1.1398)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [198]  [2200/2502]  eta: 0:01:17  lr: 0.001011  min_lr: 0.001011  loss: 3.3129 (3.0984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1691 (1.1375)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [198]  [2400/2502]  eta: 0:00:26  lr: 0.001010  min_lr: 0.001010  loss: 3.2121 (3.1036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1564 (1.1389)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [198]  [2501/2502]  eta: 0:00:00  lr: 0.001009  min_lr: 0.001009  loss: 2.8464 (3.1010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1014 (1.1386)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [198] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.001009  min_lr: 0.001009  loss: 2.8464 (3.1004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1014 (1.1386)
Test:  [ 0/50]  eta: 0:02:14  loss: 0.4520 (0.4520)  acc1: 94.8000 (94.8000)  acc5: 98.8000 (98.8000)  time: 2.6932  data: 2.5497  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.7759 (0.7874)  acc1: 85.2000 (84.9455)  acc5: 97.2000 (97.2000)  time: 0.4644  data: 0.3429  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8241 (0.8229)  acc1: 81.2000 (83.4095)  acc5: 97.2000 (97.1238)  time: 0.2496  data: 0.1302  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 0.9913 (0.9167)  acc1: 78.0000 (81.0452)  acc5: 95.2000 (95.8839)  time: 0.2476  data: 0.1283  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.1389 (0.9717)  acc1: 74.4000 (79.6683)  acc5: 92.8000 (95.2195)  time: 0.2214  data: 0.1023  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1389 (0.9929)  acc1: 73.2000 (78.8480)  acc5: 93.2000 (95.0480)  time: 0.1995  data: 0.0806  max mem: 18975
Test: Total time: 0:00:14 (0.2817 s / it)
* Acc@1 79.186 Acc@5 94.920 loss 0.988
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.19%
Epoch: [199]  [   0/2502]  eta: 1:19:23  lr: 0.001009  min_lr: 0.001009  loss: 2.6059 (2.6059)  weight_decay: 0.0500 (0.0500)  time: 1.9038  data: 1.6304  max mem: 18975
Epoch: [199]  [ 200/2502]  eta: 0:10:19  lr: 0.001008  min_lr: 0.001008  loss: 3.0327 (3.0989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0434 (1.1233)  time: 0.2692  data: 0.0004  max mem: 18975
Epoch: [199]  [ 400/2502]  eta: 0:09:12  lr: 0.001006  min_lr: 0.001006  loss: 3.1676 (3.0899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0540 (1.1075)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [199]  [ 600/2502]  eta: 0:08:16  lr: 0.001005  min_lr: 0.001005  loss: 2.9267 (3.1006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9773 (1.1064)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [199]  [ 800/2502]  eta: 0:07:22  lr: 0.001003  min_lr: 0.001003  loss: 3.2786 (3.1028)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0294 (1.0995)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [199]  [1000/2502]  eta: 0:06:29  lr: 0.001002  min_lr: 0.001002  loss: 3.4322 (3.1014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1768 (1.1121)  time: 0.2673  data: 0.0004  max mem: 18975
Epoch: [199]  [1200/2502]  eta: 0:05:37  lr: 0.001001  min_lr: 0.001001  loss: 3.6277 (3.1016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1867 (1.1192)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [199]  [1400/2502]  eta: 0:04:45  lr: 0.000999  min_lr: 0.000999  loss: 2.5394 (3.0971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (1.1234)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [199]  [1600/2502]  eta: 0:03:53  lr: 0.000998  min_lr: 0.000998  loss: 3.4900 (3.0988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1184 (1.1242)  time: 0.2590  data: 0.0004  max mem: 18975
Epoch: [199]  [1800/2502]  eta: 0:03:01  lr: 0.000996  min_lr: 0.000996  loss: 2.7726 (3.0975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0505 (1.1289)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [199]  [2000/2502]  eta: 0:02:09  lr: 0.000995  min_lr: 0.000995  loss: 3.2598 (3.0953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0540 (1.1293)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [199]  [2200/2502]  eta: 0:01:18  lr: 0.000994  min_lr: 0.000994  loss: 3.0517 (3.0936)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0943 (1.1324)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [199]  [2400/2502]  eta: 0:00:26  lr: 0.000992  min_lr: 0.000992  loss: 3.0420 (3.0982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2975 (1.1395)  time: 0.2564  data: 0.0003  max mem: 18975
Epoch: [199]  [2501/2502]  eta: 0:00:00  lr: 0.000991  min_lr: 0.000991  loss: 3.1835 (3.1004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0806 (1.1398)  time: 0.2305  data: 0.0009  max mem: 18975
Epoch: [199] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000991  min_lr: 0.000991  loss: 3.1835 (3.1210)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0806 (1.1398)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.6092 (0.6092)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 2.8101  data: 2.6666  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.8346 (0.8582)  acc1: 85.2000 (83.4546)  acc5: 97.2000 (96.9818)  time: 0.5286  data: 0.4068  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8416 (0.8698)  acc1: 82.4000 (83.1810)  acc5: 97.2000 (96.9714)  time: 0.2773  data: 0.1578  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0849 (0.9859)  acc1: 79.2000 (81.0581)  acc5: 94.8000 (95.5742)  time: 0.2603  data: 0.1412  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2676 (1.0567)  acc1: 74.8000 (79.4829)  acc5: 92.4000 (94.8390)  time: 0.2601  data: 0.1411  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2544 (1.0675)  acc1: 74.8000 (78.9680)  acc5: 93.2000 (94.6880)  time: 0.2054  data: 0.0865  max mem: 18975
Test: Total time: 0:00:15 (0.3023 s / it)
* Acc@1 78.918 Acc@5 94.850 loss 1.066
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.19%
Epoch: [200]  [   0/2502]  eta: 1:31:00  lr: 0.000991  min_lr: 0.000991  loss: 2.6640 (2.6640)  weight_decay: 0.0500 (0.0500)  time: 2.1826  data: 1.9195  max mem: 18975
Epoch: [200]  [ 200/2502]  eta: 0:10:17  lr: 0.000990  min_lr: 0.000990  loss: 3.1410 (3.1022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1126 (1.1112)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [200]  [ 400/2502]  eta: 0:09:12  lr: 0.000989  min_lr: 0.000989  loss: 3.2793 (3.1136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1532 (1.1525)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [200]  [ 600/2502]  eta: 0:08:17  lr: 0.000987  min_lr: 0.000987  loss: 2.6336 (3.1072)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0280 (1.1467)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [200]  [ 800/2502]  eta: 0:07:23  lr: 0.000986  min_lr: 0.000986  loss: 3.2591 (3.1120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1559 (1.1458)  time: 0.2586  data: 0.0004  max mem: 18975
Epoch: [200]  [1000/2502]  eta: 0:06:30  lr: 0.000984  min_lr: 0.000984  loss: 3.1699 (3.1122)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1440)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [200]  [1200/2502]  eta: 0:05:37  lr: 0.000983  min_lr: 0.000983  loss: 2.9074 (3.1096)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1427 (1.1425)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [200]  [1400/2502]  eta: 0:04:45  lr: 0.000982  min_lr: 0.000982  loss: 3.0476 (3.1025)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0965 (1.1380)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [200]  [1600/2502]  eta: 0:03:53  lr: 0.000980  min_lr: 0.000980  loss: 3.4208 (3.1072)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0881 (1.1353)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [200]  [1800/2502]  eta: 0:03:01  lr: 0.000979  min_lr: 0.000979  loss: 3.3220 (3.1052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0713 (1.1318)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [200]  [2000/2502]  eta: 0:02:09  lr: 0.000977  min_lr: 0.000977  loss: 3.2158 (3.1101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0632 (1.1331)  time: 0.2586  data: 0.0004  max mem: 18975
Epoch: [200]  [2200/2502]  eta: 0:01:18  lr: 0.000976  min_lr: 0.000976  loss: 3.3001 (3.1090)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1524 (1.1361)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [200]  [2400/2502]  eta: 0:00:26  lr: 0.000974  min_lr: 0.000974  loss: 3.2189 (3.1103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0079 (1.1330)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [200]  [2501/2502]  eta: 0:00:00  lr: 0.000974  min_lr: 0.000974  loss: 3.1712 (3.1110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0530 (1.1308)  time: 0.2304  data: 0.0007  max mem: 18975
Epoch: [200] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000974  min_lr: 0.000974  loss: 3.1712 (3.1009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0530 (1.1308)
Test:  [ 0/50]  eta: 0:03:10  loss: 0.4715 (0.4715)  acc1: 94.0000 (94.0000)  acc5: 98.8000 (98.8000)  time: 3.8154  data: 3.6769  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.7894 (0.7992)  acc1: 87.6000 (85.0182)  acc5: 98.0000 (97.4182)  time: 0.5215  data: 0.4003  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8333 (0.8516)  acc1: 82.8000 (83.7524)  acc5: 97.2000 (97.1619)  time: 0.2019  data: 0.0824  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0957 (0.9568)  acc1: 78.4000 (81.3806)  acc5: 94.0000 (96.0000)  time: 0.2568  data: 0.1357  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2070 (1.0255)  acc1: 74.8000 (79.7463)  acc5: 92.8000 (95.2585)  time: 0.2750  data: 0.1543  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2016 (1.0379)  acc1: 74.8000 (79.2400)  acc5: 93.6000 (95.2000)  time: 0.2019  data: 0.0831  max mem: 18975
Test: Total time: 0:00:14 (0.2981 s / it)
* Acc@1 79.288 Acc@5 94.980 loss 1.039
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.29%
Epoch: [201]  [   0/2502]  eta: 1:30:39  lr: 0.000974  min_lr: 0.000974  loss: 2.5831 (2.5831)  weight_decay: 0.0500 (0.0500)  time: 2.1739  data: 1.8941  max mem: 18975
Epoch: [201]  [ 200/2502]  eta: 0:10:16  lr: 0.000972  min_lr: 0.000972  loss: 3.0080 (3.0698)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0633 (1.1130)  time: 0.2580  data: 0.0004  max mem: 18975
Epoch: [201]  [ 400/2502]  eta: 0:09:11  lr: 0.000971  min_lr: 0.000971  loss: 2.8089 (3.1036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1966 (1.1812)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [201]  [ 600/2502]  eta: 0:08:15  lr: 0.000970  min_lr: 0.000970  loss: 2.5344 (3.0822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0109 (1.1725)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [201]  [ 800/2502]  eta: 0:07:21  lr: 0.000968  min_lr: 0.000968  loss: 3.1468 (3.0872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0786 (1.1616)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [201]  [1000/2502]  eta: 0:06:29  lr: 0.000967  min_lr: 0.000967  loss: 2.5403 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1753 (1.1713)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [201]  [1200/2502]  eta: 0:05:37  lr: 0.000965  min_lr: 0.000965  loss: 3.2222 (3.0838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.1691)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [201]  [1400/2502]  eta: 0:04:45  lr: 0.000964  min_lr: 0.000964  loss: 2.7710 (3.0774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1807 (1.1670)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [201]  [1600/2502]  eta: 0:03:53  lr: 0.000963  min_lr: 0.000963  loss: 3.3737 (3.0821)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1378 (1.1685)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [201]  [1800/2502]  eta: 0:03:01  lr: 0.000961  min_lr: 0.000961  loss: 2.8716 (3.0960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0911 (1.1647)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [201]  [2000/2502]  eta: 0:02:09  lr: 0.000960  min_lr: 0.000960  loss: 2.7164 (3.0993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1101 (1.1618)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [201]  [2200/2502]  eta: 0:01:17  lr: 0.000958  min_lr: 0.000958  loss: 3.1943 (3.1055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2019 (1.1653)  time: 0.2557  data: 0.0004  max mem: 18975
Epoch: [201]  [2400/2502]  eta: 0:00:26  lr: 0.000957  min_lr: 0.000957  loss: 3.2814 (3.1075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (1.1700)  time: 0.2609  data: 0.0004  max mem: 18975
Epoch: [201]  [2501/2502]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 3.1744 (3.1048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (1.1683)  time: 0.2304  data: 0.0009  max mem: 18975
Epoch: [201] Total time: 0:10:45 (0.2579 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 3.1744 (3.0928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (1.1683)
Test:  [ 0/50]  eta: 0:02:44  loss: 0.4863 (0.4863)  acc1: 92.8000 (92.8000)  acc5: 98.4000 (98.4000)  time: 3.2930  data: 3.1502  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.7447 (0.7851)  acc1: 85.2000 (84.6182)  acc5: 96.8000 (97.0182)  time: 0.5407  data: 0.4184  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8162 (0.8087)  acc1: 81.6000 (83.3905)  acc5: 96.8000 (97.0286)  time: 0.2565  data: 0.1357  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0169 (0.9175)  acc1: 78.0000 (80.9419)  acc5: 95.2000 (95.7161)  time: 0.2432  data: 0.1230  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1638 (0.9794)  acc1: 74.8000 (79.4829)  acc5: 92.0000 (94.9171)  time: 0.2412  data: 0.1221  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1507 (0.9936)  acc1: 74.8000 (78.9680)  acc5: 92.4000 (94.8400)  time: 0.2077  data: 0.0887  max mem: 18975
Test: Total time: 0:00:14 (0.2977 s / it)
* Acc@1 79.050 Acc@5 94.970 loss 0.989
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.29%
Epoch: [202]  [   0/2502]  eta: 1:38:01  lr: 0.000956  min_lr: 0.000956  loss: 1.9731 (1.9731)  weight_decay: 0.0500 (0.0500)  time: 2.3508  data: 1.5490  max mem: 18975
Epoch: [202]  [ 200/2502]  eta: 0:10:22  lr: 0.000955  min_lr: 0.000955  loss: 3.1405 (3.0438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0794 (1.1258)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [202]  [ 400/2502]  eta: 0:09:14  lr: 0.000953  min_lr: 0.000953  loss: 3.0611 (3.0604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1522 (1.1483)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [202]  [ 600/2502]  eta: 0:08:18  lr: 0.000952  min_lr: 0.000952  loss: 2.5697 (3.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1700 (1.1438)  time: 0.2626  data: 0.0004  max mem: 18975
Epoch: [202]  [ 800/2502]  eta: 0:07:23  lr: 0.000951  min_lr: 0.000951  loss: 3.5545 (3.0834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1052 (1.1497)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [202]  [1000/2502]  eta: 0:06:30  lr: 0.000949  min_lr: 0.000949  loss: 3.0753 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1128 (1.1444)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [202]  [1200/2502]  eta: 0:05:37  lr: 0.000948  min_lr: 0.000948  loss: 2.9685 (3.0883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0652 (1.1432)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [202]  [1400/2502]  eta: 0:04:45  lr: 0.000946  min_lr: 0.000946  loss: 3.5702 (3.0984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2077 (1.1462)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [202]  [1600/2502]  eta: 0:03:53  lr: 0.000945  min_lr: 0.000945  loss: 3.2272 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0471 (1.1502)  time: 0.2577  data: 0.0003  max mem: 18975
Epoch: [202]  [1800/2502]  eta: 0:03:01  lr: 0.000944  min_lr: 0.000944  loss: 3.4270 (3.0990)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2146 (1.1627)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [202]  [2000/2502]  eta: 0:02:09  lr: 0.000942  min_lr: 0.000942  loss: 3.5346 (3.1058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1518 (1.1686)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [202]  [2200/2502]  eta: 0:01:18  lr: 0.000941  min_lr: 0.000941  loss: 2.5780 (3.1000)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1912 (1.1693)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [202]  [2400/2502]  eta: 0:00:26  lr: 0.000939  min_lr: 0.000939  loss: 2.6920 (3.1026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1401 (1.1685)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [202]  [2501/2502]  eta: 0:00:00  lr: 0.000939  min_lr: 0.000939  loss: 3.5493 (3.1054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0892 (1.1690)  time: 0.2315  data: 0.0010  max mem: 18975
Epoch: [202] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.000939  min_lr: 0.000939  loss: 3.5493 (3.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0892 (1.1690)
Test:  [ 0/50]  eta: 0:02:21  loss: 0.6241 (0.6241)  acc1: 93.2000 (93.2000)  acc5: 98.4000 (98.4000)  time: 2.8276  data: 2.6874  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.8855 (0.9128)  acc1: 85.6000 (84.3636)  acc5: 97.2000 (97.2364)  time: 0.4369  data: 0.3158  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.9503 (0.9520)  acc1: 83.2000 (83.7524)  acc5: 97.2000 (97.1429)  time: 0.2350  data: 0.1158  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.1603 (1.0551)  acc1: 78.4000 (81.2516)  acc5: 95.2000 (95.8194)  time: 0.2851  data: 0.1658  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.3393 (1.1271)  acc1: 74.0000 (79.6293)  acc5: 92.4000 (95.0537)  time: 0.2822  data: 0.1605  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.3262 (1.1426)  acc1: 74.0000 (79.0640)  acc5: 93.2000 (94.9600)  time: 0.2212  data: 0.0993  max mem: 18975
Test: Total time: 0:00:14 (0.2982 s / it)
* Acc@1 79.028 Acc@5 94.914 loss 1.138
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.29%
Epoch: [203]  [   0/2502]  eta: 1:41:45  lr: 0.000939  min_lr: 0.000939  loss: 3.3450 (3.3450)  weight_decay: 0.0500 (0.0500)  time: 2.4401  data: 1.9389  max mem: 18975
Epoch: [203]  [ 200/2502]  eta: 0:10:22  lr: 0.000937  min_lr: 0.000937  loss: 2.9719 (3.0997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1291 (1.1666)  time: 0.2588  data: 0.0004  max mem: 18975
Epoch: [203]  [ 400/2502]  eta: 0:09:14  lr: 0.000936  min_lr: 0.000936  loss: 2.4123 (3.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1409 (1.1873)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [203]  [ 600/2502]  eta: 0:08:17  lr: 0.000935  min_lr: 0.000935  loss: 3.0065 (3.0739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1552 (1.1836)  time: 0.2559  data: 0.0004  max mem: 18975
Epoch: [203]  [ 800/2502]  eta: 0:07:23  lr: 0.000933  min_lr: 0.000933  loss: 3.2228 (3.0684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1148 (1.1721)  time: 0.2559  data: 0.0003  max mem: 18975
Epoch: [203]  [1000/2502]  eta: 0:06:30  lr: 0.000932  min_lr: 0.000932  loss: 2.9356 (3.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0933 (1.1667)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [203]  [1200/2502]  eta: 0:05:37  lr: 0.000930  min_lr: 0.000930  loss: 3.2044 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2632 (inf)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [203]  [1400/2502]  eta: 0:04:45  lr: 0.000929  min_lr: 0.000929  loss: 2.9908 (3.0755)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1986 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [203]  [1600/2502]  eta: 0:03:53  lr: 0.000928  min_lr: 0.000928  loss: 3.1604 (3.0798)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0316 (inf)  time: 0.2643  data: 0.0004  max mem: 18975
Epoch: [203]  [1800/2502]  eta: 0:03:01  lr: 0.000926  min_lr: 0.000926  loss: 3.4533 (3.0755)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1839 (inf)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [203]  [2000/2502]  eta: 0:02:09  lr: 0.000925  min_lr: 0.000925  loss: 2.6304 (3.0750)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1351 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [203]  [2200/2502]  eta: 0:01:18  lr: 0.000924  min_lr: 0.000924  loss: 3.2658 (3.0790)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1439 (inf)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [203]  [2400/2502]  eta: 0:00:26  lr: 0.000922  min_lr: 0.000922  loss: 3.2382 (3.0772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1859 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [203]  [2501/2502]  eta: 0:00:00  lr: 0.000921  min_lr: 0.000921  loss: 3.2217 (3.0780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1207 (inf)  time: 0.2304  data: 0.0009  max mem: 18975
Epoch: [203] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.000921  min_lr: 0.000921  loss: 3.2217 (3.0923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1207 (inf)
Test:  [ 0/50]  eta: 0:02:27  loss: 0.4948 (0.4948)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 2.9593  data: 2.8190  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8424 (0.8491)  acc1: 85.2000 (84.5818)  acc5: 97.2000 (96.9091)  time: 0.4794  data: 0.3569  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8610 (0.8773)  acc1: 81.2000 (83.3905)  acc5: 97.2000 (96.8381)  time: 0.2337  data: 0.1135  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0247 (0.9756)  acc1: 78.4000 (81.2000)  acc5: 94.8000 (95.6903)  time: 0.2453  data: 0.1233  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2372 (1.0429)  acc1: 74.4000 (79.4146)  acc5: 92.4000 (94.9951)  time: 0.2608  data: 0.1391  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2125 (1.0507)  acc1: 74.4000 (78.8880)  acc5: 93.6000 (94.9600)  time: 0.2130  data: 0.0936  max mem: 18975
Test: Total time: 0:00:14 (0.2882 s / it)
* Acc@1 79.100 Acc@5 94.934 loss 1.046
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.29%
Epoch: [204]  [   0/2502]  eta: 1:29:14  lr: 0.000921  min_lr: 0.000921  loss: 3.3845 (3.3845)  weight_decay: 0.0500 (0.0500)  time: 2.1399  data: 1.3511  max mem: 18975
Epoch: [204]  [ 200/2502]  eta: 0:10:21  lr: 0.000920  min_lr: 0.000920  loss: 2.7796 (3.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0977 (1.2132)  time: 0.2617  data: 0.0004  max mem: 18975
Epoch: [204]  [ 400/2502]  eta: 0:09:13  lr: 0.000919  min_lr: 0.000919  loss: 2.7657 (3.0501)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1788 (1.1945)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [204]  [ 600/2502]  eta: 0:08:16  lr: 0.000917  min_lr: 0.000917  loss: 3.2018 (3.0699)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1199 (1.1877)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [204]  [ 800/2502]  eta: 0:07:23  lr: 0.000916  min_lr: 0.000916  loss: 3.1714 (3.0573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2012 (1.1914)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [204]  [1000/2502]  eta: 0:06:30  lr: 0.000915  min_lr: 0.000915  loss: 3.3386 (3.0664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1071 (1.1896)  time: 0.2579  data: 0.0005  max mem: 18975
Epoch: [204]  [1200/2502]  eta: 0:05:37  lr: 0.000913  min_lr: 0.000913  loss: 2.8372 (3.0737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0480 (nan)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [204]  [1400/2502]  eta: 0:04:45  lr: 0.000912  min_lr: 0.000912  loss: 3.3190 (3.0752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0903 (nan)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [204]  [1600/2502]  eta: 0:03:53  lr: 0.000910  min_lr: 0.000910  loss: 3.1011 (3.0704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1155 (nan)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [204]  [1800/2502]  eta: 0:03:01  lr: 0.000909  min_lr: 0.000909  loss: 3.4068 (3.0707)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1405 (nan)  time: 0.2577  data: 0.0005  max mem: 18975
Epoch: [204]  [2000/2502]  eta: 0:02:09  lr: 0.000908  min_lr: 0.000908  loss: 3.3708 (3.0788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1651 (nan)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [204]  [2200/2502]  eta: 0:01:18  lr: 0.000906  min_lr: 0.000906  loss: 3.2431 (3.0785)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1409 (nan)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [204]  [2400/2502]  eta: 0:00:26  lr: 0.000905  min_lr: 0.000905  loss: 2.9041 (3.0828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1361 (nan)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [204]  [2501/2502]  eta: 0:00:00  lr: 0.000904  min_lr: 0.000904  loss: 2.6682 (3.0831)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0729 (nan)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [204] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000904  min_lr: 0.000904  loss: 2.6682 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0729 (nan)
Test:  [ 0/50]  eta: 0:03:20  loss: 0.4410 (0.4410)  acc1: 92.0000 (92.0000)  acc5: 98.4000 (98.4000)  time: 4.0154  data: 3.8770  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.7373 (0.7651)  acc1: 85.2000 (84.2182)  acc5: 97.2000 (96.9091)  time: 0.5263  data: 0.4044  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.7642 (0.7921)  acc1: 82.8000 (83.5429)  acc5: 96.8000 (96.9524)  time: 0.1955  data: 0.0758  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0004 (0.8968)  acc1: 78.4000 (81.4452)  acc5: 94.4000 (95.6516)  time: 0.2437  data: 0.1245  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1537 (0.9635)  acc1: 76.8000 (79.8049)  acc5: 92.4000 (94.9756)  time: 0.2699  data: 0.1509  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1460 (0.9809)  acc1: 75.2000 (79.1040)  acc5: 92.8000 (94.9040)  time: 0.2231  data: 0.1042  max mem: 18975
Test: Total time: 0:00:15 (0.3020 s / it)
* Acc@1 79.196 Acc@5 95.016 loss 0.977
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.29%
Epoch: [205]  [   0/2502]  eta: 1:36:33  lr: 0.000904  min_lr: 0.000904  loss: 3.2884 (3.2884)  weight_decay: 0.0500 (0.0500)  time: 2.3157  data: 2.0395  max mem: 18975
Epoch: [205]  [ 200/2502]  eta: 0:10:20  lr: 0.000903  min_lr: 0.000903  loss: 3.1667 (3.1418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1362 (1.1547)  time: 0.2594  data: 0.0005  max mem: 18975
Epoch: [205]  [ 400/2502]  eta: 0:09:14  lr: 0.000901  min_lr: 0.000901  loss: 3.0162 (3.0970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1930 (1.1608)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [205]  [ 600/2502]  eta: 0:08:17  lr: 0.000900  min_lr: 0.000900  loss: 3.1437 (3.0955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1026 (1.1725)  time: 0.2636  data: 0.0003  max mem: 18975
Epoch: [205]  [ 800/2502]  eta: 0:07:23  lr: 0.000899  min_lr: 0.000899  loss: 3.1509 (3.0883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1139 (1.1717)  time: 0.2575  data: 0.0004  max mem: 18975
Epoch: [205]  [1000/2502]  eta: 0:06:29  lr: 0.000897  min_lr: 0.000897  loss: 2.9803 (3.0827)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2234 (1.1764)  time: 0.2570  data: 0.0005  max mem: 18975
Epoch: [205]  [1200/2502]  eta: 0:05:37  lr: 0.000896  min_lr: 0.000896  loss: 3.1686 (3.0783)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2026 (1.1763)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [205]  [1400/2502]  eta: 0:04:45  lr: 0.000895  min_lr: 0.000895  loss: 2.9235 (3.0729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1293 (1.1754)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [205]  [1600/2502]  eta: 0:03:53  lr: 0.000893  min_lr: 0.000893  loss: 3.2577 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1251 (1.1718)  time: 0.2581  data: 0.0004  max mem: 18975
Epoch: [205]  [1800/2502]  eta: 0:03:01  lr: 0.000892  min_lr: 0.000892  loss: 2.9398 (3.0739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.1731)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [205]  [2000/2502]  eta: 0:02:09  lr: 0.000891  min_lr: 0.000891  loss: 3.0217 (3.0821)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1308 (1.1758)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [205]  [2200/2502]  eta: 0:01:18  lr: 0.000889  min_lr: 0.000889  loss: 2.9347 (3.0784)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1038 (1.1766)  time: 0.2571  data: 0.0005  max mem: 18975
Epoch: [205]  [2400/2502]  eta: 0:00:26  lr: 0.000888  min_lr: 0.000888  loss: 3.3026 (3.0851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1954 (1.1783)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [205]  [2501/2502]  eta: 0:00:00  lr: 0.000887  min_lr: 0.000887  loss: 2.9227 (3.0888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0974 (1.1771)  time: 0.2304  data: 0.0008  max mem: 18975
Epoch: [205] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000887  min_lr: 0.000887  loss: 2.9227 (3.0804)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0974 (1.1771)
Test:  [ 0/50]  eta: 0:02:12  loss: 0.5455 (0.5455)  acc1: 94.0000 (94.0000)  acc5: 99.6000 (99.6000)  time: 2.6496  data: 2.5111  max mem: 18975
Test:  [10/50]  eta: 0:00:19  loss: 0.8308 (0.8306)  acc1: 82.4000 (84.6546)  acc5: 97.2000 (97.1636)  time: 0.4885  data: 0.3676  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8487 (0.8526)  acc1: 82.4000 (84.0191)  acc5: 97.2000 (97.2000)  time: 0.2449  data: 0.1257  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0259 (0.9568)  acc1: 78.8000 (81.5355)  acc5: 94.8000 (95.8839)  time: 0.2537  data: 0.1333  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2165 (1.0236)  acc1: 75.2000 (79.9415)  acc5: 92.8000 (95.2195)  time: 0.2554  data: 0.1343  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2132 (1.0415)  acc1: 74.4000 (79.3520)  acc5: 93.2000 (95.0400)  time: 0.1908  data: 0.0707  max mem: 18975
Test: Total time: 0:00:14 (0.2846 s / it)
* Acc@1 79.418 Acc@5 95.056 loss 1.039
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.42%
Epoch: [206]  [   0/2502]  eta: 1:37:54  lr: 0.000887  min_lr: 0.000887  loss: 2.2339 (2.2339)  weight_decay: 0.0500 (0.0500)  time: 2.3478  data: 2.0789  max mem: 18975
Epoch: [206]  [ 200/2502]  eta: 0:10:21  lr: 0.000886  min_lr: 0.000886  loss: 3.2980 (3.1352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1078 (1.2152)  time: 0.2606  data: 0.0011  max mem: 18975
Epoch: [206]  [ 400/2502]  eta: 0:09:14  lr: 0.000884  min_lr: 0.000884  loss: 2.9086 (3.1218)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1617 (1.2062)  time: 0.2577  data: 0.0003  max mem: 18975
Epoch: [206]  [ 600/2502]  eta: 0:08:17  lr: 0.000883  min_lr: 0.000883  loss: 2.8929 (3.0959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0983 (1.1952)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [206]  [ 800/2502]  eta: 0:07:22  lr: 0.000882  min_lr: 0.000882  loss: 2.9299 (3.0893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2709 (1.2103)  time: 0.2561  data: 0.0004  max mem: 18975
Epoch: [206]  [1000/2502]  eta: 0:06:30  lr: 0.000880  min_lr: 0.000880  loss: 2.9545 (3.0858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1924 (1.2118)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [206]  [1200/2502]  eta: 0:05:37  lr: 0.000879  min_lr: 0.000879  loss: 3.4538 (3.0739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1171 (1.2131)  time: 0.2572  data: 0.0005  max mem: 18975
Epoch: [206]  [1400/2502]  eta: 0:04:45  lr: 0.000878  min_lr: 0.000878  loss: 2.8518 (3.0627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.2035)  time: 0.2569  data: 0.0005  max mem: 18975
Epoch: [206]  [1600/2502]  eta: 0:03:53  lr: 0.000876  min_lr: 0.000876  loss: 3.2249 (3.0747)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2677 (1.2028)  time: 0.2637  data: 0.0003  max mem: 18975
Epoch: [206]  [1800/2502]  eta: 0:03:01  lr: 0.000875  min_lr: 0.000875  loss: 3.2512 (3.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1448 (1.2059)  time: 0.2569  data: 0.0003  max mem: 18975
Epoch: [206]  [2000/2502]  eta: 0:02:09  lr: 0.000873  min_lr: 0.000873  loss: 2.7605 (3.0703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1319 (1.2034)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [206]  [2200/2502]  eta: 0:01:18  lr: 0.000872  min_lr: 0.000872  loss: 3.3179 (3.0655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1455 (1.2049)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [206]  [2400/2502]  eta: 0:00:26  lr: 0.000871  min_lr: 0.000871  loss: 3.4028 (3.0718)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2201 (1.2078)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [206]  [2501/2502]  eta: 0:00:00  lr: 0.000870  min_lr: 0.000870  loss: 2.7672 (3.0676)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1037 (1.2044)  time: 0.2308  data: 0.0009  max mem: 18975
Epoch: [206] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.000870  min_lr: 0.000870  loss: 2.7672 (3.0756)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1037 (1.2044)
Test:  [ 0/50]  eta: 0:03:11  loss: 0.5000 (0.5000)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 3.8281  data: 3.6798  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.8166 (0.7893)  acc1: 83.6000 (84.6909)  acc5: 97.2000 (97.2000)  time: 0.5157  data: 0.3926  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8166 (0.8220)  acc1: 82.4000 (83.8667)  acc5: 97.2000 (97.1619)  time: 0.2179  data: 0.0973  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0335 (0.9247)  acc1: 78.8000 (81.6903)  acc5: 94.4000 (95.9355)  time: 0.2672  data: 0.1474  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1695 (0.9887)  acc1: 76.4000 (80.2342)  acc5: 92.8000 (95.1805)  time: 0.2577  data: 0.1390  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1549 (1.0051)  acc1: 75.2000 (79.7520)  acc5: 93.6000 (95.1040)  time: 0.2208  data: 0.1021  max mem: 18975
Test: Total time: 0:00:14 (0.2994 s / it)
* Acc@1 79.744 Acc@5 95.052 loss 1.003
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.74%
Epoch: [207]  [   0/2502]  eta: 1:37:27  lr: 0.000870  min_lr: 0.000870  loss: 3.6608 (3.6608)  weight_decay: 0.0500 (0.0500)  time: 2.3373  data: 2.0571  max mem: 18975
Epoch: [207]  [ 200/2502]  eta: 0:10:20  lr: 0.000869  min_lr: 0.000869  loss: 3.3492 (3.0354)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2265 (1.1676)  time: 0.2587  data: 0.0005  max mem: 18975
Epoch: [207]  [ 400/2502]  eta: 0:09:15  lr: 0.000867  min_lr: 0.000867  loss: 3.0959 (3.0352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1064 (1.1783)  time: 0.2643  data: 0.0003  max mem: 18975
Epoch: [207]  [ 600/2502]  eta: 0:08:18  lr: 0.000866  min_lr: 0.000866  loss: 3.0156 (3.0372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1594 (1.1875)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [207]  [ 800/2502]  eta: 0:07:23  lr: 0.000865  min_lr: 0.000865  loss: 2.8896 (3.0422)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2089 (1.1984)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [207]  [1000/2502]  eta: 0:06:30  lr: 0.000863  min_lr: 0.000863  loss: 3.5314 (3.0572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2233 (1.2115)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [207]  [1200/2502]  eta: 0:05:38  lr: 0.000862  min_lr: 0.000862  loss: 3.5514 (3.0589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1910 (1.2112)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [207]  [1400/2502]  eta: 0:04:45  lr: 0.000861  min_lr: 0.000861  loss: 3.0979 (3.0591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0761 (1.2047)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [207]  [1600/2502]  eta: 0:03:53  lr: 0.000859  min_lr: 0.000859  loss: 3.4213 (3.0608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1705 (1.2057)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [207]  [1800/2502]  eta: 0:03:01  lr: 0.000858  min_lr: 0.000858  loss: 3.0026 (3.0599)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1480 (1.2052)  time: 0.2567  data: 0.0003  max mem: 18975
Epoch: [207]  [2000/2502]  eta: 0:02:09  lr: 0.000857  min_lr: 0.000857  loss: 3.0822 (3.0589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1620 (1.2140)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [207]  [2200/2502]  eta: 0:01:18  lr: 0.000855  min_lr: 0.000855  loss: 2.8504 (3.0563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2344 (1.2139)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [207]  [2400/2502]  eta: 0:00:26  lr: 0.000854  min_lr: 0.000854  loss: 2.5390 (3.0601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0964 (1.2124)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [207]  [2501/2502]  eta: 0:00:00  lr: 0.000853  min_lr: 0.000853  loss: 3.0588 (3.0634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3119 (1.2153)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [207] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000853  min_lr: 0.000853  loss: 3.0588 (3.0617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3119 (1.2153)
Test:  [ 0/50]  eta: 0:02:46  loss: 0.5545 (0.5545)  acc1: 93.2000 (93.2000)  acc5: 98.8000 (98.8000)  time: 3.3230  data: 3.1789  max mem: 18975
Test:  [10/50]  eta: 0:00:21  loss: 0.7716 (0.8108)  acc1: 85.2000 (84.5091)  acc5: 97.2000 (96.9091)  time: 0.5326  data: 0.4038  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8055 (0.8344)  acc1: 82.8000 (83.4286)  acc5: 96.8000 (96.8952)  time: 0.2554  data: 0.1319  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0906 (0.9444)  acc1: 78.4000 (81.4194)  acc5: 94.8000 (95.6903)  time: 0.2296  data: 0.1078  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2006 (1.0107)  acc1: 75.2000 (79.6781)  acc5: 92.4000 (95.1024)  time: 0.2192  data: 0.0978  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2006 (1.0274)  acc1: 74.0000 (79.1040)  acc5: 93.6000 (95.0240)  time: 0.2277  data: 0.1087  max mem: 18975
Test: Total time: 0:00:14 (0.2920 s / it)
* Acc@1 79.538 Acc@5 95.066 loss 1.020
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.74%
Epoch: [208]  [   0/2502]  eta: 1:31:03  lr: 0.000853  min_lr: 0.000853  loss: 3.7193 (3.7193)  weight_decay: 0.0500 (0.0500)  time: 2.1835  data: 1.8395  max mem: 18975
Epoch: [208]  [ 200/2502]  eta: 0:10:20  lr: 0.000852  min_lr: 0.000852  loss: 3.2374 (3.0241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2248 (1.2283)  time: 0.2636  data: 0.0004  max mem: 18975
Epoch: [208]  [ 400/2502]  eta: 0:09:12  lr: 0.000850  min_lr: 0.000850  loss: 2.6486 (3.0411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1717 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [208]  [ 600/2502]  eta: 0:08:16  lr: 0.000849  min_lr: 0.000849  loss: 3.1137 (3.0642)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1991 (inf)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [208]  [ 800/2502]  eta: 0:07:22  lr: 0.000848  min_lr: 0.000848  loss: 2.4577 (3.0357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1602 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [208]  [1000/2502]  eta: 0:06:29  lr: 0.000846  min_lr: 0.000846  loss: 3.4928 (3.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1687 (inf)  time: 0.2585  data: 0.0004  max mem: 18975
Epoch: [208]  [1200/2502]  eta: 0:05:37  lr: 0.000845  min_lr: 0.000845  loss: 2.9727 (3.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2452 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [208]  [1400/2502]  eta: 0:04:45  lr: 0.000844  min_lr: 0.000844  loss: 2.9341 (3.0588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1829 (inf)  time: 0.2596  data: 0.0004  max mem: 18975
Epoch: [208]  [1600/2502]  eta: 0:03:53  lr: 0.000842  min_lr: 0.000842  loss: 3.4792 (3.0527)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1159 (inf)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [208]  [1800/2502]  eta: 0:03:01  lr: 0.000841  min_lr: 0.000841  loss: 3.0593 (3.0604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1915 (inf)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [208]  [2000/2502]  eta: 0:02:09  lr: 0.000840  min_lr: 0.000840  loss: 3.4584 (3.0668)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1948 (inf)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [208]  [2200/2502]  eta: 0:01:17  lr: 0.000838  min_lr: 0.000838  loss: 3.2402 (3.0674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1284 (inf)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [208]  [2400/2502]  eta: 0:00:26  lr: 0.000837  min_lr: 0.000837  loss: 3.2846 (3.0655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1590 (inf)  time: 0.2575  data: 0.0003  max mem: 18975
Epoch: [208]  [2501/2502]  eta: 0:00:00  lr: 0.000836  min_lr: 0.000836  loss: 2.9149 (3.0682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1547 (inf)  time: 0.2310  data: 0.0011  max mem: 18975
Epoch: [208] Total time: 0:10:45 (0.2580 s / it)
Averaged stats: lr: 0.000836  min_lr: 0.000836  loss: 2.9149 (3.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1547 (inf)
Test:  [ 0/50]  eta: 0:02:27  loss: 0.5216 (0.5216)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 2.9541  data: 2.8155  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.7767 (0.8169)  acc1: 87.2000 (84.5091)  acc5: 96.8000 (96.8364)  time: 0.5175  data: 0.3931  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8005 (0.8365)  acc1: 83.2000 (83.8857)  acc5: 96.8000 (96.9524)  time: 0.2845  data: 0.1630  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 0.9254 (0.9413)  acc1: 78.8000 (81.7161)  acc5: 95.2000 (95.7161)  time: 0.2974  data: 0.1778  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1453 (1.0018)  acc1: 75.6000 (79.9220)  acc5: 92.8000 (95.0634)  time: 0.2564  data: 0.1370  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1448 (1.0124)  acc1: 75.2000 (79.2240)  acc5: 93.6000 (95.0480)  time: 0.2115  data: 0.0913  max mem: 18975
Test: Total time: 0:00:15 (0.3031 s / it)
* Acc@1 79.470 Acc@5 95.090 loss 1.007
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.74%
Epoch: [209]  [   0/2502]  eta: 1:37:58  lr: 0.000836  min_lr: 0.000836  loss: 2.4782 (2.4782)  weight_decay: 0.0500 (0.0500)  time: 2.3494  data: 2.0793  max mem: 18975
Epoch: [209]  [ 200/2502]  eta: 0:10:20  lr: 0.000835  min_lr: 0.000835  loss: 2.9325 (2.9676)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1884 (1.1902)  time: 0.2589  data: 0.0005  max mem: 18975
Epoch: [209]  [ 400/2502]  eta: 0:09:13  lr: 0.000834  min_lr: 0.000834  loss: 3.4755 (2.9859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2521 (1.2159)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [209]  [ 600/2502]  eta: 0:08:17  lr: 0.000832  min_lr: 0.000832  loss: 3.4139 (3.0197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1724 (1.2437)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [209]  [ 800/2502]  eta: 0:07:23  lr: 0.000831  min_lr: 0.000831  loss: 3.1505 (3.0175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1622 (1.2480)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [209]  [1000/2502]  eta: 0:06:30  lr: 0.000830  min_lr: 0.000830  loss: 3.1000 (3.0209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3077 (1.2461)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [209]  [1200/2502]  eta: 0:05:37  lr: 0.000828  min_lr: 0.000828  loss: 3.1745 (3.0318)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2172 (1.2458)  time: 0.2560  data: 0.0003  max mem: 18975
Epoch: [209]  [1400/2502]  eta: 0:04:45  lr: 0.000827  min_lr: 0.000827  loss: 3.1981 (3.0343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1108 (nan)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [209]  [1600/2502]  eta: 0:03:53  lr: 0.000826  min_lr: 0.000826  loss: 3.2348 (3.0390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2484 (nan)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [209]  [1800/2502]  eta: 0:03:01  lr: 0.000824  min_lr: 0.000824  loss: 3.1473 (3.0374)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0378 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [209]  [2000/2502]  eta: 0:02:09  lr: 0.000823  min_lr: 0.000823  loss: 3.5122 (3.0458)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2346 (nan)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [209]  [2200/2502]  eta: 0:01:18  lr: 0.000822  min_lr: 0.000822  loss: 3.0368 (3.0486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2125 (nan)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [209]  [2400/2502]  eta: 0:00:26  lr: 0.000820  min_lr: 0.000820  loss: 3.5021 (3.0473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (nan)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [209]  [2501/2502]  eta: 0:00:00  lr: 0.000820  min_lr: 0.000820  loss: 2.8332 (3.0479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (nan)  time: 0.2309  data: 0.0009  max mem: 18975
Epoch: [209] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.000820  min_lr: 0.000820  loss: 2.8332 (3.0472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (nan)
Test:  [ 0/50]  eta: 0:02:47  loss: 0.4652 (0.4652)  acc1: 92.8000 (92.8000)  acc5: 98.8000 (98.8000)  time: 3.3536  data: 3.2082  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.7257 (0.7620)  acc1: 86.0000 (85.0546)  acc5: 97.6000 (97.1636)  time: 0.5229  data: 0.4008  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.7763 (0.7911)  acc1: 83.2000 (83.9238)  acc5: 96.8000 (97.0667)  time: 0.2311  data: 0.1116  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 0.9631 (0.8974)  acc1: 78.8000 (81.4194)  acc5: 95.2000 (95.7161)  time: 0.2420  data: 0.1228  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1460 (0.9577)  acc1: 75.2000 (79.9512)  acc5: 92.8000 (95.0634)  time: 0.2513  data: 0.1314  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1325 (0.9732)  acc1: 74.0000 (79.4000)  acc5: 93.2000 (94.9680)  time: 0.2420  data: 0.1223  max mem: 18975
Test: Total time: 0:00:14 (0.2988 s / it)
* Acc@1 79.422 Acc@5 95.084 loss 0.975
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.74%
Epoch: [210]  [   0/2502]  eta: 1:32:25  lr: 0.000820  min_lr: 0.000820  loss: 3.2622 (3.2622)  weight_decay: 0.0500 (0.0500)  time: 2.2165  data: 1.5275  max mem: 18975
Epoch: [210]  [ 200/2502]  eta: 0:10:17  lr: 0.000818  min_lr: 0.000818  loss: 3.2133 (3.1018)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2492 (1.2007)  time: 0.2598  data: 0.0004  max mem: 18975
Epoch: [210]  [ 400/2502]  eta: 0:09:12  lr: 0.000817  min_lr: 0.000817  loss: 3.4858 (3.0934)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1409 (1.2067)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [210]  [ 600/2502]  eta: 0:08:15  lr: 0.000816  min_lr: 0.000816  loss: 2.9815 (3.0651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1649 (1.2070)  time: 0.2566  data: 0.0003  max mem: 18975
Epoch: [210]  [ 800/2502]  eta: 0:07:22  lr: 0.000814  min_lr: 0.000814  loss: 3.0796 (3.0748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1931 (1.2134)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [210]  [1000/2502]  eta: 0:06:29  lr: 0.000813  min_lr: 0.000813  loss: 2.8282 (3.0608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1888 (1.2101)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [210]  [1200/2502]  eta: 0:05:37  lr: 0.000812  min_lr: 0.000812  loss: 3.1063 (3.0715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1902 (1.2138)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [210]  [1400/2502]  eta: 0:04:45  lr: 0.000810  min_lr: 0.000810  loss: 3.2839 (3.0737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3488 (1.2205)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [210]  [1600/2502]  eta: 0:03:53  lr: 0.000809  min_lr: 0.000809  loss: 3.5217 (3.0742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1101 (1.2253)  time: 0.2567  data: 0.0004  max mem: 18975
Epoch: [210]  [1800/2502]  eta: 0:03:01  lr: 0.000808  min_lr: 0.000808  loss: 2.5932 (3.0655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1184 (1.2256)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [210]  [2000/2502]  eta: 0:02:09  lr: 0.000806  min_lr: 0.000806  loss: 3.3055 (3.0677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2834 (1.2277)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [210]  [2200/2502]  eta: 0:01:17  lr: 0.000805  min_lr: 0.000805  loss: 2.4824 (3.0701)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2217 (1.2258)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [210]  [2400/2502]  eta: 0:00:26  lr: 0.000804  min_lr: 0.000804  loss: 3.5948 (3.0711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1395 (1.2194)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [210]  [2501/2502]  eta: 0:00:00  lr: 0.000803  min_lr: 0.000803  loss: 3.0963 (3.0694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2367 (1.2205)  time: 0.2310  data: 0.0008  max mem: 18975
Epoch: [210] Total time: 0:10:46 (0.2582 s / it)
Averaged stats: lr: 0.000803  min_lr: 0.000803  loss: 3.0963 (3.0538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2367 (1.2205)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.4878 (0.4878)  acc1: 94.0000 (94.0000)  acc5: 99.6000 (99.6000)  time: 2.7909  data: 2.6487  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.8090 (0.8084)  acc1: 84.4000 (85.0182)  acc5: 97.2000 (97.2000)  time: 0.4357  data: 0.3128  max mem: 18975
Test:  [20/50]  eta: 0:00:11  loss: 0.8251 (0.8226)  acc1: 82.8000 (84.0952)  acc5: 96.8000 (97.2381)  time: 0.2462  data: 0.1232  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 0.9656 (0.9200)  acc1: 78.4000 (81.7161)  acc5: 95.6000 (95.9871)  time: 0.2832  data: 0.1611  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1499 (0.9744)  acc1: 76.0000 (80.2439)  acc5: 93.2000 (95.4829)  time: 0.2724  data: 0.1532  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1360 (0.9938)  acc1: 76.0000 (79.7280)  acc5: 94.0000 (95.2480)  time: 0.2219  data: 0.1028  max mem: 18975
Test: Total time: 0:00:14 (0.2975 s / it)
* Acc@1 79.804 Acc@5 95.244 loss 0.994
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [211]  [   0/2502]  eta: 1:41:22  lr: 0.000803  min_lr: 0.000803  loss: 3.1074 (3.1074)  weight_decay: 0.0500 (0.0500)  time: 2.4310  data: 2.1688  max mem: 18975
Epoch: [211]  [ 200/2502]  eta: 0:10:22  lr: 0.000802  min_lr: 0.000802  loss: 3.0460 (3.0294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2910 (1.2465)  time: 0.2597  data: 0.0005  max mem: 18975
Epoch: [211]  [ 400/2502]  eta: 0:09:14  lr: 0.000801  min_lr: 0.000801  loss: 3.3979 (3.0345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1284 (1.2215)  time: 0.2573  data: 0.0005  max mem: 18975
Epoch: [211]  [ 600/2502]  eta: 0:08:17  lr: 0.000799  min_lr: 0.000799  loss: 3.3013 (3.0332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3116 (1.2445)  time: 0.2561  data: 0.0003  max mem: 18975
Epoch: [211]  [ 800/2502]  eta: 0:07:23  lr: 0.000798  min_lr: 0.000798  loss: 3.1426 (3.0338)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1522 (1.2376)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [211]  [1000/2502]  eta: 0:06:30  lr: 0.000797  min_lr: 0.000797  loss: 3.2092 (3.0532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1880 (1.2333)  time: 0.2571  data: 0.0003  max mem: 18975
Epoch: [211]  [1200/2502]  eta: 0:05:37  lr: 0.000795  min_lr: 0.000795  loss: 3.1616 (3.0457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2873 (1.2438)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [211]  [1400/2502]  eta: 0:04:45  lr: 0.000794  min_lr: 0.000794  loss: 3.2691 (3.0596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2247 (1.2435)  time: 0.2566  data: 0.0005  max mem: 18975
Epoch: [211]  [1600/2502]  eta: 0:03:53  lr: 0.000793  min_lr: 0.000793  loss: 3.1473 (3.0552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1650 (1.2418)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [211]  [1800/2502]  eta: 0:03:01  lr: 0.000791  min_lr: 0.000791  loss: 2.6576 (3.0551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2119 (1.2418)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [211]  [2000/2502]  eta: 0:02:09  lr: 0.000790  min_lr: 0.000790  loss: 2.9263 (3.0541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1454 (1.2462)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [211]  [2200/2502]  eta: 0:01:18  lr: 0.000789  min_lr: 0.000789  loss: 3.4506 (3.0519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2472 (1.2426)  time: 0.2598  data: 0.0004  max mem: 18975
Epoch: [211]  [2400/2502]  eta: 0:00:26  lr: 0.000787  min_lr: 0.000787  loss: 3.3437 (3.0561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1807 (1.2339)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [211]  [2501/2502]  eta: 0:00:00  lr: 0.000787  min_lr: 0.000787  loss: 2.8035 (3.0553)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1953 (1.2333)  time: 0.2310  data: 0.0009  max mem: 18975
Epoch: [211] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.000787  min_lr: 0.000787  loss: 2.8035 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1953 (1.2333)
Test:  [ 0/50]  eta: 0:02:36  loss: 0.5461 (0.5461)  acc1: 92.8000 (92.8000)  acc5: 98.8000 (98.8000)  time: 3.1349  data: 2.9893  max mem: 18975
Test:  [10/50]  eta: 0:00:18  loss: 0.8166 (0.8194)  acc1: 85.2000 (84.6909)  acc5: 98.0000 (97.3091)  time: 0.4658  data: 0.3431  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8263 (0.8353)  acc1: 82.8000 (83.6952)  acc5: 97.2000 (97.2762)  time: 0.1988  data: 0.0755  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 0.9745 (0.9305)  acc1: 80.0000 (81.2774)  acc5: 95.2000 (96.0387)  time: 0.2091  data: 0.0864  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.1551 (0.9910)  acc1: 76.0000 (79.7561)  acc5: 92.8000 (95.2488)  time: 0.2476  data: 0.1285  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1393 (0.9997)  acc1: 76.0000 (79.3840)  acc5: 92.8000 (95.0400)  time: 0.2168  data: 0.0978  max mem: 18975
Test: Total time: 0:00:13 (0.2729 s / it)
* Acc@1 79.642 Acc@5 95.180 loss 0.991
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.80%
Epoch: [212]  [   0/2502]  eta: 1:21:15  lr: 0.000787  min_lr: 0.000787  loss: 3.1846 (3.1846)  weight_decay: 0.0500 (0.0500)  time: 1.9488  data: 1.5900  max mem: 18975
Epoch: [212]  [ 200/2502]  eta: 0:10:22  lr: 0.000785  min_lr: 0.000785  loss: 3.1012 (2.9787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.1966)  time: 0.2612  data: 0.0005  max mem: 18975
Epoch: [212]  [ 400/2502]  eta: 0:09:15  lr: 0.000784  min_lr: 0.000784  loss: 3.2729 (2.9914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4254 (1.2596)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [212]  [ 600/2502]  eta: 0:08:17  lr: 0.000783  min_lr: 0.000783  loss: 3.0321 (2.9887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1518 (1.2682)  time: 0.2555  data: 0.0003  max mem: 18975
Epoch: [212]  [ 800/2502]  eta: 0:07:23  lr: 0.000781  min_lr: 0.000781  loss: 3.1970 (3.0078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2036 (1.2565)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [212]  [1000/2502]  eta: 0:06:29  lr: 0.000780  min_lr: 0.000780  loss: 3.0902 (3.0054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3509 (1.2636)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [212]  [1200/2502]  eta: 0:05:37  lr: 0.000779  min_lr: 0.000779  loss: 2.9860 (3.0027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1810 (1.2573)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [212]  [1400/2502]  eta: 0:04:45  lr: 0.000778  min_lr: 0.000778  loss: 3.1445 (2.9992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2849 (1.2592)  time: 0.2564  data: 0.0004  max mem: 18975
Epoch: [212]  [1600/2502]  eta: 0:03:53  lr: 0.000776  min_lr: 0.000776  loss: 3.4784 (3.0128)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2626 (1.2590)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [212]  [1800/2502]  eta: 0:03:01  lr: 0.000775  min_lr: 0.000775  loss: 3.1549 (3.0142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2282 (1.2644)  time: 0.2587  data: 0.0004  max mem: 18975
Epoch: [212]  [2000/2502]  eta: 0:02:09  lr: 0.000774  min_lr: 0.000774  loss: 3.0762 (3.0153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1725 (1.2591)  time: 0.2577  data: 0.0004  max mem: 18975
Epoch: [212]  [2200/2502]  eta: 0:01:18  lr: 0.000772  min_lr: 0.000772  loss: 3.0477 (3.0181)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1257 (1.2517)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [212]  [2400/2502]  eta: 0:00:26  lr: 0.000771  min_lr: 0.000771  loss: 2.8282 (3.0259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2090 (1.2494)  time: 0.2573  data: 0.0004  max mem: 18975
Epoch: [212]  [2501/2502]  eta: 0:00:00  lr: 0.000770  min_lr: 0.000770  loss: 3.2621 (3.0286)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2445 (1.2527)  time: 0.2311  data: 0.0009  max mem: 18975
Epoch: [212] Total time: 0:10:46 (0.2583 s / it)
Averaged stats: lr: 0.000770  min_lr: 0.000770  loss: 3.2621 (3.0360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2445 (1.2527)
Test:  [ 0/50]  eta: 0:03:11  loss: 0.5577 (0.5577)  acc1: 93.6000 (93.6000)  acc5: 98.4000 (98.4000)  time: 3.8334  data: 3.6950  max mem: 18975
Test:  [10/50]  eta: 0:00:22  loss: 0.8700 (0.8826)  acc1: 86.4000 (84.8364)  acc5: 97.6000 (97.3818)  time: 0.5740  data: 0.4508  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8700 (0.8931)  acc1: 83.2000 (84.1333)  acc5: 97.6000 (97.3333)  time: 0.2368  data: 0.1162  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 0.9946 (1.0038)  acc1: 79.6000 (81.6387)  acc5: 96.4000 (96.0000)  time: 0.2418  data: 0.1222  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2825 (1.0716)  acc1: 74.8000 (79.7659)  acc5: 92.4000 (95.2390)  time: 0.2389  data: 0.1194  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.2467 (1.0842)  acc1: 74.8000 (79.3920)  acc5: 93.6000 (95.1120)  time: 0.2383  data: 0.1193  max mem: 18975
Test: Total time: 0:00:14 (0.2907 s / it)
* Acc@1 79.402 Acc@5 95.082 loss 1.084
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.80%
Epoch: [213]  [   0/2502]  eta: 1:25:42  lr: 0.000770  min_lr: 0.000770  loss: 2.2015 (2.2015)  weight_decay: 0.0500 (0.0500)  time: 2.0552  data: 1.6745  max mem: 18975
Epoch: [213]  [ 200/2502]  eta: 0:10:23  lr: 0.000769  min_lr: 0.000769  loss: 3.5080 (3.0469)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2241 (1.2934)  time: 0.2606  data: 0.0004  max mem: 18975
Epoch: [213]  [ 400/2502]  eta: 0:09:14  lr: 0.000768  min_lr: 0.000768  loss: 2.6789 (3.0248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1770 (1.2658)  time: 0.2563  data: 0.0003  max mem: 18975
Epoch: [213]  [ 600/2502]  eta: 0:08:17  lr: 0.000766  min_lr: 0.000766  loss: 2.7114 (3.0141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2039 (1.2572)  time: 0.2570  data: 0.0003  max mem: 18975
Epoch: [213]  [ 800/2502]  eta: 0:07:23  lr: 0.000765  min_lr: 0.000765  loss: 2.9657 (3.0232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1693 (1.2487)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [213]  [1000/2502]  eta: 0:06:30  lr: 0.000764  min_lr: 0.000764  loss: 3.0604 (3.0257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1768 (1.2417)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [213]  [1200/2502]  eta: 0:05:37  lr: 0.000763  min_lr: 0.000763  loss: 3.2347 (3.0336)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1930 (1.2477)  time: 0.2565  data: 0.0003  max mem: 18975
Epoch: [213]  [1400/2502]  eta: 0:04:45  lr: 0.000761  min_lr: 0.000761  loss: 3.1011 (3.0476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2425 (1.2450)  time: 0.2568  data: 0.0003  max mem: 18975
Epoch: [213]  [1600/2502]  eta: 0:03:53  lr: 0.000760  min_lr: 0.000760  loss: 3.1805 (3.0431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2436 (1.2470)  time: 0.2627  data: 0.0004  max mem: 18975
Epoch: [213]  [1800/2502]  eta: 0:03:01  lr: 0.000759  min_lr: 0.000759  loss: 2.9951 (3.0456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2342 (1.2545)  time: 0.2565  data: 0.0004  max mem: 18975
Epoch: [213]  [2000/2502]  eta: 0:02:09  lr: 0.000757  min_lr: 0.000757  loss: 2.6422 (3.0474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3229 (1.2588)  time: 0.2554  data: 0.0004  max mem: 18975
Epoch: [213]  [2200/2502]  eta: 0:01:18  lr: 0.000756  min_lr: 0.000756  loss: 3.4669 (3.0365)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1817 (1.2587)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [213]  [2400/2502]  eta: 0:00:26  lr: 0.000755  min_lr: 0.000755  loss: 2.8831 (3.0313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1956 (1.2547)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [213]  [2501/2502]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.9140 (3.0367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2431 (1.2550)  time: 0.2307  data: 0.0007  max mem: 18975
Epoch: [213] Total time: 0:10:45 (0.2582 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.9140 (3.0308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2431 (1.2550)
Test:  [ 0/50]  eta: 0:02:16  loss: 0.5296 (0.5296)  acc1: 93.2000 (93.2000)  acc5: 98.8000 (98.8000)  time: 2.7345  data: 2.5958  max mem: 18975
Test:  [10/50]  eta: 0:00:20  loss: 0.8166 (0.8175)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (97.2000)  time: 0.5125  data: 0.3911  max mem: 18975
Test:  [20/50]  eta: 0:00:12  loss: 0.8166 (0.8295)  acc1: 83.6000 (84.0381)  acc5: 97.2000 (97.2571)  time: 0.2946  data: 0.1745  max mem: 18975
Test:  [30/50]  eta: 0:00:07  loss: 1.0031 (0.9300)  acc1: 79.6000 (81.7032)  acc5: 95.2000 (95.9871)  time: 0.2802  data: 0.1603  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.1139 (0.9880)  acc1: 75.2000 (80.2146)  acc5: 92.8000 (95.2390)  time: 0.2396  data: 0.1206  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1139 (1.0005)  acc1: 75.2000 (79.5680)  acc5: 92.8000 (95.1440)  time: 0.1975  data: 0.0785  max mem: 18975
Test: Total time: 0:00:14 (0.2917 s / it)
* Acc@1 79.766 Acc@5 95.234 loss 0.999
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [214]  [   0/2502]  eta: 1:34:25  lr: 0.000754  min_lr: 0.000754  loss: 3.1848 (3.1848)  weight_decay: 0.0500 (0.0500)  time: 2.2644  data: 1.4260  max mem: 18975
Epoch: [214]  [ 200/2502]  eta: 0:10:21  lr: 0.000753  min_lr: 0.000753  loss: 2.8754 (3.0259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4213 (1.3337)  time: 0.2588  data: 0.0004  max mem: 18975
Epoch: [214]  [ 400/2502]  eta: 0:09:14  lr: 0.000752  min_lr: 0.000752  loss: 3.2301 (2.9961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2011 (1.2980)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [214]  [ 600/2502]  eta: 0:08:17  lr: 0.000750  min_lr: 0.000750  loss: 3.2603 (3.0253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1593 (1.2655)  time: 0.2563  data: 0.0004  max mem: 18975
Epoch: [214]  [ 800/2502]  eta: 0:07:23  lr: 0.000749  min_lr: 0.000749  loss: 2.8757 (3.0165)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2255 (1.2625)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [214]  [1000/2502]  eta: 0:06:30  lr: 0.000748  min_lr: 0.000748  loss: 2.9733 (3.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1811 (1.2574)  time: 0.2570  data: 0.0004  max mem: 18975
Epoch: [214]  [1200/2502]  eta: 0:05:37  lr: 0.000746  min_lr: 0.000746  loss: 3.1542 (3.0213)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2126 (1.2543)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [214]  [1400/2502]  eta: 0:04:45  lr: 0.000745  min_lr: 0.000745  loss: 3.0480 (3.0172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2320 (1.2660)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [214]  [1600/2502]  eta: 0:03:53  lr: 0.000744  min_lr: 0.000744  loss: 3.2266 (3.0215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3980 (1.2718)  time: 0.2578  data: 0.0004  max mem: 18975
Epoch: [214]  [1800/2502]  eta: 0:03:01  lr: 0.000743  min_lr: 0.000743  loss: 3.3368 (3.0236)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2355 (1.2726)  time: 0.2560  data: 0.0004  max mem: 18975
Epoch: [214]  [2000/2502]  eta: 0:02:09  lr: 0.000741  min_lr: 0.000741  loss: 2.7681 (3.0253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1933 (1.2705)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [214]  [2200/2502]  eta: 0:01:18  lr: 0.000740  min_lr: 0.000740  loss: 2.4106 (3.0247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2305 (1.2715)  time: 0.2584  data: 0.0004  max mem: 18975
Epoch: [214]  [2400/2502]  eta: 0:00:26  lr: 0.000739  min_lr: 0.000739  loss: 3.0863 (3.0255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2467 (1.2724)  time: 0.2566  data: 0.0004  max mem: 18975
Epoch: [214]  [2501/2502]  eta: 0:00:00  lr: 0.000738  min_lr: 0.000738  loss: 2.9697 (3.0250)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.2718)  time: 0.2317  data: 0.0009  max mem: 18975
Epoch: [214] Total time: 0:10:46 (0.2585 s / it)
Averaged stats: lr: 0.000738  min_lr: 0.000738  loss: 2.9697 (3.0345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.2718)
Test:  [ 0/50]  eta: 0:02:12  loss: 0.5321 (0.5321)  acc1: 94.4000 (94.4000)  acc5: 98.8000 (98.8000)  time: 2.6597  data: 2.5159  max mem: 18975
Test:  [10/50]  eta: 0:00:16  loss: 0.8347 (0.8304)  acc1: 87.2000 (85.7091)  acc5: 97.2000 (97.3091)  time: 0.4116  data: 0.2870  max mem: 18975
Test:  [20/50]  eta: 0:00:10  loss: 0.8347 (0.8465)  acc1: 83.2000 (84.4762)  acc5: 97.2000 (97.2381)  time: 0.2327  data: 0.1115  max mem: 18975
Test:  [30/50]  eta: 0:00:06  loss: 1.0045 (0.9538)  acc1: 80.4000 (82.0387)  acc5: 95.2000 (95.9742)  time: 0.2838  data: 0.1638  max mem: 18975
Test:  [40/50]  eta: 0:00:03  loss: 1.2110 (1.0202)  acc1: 74.8000 (80.3220)  acc5: 92.8000 (95.3171)  time: 0.2607  data: 0.1411  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1945 (1.0353)  acc1: 74.0000 (79.7680)  acc5: 93.6000 (95.2000)  time: 0.2067  data: 0.0876  max mem: 18975
Test: Total time: 0:00:14 (0.2863 s / it)
* Acc@1 79.782 Acc@5 95.274 loss 1.031
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
Epoch: [215]  [   0/2502]  eta: 1:32:02  lr: 0.000738  min_lr: 0.000738  loss: 2.8194 (2.8194)  weight_decay: 0.0500 (0.0500)  time: 2.2071  data: 1.9302  max mem: 18975
Epoch: [215]  [ 200/2502]  eta: 0:10:18  lr: 0.000737  min_lr: 0.000737  loss: 2.6272 (3.0592)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2542 (1.2633)  time: 0.2596  data: 0.0004  max mem: 18975
Epoch: [215]  [ 400/2502]  eta: 0:09:12  lr: 0.000736  min_lr: 0.000736  loss: 3.3514 (3.0639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2753 (1.2783)  time: 0.2579  data: 0.0004  max mem: 18975
Epoch: [215]  [ 600/2502]  eta: 0:08:16  lr: 0.000734  min_lr: 0.000734  loss: 3.2981 (3.0639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2830 (1.2664)  time: 0.2575  data: 0.0005  max mem: 18975
Epoch: [215]  [ 800/2502]  eta: 0:07:23  lr: 0.000733  min_lr: 0.000733  loss: 2.6873 (3.0383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1049 (1.2574)  time: 0.2571  data: 0.0004  max mem: 18975
Epoch: [215]  [1000/2502]  eta: 0:06:29  lr: 0.000732  min_lr: 0.000732  loss: 3.0640 (3.0277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2210 (1.2591)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [215]  [1200/2502]  eta: 0:05:37  lr: 0.000730  min_lr: 0.000730  loss: 2.8153 (3.0348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2668 (1.2737)  time: 0.2572  data: 0.0004  max mem: 18975
Epoch: [215]  [1400/2502]  eta: 0:04:45  lr: 0.000729  min_lr: 0.000729  loss: 3.2623 (3.0397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2709 (1.2812)  time: 0.2574  data: 0.0004  max mem: 18975
Epoch: [215]  [1600/2502]  eta: 0:03:53  lr: 0.000728  min_lr: 0.000728  loss: 2.9973 (3.0402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2206 (1.2850)  time: 0.2562  data: 0.0004  max mem: 18975
Epoch: [215]  [1800/2502]  eta: 0:03:01  lr: 0.000727  min_lr: 0.000727  loss: 2.5914 (3.0431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2429 (1.2876)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [215]  [2000/2502]  eta: 0:02:09  lr: 0.000725  min_lr: 0.000725  loss: 3.3711 (3.0423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3435 (1.2946)  time: 0.2576  data: 0.0004  max mem: 18975
Epoch: [215]  [2200/2502]  eta: 0:01:18  lr: 0.000724  min_lr: 0.000724  loss: 3.2996 (3.0413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1943 (1.2884)  time: 0.2569  data: 0.0004  max mem: 18975
Epoch: [215]  [2400/2502]  eta: 0:00:26  lr: 0.000723  min_lr: 0.000723  loss: 3.4555 (3.0452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3158 (1.2870)  time: 0.2568  data: 0.0004  max mem: 18975
Epoch: [215]  [2501/2502]  eta: 0:00:00  lr: 0.000722  min_lr: 0.000722  loss: 3.1574 (3.0447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1707 (1.2846)  time: 0.2307  data: 0.0009  max mem: 18975
Epoch: [215] Total time: 0:10:46 (0.2584 s / it)
Averaged stats: lr: 0.000722  min_lr: 0.000722  loss: 3.1574 (3.0395)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1707 (1.2846)
Test:  [ 0/50]  eta: 0:02:34  loss: 0.5284 (0.5284)  acc1: 93.2000 (93.2000)  acc5: 98.4000 (98.4000)  time: 3.0981  data: 2.9543  max mem: 18975
Test:  [10/50]  eta: 0:00:17  loss: 0.8159 (0.8237)  acc1: 85.6000 (84.9818)  acc5: 97.2000 (97.1636)  time: 0.4441  data: 0.3226  max mem: 18975
Test:  [20/50]  eta: 0:00:09  loss: 0.8211 (0.8477)  acc1: 82.4000 (84.0952)  acc5: 97.2000 (97.1048)  time: 0.1900  data: 0.0702  max mem: 18975
Test:  [30/50]  eta: 0:00:05  loss: 1.0122 (0.9461)  acc1: 79.6000 (81.8194)  acc5: 94.8000 (95.9355)  time: 0.2020  data: 0.0822  max mem: 18975
Test:  [40/50]  eta: 0:00:02  loss: 1.1576 (1.0064)  acc1: 75.6000 (80.1268)  acc5: 92.8000 (95.3561)  time: 0.2358  data: 0.1155  max mem: 18975
Test:  [49/50]  eta: 0:00:00  loss: 1.1534 (1.0213)  acc1: 75.6000 (79.6240)  acc5: 94.0000 (95.2560)  time: 0.2245  data: 0.1038  max mem: 18975
Test: Total time: 0:00:13 (0.2677 s / it)
* Acc@1 79.784 Acc@5 95.282 loss 1.017
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.80%
| distributed init (rank 0): env://, gpu 0
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 7): env://, gpu 7
Namespace(batch_size=128, epochs=300, update_freq=4, model='tiny', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.4, cutmix=0.5, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_tiny_2.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f2bd9c0bf10>
Mixup is activated!
Model = SFCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (layer1): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((24,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=24, bias=False)
        (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
          (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): Identity()
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.003)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.006)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(192, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.009)
    )
  )
  (layer2): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.011)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.014)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.017)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.020)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.026)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.029)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.031)
    )
  )
  (layer3): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
        (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.034)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.037)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.040)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.043)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.046)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.049)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.051)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.054)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.057)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.060)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.063)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.066)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.069)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.071)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.074)
    )
    (15): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.077)
    )
    (16): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.080)
    )
    (17): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.083)
    )
    (18): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.086)
    )
    (19): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
        (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(768, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.089)
    )
  )
  (layer4): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.091)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.094)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.097)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
        (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1536, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.100)
    )
  )
  (head): ConvX(
    (conv): Conv2d(384, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 15945832
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.le.conv.weight",
      "layer1.0.conv_in.conv.weight",
      "layer1.0.conv.conv.weight",
      "layer1.0.conv_out.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.le.conv.weight",
      "layer1.1.conv_in.conv.weight",
      "layer1.1.conv.conv.weight",
      "layer1.1.conv_out.conv.weight",
      "layer1.2.le.conv.weight",
      "layer1.2.conv_in.conv.weight",
      "layer1.2.conv.conv.weight",
      "layer1.2.conv_out.conv.weight",
      "layer1.3.le.conv.weight",
      "layer1.3.conv_in.conv.weight",
      "layer1.3.conv.conv.weight",
      "layer1.3.conv_out.conv.weight",
      "layer2.0.le.conv.weight",
      "layer2.0.conv_in.conv.weight",
      "layer2.0.conv.conv.weight",
      "layer2.0.conv_out.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.le.conv.weight",
      "layer2.1.conv_in.conv.weight",
      "layer2.1.conv.conv.weight",
      "layer2.1.conv_out.conv.weight",
      "layer2.2.le.conv.weight",
      "layer2.2.conv_in.conv.weight",
      "layer2.2.conv.conv.weight",
      "layer2.2.conv_out.conv.weight",
      "layer2.3.le.conv.weight",
      "layer2.3.conv_in.conv.weight",
      "layer2.3.conv.conv.weight",
      "layer2.3.conv_out.conv.weight",
      "layer2.4.le.conv.weight",
      "layer2.4.conv_in.conv.weight",
      "layer2.4.conv.conv.weight",
      "layer2.4.conv_out.conv.weight",
      "layer2.5.le.conv.weight",
      "layer2.5.conv_in.conv.weight",
      "layer2.5.conv.conv.weight",
      "layer2.5.conv_out.conv.weight",
      "layer2.6.le.conv.weight",
      "layer2.6.conv_in.conv.weight",
      "layer2.6.conv.conv.weight",
      "layer2.6.conv_out.conv.weight",
      "layer2.7.le.conv.weight",
      "layer2.7.conv_in.conv.weight",
      "layer2.7.conv.conv.weight",
      "layer2.7.conv_out.conv.weight",
      "layer3.0.le.conv.weight",
      "layer3.0.conv_in.conv.weight",
      "layer3.0.conv.conv.weight",
      "layer3.0.conv_out.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.le.conv.weight",
      "layer3.1.conv_in.conv.weight",
      "layer3.1.conv.conv.weight",
      "layer3.1.conv_out.conv.weight",
      "layer3.2.le.conv.weight",
      "layer3.2.conv_in.conv.weight",
      "layer3.2.conv.conv.weight",
      "layer3.2.conv_out.conv.weight",
      "layer3.3.le.conv.weight",
      "layer3.3.conv_in.conv.weight",
      "layer3.3.conv.conv.weight",
      "layer3.3.conv_out.conv.weight",
      "layer3.4.le.conv.weight",
      "layer3.4.conv_in.conv.weight",
      "layer3.4.conv.conv.weight",
      "layer3.4.conv_out.conv.weight",
      "layer3.5.le.conv.weight",
      "layer3.5.conv_in.conv.weight",
      "layer3.5.conv.conv.weight",
      "layer3.5.conv_out.conv.weight",
      "layer3.6.le.conv.weight",
      "layer3.6.conv_in.conv.weight",
      "layer3.6.conv.conv.weight",
      "layer3.6.conv_out.conv.weight",
      "layer3.7.le.conv.weight",
      "layer3.7.conv_in.conv.weight",
      "layer3.7.conv.conv.weight",
      "layer3.7.conv_out.conv.weight",
      "layer3.8.le.conv.weight",
      "layer3.8.conv_in.conv.weight",
      "layer3.8.conv.conv.weight",
      "layer3.8.conv_out.conv.weight",
      "layer3.9.le.conv.weight",
      "layer3.9.conv_in.conv.weight",
      "layer3.9.conv.conv.weight",
      "layer3.9.conv_out.conv.weight",
      "layer3.10.le.conv.weight",
      "layer3.10.conv_in.conv.weight",
      "layer3.10.conv.conv.weight",
      "layer3.10.conv_out.conv.weight",
      "layer3.11.le.conv.weight",
      "layer3.11.conv_in.conv.weight",
      "layer3.11.conv.conv.weight",
      "layer3.11.conv_out.conv.weight",
      "layer3.12.le.conv.weight",
      "layer3.12.conv_in.conv.weight",
      "layer3.12.conv.conv.weight",
      "layer3.12.conv_out.conv.weight",
      "layer3.13.le.conv.weight",
      "layer3.13.conv_in.conv.weight",
      "layer3.13.conv.conv.weight",
      "layer3.13.conv_out.conv.weight",
      "layer3.14.le.conv.weight",
      "layer3.14.conv_in.conv.weight",
      "layer3.14.conv.conv.weight",
      "layer3.14.conv_out.conv.weight",
      "layer3.15.le.conv.weight",
      "layer3.15.conv_in.conv.weight",
      "layer3.15.conv.conv.weight",
      "layer3.15.conv_out.conv.weight",
      "layer3.16.le.conv.weight",
      "layer3.16.conv_in.conv.weight",
      "layer3.16.conv.conv.weight",
      "layer3.16.conv_out.conv.weight",
      "layer3.17.le.conv.weight",
      "layer3.17.conv_in.conv.weight",
      "layer3.17.conv.conv.weight",
      "layer3.17.conv_out.conv.weight",
      "layer3.18.le.conv.weight",
      "layer3.18.conv_in.conv.weight",
      "layer3.18.conv.conv.weight",
      "layer3.18.conv_out.conv.weight",
      "layer3.19.le.conv.weight",
      "layer3.19.conv_in.conv.weight",
      "layer3.19.conv.conv.weight",
      "layer3.19.conv_out.conv.weight",
      "layer4.0.le.conv.weight",
      "layer4.0.conv_in.conv.weight",
      "layer4.0.conv.conv.weight",
      "layer4.0.conv_out.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.le.conv.weight",
      "layer4.1.conv_in.conv.weight",
      "layer4.1.conv.conv.weight",
      "layer4.1.conv_out.conv.weight",
      "layer4.2.le.conv.weight",
      "layer4.2.conv_in.conv.weight",
      "layer4.2.conv.conv.weight",
      "layer4.2.conv_out.conv.weight",
      "layer4.3.le.conv.weight",
      "layer4.3.conv_in.conv.weight",
      "layer4.3.conv.conv.weight",
      "layer4.3.conv_out.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.ln.weight",
      "layer1.0.ln.bias",
      "layer1.0.le.norm.weight",
      "layer1.0.le.norm.bias",
      "layer1.0.conv_in.norm.weight",
      "layer1.0.conv_in.norm.bias",
      "layer1.0.conv.norm.weight",
      "layer1.0.conv.norm.bias",
      "layer1.0.conv_out.norm.weight",
      "layer1.0.conv_out.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.le.norm.weight",
      "layer1.1.le.norm.bias",
      "layer1.1.conv_in.norm.weight",
      "layer1.1.conv_in.norm.bias",
      "layer1.1.conv.norm.weight",
      "layer1.1.conv.norm.bias",
      "layer1.1.conv_out.norm.weight",
      "layer1.1.conv_out.norm.bias",
      "layer1.2.le.norm.weight",
      "layer1.2.le.norm.bias",
      "layer1.2.conv_in.norm.weight",
      "layer1.2.conv_in.norm.bias",
      "layer1.2.conv.norm.weight",
      "layer1.2.conv.norm.bias",
      "layer1.2.conv_out.norm.weight",
      "layer1.2.conv_out.norm.bias",
      "layer1.3.le.norm.weight",
      "layer1.3.le.norm.bias",
      "layer1.3.conv_in.norm.weight",
      "layer1.3.conv_in.norm.bias",
      "layer1.3.conv.norm.weight",
      "layer1.3.conv.norm.bias",
      "layer1.3.conv_out.norm.weight",
      "layer1.3.conv_out.norm.bias",
      "layer2.0.ln.weight",
      "layer2.0.ln.bias",
      "layer2.0.le.norm.weight",
      "layer2.0.le.norm.bias",
      "layer2.0.conv_in.norm.weight",
      "layer2.0.conv_in.norm.bias",
      "layer2.0.conv.norm.weight",
      "layer2.0.conv.norm.bias",
      "layer2.0.conv_out.norm.weight",
      "layer2.0.conv_out.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.le.norm.weight",
      "layer2.1.le.norm.bias",
      "layer2.1.conv_in.norm.weight",
      "layer2.1.conv_in.norm.bias",
      "layer2.1.conv.norm.weight",
      "layer2.1.conv.norm.bias",
      "layer2.1.conv_out.norm.weight",
      "layer2.1.conv_out.norm.bias",
      "layer2.2.le.norm.weight",
      "layer2.2.le.norm.bias",
      "layer2.2.conv_in.norm.weight",
      "layer2.2.conv_in.norm.bias",
      "layer2.2.conv.norm.weight",
      "layer2.2.conv.norm.bias",
      "layer2.2.conv_out.norm.weight",
      "layer2.2.conv_out.norm.bias",
      "layer2.3.le.norm.weight",
      "layer2.3.le.norm.bias",
      "layer2.3.conv_in.norm.weight",
      "layer2.3.conv_in.norm.bias",
      "layer2.3.conv.norm.weight",
      "layer2.3.conv.norm.bias",
      "layer2.3.conv_out.norm.weight",
      "layer2.3.conv_out.norm.bias",
      "layer2.4.le.norm.weight",
      "layer2.4.le.norm.bias",
      "layer2.4.conv_in.norm.weight",
      "layer2.4.conv_in.norm.bias",
      "layer2.4.conv.norm.weight",
      "layer2.4.conv.norm.bias",
      "layer2.4.conv_out.norm.weight",
      "layer2.4.conv_out.norm.bias",
      "layer2.5.le.norm.weight",
      "layer2.5.le.norm.bias",
      "layer2.5.conv_in.norm.weight",
      "layer2.5.conv_in.norm.bias",
      "layer2.5.conv.norm.weight",
      "layer2.5.conv.norm.bias",
      "layer2.5.conv_out.norm.weight",
      "layer2.5.conv_out.norm.bias",
      "layer2.6.le.norm.weight",
      "layer2.6.le.norm.bias",
      "layer2.6.conv_in.norm.weight",
      "layer2.6.conv_in.norm.bias",
      "layer2.6.conv.norm.weight",
      "layer2.6.conv.norm.bias",
      "layer2.6.conv_out.norm.weight",
      "layer2.6.conv_out.norm.bias",
      "layer2.7.le.norm.weight",
      "layer2.7.le.norm.bias",
      "layer2.7.conv_in.norm.weight",
      "layer2.7.conv_in.norm.bias",
      "layer2.7.conv.norm.weight",
      "layer2.7.conv.norm.bias",
      "layer2.7.conv_out.norm.weight",
      "layer2.7.conv_out.norm.bias",
      "layer3.0.ln.weight",
      "layer3.0.ln.bias",
      "layer3.0.le.norm.weight",
      "layer3.0.le.norm.bias",
      "layer3.0.conv_in.norm.weight",
      "layer3.0.conv_in.norm.bias",
      "layer3.0.conv.norm.weight",
      "layer3.0.conv.norm.bias",
      "layer3.0.conv_out.norm.weight",
      "layer3.0.conv_out.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.le.norm.weight",
      "layer3.1.le.norm.bias",
      "layer3.1.conv_in.norm.weight",
      "layer3.1.conv_in.norm.bias",
      "layer3.1.conv.norm.weight",
      "layer3.1.conv.norm.bias",
      "layer3.1.conv_out.norm.weight",
      "layer3.1.conv_out.norm.bias",
      "layer3.2.le.norm.weight",
      "layer3.2.le.norm.bias",
      "layer3.2.conv_in.norm.weight",
      "layer3.2.conv_in.norm.bias",
      "layer3.2.conv.norm.weight",
      "layer3.2.conv.norm.bias",
      "layer3.2.conv_out.norm.weight",
      "layer3.2.conv_out.norm.bias",
      "layer3.3.le.norm.weight",
      "layer3.3.le.norm.bias",
      "layer3.3.conv_in.norm.weight",
      "layer3.3.conv_in.norm.bias",
      "layer3.3.conv.norm.weight",
      "layer3.3.conv.norm.bias",
      "layer3.3.conv_out.norm.weight",
      "layer3.3.conv_out.norm.bias",
      "layer3.4.le.norm.weight",
      "layer3.4.le.norm.bias",
      "layer3.4.conv_in.norm.weight",
      "layer3.4.conv_in.norm.bias",
      "layer3.4.conv.norm.weight",
      "layer3.4.conv.norm.bias",
      "layer3.4.conv_out.norm.weight",
      "layer3.4.conv_out.norm.bias",
      "layer3.5.le.norm.weight",
      "layer3.5.le.norm.bias",
      "layer3.5.conv_in.norm.weight",
      "layer3.5.conv_in.norm.bias",
      "layer3.5.conv.norm.weight",
      "layer3.5.conv.norm.bias",
      "layer3.5.conv_out.norm.weight",
      "layer3.5.conv_out.norm.bias",
      "layer3.6.le.norm.weight",
      "layer3.6.le.norm.bias",
      "layer3.6.conv_in.norm.weight",
      "layer3.6.conv_in.norm.bias",
      "layer3.6.conv.norm.weight",
      "layer3.6.conv.norm.bias",
      "layer3.6.conv_out.norm.weight",
      "layer3.6.conv_out.norm.bias",
      "layer3.7.le.norm.weight",
      "layer3.7.le.norm.bias",
      "layer3.7.conv_in.norm.weight",
      "layer3.7.conv_in.norm.bias",
      "layer3.7.conv.norm.weight",
      "layer3.7.conv.norm.bias",
      "layer3.7.conv_out.norm.weight",
      "layer3.7.conv_out.norm.bias",
      "layer3.8.le.norm.weight",
      "layer3.8.le.norm.bias",
      "layer3.8.conv_in.norm.weight",
      "layer3.8.conv_in.norm.bias",
      "layer3.8.conv.norm.weight",
      "layer3.8.conv.norm.bias",
      "layer3.8.conv_out.norm.weight",
      "layer3.8.conv_out.norm.bias",
      "layer3.9.le.norm.weight",
      "layer3.9.le.norm.bias",
      "layer3.9.conv_in.norm.weight",
      "layer3.9.conv_in.norm.bias",
      "layer3.9.conv.norm.weight",
      "layer3.9.conv.norm.bias",
      "layer3.9.conv_out.norm.weight",
      "layer3.9.conv_out.norm.bias",
      "layer3.10.le.norm.weight",
      "layer3.10.le.norm.bias",
      "layer3.10.conv_in.norm.weight",
      "layer3.10.conv_in.norm.bias",
      "layer3.10.conv.norm.weight",
      "layer3.10.conv.norm.bias",
      "layer3.10.conv_out.norm.weight",
      "layer3.10.conv_out.norm.bias",
      "layer3.11.le.norm.weight",
      "layer3.11.le.norm.bias",
      "layer3.11.conv_in.norm.weight",
      "layer3.11.conv_in.norm.bias",
      "layer3.11.conv.norm.weight",
      "layer3.11.conv.norm.bias",
      "layer3.11.conv_out.norm.weight",
      "layer3.11.conv_out.norm.bias",
      "layer3.12.le.norm.weight",
      "layer3.12.le.norm.bias",
      "layer3.12.conv_in.norm.weight",
      "layer3.12.conv_in.norm.bias",
      "layer3.12.conv.norm.weight",
      "layer3.12.conv.norm.bias",
      "layer3.12.conv_out.norm.weight",
      "layer3.12.conv_out.norm.bias",
      "layer3.13.le.norm.weight",
      "layer3.13.le.norm.bias",
      "layer3.13.conv_in.norm.weight",
      "layer3.13.conv_in.norm.bias",
      "layer3.13.conv.norm.weight",
      "layer3.13.conv.norm.bias",
      "layer3.13.conv_out.norm.weight",
      "layer3.13.conv_out.norm.bias",
      "layer3.14.le.norm.weight",
      "layer3.14.le.norm.bias",
      "layer3.14.conv_in.norm.weight",
      "layer3.14.conv_in.norm.bias",
      "layer3.14.conv.norm.weight",
      "layer3.14.conv.norm.bias",
      "layer3.14.conv_out.norm.weight",
      "layer3.14.conv_out.norm.bias",
      "layer3.15.le.norm.weight",
      "layer3.15.le.norm.bias",
      "layer3.15.conv_in.norm.weight",
      "layer3.15.conv_in.norm.bias",
      "layer3.15.conv.norm.weight",
      "layer3.15.conv.norm.bias",
      "layer3.15.conv_out.norm.weight",
      "layer3.15.conv_out.norm.bias",
      "layer3.16.le.norm.weight",
      "layer3.16.le.norm.bias",
      "layer3.16.conv_in.norm.weight",
      "layer3.16.conv_in.norm.bias",
      "layer3.16.conv.norm.weight",
      "layer3.16.conv.norm.bias",
      "layer3.16.conv_out.norm.weight",
      "layer3.16.conv_out.norm.bias",
      "layer3.17.le.norm.weight",
      "layer3.17.le.norm.bias",
      "layer3.17.conv_in.norm.weight",
      "layer3.17.conv_in.norm.bias",
      "layer3.17.conv.norm.weight",
      "layer3.17.conv.norm.bias",
      "layer3.17.conv_out.norm.weight",
      "layer3.17.conv_out.norm.bias",
      "layer3.18.le.norm.weight",
      "layer3.18.le.norm.bias",
      "layer3.18.conv_in.norm.weight",
      "layer3.18.conv_in.norm.bias",
      "layer3.18.conv.norm.weight",
      "layer3.18.conv.norm.bias",
      "layer3.18.conv_out.norm.weight",
      "layer3.18.conv_out.norm.bias",
      "layer3.19.le.norm.weight",
      "layer3.19.le.norm.bias",
      "layer3.19.conv_in.norm.weight",
      "layer3.19.conv_in.norm.bias",
      "layer3.19.conv.norm.weight",
      "layer3.19.conv.norm.bias",
      "layer3.19.conv_out.norm.weight",
      "layer3.19.conv_out.norm.bias",
      "layer4.0.ln.weight",
      "layer4.0.ln.bias",
      "layer4.0.le.norm.weight",
      "layer4.0.le.norm.bias",
      "layer4.0.conv_in.norm.weight",
      "layer4.0.conv_in.norm.bias",
      "layer4.0.conv.norm.weight",
      "layer4.0.conv.norm.bias",
      "layer4.0.conv_out.norm.weight",
      "layer4.0.conv_out.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.le.norm.weight",
      "layer4.1.le.norm.bias",
      "layer4.1.conv_in.norm.weight",
      "layer4.1.conv_in.norm.bias",
      "layer4.1.conv.norm.weight",
      "layer4.1.conv.norm.bias",
      "layer4.1.conv_out.norm.weight",
      "layer4.1.conv_out.norm.bias",
      "layer4.2.le.norm.weight",
      "layer4.2.le.norm.bias",
      "layer4.2.conv_in.norm.weight",
      "layer4.2.conv_in.norm.bias",
      "layer4.2.conv.norm.weight",
      "layer4.2.conv.norm.bias",
      "layer4.2.conv_out.norm.weight",
      "layer4.2.conv_out.norm.bias",
      "layer4.3.le.norm.weight",
      "layer4.3.le.norm.bias",
      "layer4.3.conv_in.norm.weight",
      "layer4.3.conv_in.norm.bias",
      "layer4.3.conv.norm.weight",
      "layer4.3.conv.norm.bias",
      "layer4.3.conv_out.norm.weight",
      "layer4.3.conv_out.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: checkpoint_tiny_2.4G/checkpoint-215.pth
Resume checkpoint checkpoint_tiny_2.4G/checkpoint-215.pth
With optim & sched!
Start training for 300 epochs
Epoch: [216]  [   0/1251]  eta: 4:08:32  lr: 0.000825  min_lr: 0.000825  loss: 4.0641 (4.0641)  weight_decay: 0.0500 (0.0500)  time: 11.9208  data: 3.0878  max mem: 19097
Epoch: [216]  [ 200/1251]  eta: 0:05:33  lr: 0.000822  min_lr: 0.000822  loss: 3.3433 (2.9994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8808 (0.8787)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [216]  [ 400/1251]  eta: 0:04:04  lr: 0.000819  min_lr: 0.000819  loss: 3.0153 (2.9648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9055 (0.9220)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [216]  [ 600/1251]  eta: 0:03:01  lr: 0.000817  min_lr: 0.000817  loss: 2.6037 (2.9524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8914 (0.9360)  time: 0.2596  data: 0.0005  max mem: 19097
Epoch: [216]  [ 800/1251]  eta: 0:02:03  lr: 0.000814  min_lr: 0.000814  loss: 2.7469 (2.9454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9228 (0.9472)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [216]  [1000/1251]  eta: 0:01:07  lr: 0.000811  min_lr: 0.000811  loss: 3.1905 (2.9462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8779 (inf)  time: 0.2584  data: 0.0005  max mem: 19097
Epoch: [216]  [1200/1251]  eta: 0:00:13  lr: 0.000808  min_lr: 0.000808  loss: 3.4270 (2.9460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9442 (inf)  time: 0.2592  data: 0.0005  max mem: 19097
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 3.3407 (2.9582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (inf)  time: 0.2183  data: 0.0006  max mem: 19097
Epoch: [216] Total time: 0:05:35 (0.2682 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 3.3407 (2.9580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9215 (inf)
Test:  [ 0/25]  eta: 0:04:28  loss: 0.7118 (0.7118)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 10.7256  data: 7.2289  max mem: 19097
Test:  [10/25]  eta: 0:00:16  loss: 0.8787 (0.9041)  acc1: 84.8000 (83.8545)  acc5: 97.6000 (97.3455)  time: 1.0829  data: 0.6574  max mem: 19097
Test:  [20/25]  eta: 0:00:03  loss: 1.0740 (1.0522)  acc1: 78.8000 (80.3810)  acc5: 95.6000 (95.7333)  time: 0.1186  data: 0.0002  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.1165 (1.0609)  acc1: 78.0000 (79.9200)  acc5: 94.8000 (95.6160)  time: 0.1185  data: 0.0001  max mem: 19097
Test: Total time: 0:00:13 (0.5456 s / it)
* Acc@1 80.672 Acc@5 95.520 loss 1.049
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.67%
Epoch: [217]  [   0/1251]  eta: 0:59:21  lr: 0.000807  min_lr: 0.000807  loss: 2.0386 (2.0386)  weight_decay: 0.0500 (0.0500)  time: 2.8466  data: 2.4425  max mem: 19097
Epoch: [217]  [ 200/1251]  eta: 0:04:45  lr: 0.000804  min_lr: 0.000804  loss: 3.2090 (2.9726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0300 (1.0127)  time: 0.2561  data: 0.0004  max mem: 19097
Epoch: [217]  [ 400/1251]  eta: 0:03:45  lr: 0.000801  min_lr: 0.000801  loss: 2.9859 (2.9697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9425 (1.0005)  time: 0.2576  data: 0.0003  max mem: 19097
Epoch: [217]  [ 600/1251]  eta: 0:02:50  lr: 0.000799  min_lr: 0.000799  loss: 3.2400 (2.9609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9778 (1.0043)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [217]  [ 800/1251]  eta: 0:01:57  lr: 0.000796  min_lr: 0.000796  loss: 3.0287 (2.9456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0242 (1.0015)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [217]  [1000/1251]  eta: 0:01:05  lr: 0.000793  min_lr: 0.000793  loss: 2.8541 (2.9533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0531 (1.0076)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [217]  [1200/1251]  eta: 0:00:13  lr: 0.000790  min_lr: 0.000790  loss: 3.1327 (2.9503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8861 (1.0032)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 2.2888 (2.9445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9280 (1.0018)  time: 0.2163  data: 0.0005  max mem: 19097
Epoch: [217] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 2.2888 (2.9600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9280 (1.0018)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.5766 (0.5766)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 6.1288  data: 5.9714  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7729 (0.7855)  acc1: 85.6000 (83.7091)  acc5: 97.6000 (97.2364)  time: 0.7622  data: 0.6364  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9991 (0.9292)  acc1: 78.0000 (80.2667)  acc5: 95.2000 (95.7333)  time: 0.1948  data: 0.0742  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0150 (0.9379)  acc1: 78.0000 (79.9680)  acc5: 94.8000 (95.6000)  time: 0.1936  data: 0.0742  max mem: 19097
Test: Total time: 0:00:10 (0.4236 s / it)
* Acc@1 80.720 Acc@5 95.636 loss 0.922
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.72%
Epoch: [218]  [   0/1251]  eta: 0:54:16  lr: 0.000789  min_lr: 0.000789  loss: 2.3137 (2.3137)  weight_decay: 0.0500 (0.0500)  time: 2.6027  data: 2.2976  max mem: 19097
Epoch: [218]  [ 200/1251]  eta: 0:04:44  lr: 0.000786  min_lr: 0.000786  loss: 3.2387 (3.0166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9458 (1.0384)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [218]  [ 400/1251]  eta: 0:03:44  lr: 0.000784  min_lr: 0.000784  loss: 2.9740 (2.9793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9871 (1.0320)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [218]  [ 600/1251]  eta: 0:02:50  lr: 0.000781  min_lr: 0.000781  loss: 2.7226 (2.9745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9444 (1.0301)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [218]  [ 800/1251]  eta: 0:01:57  lr: 0.000778  min_lr: 0.000778  loss: 2.8391 (2.9711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9784 (1.0089)  time: 0.2588  data: 0.0004  max mem: 19097
Epoch: [218]  [1000/1251]  eta: 0:01:05  lr: 0.000775  min_lr: 0.000775  loss: 3.4477 (2.9709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9093 (1.0032)  time: 0.2566  data: 0.0003  max mem: 19097
Epoch: [218]  [1200/1251]  eta: 0:00:13  lr: 0.000772  min_lr: 0.000772  loss: 3.2568 (2.9605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9939 (0.9992)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 3.1331 (2.9548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0048 (1.0032)  time: 0.2167  data: 0.0005  max mem: 19097
Epoch: [218] Total time: 0:05:24 (0.2597 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 3.1331 (2.9479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0048 (1.0032)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6561 (0.6561)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.7436  data: 5.6013  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8179 (0.8646)  acc1: 85.2000 (83.7818)  acc5: 97.6000 (97.3091)  time: 0.7516  data: 0.6283  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0293 (1.0059)  acc1: 77.6000 (80.5143)  acc5: 95.6000 (95.8667)  time: 0.2145  data: 0.0945  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0806 (1.0141)  acc1: 78.4000 (80.2560)  acc5: 95.2000 (95.7280)  time: 0.2142  data: 0.0944  max mem: 19097
Test: Total time: 0:00:10 (0.4232 s / it)
* Acc@1 80.620 Acc@5 95.572 loss 1.007
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.72%
Epoch: [219]  [   0/1251]  eta: 1:08:24  lr: 0.000771  min_lr: 0.000771  loss: 1.9047 (1.9047)  weight_decay: 0.0500 (0.0500)  time: 3.2810  data: 2.3332  max mem: 19097
Epoch: [219]  [ 200/1251]  eta: 0:04:47  lr: 0.000769  min_lr: 0.000769  loss: 3.2639 (2.9318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9386 (0.9962)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [219]  [ 400/1251]  eta: 0:03:46  lr: 0.000766  min_lr: 0.000766  loss: 2.6537 (2.9038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0110 (1.0201)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [219]  [ 600/1251]  eta: 0:02:51  lr: 0.000763  min_lr: 0.000763  loss: 2.8643 (2.8965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9434 (1.0185)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [219]  [ 800/1251]  eta: 0:01:58  lr: 0.000760  min_lr: 0.000760  loss: 3.0990 (2.9188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9416 (1.0423)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [219]  [1000/1251]  eta: 0:01:05  lr: 0.000757  min_lr: 0.000757  loss: 2.3407 (2.9273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9633 (1.0503)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [219]  [1200/1251]  eta: 0:00:13  lr: 0.000755  min_lr: 0.000755  loss: 3.0753 (2.9235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9668 (1.0397)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 3.2320 (2.9275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (1.0395)  time: 0.2167  data: 0.0007  max mem: 19097
Epoch: [219] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 3.2320 (2.9381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (1.0395)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7018 (0.7018)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.5974  data: 5.4548  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8517 (0.8727)  acc1: 84.4000 (83.1636)  acc5: 97.2000 (97.1273)  time: 0.7286  data: 0.6003  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0178 (1.0131)  acc1: 77.2000 (80.0571)  acc5: 95.6000 (95.5619)  time: 0.2137  data: 0.0904  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0918 (1.0212)  acc1: 77.6000 (79.7600)  acc5: 94.8000 (95.5520)  time: 0.2121  data: 0.0905  max mem: 19097
Test: Total time: 0:00:10 (0.4169 s / it)
* Acc@1 80.556 Acc@5 95.514 loss 1.010
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.72%
Epoch: [220]  [   0/1251]  eta: 1:09:29  lr: 0.000754  min_lr: 0.000754  loss: 3.2332 (3.2332)  weight_decay: 0.0500 (0.0500)  time: 3.3327  data: 1.7211  max mem: 19097
Epoch: [220]  [ 200/1251]  eta: 0:04:46  lr: 0.000751  min_lr: 0.000751  loss: 3.2341 (2.8994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0379 (0.9995)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [220]  [ 400/1251]  eta: 0:03:45  lr: 0.000748  min_lr: 0.000748  loss: 2.6851 (2.8674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0506 (1.0245)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [220]  [ 600/1251]  eta: 0:02:51  lr: 0.000745  min_lr: 0.000745  loss: 3.1401 (2.8976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9420 (1.0170)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [220]  [ 800/1251]  eta: 0:01:57  lr: 0.000743  min_lr: 0.000743  loss: 3.3284 (2.9165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9559 (1.0038)  time: 0.2578  data: 0.0004  max mem: 19097
Epoch: [220]  [1000/1251]  eta: 0:01:05  lr: 0.000740  min_lr: 0.000740  loss: 2.8177 (2.9299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0057 (1.0121)  time: 0.2578  data: 0.0004  max mem: 19097
Epoch: [220]  [1200/1251]  eta: 0:00:13  lr: 0.000737  min_lr: 0.000737  loss: 3.2911 (2.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9619 (1.0137)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.9556 (2.9388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9103 (1.0116)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [220] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.9556 (2.9507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9103 (1.0116)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6536 (0.6536)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 5.5858  data: 5.4287  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7904 (0.8442)  acc1: 84.8000 (83.5636)  acc5: 97.2000 (97.1636)  time: 0.7203  data: 0.5928  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0011 (0.9771)  acc1: 77.2000 (80.1714)  acc5: 95.2000 (95.8095)  time: 0.1984  data: 0.0719  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0357 (0.9848)  acc1: 78.4000 (80.0000)  acc5: 95.2000 (95.6480)  time: 0.1972  data: 0.0718  max mem: 19097
Test: Total time: 0:00:10 (0.4050 s / it)
* Acc@1 80.832 Acc@5 95.618 loss 0.976
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.83%
Epoch: [221]  [   0/1251]  eta: 1:08:37  lr: 0.000736  min_lr: 0.000736  loss: 2.7417 (2.7417)  weight_decay: 0.0500 (0.0500)  time: 3.2913  data: 3.0287  max mem: 19097
Epoch: [221]  [ 200/1251]  eta: 0:04:46  lr: 0.000734  min_lr: 0.000734  loss: 2.7350 (2.9527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9877 (1.0347)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [221]  [ 400/1251]  eta: 0:03:46  lr: 0.000731  min_lr: 0.000731  loss: 2.9894 (2.9490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0195 (1.0539)  time: 0.2661  data: 0.0004  max mem: 19097
Epoch: [221]  [ 600/1251]  eta: 0:02:51  lr: 0.000728  min_lr: 0.000728  loss: 3.0254 (2.9292)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0620 (1.0716)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [221]  [ 800/1251]  eta: 0:01:57  lr: 0.000725  min_lr: 0.000725  loss: 2.7095 (2.9409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9939 (1.0564)  time: 0.2561  data: 0.0004  max mem: 19097
Epoch: [221]  [1000/1251]  eta: 0:01:05  lr: 0.000722  min_lr: 0.000722  loss: 3.1903 (2.9582)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0045 (1.0540)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [221]  [1200/1251]  eta: 0:00:13  lr: 0.000720  min_lr: 0.000720  loss: 3.1312 (2.9516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0345 (1.0609)  time: 0.2578  data: 0.0004  max mem: 19097
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.8864 (2.9523)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0190 (1.0603)  time: 0.2163  data: 0.0005  max mem: 19097
Epoch: [221] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.8864 (2.9485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0190 (1.0603)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6647 (0.6647)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.5930  data: 5.4264  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8831 (0.8698)  acc1: 84.4000 (84.2545)  acc5: 97.6000 (97.3818)  time: 0.7614  data: 0.6355  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0131 (1.0212)  acc1: 78.0000 (80.5143)  acc5: 95.2000 (95.7143)  time: 0.2132  data: 0.0925  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0556 (1.0280)  acc1: 78.0000 (80.3520)  acc5: 95.2000 (95.6960)  time: 0.2120  data: 0.0924  max mem: 19097
Test: Total time: 0:00:10 (0.4164 s / it)
* Acc@1 80.816 Acc@5 95.582 loss 1.021
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.83%
Epoch: [222]  [   0/1251]  eta: 1:02:04  lr: 0.000719  min_lr: 0.000719  loss: 2.8807 (2.8807)  weight_decay: 0.0500 (0.0500)  time: 2.9771  data: 1.9630  max mem: 19097
Epoch: [222]  [ 200/1251]  eta: 0:04:46  lr: 0.000716  min_lr: 0.000716  loss: 3.1968 (2.9517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (1.0797)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [222]  [ 400/1251]  eta: 0:03:46  lr: 0.000714  min_lr: 0.000714  loss: 2.8465 (2.9532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (1.0465)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [222]  [ 600/1251]  eta: 0:02:51  lr: 0.000711  min_lr: 0.000711  loss: 3.1673 (2.9464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0140 (1.0422)  time: 0.2575  data: 0.0005  max mem: 19097
Epoch: [222]  [ 800/1251]  eta: 0:01:58  lr: 0.000708  min_lr: 0.000708  loss: 3.4004 (2.9588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0599 (1.0636)  time: 0.2562  data: 0.0003  max mem: 19097
Epoch: [222]  [1000/1251]  eta: 0:01:05  lr: 0.000705  min_lr: 0.000705  loss: 3.2594 (2.9473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9803 (1.0614)  time: 0.2561  data: 0.0004  max mem: 19097
Epoch: [222]  [1200/1251]  eta: 0:00:13  lr: 0.000703  min_lr: 0.000703  loss: 3.4785 (2.9481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9468 (1.0452)  time: 0.2568  data: 0.0005  max mem: 19097
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.8665 (2.9443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0680 (1.0484)  time: 0.2168  data: 0.0007  max mem: 19097
Epoch: [222] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.8665 (2.9373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0680 (1.0484)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5754 (0.5754)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.3993  data: 5.2266  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8148 (0.7946)  acc1: 83.6000 (83.8909)  acc5: 97.2000 (97.1273)  time: 0.6730  data: 0.5467  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9980 (0.9358)  acc1: 77.6000 (80.3048)  acc5: 95.2000 (95.7524)  time: 0.1902  data: 0.0675  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0130 (0.9477)  acc1: 77.6000 (80.1120)  acc5: 94.4000 (95.5040)  time: 0.2030  data: 0.0816  max mem: 19097
Test: Total time: 0:00:10 (0.4017 s / it)
* Acc@1 80.760 Acc@5 95.558 loss 0.939
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.83%
Epoch: [223]  [   0/1251]  eta: 1:05:09  lr: 0.000702  min_lr: 0.000702  loss: 3.2019 (3.2019)  weight_decay: 0.0500 (0.0500)  time: 3.1253  data: 2.5598  max mem: 19097
Epoch: [223]  [ 200/1251]  eta: 0:04:47  lr: 0.000699  min_lr: 0.000699  loss: 2.4878 (2.8504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (1.0043)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [223]  [ 400/1251]  eta: 0:03:45  lr: 0.000696  min_lr: 0.000696  loss: 3.2265 (2.9291)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [223]  [ 600/1251]  eta: 0:02:51  lr: 0.000694  min_lr: 0.000694  loss: 3.2220 (2.9169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9671 (nan)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [223]  [ 800/1251]  eta: 0:01:58  lr: 0.000691  min_lr: 0.000691  loss: 2.4537 (2.9189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0628 (nan)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [223]  [1000/1251]  eta: 0:01:05  lr: 0.000688  min_lr: 0.000688  loss: 3.1183 (2.9258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9718 (nan)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [223]  [1200/1251]  eta: 0:00:13  lr: 0.000686  min_lr: 0.000686  loss: 3.2229 (2.9221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0996 (nan)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 2.7373 (2.9207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (nan)  time: 0.2164  data: 0.0006  max mem: 19097
Epoch: [223] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 2.7373 (2.9379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5478 (0.5478)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.7547  data: 5.6134  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7904 (0.7897)  acc1: 85.2000 (84.3273)  acc5: 97.2000 (97.2727)  time: 0.7543  data: 0.6316  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9866 (0.9379)  acc1: 78.4000 (80.7619)  acc5: 95.2000 (95.8095)  time: 0.2034  data: 0.0835  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0431 (0.9517)  acc1: 78.4000 (80.3680)  acc5: 94.8000 (95.6160)  time: 0.2064  data: 0.0875  max mem: 19097
Test: Total time: 0:00:10 (0.4196 s / it)
* Acc@1 80.772 Acc@5 95.532 loss 0.945
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.83%
Epoch: [224]  [   0/1251]  eta: 1:03:34  lr: 0.000685  min_lr: 0.000685  loss: 3.3616 (3.3616)  weight_decay: 0.0500 (0.0500)  time: 3.0491  data: 1.8198  max mem: 19097
Epoch: [224]  [ 200/1251]  eta: 0:04:47  lr: 0.000682  min_lr: 0.000682  loss: 3.2207 (2.9923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1168 (1.0613)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [224]  [ 400/1251]  eta: 0:03:45  lr: 0.000680  min_lr: 0.000680  loss: 3.0698 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1140 (1.0824)  time: 0.2583  data: 0.0004  max mem: 19097
Epoch: [224]  [ 600/1251]  eta: 0:02:51  lr: 0.000677  min_lr: 0.000677  loss: 3.1793 (2.9386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0726 (1.0837)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [224]  [ 800/1251]  eta: 0:01:58  lr: 0.000674  min_lr: 0.000674  loss: 2.8556 (2.9459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9717 (1.0687)  time: 0.2658  data: 0.0004  max mem: 19097
Epoch: [224]  [1000/1251]  eta: 0:01:05  lr: 0.000671  min_lr: 0.000671  loss: 3.2895 (2.9373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9850 (1.0833)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [224]  [1200/1251]  eta: 0:00:13  lr: 0.000669  min_lr: 0.000669  loss: 2.9230 (2.9399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0047 (1.0786)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 3.2481 (2.9416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1120 (1.0805)  time: 0.2170  data: 0.0010  max mem: 19097
Epoch: [224] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 3.2481 (2.9188)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1120 (1.0805)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6555 (0.6555)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.6795  data: 5.5046  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8417 (0.8466)  acc1: 85.2000 (83.7091)  acc5: 97.2000 (97.2000)  time: 0.7327  data: 0.6051  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0418 (0.9890)  acc1: 78.0000 (80.4191)  acc5: 95.2000 (95.5619)  time: 0.1992  data: 0.0785  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0728 (0.9971)  acc1: 78.0000 (80.2080)  acc5: 94.8000 (95.5360)  time: 0.1982  data: 0.0784  max mem: 19097
Test: Total time: 0:00:10 (0.4093 s / it)
* Acc@1 80.620 Acc@5 95.586 loss 0.986
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.83%
Epoch: [225]  [   0/1251]  eta: 1:03:23  lr: 0.000668  min_lr: 0.000668  loss: 3.3007 (3.3007)  weight_decay: 0.0500 (0.0500)  time: 3.0407  data: 1.6901  max mem: 19097
Epoch: [225]  [ 200/1251]  eta: 0:04:46  lr: 0.000665  min_lr: 0.000665  loss: 2.3399 (2.9500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9293 (1.0201)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [225]  [ 400/1251]  eta: 0:03:46  lr: 0.000663  min_lr: 0.000663  loss: 2.6314 (2.9380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9135 (1.0155)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [225]  [ 600/1251]  eta: 0:02:51  lr: 0.000660  min_lr: 0.000660  loss: 3.2537 (2.9327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0870 (1.0467)  time: 0.2588  data: 0.0003  max mem: 19097
Epoch: [225]  [ 800/1251]  eta: 0:01:58  lr: 0.000657  min_lr: 0.000657  loss: 2.8799 (2.9373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9926 (1.0455)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [225]  [1000/1251]  eta: 0:01:05  lr: 0.000655  min_lr: 0.000655  loss: 2.9450 (2.9306)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0772 (1.0459)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [225]  [1200/1251]  eta: 0:00:13  lr: 0.000652  min_lr: 0.000652  loss: 2.8703 (2.9460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0600 (1.0533)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 3.2810 (2.9459)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0322 (1.0527)  time: 0.2165  data: 0.0006  max mem: 19097
Epoch: [225] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 3.2810 (2.9351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0322 (1.0527)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5960 (0.5960)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.6190  data: 5.4834  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8241 (0.7972)  acc1: 83.6000 (84.0364)  acc5: 96.8000 (97.2364)  time: 0.7316  data: 0.6090  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9941 (0.9489)  acc1: 78.4000 (80.6476)  acc5: 95.2000 (95.5429)  time: 0.1978  data: 0.0778  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0552 (0.9603)  acc1: 78.4000 (80.4000)  acc5: 94.4000 (95.3120)  time: 0.2028  data: 0.0830  max mem: 19097
Test: Total time: 0:00:10 (0.4100 s / it)
* Acc@1 80.992 Acc@5 95.662 loss 0.945
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 80.99%
Epoch: [226]  [   0/1251]  eta: 1:07:05  lr: 0.000651  min_lr: 0.000651  loss: 2.4075 (2.4075)  weight_decay: 0.0500 (0.0500)  time: 3.2177  data: 2.9283  max mem: 19097
Epoch: [226]  [ 200/1251]  eta: 0:04:46  lr: 0.000649  min_lr: 0.000649  loss: 3.2807 (2.9432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (1.0554)  time: 0.2592  data: 0.0004  max mem: 19097
Epoch: [226]  [ 400/1251]  eta: 0:03:45  lr: 0.000646  min_lr: 0.000646  loss: 3.1409 (2.9118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1323 (1.0789)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [226]  [ 600/1251]  eta: 0:02:50  lr: 0.000644  min_lr: 0.000644  loss: 3.2194 (2.9259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1107 (1.0955)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [226]  [ 800/1251]  eta: 0:01:57  lr: 0.000641  min_lr: 0.000641  loss: 2.9899 (2.9377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0127 (1.0828)  time: 0.2577  data: 0.0005  max mem: 19097
Epoch: [226]  [1000/1251]  eta: 0:01:05  lr: 0.000638  min_lr: 0.000638  loss: 2.4690 (2.9302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9892 (1.0779)  time: 0.2593  data: 0.0004  max mem: 19097
Epoch: [226]  [1200/1251]  eta: 0:00:13  lr: 0.000636  min_lr: 0.000636  loss: 2.9821 (2.9247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2330 (1.0965)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 3.2602 (2.9242)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2315 (1.1006)  time: 0.2172  data: 0.0006  max mem: 19097
Epoch: [226] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 3.2602 (2.9190)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2315 (1.1006)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6567 (0.6567)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.6279  data: 5.4891  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8737 (0.8544)  acc1: 84.4000 (83.7818)  acc5: 97.6000 (97.5273)  time: 0.7465  data: 0.6143  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0523 (1.0042)  acc1: 77.2000 (80.2286)  acc5: 95.6000 (95.9810)  time: 0.2069  data: 0.0816  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0891 (1.0144)  acc1: 78.4000 (80.0160)  acc5: 94.8000 (95.7440)  time: 0.2040  data: 0.0815  max mem: 19097
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 80.814 Acc@5 95.570 loss 1.006
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.99%
Epoch: [227]  [   0/1251]  eta: 1:06:49  lr: 0.000635  min_lr: 0.000635  loss: 3.6898 (3.6898)  weight_decay: 0.0500 (0.0500)  time: 3.2048  data: 1.9377  max mem: 19097
Epoch: [227]  [ 200/1251]  eta: 0:04:47  lr: 0.000632  min_lr: 0.000632  loss: 2.9924 (2.9312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9990 (1.0398)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [227]  [ 400/1251]  eta: 0:03:46  lr: 0.000630  min_lr: 0.000630  loss: 3.4432 (2.9508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0789 (1.0738)  time: 0.2638  data: 0.0004  max mem: 19097
Epoch: [227]  [ 600/1251]  eta: 0:02:51  lr: 0.000627  min_lr: 0.000627  loss: 3.0540 (2.9326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1235 (1.0689)  time: 0.2568  data: 0.0003  max mem: 19097
Epoch: [227]  [ 800/1251]  eta: 0:01:58  lr: 0.000625  min_lr: 0.000625  loss: 2.9168 (2.9296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1217 (1.0798)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [227]  [1000/1251]  eta: 0:01:05  lr: 0.000622  min_lr: 0.000622  loss: 3.2012 (2.9284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0039 (1.0796)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [227]  [1200/1251]  eta: 0:00:13  lr: 0.000619  min_lr: 0.000619  loss: 3.1899 (2.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1202 (1.0896)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 3.1410 (2.9326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0166 (1.0884)  time: 0.2165  data: 0.0006  max mem: 19097
Epoch: [227] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 3.1410 (2.9157)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0166 (1.0884)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6209 (0.6209)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.4323  data: 5.2882  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8465 (0.8086)  acc1: 84.4000 (83.8182)  acc5: 98.0000 (97.4546)  time: 0.7426  data: 0.6180  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9577 (0.9480)  acc1: 78.4000 (80.4762)  acc5: 95.6000 (95.9238)  time: 0.2051  data: 0.0845  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0704 (0.9596)  acc1: 78.4000 (80.0640)  acc5: 94.8000 (95.8400)  time: 0.2128  data: 0.0931  max mem: 19097
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 80.846 Acc@5 95.634 loss 0.949
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.99%
Epoch: [228]  [   0/1251]  eta: 1:08:44  lr: 0.000619  min_lr: 0.000619  loss: 3.7062 (3.7062)  weight_decay: 0.0500 (0.0500)  time: 3.2966  data: 2.5990  max mem: 19097
Epoch: [228]  [ 200/1251]  eta: 0:04:49  lr: 0.000616  min_lr: 0.000616  loss: 2.7462 (2.8410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0401 (1.0926)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [228]  [ 400/1251]  eta: 0:03:47  lr: 0.000614  min_lr: 0.000614  loss: 3.1201 (2.8817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0773 (1.1207)  time: 0.2574  data: 0.0003  max mem: 19097
Epoch: [228]  [ 600/1251]  eta: 0:02:51  lr: 0.000611  min_lr: 0.000611  loss: 2.7843 (2.8610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0038 (1.0864)  time: 0.2574  data: 0.0005  max mem: 19097
Epoch: [228]  [ 800/1251]  eta: 0:01:58  lr: 0.000608  min_lr: 0.000608  loss: 2.3740 (2.8706)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0661 (1.0891)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [228]  [1000/1251]  eta: 0:01:05  lr: 0.000606  min_lr: 0.000606  loss: 3.0700 (2.8766)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0261 (1.0908)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [228]  [1200/1251]  eta: 0:00:13  lr: 0.000603  min_lr: 0.000603  loss: 2.8794 (2.8782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0715 (1.0922)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 2.8424 (2.8780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0233 (1.0906)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [228] Total time: 0:05:26 (0.2611 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 2.8424 (2.9105)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0233 (1.0906)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6143 (0.6143)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.7838  data: 5.6385  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7978 (0.7910)  acc1: 84.4000 (84.4727)  acc5: 97.6000 (97.3818)  time: 0.6772  data: 0.5525  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9685 (0.9417)  acc1: 78.4000 (80.7238)  acc5: 96.0000 (95.9619)  time: 0.1689  data: 0.0477  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0464 (0.9576)  acc1: 78.4000 (80.4000)  acc5: 95.2000 (95.8240)  time: 0.2021  data: 0.0818  max mem: 19097
Test: Total time: 0:00:10 (0.4170 s / it)
* Acc@1 81.054 Acc@5 95.738 loss 0.950
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.05%
Epoch: [229]  [   0/1251]  eta: 0:56:04  lr: 0.000603  min_lr: 0.000603  loss: 1.8481 (1.8481)  weight_decay: 0.0500 (0.0500)  time: 2.6892  data: 2.3931  max mem: 19097
Epoch: [229]  [ 200/1251]  eta: 0:04:44  lr: 0.000600  min_lr: 0.000600  loss: 2.8391 (2.9462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1208 (1.1021)  time: 0.2559  data: 0.0003  max mem: 19097
Epoch: [229]  [ 400/1251]  eta: 0:03:45  lr: 0.000597  min_lr: 0.000597  loss: 3.0075 (2.9129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0996 (1.1331)  time: 0.2561  data: 0.0003  max mem: 19097
Epoch: [229]  [ 600/1251]  eta: 0:02:50  lr: 0.000595  min_lr: 0.000595  loss: 3.1899 (2.9044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0900 (1.1225)  time: 0.2562  data: 0.0003  max mem: 19097
Epoch: [229]  [ 800/1251]  eta: 0:01:57  lr: 0.000592  min_lr: 0.000592  loss: 3.2656 (2.9102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0993 (1.1299)  time: 0.2584  data: 0.0004  max mem: 19097
Epoch: [229]  [1000/1251]  eta: 0:01:05  lr: 0.000590  min_lr: 0.000590  loss: 2.9346 (2.9204)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0983 (nan)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [229]  [1200/1251]  eta: 0:00:13  lr: 0.000587  min_lr: 0.000587  loss: 2.7615 (2.9162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0171 (nan)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.4578 (2.9060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0387 (nan)  time: 0.2173  data: 0.0007  max mem: 19097
Epoch: [229] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.4578 (2.9142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0387 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5650 (0.5650)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.4839  data: 5.3129  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7493 (0.7762)  acc1: 85.2000 (84.1091)  acc5: 97.6000 (97.1636)  time: 0.6721  data: 0.5462  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9691 (0.9238)  acc1: 78.8000 (80.6476)  acc5: 95.6000 (95.8286)  time: 0.1901  data: 0.0702  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9783 (0.9324)  acc1: 78.8000 (80.3360)  acc5: 95.6000 (95.8560)  time: 0.2193  data: 0.0999  max mem: 19097
Test: Total time: 0:00:10 (0.4174 s / it)
* Acc@1 80.970 Acc@5 95.722 loss 0.923
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.05%
Epoch: [230]  [   0/1251]  eta: 1:09:02  lr: 0.000587  min_lr: 0.000587  loss: 3.2962 (3.2962)  weight_decay: 0.0500 (0.0500)  time: 3.3115  data: 1.7458  max mem: 19097
Epoch: [230]  [ 200/1251]  eta: 0:04:47  lr: 0.000584  min_lr: 0.000584  loss: 2.9895 (2.9255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1422 (1.1399)  time: 0.2562  data: 0.0003  max mem: 19097
Epoch: [230]  [ 400/1251]  eta: 0:03:45  lr: 0.000582  min_lr: 0.000582  loss: 2.6645 (2.9291)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0652 (1.0969)  time: 0.2560  data: 0.0003  max mem: 19097
Epoch: [230]  [ 600/1251]  eta: 0:02:51  lr: 0.000579  min_lr: 0.000579  loss: 2.2744 (2.9011)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0687 (1.1109)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [230]  [ 800/1251]  eta: 0:01:57  lr: 0.000577  min_lr: 0.000577  loss: 3.0225 (2.9143)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2008 (1.1237)  time: 0.2561  data: 0.0004  max mem: 19097
Epoch: [230]  [1000/1251]  eta: 0:01:05  lr: 0.000574  min_lr: 0.000574  loss: 2.8311 (2.9003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9838 (1.1123)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [230]  [1200/1251]  eta: 0:00:13  lr: 0.000571  min_lr: 0.000571  loss: 2.9599 (2.8956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1705 (1.1106)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 2.8709 (2.8959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.1093)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [230] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 2.8709 (2.9026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.1093)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6119 (0.6119)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 5.5495  data: 5.4041  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7531 (0.7885)  acc1: 84.0000 (83.9636)  acc5: 97.6000 (97.4546)  time: 0.7240  data: 0.5994  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9687 (0.9337)  acc1: 78.4000 (80.6286)  acc5: 95.2000 (95.8857)  time: 0.1955  data: 0.0750  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9729 (0.9416)  acc1: 78.4000 (80.4480)  acc5: 95.2000 (95.7440)  time: 0.1954  data: 0.0749  max mem: 19097
Test: Total time: 0:00:10 (0.4012 s / it)
* Acc@1 80.862 Acc@5 95.702 loss 0.935
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.05%
Epoch: [231]  [   0/1251]  eta: 1:04:13  lr: 0.000571  min_lr: 0.000571  loss: 2.0894 (2.0894)  weight_decay: 0.0500 (0.0500)  time: 3.0807  data: 2.7753  max mem: 19097
Epoch: [231]  [ 200/1251]  eta: 0:04:47  lr: 0.000568  min_lr: 0.000568  loss: 3.0377 (2.9196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9751 (1.0156)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [231]  [ 400/1251]  eta: 0:03:46  lr: 0.000566  min_lr: 0.000566  loss: 3.1169 (2.8750)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0163 (1.0472)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [231]  [ 600/1251]  eta: 0:02:50  lr: 0.000563  min_lr: 0.000563  loss: 3.3373 (2.8932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0659 (1.0653)  time: 0.2554  data: 0.0004  max mem: 19097
Epoch: [231]  [ 800/1251]  eta: 0:01:57  lr: 0.000561  min_lr: 0.000561  loss: 3.3106 (2.8757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0747 (1.0873)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [231]  [1000/1251]  eta: 0:01:05  lr: 0.000558  min_lr: 0.000558  loss: 3.1286 (2.8882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0483 (1.0894)  time: 0.2634  data: 0.0004  max mem: 19097
Epoch: [231]  [1200/1251]  eta: 0:00:13  lr: 0.000556  min_lr: 0.000556  loss: 2.9713 (2.8970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0642 (1.0899)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 2.9760 (2.9003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0401 (1.0884)  time: 0.2168  data: 0.0007  max mem: 19097
Epoch: [231] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 2.9760 (2.9012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0401 (1.0884)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.5823 (0.5823)  acc1: 91.2000 (91.2000)  acc5: 98.4000 (98.4000)  time: 3.2436  data: 3.0935  max mem: 19097
Test:  [10/25]  eta: 0:00:08  loss: 0.8053 (0.7861)  acc1: 83.6000 (84.1455)  acc5: 97.6000 (97.4909)  time: 0.5701  data: 0.4453  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9542 (0.9293)  acc1: 78.0000 (80.6667)  acc5: 95.6000 (96.0000)  time: 0.3020  data: 0.1794  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0341 (0.9374)  acc1: 78.0000 (80.4640)  acc5: 95.2000 (95.8240)  time: 0.2346  data: 0.1127  max mem: 19097
Test: Total time: 0:00:10 (0.4090 s / it)
* Acc@1 81.118 Acc@5 95.744 loss 0.919
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.12%
Epoch: [232]  [   0/1251]  eta: 1:04:57  lr: 0.000555  min_lr: 0.000555  loss: 3.6716 (3.6716)  weight_decay: 0.0500 (0.0500)  time: 3.1158  data: 2.8391  max mem: 19097
Epoch: [232]  [ 200/1251]  eta: 0:04:45  lr: 0.000553  min_lr: 0.000553  loss: 2.9777 (2.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1649 (1.1103)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [232]  [ 400/1251]  eta: 0:03:45  lr: 0.000550  min_lr: 0.000550  loss: 2.5668 (2.8445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0265 (1.0990)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [232]  [ 600/1251]  eta: 0:02:50  lr: 0.000548  min_lr: 0.000548  loss: 2.5254 (2.8713)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0470 (1.0888)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [232]  [ 800/1251]  eta: 0:01:58  lr: 0.000545  min_lr: 0.000545  loss: 2.5117 (2.8565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1128 (1.0831)  time: 0.2642  data: 0.0005  max mem: 19097
Epoch: [232]  [1000/1251]  eta: 0:01:05  lr: 0.000543  min_lr: 0.000543  loss: 2.7578 (2.8533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1997 (1.0931)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [232]  [1200/1251]  eta: 0:00:13  lr: 0.000540  min_lr: 0.000540  loss: 3.2501 (2.8777)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1023 (1.0983)  time: 0.2570  data: 0.0003  max mem: 19097
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 3.1055 (2.8795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1334 (1.1003)  time: 0.2175  data: 0.0007  max mem: 19097
Epoch: [232] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 3.1055 (2.8932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1334 (1.1003)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7161 (0.7161)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6955  data: 5.5497  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8916 (0.8867)  acc1: 84.0000 (84.3273)  acc5: 97.6000 (97.6000)  time: 0.7533  data: 0.6206  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0587 (1.0256)  acc1: 78.4000 (81.0857)  acc5: 96.0000 (95.8667)  time: 0.2033  data: 0.0784  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.1183 (1.0353)  acc1: 78.4000 (80.7360)  acc5: 95.2000 (95.7600)  time: 0.2019  data: 0.0798  max mem: 19097
Test: Total time: 0:00:10 (0.4138 s / it)
* Acc@1 80.986 Acc@5 95.754 loss 1.026
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.12%
Epoch: [233]  [   0/1251]  eta: 1:09:06  lr: 0.000540  min_lr: 0.000540  loss: 1.9062 (1.9062)  weight_decay: 0.0500 (0.0500)  time: 3.3146  data: 2.7842  max mem: 19097
Epoch: [233]  [ 200/1251]  eta: 0:04:49  lr: 0.000537  min_lr: 0.000537  loss: 2.9836 (2.8217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1786 (1.2200)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [233]  [ 400/1251]  eta: 0:03:47  lr: 0.000535  min_lr: 0.000535  loss: 3.2262 (2.8645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0331 (1.1979)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [233]  [ 600/1251]  eta: 0:02:51  lr: 0.000533  min_lr: 0.000533  loss: 2.8793 (2.8372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.1642)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [233]  [ 800/1251]  eta: 0:01:58  lr: 0.000530  min_lr: 0.000530  loss: 3.0603 (2.8514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0420 (1.1548)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [233]  [1000/1251]  eta: 0:01:05  lr: 0.000528  min_lr: 0.000528  loss: 2.9361 (2.8686)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0810 (1.1462)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [233]  [1200/1251]  eta: 0:00:13  lr: 0.000525  min_lr: 0.000525  loss: 2.6841 (2.8659)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0852 (1.1496)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 3.1179 (2.8681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1169 (1.1516)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [233] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 3.1179 (2.8774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1169 (1.1516)
Test:  [ 0/25]  eta: 0:01:54  loss: 0.6539 (0.6539)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 4.5926  data: 4.4457  max mem: 19097
Test:  [10/25]  eta: 0:00:09  loss: 0.8261 (0.8423)  acc1: 84.8000 (84.1455)  acc5: 97.6000 (97.5636)  time: 0.6642  data: 0.5395  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9926 (0.9894)  acc1: 78.4000 (81.0857)  acc5: 96.0000 (96.1333)  time: 0.2321  data: 0.1116  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0357 (0.9972)  acc1: 78.4000 (80.7680)  acc5: 95.2000 (95.9680)  time: 0.2113  data: 0.0926  max mem: 19097
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 81.144 Acc@5 95.798 loss 0.984
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.14%
Epoch: [234]  [   0/1251]  eta: 1:08:48  lr: 0.000525  min_lr: 0.000525  loss: 2.6082 (2.6082)  weight_decay: 0.0500 (0.0500)  time: 3.3000  data: 3.0372  max mem: 19097
Epoch: [234]  [ 200/1251]  eta: 0:04:46  lr: 0.000522  min_lr: 0.000522  loss: 3.2121 (2.8557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9854 (1.0811)  time: 0.2573  data: 0.0003  max mem: 19097
Epoch: [234]  [ 400/1251]  eta: 0:03:45  lr: 0.000520  min_lr: 0.000520  loss: 3.0184 (2.9060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1240 (1.1057)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [234]  [ 600/1251]  eta: 0:02:51  lr: 0.000517  min_lr: 0.000517  loss: 2.5768 (2.9037)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1609 (1.1199)  time: 0.2573  data: 0.0003  max mem: 19097
Epoch: [234]  [ 800/1251]  eta: 0:01:57  lr: 0.000515  min_lr: 0.000515  loss: 2.8800 (2.8809)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2529 (1.1236)  time: 0.2585  data: 0.0004  max mem: 19097
Epoch: [234]  [1000/1251]  eta: 0:01:05  lr: 0.000513  min_lr: 0.000513  loss: 2.8770 (2.8703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1629 (1.1274)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [234]  [1200/1251]  eta: 0:00:13  lr: 0.000510  min_lr: 0.000510  loss: 2.9455 (2.8683)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0791 (1.1342)  time: 0.2650  data: 0.0004  max mem: 19097
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.8445 (2.8665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0791 (1.1344)  time: 0.2168  data: 0.0007  max mem: 19097
Epoch: [234] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.8445 (2.8801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0791 (1.1344)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6072 (0.6072)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.8513  data: 5.7088  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7825 (0.7877)  acc1: 84.8000 (84.0000)  acc5: 97.6000 (97.2727)  time: 0.7243  data: 0.6010  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9685 (0.9337)  acc1: 78.8000 (80.8952)  acc5: 95.2000 (95.7714)  time: 0.1882  data: 0.0682  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0359 (0.9455)  acc1: 78.8000 (80.5920)  acc5: 94.8000 (95.5360)  time: 0.1870  data: 0.0681  max mem: 19097
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 81.188 Acc@5 95.680 loss 0.933
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.19%
Epoch: [235]  [   0/1251]  eta: 1:07:20  lr: 0.000510  min_lr: 0.000510  loss: 1.9847 (1.9847)  weight_decay: 0.0500 (0.0500)  time: 3.2298  data: 2.9310  max mem: 19097
Epoch: [235]  [ 200/1251]  eta: 0:04:46  lr: 0.000507  min_lr: 0.000507  loss: 2.5817 (2.9171)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1649 (1.1129)  time: 0.2579  data: 0.0003  max mem: 19097
Epoch: [235]  [ 400/1251]  eta: 0:03:45  lr: 0.000505  min_lr: 0.000505  loss: 2.6845 (2.8930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0132 (1.1361)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [235]  [ 600/1251]  eta: 0:02:50  lr: 0.000502  min_lr: 0.000502  loss: 2.7654 (2.8735)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1364 (1.1470)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [235]  [ 800/1251]  eta: 0:01:57  lr: 0.000500  min_lr: 0.000500  loss: 3.1346 (2.8887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0649 (1.1529)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [235]  [1000/1251]  eta: 0:01:05  lr: 0.000498  min_lr: 0.000498  loss: 2.8706 (2.8981)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1046 (1.1535)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [235]  [1200/1251]  eta: 0:00:13  lr: 0.000495  min_lr: 0.000495  loss: 2.9576 (2.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2474 (1.1650)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 3.2391 (2.9015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1190 (1.1635)  time: 0.2169  data: 0.0014  max mem: 19097
Epoch: [235] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 3.2391 (2.8788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1190 (1.1635)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6835 (0.6835)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.8724  data: 5.7140  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8536 (0.8666)  acc1: 84.4000 (84.1091)  acc5: 97.2000 (97.3091)  time: 0.7889  data: 0.6626  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0496 (1.0042)  acc1: 78.8000 (80.8000)  acc5: 95.6000 (95.7714)  time: 0.2084  data: 0.0876  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0946 (1.0093)  acc1: 78.8000 (80.6560)  acc5: 94.8000 (95.6800)  time: 0.2072  data: 0.0875  max mem: 19097
Test: Total time: 0:00:10 (0.4241 s / it)
* Acc@1 81.256 Acc@5 95.774 loss 0.998
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.26%
Epoch: [236]  [   0/1251]  eta: 1:02:01  lr: 0.000495  min_lr: 0.000495  loss: 2.3632 (2.3632)  weight_decay: 0.0500 (0.0500)  time: 2.9748  data: 2.6857  max mem: 19097
Epoch: [236]  [ 200/1251]  eta: 0:04:45  lr: 0.000492  min_lr: 0.000492  loss: 2.7904 (2.8911)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1758 (1.2255)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [236]  [ 400/1251]  eta: 0:03:45  lr: 0.000490  min_lr: 0.000490  loss: 3.3095 (2.8721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0605 (1.1719)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [236]  [ 600/1251]  eta: 0:02:50  lr: 0.000488  min_lr: 0.000488  loss: 2.6178 (2.8814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.1742)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [236]  [ 800/1251]  eta: 0:01:57  lr: 0.000485  min_lr: 0.000485  loss: 3.3604 (2.8900)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1358 (1.1599)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [236]  [1000/1251]  eta: 0:01:05  lr: 0.000483  min_lr: 0.000483  loss: 2.6734 (2.8739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0585 (1.1532)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [236]  [1200/1251]  eta: 0:00:13  lr: 0.000481  min_lr: 0.000481  loss: 3.0895 (2.8828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2524 (1.1607)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 3.0549 (2.8876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2148 (1.1632)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [236] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 3.0549 (2.8780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2148 (1.1632)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5962 (0.5962)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.3474  data: 5.2022  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7936 (0.8010)  acc1: 84.8000 (84.9091)  acc5: 97.2000 (97.3091)  time: 0.7200  data: 0.5965  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0195 (0.9473)  acc1: 78.0000 (81.0857)  acc5: 95.6000 (95.7143)  time: 0.2116  data: 0.0916  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9728 (0.9533)  acc1: 80.0000 (81.0080)  acc5: 95.2000 (95.6800)  time: 0.2104  data: 0.0915  max mem: 19097
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 81.536 Acc@5 95.762 loss 0.934
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.54%
Epoch: [237]  [   0/1251]  eta: 1:10:54  lr: 0.000480  min_lr: 0.000480  loss: 2.7205 (2.7205)  weight_decay: 0.0500 (0.0500)  time: 3.4008  data: 3.1333  max mem: 19097
Epoch: [237]  [ 200/1251]  eta: 0:04:48  lr: 0.000478  min_lr: 0.000478  loss: 3.1636 (2.8581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0376 (1.1232)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [237]  [ 400/1251]  eta: 0:03:46  lr: 0.000475  min_lr: 0.000475  loss: 2.4789 (2.8420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0837 (1.1288)  time: 0.2574  data: 0.0003  max mem: 19097
Epoch: [237]  [ 600/1251]  eta: 0:02:51  lr: 0.000473  min_lr: 0.000473  loss: 2.7369 (2.8565)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2556  data: 0.0004  max mem: 19097
Epoch: [237]  [ 800/1251]  eta: 0:01:57  lr: 0.000471  min_lr: 0.000471  loss: 2.9725 (2.8594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0546 (nan)  time: 0.2578  data: 0.0004  max mem: 19097
Epoch: [237]  [1000/1251]  eta: 0:01:05  lr: 0.000468  min_lr: 0.000468  loss: 3.1957 (2.8613)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (nan)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [237]  [1200/1251]  eta: 0:00:13  lr: 0.000466  min_lr: 0.000466  loss: 3.1227 (2.8583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1401 (nan)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 3.1914 (2.8604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0931 (nan)  time: 0.2168  data: 0.0006  max mem: 19097
Epoch: [237] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 3.1914 (2.8699)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0931 (nan)
Test:  [ 0/25]  eta: 0:01:39  loss: 0.5867 (0.5867)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 3.9703  data: 3.7877  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7886 (0.7857)  acc1: 84.8000 (83.9273)  acc5: 97.6000 (97.4909)  time: 0.6860  data: 0.5579  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9297 (0.9313)  acc1: 78.4000 (80.9714)  acc5: 95.6000 (96.0381)  time: 0.2722  data: 0.1511  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0188 (0.9424)  acc1: 78.8000 (80.8000)  acc5: 95.2000 (95.8560)  time: 0.2333  data: 0.1131  max mem: 19097
Test: Total time: 0:00:10 (0.4160 s / it)
* Acc@1 81.260 Acc@5 95.840 loss 0.924
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.54%
Epoch: [238]  [   0/1251]  eta: 1:12:13  lr: 0.000466  min_lr: 0.000466  loss: 2.3938 (2.3938)  weight_decay: 0.0500 (0.0500)  time: 3.4643  data: 2.2288  max mem: 19097
Epoch: [238]  [ 200/1251]  eta: 0:04:46  lr: 0.000463  min_lr: 0.000463  loss: 3.2930 (2.9175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1920 (1.1482)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [238]  [ 400/1251]  eta: 0:03:46  lr: 0.000461  min_lr: 0.000461  loss: 3.0704 (2.8811)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1092 (1.1579)  time: 0.2628  data: 0.0004  max mem: 19097
Epoch: [238]  [ 600/1251]  eta: 0:02:51  lr: 0.000459  min_lr: 0.000459  loss: 2.8367 (2.8776)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1474 (1.1898)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [238]  [ 800/1251]  eta: 0:01:57  lr: 0.000456  min_lr: 0.000456  loss: 3.2647 (2.8711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1672 (1.1798)  time: 0.2646  data: 0.0004  max mem: 19097
Epoch: [238]  [1000/1251]  eta: 0:01:05  lr: 0.000454  min_lr: 0.000454  loss: 2.8065 (2.8748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0139 (1.1649)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [238]  [1200/1251]  eta: 0:00:13  lr: 0.000452  min_lr: 0.000452  loss: 3.0804 (2.8725)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0816 (1.1616)  time: 0.2561  data: 0.0003  max mem: 19097
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.6158 (2.8713)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0176 (1.1588)  time: 0.2169  data: 0.0006  max mem: 19097
Epoch: [238] Total time: 0:05:25 (0.2606 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.6158 (2.8686)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0176 (1.1588)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.5747 (0.5747)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 3.2520  data: 3.1086  max mem: 19097
Test:  [10/25]  eta: 0:00:09  loss: 0.8103 (0.8054)  acc1: 84.0000 (84.1455)  acc5: 97.6000 (97.5273)  time: 0.6216  data: 0.4937  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9966 (0.9514)  acc1: 78.4000 (80.9905)  acc5: 95.2000 (95.9810)  time: 0.2971  data: 0.1742  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0578 (0.9605)  acc1: 78.4000 (80.7360)  acc5: 95.2000 (95.8560)  time: 0.2088  data: 0.0896  max mem: 19097
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 81.384 Acc@5 95.800 loss 0.947
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.54%
Epoch: [239]  [   0/1251]  eta: 1:02:59  lr: 0.000451  min_lr: 0.000451  loss: 3.3496 (3.3496)  weight_decay: 0.0500 (0.0500)  time: 3.0215  data: 1.7840  max mem: 19097
Epoch: [239]  [ 200/1251]  eta: 0:04:45  lr: 0.000449  min_lr: 0.000449  loss: 3.2689 (2.8916)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1805 (1.1517)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [239]  [ 400/1251]  eta: 0:03:45  lr: 0.000447  min_lr: 0.000447  loss: 3.0302 (2.8838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2009 (1.1522)  time: 0.2559  data: 0.0004  max mem: 19097
Epoch: [239]  [ 600/1251]  eta: 0:02:51  lr: 0.000445  min_lr: 0.000445  loss: 3.1625 (2.8744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0572 (1.1470)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [239]  [ 800/1251]  eta: 0:01:57  lr: 0.000442  min_lr: 0.000442  loss: 2.8629 (2.8536)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0627 (1.1503)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [239]  [1000/1251]  eta: 0:01:05  lr: 0.000440  min_lr: 0.000440  loss: 2.5903 (2.8525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1814 (1.1639)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [239]  [1200/1251]  eta: 0:00:13  lr: 0.000438  min_lr: 0.000438  loss: 2.6216 (2.8588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0712 (1.1612)  time: 0.2558  data: 0.0004  max mem: 19097
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.4354 (2.8602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0712 (1.1594)  time: 0.2169  data: 0.0010  max mem: 19097
Epoch: [239] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.4354 (2.8606)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0712 (1.1594)
Test:  [ 0/25]  eta: 0:01:43  loss: 0.5944 (0.5944)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 4.1304  data: 3.9601  max mem: 19097
Test:  [10/25]  eta: 0:00:09  loss: 0.8150 (0.8202)  acc1: 84.0000 (83.7818)  acc5: 96.8000 (97.3091)  time: 0.6660  data: 0.5402  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9581 (0.9470)  acc1: 78.0000 (80.8952)  acc5: 95.2000 (95.8667)  time: 0.2548  data: 0.1340  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0015 (0.9532)  acc1: 79.2000 (80.7680)  acc5: 95.2000 (95.7760)  time: 0.2247  data: 0.1052  max mem: 19097
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 81.448 Acc@5 95.798 loss 0.938
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.54%
Epoch: [240]  [   0/1251]  eta: 1:00:08  lr: 0.000437  min_lr: 0.000437  loss: 3.4740 (3.4740)  weight_decay: 0.0500 (0.0500)  time: 2.8844  data: 2.0415  max mem: 19097
Epoch: [240]  [ 200/1251]  eta: 0:04:45  lr: 0.000435  min_lr: 0.000435  loss: 2.7398 (2.8694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0126 (1.1145)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [240]  [ 400/1251]  eta: 0:03:45  lr: 0.000433  min_lr: 0.000433  loss: 3.1122 (2.8819)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0999 (1.1398)  time: 0.2580  data: 0.0004  max mem: 19097
Epoch: [240]  [ 600/1251]  eta: 0:02:51  lr: 0.000431  min_lr: 0.000431  loss: 2.8670 (2.8851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0981 (1.1707)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [240]  [ 800/1251]  eta: 0:01:58  lr: 0.000428  min_lr: 0.000428  loss: 3.0294 (2.8801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1487 (1.1714)  time: 0.2574  data: 0.0003  max mem: 19097
Epoch: [240]  [1000/1251]  eta: 0:01:05  lr: 0.000426  min_lr: 0.000426  loss: 2.9613 (2.8570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1189 (1.1688)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [240]  [1200/1251]  eta: 0:00:13  lr: 0.000424  min_lr: 0.000424  loss: 2.7798 (2.8579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1980 (1.1701)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 3.1871 (2.8598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1190 (1.1662)  time: 0.2166  data: 0.0006  max mem: 19097
Epoch: [240] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 3.1871 (2.8584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1190 (1.1662)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5753 (0.5753)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.7572  data: 5.6187  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8047 (0.8071)  acc1: 84.4000 (84.5455)  acc5: 97.2000 (97.4182)  time: 0.7623  data: 0.6402  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9579 (0.9477)  acc1: 78.0000 (80.9714)  acc5: 96.0000 (96.0000)  time: 0.1925  data: 0.0723  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0318 (0.9550)  acc1: 77.6000 (80.7840)  acc5: 95.2000 (95.8560)  time: 0.1922  data: 0.0722  max mem: 19097
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 81.456 Acc@5 95.864 loss 0.938
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.54%
Epoch: [241]  [   0/1251]  eta: 1:07:01  lr: 0.000423  min_lr: 0.000423  loss: 1.8708 (1.8708)  weight_decay: 0.0500 (0.0500)  time: 3.2146  data: 2.5026  max mem: 19097
Epoch: [241]  [ 200/1251]  eta: 0:04:46  lr: 0.000421  min_lr: 0.000421  loss: 3.0733 (2.8458)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.1167)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [241]  [ 400/1251]  eta: 0:03:45  lr: 0.000419  min_lr: 0.000419  loss: 2.2996 (2.8364)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [241]  [ 600/1251]  eta: 0:02:51  lr: 0.000417  min_lr: 0.000417  loss: 3.1414 (2.8535)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1321 (nan)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [241]  [ 800/1251]  eta: 0:01:57  lr: 0.000415  min_lr: 0.000415  loss: 2.5741 (2.8627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0536 (nan)  time: 0.2560  data: 0.0003  max mem: 19097
Epoch: [241]  [1000/1251]  eta: 0:01:05  lr: 0.000412  min_lr: 0.000412  loss: 2.8931 (2.8566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1062 (nan)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [241]  [1200/1251]  eta: 0:00:13  lr: 0.000410  min_lr: 0.000410  loss: 2.6184 (2.8471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0912 (nan)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 2.8421 (2.8431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1325 (nan)  time: 0.2163  data: 0.0005  max mem: 19097
Epoch: [241] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 2.8421 (2.8576)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1325 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6064 (0.6064)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5908  data: 5.4399  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8054 (0.7844)  acc1: 84.0000 (84.1818)  acc5: 97.6000 (97.3091)  time: 0.7519  data: 0.6288  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9547 (0.9241)  acc1: 78.8000 (80.8762)  acc5: 95.2000 (95.9048)  time: 0.2210  data: 0.1016  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0097 (0.9357)  acc1: 78.8000 (80.6720)  acc5: 95.2000 (95.7440)  time: 0.2202  data: 0.1015  max mem: 19097
Test: Total time: 0:00:10 (0.4226 s / it)
* Acc@1 81.436 Acc@5 95.924 loss 0.917
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.54%
Epoch: [242]  [   0/1251]  eta: 1:08:38  lr: 0.000410  min_lr: 0.000410  loss: 1.9251 (1.9251)  weight_decay: 0.0500 (0.0500)  time: 3.2922  data: 2.5373  max mem: 19097
Epoch: [242]  [ 200/1251]  eta: 0:04:47  lr: 0.000407  min_lr: 0.000407  loss: 3.2018 (2.8529)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1493 (1.2016)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [242]  [ 400/1251]  eta: 0:03:45  lr: 0.000405  min_lr: 0.000405  loss: 2.6024 (2.8359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1219 (1.1875)  time: 0.2578  data: 0.0004  max mem: 19097
Epoch: [242]  [ 600/1251]  eta: 0:02:51  lr: 0.000403  min_lr: 0.000403  loss: 2.8159 (2.8616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3088 (1.2143)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [242]  [ 800/1251]  eta: 0:01:57  lr: 0.000401  min_lr: 0.000401  loss: 2.9032 (2.8449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1387 (1.1950)  time: 0.2638  data: 0.0004  max mem: 19097
Epoch: [242]  [1000/1251]  eta: 0:01:05  lr: 0.000399  min_lr: 0.000399  loss: 2.9202 (2.8483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2072 (1.1960)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [242]  [1200/1251]  eta: 0:00:13  lr: 0.000397  min_lr: 0.000397  loss: 2.8257 (2.8546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1085 (1.1960)  time: 0.2580  data: 0.0003  max mem: 19097
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 3.1692 (2.8546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1490 (1.1943)  time: 0.2204  data: 0.0005  max mem: 19097
Epoch: [242] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 3.1692 (2.8485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1490 (1.1943)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6003 (0.6003)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7400  data: 5.6017  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7828 (0.7846)  acc1: 84.4000 (84.6182)  acc5: 97.6000 (97.5636)  time: 0.7523  data: 0.6270  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9604 (0.9329)  acc1: 79.2000 (80.8952)  acc5: 95.6000 (96.0381)  time: 0.2064  data: 0.0851  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0021 (0.9412)  acc1: 79.2000 (80.7360)  acc5: 95.6000 (95.9360)  time: 0.2059  data: 0.0850  max mem: 19097
Test: Total time: 0:00:10 (0.4172 s / it)
* Acc@1 81.376 Acc@5 95.914 loss 0.930
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.54%
Epoch: [243]  [   0/1251]  eta: 1:02:16  lr: 0.000396  min_lr: 0.000396  loss: 2.3760 (2.3760)  weight_decay: 0.0500 (0.0500)  time: 2.9868  data: 1.9344  max mem: 19097
Epoch: [243]  [ 200/1251]  eta: 0:04:47  lr: 0.000394  min_lr: 0.000394  loss: 2.4830 (2.7963)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0640 (1.0938)  time: 0.2612  data: 0.0004  max mem: 19097
Epoch: [243]  [ 400/1251]  eta: 0:03:45  lr: 0.000392  min_lr: 0.000392  loss: 3.1175 (2.8413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2774 (1.1709)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [243]  [ 600/1251]  eta: 0:02:51  lr: 0.000390  min_lr: 0.000390  loss: 2.6970 (2.8452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1735 (1.1764)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [243]  [ 800/1251]  eta: 0:01:57  lr: 0.000388  min_lr: 0.000388  loss: 3.0722 (2.8390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0964 (1.1749)  time: 0.2576  data: 0.0005  max mem: 19097
Epoch: [243]  [1000/1251]  eta: 0:01:05  lr: 0.000385  min_lr: 0.000385  loss: 3.1820 (2.8467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0829 (1.1691)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [243]  [1200/1251]  eta: 0:00:13  lr: 0.000383  min_lr: 0.000383  loss: 2.8570 (2.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1414 (1.1659)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 3.0578 (2.8565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1097 (1.1628)  time: 0.2167  data: 0.0007  max mem: 19097
Epoch: [243] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 3.0578 (2.8393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1097 (1.1628)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6415 (0.6415)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.5510  data: 5.3809  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8113 (0.8393)  acc1: 84.8000 (84.7273)  acc5: 97.2000 (97.3455)  time: 0.7156  data: 0.5881  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0226 (0.9888)  acc1: 78.4000 (81.2000)  acc5: 95.2000 (95.9810)  time: 0.1994  data: 0.0751  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0958 (0.9984)  acc1: 78.4000 (80.9440)  acc5: 95.2000 (95.8880)  time: 0.1985  data: 0.0750  max mem: 19097
Test: Total time: 0:00:10 (0.4039 s / it)
* Acc@1 81.420 Acc@5 95.834 loss 0.985
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.54%
Epoch: [244]  [   0/1251]  eta: 1:05:40  lr: 0.000383  min_lr: 0.000383  loss: 2.9108 (2.9108)  weight_decay: 0.0500 (0.0500)  time: 3.1503  data: 1.8825  max mem: 19097
Epoch: [244]  [ 200/1251]  eta: 0:04:46  lr: 0.000381  min_lr: 0.000381  loss: 3.0038 (2.8492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1006 (1.1841)  time: 0.2564  data: 0.0003  max mem: 19097
Epoch: [244]  [ 400/1251]  eta: 0:03:45  lr: 0.000379  min_lr: 0.000379  loss: 2.9448 (2.8506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1982 (1.1989)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [244]  [ 600/1251]  eta: 0:02:51  lr: 0.000377  min_lr: 0.000377  loss: 3.1783 (2.8434)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1428 (1.1868)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [244]  [ 800/1251]  eta: 0:01:58  lr: 0.000374  min_lr: 0.000374  loss: 2.7711 (2.8334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1319 (1.1758)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [244]  [1000/1251]  eta: 0:01:05  lr: 0.000372  min_lr: 0.000372  loss: 2.7400 (2.8179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1626 (1.1772)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [244]  [1200/1251]  eta: 0:00:13  lr: 0.000370  min_lr: 0.000370  loss: 3.2380 (2.8145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1391 (1.1875)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.9037 (2.8147)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.1924)  time: 0.2168  data: 0.0006  max mem: 19097
Epoch: [244] Total time: 0:05:26 (0.2608 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.9037 (2.8408)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.1924)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6034 (0.6034)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.4640  data: 5.3052  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7655 (0.7829)  acc1: 84.4000 (84.0727)  acc5: 97.6000 (97.2727)  time: 0.7512  data: 0.6247  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9674 (0.9356)  acc1: 79.2000 (80.8762)  acc5: 95.6000 (95.8857)  time: 0.2130  data: 0.0921  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0184 (0.9447)  acc1: 79.2000 (80.7200)  acc5: 95.6000 (95.8400)  time: 0.2118  data: 0.0921  max mem: 19097
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 81.478 Acc@5 95.844 loss 0.925
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.54%
Epoch: [245]  [   0/1251]  eta: 1:09:01  lr: 0.000370  min_lr: 0.000370  loss: 3.0916 (3.0916)  weight_decay: 0.0500 (0.0500)  time: 3.3109  data: 1.7070  max mem: 19097
Epoch: [245]  [ 200/1251]  eta: 0:04:46  lr: 0.000368  min_lr: 0.000368  loss: 2.7109 (2.7559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2308 (1.2183)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [245]  [ 400/1251]  eta: 0:03:45  lr: 0.000366  min_lr: 0.000366  loss: 2.5336 (2.7850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3492 (1.2714)  time: 0.2633  data: 0.0003  max mem: 19097
Epoch: [245]  [ 600/1251]  eta: 0:02:51  lr: 0.000364  min_lr: 0.000364  loss: 2.5715 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0912 (1.2497)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [245]  [ 800/1251]  eta: 0:01:58  lr: 0.000362  min_lr: 0.000362  loss: 3.0586 (2.7921)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1378 (1.2319)  time: 0.2617  data: 0.0004  max mem: 19097
Epoch: [245]  [1000/1251]  eta: 0:01:05  lr: 0.000359  min_lr: 0.000359  loss: 2.6776 (2.7929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1198 (1.2289)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [245]  [1200/1251]  eta: 0:00:13  lr: 0.000357  min_lr: 0.000357  loss: 2.4273 (2.8046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1360 (1.2225)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 2.9704 (2.8074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2656 (1.2254)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [245] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 2.9704 (2.8246)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2656 (1.2254)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6150 (0.6150)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 5.6961  data: 5.5540  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7994 (0.8094)  acc1: 85.2000 (84.7273)  acc5: 97.6000 (97.4182)  time: 0.7476  data: 0.6256  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9825 (0.9575)  acc1: 79.6000 (81.1619)  acc5: 95.6000 (95.9429)  time: 0.1939  data: 0.0746  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0233 (0.9653)  acc1: 79.2000 (80.9920)  acc5: 94.8000 (95.8400)  time: 0.1933  data: 0.0746  max mem: 19097
Test: Total time: 0:00:10 (0.4053 s / it)
* Acc@1 81.630 Acc@5 95.862 loss 0.948
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.63%
Epoch: [246]  [   0/1251]  eta: 1:02:04  lr: 0.000357  min_lr: 0.000357  loss: 2.7438 (2.7438)  weight_decay: 0.0500 (0.0500)  time: 2.9771  data: 2.6669  max mem: 19097
Epoch: [246]  [ 200/1251]  eta: 0:04:45  lr: 0.000355  min_lr: 0.000355  loss: 2.7588 (2.8392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0730 (1.1929)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [246]  [ 400/1251]  eta: 0:03:45  lr: 0.000353  min_lr: 0.000353  loss: 2.9769 (2.8353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1573 (1.1798)  time: 0.2587  data: 0.0004  max mem: 19097
Epoch: [246]  [ 600/1251]  eta: 0:02:50  lr: 0.000351  min_lr: 0.000351  loss: 2.6312 (2.8354)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0925 (1.1687)  time: 0.2571  data: 0.0003  max mem: 19097
Epoch: [246]  [ 800/1251]  eta: 0:01:57  lr: 0.000349  min_lr: 0.000349  loss: 2.8366 (2.8250)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1446 (1.1799)  time: 0.2659  data: 0.0004  max mem: 19097
Epoch: [246]  [1000/1251]  eta: 0:01:05  lr: 0.000347  min_lr: 0.000347  loss: 2.8028 (2.8314)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.1823)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [246]  [1200/1251]  eta: 0:00:13  lr: 0.000345  min_lr: 0.000345  loss: 3.0493 (2.8180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2276 (1.1933)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 2.5251 (2.8145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1980 (1.1894)  time: 0.2164  data: 0.0007  max mem: 19097
Epoch: [246] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 2.5251 (2.8316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1980 (1.1894)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6012 (0.6012)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.7234  data: 5.5658  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7898 (0.7940)  acc1: 84.8000 (84.2182)  acc5: 97.6000 (97.4182)  time: 0.7200  data: 0.5954  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9517 (0.9472)  acc1: 78.8000 (80.8191)  acc5: 95.6000 (95.7714)  time: 0.1854  data: 0.0654  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0365 (0.9557)  acc1: 78.8000 (80.5760)  acc5: 95.6000 (95.7280)  time: 0.1910  data: 0.0721  max mem: 19097
Test: Total time: 0:00:10 (0.4049 s / it)
* Acc@1 81.490 Acc@5 95.842 loss 0.938
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.63%
Epoch: [247]  [   0/1251]  eta: 1:07:32  lr: 0.000344  min_lr: 0.000344  loss: 2.9892 (2.9892)  weight_decay: 0.0500 (0.0500)  time: 3.2393  data: 1.8142  max mem: 19097
Epoch: [247]  [ 200/1251]  eta: 0:04:46  lr: 0.000342  min_lr: 0.000342  loss: 2.8672 (2.9032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1913 (1.1706)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [247]  [ 400/1251]  eta: 0:03:45  lr: 0.000340  min_lr: 0.000340  loss: 2.7049 (2.8261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3680 (1.2067)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [247]  [ 600/1251]  eta: 0:02:50  lr: 0.000338  min_lr: 0.000338  loss: 2.6518 (2.8339)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2700 (1.2084)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [247]  [ 800/1251]  eta: 0:01:57  lr: 0.000336  min_lr: 0.000336  loss: 2.7859 (2.8074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2331 (1.2274)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [247]  [1000/1251]  eta: 0:01:05  lr: 0.000334  min_lr: 0.000334  loss: 2.8076 (2.8165)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1221 (1.2172)  time: 0.2575  data: 0.0003  max mem: 19097
Epoch: [247]  [1200/1251]  eta: 0:00:13  lr: 0.000332  min_lr: 0.000332  loss: 2.7541 (2.8261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1896 (1.2164)  time: 0.2588  data: 0.0003  max mem: 19097
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.4110 (2.8190)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2333 (1.2201)  time: 0.2165  data: 0.0005  max mem: 19097
Epoch: [247] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.4110 (2.8133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2333 (1.2201)
Test:  [ 0/25]  eta: 0:01:32  loss: 0.4934 (0.4934)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 3.6823  data: 3.5387  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7262 (0.7044)  acc1: 84.4000 (84.6545)  acc5: 98.0000 (97.7091)  time: 0.6891  data: 0.5666  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.8936 (0.8562)  acc1: 78.4000 (81.0667)  acc5: 96.0000 (96.3048)  time: 0.2937  data: 0.1743  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9512 (0.8644)  acc1: 78.4000 (80.8800)  acc5: 95.6000 (96.1120)  time: 0.2258  data: 0.1072  max mem: 19097
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 81.630 Acc@5 95.910 loss 0.845
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.63%
Epoch: [248]  [   0/1251]  eta: 1:05:53  lr: 0.000332  min_lr: 0.000332  loss: 3.3545 (3.3545)  weight_decay: 0.0500 (0.0500)  time: 3.1599  data: 2.8827  max mem: 19097
Epoch: [248]  [ 200/1251]  eta: 0:04:45  lr: 0.000330  min_lr: 0.000330  loss: 2.7125 (2.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1552 (1.1879)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [248]  [ 400/1251]  eta: 0:03:44  lr: 0.000328  min_lr: 0.000328  loss: 2.5436 (2.7557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0670 (1.1766)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [248]  [ 600/1251]  eta: 0:02:50  lr: 0.000326  min_lr: 0.000326  loss: 2.7400 (2.7447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1797 (1.1721)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [248]  [ 800/1251]  eta: 0:01:57  lr: 0.000324  min_lr: 0.000324  loss: 2.7999 (2.7572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1834 (1.1711)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [248]  [1000/1251]  eta: 0:01:05  lr: 0.000322  min_lr: 0.000322  loss: 2.6792 (2.7632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1895 (1.1948)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [248]  [1200/1251]  eta: 0:00:13  lr: 0.000320  min_lr: 0.000320  loss: 2.4887 (2.7636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2198 (1.1971)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 2.7782 (2.7637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2198 (1.1976)  time: 0.2166  data: 0.0006  max mem: 19097
Epoch: [248] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 2.7782 (2.8093)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2198 (1.1976)
Test:  [ 0/25]  eta: 0:01:21  loss: 0.5616 (0.5616)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 3.2570  data: 3.1189  max mem: 19097
Test:  [10/25]  eta: 0:00:09  loss: 0.7558 (0.7582)  acc1: 85.2000 (84.5455)  acc5: 98.0000 (97.4909)  time: 0.6033  data: 0.4795  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9413 (0.9048)  acc1: 78.8000 (81.1619)  acc5: 95.6000 (96.0191)  time: 0.2987  data: 0.1782  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0077 (0.9145)  acc1: 78.8000 (81.0880)  acc5: 95.2000 (95.7920)  time: 0.2038  data: 0.0851  max mem: 19097
Test: Total time: 0:00:10 (0.4033 s / it)
* Acc@1 81.720 Acc@5 95.888 loss 0.897
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.72%
Epoch: [249]  [   0/1251]  eta: 1:10:35  lr: 0.000320  min_lr: 0.000320  loss: 3.2359 (3.2359)  weight_decay: 0.0500 (0.0500)  time: 3.3856  data: 3.1086  max mem: 19097
Epoch: [249]  [ 200/1251]  eta: 0:04:48  lr: 0.000318  min_lr: 0.000318  loss: 2.9977 (2.8929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1952 (1.2033)  time: 0.2594  data: 0.0004  max mem: 19097
Epoch: [249]  [ 400/1251]  eta: 0:03:46  lr: 0.000316  min_lr: 0.000316  loss: 2.9738 (2.8505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2167 (1.2096)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [249]  [ 600/1251]  eta: 0:02:51  lr: 0.000314  min_lr: 0.000314  loss: 3.1552 (2.8451)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1232 (1.1975)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [249]  [ 800/1251]  eta: 0:01:58  lr: 0.000312  min_lr: 0.000312  loss: 2.8405 (2.8435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (1.2028)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [249]  [1000/1251]  eta: 0:01:05  lr: 0.000310  min_lr: 0.000310  loss: 2.5538 (2.8383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2190 (1.2215)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [249]  [1200/1251]  eta: 0:00:13  lr: 0.000308  min_lr: 0.000308  loss: 2.9713 (2.8474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1599 (1.2192)  time: 0.2597  data: 0.0004  max mem: 19097
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.3315 (2.8406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1896 (1.2199)  time: 0.2162  data: 0.0005  max mem: 19097
Epoch: [249] Total time: 0:05:25 (0.2605 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.3315 (2.8173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1896 (1.2199)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5474 (0.5474)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.6892  data: 5.5470  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7391 (0.7122)  acc1: 85.6000 (84.6909)  acc5: 98.0000 (97.5636)  time: 0.7351  data: 0.6108  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.8970 (0.8656)  acc1: 78.8000 (81.0095)  acc5: 95.6000 (96.0000)  time: 0.1947  data: 0.0742  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9762 (0.8768)  acc1: 78.8000 (80.8000)  acc5: 95.2000 (95.9360)  time: 0.1955  data: 0.0741  max mem: 19097
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 81.690 Acc@5 95.974 loss 0.860
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.72%
Epoch: [250]  [   0/1251]  eta: 1:04:09  lr: 0.000307  min_lr: 0.000307  loss: 3.2210 (3.2210)  weight_decay: 0.0500 (0.0500)  time: 3.0775  data: 1.5673  max mem: 19097
Epoch: [250]  [ 200/1251]  eta: 0:04:46  lr: 0.000306  min_lr: 0.000306  loss: 2.3378 (2.7551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0425 (1.2251)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [250]  [ 400/1251]  eta: 0:03:46  lr: 0.000304  min_lr: 0.000304  loss: 2.9213 (2.7606)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0955 (1.1781)  time: 0.2584  data: 0.0003  max mem: 19097
Epoch: [250]  [ 600/1251]  eta: 0:02:51  lr: 0.000302  min_lr: 0.000302  loss: 2.7823 (2.7833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1937 (1.1817)  time: 0.2561  data: 0.0003  max mem: 19097
Epoch: [250]  [ 800/1251]  eta: 0:01:58  lr: 0.000300  min_lr: 0.000300  loss: 2.7171 (2.7917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1790 (1.1749)  time: 0.2564  data: 0.0003  max mem: 19097
Epoch: [250]  [1000/1251]  eta: 0:01:05  lr: 0.000298  min_lr: 0.000298  loss: 3.0350 (2.8060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1050 (1.1811)  time: 0.2558  data: 0.0003  max mem: 19097
Epoch: [250]  [1200/1251]  eta: 0:00:13  lr: 0.000296  min_lr: 0.000296  loss: 3.2382 (2.8108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2392 (1.1885)  time: 0.2570  data: 0.0005  max mem: 19097
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.6245 (2.8069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2115 (1.1916)  time: 0.2169  data: 0.0007  max mem: 19097
Epoch: [250] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.6245 (2.8112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2115 (1.1916)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5615 (0.5615)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.7948  data: 5.6550  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7685 (0.7535)  acc1: 85.6000 (84.6545)  acc5: 97.6000 (97.6727)  time: 0.7644  data: 0.6418  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9118 (0.8954)  acc1: 78.8000 (81.2571)  acc5: 96.0000 (96.1905)  time: 0.2191  data: 0.0974  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9633 (0.9042)  acc1: 78.0000 (80.9440)  acc5: 95.6000 (96.0640)  time: 0.2186  data: 0.0974  max mem: 19097
Test: Total time: 0:00:10 (0.4292 s / it)
* Acc@1 81.746 Acc@5 95.946 loss 0.893
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.75%
Epoch: [251]  [   0/1251]  eta: 1:09:15  lr: 0.000296  min_lr: 0.000296  loss: 3.5479 (3.5479)  weight_decay: 0.0500 (0.0500)  time: 3.3217  data: 3.0483  max mem: 19097
Epoch: [251]  [ 200/1251]  eta: 0:04:46  lr: 0.000294  min_lr: 0.000294  loss: 2.9319 (2.7900)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1303 (1.2334)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [251]  [ 400/1251]  eta: 0:03:46  lr: 0.000292  min_lr: 0.000292  loss: 3.0141 (2.8215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1789 (1.2088)  time: 0.2675  data: 0.0004  max mem: 19097
Epoch: [251]  [ 600/1251]  eta: 0:02:51  lr: 0.000290  min_lr: 0.000290  loss: 3.1345 (2.8165)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0685 (1.1852)  time: 0.2593  data: 0.0004  max mem: 19097
Epoch: [251]  [ 800/1251]  eta: 0:01:58  lr: 0.000288  min_lr: 0.000288  loss: 2.3636 (2.8044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1966 (1.1913)  time: 0.2561  data: 0.0004  max mem: 19097
Epoch: [251]  [1000/1251]  eta: 0:01:05  lr: 0.000286  min_lr: 0.000286  loss: 2.1562 (2.8012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1055 (1.1940)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [251]  [1200/1251]  eta: 0:00:13  lr: 0.000284  min_lr: 0.000284  loss: 2.7118 (2.8118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1696 (1.1939)  time: 0.2630  data: 0.0004  max mem: 19097
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.9503 (2.8125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2313 (1.1954)  time: 0.2166  data: 0.0006  max mem: 19097
Epoch: [251] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.9503 (2.7975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2313 (1.1954)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6080 (0.6080)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.8806  data: 5.7196  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7804 (0.7810)  acc1: 86.0000 (85.2364)  acc5: 97.6000 (97.6364)  time: 0.7625  data: 0.6352  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9351 (0.9301)  acc1: 79.6000 (81.5429)  acc5: 95.6000 (96.1714)  time: 0.2003  data: 0.0790  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0328 (0.9398)  acc1: 79.6000 (81.3920)  acc5: 95.2000 (96.0800)  time: 0.1986  data: 0.0789  max mem: 19097
Test: Total time: 0:00:10 (0.4187 s / it)
* Acc@1 81.846 Acc@5 95.948 loss 0.927
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.85%
Epoch: [252]  [   0/1251]  eta: 1:09:32  lr: 0.000284  min_lr: 0.000284  loss: 3.5124 (3.5124)  weight_decay: 0.0500 (0.0500)  time: 3.3353  data: 3.0646  max mem: 19097
Epoch: [252]  [ 200/1251]  eta: 0:04:46  lr: 0.000282  min_lr: 0.000282  loss: 2.8268 (2.8148)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1260 (1.1907)  time: 0.2573  data: 0.0005  max mem: 19097
Epoch: [252]  [ 400/1251]  eta: 0:03:45  lr: 0.000280  min_lr: 0.000280  loss: 3.0989 (2.8243)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1757 (1.2020)  time: 0.2579  data: 0.0005  max mem: 19097
Epoch: [252]  [ 600/1251]  eta: 0:02:51  lr: 0.000279  min_lr: 0.000279  loss: 2.8687 (2.8268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1869 (1.2060)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [252]  [ 800/1251]  eta: 0:01:58  lr: 0.000277  min_lr: 0.000277  loss: 3.0495 (2.8110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1694 (1.2070)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [252]  [1000/1251]  eta: 0:01:05  lr: 0.000275  min_lr: 0.000275  loss: 2.0031 (2.7985)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1465 (1.2091)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [252]  [1200/1251]  eta: 0:00:13  lr: 0.000273  min_lr: 0.000273  loss: 2.2639 (2.8008)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2836 (1.2132)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 3.0700 (2.8036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2400 (1.2164)  time: 0.2169  data: 0.0005  max mem: 19097
Epoch: [252] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 3.0700 (2.8053)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2400 (1.2164)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6006 (0.6006)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.8574  data: 5.6921  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7926 (0.8034)  acc1: 86.4000 (84.8727)  acc5: 97.6000 (97.5636)  time: 0.7465  data: 0.6220  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9763 (0.9459)  acc1: 79.2000 (81.5238)  acc5: 95.6000 (96.2095)  time: 0.2141  data: 0.0946  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0598 (0.9559)  acc1: 79.2000 (81.2480)  acc5: 95.6000 (96.0640)  time: 0.2134  data: 0.0945  max mem: 19097
Test: Total time: 0:00:10 (0.4275 s / it)
* Acc@1 81.804 Acc@5 95.988 loss 0.945
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.85%
Epoch: [253]  [   0/1251]  eta: 1:09:46  lr: 0.000273  min_lr: 0.000273  loss: 3.0138 (3.0138)  weight_decay: 0.0500 (0.0500)  time: 3.3465  data: 2.7648  max mem: 19097
Epoch: [253]  [ 200/1251]  eta: 0:04:47  lr: 0.000271  min_lr: 0.000271  loss: 3.0785 (2.8635)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (1.1745)  time: 0.2565  data: 0.0005  max mem: 19097
Epoch: [253]  [ 400/1251]  eta: 0:03:46  lr: 0.000269  min_lr: 0.000269  loss: 3.0194 (2.8682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1079 (1.1762)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [253]  [ 600/1251]  eta: 0:02:51  lr: 0.000267  min_lr: 0.000267  loss: 3.1294 (2.8286)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2328 (1.1866)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [253]  [ 800/1251]  eta: 0:01:58  lr: 0.000265  min_lr: 0.000265  loss: 3.2594 (2.8318)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1350 (1.1867)  time: 0.2698  data: 0.0004  max mem: 19097
Epoch: [253]  [1000/1251]  eta: 0:01:05  lr: 0.000264  min_lr: 0.000264  loss: 2.9805 (2.8243)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1277 (1.1923)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [253]  [1200/1251]  eta: 0:00:13  lr: 0.000262  min_lr: 0.000262  loss: 2.9782 (2.8235)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2364 (1.1998)  time: 0.2573  data: 0.0003  max mem: 19097
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.4758 (2.8231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3398 (1.2058)  time: 0.2167  data: 0.0007  max mem: 19097
Epoch: [253] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.4758 (2.8114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3398 (1.2058)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6285 (0.6285)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.4896  data: 5.3368  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8148 (0.8058)  acc1: 85.6000 (84.7273)  acc5: 97.6000 (97.4546)  time: 0.7487  data: 0.6245  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9519 (0.9497)  acc1: 79.6000 (81.2381)  acc5: 95.6000 (95.9048)  time: 0.2101  data: 0.0901  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0376 (0.9572)  acc1: 79.6000 (81.0080)  acc5: 94.8000 (95.7920)  time: 0.2123  data: 0.0934  max mem: 19097
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 81.686 Acc@5 95.956 loss 0.944
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.85%
Epoch: [254]  [   0/1251]  eta: 1:05:15  lr: 0.000261  min_lr: 0.000261  loss: 2.9293 (2.9293)  weight_decay: 0.0500 (0.0500)  time: 3.1302  data: 1.7776  max mem: 19097
Epoch: [254]  [ 200/1251]  eta: 0:04:47  lr: 0.000260  min_lr: 0.000260  loss: 3.0849 (2.8164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3693 (inf)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [254]  [ 400/1251]  eta: 0:03:45  lr: 0.000258  min_lr: 0.000258  loss: 2.9580 (2.7930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2370 (inf)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [254]  [ 600/1251]  eta: 0:02:51  lr: 0.000256  min_lr: 0.000256  loss: 2.5064 (2.7851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1710 (inf)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [254]  [ 800/1251]  eta: 0:01:58  lr: 0.000254  min_lr: 0.000254  loss: 2.8845 (2.7929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1994 (inf)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [254]  [1000/1251]  eta: 0:01:05  lr: 0.000253  min_lr: 0.000253  loss: 3.3042 (2.7901)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2249 (inf)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [254]  [1200/1251]  eta: 0:00:13  lr: 0.000251  min_lr: 0.000251  loss: 2.8252 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (inf)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.7919 (2.7976)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2268 (inf)  time: 0.2168  data: 0.0009  max mem: 19097
Epoch: [254] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.7919 (2.7943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2268 (inf)
Test:  [ 0/25]  eta: 0:01:43  loss: 0.6312 (0.6312)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 4.1409  data: 3.9665  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7982 (0.7933)  acc1: 85.2000 (84.6545)  acc5: 97.6000 (97.5636)  time: 0.6830  data: 0.5578  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9713 (0.9430)  acc1: 79.2000 (81.4476)  acc5: 96.0000 (95.9619)  time: 0.2602  data: 0.1407  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0429 (0.9532)  acc1: 79.2000 (81.1360)  acc5: 94.8000 (95.8400)  time: 0.2084  data: 0.0885  max mem: 19097
Test: Total time: 0:00:10 (0.4102 s / it)
* Acc@1 81.790 Acc@5 95.902 loss 0.938
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.85%
Epoch: [255]  [   0/1251]  eta: 1:09:03  lr: 0.000250  min_lr: 0.000250  loss: 3.1730 (3.1730)  weight_decay: 0.0500 (0.0500)  time: 3.3121  data: 2.3083  max mem: 19097
Epoch: [255]  [ 200/1251]  eta: 0:04:48  lr: 0.000249  min_lr: 0.000249  loss: 2.7855 (2.8220)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2961 (1.2827)  time: 0.2632  data: 0.0004  max mem: 19097
Epoch: [255]  [ 400/1251]  eta: 0:03:46  lr: 0.000247  min_lr: 0.000247  loss: 2.7032 (2.8068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1587 (1.2617)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [255]  [ 600/1251]  eta: 0:02:51  lr: 0.000245  min_lr: 0.000245  loss: 2.2365 (2.7917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2378 (1.2545)  time: 0.2574  data: 0.0003  max mem: 19097
Epoch: [255]  [ 800/1251]  eta: 0:01:58  lr: 0.000244  min_lr: 0.000244  loss: 2.7249 (2.7950)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2025 (1.2685)  time: 0.2567  data: 0.0003  max mem: 19097
Epoch: [255]  [1000/1251]  eta: 0:01:05  lr: 0.000242  min_lr: 0.000242  loss: 2.6873 (2.7795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1857 (1.2719)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [255]  [1200/1251]  eta: 0:00:13  lr: 0.000240  min_lr: 0.000240  loss: 2.8879 (2.7843)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2204 (1.2653)  time: 0.2565  data: 0.0003  max mem: 19097
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.4119 (2.7824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1742 (1.2619)  time: 0.2164  data: 0.0008  max mem: 19097
Epoch: [255] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.4119 (2.7843)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1742 (1.2619)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5753 (0.5753)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.6578  data: 5.5150  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7834 (0.7662)  acc1: 85.2000 (84.6545)  acc5: 98.0000 (97.5636)  time: 0.7145  data: 0.5914  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9419 (0.9122)  acc1: 78.8000 (81.4476)  acc5: 96.0000 (96.0762)  time: 0.1880  data: 0.0678  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0061 (0.9207)  acc1: 78.8000 (81.1360)  acc5: 94.8000 (95.8880)  time: 0.1872  data: 0.0677  max mem: 19097
Test: Total time: 0:00:09 (0.3992 s / it)
* Acc@1 82.048 Acc@5 96.010 loss 0.901
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.05%
Epoch: [256]  [   0/1251]  eta: 1:06:55  lr: 0.000240  min_lr: 0.000240  loss: 3.0330 (3.0330)  weight_decay: 0.0500 (0.0500)  time: 3.2101  data: 2.9289  max mem: 19097
Epoch: [256]  [ 200/1251]  eta: 0:04:48  lr: 0.000238  min_lr: 0.000238  loss: 2.6969 (2.7301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2773 (1.2603)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [256]  [ 400/1251]  eta: 0:03:46  lr: 0.000236  min_lr: 0.000236  loss: 2.3910 (2.7465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2945 (1.2547)  time: 0.2607  data: 0.0004  max mem: 19097
Epoch: [256]  [ 600/1251]  eta: 0:02:51  lr: 0.000235  min_lr: 0.000235  loss: 2.3112 (2.7309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3872 (1.3143)  time: 0.2643  data: 0.0004  max mem: 19097
Epoch: [256]  [ 800/1251]  eta: 0:01:58  lr: 0.000233  min_lr: 0.000233  loss: 2.9979 (2.7445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1608 (1.3051)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [256]  [1000/1251]  eta: 0:01:05  lr: 0.000231  min_lr: 0.000231  loss: 2.9989 (2.7447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2991 (1.2939)  time: 0.2577  data: 0.0005  max mem: 19097
Epoch: [256]  [1200/1251]  eta: 0:00:13  lr: 0.000230  min_lr: 0.000230  loss: 2.9543 (2.7531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2946 (1.2907)  time: 0.2644  data: 0.0004  max mem: 19097
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.8848 (2.7542)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1664 (1.2853)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [256] Total time: 0:05:26 (0.2608 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.8848 (2.7769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1664 (1.2853)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6270 (0.6270)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4749  data: 5.3127  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8127 (0.8023)  acc1: 85.6000 (84.6909)  acc5: 98.0000 (97.5273)  time: 0.7436  data: 0.6186  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9700 (0.9415)  acc1: 79.2000 (81.4286)  acc5: 96.0000 (96.0191)  time: 0.2048  data: 0.0839  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0399 (0.9489)  acc1: 79.2000 (81.2640)  acc5: 95.2000 (95.9360)  time: 0.2133  data: 0.0934  max mem: 19097
Test: Total time: 0:00:10 (0.4124 s / it)
* Acc@1 81.978 Acc@5 95.986 loss 0.932
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.05%
Epoch: [257]  [   0/1251]  eta: 1:04:31  lr: 0.000229  min_lr: 0.000229  loss: 2.4162 (2.4162)  weight_decay: 0.0500 (0.0500)  time: 3.0950  data: 2.0630  max mem: 19097
Epoch: [257]  [ 200/1251]  eta: 0:04:48  lr: 0.000228  min_lr: 0.000228  loss: 2.5057 (2.8401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1311 (1.1565)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [257]  [ 400/1251]  eta: 0:03:46  lr: 0.000226  min_lr: 0.000226  loss: 3.0158 (2.8050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1848 (1.1926)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [257]  [ 600/1251]  eta: 0:02:51  lr: 0.000224  min_lr: 0.000224  loss: 2.2864 (2.7965)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2043 (1.2002)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [257]  [ 800/1251]  eta: 0:01:58  lr: 0.000223  min_lr: 0.000223  loss: 2.7185 (2.8135)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2449 (1.2223)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [257]  [1000/1251]  eta: 0:01:05  lr: 0.000221  min_lr: 0.000221  loss: 3.0422 (2.8085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2157 (1.2308)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [257]  [1200/1251]  eta: 0:00:13  lr: 0.000219  min_lr: 0.000219  loss: 2.7360 (2.8064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2319 (1.2321)  time: 0.2650  data: 0.0004  max mem: 19097
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.9865 (2.8053)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2319 (1.2317)  time: 0.2217  data: 0.0006  max mem: 19097
Epoch: [257] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.9865 (2.7879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2319 (1.2317)
Test:  [ 0/25]  eta: 0:01:38  loss: 0.6475 (0.6475)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 3.9444  data: 3.7985  max mem: 19097
Test:  [10/25]  eta: 0:00:09  loss: 0.8343 (0.8349)  acc1: 85.6000 (84.5091)  acc5: 98.0000 (97.6364)  time: 0.6036  data: 0.4811  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0490 (0.9730)  acc1: 79.2000 (81.1429)  acc5: 96.0000 (96.0000)  time: 0.2477  data: 0.1268  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0592 (0.9811)  acc1: 79.6000 (81.0560)  acc5: 95.2000 (95.9520)  time: 0.1967  data: 0.0764  max mem: 19097
Test: Total time: 0:00:09 (0.3885 s / it)
* Acc@1 81.898 Acc@5 95.916 loss 0.968
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 82.05%
Epoch: [258]  [   0/1251]  eta: 1:09:14  lr: 0.000219  min_lr: 0.000219  loss: 3.3091 (3.3091)  weight_decay: 0.0500 (0.0500)  time: 3.3211  data: 2.3935  max mem: 19097
Epoch: [258]  [ 200/1251]  eta: 0:04:46  lr: 0.000217  min_lr: 0.000217  loss: 2.2019 (2.7822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2230 (1.2248)  time: 0.2593  data: 0.0004  max mem: 19097
Epoch: [258]  [ 400/1251]  eta: 0:03:46  lr: 0.000216  min_lr: 0.000216  loss: 2.7890 (2.7994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2013 (1.2378)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [258]  [ 600/1251]  eta: 0:02:51  lr: 0.000214  min_lr: 0.000214  loss: 2.5673 (2.8102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3481 (1.2694)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [258]  [ 800/1251]  eta: 0:01:57  lr: 0.000212  min_lr: 0.000212  loss: 2.9522 (2.8113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2045 (1.2766)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [258]  [1000/1251]  eta: 0:01:05  lr: 0.000211  min_lr: 0.000211  loss: 2.8273 (2.8123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3885 (1.2886)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [258]  [1200/1251]  eta: 0:00:13  lr: 0.000209  min_lr: 0.000209  loss: 2.9719 (2.8032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1655 (1.2849)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 3.1630 (2.8089)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1517 (1.2781)  time: 0.2169  data: 0.0006  max mem: 19097
Epoch: [258] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 3.1630 (2.7947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1517 (1.2781)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5616 (0.5616)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.7469  data: 5.6027  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7766 (0.7613)  acc1: 85.6000 (84.8364)  acc5: 97.6000 (97.6727)  time: 0.7147  data: 0.5898  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9503 (0.9071)  acc1: 78.4000 (81.4476)  acc5: 96.0000 (96.1714)  time: 0.1971  data: 0.0763  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0211 (0.9215)  acc1: 78.4000 (81.1680)  acc5: 94.8000 (95.9520)  time: 0.1977  data: 0.0776  max mem: 19097
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 81.974 Acc@5 96.002 loss 0.903
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.05%
Epoch: [259]  [   0/1251]  eta: 1:04:00  lr: 0.000209  min_lr: 0.000209  loss: 3.0855 (3.0855)  weight_decay: 0.0500 (0.0500)  time: 3.0699  data: 1.8808  max mem: 19097
Epoch: [259]  [ 200/1251]  eta: 0:04:46  lr: 0.000207  min_lr: 0.000207  loss: 3.0034 (2.7649)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1890 (1.2603)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [259]  [ 400/1251]  eta: 0:03:45  lr: 0.000206  min_lr: 0.000206  loss: 2.6794 (2.7424)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3133 (1.2735)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [259]  [ 600/1251]  eta: 0:02:50  lr: 0.000204  min_lr: 0.000204  loss: 2.6144 (2.7555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1880 (1.2720)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [259]  [ 800/1251]  eta: 0:01:57  lr: 0.000203  min_lr: 0.000203  loss: 2.6293 (2.7796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1855 (1.2652)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [259]  [1000/1251]  eta: 0:01:05  lr: 0.000201  min_lr: 0.000201  loss: 2.7498 (2.7789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1462 (1.2537)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [259]  [1200/1251]  eta: 0:00:13  lr: 0.000199  min_lr: 0.000199  loss: 2.5252 (2.7596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2415 (1.2642)  time: 0.2569  data: 0.0005  max mem: 19097
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.8585 (2.7635)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2029 (1.2612)  time: 0.2165  data: 0.0007  max mem: 19097
Epoch: [259] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.8585 (2.7656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2029 (1.2612)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6360 (0.6360)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.6289  data: 5.4801  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8374 (0.8252)  acc1: 85.2000 (84.9091)  acc5: 98.0000 (97.6364)  time: 0.7369  data: 0.6127  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9966 (0.9575)  acc1: 79.6000 (81.6381)  acc5: 96.0000 (95.9048)  time: 0.1972  data: 0.0756  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0422 (0.9690)  acc1: 79.6000 (81.3280)  acc5: 95.6000 (95.7920)  time: 0.1993  data: 0.0788  max mem: 19097
Test: Total time: 0:00:10 (0.4087 s / it)
* Acc@1 82.058 Acc@5 95.926 loss 0.955
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.06%
Epoch: [260]  [   0/1251]  eta: 1:01:44  lr: 0.000199  min_lr: 0.000199  loss: 3.4409 (3.4409)  weight_decay: 0.0500 (0.0500)  time: 2.9613  data: 2.6888  max mem: 19097
Epoch: [260]  [ 200/1251]  eta: 0:04:45  lr: 0.000197  min_lr: 0.000197  loss: 2.6916 (2.7704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2629 (1.2457)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [260]  [ 400/1251]  eta: 0:03:45  lr: 0.000196  min_lr: 0.000196  loss: 3.0651 (2.8023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.2554)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [260]  [ 600/1251]  eta: 0:02:50  lr: 0.000194  min_lr: 0.000194  loss: 2.4755 (2.7789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2774 (1.2590)  time: 0.2568  data: 0.0005  max mem: 19097
Epoch: [260]  [ 800/1251]  eta: 0:01:57  lr: 0.000193  min_lr: 0.000193  loss: 2.9268 (2.7768)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2964 (1.2846)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [260]  [1000/1251]  eta: 0:01:05  lr: 0.000191  min_lr: 0.000191  loss: 2.8049 (2.7758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2695 (1.2873)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [260]  [1200/1251]  eta: 0:00:13  lr: 0.000190  min_lr: 0.000190  loss: 2.8481 (2.7830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2748 (1.2830)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 3.1533 (2.7875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2559 (1.2810)  time: 0.2164  data: 0.0006  max mem: 19097
Epoch: [260] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 3.1533 (2.7699)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2559 (1.2810)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6668 (0.6668)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.6472  data: 5.5041  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8429 (0.8467)  acc1: 85.6000 (85.0545)  acc5: 98.0000 (97.8909)  time: 0.7360  data: 0.6115  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0043 (0.9897)  acc1: 79.6000 (81.8476)  acc5: 96.4000 (96.4000)  time: 0.2190  data: 0.0985  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0841 (1.0032)  acc1: 79.6000 (81.5040)  acc5: 96.0000 (96.2880)  time: 0.2189  data: 0.0984  max mem: 19097
Test: Total time: 0:00:10 (0.4237 s / it)
* Acc@1 82.048 Acc@5 96.020 loss 0.990
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.06%
Epoch: [261]  [   0/1251]  eta: 1:10:13  lr: 0.000189  min_lr: 0.000189  loss: 3.4819 (3.4819)  weight_decay: 0.0500 (0.0500)  time: 3.3682  data: 2.7489  max mem: 19097
Epoch: [261]  [ 200/1251]  eta: 0:04:46  lr: 0.000188  min_lr: 0.000188  loss: 2.9350 (2.7230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2054 (1.2279)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [261]  [ 400/1251]  eta: 0:03:45  lr: 0.000186  min_lr: 0.000186  loss: 2.2262 (2.7044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1264 (1.2272)  time: 0.2566  data: 0.0003  max mem: 19097
Epoch: [261]  [ 600/1251]  eta: 0:02:51  lr: 0.000185  min_lr: 0.000185  loss: 3.0726 (2.7244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1591 (1.2342)  time: 0.2629  data: 0.0004  max mem: 19097
Epoch: [261]  [ 800/1251]  eta: 0:01:57  lr: 0.000183  min_lr: 0.000183  loss: 2.3627 (2.7247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2539 (1.2257)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [261]  [1000/1251]  eta: 0:01:05  lr: 0.000182  min_lr: 0.000182  loss: 2.7548 (2.7323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1701 (1.2380)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [261]  [1200/1251]  eta: 0:00:13  lr: 0.000180  min_lr: 0.000180  loss: 2.5178 (2.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1631 (1.2393)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.2895 (2.7384)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1766 (1.2381)  time: 0.2164  data: 0.0007  max mem: 19097
Epoch: [261] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.2895 (2.7651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1766 (1.2381)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5537 (0.5537)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6349  data: 5.4757  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7685 (0.7450)  acc1: 86.4000 (85.0909)  acc5: 97.6000 (97.7455)  time: 0.6741  data: 0.5476  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9052 (0.8890)  acc1: 79.6000 (81.6191)  acc5: 96.4000 (96.1905)  time: 0.1797  data: 0.0588  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9685 (0.9008)  acc1: 78.0000 (81.3280)  acc5: 95.2000 (96.0320)  time: 0.1857  data: 0.0658  max mem: 19097
Test: Total time: 0:00:09 (0.3987 s / it)
* Acc@1 81.954 Acc@5 96.014 loss 0.886
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.06%
Epoch: [262]  [   0/1251]  eta: 1:07:15  lr: 0.000180  min_lr: 0.000180  loss: 1.8232 (1.8232)  weight_decay: 0.0500 (0.0500)  time: 3.2261  data: 2.4259  max mem: 19097
Epoch: [262]  [ 200/1251]  eta: 0:04:46  lr: 0.000179  min_lr: 0.000179  loss: 2.7845 (2.8112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1616 (1.2466)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [262]  [ 400/1251]  eta: 0:03:45  lr: 0.000177  min_lr: 0.000177  loss: 2.9079 (2.7778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3041 (1.2554)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [262]  [ 600/1251]  eta: 0:02:50  lr: 0.000176  min_lr: 0.000176  loss: 2.7946 (2.7706)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2419 (inf)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [262]  [ 800/1251]  eta: 0:01:57  lr: 0.000174  min_lr: 0.000174  loss: 2.9016 (2.7557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1314 (inf)  time: 0.2591  data: 0.0003  max mem: 19097
Epoch: [262]  [1000/1251]  eta: 0:01:05  lr: 0.000173  min_lr: 0.000173  loss: 3.2076 (2.7799)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1810 (inf)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [262]  [1200/1251]  eta: 0:00:13  lr: 0.000171  min_lr: 0.000171  loss: 2.8123 (2.7788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2208 (inf)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.6400 (2.7806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2208 (inf)  time: 0.2163  data: 0.0006  max mem: 19097
Epoch: [262] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.6400 (2.7724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2208 (inf)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5596 (0.5596)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.6985  data: 5.5568  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7730 (0.7595)  acc1: 85.6000 (85.1273)  acc5: 98.0000 (97.6364)  time: 0.7614  data: 0.6361  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9513 (0.8930)  acc1: 80.0000 (81.7714)  acc5: 95.6000 (96.0952)  time: 0.2098  data: 0.0888  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9674 (0.9055)  acc1: 80.0000 (81.5200)  acc5: 95.6000 (95.9360)  time: 0.2084  data: 0.0887  max mem: 19097
Test: Total time: 0:00:10 (0.4200 s / it)
* Acc@1 82.148 Acc@5 96.032 loss 0.891
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.15%
Epoch: [263]  [   0/1251]  eta: 0:59:07  lr: 0.000171  min_lr: 0.000171  loss: 3.4003 (3.4003)  weight_decay: 0.0500 (0.0500)  time: 2.8358  data: 2.5399  max mem: 19097
Epoch: [263]  [ 200/1251]  eta: 0:04:44  lr: 0.000169  min_lr: 0.000169  loss: 2.7790 (2.7744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2549 (1.2195)  time: 0.2556  data: 0.0004  max mem: 19097
Epoch: [263]  [ 400/1251]  eta: 0:03:44  lr: 0.000168  min_lr: 0.000168  loss: 3.0180 (2.7891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2207 (1.2406)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [263]  [ 600/1251]  eta: 0:02:50  lr: 0.000167  min_lr: 0.000167  loss: 2.7663 (2.7977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.2517)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [263]  [ 800/1251]  eta: 0:01:57  lr: 0.000165  min_lr: 0.000165  loss: 2.8141 (2.8026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1991 (1.2421)  time: 0.2675  data: 0.0004  max mem: 19097
Epoch: [263]  [1000/1251]  eta: 0:01:05  lr: 0.000164  min_lr: 0.000164  loss: 3.1217 (2.7929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1879 (1.2423)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [263]  [1200/1251]  eta: 0:00:13  lr: 0.000162  min_lr: 0.000162  loss: 2.3767 (2.7734)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (1.2440)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.8442 (2.7729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (1.2443)  time: 0.2219  data: 0.0007  max mem: 19097
Epoch: [263] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.8442 (2.7630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (1.2443)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6004 (0.6004)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.7365  data: 5.5934  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7700 (0.7704)  acc1: 85.6000 (85.3091)  acc5: 98.0000 (97.6727)  time: 0.7694  data: 0.6370  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9696 (0.9092)  acc1: 80.0000 (81.8286)  acc5: 96.4000 (96.3048)  time: 0.2092  data: 0.0843  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9747 (0.9219)  acc1: 80.0000 (81.4560)  acc5: 96.0000 (96.1280)  time: 0.2064  data: 0.0842  max mem: 19097
Test: Total time: 0:00:10 (0.4190 s / it)
* Acc@1 82.190 Acc@5 96.006 loss 0.906
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.19%
Epoch: [264]  [   0/1251]  eta: 0:59:58  lr: 0.000162  min_lr: 0.000162  loss: 2.8561 (2.8561)  weight_decay: 0.0500 (0.0500)  time: 2.8763  data: 2.5840  max mem: 19097
Epoch: [264]  [ 200/1251]  eta: 0:04:43  lr: 0.000160  min_lr: 0.000160  loss: 2.8963 (2.7912)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1617 (1.1912)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [264]  [ 400/1251]  eta: 0:03:44  lr: 0.000159  min_lr: 0.000159  loss: 2.9230 (2.7828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2119 (1.2474)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [264]  [ 600/1251]  eta: 0:02:50  lr: 0.000158  min_lr: 0.000158  loss: 3.0606 (2.7859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1884 (1.2515)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [264]  [ 800/1251]  eta: 0:01:57  lr: 0.000156  min_lr: 0.000156  loss: 2.4254 (2.7835)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1918 (1.2610)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [264]  [1000/1251]  eta: 0:01:05  lr: 0.000155  min_lr: 0.000155  loss: 2.8731 (2.7843)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2396 (1.2626)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [264]  [1200/1251]  eta: 0:00:13  lr: 0.000154  min_lr: 0.000154  loss: 2.2537 (2.7767)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1952 (1.2574)  time: 0.2585  data: 0.0004  max mem: 19097
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.7062 (2.7753)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1630 (1.2560)  time: 0.2167  data: 0.0005  max mem: 19097
Epoch: [264] Total time: 0:05:24 (0.2595 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.7062 (2.7724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1630 (1.2560)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5411 (0.5411)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7985  data: 5.6601  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7435 (0.7365)  acc1: 86.4000 (85.3091)  acc5: 97.6000 (97.6000)  time: 0.7429  data: 0.6201  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9109 (0.8785)  acc1: 79.6000 (81.7714)  acc5: 96.0000 (96.1524)  time: 0.1984  data: 0.0784  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9416 (0.8896)  acc1: 79.6000 (81.4560)  acc5: 95.2000 (96.0160)  time: 0.2024  data: 0.0831  max mem: 19097
Test: Total time: 0:00:10 (0.4169 s / it)
* Acc@1 82.242 Acc@5 96.060 loss 0.876
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.24%
Epoch: [265]  [   0/1251]  eta: 1:01:32  lr: 0.000153  min_lr: 0.000153  loss: 3.5025 (3.5025)  weight_decay: 0.0500 (0.0500)  time: 2.9515  data: 2.6560  max mem: 19097
Epoch: [265]  [ 200/1251]  eta: 0:04:45  lr: 0.000152  min_lr: 0.000152  loss: 2.8548 (2.7778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2067 (1.2560)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [265]  [ 400/1251]  eta: 0:03:45  lr: 0.000150  min_lr: 0.000150  loss: 2.1944 (2.7850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1789 (1.2299)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [265]  [ 600/1251]  eta: 0:02:50  lr: 0.000149  min_lr: 0.000149  loss: 2.9963 (2.7515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1491 (1.2356)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [265]  [ 800/1251]  eta: 0:01:57  lr: 0.000148  min_lr: 0.000148  loss: 2.5074 (2.7419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2742 (1.2451)  time: 0.2648  data: 0.0004  max mem: 19097
Epoch: [265]  [1000/1251]  eta: 0:01:05  lr: 0.000146  min_lr: 0.000146  loss: 2.8948 (2.7382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4768 (1.2684)  time: 0.2628  data: 0.0004  max mem: 19097
Epoch: [265]  [1200/1251]  eta: 0:00:13  lr: 0.000145  min_lr: 0.000145  loss: 2.4932 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1089 (1.2620)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.9050 (2.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (1.2624)  time: 0.2166  data: 0.0006  max mem: 19097
Epoch: [265] Total time: 0:05:25 (0.2599 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.9050 (2.7511)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (1.2624)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.5958 (0.5958)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.9290  data: 5.7925  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7948 (0.7859)  acc1: 86.4000 (85.0545)  acc5: 98.0000 (97.7455)  time: 0.7905  data: 0.6671  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9698 (0.9282)  acc1: 79.6000 (81.4667)  acc5: 96.0000 (96.3048)  time: 0.2066  data: 0.0864  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0125 (0.9374)  acc1: 79.6000 (81.2640)  acc5: 95.6000 (96.1760)  time: 0.2058  data: 0.0863  max mem: 19097
Test: Total time: 0:00:10 (0.4248 s / it)
* Acc@1 82.184 Acc@5 96.062 loss 0.920
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.24%
Epoch: [266]  [   0/1251]  eta: 1:05:50  lr: 0.000145  min_lr: 0.000145  loss: 2.3095 (2.3095)  weight_decay: 0.0500 (0.0500)  time: 3.1577  data: 2.7581  max mem: 19097
Epoch: [266]  [ 200/1251]  eta: 0:04:46  lr: 0.000143  min_lr: 0.000143  loss: 2.5705 (2.8051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1689 (1.2428)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [266]  [ 400/1251]  eta: 0:03:45  lr: 0.000142  min_lr: 0.000142  loss: 2.2103 (2.7828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1880 (1.2483)  time: 0.2559  data: 0.0004  max mem: 19097
Epoch: [266]  [ 600/1251]  eta: 0:02:50  lr: 0.000141  min_lr: 0.000141  loss: 2.9104 (2.7690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2055 (1.2559)  time: 0.2578  data: 0.0003  max mem: 19097
Epoch: [266]  [ 800/1251]  eta: 0:01:57  lr: 0.000139  min_lr: 0.000139  loss: 2.5448 (2.7560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2855 (1.2642)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [266]  [1000/1251]  eta: 0:01:05  lr: 0.000138  min_lr: 0.000138  loss: 2.6912 (2.7411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1441 (1.2489)  time: 0.2636  data: 0.0004  max mem: 19097
Epoch: [266]  [1200/1251]  eta: 0:00:13  lr: 0.000137  min_lr: 0.000137  loss: 2.8111 (2.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2944 (1.2480)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.3992 (2.7456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1906 (1.2454)  time: 0.2164  data: 0.0006  max mem: 19097
Epoch: [266] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.3992 (2.7486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1906 (1.2454)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5407 (0.5407)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.5946  data: 5.4491  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7374 (0.7386)  acc1: 85.6000 (84.9455)  acc5: 98.0000 (97.6727)  time: 0.7542  data: 0.6298  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9199 (0.8759)  acc1: 80.0000 (81.7143)  acc5: 96.0000 (96.2476)  time: 0.2068  data: 0.0864  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9553 (0.8860)  acc1: 80.0000 (81.4880)  acc5: 95.6000 (96.0480)  time: 0.2060  data: 0.0863  max mem: 19097
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 82.188 Acc@5 96.116 loss 0.868
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.24%
Epoch: [267]  [   0/1251]  eta: 1:06:37  lr: 0.000136  min_lr: 0.000136  loss: 3.1712 (3.1712)  weight_decay: 0.0500 (0.0500)  time: 3.1958  data: 2.2377  max mem: 19097
Epoch: [267]  [ 200/1251]  eta: 0:04:46  lr: 0.000135  min_lr: 0.000135  loss: 3.0172 (2.7320)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2022 (1.2693)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [267]  [ 400/1251]  eta: 0:03:45  lr: 0.000134  min_lr: 0.000134  loss: 2.7675 (2.7565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2604 (1.2803)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [267]  [ 600/1251]  eta: 0:02:51  lr: 0.000133  min_lr: 0.000133  loss: 2.6661 (2.7767)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3065 (1.2795)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [267]  [ 800/1251]  eta: 0:01:57  lr: 0.000131  min_lr: 0.000131  loss: 2.3863 (2.7619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2536 (1.2819)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [267]  [1000/1251]  eta: 0:01:05  lr: 0.000130  min_lr: 0.000130  loss: 3.1285 (2.7565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2592 (1.2785)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [267]  [1200/1251]  eta: 0:00:13  lr: 0.000129  min_lr: 0.000129  loss: 3.0298 (2.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2363 (1.2762)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.3828 (2.7480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2610 (1.2750)  time: 0.2165  data: 0.0007  max mem: 19097
Epoch: [267] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.3828 (2.7546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2610 (1.2750)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5301 (0.5301)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.6035  data: 5.4655  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7538 (0.7415)  acc1: 84.8000 (85.2727)  acc5: 98.0000 (97.6364)  time: 0.6993  data: 0.5770  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9062 (0.8828)  acc1: 79.6000 (81.8857)  acc5: 96.4000 (96.1905)  time: 0.1885  data: 0.0675  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9630 (0.8918)  acc1: 79.6000 (81.6640)  acc5: 95.6000 (95.9840)  time: 0.1900  data: 0.0699  max mem: 19097
Test: Total time: 0:00:10 (0.4000 s / it)
* Acc@1 82.330 Acc@5 96.036 loss 0.874
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.33%
Epoch: [268]  [   0/1251]  eta: 1:00:06  lr: 0.000128  min_lr: 0.000128  loss: 2.5435 (2.5435)  weight_decay: 0.0500 (0.0500)  time: 2.8827  data: 2.6155  max mem: 19097
Epoch: [268]  [ 200/1251]  eta: 0:04:45  lr: 0.000127  min_lr: 0.000127  loss: 2.6788 (2.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3278 (1.2428)  time: 0.2585  data: 0.0004  max mem: 19097
Epoch: [268]  [ 400/1251]  eta: 0:03:44  lr: 0.000126  min_lr: 0.000126  loss: 3.0550 (2.7497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1458 (1.2242)  time: 0.2564  data: 0.0003  max mem: 19097
Epoch: [268]  [ 600/1251]  eta: 0:02:50  lr: 0.000125  min_lr: 0.000125  loss: 2.8053 (2.7551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2754 (1.2491)  time: 0.2655  data: 0.0004  max mem: 19097
Epoch: [268]  [ 800/1251]  eta: 0:01:58  lr: 0.000123  min_lr: 0.000123  loss: 2.5983 (2.7491)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2360 (1.2599)  time: 0.2584  data: 0.0004  max mem: 19097
Epoch: [268]  [1000/1251]  eta: 0:01:05  lr: 0.000122  min_lr: 0.000122  loss: 2.5228 (2.7423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2242 (1.2681)  time: 0.2571  data: 0.0003  max mem: 19097
Epoch: [268]  [1200/1251]  eta: 0:00:13  lr: 0.000121  min_lr: 0.000121  loss: 2.8467 (2.7483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2914 (inf)  time: 0.2567  data: 0.0003  max mem: 19097
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.9372 (2.7540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2396 (inf)  time: 0.2222  data: 0.0007  max mem: 19097
Epoch: [268] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.9372 (2.7445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2396 (inf)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6586 (0.6586)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5823  data: 5.4174  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8339 (0.8379)  acc1: 84.8000 (84.9455)  acc5: 97.6000 (97.5636)  time: 0.7396  data: 0.6126  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0127 (0.9830)  acc1: 78.8000 (81.3905)  acc5: 96.0000 (96.2286)  time: 0.2107  data: 0.0898  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0730 (0.9936)  acc1: 78.8000 (81.2000)  acc5: 95.6000 (96.1120)  time: 0.2094  data: 0.0897  max mem: 19097
Test: Total time: 0:00:10 (0.4144 s / it)
* Acc@1 82.202 Acc@5 96.092 loss 0.978
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.33%
Epoch: [269]  [   0/1251]  eta: 1:08:12  lr: 0.000121  min_lr: 0.000121  loss: 2.8437 (2.8437)  weight_decay: 0.0500 (0.0500)  time: 3.2715  data: 2.2650  max mem: 19097
Epoch: [269]  [ 200/1251]  eta: 0:04:46  lr: 0.000120  min_lr: 0.000120  loss: 2.8424 (2.7924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2390 (1.2776)  time: 0.2583  data: 0.0004  max mem: 19097
Epoch: [269]  [ 400/1251]  eta: 0:03:45  lr: 0.000118  min_lr: 0.000118  loss: 2.5033 (2.7500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1074 (1.2430)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [269]  [ 600/1251]  eta: 0:02:51  lr: 0.000117  min_lr: 0.000117  loss: 2.2368 (2.7325)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1966 (1.2438)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [269]  [ 800/1251]  eta: 0:01:58  lr: 0.000116  min_lr: 0.000116  loss: 2.9876 (2.7354)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3242 (1.2637)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [269]  [1000/1251]  eta: 0:01:05  lr: 0.000115  min_lr: 0.000115  loss: 2.9691 (2.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2711 (1.2623)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [269]  [1200/1251]  eta: 0:00:13  lr: 0.000113  min_lr: 0.000113  loss: 2.9690 (2.7478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1508 (1.2574)  time: 0.2632  data: 0.0004  max mem: 19097
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.9056 (2.7528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1971 (1.2593)  time: 0.2171  data: 0.0007  max mem: 19097
Epoch: [269] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.9056 (2.7440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1971 (1.2593)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5801 (0.5801)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.6883  data: 5.5372  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7970 (0.7874)  acc1: 84.8000 (85.3455)  acc5: 97.6000 (97.7455)  time: 0.7499  data: 0.6259  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9477 (0.9340)  acc1: 80.4000 (81.7714)  acc5: 96.0000 (96.2095)  time: 0.2049  data: 0.0850  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0141 (0.9450)  acc1: 80.0000 (81.4720)  acc5: 95.6000 (96.0640)  time: 0.2037  data: 0.0849  max mem: 19097
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 82.170 Acc@5 96.080 loss 0.928
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.33%
Epoch: [270]  [   0/1251]  eta: 1:09:05  lr: 0.000113  min_lr: 0.000113  loss: 1.8180 (1.8180)  weight_decay: 0.0500 (0.0500)  time: 3.3134  data: 3.0000  max mem: 19097
Epoch: [270]  [ 200/1251]  eta: 0:04:47  lr: 0.000112  min_lr: 0.000112  loss: 2.7926 (2.6818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2779 (1.2387)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [270]  [ 400/1251]  eta: 0:03:46  lr: 0.000111  min_lr: 0.000111  loss: 2.8613 (2.7204)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1784 (1.2269)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [270]  [ 600/1251]  eta: 0:02:51  lr: 0.000110  min_lr: 0.000110  loss: 2.9752 (2.7207)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2406 (1.2359)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [270]  [ 800/1251]  eta: 0:01:58  lr: 0.000109  min_lr: 0.000109  loss: 2.1929 (2.7262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3227 (1.2426)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [270]  [1000/1251]  eta: 0:01:05  lr: 0.000107  min_lr: 0.000107  loss: 2.1416 (2.7046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2812 (1.2495)  time: 0.2583  data: 0.0004  max mem: 19097
Epoch: [270]  [1200/1251]  eta: 0:00:13  lr: 0.000106  min_lr: 0.000106  loss: 2.8816 (2.7069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1548 (1.2463)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 3.1272 (2.7138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2225 (1.2475)  time: 0.2228  data: 0.0007  max mem: 19097
Epoch: [270] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 3.1272 (2.7367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2225 (1.2475)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6433 (0.6433)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.6049  data: 5.4668  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8353 (0.8260)  acc1: 85.6000 (85.1636)  acc5: 97.6000 (97.7455)  time: 0.7144  data: 0.5912  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0150 (0.9657)  acc1: 79.6000 (81.5429)  acc5: 96.0000 (96.2857)  time: 0.1883  data: 0.0679  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0384 (0.9761)  acc1: 79.6000 (81.3920)  acc5: 95.6000 (96.1120)  time: 0.1884  data: 0.0679  max mem: 19097
Test: Total time: 0:00:09 (0.3996 s / it)
* Acc@1 82.272 Acc@5 96.072 loss 0.959
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.33%
Epoch: [271]  [   0/1251]  eta: 1:04:28  lr: 0.000106  min_lr: 0.000106  loss: 2.7565 (2.7565)  weight_decay: 0.0500 (0.0500)  time: 3.0927  data: 1.5538  max mem: 19097
Epoch: [271]  [ 200/1251]  eta: 0:04:46  lr: 0.000105  min_lr: 0.000105  loss: 2.9945 (2.7086)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1203 (1.2149)  time: 0.2580  data: 0.0004  max mem: 19097
Epoch: [271]  [ 400/1251]  eta: 0:03:45  lr: 0.000104  min_lr: 0.000104  loss: 3.0046 (2.7233)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2834 (1.2395)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [271]  [ 600/1251]  eta: 0:02:50  lr: 0.000102  min_lr: 0.000102  loss: 2.7740 (2.7274)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2583 (1.2438)  time: 0.2638  data: 0.0004  max mem: 19097
Epoch: [271]  [ 800/1251]  eta: 0:01:57  lr: 0.000101  min_lr: 0.000101  loss: 3.0089 (2.7251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2350 (1.2492)  time: 0.2576  data: 0.0003  max mem: 19097
Epoch: [271]  [1000/1251]  eta: 0:01:05  lr: 0.000100  min_lr: 0.000100  loss: 3.1060 (2.7251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2528 (1.2616)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [271]  [1200/1251]  eta: 0:00:13  lr: 0.000099  min_lr: 0.000099  loss: 3.1176 (2.7327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2433 (1.2688)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.9423 (2.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2323 (1.2667)  time: 0.2166  data: 0.0006  max mem: 19097
Epoch: [271] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.9423 (2.7511)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2323 (1.2667)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6239 (0.6239)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.2720  data: 5.0971  max mem: 19097
Test:  [10/25]  eta: 0:00:09  loss: 0.8039 (0.8056)  acc1: 85.6000 (85.3091)  acc5: 98.0000 (97.7818)  time: 0.6378  data: 0.5105  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9753 (0.9437)  acc1: 79.2000 (81.7333)  acc5: 96.0000 (96.1905)  time: 0.1965  data: 0.0760  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0022 (0.9540)  acc1: 79.2000 (81.5680)  acc5: 95.6000 (96.0800)  time: 0.2068  data: 0.0870  max mem: 19097
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 82.230 Acc@5 96.104 loss 0.939
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.33%
Epoch: [272]  [   0/1251]  eta: 1:05:48  lr: 0.000099  min_lr: 0.000099  loss: 3.0936 (3.0936)  weight_decay: 0.0500 (0.0500)  time: 3.1561  data: 1.7312  max mem: 19097
Epoch: [272]  [ 200/1251]  eta: 0:04:46  lr: 0.000098  min_lr: 0.000098  loss: 3.0702 (2.7731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1925 (1.2523)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [272]  [ 400/1251]  eta: 0:03:45  lr: 0.000097  min_lr: 0.000097  loss: 3.2069 (2.7895)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2707 (1.2666)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [272]  [ 600/1251]  eta: 0:02:51  lr: 0.000096  min_lr: 0.000096  loss: 2.7598 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2558 (1.2677)  time: 0.2570  data: 0.0005  max mem: 19097
Epoch: [272]  [ 800/1251]  eta: 0:01:57  lr: 0.000094  min_lr: 0.000094  loss: 2.3672 (2.7586)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2889 (1.2667)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [272]  [1000/1251]  eta: 0:01:05  lr: 0.000093  min_lr: 0.000093  loss: 3.1202 (2.7537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2211 (1.2585)  time: 0.2631  data: 0.0004  max mem: 19097
Epoch: [272]  [1200/1251]  eta: 0:00:13  lr: 0.000092  min_lr: 0.000092  loss: 2.9349 (2.7512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2894 (1.2629)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 3.0338 (2.7541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3146 (1.2634)  time: 0.2167  data: 0.0009  max mem: 19097
Epoch: [272] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 3.0338 (2.7432)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3146 (1.2634)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6161 (0.6161)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.6946  data: 5.5563  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7920 (0.7985)  acc1: 85.6000 (84.9455)  acc5: 98.0000 (97.8546)  time: 0.7282  data: 0.6045  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9813 (0.9426)  acc1: 80.0000 (81.6000)  acc5: 96.0000 (96.1905)  time: 0.1932  data: 0.0706  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0254 (0.9533)  acc1: 80.0000 (81.3600)  acc5: 95.6000 (96.0960)  time: 0.1927  data: 0.0705  max mem: 19097
Test: Total time: 0:00:10 (0.4060 s / it)
* Acc@1 82.238 Acc@5 96.068 loss 0.938
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.33%
Epoch: [273]  [   0/1251]  eta: 1:04:05  lr: 0.000092  min_lr: 0.000092  loss: 2.6515 (2.6515)  weight_decay: 0.0500 (0.0500)  time: 3.0741  data: 1.7810  max mem: 19097
Epoch: [273]  [ 200/1251]  eta: 0:04:46  lr: 0.000091  min_lr: 0.000091  loss: 2.9060 (2.7632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2234 (1.2878)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [273]  [ 400/1251]  eta: 0:03:45  lr: 0.000090  min_lr: 0.000090  loss: 3.0315 (2.7369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.2386)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [273]  [ 600/1251]  eta: 0:02:50  lr: 0.000089  min_lr: 0.000089  loss: 2.8989 (2.7533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1787 (1.2267)  time: 0.2560  data: 0.0003  max mem: 19097
Epoch: [273]  [ 800/1251]  eta: 0:01:57  lr: 0.000088  min_lr: 0.000088  loss: 2.8588 (2.7460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (1.2304)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [273]  [1000/1251]  eta: 0:01:05  lr: 0.000087  min_lr: 0.000087  loss: 2.8653 (2.7446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2375 (1.2262)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [273]  [1200/1251]  eta: 0:00:13  lr: 0.000086  min_lr: 0.000086  loss: 2.6871 (2.7495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2302 (1.2326)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.7779 (2.7490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2233 (1.2327)  time: 0.2167  data: 0.0007  max mem: 19097
Epoch: [273] Total time: 0:05:24 (0.2596 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.7779 (2.7431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2233 (1.2327)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5722 (0.5722)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.4856  data: 5.3474  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7795 (0.7615)  acc1: 85.6000 (85.3818)  acc5: 97.6000 (97.7091)  time: 0.7517  data: 0.6280  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9372 (0.9011)  acc1: 78.8000 (81.7333)  acc5: 96.0000 (96.1714)  time: 0.2103  data: 0.0895  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9877 (0.9146)  acc1: 78.8000 (81.3440)  acc5: 96.0000 (96.1280)  time: 0.2154  data: 0.0954  max mem: 19097
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 82.302 Acc@5 96.146 loss 0.897
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.33%
Epoch: [274]  [   0/1251]  eta: 1:05:37  lr: 0.000085  min_lr: 0.000085  loss: 2.6763 (2.6763)  weight_decay: 0.0500 (0.0500)  time: 3.1475  data: 1.7947  max mem: 19097
Epoch: [274]  [ 200/1251]  eta: 0:04:47  lr: 0.000084  min_lr: 0.000084  loss: 2.9706 (2.7665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1989 (1.2078)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [274]  [ 400/1251]  eta: 0:03:45  lr: 0.000083  min_lr: 0.000083  loss: 2.8244 (2.7806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1856 (1.2216)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [274]  [ 600/1251]  eta: 0:02:51  lr: 0.000082  min_lr: 0.000082  loss: 3.0689 (2.7780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1195 (1.2102)  time: 0.2568  data: 0.0005  max mem: 19097
Epoch: [274]  [ 800/1251]  eta: 0:01:58  lr: 0.000081  min_lr: 0.000081  loss: 2.6181 (2.7694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1726 (1.2113)  time: 0.2702  data: 0.0004  max mem: 19097
Epoch: [274]  [1000/1251]  eta: 0:01:05  lr: 0.000080  min_lr: 0.000080  loss: 3.0797 (2.7619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3372 (1.2427)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [274]  [1200/1251]  eta: 0:00:13  lr: 0.000079  min_lr: 0.000079  loss: 2.5158 (2.7421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2826 (1.2490)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.3800 (2.7348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2859 (1.2508)  time: 0.2170  data: 0.0005  max mem: 19097
Epoch: [274] Total time: 0:05:26 (0.2610 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.3800 (2.7374)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2859 (1.2508)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5140 (0.5140)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.7289  data: 5.5867  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7336 (0.7129)  acc1: 85.2000 (85.2727)  acc5: 98.0000 (97.8182)  time: 0.7254  data: 0.5994  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.8875 (0.8581)  acc1: 80.8000 (82.0381)  acc5: 96.0000 (96.2857)  time: 0.1930  data: 0.0714  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9379 (0.8710)  acc1: 80.8000 (81.7920)  acc5: 95.6000 (96.1920)  time: 0.1923  data: 0.0713  max mem: 19097
Test: Total time: 0:00:10 (0.4078 s / it)
* Acc@1 82.378 Acc@5 96.166 loss 0.856
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.38%
Epoch: [275]  [   0/1251]  eta: 1:06:31  lr: 0.000079  min_lr: 0.000079  loss: 1.7340 (1.7340)  weight_decay: 0.0500 (0.0500)  time: 3.1904  data: 2.8742  max mem: 19097
Epoch: [275]  [ 200/1251]  eta: 0:04:45  lr: 0.000078  min_lr: 0.000078  loss: 2.9671 (2.7683)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2042 (1.2408)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [275]  [ 400/1251]  eta: 0:03:45  lr: 0.000077  min_lr: 0.000077  loss: 2.8425 (2.7372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2422 (1.2524)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [275]  [ 600/1251]  eta: 0:02:50  lr: 0.000076  min_lr: 0.000076  loss: 2.7426 (2.7605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1280 (1.2402)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [275]  [ 800/1251]  eta: 0:01:57  lr: 0.000075  min_lr: 0.000075  loss: 2.5245 (2.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1901 (1.2526)  time: 0.2636  data: 0.0004  max mem: 19097
Epoch: [275]  [1000/1251]  eta: 0:01:05  lr: 0.000074  min_lr: 0.000074  loss: 2.2322 (2.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1981 (1.2626)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [275]  [1200/1251]  eta: 0:00:13  lr: 0.000073  min_lr: 0.000073  loss: 2.3142 (2.7442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2539 (1.2595)  time: 0.2559  data: 0.0004  max mem: 19097
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.9907 (2.7465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2907 (1.2627)  time: 0.2168  data: 0.0005  max mem: 19097
Epoch: [275] Total time: 0:05:24 (0.2597 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.9907 (2.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2907 (1.2627)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5891 (0.5891)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.4492  data: 5.2860  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7949 (0.7718)  acc1: 86.8000 (85.6000)  acc5: 98.0000 (97.7455)  time: 0.7409  data: 0.6153  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9490 (0.9121)  acc1: 80.8000 (82.2667)  acc5: 95.6000 (96.3238)  time: 0.2208  data: 0.1006  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0007 (0.9242)  acc1: 80.0000 (81.8240)  acc5: 95.6000 (96.1600)  time: 0.2192  data: 0.1005  max mem: 19097
Test: Total time: 0:00:10 (0.4167 s / it)
* Acc@1 82.348 Acc@5 96.152 loss 0.911
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.38%
Epoch: [276]  [   0/1251]  eta: 1:01:04  lr: 0.000073  min_lr: 0.000073  loss: 3.1311 (3.1311)  weight_decay: 0.0500 (0.0500)  time: 2.9295  data: 1.5624  max mem: 19097
Epoch: [276]  [ 200/1251]  eta: 0:04:46  lr: 0.000072  min_lr: 0.000072  loss: 2.9718 (2.7474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1545 (1.2270)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [276]  [ 400/1251]  eta: 0:03:45  lr: 0.000071  min_lr: 0.000071  loss: 2.4149 (2.7436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2635 (1.2507)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [276]  [ 600/1251]  eta: 0:02:50  lr: 0.000070  min_lr: 0.000070  loss: 2.5630 (2.7205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2710 (1.2508)  time: 0.2558  data: 0.0003  max mem: 19097
Epoch: [276]  [ 800/1251]  eta: 0:01:57  lr: 0.000069  min_lr: 0.000069  loss: 2.6195 (2.7066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1874 (1.2381)  time: 0.2560  data: 0.0003  max mem: 19097
Epoch: [276]  [1000/1251]  eta: 0:01:05  lr: 0.000068  min_lr: 0.000068  loss: 2.6984 (2.7214)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2102 (1.2311)  time: 0.2568  data: 0.0003  max mem: 19097
Epoch: [276]  [1200/1251]  eta: 0:00:13  lr: 0.000067  min_lr: 0.000067  loss: 2.6146 (2.7303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2447 (1.2386)  time: 0.2577  data: 0.0003  max mem: 19097
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.7796 (2.7300)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2421 (1.2384)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [276] Total time: 0:05:24 (0.2597 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.7796 (2.7234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2421 (1.2384)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6095 (0.6095)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.4901  data: 5.3473  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8097 (0.8038)  acc1: 85.2000 (85.4545)  acc5: 98.0000 (97.8546)  time: 0.7527  data: 0.6277  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9871 (0.9435)  acc1: 80.8000 (82.0381)  acc5: 96.0000 (96.3429)  time: 0.2179  data: 0.0970  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0221 (0.9544)  acc1: 80.8000 (81.7120)  acc5: 95.6000 (96.1760)  time: 0.2170  data: 0.0969  max mem: 19097
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 82.312 Acc@5 96.108 loss 0.941
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.38%
Epoch: [277]  [   0/1251]  eta: 1:09:36  lr: 0.000067  min_lr: 0.000067  loss: 2.0887 (2.0887)  weight_decay: 0.0500 (0.0500)  time: 3.3388  data: 2.5609  max mem: 19097
Epoch: [277]  [ 200/1251]  eta: 0:04:46  lr: 0.000066  min_lr: 0.000066  loss: 2.8934 (2.8000)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2493 (1.3061)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [277]  [ 400/1251]  eta: 0:03:46  lr: 0.000065  min_lr: 0.000065  loss: 2.0902 (2.7617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2607 (1.3148)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [277]  [ 600/1251]  eta: 0:02:51  lr: 0.000064  min_lr: 0.000064  loss: 2.5103 (2.7539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2279 (1.2890)  time: 0.2562  data: 0.0004  max mem: 19097
Epoch: [277]  [ 800/1251]  eta: 0:01:57  lr: 0.000064  min_lr: 0.000064  loss: 3.0317 (2.7742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2164 (1.2937)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [277]  [1000/1251]  eta: 0:01:05  lr: 0.000063  min_lr: 0.000063  loss: 2.8909 (2.7570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2021 (1.2859)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [277]  [1200/1251]  eta: 0:00:13  lr: 0.000062  min_lr: 0.000062  loss: 2.4326 (2.7510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1994 (inf)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 3.0967 (2.7502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2596 (inf)  time: 0.2169  data: 0.0007  max mem: 19097
Epoch: [277] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 3.0967 (2.7386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2596 (inf)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6233 (0.6233)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.6372  data: 5.4918  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8029 (0.8083)  acc1: 85.6000 (85.3091)  acc5: 98.0000 (97.8182)  time: 0.7419  data: 0.6164  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9857 (0.9518)  acc1: 80.8000 (82.0381)  acc5: 96.4000 (96.2286)  time: 0.2202  data: 0.0956  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0371 (0.9618)  acc1: 80.8000 (81.7920)  acc5: 95.6000 (96.1440)  time: 0.2194  data: 0.0955  max mem: 19097
Test: Total time: 0:00:10 (0.4235 s / it)
* Acc@1 82.312 Acc@5 96.158 loss 0.950
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.38%
Epoch: [278]  [   0/1251]  eta: 1:07:14  lr: 0.000062  min_lr: 0.000062  loss: 3.1617 (3.1617)  weight_decay: 0.0500 (0.0500)  time: 3.2247  data: 2.8924  max mem: 19097
Epoch: [278]  [ 200/1251]  eta: 0:04:46  lr: 0.000061  min_lr: 0.000061  loss: 2.7948 (2.7356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2546 (1.2435)  time: 0.2574  data: 0.0003  max mem: 19097
Epoch: [278]  [ 400/1251]  eta: 0:03:46  lr: 0.000060  min_lr: 0.000060  loss: 2.3390 (2.7522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2270 (1.2687)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [278]  [ 600/1251]  eta: 0:02:51  lr: 0.000059  min_lr: 0.000059  loss: 2.8803 (2.7555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1977 (1.2643)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [278]  [ 800/1251]  eta: 0:01:57  lr: 0.000058  min_lr: 0.000058  loss: 3.0124 (2.7544)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1636 (1.2588)  time: 0.2567  data: 0.0003  max mem: 19097
Epoch: [278]  [1000/1251]  eta: 0:01:05  lr: 0.000057  min_lr: 0.000057  loss: 3.1181 (2.7518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2155 (1.2528)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [278]  [1200/1251]  eta: 0:00:13  lr: 0.000056  min_lr: 0.000056  loss: 2.5189 (2.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1941 (1.2493)  time: 0.2568  data: 0.0005  max mem: 19097
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.3891 (2.7368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1812 (1.2466)  time: 0.2168  data: 0.0007  max mem: 19097
Epoch: [278] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.3891 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1812 (1.2466)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5290 (0.5290)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.8409  data: 5.6896  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7245 (0.7107)  acc1: 84.8000 (85.5273)  acc5: 98.0000 (97.7818)  time: 0.7422  data: 0.6181  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.8784 (0.8495)  acc1: 80.8000 (82.1714)  acc5: 96.4000 (96.2667)  time: 0.1940  data: 0.0740  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9385 (0.8607)  acc1: 80.4000 (81.8240)  acc5: 96.0000 (96.1600)  time: 0.1927  data: 0.0740  max mem: 19097
Test: Total time: 0:00:10 (0.4114 s / it)
* Acc@1 82.412 Acc@5 96.156 loss 0.846
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.41%
Epoch: [279]  [   0/1251]  eta: 1:05:18  lr: 0.000056  min_lr: 0.000056  loss: 2.5384 (2.5384)  weight_decay: 0.0500 (0.0500)  time: 3.1327  data: 2.8178  max mem: 19097
Epoch: [279]  [ 200/1251]  eta: 0:04:45  lr: 0.000055  min_lr: 0.000055  loss: 3.0443 (2.7415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2490 (1.2626)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [279]  [ 400/1251]  eta: 0:03:45  lr: 0.000055  min_lr: 0.000055  loss: 2.2523 (2.7311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2678 (1.2591)  time: 0.2653  data: 0.0004  max mem: 19097
Epoch: [279]  [ 600/1251]  eta: 0:02:51  lr: 0.000054  min_lr: 0.000054  loss: 2.1635 (2.7231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2937 (1.2630)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [279]  [ 800/1251]  eta: 0:01:58  lr: 0.000053  min_lr: 0.000053  loss: 2.5184 (2.7068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2241 (1.2685)  time: 0.2577  data: 0.0005  max mem: 19097
Epoch: [279]  [1000/1251]  eta: 0:01:05  lr: 0.000052  min_lr: 0.000052  loss: 2.9468 (2.7144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2296 (1.2814)  time: 0.2561  data: 0.0004  max mem: 19097
Epoch: [279]  [1200/1251]  eta: 0:00:13  lr: 0.000051  min_lr: 0.000051  loss: 2.7076 (2.7272)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2413 (1.2796)  time: 0.2655  data: 0.0004  max mem: 19097
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.9045 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2070 (1.2788)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [279] Total time: 0:05:25 (0.2605 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.9045 (2.7280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2070 (1.2788)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6410 (0.6410)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.5566  data: 5.3791  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.8291 (0.8279)  acc1: 85.6000 (85.4182)  acc5: 98.0000 (97.7091)  time: 0.7299  data: 0.6017  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9990 (0.9727)  acc1: 79.6000 (81.9810)  acc5: 96.0000 (96.3429)  time: 0.1998  data: 0.0788  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0653 (0.9843)  acc1: 80.0000 (81.7760)  acc5: 96.0000 (96.2880)  time: 0.2095  data: 0.0898  max mem: 19097
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 82.356 Acc@5 96.150 loss 0.972
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.41%
Epoch: [280]  [   0/1251]  eta: 1:04:37  lr: 0.000051  min_lr: 0.000051  loss: 3.4071 (3.4071)  weight_decay: 0.0500 (0.0500)  time: 3.0994  data: 2.5030  max mem: 19097
Epoch: [280]  [ 200/1251]  eta: 0:04:46  lr: 0.000050  min_lr: 0.000050  loss: 3.0268 (2.6853)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2220 (1.2072)  time: 0.2566  data: 0.0003  max mem: 19097
Epoch: [280]  [ 400/1251]  eta: 0:03:45  lr: 0.000050  min_lr: 0.000050  loss: 2.9378 (2.7307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2001 (1.2159)  time: 0.2585  data: 0.0004  max mem: 19097
Epoch: [280]  [ 600/1251]  eta: 0:02:51  lr: 0.000049  min_lr: 0.000049  loss: 2.6173 (2.7393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1979 (1.2432)  time: 0.2602  data: 0.0004  max mem: 19097
Epoch: [280]  [ 800/1251]  eta: 0:01:57  lr: 0.000048  min_lr: 0.000048  loss: 2.5816 (2.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2839 (1.2453)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [280]  [1000/1251]  eta: 0:01:05  lr: 0.000047  min_lr: 0.000047  loss: 3.0337 (2.7306)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1892 (1.2449)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [280]  [1200/1251]  eta: 0:00:13  lr: 0.000046  min_lr: 0.000046  loss: 2.9993 (2.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1751 (1.2457)  time: 0.2595  data: 0.0004  max mem: 19097
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.1450 (2.7345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1898 (1.2467)  time: 0.2165  data: 0.0007  max mem: 19097
Epoch: [280] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.1450 (2.7143)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1898 (1.2467)
Test:  [ 0/25]  eta: 0:01:19  loss: 0.5346 (0.5346)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 3.1806  data: 3.0405  max mem: 19097
Test:  [10/25]  eta: 0:00:08  loss: 0.7444 (0.7244)  acc1: 85.2000 (85.6000)  acc5: 97.6000 (97.7455)  time: 0.5822  data: 0.4551  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9238 (0.8685)  acc1: 80.0000 (82.0191)  acc5: 96.0000 (96.2286)  time: 0.2955  data: 0.1722  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9480 (0.8814)  acc1: 80.0000 (81.6640)  acc5: 95.6000 (96.0960)  time: 0.2164  data: 0.0955  max mem: 19097
Test: Total time: 0:00:09 (0.3984 s / it)
* Acc@1 82.392 Acc@5 96.132 loss 0.865
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.41%
Epoch: [281]  [   0/1251]  eta: 1:08:08  lr: 0.000046  min_lr: 0.000046  loss: 3.0254 (3.0254)  weight_decay: 0.0500 (0.0500)  time: 3.2685  data: 2.8786  max mem: 19097
Epoch: [281]  [ 200/1251]  eta: 0:04:46  lr: 0.000046  min_lr: 0.000046  loss: 2.8088 (2.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2572 (1.2683)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [281]  [ 400/1251]  eta: 0:03:46  lr: 0.000045  min_lr: 0.000045  loss: 2.8453 (2.7292)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2761 (1.2855)  time: 0.2571  data: 0.0003  max mem: 19097
Epoch: [281]  [ 600/1251]  eta: 0:02:51  lr: 0.000044  min_lr: 0.000044  loss: 2.1201 (2.7110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1995 (1.2684)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [281]  [ 800/1251]  eta: 0:01:58  lr: 0.000043  min_lr: 0.000043  loss: 2.8321 (2.6867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1662 (1.2539)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [281]  [1000/1251]  eta: 0:01:05  lr: 0.000043  min_lr: 0.000043  loss: 2.4574 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2647 (1.2604)  time: 0.2638  data: 0.0004  max mem: 19097
Epoch: [281]  [1200/1251]  eta: 0:00:13  lr: 0.000042  min_lr: 0.000042  loss: 2.8388 (2.6998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2295 (1.2584)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.4888 (2.6996)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2907 (1.2618)  time: 0.2160  data: 0.0005  max mem: 19097
Epoch: [281] Total time: 0:05:25 (0.2605 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.4888 (2.7197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2907 (1.2618)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5441 (0.5441)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.6831  data: 5.5503  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7545 (0.7368)  acc1: 85.2000 (85.5636)  acc5: 98.0000 (97.7455)  time: 0.7441  data: 0.6140  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9088 (0.8772)  acc1: 80.0000 (82.1333)  acc5: 96.0000 (96.3429)  time: 0.1954  data: 0.0706  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9529 (0.8893)  acc1: 79.6000 (81.7600)  acc5: 95.6000 (96.2240)  time: 0.1942  data: 0.0705  max mem: 19097
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 82.350 Acc@5 96.164 loss 0.875
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.41%
Epoch: [282]  [   0/1251]  eta: 1:06:34  lr: 0.000042  min_lr: 0.000042  loss: 2.4112 (2.4112)  weight_decay: 0.0500 (0.0500)  time: 3.1930  data: 1.6147  max mem: 19097
Epoch: [282]  [ 200/1251]  eta: 0:04:46  lr: 0.000041  min_lr: 0.000041  loss: 3.0155 (2.7417)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2977 (1.2793)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [282]  [ 400/1251]  eta: 0:03:45  lr: 0.000040  min_lr: 0.000040  loss: 2.7937 (2.7530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3065 (1.2825)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [282]  [ 600/1251]  eta: 0:02:51  lr: 0.000040  min_lr: 0.000040  loss: 2.6922 (2.7663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2014 (1.2606)  time: 0.2649  data: 0.0004  max mem: 19097
Epoch: [282]  [ 800/1251]  eta: 0:01:58  lr: 0.000039  min_lr: 0.000039  loss: 2.6876 (2.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1955 (1.2592)  time: 0.2649  data: 0.0003  max mem: 19097
Epoch: [282]  [1000/1251]  eta: 0:01:05  lr: 0.000038  min_lr: 0.000038  loss: 2.9760 (2.7585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2236 (1.2515)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [282]  [1200/1251]  eta: 0:00:13  lr: 0.000037  min_lr: 0.000037  loss: 2.9678 (2.7474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1734 (1.2465)  time: 0.2589  data: 0.0004  max mem: 19097
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.8703 (2.7494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1274 (1.2458)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [282] Total time: 0:05:25 (0.2605 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.8703 (2.7291)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1274 (1.2458)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5883 (0.5883)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.6369  data: 5.4808  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7779 (0.7741)  acc1: 86.0000 (85.4182)  acc5: 98.0000 (97.7091)  time: 0.7163  data: 0.5920  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9373 (0.9175)  acc1: 80.0000 (82.0952)  acc5: 96.0000 (96.3048)  time: 0.1933  data: 0.0698  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9987 (0.9295)  acc1: 80.4000 (81.8720)  acc5: 95.6000 (96.1760)  time: 0.1948  data: 0.0717  max mem: 19097
Test: Total time: 0:00:10 (0.4057 s / it)
* Acc@1 82.428 Acc@5 96.134 loss 0.916
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.43%
Epoch: [283]  [   0/1251]  eta: 1:00:07  lr: 0.000037  min_lr: 0.000037  loss: 2.0564 (2.0564)  weight_decay: 0.0500 (0.0500)  time: 2.8838  data: 2.6096  max mem: 19097
Epoch: [283]  [ 200/1251]  eta: 0:04:45  lr: 0.000037  min_lr: 0.000037  loss: 2.8438 (2.7566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2151 (1.2641)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [283]  [ 400/1251]  eta: 0:03:44  lr: 0.000036  min_lr: 0.000036  loss: 2.7331 (2.7461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1444 (1.2465)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [283]  [ 600/1251]  eta: 0:02:50  lr: 0.000035  min_lr: 0.000035  loss: 2.7267 (2.7345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1977 (1.2363)  time: 0.2585  data: 0.0004  max mem: 19097
Epoch: [283]  [ 800/1251]  eta: 0:01:57  lr: 0.000035  min_lr: 0.000035  loss: 3.1018 (2.7418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1855 (1.2306)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [283]  [1000/1251]  eta: 0:01:05  lr: 0.000034  min_lr: 0.000034  loss: 3.0856 (2.7241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1762 (1.2345)  time: 0.2579  data: 0.0003  max mem: 19097
Epoch: [283]  [1200/1251]  eta: 0:00:13  lr: 0.000033  min_lr: 0.000033  loss: 2.4278 (2.7158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2641 (1.2349)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 3.0583 (2.7167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2335 (1.2366)  time: 0.2162  data: 0.0006  max mem: 19097
Epoch: [283] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 3.0583 (2.7184)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2335 (1.2366)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6255 (0.6255)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.6896  data: 5.5335  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8220 (0.8083)  acc1: 85.2000 (85.2727)  acc5: 97.6000 (97.6727)  time: 0.7707  data: 0.6461  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9833 (0.9484)  acc1: 80.0000 (81.9619)  acc5: 96.4000 (96.3810)  time: 0.2204  data: 0.0998  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0419 (0.9606)  acc1: 80.0000 (81.6960)  acc5: 95.6000 (96.2560)  time: 0.2191  data: 0.0997  max mem: 19097
Test: Total time: 0:00:10 (0.4262 s / it)
* Acc@1 82.256 Acc@5 96.126 loss 0.949
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.43%
Epoch: [284]  [   0/1251]  eta: 1:05:41  lr: 0.000033  min_lr: 0.000033  loss: 2.9341 (2.9341)  weight_decay: 0.0500 (0.0500)  time: 3.1509  data: 2.3281  max mem: 19097
Epoch: [284]  [ 200/1251]  eta: 0:04:46  lr: 0.000032  min_lr: 0.000032  loss: 2.8995 (2.7070)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2612 (1.2994)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [284]  [ 400/1251]  eta: 0:03:45  lr: 0.000032  min_lr: 0.000032  loss: 2.6847 (2.7119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2451 (1.3040)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [284]  [ 600/1251]  eta: 0:02:50  lr: 0.000031  min_lr: 0.000031  loss: 2.8330 (2.7018)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1810 (1.2858)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [284]  [ 800/1251]  eta: 0:01:57  lr: 0.000031  min_lr: 0.000031  loss: 2.7270 (2.7069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1529 (1.2696)  time: 0.2586  data: 0.0004  max mem: 19097
Epoch: [284]  [1000/1251]  eta: 0:01:05  lr: 0.000030  min_lr: 0.000030  loss: 3.0453 (2.7087)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1945 (1.2669)  time: 0.2568  data: 0.0003  max mem: 19097
Epoch: [284]  [1200/1251]  eta: 0:00:13  lr: 0.000029  min_lr: 0.000029  loss: 2.5537 (2.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2566  data: 0.0003  max mem: 19097
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.9802 (2.7036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2255 (nan)  time: 0.2163  data: 0.0006  max mem: 19097
Epoch: [284] Total time: 0:05:25 (0.2601 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.9802 (2.7172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2255 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5846 (0.5846)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.7522  data: 5.6145  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7761 (0.7707)  acc1: 85.6000 (85.3818)  acc5: 98.0000 (97.7091)  time: 0.7181  data: 0.5964  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9582 (0.9122)  acc1: 78.8000 (81.8476)  acc5: 96.4000 (96.3238)  time: 0.1885  data: 0.0691  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9976 (0.9246)  acc1: 78.8000 (81.5360)  acc5: 95.6000 (96.2240)  time: 0.1925  data: 0.0737  max mem: 19097
Test: Total time: 0:00:10 (0.4071 s / it)
* Acc@1 82.394 Acc@5 96.122 loss 0.912
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.43%
Epoch: [285]  [   0/1251]  eta: 1:09:04  lr: 0.000029  min_lr: 0.000029  loss: 1.8992 (1.8992)  weight_decay: 0.0500 (0.0500)  time: 3.3132  data: 2.4730  max mem: 19097
Epoch: [285]  [ 200/1251]  eta: 0:04:48  lr: 0.000029  min_lr: 0.000029  loss: 2.7094 (2.7488)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2228 (1.2617)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [285]  [ 400/1251]  eta: 0:03:46  lr: 0.000028  min_lr: 0.000028  loss: 2.5056 (2.7153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1828 (1.2354)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [285]  [ 600/1251]  eta: 0:02:51  lr: 0.000027  min_lr: 0.000027  loss: 2.9101 (2.7156)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2186 (1.2363)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [285]  [ 800/1251]  eta: 0:01:58  lr: 0.000027  min_lr: 0.000027  loss: 3.1810 (2.7296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1907 (1.2422)  time: 0.2632  data: 0.0004  max mem: 19097
Epoch: [285]  [1000/1251]  eta: 0:01:05  lr: 0.000026  min_lr: 0.000026  loss: 2.9113 (2.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2152 (1.2424)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [285]  [1200/1251]  eta: 0:00:13  lr: 0.000026  min_lr: 0.000026  loss: 2.4065 (2.7219)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1904 (1.2321)  time: 0.2649  data: 0.0004  max mem: 19097
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 3.1022 (2.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1312 (1.2296)  time: 0.2165  data: 0.0008  max mem: 19097
Epoch: [285] Total time: 0:05:25 (0.2606 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 3.1022 (2.7106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1312 (1.2296)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5923 (0.5923)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.5447  data: 5.4016  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7956 (0.7836)  acc1: 86.4000 (85.5636)  acc5: 98.0000 (97.7091)  time: 0.6668  data: 0.5429  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9661 (0.9269)  acc1: 80.8000 (82.0762)  acc5: 96.0000 (96.3429)  time: 0.1775  data: 0.0547  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0171 (0.9387)  acc1: 80.8000 (81.8240)  acc5: 96.0000 (96.2880)  time: 0.1974  data: 0.0760  max mem: 19097
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 82.440 Acc@5 96.158 loss 0.928
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.44%
Epoch: [286]  [   0/1251]  eta: 1:01:38  lr: 0.000026  min_lr: 0.000026  loss: 3.1707 (3.1707)  weight_decay: 0.0500 (0.0500)  time: 2.9568  data: 2.6804  max mem: 19097
Epoch: [286]  [ 200/1251]  eta: 0:04:46  lr: 0.000025  min_lr: 0.000025  loss: 2.6499 (2.7173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2381 (1.2011)  time: 0.2580  data: 0.0003  max mem: 19097
Epoch: [286]  [ 400/1251]  eta: 0:03:45  lr: 0.000025  min_lr: 0.000025  loss: 3.0193 (2.7520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1848 (1.2325)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [286]  [ 600/1251]  eta: 0:02:50  lr: 0.000024  min_lr: 0.000024  loss: 2.3890 (2.7370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1675 (1.2226)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [286]  [ 800/1251]  eta: 0:01:57  lr: 0.000023  min_lr: 0.000023  loss: 2.6903 (2.7223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2465 (1.2294)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [286]  [1000/1251]  eta: 0:01:05  lr: 0.000023  min_lr: 0.000023  loss: 3.1008 (2.7312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1937 (1.2287)  time: 0.2633  data: 0.0003  max mem: 19097
Epoch: [286]  [1200/1251]  eta: 0:00:13  lr: 0.000022  min_lr: 0.000022  loss: 2.8400 (2.7404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1734 (1.2304)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.6142 (2.7349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1734 (1.2304)  time: 0.2169  data: 0.0006  max mem: 19097
Epoch: [286] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.6142 (2.7183)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1734 (1.2304)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5497 (0.5497)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.6535  data: 5.5105  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7506 (0.7364)  acc1: 85.6000 (85.5636)  acc5: 98.0000 (97.7455)  time: 0.7249  data: 0.5996  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9098 (0.8790)  acc1: 81.6000 (82.3048)  acc5: 96.4000 (96.3429)  time: 0.1965  data: 0.0748  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9525 (0.8915)  acc1: 80.4000 (82.0160)  acc5: 95.6000 (96.1440)  time: 0.1955  data: 0.0747  max mem: 19097
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 82.440 Acc@5 96.164 loss 0.877
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.44%
Epoch: [287]  [   0/1251]  eta: 1:00:06  lr: 0.000022  min_lr: 0.000022  loss: 1.9369 (1.9369)  weight_decay: 0.0500 (0.0500)  time: 2.8825  data: 2.5431  max mem: 19097
Epoch: [287]  [ 200/1251]  eta: 0:04:46  lr: 0.000022  min_lr: 0.000022  loss: 2.3679 (2.7026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1768 (1.2177)  time: 0.2581  data: 0.0004  max mem: 19097
Epoch: [287]  [ 400/1251]  eta: 0:03:45  lr: 0.000021  min_lr: 0.000021  loss: 2.9804 (2.7294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2223 (1.2111)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [287]  [ 600/1251]  eta: 0:02:50  lr: 0.000021  min_lr: 0.000021  loss: 3.0304 (2.6977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2398 (1.2152)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [287]  [ 800/1251]  eta: 0:01:57  lr: 0.000020  min_lr: 0.000020  loss: 2.5406 (2.7146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1811 (1.2220)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [287]  [1000/1251]  eta: 0:01:05  lr: 0.000020  min_lr: 0.000020  loss: 3.0659 (2.7221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1621 (1.2199)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [287]  [1200/1251]  eta: 0:00:13  lr: 0.000019  min_lr: 0.000019  loss: 2.9720 (2.7326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1851 (1.2224)  time: 0.2565  data: 0.0004  max mem: 19097
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 3.0583 (2.7321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3014 (1.2249)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [287] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 3.0583 (2.7193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3014 (1.2249)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6494 (0.6494)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.7692  data: 5.6309  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8326 (0.8269)  acc1: 85.2000 (85.4182)  acc5: 98.0000 (97.6727)  time: 0.7606  data: 0.6361  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0004 (0.9737)  acc1: 80.8000 (81.9238)  acc5: 96.0000 (96.2667)  time: 0.2058  data: 0.0846  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0570 (0.9862)  acc1: 80.4000 (81.6160)  acc5: 95.6000 (96.1600)  time: 0.2046  data: 0.0845  max mem: 19097
Test: Total time: 0:00:10 (0.4178 s / it)
* Acc@1 82.294 Acc@5 96.112 loss 0.973
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.44%
Epoch: [288]  [   0/1251]  eta: 1:06:31  lr: 0.000019  min_lr: 0.000019  loss: 3.3749 (3.3749)  weight_decay: 0.0500 (0.0500)  time: 3.1904  data: 2.3788  max mem: 19097
Epoch: [288]  [ 200/1251]  eta: 0:04:47  lr: 0.000019  min_lr: 0.000019  loss: 2.4619 (2.6942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1953 (1.2271)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [288]  [ 400/1251]  eta: 0:03:45  lr: 0.000018  min_lr: 0.000018  loss: 2.8848 (2.7111)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1759 (1.2301)  time: 0.2576  data: 0.0004  max mem: 19097
Epoch: [288]  [ 600/1251]  eta: 0:02:51  lr: 0.000018  min_lr: 0.000018  loss: 2.9818 (2.6917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1908 (1.2225)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [288]  [ 800/1251]  eta: 0:01:58  lr: 0.000017  min_lr: 0.000017  loss: 2.8020 (2.6983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2441 (1.2259)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [288]  [1000/1251]  eta: 0:01:05  lr: 0.000017  min_lr: 0.000017  loss: 2.8170 (2.6980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2509 (1.2357)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [288]  [1200/1251]  eta: 0:00:13  lr: 0.000016  min_lr: 0.000016  loss: 3.0958 (2.6996)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2193 (1.2429)  time: 0.2647  data: 0.0004  max mem: 19097
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.7093 (2.6949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2152 (1.2432)  time: 0.2168  data: 0.0008  max mem: 19097
Epoch: [288] Total time: 0:05:25 (0.2603 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.7093 (2.7134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2152 (1.2432)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5762 (0.5762)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.5167  data: 5.3765  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7685 (0.7622)  acc1: 85.6000 (85.5273)  acc5: 98.0000 (97.8182)  time: 0.7470  data: 0.6228  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9472 (0.9046)  acc1: 81.2000 (82.1714)  acc5: 96.4000 (96.4000)  time: 0.2069  data: 0.0859  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9926 (0.9173)  acc1: 80.0000 (81.8400)  acc5: 95.6000 (96.2400)  time: 0.2193  data: 0.0991  max mem: 19097
Test: Total time: 0:00:10 (0.4195 s / it)
* Acc@1 82.446 Acc@5 96.148 loss 0.904
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.45%
Epoch: [289]  [   0/1251]  eta: 1:02:58  lr: 0.000016  min_lr: 0.000016  loss: 2.3402 (2.3402)  weight_decay: 0.0500 (0.0500)  time: 3.0203  data: 2.7074  max mem: 19097
Epoch: [289]  [ 200/1251]  eta: 0:04:48  lr: 0.000016  min_lr: 0.000016  loss: 3.0316 (2.7910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2624 (1.2154)  time: 0.2641  data: 0.0004  max mem: 19097
Epoch: [289]  [ 400/1251]  eta: 0:03:45  lr: 0.000015  min_lr: 0.000015  loss: 2.9333 (2.7603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2270 (1.2430)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [289]  [ 600/1251]  eta: 0:02:51  lr: 0.000015  min_lr: 0.000015  loss: 2.7648 (2.7265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1564 (1.2265)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [289]  [ 800/1251]  eta: 0:01:57  lr: 0.000014  min_lr: 0.000014  loss: 2.5413 (2.7442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1422 (1.2182)  time: 0.2560  data: 0.0004  max mem: 19097
Epoch: [289]  [1000/1251]  eta: 0:01:05  lr: 0.000014  min_lr: 0.000014  loss: 2.7715 (2.7442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1904 (1.2197)  time: 0.2571  data: 0.0004  max mem: 19097
Epoch: [289]  [1200/1251]  eta: 0:00:13  lr: 0.000014  min_lr: 0.000014  loss: 2.3197 (2.7364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1903 (1.2286)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.6255 (2.7348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1914 (1.2267)  time: 0.2164  data: 0.0006  max mem: 19097
Epoch: [289] Total time: 0:05:24 (0.2597 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.6255 (2.7053)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1914 (1.2267)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5491 (0.5491)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.6346  data: 5.4920  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7428 (0.7324)  acc1: 85.6000 (85.4909)  acc5: 98.0000 (97.7455)  time: 0.7632  data: 0.6392  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9143 (0.8745)  acc1: 80.8000 (82.0762)  acc5: 96.4000 (96.4381)  time: 0.2142  data: 0.0939  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9469 (0.8875)  acc1: 80.0000 (81.7280)  acc5: 96.0000 (96.2880)  time: 0.2135  data: 0.0938  max mem: 19097
Test: Total time: 0:00:10 (0.4189 s / it)
* Acc@1 82.456 Acc@5 96.172 loss 0.872
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.46%
Epoch: [290]  [   0/1251]  eta: 1:07:46  lr: 0.000014  min_lr: 0.000014  loss: 3.1157 (3.1157)  weight_decay: 0.0500 (0.0500)  time: 3.2504  data: 2.9651  max mem: 19097
Epoch: [290]  [ 200/1251]  eta: 0:04:47  lr: 0.000013  min_lr: 0.000013  loss: 3.0552 (2.7842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1558 (1.1820)  time: 0.2567  data: 0.0003  max mem: 19097
Epoch: [290]  [ 400/1251]  eta: 0:03:46  lr: 0.000013  min_lr: 0.000013  loss: 2.4992 (2.7437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2004 (1.2170)  time: 0.2568  data: 0.0003  max mem: 19097
Epoch: [290]  [ 600/1251]  eta: 0:02:51  lr: 0.000012  min_lr: 0.000012  loss: 2.8867 (2.7194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1527 (1.2176)  time: 0.2647  data: 0.0004  max mem: 19097
Epoch: [290]  [ 800/1251]  eta: 0:01:57  lr: 0.000012  min_lr: 0.000012  loss: 3.0969 (2.7127)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2162 (1.2246)  time: 0.2580  data: 0.0004  max mem: 19097
Epoch: [290]  [1000/1251]  eta: 0:01:05  lr: 0.000012  min_lr: 0.000012  loss: 2.9717 (2.7334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3231 (1.2272)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [290]  [1200/1251]  eta: 0:00:13  lr: 0.000011  min_lr: 0.000011  loss: 2.4500 (2.7277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2424 (1.2272)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.4860 (2.7270)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2932 (1.2290)  time: 0.2172  data: 0.0005  max mem: 19097
Epoch: [290] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.4860 (2.7114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2932 (1.2290)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5329 (0.5329)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.7590  data: 5.6195  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7318 (0.7202)  acc1: 85.2000 (85.5636)  acc5: 98.0000 (97.7818)  time: 0.7325  data: 0.6103  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9008 (0.8626)  acc1: 80.8000 (82.1714)  acc5: 96.4000 (96.3810)  time: 0.1879  data: 0.0682  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9359 (0.8749)  acc1: 80.0000 (81.8400)  acc5: 95.6000 (96.2720)  time: 0.1868  data: 0.0681  max mem: 19097
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 82.486 Acc@5 96.170 loss 0.862
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.49%
Epoch: [291]  [   0/1251]  eta: 1:01:12  lr: 0.000011  min_lr: 0.000011  loss: 3.0977 (3.0977)  weight_decay: 0.0500 (0.0500)  time: 2.9357  data: 2.6606  max mem: 19097
Epoch: [291]  [ 200/1251]  eta: 0:04:44  lr: 0.000011  min_lr: 0.000011  loss: 3.0187 (2.6848)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1444 (1.2205)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [291]  [ 400/1251]  eta: 0:03:44  lr: 0.000010  min_lr: 0.000010  loss: 2.8813 (2.6852)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1782 (1.2470)  time: 0.2559  data: 0.0004  max mem: 19097
Epoch: [291]  [ 600/1251]  eta: 0:02:50  lr: 0.000010  min_lr: 0.000010  loss: 2.9496 (2.6986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1506 (1.2397)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [291]  [ 800/1251]  eta: 0:01:57  lr: 0.000010  min_lr: 0.000010  loss: 2.8738 (2.6997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2111 (inf)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [291]  [1000/1251]  eta: 0:01:05  lr: 0.000009  min_lr: 0.000009  loss: 2.4516 (2.7058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1837 (inf)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [291]  [1200/1251]  eta: 0:00:13  lr: 0.000009  min_lr: 0.000009  loss: 2.5619 (2.7106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1925 (inf)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.5564 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1532 (inf)  time: 0.2167  data: 0.0007  max mem: 19097
Epoch: [291] Total time: 0:05:25 (0.2600 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.5564 (2.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1532 (inf)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6075 (0.6075)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.7240  data: 5.5664  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7854 (0.7848)  acc1: 86.0000 (85.6727)  acc5: 98.0000 (97.7455)  time: 0.7602  data: 0.6344  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9589 (0.9252)  acc1: 81.2000 (82.1714)  acc5: 96.4000 (96.4000)  time: 0.2079  data: 0.0873  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0027 (0.9363)  acc1: 80.4000 (81.8880)  acc5: 95.6000 (96.2240)  time: 0.2090  data: 0.0892  max mem: 19097
Test: Total time: 0:00:10 (0.4191 s / it)
* Acc@1 82.386 Acc@5 96.162 loss 0.923
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.49%
Epoch: [292]  [   0/1251]  eta: 1:04:07  lr: 0.000009  min_lr: 0.000009  loss: 2.7578 (2.7578)  weight_decay: 0.0500 (0.0500)  time: 3.0751  data: 2.1223  max mem: 19097
Epoch: [292]  [ 200/1251]  eta: 0:04:47  lr: 0.000009  min_lr: 0.000009  loss: 2.6556 (2.7448)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1554 (1.2045)  time: 0.2583  data: 0.0005  max mem: 19097
Epoch: [292]  [ 400/1251]  eta: 0:03:46  lr: 0.000008  min_lr: 0.000008  loss: 3.0744 (2.7263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2144 (1.2236)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [292]  [ 600/1251]  eta: 0:02:51  lr: 0.000008  min_lr: 0.000008  loss: 2.8776 (2.7236)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2241 (1.2420)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [292]  [ 800/1251]  eta: 0:01:58  lr: 0.000008  min_lr: 0.000008  loss: 3.1485 (2.7186)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2452 (1.2361)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [292]  [1000/1251]  eta: 0:01:05  lr: 0.000008  min_lr: 0.000008  loss: 2.6196 (2.7164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1610 (1.2311)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [292]  [1200/1251]  eta: 0:00:13  lr: 0.000007  min_lr: 0.000007  loss: 2.6823 (2.7074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2580 (1.2364)  time: 0.2566  data: 0.0003  max mem: 19097
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.8639 (2.7064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2500 (1.2377)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [292] Total time: 0:05:26 (0.2607 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.8639 (2.7066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2500 (1.2377)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5695 (0.5695)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.4089  data: 5.2409  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7632 (0.7555)  acc1: 86.0000 (85.4545)  acc5: 98.0000 (97.7818)  time: 0.7673  data: 0.6381  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9347 (0.8992)  acc1: 80.8000 (82.0191)  acc5: 96.4000 (96.4381)  time: 0.2352  data: 0.1134  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9813 (0.9110)  acc1: 80.4000 (81.7440)  acc5: 95.6000 (96.3200)  time: 0.2230  data: 0.1033  max mem: 19097
Test: Total time: 0:00:10 (0.4272 s / it)
* Acc@1 82.424 Acc@5 96.176 loss 0.897
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.49%
Epoch: [293]  [   0/1251]  eta: 1:07:46  lr: 0.000007  min_lr: 0.000007  loss: 2.9366 (2.9366)  weight_decay: 0.0500 (0.0500)  time: 3.2509  data: 2.0019  max mem: 19097
Epoch: [293]  [ 200/1251]  eta: 0:04:46  lr: 0.000007  min_lr: 0.000007  loss: 2.5134 (2.7399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2495 (1.2554)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [293]  [ 400/1251]  eta: 0:03:46  lr: 0.000007  min_lr: 0.000007  loss: 2.7853 (2.6982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2524 (1.2472)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [293]  [ 600/1251]  eta: 0:02:51  lr: 0.000006  min_lr: 0.000006  loss: 2.6556 (2.6916)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1850 (1.2389)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [293]  [ 800/1251]  eta: 0:01:58  lr: 0.000006  min_lr: 0.000006  loss: 2.1063 (2.6793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1515 (1.2344)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [293]  [1000/1251]  eta: 0:01:05  lr: 0.000006  min_lr: 0.000006  loss: 2.6894 (2.6902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2114 (1.2354)  time: 0.2569  data: 0.0005  max mem: 19097
Epoch: [293]  [1200/1251]  eta: 0:00:13  lr: 0.000006  min_lr: 0.000006  loss: 2.5892 (2.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2643 (1.2424)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 3.0042 (2.6904)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2478 (1.2433)  time: 0.2165  data: 0.0006  max mem: 19097
Epoch: [293] Total time: 0:05:25 (0.2604 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 3.0042 (2.7200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2478 (1.2433)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5893 (0.5893)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.4895  data: 5.3547  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7764 (0.7729)  acc1: 85.6000 (85.5273)  acc5: 98.0000 (97.7091)  time: 0.7717  data: 0.6502  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9513 (0.9170)  acc1: 81.2000 (82.0952)  acc5: 96.4000 (96.3238)  time: 0.2212  data: 0.1019  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0078 (0.9299)  acc1: 80.4000 (81.8080)  acc5: 95.6000 (96.1920)  time: 0.2205  data: 0.1018  max mem: 19097
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 82.406 Acc@5 96.122 loss 0.916
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.49%
Epoch: [294]  [   0/1251]  eta: 1:08:36  lr: 0.000006  min_lr: 0.000006  loss: 2.5830 (2.5830)  weight_decay: 0.0500 (0.0500)  time: 3.2904  data: 1.7088  max mem: 19097
Epoch: [294]  [ 200/1251]  eta: 0:04:48  lr: 0.000005  min_lr: 0.000005  loss: 2.9083 (2.6532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2282 (1.2336)  time: 0.2589  data: 0.0003  max mem: 19097
Epoch: [294]  [ 400/1251]  eta: 0:03:46  lr: 0.000005  min_lr: 0.000005  loss: 2.8506 (2.6694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1712 (1.2194)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [294]  [ 600/1251]  eta: 0:02:51  lr: 0.000005  min_lr: 0.000005  loss: 2.5232 (2.7050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2071 (1.2143)  time: 0.2565  data: 0.0003  max mem: 19097
Epoch: [294]  [ 800/1251]  eta: 0:01:58  lr: 0.000005  min_lr: 0.000005  loss: 2.5454 (2.7087)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2002 (1.2142)  time: 0.2563  data: 0.0003  max mem: 19097
Epoch: [294]  [1000/1251]  eta: 0:01:05  lr: 0.000004  min_lr: 0.000004  loss: 2.5672 (2.7032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2096 (1.2273)  time: 0.2556  data: 0.0004  max mem: 19097
Epoch: [294]  [1200/1251]  eta: 0:00:13  lr: 0.000004  min_lr: 0.000004  loss: 3.1057 (2.7075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1433 (1.2173)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 3.0038 (2.7059)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2100 (1.2177)  time: 0.2165  data: 0.0006  max mem: 19097
Epoch: [294] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 3.0038 (2.7022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2100 (1.2177)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5405 (0.5405)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.7515  data: 5.6102  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7449 (0.7264)  acc1: 85.2000 (85.6364)  acc5: 98.0000 (97.7091)  time: 0.7701  data: 0.6435  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9019 (0.8681)  acc1: 81.2000 (82.1524)  acc5: 96.4000 (96.2857)  time: 0.1984  data: 0.0751  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9440 (0.8806)  acc1: 80.4000 (81.8720)  acc5: 95.6000 (96.1280)  time: 0.1977  data: 0.0750  max mem: 19097
Test: Total time: 0:00:10 (0.4107 s / it)
* Acc@1 82.458 Acc@5 96.192 loss 0.865
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.49%
Epoch: [295]  [   0/1251]  eta: 1:09:22  lr: 0.000004  min_lr: 0.000004  loss: 2.0866 (2.0866)  weight_decay: 0.0500 (0.0500)  time: 3.3276  data: 1.7514  max mem: 19097
Epoch: [295]  [ 200/1251]  eta: 0:04:47  lr: 0.000004  min_lr: 0.000004  loss: 2.8168 (2.7049)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2042 (1.1960)  time: 0.2580  data: 0.0004  max mem: 19097
Epoch: [295]  [ 400/1251]  eta: 0:03:46  lr: 0.000004  min_lr: 0.000004  loss: 2.5097 (2.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1845 (1.2278)  time: 0.2576  data: 0.0003  max mem: 19097
Epoch: [295]  [ 600/1251]  eta: 0:02:51  lr: 0.000004  min_lr: 0.000004  loss: 3.1201 (2.7174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1859 (1.2170)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [295]  [ 800/1251]  eta: 0:01:58  lr: 0.000003  min_lr: 0.000003  loss: 2.8381 (2.7293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1853 (1.2270)  time: 0.2564  data: 0.0004  max mem: 19097
Epoch: [295]  [1000/1251]  eta: 0:01:05  lr: 0.000003  min_lr: 0.000003  loss: 3.1062 (2.7265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1561 (1.2267)  time: 0.2574  data: 0.0004  max mem: 19097
Epoch: [295]  [1200/1251]  eta: 0:00:13  lr: 0.000003  min_lr: 0.000003  loss: 2.5138 (2.6998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1011 (1.2193)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 3.1973 (2.7035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2279 (1.2181)  time: 0.2169  data: 0.0007  max mem: 19097
Epoch: [295] Total time: 0:05:26 (0.2609 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 3.1973 (2.7135)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2279 (1.2181)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6715 (0.6715)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.8338  data: 5.6932  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.8533 (0.8511)  acc1: 85.6000 (85.3091)  acc5: 98.0000 (97.7091)  time: 0.7624  data: 0.6378  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 1.0211 (0.9933)  acc1: 80.4000 (81.8095)  acc5: 96.4000 (96.3238)  time: 0.2023  data: 0.0816  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0804 (1.0046)  acc1: 80.4000 (81.5360)  acc5: 96.0000 (96.1920)  time: 0.2012  data: 0.0815  max mem: 19097
Test: Total time: 0:00:10 (0.4174 s / it)
* Acc@1 82.334 Acc@5 96.126 loss 0.992
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.49%
Epoch: [296]  [   0/1251]  eta: 1:03:36  lr: 0.000003  min_lr: 0.000003  loss: 2.5161 (2.5161)  weight_decay: 0.0500 (0.0500)  time: 3.0509  data: 2.7549  max mem: 19097
Epoch: [296]  [ 200/1251]  eta: 0:04:47  lr: 0.000003  min_lr: 0.000003  loss: 2.5006 (2.6797)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1646 (1.2203)  time: 0.2565  data: 0.0003  max mem: 19097
Epoch: [296]  [ 400/1251]  eta: 0:03:45  lr: 0.000003  min_lr: 0.000003  loss: 2.6591 (2.6955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2226 (1.2283)  time: 0.2569  data: 0.0003  max mem: 19097
Epoch: [296]  [ 600/1251]  eta: 0:02:50  lr: 0.000003  min_lr: 0.000003  loss: 2.9867 (2.6849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1701 (1.2235)  time: 0.2575  data: 0.0004  max mem: 19097
Epoch: [296]  [ 800/1251]  eta: 0:01:58  lr: 0.000002  min_lr: 0.000002  loss: 2.5936 (2.6888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1950 (1.2164)  time: 0.2630  data: 0.0003  max mem: 19097
Epoch: [296]  [1000/1251]  eta: 0:01:05  lr: 0.000002  min_lr: 0.000002  loss: 2.4841 (2.6914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1745 (1.2225)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [296]  [1200/1251]  eta: 0:00:13  lr: 0.000002  min_lr: 0.000002  loss: 3.1609 (2.7046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1282 (1.2159)  time: 0.2594  data: 0.0004  max mem: 19097
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.6306 (2.7023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1836 (1.2182)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [296] Total time: 0:05:26 (0.2608 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.6306 (2.7032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1836 (1.2182)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5501 (0.5501)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.6992  data: 5.5567  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7553 (0.7400)  acc1: 86.0000 (85.6364)  acc5: 98.0000 (97.7091)  time: 0.7182  data: 0.5959  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9199 (0.8805)  acc1: 81.6000 (82.2095)  acc5: 96.4000 (96.3810)  time: 0.1894  data: 0.0693  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9639 (0.8925)  acc1: 80.4000 (81.9680)  acc5: 95.6000 (96.2240)  time: 0.2080  data: 0.0885  max mem: 19097
Test: Total time: 0:00:10 (0.4171 s / it)
* Acc@1 82.532 Acc@5 96.200 loss 0.878
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [297]  [   0/1251]  eta: 1:06:48  lr: 0.000002  min_lr: 0.000002  loss: 2.1644 (2.1644)  weight_decay: 0.0500 (0.0500)  time: 3.2042  data: 2.9204  max mem: 19097
Epoch: [297]  [ 200/1251]  eta: 0:04:45  lr: 0.000002  min_lr: 0.000002  loss: 2.8494 (2.6791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2036 (1.2091)  time: 0.2577  data: 0.0004  max mem: 19097
Epoch: [297]  [ 400/1251]  eta: 0:03:45  lr: 0.000002  min_lr: 0.000002  loss: 2.8809 (2.7175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1649 (inf)  time: 0.2573  data: 0.0004  max mem: 19097
Epoch: [297]  [ 600/1251]  eta: 0:02:50  lr: 0.000002  min_lr: 0.000002  loss: 2.6217 (2.7039)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1648 (inf)  time: 0.2576  data: 0.0003  max mem: 19097
Epoch: [297]  [ 800/1251]  eta: 0:01:57  lr: 0.000002  min_lr: 0.000002  loss: 2.3541 (2.6938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2045 (inf)  time: 0.2563  data: 0.0003  max mem: 19097
Epoch: [297]  [1000/1251]  eta: 0:01:05  lr: 0.000002  min_lr: 0.000002  loss: 3.0819 (2.6978)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1756 (inf)  time: 0.2572  data: 0.0003  max mem: 19097
Epoch: [297]  [1200/1251]  eta: 0:00:13  lr: 0.000002  min_lr: 0.000002  loss: 2.9818 (2.7035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1878 (inf)  time: 0.2569  data: 0.0004  max mem: 19097
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.9741 (2.7051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2564 (inf)  time: 0.2167  data: 0.0006  max mem: 19097
Epoch: [297] Total time: 0:05:25 (0.2605 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.9741 (2.7010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2564 (inf)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.5820 (0.5820)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 4.4857  data: 4.3127  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7756 (0.7664)  acc1: 85.6000 (85.4182)  acc5: 97.6000 (97.6364)  time: 0.6670  data: 0.5397  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9515 (0.9062)  acc1: 80.8000 (81.9810)  acc5: 96.4000 (96.2857)  time: 0.2492  data: 0.1286  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9871 (0.9176)  acc1: 80.4000 (81.7600)  acc5: 95.6000 (96.1440)  time: 0.2220  data: 0.1025  max mem: 19097
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 82.386 Acc@5 96.140 loss 0.902
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.53%
Epoch: [298]  [   0/1251]  eta: 1:05:43  lr: 0.000002  min_lr: 0.000002  loss: 3.2696 (3.2696)  weight_decay: 0.0500 (0.0500)  time: 3.1526  data: 2.1631  max mem: 19097
Epoch: [298]  [ 200/1251]  eta: 0:04:45  lr: 0.000001  min_lr: 0.000001  loss: 2.1899 (2.7390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1351 (1.2430)  time: 0.2563  data: 0.0004  max mem: 19097
Epoch: [298]  [ 400/1251]  eta: 0:03:44  lr: 0.000001  min_lr: 0.000001  loss: 2.3983 (2.7113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1577 (1.2400)  time: 0.2570  data: 0.0004  max mem: 19097
Epoch: [298]  [ 600/1251]  eta: 0:02:50  lr: 0.000001  min_lr: 0.000001  loss: 2.2960 (2.7038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1280 (1.2402)  time: 0.2567  data: 0.0004  max mem: 19097
Epoch: [298]  [ 800/1251]  eta: 0:01:57  lr: 0.000001  min_lr: 0.000001  loss: 2.6766 (2.7058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1619 (1.2378)  time: 0.2582  data: 0.0004  max mem: 19097
Epoch: [298]  [1000/1251]  eta: 0:01:05  lr: 0.000001  min_lr: 0.000001  loss: 3.0577 (2.7072)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2096 (1.2351)  time: 0.2584  data: 0.0005  max mem: 19097
Epoch: [298]  [1200/1251]  eta: 0:00:13  lr: 0.000001  min_lr: 0.000001  loss: 2.0482 (2.7064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1784 (1.2335)  time: 0.2579  data: 0.0004  max mem: 19097
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.8148 (2.7054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1398 (1.2289)  time: 0.2166  data: 0.0007  max mem: 19097
Epoch: [298] Total time: 0:05:25 (0.2602 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.8148 (2.7115)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1398 (1.2289)
Test:  [ 0/25]  eta: 0:01:40  loss: 0.6016 (0.6016)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 4.0234  data: 3.8798  max mem: 19097
Test:  [10/25]  eta: 0:00:10  loss: 0.7989 (0.7887)  acc1: 85.6000 (85.6727)  acc5: 97.6000 (97.6364)  time: 0.6932  data: 0.5688  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9657 (0.9280)  acc1: 81.2000 (82.1714)  acc5: 96.4000 (96.3238)  time: 0.2597  data: 0.1371  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 1.0126 (0.9399)  acc1: 81.2000 (81.9040)  acc5: 96.0000 (96.2240)  time: 0.2091  data: 0.0879  max mem: 19097
Test: Total time: 0:00:09 (0.3995 s / it)
* Acc@1 82.462 Acc@5 96.162 loss 0.927
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Epoch: [299]  [   0/1251]  eta: 1:00:53  lr: 0.000001  min_lr: 0.000001  loss: 3.2879 (3.2879)  weight_decay: 0.0500 (0.0500)  time: 2.9202  data: 1.7758  max mem: 19097
Epoch: [299]  [ 200/1251]  eta: 0:04:47  lr: 0.000001  min_lr: 0.000001  loss: 2.3286 (2.6849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2013 (1.2398)  time: 0.2571  data: 0.0003  max mem: 19097
Epoch: [299]  [ 400/1251]  eta: 0:03:45  lr: 0.000001  min_lr: 0.000001  loss: 2.3872 (2.7014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2332 (1.2294)  time: 0.2567  data: 0.0003  max mem: 19097
Epoch: [299]  [ 600/1251]  eta: 0:02:51  lr: 0.000001  min_lr: 0.000001  loss: 2.8096 (2.7211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2338 (1.2187)  time: 0.2566  data: 0.0004  max mem: 19097
Epoch: [299]  [ 800/1251]  eta: 0:01:57  lr: 0.000001  min_lr: 0.000001  loss: 3.1388 (2.7348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1682 (1.2236)  time: 0.2568  data: 0.0004  max mem: 19097
Epoch: [299]  [1000/1251]  eta: 0:01:05  lr: 0.000001  min_lr: 0.000001  loss: 2.5573 (2.7295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1987 (1.2174)  time: 0.2572  data: 0.0004  max mem: 19097
Epoch: [299]  [1200/1251]  eta: 0:00:13  lr: 0.000001  min_lr: 0.000001  loss: 2.7978 (2.7305)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2148 (1.2272)  time: 0.2579  data: 0.0005  max mem: 19097
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.1273 (2.7261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1817 (1.2273)  time: 0.2165  data: 0.0007  max mem: 19097
Epoch: [299] Total time: 0:05:25 (0.2605 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.1273 (2.7133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1817 (1.2273)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5341 (0.5341)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.5647  data: 5.4207  max mem: 19097
Test:  [10/25]  eta: 0:00:11  loss: 0.7390 (0.7246)  acc1: 85.6000 (85.7818)  acc5: 98.0000 (97.6727)  time: 0.7399  data: 0.6167  max mem: 19097
Test:  [20/25]  eta: 0:00:02  loss: 0.9090 (0.8671)  acc1: 82.0000 (82.1143)  acc5: 96.0000 (96.2476)  time: 0.2162  data: 0.0957  max mem: 19097
Test:  [24/25]  eta: 0:00:00  loss: 0.9503 (0.8796)  acc1: 80.8000 (81.8560)  acc5: 95.6000 (96.0640)  time: 0.2181  data: 0.0986  max mem: 19097
Test: Total time: 0:00:10 (0.4207 s / it)
* Acc@1 82.476 Acc@5 96.164 loss 0.865
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.53%
Training time 7:51:41
