| distributed init (rank 0): env://, gpu 0
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 3): env://, gpu 3
Namespace(batch_size=128, epochs=300, update_freq=4, model='base', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_base_8.7G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=4, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7fa45b0a8b50>
Mixup is activated!
Model = SFCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (layer1): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
        (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(40, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=40, bias=False)
          (norm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(40, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): Identity()
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.006)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.017)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.028)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.034)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 240, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(240, 240, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=240, bias=False)
        (norm): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.040)
    )
  )
  (layer2): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=80, bias=False)
        (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=80, bias=False)
          (norm): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(80, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.046)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.051)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.057)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.063)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.068)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.074)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.080)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.085)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.091)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.097)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.102)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.108)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.114)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.120)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=480, bias=False)
        (norm): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(480, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.125)
    )
  )
  (layer3): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=160, bias=False)
        (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=160, bias=False)
          (norm): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(160, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.131)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.137)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.142)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.148)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.154)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.159)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.165)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.171)
    )
    (8): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.176)
    )
    (9): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.182)
    )
    (10): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.188)
    )
    (11): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.194)
    )
    (12): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.199)
    )
    (13): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.205)
    )
    (14): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.211)
    )
    (15): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.216)
    )
    (16): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.222)
    )
    (17): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.228)
    )
    (18): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.233)
    )
    (19): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.239)
    )
    (20): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.245)
    )
    (21): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.250)
    )
    (22): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.256)
    )
    (23): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.262)
    )
    (24): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.268)
    )
    (25): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.273)
    )
    (26): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.279)
    )
    (27): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.285)
    )
    (28): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.290)
    )
    (29): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.296)
    )
    (30): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.302)
    )
    (31): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.307)
    )
    (32): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.313)
    )
    (33): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.319)
    )
    (34): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (norm): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.324)
    )
  )
  (layer4): Sequential(
    (0): DropBottleNeck(
      (ln): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
      (le): ConvX(
        (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=320, bias=False)
        (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(320, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=320, bias=False)
          (norm): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
        (1): ConvX(
          (conv): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act()
        )
      )
      (drop_path): DropPath(drop_prob=0.330)
    )
    (1): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.336)
    )
    (2): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.342)
    )
    (3): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.347)
    )
    (4): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.353)
    )
    (5): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.359)
    )
    (6): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.364)
    )
    (7): DropBottleNeck(
      (le): ConvX(
        (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=640, bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (conv_in): ConvX(
        (conv): Conv2d(640, 1920, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): SiLU(inplace=True)
        )
      )
      (conv): ConvX(
        (conv): Conv2d(1920, 1920, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1920, bias=False)
        (norm): BatchNorm2d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act(
          (act): GSiLU()
        )
      )
      (conv_out): ConvX(
        (conv): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): Act()
      )
      (skip): Identity()
      (drop_path): DropPath(drop_prob=0.370)
    )
  )
  (head): ConvX(
    (conv): Conv2d(640, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): SiLU(inplace=True)
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 49050136
LR = 0.00400000
Batch size = 2048
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 625
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.le.conv.weight",
      "layer1.0.conv_in.conv.weight",
      "layer1.0.conv.conv.weight",
      "layer1.0.conv_out.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.le.conv.weight",
      "layer1.1.conv_in.conv.weight",
      "layer1.1.conv.conv.weight",
      "layer1.1.conv_out.conv.weight",
      "layer1.2.le.conv.weight",
      "layer1.2.conv_in.conv.weight",
      "layer1.2.conv.conv.weight",
      "layer1.2.conv_out.conv.weight",
      "layer1.3.le.conv.weight",
      "layer1.3.conv_in.conv.weight",
      "layer1.3.conv.conv.weight",
      "layer1.3.conv_out.conv.weight",
      "layer1.4.le.conv.weight",
      "layer1.4.conv_in.conv.weight",
      "layer1.4.conv.conv.weight",
      "layer1.4.conv_out.conv.weight",
      "layer1.5.le.conv.weight",
      "layer1.5.conv_in.conv.weight",
      "layer1.5.conv.conv.weight",
      "layer1.5.conv_out.conv.weight",
      "layer1.6.le.conv.weight",
      "layer1.6.conv_in.conv.weight",
      "layer1.6.conv.conv.weight",
      "layer1.6.conv_out.conv.weight",
      "layer1.7.le.conv.weight",
      "layer1.7.conv_in.conv.weight",
      "layer1.7.conv.conv.weight",
      "layer1.7.conv_out.conv.weight",
      "layer2.0.le.conv.weight",
      "layer2.0.conv_in.conv.weight",
      "layer2.0.conv.conv.weight",
      "layer2.0.conv_out.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.le.conv.weight",
      "layer2.1.conv_in.conv.weight",
      "layer2.1.conv.conv.weight",
      "layer2.1.conv_out.conv.weight",
      "layer2.2.le.conv.weight",
      "layer2.2.conv_in.conv.weight",
      "layer2.2.conv.conv.weight",
      "layer2.2.conv_out.conv.weight",
      "layer2.3.le.conv.weight",
      "layer2.3.conv_in.conv.weight",
      "layer2.3.conv.conv.weight",
      "layer2.3.conv_out.conv.weight",
      "layer2.4.le.conv.weight",
      "layer2.4.conv_in.conv.weight",
      "layer2.4.conv.conv.weight",
      "layer2.4.conv_out.conv.weight",
      "layer2.5.le.conv.weight",
      "layer2.5.conv_in.conv.weight",
      "layer2.5.conv.conv.weight",
      "layer2.5.conv_out.conv.weight",
      "layer2.6.le.conv.weight",
      "layer2.6.conv_in.conv.weight",
      "layer2.6.conv.conv.weight",
      "layer2.6.conv_out.conv.weight",
      "layer2.7.le.conv.weight",
      "layer2.7.conv_in.conv.weight",
      "layer2.7.conv.conv.weight",
      "layer2.7.conv_out.conv.weight",
      "layer2.8.le.conv.weight",
      "layer2.8.conv_in.conv.weight",
      "layer2.8.conv.conv.weight",
      "layer2.8.conv_out.conv.weight",
      "layer2.9.le.conv.weight",
      "layer2.9.conv_in.conv.weight",
      "layer2.9.conv.conv.weight",
      "layer2.9.conv_out.conv.weight",
      "layer2.10.le.conv.weight",
      "layer2.10.conv_in.conv.weight",
      "layer2.10.conv.conv.weight",
      "layer2.10.conv_out.conv.weight",
      "layer2.11.le.conv.weight",
      "layer2.11.conv_in.conv.weight",
      "layer2.11.conv.conv.weight",
      "layer2.11.conv_out.conv.weight",
      "layer2.12.le.conv.weight",
      "layer2.12.conv_in.conv.weight",
      "layer2.12.conv.conv.weight",
      "layer2.12.conv_out.conv.weight",
      "layer2.13.le.conv.weight",
      "layer2.13.conv_in.conv.weight",
      "layer2.13.conv.conv.weight",
      "layer2.13.conv_out.conv.weight",
      "layer2.14.le.conv.weight",
      "layer2.14.conv_in.conv.weight",
      "layer2.14.conv.conv.weight",
      "layer2.14.conv_out.conv.weight",
      "layer3.0.le.conv.weight",
      "layer3.0.conv_in.conv.weight",
      "layer3.0.conv.conv.weight",
      "layer3.0.conv_out.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.le.conv.weight",
      "layer3.1.conv_in.conv.weight",
      "layer3.1.conv.conv.weight",
      "layer3.1.conv_out.conv.weight",
      "layer3.2.le.conv.weight",
      "layer3.2.conv_in.conv.weight",
      "layer3.2.conv.conv.weight",
      "layer3.2.conv_out.conv.weight",
      "layer3.3.le.conv.weight",
      "layer3.3.conv_in.conv.weight",
      "layer3.3.conv.conv.weight",
      "layer3.3.conv_out.conv.weight",
      "layer3.4.le.conv.weight",
      "layer3.4.conv_in.conv.weight",
      "layer3.4.conv.conv.weight",
      "layer3.4.conv_out.conv.weight",
      "layer3.5.le.conv.weight",
      "layer3.5.conv_in.conv.weight",
      "layer3.5.conv.conv.weight",
      "layer3.5.conv_out.conv.weight",
      "layer3.6.le.conv.weight",
      "layer3.6.conv_in.conv.weight",
      "layer3.6.conv.conv.weight",
      "layer3.6.conv_out.conv.weight",
      "layer3.7.le.conv.weight",
      "layer3.7.conv_in.conv.weight",
      "layer3.7.conv.conv.weight",
      "layer3.7.conv_out.conv.weight",
      "layer3.8.le.conv.weight",
      "layer3.8.conv_in.conv.weight",
      "layer3.8.conv.conv.weight",
      "layer3.8.conv_out.conv.weight",
      "layer3.9.le.conv.weight",
      "layer3.9.conv_in.conv.weight",
      "layer3.9.conv.conv.weight",
      "layer3.9.conv_out.conv.weight",
      "layer3.10.le.conv.weight",
      "layer3.10.conv_in.conv.weight",
      "layer3.10.conv.conv.weight",
      "layer3.10.conv_out.conv.weight",
      "layer3.11.le.conv.weight",
      "layer3.11.conv_in.conv.weight",
      "layer3.11.conv.conv.weight",
      "layer3.11.conv_out.conv.weight",
      "layer3.12.le.conv.weight",
      "layer3.12.conv_in.conv.weight",
      "layer3.12.conv.conv.weight",
      "layer3.12.conv_out.conv.weight",
      "layer3.13.le.conv.weight",
      "layer3.13.conv_in.conv.weight",
      "layer3.13.conv.conv.weight",
      "layer3.13.conv_out.conv.weight",
      "layer3.14.le.conv.weight",
      "layer3.14.conv_in.conv.weight",
      "layer3.14.conv.conv.weight",
      "layer3.14.conv_out.conv.weight",
      "layer3.15.le.conv.weight",
      "layer3.15.conv_in.conv.weight",
      "layer3.15.conv.conv.weight",
      "layer3.15.conv_out.conv.weight",
      "layer3.16.le.conv.weight",
      "layer3.16.conv_in.conv.weight",
      "layer3.16.conv.conv.weight",
      "layer3.16.conv_out.conv.weight",
      "layer3.17.le.conv.weight",
      "layer3.17.conv_in.conv.weight",
      "layer3.17.conv.conv.weight",
      "layer3.17.conv_out.conv.weight",
      "layer3.18.le.conv.weight",
      "layer3.18.conv_in.conv.weight",
      "layer3.18.conv.conv.weight",
      "layer3.18.conv_out.conv.weight",
      "layer3.19.le.conv.weight",
      "layer3.19.conv_in.conv.weight",
      "layer3.19.conv.conv.weight",
      "layer3.19.conv_out.conv.weight",
      "layer3.20.le.conv.weight",
      "layer3.20.conv_in.conv.weight",
      "layer3.20.conv.conv.weight",
      "layer3.20.conv_out.conv.weight",
      "layer3.21.le.conv.weight",
      "layer3.21.conv_in.conv.weight",
      "layer3.21.conv.conv.weight",
      "layer3.21.conv_out.conv.weight",
      "layer3.22.le.conv.weight",
      "layer3.22.conv_in.conv.weight",
      "layer3.22.conv.conv.weight",
      "layer3.22.conv_out.conv.weight",
      "layer3.23.le.conv.weight",
      "layer3.23.conv_in.conv.weight",
      "layer3.23.conv.conv.weight",
      "layer3.23.conv_out.conv.weight",
      "layer3.24.le.conv.weight",
      "layer3.24.conv_in.conv.weight",
      "layer3.24.conv.conv.weight",
      "layer3.24.conv_out.conv.weight",
      "layer3.25.le.conv.weight",
      "layer3.25.conv_in.conv.weight",
      "layer3.25.conv.conv.weight",
      "layer3.25.conv_out.conv.weight",
      "layer3.26.le.conv.weight",
      "layer3.26.conv_in.conv.weight",
      "layer3.26.conv.conv.weight",
      "layer3.26.conv_out.conv.weight",
      "layer3.27.le.conv.weight",
      "layer3.27.conv_in.conv.weight",
      "layer3.27.conv.conv.weight",
      "layer3.27.conv_out.conv.weight",
      "layer3.28.le.conv.weight",
      "layer3.28.conv_in.conv.weight",
      "layer3.28.conv.conv.weight",
      "layer3.28.conv_out.conv.weight",
      "layer3.29.le.conv.weight",
      "layer3.29.conv_in.conv.weight",
      "layer3.29.conv.conv.weight",
      "layer3.29.conv_out.conv.weight",
      "layer3.30.le.conv.weight",
      "layer3.30.conv_in.conv.weight",
      "layer3.30.conv.conv.weight",
      "layer3.30.conv_out.conv.weight",
      "layer3.31.le.conv.weight",
      "layer3.31.conv_in.conv.weight",
      "layer3.31.conv.conv.weight",
      "layer3.31.conv_out.conv.weight",
      "layer3.32.le.conv.weight",
      "layer3.32.conv_in.conv.weight",
      "layer3.32.conv.conv.weight",
      "layer3.32.conv_out.conv.weight",
      "layer3.33.le.conv.weight",
      "layer3.33.conv_in.conv.weight",
      "layer3.33.conv.conv.weight",
      "layer3.33.conv_out.conv.weight",
      "layer3.34.le.conv.weight",
      "layer3.34.conv_in.conv.weight",
      "layer3.34.conv.conv.weight",
      "layer3.34.conv_out.conv.weight",
      "layer4.0.le.conv.weight",
      "layer4.0.conv_in.conv.weight",
      "layer4.0.conv.conv.weight",
      "layer4.0.conv_out.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.le.conv.weight",
      "layer4.1.conv_in.conv.weight",
      "layer4.1.conv.conv.weight",
      "layer4.1.conv_out.conv.weight",
      "layer4.2.le.conv.weight",
      "layer4.2.conv_in.conv.weight",
      "layer4.2.conv.conv.weight",
      "layer4.2.conv_out.conv.weight",
      "layer4.3.le.conv.weight",
      "layer4.3.conv_in.conv.weight",
      "layer4.3.conv.conv.weight",
      "layer4.3.conv_out.conv.weight",
      "layer4.4.le.conv.weight",
      "layer4.4.conv_in.conv.weight",
      "layer4.4.conv.conv.weight",
      "layer4.4.conv_out.conv.weight",
      "layer4.5.le.conv.weight",
      "layer4.5.conv_in.conv.weight",
      "layer4.5.conv.conv.weight",
      "layer4.5.conv_out.conv.weight",
      "layer4.6.le.conv.weight",
      "layer4.6.conv_in.conv.weight",
      "layer4.6.conv.conv.weight",
      "layer4.6.conv_out.conv.weight",
      "layer4.7.le.conv.weight",
      "layer4.7.conv_in.conv.weight",
      "layer4.7.conv.conv.weight",
      "layer4.7.conv_out.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.ln.weight",
      "layer1.0.ln.bias",
      "layer1.0.le.norm.weight",
      "layer1.0.le.norm.bias",
      "layer1.0.conv_in.norm.weight",
      "layer1.0.conv_in.norm.bias",
      "layer1.0.conv.norm.weight",
      "layer1.0.conv.norm.bias",
      "layer1.0.conv_out.norm.weight",
      "layer1.0.conv_out.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.le.norm.weight",
      "layer1.1.le.norm.bias",
      "layer1.1.conv_in.norm.weight",
      "layer1.1.conv_in.norm.bias",
      "layer1.1.conv.norm.weight",
      "layer1.1.conv.norm.bias",
      "layer1.1.conv_out.norm.weight",
      "layer1.1.conv_out.norm.bias",
      "layer1.2.le.norm.weight",
      "layer1.2.le.norm.bias",
      "layer1.2.conv_in.norm.weight",
      "layer1.2.conv_in.norm.bias",
      "layer1.2.conv.norm.weight",
      "layer1.2.conv.norm.bias",
      "layer1.2.conv_out.norm.weight",
      "layer1.2.conv_out.norm.bias",
      "layer1.3.le.norm.weight",
      "layer1.3.le.norm.bias",
      "layer1.3.conv_in.norm.weight",
      "layer1.3.conv_in.norm.bias",
      "layer1.3.conv.norm.weight",
      "layer1.3.conv.norm.bias",
      "layer1.3.conv_out.norm.weight",
      "layer1.3.conv_out.norm.bias",
      "layer1.4.le.norm.weight",
      "layer1.4.le.norm.bias",
      "layer1.4.conv_in.norm.weight",
      "layer1.4.conv_in.norm.bias",
      "layer1.4.conv.norm.weight",
      "layer1.4.conv.norm.bias",
      "layer1.4.conv_out.norm.weight",
      "layer1.4.conv_out.norm.bias",
      "layer1.5.le.norm.weight",
      "layer1.5.le.norm.bias",
      "layer1.5.conv_in.norm.weight",
      "layer1.5.conv_in.norm.bias",
      "layer1.5.conv.norm.weight",
      "layer1.5.conv.norm.bias",
      "layer1.5.conv_out.norm.weight",
      "layer1.5.conv_out.norm.bias",
      "layer1.6.le.norm.weight",
      "layer1.6.le.norm.bias",
      "layer1.6.conv_in.norm.weight",
      "layer1.6.conv_in.norm.bias",
      "layer1.6.conv.norm.weight",
      "layer1.6.conv.norm.bias",
      "layer1.6.conv_out.norm.weight",
      "layer1.6.conv_out.norm.bias",
      "layer1.7.le.norm.weight",
      "layer1.7.le.norm.bias",
      "layer1.7.conv_in.norm.weight",
      "layer1.7.conv_in.norm.bias",
      "layer1.7.conv.norm.weight",
      "layer1.7.conv.norm.bias",
      "layer1.7.conv_out.norm.weight",
      "layer1.7.conv_out.norm.bias",
      "layer2.0.ln.weight",
      "layer2.0.ln.bias",
      "layer2.0.le.norm.weight",
      "layer2.0.le.norm.bias",
      "layer2.0.conv_in.norm.weight",
      "layer2.0.conv_in.norm.bias",
      "layer2.0.conv.norm.weight",
      "layer2.0.conv.norm.bias",
      "layer2.0.conv_out.norm.weight",
      "layer2.0.conv_out.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.le.norm.weight",
      "layer2.1.le.norm.bias",
      "layer2.1.conv_in.norm.weight",
      "layer2.1.conv_in.norm.bias",
      "layer2.1.conv.norm.weight",
      "layer2.1.conv.norm.bias",
      "layer2.1.conv_out.norm.weight",
      "layer2.1.conv_out.norm.bias",
      "layer2.2.le.norm.weight",
      "layer2.2.le.norm.bias",
      "layer2.2.conv_in.norm.weight",
      "layer2.2.conv_in.norm.bias",
      "layer2.2.conv.norm.weight",
      "layer2.2.conv.norm.bias",
      "layer2.2.conv_out.norm.weight",
      "layer2.2.conv_out.norm.bias",
      "layer2.3.le.norm.weight",
      "layer2.3.le.norm.bias",
      "layer2.3.conv_in.norm.weight",
      "layer2.3.conv_in.norm.bias",
      "layer2.3.conv.norm.weight",
      "layer2.3.conv.norm.bias",
      "layer2.3.conv_out.norm.weight",
      "layer2.3.conv_out.norm.bias",
      "layer2.4.le.norm.weight",
      "layer2.4.le.norm.bias",
      "layer2.4.conv_in.norm.weight",
      "layer2.4.conv_in.norm.bias",
      "layer2.4.conv.norm.weight",
      "layer2.4.conv.norm.bias",
      "layer2.4.conv_out.norm.weight",
      "layer2.4.conv_out.norm.bias",
      "layer2.5.le.norm.weight",
      "layer2.5.le.norm.bias",
      "layer2.5.conv_in.norm.weight",
      "layer2.5.conv_in.norm.bias",
      "layer2.5.conv.norm.weight",
      "layer2.5.conv.norm.bias",
      "layer2.5.conv_out.norm.weight",
      "layer2.5.conv_out.norm.bias",
      "layer2.6.le.norm.weight",
      "layer2.6.le.norm.bias",
      "layer2.6.conv_in.norm.weight",
      "layer2.6.conv_in.norm.bias",
      "layer2.6.conv.norm.weight",
      "layer2.6.conv.norm.bias",
      "layer2.6.conv_out.norm.weight",
      "layer2.6.conv_out.norm.bias",
      "layer2.7.le.norm.weight",
      "layer2.7.le.norm.bias",
      "layer2.7.conv_in.norm.weight",
      "layer2.7.conv_in.norm.bias",
      "layer2.7.conv.norm.weight",
      "layer2.7.conv.norm.bias",
      "layer2.7.conv_out.norm.weight",
      "layer2.7.conv_out.norm.bias",
      "layer2.8.le.norm.weight",
      "layer2.8.le.norm.bias",
      "layer2.8.conv_in.norm.weight",
      "layer2.8.conv_in.norm.bias",
      "layer2.8.conv.norm.weight",
      "layer2.8.conv.norm.bias",
      "layer2.8.conv_out.norm.weight",
      "layer2.8.conv_out.norm.bias",
      "layer2.9.le.norm.weight",
      "layer2.9.le.norm.bias",
      "layer2.9.conv_in.norm.weight",
      "layer2.9.conv_in.norm.bias",
      "layer2.9.conv.norm.weight",
      "layer2.9.conv.norm.bias",
      "layer2.9.conv_out.norm.weight",
      "layer2.9.conv_out.norm.bias",
      "layer2.10.le.norm.weight",
      "layer2.10.le.norm.bias",
      "layer2.10.conv_in.norm.weight",
      "layer2.10.conv_in.norm.bias",
      "layer2.10.conv.norm.weight",
      "layer2.10.conv.norm.bias",
      "layer2.10.conv_out.norm.weight",
      "layer2.10.conv_out.norm.bias",
      "layer2.11.le.norm.weight",
      "layer2.11.le.norm.bias",
      "layer2.11.conv_in.norm.weight",
      "layer2.11.conv_in.norm.bias",
      "layer2.11.conv.norm.weight",
      "layer2.11.conv.norm.bias",
      "layer2.11.conv_out.norm.weight",
      "layer2.11.conv_out.norm.bias",
      "layer2.12.le.norm.weight",
      "layer2.12.le.norm.bias",
      "layer2.12.conv_in.norm.weight",
      "layer2.12.conv_in.norm.bias",
      "layer2.12.conv.norm.weight",
      "layer2.12.conv.norm.bias",
      "layer2.12.conv_out.norm.weight",
      "layer2.12.conv_out.norm.bias",
      "layer2.13.le.norm.weight",
      "layer2.13.le.norm.bias",
      "layer2.13.conv_in.norm.weight",
      "layer2.13.conv_in.norm.bias",
      "layer2.13.conv.norm.weight",
      "layer2.13.conv.norm.bias",
      "layer2.13.conv_out.norm.weight",
      "layer2.13.conv_out.norm.bias",
      "layer2.14.le.norm.weight",
      "layer2.14.le.norm.bias",
      "layer2.14.conv_in.norm.weight",
      "layer2.14.conv_in.norm.bias",
      "layer2.14.conv.norm.weight",
      "layer2.14.conv.norm.bias",
      "layer2.14.conv_out.norm.weight",
      "layer2.14.conv_out.norm.bias",
      "layer3.0.ln.weight",
      "layer3.0.ln.bias",
      "layer3.0.le.norm.weight",
      "layer3.0.le.norm.bias",
      "layer3.0.conv_in.norm.weight",
      "layer3.0.conv_in.norm.bias",
      "layer3.0.conv.norm.weight",
      "layer3.0.conv.norm.bias",
      "layer3.0.conv_out.norm.weight",
      "layer3.0.conv_out.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.le.norm.weight",
      "layer3.1.le.norm.bias",
      "layer3.1.conv_in.norm.weight",
      "layer3.1.conv_in.norm.bias",
      "layer3.1.conv.norm.weight",
      "layer3.1.conv.norm.bias",
      "layer3.1.conv_out.norm.weight",
      "layer3.1.conv_out.norm.bias",
      "layer3.2.le.norm.weight",
      "layer3.2.le.norm.bias",
      "layer3.2.conv_in.norm.weight",
      "layer3.2.conv_in.norm.bias",
      "layer3.2.conv.norm.weight",
      "layer3.2.conv.norm.bias",
      "layer3.2.conv_out.norm.weight",
      "layer3.2.conv_out.norm.bias",
      "layer3.3.le.norm.weight",
      "layer3.3.le.norm.bias",
      "layer3.3.conv_in.norm.weight",
      "layer3.3.conv_in.norm.bias",
      "layer3.3.conv.norm.weight",
      "layer3.3.conv.norm.bias",
      "layer3.3.conv_out.norm.weight",
      "layer3.3.conv_out.norm.bias",
      "layer3.4.le.norm.weight",
      "layer3.4.le.norm.bias",
      "layer3.4.conv_in.norm.weight",
      "layer3.4.conv_in.norm.bias",
      "layer3.4.conv.norm.weight",
      "layer3.4.conv.norm.bias",
      "layer3.4.conv_out.norm.weight",
      "layer3.4.conv_out.norm.bias",
      "layer3.5.le.norm.weight",
      "layer3.5.le.norm.bias",
      "layer3.5.conv_in.norm.weight",
      "layer3.5.conv_in.norm.bias",
      "layer3.5.conv.norm.weight",
      "layer3.5.conv.norm.bias",
      "layer3.5.conv_out.norm.weight",
      "layer3.5.conv_out.norm.bias",
      "layer3.6.le.norm.weight",
      "layer3.6.le.norm.bias",
      "layer3.6.conv_in.norm.weight",
      "layer3.6.conv_in.norm.bias",
      "layer3.6.conv.norm.weight",
      "layer3.6.conv.norm.bias",
      "layer3.6.conv_out.norm.weight",
      "layer3.6.conv_out.norm.bias",
      "layer3.7.le.norm.weight",
      "layer3.7.le.norm.bias",
      "layer3.7.conv_in.norm.weight",
      "layer3.7.conv_in.norm.bias",
      "layer3.7.conv.norm.weight",
      "layer3.7.conv.norm.bias",
      "layer3.7.conv_out.norm.weight",
      "layer3.7.conv_out.norm.bias",
      "layer3.8.le.norm.weight",
      "layer3.8.le.norm.bias",
      "layer3.8.conv_in.norm.weight",
      "layer3.8.conv_in.norm.bias",
      "layer3.8.conv.norm.weight",
      "layer3.8.conv.norm.bias",
      "layer3.8.conv_out.norm.weight",
      "layer3.8.conv_out.norm.bias",
      "layer3.9.le.norm.weight",
      "layer3.9.le.norm.bias",
      "layer3.9.conv_in.norm.weight",
      "layer3.9.conv_in.norm.bias",
      "layer3.9.conv.norm.weight",
      "layer3.9.conv.norm.bias",
      "layer3.9.conv_out.norm.weight",
      "layer3.9.conv_out.norm.bias",
      "layer3.10.le.norm.weight",
      "layer3.10.le.norm.bias",
      "layer3.10.conv_in.norm.weight",
      "layer3.10.conv_in.norm.bias",
      "layer3.10.conv.norm.weight",
      "layer3.10.conv.norm.bias",
      "layer3.10.conv_out.norm.weight",
      "layer3.10.conv_out.norm.bias",
      "layer3.11.le.norm.weight",
      "layer3.11.le.norm.bias",
      "layer3.11.conv_in.norm.weight",
      "layer3.11.conv_in.norm.bias",
      "layer3.11.conv.norm.weight",
      "layer3.11.conv.norm.bias",
      "layer3.11.conv_out.norm.weight",
      "layer3.11.conv_out.norm.bias",
      "layer3.12.le.norm.weight",
      "layer3.12.le.norm.bias",
      "layer3.12.conv_in.norm.weight",
      "layer3.12.conv_in.norm.bias",
      "layer3.12.conv.norm.weight",
      "layer3.12.conv.norm.bias",
      "layer3.12.conv_out.norm.weight",
      "layer3.12.conv_out.norm.bias",
      "layer3.13.le.norm.weight",
      "layer3.13.le.norm.bias",
      "layer3.13.conv_in.norm.weight",
      "layer3.13.conv_in.norm.bias",
      "layer3.13.conv.norm.weight",
      "layer3.13.conv.norm.bias",
      "layer3.13.conv_out.norm.weight",
      "layer3.13.conv_out.norm.bias",
      "layer3.14.le.norm.weight",
      "layer3.14.le.norm.bias",
      "layer3.14.conv_in.norm.weight",
      "layer3.14.conv_in.norm.bias",
      "layer3.14.conv.norm.weight",
      "layer3.14.conv.norm.bias",
      "layer3.14.conv_out.norm.weight",
      "layer3.14.conv_out.norm.bias",
      "layer3.15.le.norm.weight",
      "layer3.15.le.norm.bias",
      "layer3.15.conv_in.norm.weight",
      "layer3.15.conv_in.norm.bias",
      "layer3.15.conv.norm.weight",
      "layer3.15.conv.norm.bias",
      "layer3.15.conv_out.norm.weight",
      "layer3.15.conv_out.norm.bias",
      "layer3.16.le.norm.weight",
      "layer3.16.le.norm.bias",
      "layer3.16.conv_in.norm.weight",
      "layer3.16.conv_in.norm.bias",
      "layer3.16.conv.norm.weight",
      "layer3.16.conv.norm.bias",
      "layer3.16.conv_out.norm.weight",
      "layer3.16.conv_out.norm.bias",
      "layer3.17.le.norm.weight",
      "layer3.17.le.norm.bias",
      "layer3.17.conv_in.norm.weight",
      "layer3.17.conv_in.norm.bias",
      "layer3.17.conv.norm.weight",
      "layer3.17.conv.norm.bias",
      "layer3.17.conv_out.norm.weight",
      "layer3.17.conv_out.norm.bias",
      "layer3.18.le.norm.weight",
      "layer3.18.le.norm.bias",
      "layer3.18.conv_in.norm.weight",
      "layer3.18.conv_in.norm.bias",
      "layer3.18.conv.norm.weight",
      "layer3.18.conv.norm.bias",
      "layer3.18.conv_out.norm.weight",
      "layer3.18.conv_out.norm.bias",
      "layer3.19.le.norm.weight",
      "layer3.19.le.norm.bias",
      "layer3.19.conv_in.norm.weight",
      "layer3.19.conv_in.norm.bias",
      "layer3.19.conv.norm.weight",
      "layer3.19.conv.norm.bias",
      "layer3.19.conv_out.norm.weight",
      "layer3.19.conv_out.norm.bias",
      "layer3.20.le.norm.weight",
      "layer3.20.le.norm.bias",
      "layer3.20.conv_in.norm.weight",
      "layer3.20.conv_in.norm.bias",
      "layer3.20.conv.norm.weight",
      "layer3.20.conv.norm.bias",
      "layer3.20.conv_out.norm.weight",
      "layer3.20.conv_out.norm.bias",
      "layer3.21.le.norm.weight",
      "layer3.21.le.norm.bias",
      "layer3.21.conv_in.norm.weight",
      "layer3.21.conv_in.norm.bias",
      "layer3.21.conv.norm.weight",
      "layer3.21.conv.norm.bias",
      "layer3.21.conv_out.norm.weight",
      "layer3.21.conv_out.norm.bias",
      "layer3.22.le.norm.weight",
      "layer3.22.le.norm.bias",
      "layer3.22.conv_in.norm.weight",
      "layer3.22.conv_in.norm.bias",
      "layer3.22.conv.norm.weight",
      "layer3.22.conv.norm.bias",
      "layer3.22.conv_out.norm.weight",
      "layer3.22.conv_out.norm.bias",
      "layer3.23.le.norm.weight",
      "layer3.23.le.norm.bias",
      "layer3.23.conv_in.norm.weight",
      "layer3.23.conv_in.norm.bias",
      "layer3.23.conv.norm.weight",
      "layer3.23.conv.norm.bias",
      "layer3.23.conv_out.norm.weight",
      "layer3.23.conv_out.norm.bias",
      "layer3.24.le.norm.weight",
      "layer3.24.le.norm.bias",
      "layer3.24.conv_in.norm.weight",
      "layer3.24.conv_in.norm.bias",
      "layer3.24.conv.norm.weight",
      "layer3.24.conv.norm.bias",
      "layer3.24.conv_out.norm.weight",
      "layer3.24.conv_out.norm.bias",
      "layer3.25.le.norm.weight",
      "layer3.25.le.norm.bias",
      "layer3.25.conv_in.norm.weight",
      "layer3.25.conv_in.norm.bias",
      "layer3.25.conv.norm.weight",
      "layer3.25.conv.norm.bias",
      "layer3.25.conv_out.norm.weight",
      "layer3.25.conv_out.norm.bias",
      "layer3.26.le.norm.weight",
      "layer3.26.le.norm.bias",
      "layer3.26.conv_in.norm.weight",
      "layer3.26.conv_in.norm.bias",
      "layer3.26.conv.norm.weight",
      "layer3.26.conv.norm.bias",
      "layer3.26.conv_out.norm.weight",
      "layer3.26.conv_out.norm.bias",
      "layer3.27.le.norm.weight",
      "layer3.27.le.norm.bias",
      "layer3.27.conv_in.norm.weight",
      "layer3.27.conv_in.norm.bias",
      "layer3.27.conv.norm.weight",
      "layer3.27.conv.norm.bias",
      "layer3.27.conv_out.norm.weight",
      "layer3.27.conv_out.norm.bias",
      "layer3.28.le.norm.weight",
      "layer3.28.le.norm.bias",
      "layer3.28.conv_in.norm.weight",
      "layer3.28.conv_in.norm.bias",
      "layer3.28.conv.norm.weight",
      "layer3.28.conv.norm.bias",
      "layer3.28.conv_out.norm.weight",
      "layer3.28.conv_out.norm.bias",
      "layer3.29.le.norm.weight",
      "layer3.29.le.norm.bias",
      "layer3.29.conv_in.norm.weight",
      "layer3.29.conv_in.norm.bias",
      "layer3.29.conv.norm.weight",
      "layer3.29.conv.norm.bias",
      "layer3.29.conv_out.norm.weight",
      "layer3.29.conv_out.norm.bias",
      "layer3.30.le.norm.weight",
      "layer3.30.le.norm.bias",
      "layer3.30.conv_in.norm.weight",
      "layer3.30.conv_in.norm.bias",
      "layer3.30.conv.norm.weight",
      "layer3.30.conv.norm.bias",
      "layer3.30.conv_out.norm.weight",
      "layer3.30.conv_out.norm.bias",
      "layer3.31.le.norm.weight",
      "layer3.31.le.norm.bias",
      "layer3.31.conv_in.norm.weight",
      "layer3.31.conv_in.norm.bias",
      "layer3.31.conv.norm.weight",
      "layer3.31.conv.norm.bias",
      "layer3.31.conv_out.norm.weight",
      "layer3.31.conv_out.norm.bias",
      "layer3.32.le.norm.weight",
      "layer3.32.le.norm.bias",
      "layer3.32.conv_in.norm.weight",
      "layer3.32.conv_in.norm.bias",
      "layer3.32.conv.norm.weight",
      "layer3.32.conv.norm.bias",
      "layer3.32.conv_out.norm.weight",
      "layer3.32.conv_out.norm.bias",
      "layer3.33.le.norm.weight",
      "layer3.33.le.norm.bias",
      "layer3.33.conv_in.norm.weight",
      "layer3.33.conv_in.norm.bias",
      "layer3.33.conv.norm.weight",
      "layer3.33.conv.norm.bias",
      "layer3.33.conv_out.norm.weight",
      "layer3.33.conv_out.norm.bias",
      "layer3.34.le.norm.weight",
      "layer3.34.le.norm.bias",
      "layer3.34.conv_in.norm.weight",
      "layer3.34.conv_in.norm.bias",
      "layer3.34.conv.norm.weight",
      "layer3.34.conv.norm.bias",
      "layer3.34.conv_out.norm.weight",
      "layer3.34.conv_out.norm.bias",
      "layer4.0.ln.weight",
      "layer4.0.ln.bias",
      "layer4.0.le.norm.weight",
      "layer4.0.le.norm.bias",
      "layer4.0.conv_in.norm.weight",
      "layer4.0.conv_in.norm.bias",
      "layer4.0.conv.norm.weight",
      "layer4.0.conv.norm.bias",
      "layer4.0.conv_out.norm.weight",
      "layer4.0.conv_out.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.le.norm.weight",
      "layer4.1.le.norm.bias",
      "layer4.1.conv_in.norm.weight",
      "layer4.1.conv_in.norm.bias",
      "layer4.1.conv.norm.weight",
      "layer4.1.conv.norm.bias",
      "layer4.1.conv_out.norm.weight",
      "layer4.1.conv_out.norm.bias",
      "layer4.2.le.norm.weight",
      "layer4.2.le.norm.bias",
      "layer4.2.conv_in.norm.weight",
      "layer4.2.conv_in.norm.bias",
      "layer4.2.conv.norm.weight",
      "layer4.2.conv.norm.bias",
      "layer4.2.conv_out.norm.weight",
      "layer4.2.conv_out.norm.bias",
      "layer4.3.le.norm.weight",
      "layer4.3.le.norm.bias",
      "layer4.3.conv_in.norm.weight",
      "layer4.3.conv_in.norm.bias",
      "layer4.3.conv.norm.weight",
      "layer4.3.conv.norm.bias",
      "layer4.3.conv_out.norm.weight",
      "layer4.3.conv_out.norm.bias",
      "layer4.4.le.norm.weight",
      "layer4.4.le.norm.bias",
      "layer4.4.conv_in.norm.weight",
      "layer4.4.conv_in.norm.bias",
      "layer4.4.conv.norm.weight",
      "layer4.4.conv.norm.bias",
      "layer4.4.conv_out.norm.weight",
      "layer4.4.conv_out.norm.bias",
      "layer4.5.le.norm.weight",
      "layer4.5.le.norm.bias",
      "layer4.5.conv_in.norm.weight",
      "layer4.5.conv_in.norm.bias",
      "layer4.5.conv.norm.weight",
      "layer4.5.conv.norm.bias",
      "layer4.5.conv_out.norm.weight",
      "layer4.5.conv_out.norm.bias",
      "layer4.6.le.norm.weight",
      "layer4.6.le.norm.bias",
      "layer4.6.conv_in.norm.weight",
      "layer4.6.conv_in.norm.bias",
      "layer4.6.conv.norm.weight",
      "layer4.6.conv.norm.bias",
      "layer4.6.conv_out.norm.weight",
      "layer4.6.conv_out.norm.bias",
      "layer4.7.le.norm.weight",
      "layer4.7.le.norm.bias",
      "layer4.7.conv_in.norm.weight",
      "layer4.7.conv_in.norm.bias",
      "layer4.7.conv.norm.weight",
      "layer4.7.conv.norm.bias",
      "layer4.7.conv_out.norm.weight",
      "layer4.7.conv_out.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 12500
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: checkpoint_base_8.7G/checkpoint-161.pth
Resume checkpoint checkpoint_base_8.7G/checkpoint-161.pth
With optim & sched!
Start training for 300 epochs
Epoch: [162]  [   0/2502]  eta: 8:07:05  lr: 0.001956  min_lr: 0.001956  loss: 4.1324 (4.1324)  weight_decay: 0.0500 (0.0500)  time: 11.6810  data: 1.8511  max mem: 42519
Epoch: [162]  [ 200/2502]  eta: 0:20:45  lr: 0.001954  min_lr: 0.001954  loss: 3.2170 (3.3231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.7432)  time: 0.4847  data: 0.0005  max mem: 42519
Epoch: [162]  [ 400/2502]  eta: 0:17:59  lr: 0.001952  min_lr: 0.001952  loss: 3.4447 (3.2898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7164 (0.7436)  time: 0.4849  data: 0.0009  max mem: 42519
Epoch: [162]  [ 600/2502]  eta: 0:15:57  lr: 0.001950  min_lr: 0.001950  loss: 3.5027 (3.3021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7574 (0.7417)  time: 0.4837  data: 0.0008  max mem: 42519
Epoch: [162]  [ 800/2502]  eta: 0:14:08  lr: 0.001948  min_lr: 0.001948  loss: 3.4907 (3.3029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (0.7425)  time: 0.4829  data: 0.0008  max mem: 42519
Epoch: [162]  [1000/2502]  eta: 0:12:24  lr: 0.001947  min_lr: 0.001947  loss: 3.4348 (3.2892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (0.7420)  time: 0.4833  data: 0.0007  max mem: 42519
Epoch: [162]  [1200/2502]  eta: 0:10:43  lr: 0.001945  min_lr: 0.001945  loss: 3.5038 (3.2970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.7447)  time: 0.4836  data: 0.0007  max mem: 42519
Epoch: [162]  [1400/2502]  eta: 0:09:03  lr: 0.001943  min_lr: 0.001943  loss: 3.3460 (3.3016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6914 (0.7451)  time: 0.4826  data: 0.0005  max mem: 42519
Epoch: [162]  [1600/2502]  eta: 0:07:23  lr: 0.001941  min_lr: 0.001941  loss: 3.5930 (3.3016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (0.7491)  time: 0.4828  data: 0.0005  max mem: 42519
Epoch: [162]  [1800/2502]  eta: 0:05:44  lr: 0.001939  min_lr: 0.001939  loss: 3.4515 (3.3010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7527 (0.7493)  time: 0.4837  data: 0.0006  max mem: 42519
Epoch: [162]  [2000/2502]  eta: 0:04:06  lr: 0.001938  min_lr: 0.001938  loss: 3.1378 (3.2923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6761 (0.7422)  time: 0.4836  data: 0.0005  max mem: 42519
Epoch: [162]  [2200/2502]  eta: 0:02:27  lr: 0.001936  min_lr: 0.001936  loss: 3.3877 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7675 (0.7441)  time: 0.4834  data: 0.0005  max mem: 42519
Epoch: [162]  [2400/2502]  eta: 0:00:49  lr: 0.001934  min_lr: 0.001934  loss: 3.0248 (3.2947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7311 (0.7453)  time: 0.4835  data: 0.0006  max mem: 42519
Epoch: [162]  [2501/2502]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 3.3168 (3.2961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (0.7447)  time: 0.4547  data: 0.0006  max mem: 42519
Epoch: [162] Total time: 0:20:24 (0.4892 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 3.3168 (3.2992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (0.7447)
Test:  [ 0/50]  eta: 0:06:50  loss: 0.5896 (0.5896)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 8.2037  data: 3.0515  max mem: 42519
Test:  [10/50]  eta: 0:00:38  loss: 0.8186 (0.8396)  acc1: 84.8000 (84.7273)  acc5: 97.6000 (97.3091)  time: 0.9648  data: 0.2778  max mem: 42519
Test:  [20/50]  eta: 0:00:18  loss: 0.8654 (0.8735)  acc1: 84.0000 (83.6381)  acc5: 97.6000 (97.3143)  time: 0.2405  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:09  loss: 1.0483 (0.9731)  acc1: 80.8000 (81.7677)  acc5: 95.2000 (96.0258)  time: 0.2386  data: 0.0004  max mem: 42519
Test:  [40/50]  eta: 0:00:04  loss: 1.1931 (1.0389)  acc1: 76.0000 (80.0098)  acc5: 92.8000 (95.3268)  time: 0.2370  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2053 (1.0554)  acc1: 74.4000 (79.3040)  acc5: 93.2000 (95.2320)  time: 0.2368  data: 0.0002  max mem: 42519
Test: Total time: 0:00:19 (0.3995 s / it)
* Acc@1 79.554 Acc@5 95.150 loss 1.055
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.55%
Epoch: [163]  [   0/2502]  eta: 1:23:18  lr: 0.001933  min_lr: 0.001933  loss: 3.7858 (3.7858)  weight_decay: 0.0500 (0.0500)  time: 1.9977  data: 1.4424  max mem: 42519
Epoch: [163]  [ 200/2502]  eta: 0:18:42  lr: 0.001931  min_lr: 0.001931  loss: 3.5037 (3.3744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7477 (0.7376)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [163]  [ 400/2502]  eta: 0:16:56  lr: 0.001930  min_lr: 0.001930  loss: 3.2161 (3.3017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (0.7440)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [163]  [ 600/2502]  eta: 0:15:19  lr: 0.001928  min_lr: 0.001928  loss: 3.4614 (3.2984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.7493)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [163]  [ 800/2502]  eta: 0:13:41  lr: 0.001926  min_lr: 0.001926  loss: 3.2863 (3.2978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7503 (0.7493)  time: 0.4837  data: 0.0004  max mem: 42519
Epoch: [163]  [1000/2502]  eta: 0:12:04  lr: 0.001924  min_lr: 0.001924  loss: 3.2517 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6910 (0.7472)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [163]  [1200/2502]  eta: 0:10:28  lr: 0.001922  min_lr: 0.001922  loss: 3.4462 (3.3045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (0.7446)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [163]  [1400/2502]  eta: 0:08:51  lr: 0.001921  min_lr: 0.001921  loss: 3.3393 (3.3008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7169 (0.7411)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [163]  [1600/2502]  eta: 0:07:14  lr: 0.001919  min_lr: 0.001919  loss: 3.2057 (3.2970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7804 (0.7451)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [163]  [1800/2502]  eta: 0:05:38  lr: 0.001917  min_lr: 0.001917  loss: 3.4793 (3.2959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7387 (0.7471)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [163]  [2000/2502]  eta: 0:04:01  lr: 0.001915  min_lr: 0.001915  loss: 3.3514 (3.2956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7544 (0.7460)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [163]  [2200/2502]  eta: 0:02:25  lr: 0.001913  min_lr: 0.001913  loss: 3.5985 (3.3023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7477)  time: 0.4892  data: 0.0004  max mem: 42519
Epoch: [163]  [2400/2502]  eta: 0:00:49  lr: 0.001912  min_lr: 0.001912  loss: 3.3641 (3.2998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7100 (0.7468)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [163]  [2501/2502]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 3.5406 (3.3026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.7458)  time: 0.4346  data: 0.0009  max mem: 42519
Epoch: [163] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 3.5406 (3.3012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.7458)
Test:  [ 0/50]  eta: 0:04:01  loss: 0.6528 (0.6528)  acc1: 92.8000 (92.8000)  acc5: 99.2000 (99.2000)  time: 4.8365  data: 4.5577  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 1.0167 (0.9924)  acc1: 84.0000 (84.9455)  acc5: 98.0000 (97.2364)  time: 0.6553  data: 0.4147  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 1.0167 (1.0147)  acc1: 81.6000 (83.7333)  acc5: 97.2000 (97.2762)  time: 0.2375  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.1416 (1.1080)  acc1: 79.6000 (81.8452)  acc5: 95.2000 (96.0903)  time: 0.2379  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.4023 (1.1844)  acc1: 77.2000 (79.9805)  acc5: 93.2000 (95.3268)  time: 0.2377  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.3622 (1.1987)  acc1: 74.8000 (79.4880)  acc5: 92.8000 (95.1360)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3320 s / it)
* Acc@1 79.496 Acc@5 95.134 loss 1.198
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.55%
Epoch: [164]  [   0/2502]  eta: 1:36:44  lr: 0.001911  min_lr: 0.001911  loss: 3.2498 (3.2498)  weight_decay: 0.0500 (0.0500)  time: 2.3201  data: 1.6245  max mem: 42519
Epoch: [164]  [ 200/2502]  eta: 0:18:45  lr: 0.001909  min_lr: 0.001909  loss: 3.3330 (3.2775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7379 (0.7186)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [164]  [ 400/2502]  eta: 0:16:59  lr: 0.001907  min_lr: 0.001907  loss: 3.5117 (3.3013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7449 (0.7320)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [164]  [ 600/2502]  eta: 0:15:19  lr: 0.001905  min_lr: 0.001905  loss: 3.4302 (3.2927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7118 (0.7338)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [164]  [ 800/2502]  eta: 0:13:41  lr: 0.001904  min_lr: 0.001904  loss: 3.2459 (3.2916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.7401)  time: 0.4857  data: 0.0005  max mem: 42519
Epoch: [164]  [1000/2502]  eta: 0:12:04  lr: 0.001902  min_lr: 0.001902  loss: 3.4467 (3.3080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7357 (0.7402)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [164]  [1200/2502]  eta: 0:10:27  lr: 0.001900  min_lr: 0.001900  loss: 3.4596 (3.3058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6996 (0.7416)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [164]  [1400/2502]  eta: 0:08:51  lr: 0.001898  min_lr: 0.001898  loss: 3.2764 (3.3131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7182 (0.7436)  time: 0.4855  data: 0.0005  max mem: 42519
Epoch: [164]  [1600/2502]  eta: 0:07:14  lr: 0.001896  min_lr: 0.001896  loss: 3.5351 (3.3101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7549 (0.7469)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [164]  [1800/2502]  eta: 0:05:38  lr: 0.001895  min_lr: 0.001895  loss: 3.4682 (3.3049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.7471)  time: 0.4822  data: 0.0005  max mem: 42519
Epoch: [164]  [2000/2502]  eta: 0:04:01  lr: 0.001893  min_lr: 0.001893  loss: 3.1115 (3.2983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7202 (0.7477)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [164]  [2200/2502]  eta: 0:02:25  lr: 0.001891  min_lr: 0.001891  loss: 3.4702 (3.2979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.7481)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [164]  [2400/2502]  eta: 0:00:49  lr: 0.001889  min_lr: 0.001889  loss: 3.2920 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7436 (0.7513)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [164]  [2501/2502]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 3.2033 (3.2967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7240 (0.7501)  time: 0.4479  data: 0.0010  max mem: 42519
Epoch: [164] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 3.2033 (3.2954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7240 (0.7501)
Test:  [ 0/50]  eta: 0:03:38  loss: 0.5469 (0.5469)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 4.3754  data: 4.0920  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.8901 (0.8890)  acc1: 84.0000 (84.6909)  acc5: 97.2000 (97.0182)  time: 0.6133  data: 0.3724  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.9099 (0.9089)  acc1: 82.8000 (83.5048)  acc5: 97.2000 (97.1619)  time: 0.2375  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0356 (1.0025)  acc1: 79.2000 (81.6258)  acc5: 95.6000 (95.8839)  time: 0.2378  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1965 (1.0561)  acc1: 75.6000 (80.0098)  acc5: 92.4000 (95.1805)  time: 0.2375  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1782 (1.0707)  acc1: 75.6000 (79.4240)  acc5: 92.4000 (95.0560)  time: 0.2372  data: 0.0001  max mem: 42519
Test: Total time: 0:00:16 (0.3223 s / it)
* Acc@1 79.518 Acc@5 95.240 loss 1.070
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.55%
Epoch: [165]  [   0/2502]  eta: 1:35:40  lr: 0.001888  min_lr: 0.001888  loss: 3.3585 (3.3585)  weight_decay: 0.0500 (0.0500)  time: 2.2945  data: 1.6614  max mem: 42519
Epoch: [165]  [ 200/2502]  eta: 0:18:43  lr: 0.001887  min_lr: 0.001887  loss: 3.5060 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7439 (0.7537)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [165]  [ 400/2502]  eta: 0:16:57  lr: 0.001885  min_lr: 0.001885  loss: 3.4674 (3.2570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7410 (0.7477)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [165]  [ 600/2502]  eta: 0:15:18  lr: 0.001883  min_lr: 0.001883  loss: 3.5305 (3.2625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.7475)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [165]  [ 800/2502]  eta: 0:13:40  lr: 0.001881  min_lr: 0.001881  loss: 3.3704 (3.2627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6738 (0.7445)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [165]  [1000/2502]  eta: 0:12:03  lr: 0.001879  min_lr: 0.001879  loss: 3.3852 (3.2671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7461 (0.7519)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [165]  [1200/2502]  eta: 0:10:26  lr: 0.001878  min_lr: 0.001878  loss: 3.3558 (3.2723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8065 (0.7530)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [165]  [1400/2502]  eta: 0:08:50  lr: 0.001876  min_lr: 0.001876  loss: 3.2733 (3.2804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7475 (0.7561)  time: 0.4838  data: 0.0004  max mem: 42519
Epoch: [165]  [1600/2502]  eta: 0:07:14  lr: 0.001874  min_lr: 0.001874  loss: 3.3613 (3.2813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.7574)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [165]  [1800/2502]  eta: 0:05:37  lr: 0.001872  min_lr: 0.001872  loss: 3.3149 (3.2808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8023 (0.7594)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [165]  [2000/2502]  eta: 0:04:01  lr: 0.001870  min_lr: 0.001870  loss: 3.4431 (3.2873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7053 (0.7565)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [165]  [2200/2502]  eta: 0:02:25  lr: 0.001869  min_lr: 0.001869  loss: 3.4508 (3.2882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7562)  time: 0.4830  data: 0.0004  max mem: 42519
Epoch: [165]  [2400/2502]  eta: 0:00:49  lr: 0.001867  min_lr: 0.001867  loss: 3.3917 (3.2863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7231 (0.7538)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [165]  [2501/2502]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 3.3205 (3.2867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.7516)  time: 0.4375  data: 0.0007  max mem: 42519
Epoch: [165] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 3.3205 (3.2854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.7516)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.5550 (0.5550)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 2.8116  data: 2.5204  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8351 (0.8700)  acc1: 85.2000 (84.8727)  acc5: 98.0000 (97.2364)  time: 0.4850  data: 0.2420  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8750 (0.8878)  acc1: 82.4000 (83.7333)  acc5: 97.6000 (97.1619)  time: 0.2653  data: 0.0277  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9818 (0.9814)  acc1: 79.2000 (81.5097)  acc5: 94.0000 (95.8839)  time: 0.2580  data: 0.0208  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2177 (1.0434)  acc1: 74.4000 (79.5415)  acc5: 93.2000 (95.2488)  time: 0.2375  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2090 (1.0521)  acc1: 74.0000 (79.1280)  acc5: 94.0000 (95.2000)  time: 0.2372  data: 0.0001  max mem: 42519
Test: Total time: 0:00:15 (0.3019 s / it)
* Acc@1 79.568 Acc@5 95.300 loss 1.050
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.57%
Epoch: [166]  [   0/2502]  eta: 1:24:08  lr: 0.001866  min_lr: 0.001866  loss: 3.3989 (3.3989)  weight_decay: 0.0500 (0.0500)  time: 2.0179  data: 1.5291  max mem: 42519
Epoch: [166]  [ 200/2502]  eta: 0:18:42  lr: 0.001864  min_lr: 0.001864  loss: 3.3828 (3.3537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.7627)  time: 0.4808  data: 0.0005  max mem: 42519
Epoch: [166]  [ 400/2502]  eta: 0:16:58  lr: 0.001862  min_lr: 0.001862  loss: 3.1682 (3.3168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.7616)  time: 0.4783  data: 0.0006  max mem: 42519
Epoch: [166]  [ 600/2502]  eta: 0:15:18  lr: 0.001861  min_lr: 0.001861  loss: 3.1282 (3.2850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.7596)  time: 0.4789  data: 0.0006  max mem: 42519
Epoch: [166]  [ 800/2502]  eta: 0:13:41  lr: 0.001859  min_lr: 0.001859  loss: 3.3692 (3.2857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7526 (0.7648)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [166]  [1000/2502]  eta: 0:12:04  lr: 0.001857  min_lr: 0.001857  loss: 3.2858 (3.2863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7106 (0.7601)  time: 0.4879  data: 0.0004  max mem: 42519
Epoch: [166]  [1200/2502]  eta: 0:10:27  lr: 0.001855  min_lr: 0.001855  loss: 3.4689 (3.2898)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7437 (0.7591)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [166]  [1400/2502]  eta: 0:08:51  lr: 0.001853  min_lr: 0.001853  loss: 3.3353 (3.2920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.7578)  time: 0.4837  data: 0.0005  max mem: 42519
Epoch: [166]  [1600/2502]  eta: 0:07:14  lr: 0.001852  min_lr: 0.001852  loss: 3.3191 (3.2941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.7609)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [166]  [1800/2502]  eta: 0:05:38  lr: 0.001850  min_lr: 0.001850  loss: 3.3584 (3.2893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (0.7578)  time: 0.4800  data: 0.0006  max mem: 42519
Epoch: [166]  [2000/2502]  eta: 0:04:01  lr: 0.001848  min_lr: 0.001848  loss: 3.2214 (3.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (0.7597)  time: 0.4804  data: 0.0006  max mem: 42519
Epoch: [166]  [2200/2502]  eta: 0:02:25  lr: 0.001846  min_lr: 0.001846  loss: 2.9722 (3.2842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7160 (0.7597)  time: 0.4785  data: 0.0006  max mem: 42519
Epoch: [166]  [2400/2502]  eta: 0:00:49  lr: 0.001845  min_lr: 0.001845  loss: 3.1628 (3.2824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.7594)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [166]  [2501/2502]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 3.3258 (3.2800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.7579)  time: 0.4520  data: 0.0007  max mem: 42519
Epoch: [166] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 3.3258 (3.2900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.7579)
Test:  [ 0/50]  eta: 0:02:29  loss: 0.5091 (0.5091)  acc1: 94.0000 (94.0000)  acc5: 98.8000 (98.8000)  time: 2.9814  data: 2.6974  max mem: 42519
Test:  [10/50]  eta: 0:00:22  loss: 0.8560 (0.8915)  acc1: 84.4000 (84.1455)  acc5: 97.6000 (97.2000)  time: 0.5644  data: 0.3195  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8915 (0.9086)  acc1: 80.8000 (83.2952)  acc5: 97.6000 (97.0667)  time: 0.2939  data: 0.0549  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0469 (0.9981)  acc1: 78.8000 (81.7161)  acc5: 94.8000 (95.9871)  time: 0.2520  data: 0.0142  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2358 (1.0586)  acc1: 76.4000 (80.0098)  acc5: 92.8000 (95.2390)  time: 0.2418  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2358 (1.0741)  acc1: 75.6000 (79.4160)  acc5: 93.2000 (95.1120)  time: 0.2414  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3187 s / it)
* Acc@1 79.550 Acc@5 95.204 loss 1.074
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.57%
Epoch: [167]  [   0/2502]  eta: 1:38:13  lr: 0.001844  min_lr: 0.001844  loss: 3.6152 (3.6152)  weight_decay: 0.0500 (0.0500)  time: 2.3554  data: 1.8577  max mem: 42519
Epoch: [167]  [ 200/2502]  eta: 0:18:48  lr: 0.001842  min_lr: 0.001842  loss: 3.4984 (3.3107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7099 (0.7420)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [167]  [ 400/2502]  eta: 0:16:59  lr: 0.001840  min_lr: 0.001840  loss: 3.4983 (3.2969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.7471)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [167]  [ 600/2502]  eta: 0:15:19  lr: 0.001838  min_lr: 0.001838  loss: 3.4458 (3.2947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.7526)  time: 0.4777  data: 0.0005  max mem: 42519
Epoch: [167]  [ 800/2502]  eta: 0:13:41  lr: 0.001836  min_lr: 0.001836  loss: 3.4534 (3.2934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.7609)  time: 0.4817  data: 0.0005  max mem: 42519
Epoch: [167]  [1000/2502]  eta: 0:12:04  lr: 0.001835  min_lr: 0.001835  loss: 3.2789 (3.2887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (0.7614)  time: 0.4833  data: 0.0007  max mem: 42519
Epoch: [167]  [1200/2502]  eta: 0:10:28  lr: 0.001833  min_lr: 0.001833  loss: 3.4178 (3.2737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7528 (nan)  time: 0.4896  data: 0.0006  max mem: 42519
Epoch: [167]  [1400/2502]  eta: 0:08:51  lr: 0.001831  min_lr: 0.001831  loss: 3.3964 (3.2750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7829 (nan)  time: 0.4828  data: 0.0006  max mem: 42519
Epoch: [167]  [1600/2502]  eta: 0:07:15  lr: 0.001829  min_lr: 0.001829  loss: 3.3794 (3.2791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7260 (nan)  time: 0.4776  data: 0.0005  max mem: 42519
Epoch: [167]  [1800/2502]  eta: 0:05:38  lr: 0.001828  min_lr: 0.001828  loss: 3.2768 (3.2774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (nan)  time: 0.4792  data: 0.0006  max mem: 42519
Epoch: [167]  [2000/2502]  eta: 0:04:02  lr: 0.001826  min_lr: 0.001826  loss: 3.4651 (3.2876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7637 (nan)  time: 0.4816  data: 0.0007  max mem: 42519
Epoch: [167]  [2200/2502]  eta: 0:02:25  lr: 0.001824  min_lr: 0.001824  loss: 2.9870 (3.2863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (nan)  time: 0.4799  data: 0.0006  max mem: 42519
Epoch: [167]  [2400/2502]  eta: 0:00:49  lr: 0.001822  min_lr: 0.001822  loss: 3.5071 (3.2859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7685 (nan)  time: 0.4789  data: 0.0007  max mem: 42519
Epoch: [167]  [2501/2502]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 3.5882 (3.2869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (nan)  time: 0.4376  data: 0.0010  max mem: 42519
Epoch: [167] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 3.5882 (3.2753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (nan)
Test:  [ 0/50]  eta: 0:02:29  loss: 0.6123 (0.6123)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 2.9991  data: 2.6945  max mem: 42519
Test:  [10/50]  eta: 0:00:22  loss: 0.9314 (0.9299)  acc1: 84.8000 (85.0909)  acc5: 97.2000 (97.0909)  time: 0.5514  data: 0.3037  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9314 (0.9377)  acc1: 84.0000 (84.3429)  acc5: 97.2000 (97.2571)  time: 0.2806  data: 0.0396  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.1304 (1.0447)  acc1: 79.2000 (81.8839)  acc5: 95.6000 (96.0000)  time: 0.2461  data: 0.0076  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2976 (1.1148)  acc1: 75.2000 (80.0195)  acc5: 92.8000 (95.2293)  time: 0.2375  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2877 (1.1349)  acc1: 75.2000 (79.5040)  acc5: 93.6000 (95.1280)  time: 0.2374  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3121 s / it)
* Acc@1 79.678 Acc@5 95.274 loss 1.129
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.68%
Epoch: [168]  [   0/2502]  eta: 1:21:08  lr: 0.001821  min_lr: 0.001821  loss: 3.7339 (3.7339)  weight_decay: 0.0500 (0.0500)  time: 1.9458  data: 1.4604  max mem: 42519
Epoch: [168]  [ 200/2502]  eta: 0:18:45  lr: 0.001819  min_lr: 0.001819  loss: 3.3560 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.7478)  time: 0.4897  data: 0.0004  max mem: 42519
Epoch: [168]  [ 400/2502]  eta: 0:17:01  lr: 0.001818  min_lr: 0.001818  loss: 3.1216 (3.2533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7271 (0.7549)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [168]  [ 600/2502]  eta: 0:15:20  lr: 0.001816  min_lr: 0.001816  loss: 3.1934 (3.2555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7628 (0.7700)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [168]  [ 800/2502]  eta: 0:13:42  lr: 0.001814  min_lr: 0.001814  loss: 3.3985 (3.2617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7841 (0.7744)  time: 0.4815  data: 0.0005  max mem: 42519
Epoch: [168]  [1000/2502]  eta: 0:12:06  lr: 0.001812  min_lr: 0.001812  loss: 3.2543 (3.2591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.7696)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [168]  [1200/2502]  eta: 0:10:28  lr: 0.001811  min_lr: 0.001811  loss: 3.3575 (3.2610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7421 (0.7727)  time: 0.4801  data: 0.0006  max mem: 42519
Epoch: [168]  [1400/2502]  eta: 0:08:51  lr: 0.001809  min_lr: 0.001809  loss: 3.3113 (3.2587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7510 (0.7693)  time: 0.4923  data: 0.0004  max mem: 42519
Epoch: [168]  [1600/2502]  eta: 0:07:15  lr: 0.001807  min_lr: 0.001807  loss: 3.2255 (3.2607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6930 (0.7655)  time: 0.4773  data: 0.0005  max mem: 42519
Epoch: [168]  [1800/2502]  eta: 0:05:38  lr: 0.001805  min_lr: 0.001805  loss: 3.1315 (3.2628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7375 (0.7649)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [168]  [2000/2502]  eta: 0:04:01  lr: 0.001803  min_lr: 0.001803  loss: 3.3008 (3.2630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7552 (0.7639)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [168]  [2200/2502]  eta: 0:02:25  lr: 0.001802  min_lr: 0.001802  loss: 3.1808 (3.2627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.7653)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [168]  [2400/2502]  eta: 0:00:49  lr: 0.001800  min_lr: 0.001800  loss: 3.3377 (3.2623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7395 (nan)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [168]  [2501/2502]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 3.4696 (3.2617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7587 (nan)  time: 0.4462  data: 0.0007  max mem: 42519
Epoch: [168] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 3.4696 (3.2756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7587 (nan)
Test:  [ 0/50]  eta: 0:03:57  loss: 0.5469 (0.5469)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 4.7537  data: 4.4692  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.8537 (0.8543)  acc1: 86.4000 (85.1636)  acc5: 98.0000 (97.3818)  time: 0.6475  data: 0.4067  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8537 (0.8760)  acc1: 81.6000 (84.0762)  acc5: 97.2000 (97.3905)  time: 0.2373  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0150 (0.9702)  acc1: 80.0000 (82.1161)  acc5: 95.6000 (96.1677)  time: 0.2377  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2403 (1.0388)  acc1: 75.6000 (80.3317)  acc5: 93.6000 (95.4439)  time: 0.2375  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2252 (1.0458)  acc1: 75.2000 (79.8800)  acc5: 93.6000 (95.3360)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3296 s / it)
* Acc@1 80.026 Acc@5 95.418 loss 1.040
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.03%
Epoch: [169]  [   0/2502]  eta: 1:23:19  lr: 0.001799  min_lr: 0.001799  loss: 2.6489 (2.6489)  weight_decay: 0.0500 (0.0500)  time: 1.9982  data: 1.5149  max mem: 42519
Epoch: [169]  [ 200/2502]  eta: 0:18:42  lr: 0.001797  min_lr: 0.001797  loss: 2.8746 (3.2540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7327 (0.7386)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [169]  [ 400/2502]  eta: 0:16:58  lr: 0.001795  min_lr: 0.001795  loss: 3.4091 (3.2391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7163 (0.7539)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [169]  [ 600/2502]  eta: 0:15:20  lr: 0.001794  min_lr: 0.001794  loss: 3.5275 (3.2483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.7631)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [169]  [ 800/2502]  eta: 0:13:42  lr: 0.001792  min_lr: 0.001792  loss: 3.3768 (3.2693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8158 (0.7669)  time: 0.4815  data: 0.0005  max mem: 42519
Epoch: [169]  [1000/2502]  eta: 0:12:05  lr: 0.001790  min_lr: 0.001790  loss: 3.5327 (3.2670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7395 (0.7689)  time: 0.4790  data: 0.0006  max mem: 42519
Epoch: [169]  [1200/2502]  eta: 0:10:28  lr: 0.001788  min_lr: 0.001788  loss: 3.4279 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7410 (0.7674)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [169]  [1400/2502]  eta: 0:08:51  lr: 0.001786  min_lr: 0.001786  loss: 3.4505 (3.2603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7013 (0.7657)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [169]  [1600/2502]  eta: 0:07:14  lr: 0.001785  min_lr: 0.001785  loss: 3.5297 (3.2606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.7665)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [169]  [1800/2502]  eta: 0:05:38  lr: 0.001783  min_lr: 0.001783  loss: 3.3129 (3.2603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7475 (0.7712)  time: 0.4827  data: 0.0006  max mem: 42519
Epoch: [169]  [2000/2502]  eta: 0:04:01  lr: 0.001781  min_lr: 0.001781  loss: 3.3969 (3.2638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6861 (0.7668)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [169]  [2200/2502]  eta: 0:02:25  lr: 0.001779  min_lr: 0.001779  loss: 3.5126 (3.2640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7770 (0.7674)  time: 0.4807  data: 0.0006  max mem: 42519
Epoch: [169]  [2400/2502]  eta: 0:00:49  lr: 0.001778  min_lr: 0.001778  loss: 3.6050 (3.2655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7126 (0.7675)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [169]  [2501/2502]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.1243 (3.2658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7735 (0.7681)  time: 0.4372  data: 0.0009  max mem: 42519
Epoch: [169] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.1243 (3.2652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7735 (0.7681)
Test:  [ 0/50]  eta: 0:03:08  loss: 0.5552 (0.5552)  acc1: 93.6000 (93.6000)  acc5: 99.2000 (99.2000)  time: 3.7654  data: 3.4740  max mem: 42519
Test:  [10/50]  eta: 0:00:22  loss: 0.9158 (0.8654)  acc1: 84.8000 (84.5455)  acc5: 97.2000 (97.0909)  time: 0.5583  data: 0.3163  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9162 (0.9018)  acc1: 82.8000 (83.6381)  acc5: 97.2000 (97.1429)  time: 0.2378  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0275 (1.0025)  acc1: 80.4000 (81.8323)  acc5: 95.2000 (96.0258)  time: 0.2379  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2970 (1.0709)  acc1: 76.4000 (79.9707)  acc5: 92.8000 (95.2000)  time: 0.2377  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2260 (1.0816)  acc1: 75.6000 (79.4720)  acc5: 93.6000 (95.2480)  time: 0.2375  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3111 s / it)
* Acc@1 79.812 Acc@5 95.288 loss 1.072
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 80.03%
Epoch: [170]  [   0/2502]  eta: 1:36:24  lr: 0.001777  min_lr: 0.001777  loss: 3.9435 (3.9435)  weight_decay: 0.0500 (0.0500)  time: 2.3119  data: 1.8134  max mem: 42519
Epoch: [170]  [ 200/2502]  eta: 0:18:48  lr: 0.001775  min_lr: 0.001775  loss: 3.1230 (3.2482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (0.7590)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [170]  [ 400/2502]  eta: 0:17:01  lr: 0.001773  min_lr: 0.001773  loss: 3.3052 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7657 (0.7708)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [170]  [ 600/2502]  eta: 0:15:20  lr: 0.001771  min_lr: 0.001771  loss: 3.4909 (3.2590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.7634)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [170]  [ 800/2502]  eta: 0:13:41  lr: 0.001769  min_lr: 0.001769  loss: 3.5061 (3.2628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.7608)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [170]  [1000/2502]  eta: 0:12:05  lr: 0.001768  min_lr: 0.001768  loss: 3.2744 (3.2567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.7597)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [170]  [1200/2502]  eta: 0:10:28  lr: 0.001766  min_lr: 0.001766  loss: 3.6602 (3.2537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.7591)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [170]  [1400/2502]  eta: 0:08:52  lr: 0.001764  min_lr: 0.001764  loss: 3.3673 (3.2561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7697 (0.7591)  time: 0.4909  data: 0.0005  max mem: 42519
Epoch: [170]  [1600/2502]  eta: 0:07:15  lr: 0.001762  min_lr: 0.001762  loss: 3.1338 (3.2444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7713 (0.7612)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [170]  [1800/2502]  eta: 0:05:38  lr: 0.001761  min_lr: 0.001761  loss: 3.4559 (3.2476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.7614)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [170]  [2000/2502]  eta: 0:04:02  lr: 0.001759  min_lr: 0.001759  loss: 3.3736 (3.2528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7237 (0.7631)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [170]  [2200/2502]  eta: 0:02:25  lr: 0.001757  min_lr: 0.001757  loss: 3.2430 (3.2514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (0.7662)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [170]  [2400/2502]  eta: 0:00:49  lr: 0.001755  min_lr: 0.001755  loss: 3.4985 (3.2606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7448 (0.7662)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [170]  [2501/2502]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 3.2409 (3.2632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.7679)  time: 0.4460  data: 0.0013  max mem: 42519
Epoch: [170] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 3.2409 (3.2698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (0.7679)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.5161 (0.5161)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 2.7973  data: 2.5300  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8431 (0.8341)  acc1: 83.6000 (85.0909)  acc5: 98.0000 (97.2364)  time: 0.4852  data: 0.2419  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8603 (0.8635)  acc1: 82.8000 (84.0762)  acc5: 97.6000 (97.1238)  time: 0.2550  data: 0.0092  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0414 (0.9674)  acc1: 78.8000 (81.9871)  acc5: 94.8000 (95.8323)  time: 0.2528  data: 0.0088  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1883 (1.0273)  acc1: 75.2000 (80.4488)  acc5: 92.8000 (95.2293)  time: 0.2852  data: 0.0480  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1681 (1.0460)  acc1: 75.6000 (80.0240)  acc5: 93.6000 (95.1760)  time: 0.2818  data: 0.0447  max mem: 42519
Test: Total time: 0:00:15 (0.3172 s / it)
* Acc@1 79.904 Acc@5 95.292 loss 1.046
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.03%
Epoch: [171]  [   0/2502]  eta: 1:39:02  lr: 0.001754  min_lr: 0.001754  loss: 3.8960 (3.8960)  weight_decay: 0.0500 (0.0500)  time: 2.3751  data: 1.6293  max mem: 42519
Epoch: [171]  [ 200/2502]  eta: 0:18:49  lr: 0.001753  min_lr: 0.001753  loss: 2.8910 (3.2442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.7714)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [171]  [ 400/2502]  eta: 0:17:00  lr: 0.001751  min_lr: 0.001751  loss: 3.4032 (3.2377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.7676)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [171]  [ 600/2502]  eta: 0:15:21  lr: 0.001749  min_lr: 0.001749  loss: 3.5502 (3.2371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7850 (0.7702)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [171]  [ 800/2502]  eta: 0:13:43  lr: 0.001747  min_lr: 0.001747  loss: 2.9098 (3.2344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7497 (0.7783)  time: 0.4866  data: 0.0004  max mem: 42519
Epoch: [171]  [1000/2502]  eta: 0:12:05  lr: 0.001745  min_lr: 0.001745  loss: 3.1327 (3.2316)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7124 (0.7726)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [171]  [1200/2502]  eta: 0:10:28  lr: 0.001744  min_lr: 0.001744  loss: 3.2921 (3.2440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7484 (0.7786)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [171]  [1400/2502]  eta: 0:08:51  lr: 0.001742  min_lr: 0.001742  loss: 3.4070 (3.2402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6858 (0.7752)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [171]  [1600/2502]  eta: 0:07:14  lr: 0.001740  min_lr: 0.001740  loss: 3.4130 (3.2476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7975 (0.7818)  time: 0.4782  data: 0.0003  max mem: 42519
Epoch: [171]  [1800/2502]  eta: 0:05:38  lr: 0.001738  min_lr: 0.001738  loss: 3.1078 (3.2518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7695 (0.7823)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [171]  [2000/2502]  eta: 0:04:01  lr: 0.001737  min_lr: 0.001737  loss: 3.3536 (3.2498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7810 (0.7815)  time: 0.4820  data: 0.0005  max mem: 42519
Epoch: [171]  [2200/2502]  eta: 0:02:25  lr: 0.001735  min_lr: 0.001735  loss: 3.3672 (3.2540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7320 (0.7788)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [171]  [2400/2502]  eta: 0:00:49  lr: 0.001733  min_lr: 0.001733  loss: 3.5218 (3.2577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7636 (0.7806)  time: 0.4814  data: 0.0005  max mem: 42519
Epoch: [171]  [2501/2502]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 3.3405 (3.2607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8147 (0.7820)  time: 0.4439  data: 0.0010  max mem: 42519
Epoch: [171] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 3.3405 (3.2611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8147 (0.7820)
Test:  [ 0/50]  eta: 0:02:58  loss: 0.6131 (0.6131)  acc1: 92.8000 (92.8000)  acc5: 99.2000 (99.2000)  time: 3.5627  data: 3.2899  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.9869 (0.9545)  acc1: 83.6000 (84.9455)  acc5: 97.6000 (97.3818)  time: 0.5396  data: 0.2995  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9971 (0.9853)  acc1: 82.4000 (83.8476)  acc5: 97.6000 (97.4286)  time: 0.2564  data: 0.0170  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0987 (1.0636)  acc1: 80.8000 (82.1548)  acc5: 95.2000 (96.1806)  time: 0.2567  data: 0.0171  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.3171 (1.1297)  acc1: 74.8000 (80.1756)  acc5: 92.4000 (95.3951)  time: 0.2407  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2990 (1.1396)  acc1: 73.2000 (79.5120)  acc5: 92.8000 (95.3280)  time: 0.2407  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3157 s / it)
* Acc@1 79.734 Acc@5 95.322 loss 1.137
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 80.03%
Epoch: [172]  [   0/2502]  eta: 1:38:36  lr: 0.001732  min_lr: 0.001732  loss: 3.4339 (3.4339)  weight_decay: 0.0500 (0.0500)  time: 2.3649  data: 1.3660  max mem: 42519
Epoch: [172]  [ 200/2502]  eta: 0:18:45  lr: 0.001730  min_lr: 0.001730  loss: 3.2204 (3.2068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.7617)  time: 0.4787  data: 0.0003  max mem: 42519
Epoch: [172]  [ 400/2502]  eta: 0:17:01  lr: 0.001729  min_lr: 0.001729  loss: 3.3725 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7713 (0.7608)  time: 0.4870  data: 0.0006  max mem: 42519
Epoch: [172]  [ 600/2502]  eta: 0:15:20  lr: 0.001727  min_lr: 0.001727  loss: 3.3409 (3.2217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7731 (0.7773)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [172]  [ 800/2502]  eta: 0:13:42  lr: 0.001725  min_lr: 0.001725  loss: 3.3114 (3.2240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (0.7774)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [172]  [1000/2502]  eta: 0:12:05  lr: 0.001723  min_lr: 0.001723  loss: 3.1413 (3.2182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7065 (0.7749)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [172]  [1200/2502]  eta: 0:10:28  lr: 0.001721  min_lr: 0.001721  loss: 3.3953 (3.2187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7902 (0.7771)  time: 0.4818  data: 0.0004  max mem: 42519
Epoch: [172]  [1400/2502]  eta: 0:08:51  lr: 0.001720  min_lr: 0.001720  loss: 3.2906 (3.2307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7782 (0.7771)  time: 0.4922  data: 0.0005  max mem: 42519
Epoch: [172]  [1600/2502]  eta: 0:07:15  lr: 0.001718  min_lr: 0.001718  loss: 3.3564 (3.2361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.7786)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [172]  [1800/2502]  eta: 0:05:38  lr: 0.001716  min_lr: 0.001716  loss: 3.2958 (3.2391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7713 (0.7810)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [172]  [2000/2502]  eta: 0:04:02  lr: 0.001714  min_lr: 0.001714  loss: 3.3139 (3.2442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7199 (0.7797)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [172]  [2200/2502]  eta: 0:02:25  lr: 0.001713  min_lr: 0.001713  loss: 3.4194 (3.2492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (0.7837)  time: 0.4782  data: 0.0003  max mem: 42519
Epoch: [172]  [2400/2502]  eta: 0:00:49  lr: 0.001711  min_lr: 0.001711  loss: 3.5388 (3.2483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7924 (0.7826)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [172]  [2501/2502]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 3.5568 (3.2484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.7815)  time: 0.4459  data: 0.0007  max mem: 42519
Epoch: [172] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 3.5568 (3.2529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.7815)
Test:  [ 0/50]  eta: 0:04:13  loss: 0.6047 (0.6047)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 5.0702  data: 4.7836  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.9081 (0.9301)  acc1: 83.2000 (84.2545)  acc5: 97.6000 (97.2727)  time: 0.6767  data: 0.4353  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9420 (0.9511)  acc1: 82.4000 (83.4476)  acc5: 97.6000 (97.2571)  time: 0.2432  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0886 (1.0456)  acc1: 80.8000 (81.9484)  acc5: 95.2000 (96.1419)  time: 0.2523  data: 0.0071  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2944 (1.1180)  acc1: 75.6000 (80.1268)  acc5: 93.6000 (95.4634)  time: 0.2478  data: 0.0068  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.3214 (1.1304)  acc1: 74.4000 (79.5280)  acc5: 93.6000 (95.3840)  time: 0.2471  data: 0.0066  max mem: 42519
Test: Total time: 0:00:17 (0.3443 s / it)
* Acc@1 79.736 Acc@5 95.424 loss 1.124
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 80.03%
Epoch: [173]  [   0/2502]  eta: 1:41:44  lr: 0.001710  min_lr: 0.001710  loss: 3.0457 (3.0457)  weight_decay: 0.0500 (0.0500)  time: 2.4397  data: 1.8073  max mem: 42519
Epoch: [173]  [ 200/2502]  eta: 0:18:54  lr: 0.001708  min_lr: 0.001708  loss: 3.4022 (3.2089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (0.7805)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [173]  [ 400/2502]  eta: 0:17:01  lr: 0.001706  min_lr: 0.001706  loss: 3.1639 (3.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7758 (0.7659)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [173]  [ 600/2502]  eta: 0:15:22  lr: 0.001705  min_lr: 0.001705  loss: 3.3954 (3.2307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7492 (0.7716)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [173]  [ 800/2502]  eta: 0:13:43  lr: 0.001703  min_lr: 0.001703  loss: 3.1322 (3.2320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8000 (0.7759)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [173]  [1000/2502]  eta: 0:12:05  lr: 0.001701  min_lr: 0.001701  loss: 3.3341 (3.2263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6872 (0.7701)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [173]  [1200/2502]  eta: 0:10:28  lr: 0.001699  min_lr: 0.001699  loss: 3.4312 (3.2373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8085 (0.7729)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [173]  [1400/2502]  eta: 0:08:51  lr: 0.001697  min_lr: 0.001697  loss: 3.1987 (3.2356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7263 (0.7691)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [173]  [1600/2502]  eta: 0:07:14  lr: 0.001696  min_lr: 0.001696  loss: 3.2969 (3.2356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7689 (0.7701)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [173]  [1800/2502]  eta: 0:05:38  lr: 0.001694  min_lr: 0.001694  loss: 3.4286 (3.2381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.7704)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [173]  [2000/2502]  eta: 0:04:01  lr: 0.001692  min_lr: 0.001692  loss: 3.4930 (3.2436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7621 (nan)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [173]  [2200/2502]  eta: 0:02:25  lr: 0.001690  min_lr: 0.001690  loss: 3.5440 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (nan)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [173]  [2400/2502]  eta: 0:00:49  lr: 0.001689  min_lr: 0.001689  loss: 3.1945 (3.2451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (nan)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [173]  [2501/2502]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 3.3768 (3.2469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (nan)  time: 0.4361  data: 0.0007  max mem: 42519
Epoch: [173] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 3.3768 (3.2479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (nan)
Test:  [ 0/50]  eta: 0:03:42  loss: 0.6243 (0.6243)  acc1: 93.6000 (93.6000)  acc5: 98.8000 (98.8000)  time: 4.4533  data: 4.1848  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.9478 (0.9456)  acc1: 84.0000 (85.1273)  acc5: 98.0000 (97.2727)  time: 0.6369  data: 0.3974  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.9714 (0.9713)  acc1: 81.6000 (83.9048)  acc5: 97.6000 (97.1619)  time: 0.2483  data: 0.0096  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0875 (1.0586)  acc1: 79.2000 (81.7936)  acc5: 95.2000 (95.9613)  time: 0.2452  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.3023 (1.1239)  acc1: 77.2000 (80.1951)  acc5: 93.2000 (95.3659)  time: 0.2435  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.3023 (1.1397)  acc1: 76.4000 (79.6400)  acc5: 93.6000 (95.2560)  time: 0.2380  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3302 s / it)
* Acc@1 79.908 Acc@5 95.326 loss 1.136
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.03%
Epoch: [174]  [   0/2502]  eta: 1:40:18  lr: 0.001688  min_lr: 0.001688  loss: 3.7846 (3.7846)  weight_decay: 0.0500 (0.0500)  time: 2.4053  data: 1.9183  max mem: 42519
Epoch: [174]  [ 200/2502]  eta: 0:18:48  lr: 0.001686  min_lr: 0.001686  loss: 3.1929 (3.2689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7382 (0.7886)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [174]  [ 400/2502]  eta: 0:17:04  lr: 0.001684  min_lr: 0.001684  loss: 3.1636 (3.2416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.7961)  time: 0.4815  data: 0.0004  max mem: 42519
Epoch: [174]  [ 600/2502]  eta: 0:15:22  lr: 0.001682  min_lr: 0.001682  loss: 3.4389 (3.2352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7211 (0.7849)  time: 0.4806  data: 0.0006  max mem: 42519
Epoch: [174]  [ 800/2502]  eta: 0:13:43  lr: 0.001681  min_lr: 0.001681  loss: 3.4817 (3.2400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7184 (0.7899)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [174]  [1000/2502]  eta: 0:12:06  lr: 0.001679  min_lr: 0.001679  loss: 3.1902 (3.2492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.7916)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [174]  [1200/2502]  eta: 0:10:28  lr: 0.001677  min_lr: 0.001677  loss: 3.1378 (3.2615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7101 (0.7847)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [174]  [1400/2502]  eta: 0:08:51  lr: 0.001675  min_lr: 0.001675  loss: 3.4891 (3.2600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7839 (0.7869)  time: 0.4935  data: 0.0004  max mem: 42519
Epoch: [174]  [1600/2502]  eta: 0:07:15  lr: 0.001674  min_lr: 0.001674  loss: 3.2427 (3.2653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.7871)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [174]  [1800/2502]  eta: 0:05:38  lr: 0.001672  min_lr: 0.001672  loss: 3.5701 (3.2692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7701 (0.7872)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [174]  [2000/2502]  eta: 0:04:02  lr: 0.001670  min_lr: 0.001670  loss: 3.1368 (3.2647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.7880)  time: 0.4983  data: 0.0004  max mem: 42519
Epoch: [174]  [2200/2502]  eta: 0:02:25  lr: 0.001668  min_lr: 0.001668  loss: 3.3973 (3.2652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7934 (0.7881)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [174]  [2400/2502]  eta: 0:00:49  lr: 0.001666  min_lr: 0.001666  loss: 3.4798 (3.2672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (0.7858)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [174]  [2501/2502]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 3.2999 (3.2656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8355 (0.7880)  time: 0.4350  data: 0.0008  max mem: 42519
Epoch: [174] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 3.2999 (3.2566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8355 (0.7880)
Test:  [ 0/50]  eta: 0:04:46  loss: 0.5684 (0.5684)  acc1: 92.0000 (92.0000)  acc5: 98.8000 (98.8000)  time: 5.7221  data: 5.4529  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.8506 (0.8324)  acc1: 84.4000 (85.1636)  acc5: 97.6000 (97.4909)  time: 0.7359  data: 0.4961  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8506 (0.8679)  acc1: 83.6000 (84.0381)  acc5: 97.6000 (97.2571)  time: 0.2375  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0111 (0.9663)  acc1: 80.4000 (82.2323)  acc5: 95.6000 (96.1806)  time: 0.2461  data: 0.0028  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2001 (1.0241)  acc1: 76.8000 (80.6146)  acc5: 93.2000 (95.6683)  time: 0.2459  data: 0.0027  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1796 (1.0481)  acc1: 76.0000 (80.0080)  acc5: 94.0000 (95.5520)  time: 0.2426  data: 0.0024  max mem: 42519
Test: Total time: 0:00:17 (0.3536 s / it)
* Acc@1 80.170 Acc@5 95.422 loss 1.049
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.17%
Epoch: [175]  [   0/2502]  eta: 1:24:51  lr: 0.001666  min_lr: 0.001666  loss: 3.5198 (3.5198)  weight_decay: 0.0500 (0.0500)  time: 2.0349  data: 1.5453  max mem: 42519
Epoch: [175]  [ 200/2502]  eta: 0:18:45  lr: 0.001664  min_lr: 0.001664  loss: 3.0253 (3.2126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7175 (0.7960)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [175]  [ 400/2502]  eta: 0:16:58  lr: 0.001662  min_lr: 0.001662  loss: 3.3015 (3.2340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7770 (0.8048)  time: 0.4898  data: 0.0004  max mem: 42519
Epoch: [175]  [ 600/2502]  eta: 0:15:18  lr: 0.001660  min_lr: 0.001660  loss: 3.3938 (3.2391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (0.8113)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [175]  [ 800/2502]  eta: 0:13:41  lr: 0.001658  min_lr: 0.001658  loss: 3.4590 (3.2412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8380 (0.8072)  time: 0.4839  data: 0.0005  max mem: 42519
Epoch: [175]  [1000/2502]  eta: 0:12:05  lr: 0.001657  min_lr: 0.001657  loss: 3.4166 (3.2432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7502 (0.8017)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [175]  [1200/2502]  eta: 0:10:28  lr: 0.001655  min_lr: 0.001655  loss: 3.1567 (3.2501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7711 (0.8109)  time: 0.4870  data: 0.0005  max mem: 42519
Epoch: [175]  [1400/2502]  eta: 0:08:51  lr: 0.001653  min_lr: 0.001653  loss: 3.3468 (3.2514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.8070)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [175]  [1600/2502]  eta: 0:07:15  lr: 0.001651  min_lr: 0.001651  loss: 3.2024 (3.2442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7800 (0.8009)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [175]  [1800/2502]  eta: 0:05:38  lr: 0.001650  min_lr: 0.001650  loss: 3.2747 (3.2401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7998)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [175]  [2000/2502]  eta: 0:04:02  lr: 0.001648  min_lr: 0.001648  loss: 3.3812 (3.2437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8284 (0.8008)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [175]  [2200/2502]  eta: 0:02:25  lr: 0.001646  min_lr: 0.001646  loss: 3.6522 (3.2422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7854 (0.8006)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [175]  [2400/2502]  eta: 0:00:49  lr: 0.001644  min_lr: 0.001644  loss: 3.2458 (3.2463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8219 (0.8036)  time: 0.4816  data: 0.0005  max mem: 42519
Epoch: [175]  [2501/2502]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 3.2419 (3.2476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7422 (0.8036)  time: 0.4373  data: 0.0009  max mem: 42519
Epoch: [175] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 3.2419 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7422 (0.8036)
Test:  [ 0/50]  eta: 0:03:48  loss: 0.5723 (0.5723)  acc1: 93.6000 (93.6000)  acc5: 99.6000 (99.6000)  time: 4.5736  data: 4.2971  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.9367 (0.9232)  acc1: 84.0000 (85.0909)  acc5: 97.6000 (97.1273)  time: 0.6313  data: 0.3910  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.9367 (0.9441)  acc1: 82.8000 (84.0381)  acc5: 97.6000 (97.2381)  time: 0.2398  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0628 (1.0358)  acc1: 79.6000 (82.0516)  acc5: 94.4000 (95.9742)  time: 0.2415  data: 0.0007  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2203 (1.0922)  acc1: 76.8000 (80.4488)  acc5: 93.2000 (95.4342)  time: 0.2390  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2203 (1.1072)  acc1: 76.4000 (79.9680)  acc5: 93.6000 (95.3520)  time: 0.2375  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3278 s / it)
* Acc@1 80.054 Acc@5 95.560 loss 1.107
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.17%
Epoch: [176]  [   0/2502]  eta: 1:34:14  lr: 0.001643  min_lr: 0.001643  loss: 2.4224 (2.4224)  weight_decay: 0.0500 (0.0500)  time: 2.2602  data: 1.4750  max mem: 42519
Epoch: [176]  [ 200/2502]  eta: 0:18:49  lr: 0.001642  min_lr: 0.001642  loss: 3.3704 (3.2585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7744 (0.7772)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [176]  [ 400/2502]  eta: 0:17:04  lr: 0.001640  min_lr: 0.001640  loss: 2.8420 (3.2250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8071 (0.7878)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [176]  [ 600/2502]  eta: 0:15:22  lr: 0.001638  min_lr: 0.001638  loss: 3.5714 (3.2320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7723 (0.7841)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [176]  [ 800/2502]  eta: 0:13:44  lr: 0.001636  min_lr: 0.001636  loss: 3.4288 (3.2278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7402 (0.7831)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [176]  [1000/2502]  eta: 0:12:06  lr: 0.001635  min_lr: 0.001635  loss: 3.3698 (3.2181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7892 (0.7843)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [176]  [1200/2502]  eta: 0:10:29  lr: 0.001633  min_lr: 0.001633  loss: 3.1940 (3.2138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8335 (0.7863)  time: 0.4914  data: 0.0004  max mem: 42519
Epoch: [176]  [1400/2502]  eta: 0:08:52  lr: 0.001631  min_lr: 0.001631  loss: 3.2494 (3.2160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7548 (0.7847)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [176]  [1600/2502]  eta: 0:07:15  lr: 0.001629  min_lr: 0.001629  loss: 3.4802 (3.2181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (0.7833)  time: 0.4817  data: 0.0004  max mem: 42519
Epoch: [176]  [1800/2502]  eta: 0:05:38  lr: 0.001628  min_lr: 0.001628  loss: 3.1208 (3.2192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (0.7864)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [176]  [2000/2502]  eta: 0:04:02  lr: 0.001626  min_lr: 0.001626  loss: 3.4073 (3.2228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8020 (0.7877)  time: 0.4859  data: 0.0004  max mem: 42519
Epoch: [176]  [2200/2502]  eta: 0:02:25  lr: 0.001624  min_lr: 0.001624  loss: 3.0914 (3.2218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7539 (0.7884)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [176]  [2400/2502]  eta: 0:00:49  lr: 0.001622  min_lr: 0.001622  loss: 3.4157 (3.2249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7918 (0.7903)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [176]  [2501/2502]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 3.2119 (3.2239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7633 (0.7904)  time: 0.4323  data: 0.0008  max mem: 42519
Epoch: [176] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 3.2119 (3.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7633 (0.7904)
Test:  [ 0/50]  eta: 0:04:12  loss: 0.4843 (0.4843)  acc1: 94.0000 (94.0000)  acc5: 99.2000 (99.2000)  time: 5.0406  data: 4.7511  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 0.8821 (0.8476)  acc1: 86.0000 (84.8364)  acc5: 96.8000 (96.8727)  time: 0.6739  data: 0.4323  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8821 (0.8633)  acc1: 82.8000 (83.7714)  acc5: 97.6000 (97.2381)  time: 0.2383  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9476 (0.9561)  acc1: 80.0000 (81.8968)  acc5: 96.0000 (96.0129)  time: 0.2390  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2131 (1.0197)  acc1: 76.4000 (80.1854)  acc5: 93.2000 (95.3951)  time: 0.2380  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1910 (1.0307)  acc1: 76.0000 (79.6880)  acc5: 94.0000 (95.3520)  time: 0.2374  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3363 s / it)
* Acc@1 79.898 Acc@5 95.442 loss 1.031
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.17%
Epoch: [177]  [   0/2502]  eta: 1:40:52  lr: 0.001621  min_lr: 0.001621  loss: 3.7575 (3.7575)  weight_decay: 0.0500 (0.0500)  time: 2.4189  data: 1.3507  max mem: 42519
Epoch: [177]  [ 200/2502]  eta: 0:18:56  lr: 0.001620  min_lr: 0.001620  loss: 3.4109 (3.2434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7322 (0.7806)  time: 0.4826  data: 0.0005  max mem: 42519
Epoch: [177]  [ 400/2502]  eta: 0:17:05  lr: 0.001618  min_lr: 0.001618  loss: 3.3824 (3.2191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7772 (0.7907)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [177]  [ 600/2502]  eta: 0:15:23  lr: 0.001616  min_lr: 0.001616  loss: 3.3977 (3.2098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8054)  time: 0.4863  data: 0.0006  max mem: 42519
Epoch: [177]  [ 800/2502]  eta: 0:13:44  lr: 0.001614  min_lr: 0.001614  loss: 3.1963 (3.2284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6974 (0.7902)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [177]  [1000/2502]  eta: 0:12:06  lr: 0.001613  min_lr: 0.001613  loss: 3.1312 (3.2217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8195 (0.7987)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [177]  [1200/2502]  eta: 0:10:28  lr: 0.001611  min_lr: 0.001611  loss: 3.2231 (3.2186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.7986)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [177]  [1400/2502]  eta: 0:08:51  lr: 0.001609  min_lr: 0.001609  loss: 3.4889 (3.2184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8076 (0.8018)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [177]  [1600/2502]  eta: 0:07:14  lr: 0.001607  min_lr: 0.001607  loss: 3.3480 (3.2200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8005)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [177]  [1800/2502]  eta: 0:05:38  lr: 0.001606  min_lr: 0.001606  loss: 3.1409 (3.2210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8568 (0.8038)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [177]  [2000/2502]  eta: 0:04:01  lr: 0.001604  min_lr: 0.001604  loss: 3.2691 (3.2233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7744 (0.8033)  time: 0.4886  data: 0.0004  max mem: 42519
Epoch: [177]  [2200/2502]  eta: 0:02:25  lr: 0.001602  min_lr: 0.001602  loss: 3.4261 (3.2295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8011)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [177]  [2400/2502]  eta: 0:00:49  lr: 0.001600  min_lr: 0.001600  loss: 3.1642 (3.2290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7623 (0.8028)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [177]  [2501/2502]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 3.5447 (3.2306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.8026)  time: 0.4346  data: 0.0009  max mem: 42519
Epoch: [177] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 3.5447 (3.2337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.8026)
Test:  [ 0/50]  eta: 0:04:28  loss: 0.5658 (0.5658)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.3755  data: 5.0845  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8946 (0.9035)  acc1: 85.2000 (85.0909)  acc5: 97.2000 (97.0909)  time: 0.7044  data: 0.4627  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9451 (0.9351)  acc1: 82.4000 (84.0381)  acc5: 97.2000 (97.3333)  time: 0.2406  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0585 (1.0182)  acc1: 79.6000 (82.1548)  acc5: 95.6000 (96.2452)  time: 0.2421  data: 0.0007  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2457 (1.0829)  acc1: 75.6000 (80.3707)  acc5: 93.2000 (95.6098)  time: 0.2389  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2457 (1.0969)  acc1: 75.2000 (79.7440)  acc5: 93.6000 (95.5680)  time: 0.2374  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3443 s / it)
* Acc@1 80.022 Acc@5 95.438 loss 1.095
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.17%
Epoch: [178]  [   0/2502]  eta: 1:39:41  lr: 0.001599  min_lr: 0.001599  loss: 2.5577 (2.5577)  weight_decay: 0.0500 (0.0500)  time: 2.3907  data: 1.3742  max mem: 42519
Epoch: [178]  [ 200/2502]  eta: 0:18:45  lr: 0.001598  min_lr: 0.001598  loss: 3.2989 (3.2118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7797 (0.7931)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [178]  [ 400/2502]  eta: 0:17:02  lr: 0.001596  min_lr: 0.001596  loss: 3.4413 (3.2307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7767 (0.7934)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [178]  [ 600/2502]  eta: 0:15:21  lr: 0.001594  min_lr: 0.001594  loss: 3.1839 (3.2291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.8128)  time: 0.4797  data: 0.0006  max mem: 42519
Epoch: [178]  [ 800/2502]  eta: 0:13:41  lr: 0.001592  min_lr: 0.001592  loss: 3.1550 (3.2204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7728 (0.8104)  time: 0.4778  data: 0.0006  max mem: 42519
Epoch: [178]  [1000/2502]  eta: 0:12:04  lr: 0.001591  min_lr: 0.001591  loss: 3.3260 (3.2142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8205 (0.8022)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [178]  [1200/2502]  eta: 0:10:27  lr: 0.001589  min_lr: 0.001589  loss: 3.4661 (3.2282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7914 (0.8036)  time: 0.4776  data: 0.0005  max mem: 42519
Epoch: [178]  [1400/2502]  eta: 0:08:50  lr: 0.001587  min_lr: 0.001587  loss: 3.2172 (3.2265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8073 (0.8017)  time: 0.4782  data: 0.0005  max mem: 42519
Epoch: [178]  [1600/2502]  eta: 0:07:14  lr: 0.001585  min_lr: 0.001585  loss: 3.3139 (3.2244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7627 (0.7999)  time: 0.4787  data: 0.0006  max mem: 42519
Epoch: [178]  [1800/2502]  eta: 0:05:38  lr: 0.001584  min_lr: 0.001584  loss: 3.3649 (3.2164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.7957)  time: 0.4808  data: 0.0006  max mem: 42519
Epoch: [178]  [2000/2502]  eta: 0:04:01  lr: 0.001582  min_lr: 0.001582  loss: 3.4679 (3.2170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8487 (0.7976)  time: 0.4793  data: 0.0006  max mem: 42519
Epoch: [178]  [2200/2502]  eta: 0:02:25  lr: 0.001580  min_lr: 0.001580  loss: 3.5579 (3.2194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7710 (0.7994)  time: 0.4821  data: 0.0005  max mem: 42519
Epoch: [178]  [2400/2502]  eta: 0:00:49  lr: 0.001578  min_lr: 0.001578  loss: 3.2109 (3.2200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8075 (0.8005)  time: 0.4819  data: 0.0004  max mem: 42519
Epoch: [178]  [2501/2502]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 3.5140 (3.2209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.8000)  time: 0.4321  data: 0.0007  max mem: 42519
Epoch: [178] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 3.5140 (3.2258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.8000)
Test:  [ 0/50]  eta: 0:04:27  loss: 0.6306 (0.6306)  acc1: 92.8000 (92.8000)  acc5: 99.2000 (99.2000)  time: 5.3409  data: 5.0044  max mem: 42519
Test:  [10/50]  eta: 0:00:31  loss: 0.9006 (0.9945)  acc1: 85.2000 (84.8364)  acc5: 97.2000 (97.2727)  time: 0.7806  data: 0.5343  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 1.0372 (1.0016)  acc1: 83.6000 (84.0381)  acc5: 97.2000 (97.2952)  time: 0.2824  data: 0.0460  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.1643 (1.0956)  acc1: 78.4000 (82.1161)  acc5: 94.4000 (96.1032)  time: 0.2394  data: 0.0026  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.3459 (1.1533)  acc1: 76.4000 (80.3902)  acc5: 93.6000 (95.4732)  time: 0.2373  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.3459 (1.1665)  acc1: 76.0000 (79.8080)  acc5: 94.0000 (95.4720)  time: 0.2359  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3595 s / it)
* Acc@1 79.904 Acc@5 95.388 loss 1.167
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.17%
Epoch: [179]  [   0/2502]  eta: 1:28:56  lr: 0.001577  min_lr: 0.001577  loss: 3.6251 (3.6251)  weight_decay: 0.0500 (0.0500)  time: 2.1328  data: 1.2979  max mem: 42519
Epoch: [179]  [ 200/2502]  eta: 0:18:51  lr: 0.001576  min_lr: 0.001576  loss: 2.9838 (3.1664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7591 (0.7732)  time: 0.4825  data: 0.0006  max mem: 42519
Epoch: [179]  [ 400/2502]  eta: 0:17:02  lr: 0.001574  min_lr: 0.001574  loss: 2.8711 (3.1893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (0.7998)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [179]  [ 600/2502]  eta: 0:15:21  lr: 0.001572  min_lr: 0.001572  loss: 3.3343 (3.1973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8206 (0.8060)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [179]  [ 800/2502]  eta: 0:13:43  lr: 0.001570  min_lr: 0.001570  loss: 3.3342 (3.2162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8557 (0.8154)  time: 0.4886  data: 0.0004  max mem: 42519
Epoch: [179]  [1000/2502]  eta: 0:12:05  lr: 0.001569  min_lr: 0.001569  loss: 3.2509 (3.2189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.8116)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [179]  [1200/2502]  eta: 0:10:28  lr: 0.001567  min_lr: 0.001567  loss: 3.4664 (3.2195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8113 (0.8106)  time: 0.4907  data: 0.0005  max mem: 42519
Epoch: [179]  [1400/2502]  eta: 0:08:51  lr: 0.001565  min_lr: 0.001565  loss: 3.1318 (3.2162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (0.8121)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [179]  [1600/2502]  eta: 0:07:14  lr: 0.001563  min_lr: 0.001563  loss: 3.4569 (3.2215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.8135)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [179]  [1800/2502]  eta: 0:05:38  lr: 0.001562  min_lr: 0.001562  loss: 3.2184 (3.2289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7617 (0.8115)  time: 0.4851  data: 0.0005  max mem: 42519
Epoch: [179]  [2000/2502]  eta: 0:04:02  lr: 0.001560  min_lr: 0.001560  loss: 3.3782 (3.2271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.8101)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [179]  [2200/2502]  eta: 0:02:25  lr: 0.001558  min_lr: 0.001558  loss: 3.4389 (3.2269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7679 (0.8133)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [179]  [2400/2502]  eta: 0:00:49  lr: 0.001556  min_lr: 0.001556  loss: 3.4164 (3.2267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7722 (0.8151)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [179]  [2501/2502]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 3.1794 (3.2279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8675 (0.8172)  time: 0.4330  data: 0.0007  max mem: 42519
Epoch: [179] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 3.1794 (3.2280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8675 (0.8172)
Test:  [ 0/50]  eta: 0:02:44  loss: 0.5752 (0.5752)  acc1: 94.0000 (94.0000)  acc5: 99.2000 (99.2000)  time: 3.2920  data: 3.0145  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.8889 (0.9089)  acc1: 84.4000 (85.1273)  acc5: 97.6000 (97.4545)  time: 0.6450  data: 0.3943  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.9183 (0.9313)  acc1: 82.8000 (84.1714)  acc5: 97.6000 (97.6000)  time: 0.3724  data: 0.1288  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0621 (1.0193)  acc1: 80.4000 (82.2968)  acc5: 96.0000 (96.4258)  time: 0.3268  data: 0.0885  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2110 (1.0770)  acc1: 75.6000 (80.5951)  acc5: 93.2000 (95.7463)  time: 0.2632  data: 0.0259  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1929 (1.0878)  acc1: 75.6000 (80.1280)  acc5: 94.0000 (95.6480)  time: 0.2472  data: 0.0099  max mem: 42519
Test: Total time: 0:00:18 (0.3651 s / it)
* Acc@1 80.238 Acc@5 95.528 loss 1.091
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.24%
Epoch: [180]  [   0/2502]  eta: 1:19:24  lr: 0.001556  min_lr: 0.001556  loss: 3.1657 (3.1657)  weight_decay: 0.0500 (0.0500)  time: 1.9044  data: 1.4199  max mem: 42519
Epoch: [180]  [ 200/2502]  eta: 0:18:39  lr: 0.001554  min_lr: 0.001554  loss: 3.2712 (3.2480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7661 (0.8091)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [180]  [ 400/2502]  eta: 0:16:58  lr: 0.001552  min_lr: 0.001552  loss: 3.5491 (3.2264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.8099)  time: 0.4870  data: 0.0004  max mem: 42519
Epoch: [180]  [ 600/2502]  eta: 0:15:19  lr: 0.001550  min_lr: 0.001550  loss: 3.3999 (3.2103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7992 (0.8119)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [180]  [ 800/2502]  eta: 0:13:41  lr: 0.001549  min_lr: 0.001549  loss: 3.2535 (3.2089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7661 (nan)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [180]  [1000/2502]  eta: 0:12:05  lr: 0.001547  min_lr: 0.001547  loss: 3.3851 (3.2055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (nan)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [180]  [1200/2502]  eta: 0:10:28  lr: 0.001545  min_lr: 0.001545  loss: 3.3262 (3.2134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7531 (nan)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [180]  [1400/2502]  eta: 0:08:51  lr: 0.001543  min_lr: 0.001543  loss: 3.3599 (3.2184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (nan)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [180]  [1600/2502]  eta: 0:07:15  lr: 0.001542  min_lr: 0.001542  loss: 3.3050 (3.2172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7968 (nan)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [180]  [1800/2502]  eta: 0:05:38  lr: 0.001540  min_lr: 0.001540  loss: 3.4672 (3.2171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (nan)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [180]  [2000/2502]  eta: 0:04:01  lr: 0.001538  min_lr: 0.001538  loss: 3.2782 (3.2221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (nan)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [180]  [2200/2502]  eta: 0:02:25  lr: 0.001536  min_lr: 0.001536  loss: 2.9855 (3.2226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8196 (nan)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [180]  [2400/2502]  eta: 0:00:49  lr: 0.001535  min_lr: 0.001535  loss: 3.3052 (3.2256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7905 (nan)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [180]  [2501/2502]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 2.8679 (3.2256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (nan)  time: 0.4321  data: 0.0008  max mem: 42519
Epoch: [180] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 2.8679 (3.2221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (nan)
Test:  [ 0/50]  eta: 0:04:23  loss: 0.5537 (0.5537)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 5.2644  data: 4.9444  max mem: 42519
Test:  [10/50]  eta: 0:00:31  loss: 0.9231 (0.8671)  acc1: 83.2000 (85.1636)  acc5: 98.0000 (97.5273)  time: 0.7824  data: 0.5315  max mem: 42519
Test:  [20/50]  eta: 0:00:16  loss: 0.9175 (0.8850)  acc1: 82.8000 (84.0571)  acc5: 97.6000 (97.3524)  time: 0.3197  data: 0.0787  max mem: 42519
Test:  [30/50]  eta: 0:00:09  loss: 1.0005 (0.9755)  acc1: 79.2000 (82.1290)  acc5: 95.2000 (96.1548)  time: 0.2718  data: 0.0339  max mem: 42519
Test:  [40/50]  eta: 0:00:04  loss: 1.2412 (1.0374)  acc1: 76.8000 (80.3805)  acc5: 93.6000 (95.4829)  time: 0.2387  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1956 (1.0499)  acc1: 75.6000 (79.9520)  acc5: 93.6000 (95.3520)  time: 0.2398  data: 0.0002  max mem: 42519
Test: Total time: 0:00:18 (0.3738 s / it)
* Acc@1 80.256 Acc@5 95.530 loss 1.046
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.26%
Epoch: [181]  [   0/2502]  eta: 1:19:31  lr: 0.001534  min_lr: 0.001534  loss: 3.7727 (3.7727)  weight_decay: 0.0500 (0.0500)  time: 1.9069  data: 1.4287  max mem: 42519
Epoch: [181]  [ 200/2502]  eta: 0:18:47  lr: 0.001532  min_lr: 0.001532  loss: 3.5749 (3.2367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8417 (0.8076)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [181]  [ 400/2502]  eta: 0:17:00  lr: 0.001530  min_lr: 0.001530  loss: 3.2453 (3.2595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8174 (0.8059)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [181]  [ 600/2502]  eta: 0:15:20  lr: 0.001528  min_lr: 0.001528  loss: 2.8686 (3.2414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8007 (0.8116)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [181]  [ 800/2502]  eta: 0:13:43  lr: 0.001527  min_lr: 0.001527  loss: 3.4476 (3.2417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8058 (0.8207)  time: 0.4814  data: 0.0005  max mem: 42519
Epoch: [181]  [1000/2502]  eta: 0:12:05  lr: 0.001525  min_lr: 0.001525  loss: 3.1321 (3.2445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.8168)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [181]  [1200/2502]  eta: 0:10:28  lr: 0.001523  min_lr: 0.001523  loss: 3.0799 (3.2349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7809 (0.8136)  time: 0.4861  data: 0.0005  max mem: 42519
Epoch: [181]  [1400/2502]  eta: 0:08:51  lr: 0.001522  min_lr: 0.001522  loss: 3.4592 (3.2303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8146 (0.8151)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [181]  [1600/2502]  eta: 0:07:14  lr: 0.001520  min_lr: 0.001520  loss: 3.1298 (3.2328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (0.8173)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [181]  [1800/2502]  eta: 0:05:38  lr: 0.001518  min_lr: 0.001518  loss: 3.4069 (3.2361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8196 (0.8171)  time: 0.4876  data: 0.0005  max mem: 42519
Epoch: [181]  [2000/2502]  eta: 0:04:02  lr: 0.001516  min_lr: 0.001516  loss: 3.1368 (3.2367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7861 (0.8177)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [181]  [2200/2502]  eta: 0:02:25  lr: 0.001515  min_lr: 0.001515  loss: 3.3840 (3.2324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8188 (0.8179)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [181]  [2400/2502]  eta: 0:00:49  lr: 0.001513  min_lr: 0.001513  loss: 3.3921 (3.2339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7518 (0.8171)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [181]  [2501/2502]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 3.3144 (3.2363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (0.8168)  time: 0.4330  data: 0.0006  max mem: 42519
Epoch: [181] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 3.3144 (3.2190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (0.8168)
Test:  [ 0/50]  eta: 0:04:53  loss: 0.5798 (0.5798)  acc1: 94.0000 (94.0000)  acc5: 98.8000 (98.8000)  time: 5.8692  data: 5.5715  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.8650 (0.8554)  acc1: 87.6000 (85.9273)  acc5: 98.4000 (97.6727)  time: 0.7480  data: 0.5070  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8919 (0.8993)  acc1: 84.4000 (84.3619)  acc5: 97.6000 (97.5619)  time: 0.2361  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0481 (0.9779)  acc1: 80.0000 (82.6581)  acc5: 95.2000 (96.4516)  time: 0.2370  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1931 (1.0388)  acc1: 76.8000 (80.8390)  acc5: 93.6000 (95.7659)  time: 0.2376  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1931 (1.0511)  acc1: 76.0000 (80.2160)  acc5: 94.4000 (95.7200)  time: 0.2367  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3512 s / it)
* Acc@1 80.260 Acc@5 95.690 loss 1.049
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.26%
Epoch: [182]  [   0/2502]  eta: 1:19:36  lr: 0.001512  min_lr: 0.001512  loss: 2.5414 (2.5414)  weight_decay: 0.0500 (0.0500)  time: 1.9093  data: 1.4242  max mem: 42519
Epoch: [182]  [ 200/2502]  eta: 0:18:41  lr: 0.001510  min_lr: 0.001510  loss: 3.3270 (3.1535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8732 (0.8479)  time: 0.4825  data: 0.0004  max mem: 42519
Epoch: [182]  [ 400/2502]  eta: 0:16:59  lr: 0.001508  min_lr: 0.001508  loss: 3.4423 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.8469)  time: 0.4897  data: 0.0004  max mem: 42519
Epoch: [182]  [ 600/2502]  eta: 0:15:20  lr: 0.001507  min_lr: 0.001507  loss: 3.0919 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8320 (0.8397)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [182]  [ 800/2502]  eta: 0:13:41  lr: 0.001505  min_lr: 0.001505  loss: 3.0558 (3.2160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8355 (0.8419)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [182]  [1000/2502]  eta: 0:12:05  lr: 0.001503  min_lr: 0.001503  loss: 3.0906 (3.2287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8987 (0.8484)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [182]  [1200/2502]  eta: 0:10:27  lr: 0.001502  min_lr: 0.001502  loss: 3.3811 (3.2274)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8330 (0.8505)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [182]  [1400/2502]  eta: 0:08:51  lr: 0.001500  min_lr: 0.001500  loss: 3.2536 (3.2139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7989 (0.8465)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [182]  [1600/2502]  eta: 0:07:14  lr: 0.001498  min_lr: 0.001498  loss: 3.3406 (3.2169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8126 (0.8442)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [182]  [1800/2502]  eta: 0:05:38  lr: 0.001496  min_lr: 0.001496  loss: 3.1929 (3.2131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.8437)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [182]  [2000/2502]  eta: 0:04:01  lr: 0.001495  min_lr: 0.001495  loss: 3.2470 (3.2177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8471 (0.8453)  time: 0.4776  data: 0.0004  max mem: 42519
Epoch: [182]  [2200/2502]  eta: 0:02:25  lr: 0.001493  min_lr: 0.001493  loss: 3.3119 (3.2196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (0.8436)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [182]  [2400/2502]  eta: 0:00:49  lr: 0.001491  min_lr: 0.001491  loss: 3.4623 (3.2224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8109 (0.8407)  time: 0.4881  data: 0.0004  max mem: 42519
Epoch: [182]  [2501/2502]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 3.0481 (3.2233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.8414)  time: 0.4343  data: 0.0007  max mem: 42519
Epoch: [182] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 3.0481 (3.2165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.8414)
Test:  [ 0/50]  eta: 0:04:30  loss: 0.5767 (0.5767)  acc1: 94.4000 (94.4000)  acc5: 99.6000 (99.6000)  time: 5.4175  data: 5.1339  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8363 (0.8405)  acc1: 85.2000 (85.4546)  acc5: 98.0000 (97.5273)  time: 0.7084  data: 0.4672  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8611 (0.8704)  acc1: 84.0000 (84.4381)  acc5: 98.0000 (97.5048)  time: 0.2407  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9635 (0.9516)  acc1: 80.8000 (82.7355)  acc5: 94.8000 (96.4903)  time: 0.2433  data: 0.0007  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1654 (1.0111)  acc1: 77.6000 (81.0829)  acc5: 93.6000 (95.9415)  time: 0.2400  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1743 (1.0248)  acc1: 75.6000 (80.4800)  acc5: 94.4000 (95.8320)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3465 s / it)
* Acc@1 80.622 Acc@5 95.604 loss 1.027
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.62%
Epoch: [183]  [   0/2502]  eta: 1:16:52  lr: 0.001490  min_lr: 0.001490  loss: 3.6518 (3.6518)  weight_decay: 0.0500 (0.0500)  time: 1.8435  data: 1.3506  max mem: 42519
Epoch: [183]  [ 200/2502]  eta: 0:18:46  lr: 0.001488  min_lr: 0.001488  loss: 3.3485 (3.2227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8161 (0.8072)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [183]  [ 400/2502]  eta: 0:17:01  lr: 0.001487  min_lr: 0.001487  loss: 3.1712 (3.1936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.8214)  time: 0.4823  data: 0.0006  max mem: 42519
Epoch: [183]  [ 600/2502]  eta: 0:15:20  lr: 0.001485  min_lr: 0.001485  loss: 3.4875 (3.1944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.8308)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [183]  [ 800/2502]  eta: 0:13:42  lr: 0.001483  min_lr: 0.001483  loss: 3.2647 (3.1887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8436 (0.8343)  time: 0.4885  data: 0.0006  max mem: 42519
Epoch: [183]  [1000/2502]  eta: 0:12:04  lr: 0.001482  min_lr: 0.001482  loss: 3.2478 (3.1834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7895 (0.8303)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [183]  [1200/2502]  eta: 0:10:28  lr: 0.001480  min_lr: 0.001480  loss: 3.2812 (3.1872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.8292)  time: 0.4910  data: 0.0005  max mem: 42519
Epoch: [183]  [1400/2502]  eta: 0:08:51  lr: 0.001478  min_lr: 0.001478  loss: 3.4453 (3.1958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.8295)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [183]  [1600/2502]  eta: 0:07:14  lr: 0.001476  min_lr: 0.001476  loss: 3.2341 (3.1983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8226 (0.8312)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [183]  [1800/2502]  eta: 0:05:38  lr: 0.001475  min_lr: 0.001475  loss: 3.2957 (3.1992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8279 (0.8335)  time: 0.4854  data: 0.0004  max mem: 42519
Epoch: [183]  [2000/2502]  eta: 0:04:01  lr: 0.001473  min_lr: 0.001473  loss: 3.1923 (3.1944)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.8344)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [183]  [2200/2502]  eta: 0:02:25  lr: 0.001471  min_lr: 0.001471  loss: 2.9488 (3.1950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8191 (0.8308)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [183]  [2400/2502]  eta: 0:00:49  lr: 0.001469  min_lr: 0.001469  loss: 3.4522 (3.1929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8516 (0.8338)  time: 0.4838  data: 0.0004  max mem: 42519
Epoch: [183]  [2501/2502]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 3.3863 (3.1922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8089 (0.8336)  time: 0.4398  data: 0.0007  max mem: 42519
Epoch: [183] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 3.3863 (3.2102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8089 (0.8336)
Test:  [ 0/50]  eta: 0:03:42  loss: 0.5347 (0.5347)  acc1: 94.4000 (94.4000)  acc5: 99.6000 (99.6000)  time: 4.4540  data: 4.1798  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.9054 (0.9065)  acc1: 85.6000 (84.9818)  acc5: 98.0000 (97.3818)  time: 0.6208  data: 0.3804  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.9217 (0.9357)  acc1: 84.0000 (84.2476)  acc5: 97.6000 (97.3143)  time: 0.2441  data: 0.0034  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0532 (1.0084)  acc1: 80.0000 (82.5806)  acc5: 95.6000 (96.2839)  time: 0.2801  data: 0.0386  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2319 (1.0702)  acc1: 77.2000 (80.8098)  acc5: 93.2000 (95.5805)  time: 0.2733  data: 0.0355  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2335 (1.0858)  acc1: 76.8000 (80.4000)  acc5: 93.2000 (95.4880)  time: 0.2722  data: 0.0353  max mem: 42519
Test: Total time: 0:00:17 (0.3407 s / it)
* Acc@1 80.432 Acc@5 95.478 loss 1.084
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.62%
Epoch: [184]  [   0/2502]  eta: 1:31:48  lr: 0.001469  min_lr: 0.001469  loss: 3.2198 (3.2198)  weight_decay: 0.0500 (0.0500)  time: 2.2018  data: 1.3603  max mem: 42519
Epoch: [184]  [ 200/2502]  eta: 0:18:42  lr: 0.001467  min_lr: 0.001467  loss: 3.3447 (3.1900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7677 (0.8111)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [184]  [ 400/2502]  eta: 0:16:57  lr: 0.001465  min_lr: 0.001465  loss: 3.1257 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8074 (0.8150)  time: 0.4880  data: 0.0003  max mem: 42519
Epoch: [184]  [ 600/2502]  eta: 0:15:18  lr: 0.001463  min_lr: 0.001463  loss: 3.5027 (3.2124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8588 (0.8292)  time: 0.4783  data: 0.0005  max mem: 42519
Epoch: [184]  [ 800/2502]  eta: 0:13:40  lr: 0.001462  min_lr: 0.001462  loss: 3.2810 (3.1993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8030 (0.8294)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [184]  [1000/2502]  eta: 0:12:03  lr: 0.001460  min_lr: 0.001460  loss: 3.4126 (3.2036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8173 (0.8323)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [184]  [1200/2502]  eta: 0:10:27  lr: 0.001458  min_lr: 0.001458  loss: 3.3707 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7663 (0.8334)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [184]  [1400/2502]  eta: 0:08:50  lr: 0.001456  min_lr: 0.001456  loss: 3.2979 (3.2115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8490 (0.8360)  time: 0.4807  data: 0.0006  max mem: 42519
Epoch: [184]  [1600/2502]  eta: 0:07:14  lr: 0.001455  min_lr: 0.001455  loss: 3.3647 (3.2106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8045 (0.8363)  time: 0.4898  data: 0.0004  max mem: 42519
Epoch: [184]  [1800/2502]  eta: 0:05:38  lr: 0.001453  min_lr: 0.001453  loss: 3.3555 (3.2090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7941 (0.8367)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [184]  [2000/2502]  eta: 0:04:01  lr: 0.001451  min_lr: 0.001451  loss: 3.1450 (3.2095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8113 (0.8351)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [184]  [2200/2502]  eta: 0:02:25  lr: 0.001450  min_lr: 0.001450  loss: 3.2973 (3.2079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8557 (0.8371)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [184]  [2400/2502]  eta: 0:00:49  lr: 0.001448  min_lr: 0.001448  loss: 3.2467 (3.2037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.8338)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [184]  [2501/2502]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 3.0292 (3.2013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8334)  time: 0.4375  data: 0.0013  max mem: 42519
Epoch: [184] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 3.0292 (3.1961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8334)
Test:  [ 0/50]  eta: 0:02:26  loss: 0.4527 (0.4527)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 2.9328  data: 2.6522  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8303 (0.7850)  acc1: 84.4000 (85.8909)  acc5: 97.6000 (97.3818)  time: 0.4824  data: 0.2416  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8566 (0.8230)  acc1: 83.2000 (84.6857)  acc5: 97.2000 (97.4095)  time: 0.2507  data: 0.0118  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.8888 (0.9081)  acc1: 81.6000 (82.7097)  acc5: 96.0000 (96.3613)  time: 0.2509  data: 0.0118  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1258 (0.9664)  acc1: 77.2000 (81.1610)  acc5: 94.0000 (95.8049)  time: 0.2375  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1519 (0.9862)  acc1: 76.4000 (80.5360)  acc5: 94.4000 (95.7120)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2987 s / it)
* Acc@1 80.746 Acc@5 95.778 loss 0.981
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.75%
Epoch: [185]  [   0/2502]  eta: 1:29:02  lr: 0.001447  min_lr: 0.001447  loss: 3.6960 (3.6960)  weight_decay: 0.0500 (0.0500)  time: 2.1353  data: 1.6399  max mem: 42519
Epoch: [185]  [ 200/2502]  eta: 0:18:50  lr: 0.001445  min_lr: 0.001445  loss: 3.3709 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8127 (0.8414)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [185]  [ 400/2502]  eta: 0:17:00  lr: 0.001444  min_lr: 0.001444  loss: 3.2940 (3.1799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.8362)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [185]  [ 600/2502]  eta: 0:15:20  lr: 0.001442  min_lr: 0.001442  loss: 3.1970 (3.1941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7866 (0.8354)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [185]  [ 800/2502]  eta: 0:13:43  lr: 0.001440  min_lr: 0.001440  loss: 3.2615 (3.1872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8133 (0.8347)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [185]  [1000/2502]  eta: 0:12:05  lr: 0.001438  min_lr: 0.001438  loss: 3.3819 (3.1905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8805 (0.8438)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [185]  [1200/2502]  eta: 0:10:28  lr: 0.001437  min_lr: 0.001437  loss: 3.4452 (3.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (0.8440)  time: 0.4850  data: 0.0005  max mem: 42519
Epoch: [185]  [1400/2502]  eta: 0:08:51  lr: 0.001435  min_lr: 0.001435  loss: 3.2327 (3.1862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8713 (0.8511)  time: 0.4816  data: 0.0005  max mem: 42519
Epoch: [185]  [1600/2502]  eta: 0:07:15  lr: 0.001433  min_lr: 0.001433  loss: 3.0124 (3.1878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8292 (0.8506)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [185]  [1800/2502]  eta: 0:05:38  lr: 0.001431  min_lr: 0.001431  loss: 3.0407 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8440 (0.8555)  time: 0.4855  data: 0.0005  max mem: 42519
Epoch: [185]  [2000/2502]  eta: 0:04:02  lr: 0.001430  min_lr: 0.001430  loss: 3.4298 (3.1936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8295 (0.8527)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [185]  [2200/2502]  eta: 0:02:25  lr: 0.001428  min_lr: 0.001428  loss: 2.9852 (3.1928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7986 (0.8510)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [185]  [2400/2502]  eta: 0:00:49  lr: 0.001426  min_lr: 0.001426  loss: 3.3789 (3.1920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.8488)  time: 0.4898  data: 0.0004  max mem: 42519
Epoch: [185]  [2501/2502]  eta: 0:00:00  lr: 0.001425  min_lr: 0.001425  loss: 3.4292 (3.1935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7744 (0.8473)  time: 0.4417  data: 0.0008  max mem: 42519
Epoch: [185] Total time: 0:20:06 (0.4823 s / it)
Averaged stats: lr: 0.001425  min_lr: 0.001425  loss: 3.4292 (3.1961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7744 (0.8473)
Test:  [ 0/50]  eta: 0:02:33  loss: 0.6980 (0.6980)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 3.0733  data: 2.8015  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 1.0001 (0.9813)  acc1: 86.0000 (85.4182)  acc5: 98.0000 (97.0909)  time: 0.4993  data: 0.2552  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 1.0296 (1.0066)  acc1: 83.2000 (84.2095)  acc5: 97.6000 (97.1810)  time: 0.2435  data: 0.0029  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.1796 (1.1064)  acc1: 79.6000 (82.2452)  acc5: 95.2000 (95.9871)  time: 0.2609  data: 0.0224  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.3307 (1.1665)  acc1: 76.8000 (80.7415)  acc5: 93.6000 (95.4049)  time: 0.2571  data: 0.0198  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.3231 (1.1824)  acc1: 75.6000 (80.1520)  acc5: 94.0000 (95.3600)  time: 0.2477  data: 0.0105  max mem: 42519
Test: Total time: 0:00:15 (0.3064 s / it)
* Acc@1 80.646 Acc@5 95.442 loss 1.175
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.75%
Epoch: [186]  [   0/2502]  eta: 1:29:35  lr: 0.001425  min_lr: 0.001425  loss: 3.3138 (3.3138)  weight_decay: 0.0500 (0.0500)  time: 2.1486  data: 1.6346  max mem: 42519
Epoch: [186]  [ 200/2502]  eta: 0:18:44  lr: 0.001424  min_lr: 0.001424  loss: 3.0063 (3.1939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (0.8184)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [186]  [ 400/2502]  eta: 0:17:00  lr: 0.001422  min_lr: 0.001422  loss: 3.3584 (3.1875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8706 (0.8479)  time: 0.4885  data: 0.0004  max mem: 42519
Epoch: [186]  [ 600/2502]  eta: 0:15:20  lr: 0.001420  min_lr: 0.001420  loss: 3.4125 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8121 (0.8541)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [186]  [ 800/2502]  eta: 0:13:41  lr: 0.001419  min_lr: 0.001419  loss: 3.4551 (3.1990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8296 (0.8484)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [186]  [1000/2502]  eta: 0:12:05  lr: 0.001417  min_lr: 0.001417  loss: 3.1497 (3.2000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (0.8459)  time: 0.4844  data: 0.0004  max mem: 42519
Epoch: [186]  [1200/2502]  eta: 0:10:27  lr: 0.001415  min_lr: 0.001415  loss: 3.0345 (3.1906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8182 (0.8462)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [186]  [1400/2502]  eta: 0:08:51  lr: 0.001413  min_lr: 0.001413  loss: 3.1356 (3.1886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8670 (0.8486)  time: 0.4859  data: 0.0004  max mem: 42519
Epoch: [186]  [1600/2502]  eta: 0:07:14  lr: 0.001412  min_lr: 0.001412  loss: 3.2401 (3.1851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7978 (0.8462)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [186]  [1800/2502]  eta: 0:05:38  lr: 0.001410  min_lr: 0.001410  loss: 3.2488 (3.1882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7701 (0.8428)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [186]  [2000/2502]  eta: 0:04:01  lr: 0.001408  min_lr: 0.001408  loss: 3.2731 (3.1897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8230 (0.8425)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [186]  [2200/2502]  eta: 0:02:25  lr: 0.001407  min_lr: 0.001407  loss: 3.3563 (3.1883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8553 (0.8401)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [186]  [2400/2502]  eta: 0:00:49  lr: 0.001405  min_lr: 0.001405  loss: 3.4145 (3.1864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9018 (0.8413)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [186]  [2501/2502]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 3.2702 (3.1875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8389 (0.8413)  time: 0.4367  data: 0.0009  max mem: 42519
Epoch: [186] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 3.2702 (3.1886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8389 (0.8413)
Test:  [ 0/50]  eta: 0:02:18  loss: 0.5812 (0.5812)  acc1: 93.2000 (93.2000)  acc5: 99.6000 (99.6000)  time: 2.7696  data: 2.4802  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.8970 (0.9058)  acc1: 85.6000 (85.1273)  acc5: 98.0000 (97.6000)  time: 0.5145  data: 0.2721  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.9502 (0.9283)  acc1: 83.6000 (84.5905)  acc5: 97.6000 (97.4857)  time: 0.2694  data: 0.0259  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0720 (1.0142)  acc1: 79.6000 (82.5936)  acc5: 95.6000 (96.3484)  time: 0.2566  data: 0.0131  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2389 (1.0739)  acc1: 76.4000 (80.8976)  acc5: 93.2000 (95.7268)  time: 0.2623  data: 0.0249  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2323 (1.0917)  acc1: 75.6000 (80.2160)  acc5: 94.4000 (95.6480)  time: 0.2506  data: 0.0121  max mem: 42519
Test: Total time: 0:00:15 (0.3133 s / it)
* Acc@1 80.484 Acc@5 95.668 loss 1.091
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.75%
Epoch: [187]  [   0/2502]  eta: 1:27:26  lr: 0.001404  min_lr: 0.001404  loss: 3.1145 (3.1145)  weight_decay: 0.0500 (0.0500)  time: 2.0968  data: 1.5759  max mem: 42519
Epoch: [187]  [ 200/2502]  eta: 0:18:50  lr: 0.001402  min_lr: 0.001402  loss: 3.3154 (3.1888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8603 (0.8513)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [187]  [ 400/2502]  eta: 0:17:01  lr: 0.001401  min_lr: 0.001401  loss: 3.3995 (3.2030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7707 (0.8377)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [187]  [ 600/2502]  eta: 0:15:21  lr: 0.001399  min_lr: 0.001399  loss: 3.4165 (3.2026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8772 (nan)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [187]  [ 800/2502]  eta: 0:13:43  lr: 0.001397  min_lr: 0.001397  loss: 3.2031 (3.1925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8166 (nan)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [187]  [1000/2502]  eta: 0:12:05  lr: 0.001395  min_lr: 0.001395  loss: 3.3404 (3.1991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7970 (nan)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [187]  [1200/2502]  eta: 0:10:28  lr: 0.001394  min_lr: 0.001394  loss: 3.2810 (3.1991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9015 (nan)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [187]  [1400/2502]  eta: 0:08:51  lr: 0.001392  min_lr: 0.001392  loss: 3.3110 (3.2045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8135 (nan)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [187]  [1600/2502]  eta: 0:07:15  lr: 0.001390  min_lr: 0.001390  loss: 3.2763 (3.2017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8171 (nan)  time: 0.4824  data: 0.0006  max mem: 42519
Epoch: [187]  [1800/2502]  eta: 0:05:38  lr: 0.001389  min_lr: 0.001389  loss: 3.4391 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8422 (nan)  time: 0.4986  data: 0.0004  max mem: 42519
Epoch: [187]  [2000/2502]  eta: 0:04:02  lr: 0.001387  min_lr: 0.001387  loss: 3.1962 (3.1979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8034 (nan)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [187]  [2200/2502]  eta: 0:02:25  lr: 0.001385  min_lr: 0.001385  loss: 3.4577 (3.1951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8797 (nan)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [187]  [2400/2502]  eta: 0:00:49  lr: 0.001383  min_lr: 0.001383  loss: 3.3565 (3.1922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (nan)  time: 0.4864  data: 0.0004  max mem: 42519
Epoch: [187]  [2501/2502]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 3.1896 (3.1929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8490 (nan)  time: 0.4401  data: 0.0010  max mem: 42519
Epoch: [187] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 3.1896 (3.1844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8490 (nan)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.5868 (0.5868)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 2.7899  data: 2.5130  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.8486 (0.8622)  acc1: 87.2000 (85.8182)  acc5: 97.2000 (97.2364)  time: 0.5135  data: 0.2728  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8496 (0.8848)  acc1: 82.0000 (84.5524)  acc5: 97.6000 (97.3905)  time: 0.2642  data: 0.0247  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9889 (0.9714)  acc1: 80.4000 (82.4774)  acc5: 95.2000 (96.3613)  time: 0.2450  data: 0.0052  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1798 (1.0271)  acc1: 78.0000 (81.1317)  acc5: 93.6000 (95.7561)  time: 0.2590  data: 0.0216  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1798 (1.0440)  acc1: 76.0000 (80.4480)  acc5: 94.0000 (95.6960)  time: 0.2540  data: 0.0168  max mem: 42519
Test: Total time: 0:00:15 (0.3095 s / it)
* Acc@1 80.724 Acc@5 95.566 loss 1.040
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.75%
Epoch: [188]  [   0/2502]  eta: 1:25:54  lr: 0.001383  min_lr: 0.001383  loss: 3.3403 (3.3403)  weight_decay: 0.0500 (0.0500)  time: 2.0600  data: 1.2660  max mem: 42519
Epoch: [188]  [ 200/2502]  eta: 0:18:45  lr: 0.001381  min_lr: 0.001381  loss: 3.4166 (3.1885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8392 (0.8151)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [188]  [ 400/2502]  eta: 0:17:00  lr: 0.001379  min_lr: 0.001379  loss: 3.2530 (3.1631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8240 (0.8401)  time: 0.4993  data: 0.0007  max mem: 42519
Epoch: [188]  [ 600/2502]  eta: 0:15:20  lr: 0.001378  min_lr: 0.001378  loss: 3.0334 (3.1949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8467 (0.8389)  time: 0.4795  data: 0.0006  max mem: 42519
Epoch: [188]  [ 800/2502]  eta: 0:13:42  lr: 0.001376  min_lr: 0.001376  loss: 3.5112 (3.1891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8519 (0.8447)  time: 0.4822  data: 0.0007  max mem: 42519
Epoch: [188]  [1000/2502]  eta: 0:12:05  lr: 0.001374  min_lr: 0.001374  loss: 3.2689 (3.1824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8112 (nan)  time: 0.4878  data: 0.0006  max mem: 42519
Epoch: [188]  [1200/2502]  eta: 0:10:28  lr: 0.001372  min_lr: 0.001372  loss: 3.3259 (3.1930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8438 (nan)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [188]  [1400/2502]  eta: 0:08:51  lr: 0.001371  min_lr: 0.001371  loss: 3.3302 (3.1872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8737 (nan)  time: 0.4821  data: 0.0007  max mem: 42519
Epoch: [188]  [1600/2502]  eta: 0:07:15  lr: 0.001369  min_lr: 0.001369  loss: 3.4106 (3.1839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7833 (nan)  time: 0.4826  data: 0.0007  max mem: 42519
Epoch: [188]  [1800/2502]  eta: 0:05:38  lr: 0.001367  min_lr: 0.001367  loss: 3.2209 (3.1814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8222 (nan)  time: 0.4808  data: 0.0007  max mem: 42519
Epoch: [188]  [2000/2502]  eta: 0:04:02  lr: 0.001366  min_lr: 0.001366  loss: 3.2216 (3.1869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8574 (nan)  time: 0.4800  data: 0.0006  max mem: 42519
Epoch: [188]  [2200/2502]  eta: 0:02:25  lr: 0.001364  min_lr: 0.001364  loss: 3.0319 (3.1836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8546 (nan)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [188]  [2400/2502]  eta: 0:00:49  lr: 0.001362  min_lr: 0.001362  loss: 3.2001 (3.1799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8534 (nan)  time: 0.4801  data: 0.0007  max mem: 42519
Epoch: [188]  [2501/2502]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 3.2774 (3.1793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8112 (nan)  time: 0.4404  data: 0.0010  max mem: 42519
Epoch: [188] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 3.2774 (3.1743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8112 (nan)
Test:  [ 0/50]  eta: 0:02:48  loss: 0.5953 (0.5953)  acc1: 93.6000 (93.6000)  acc5: 98.8000 (98.8000)  time: 3.3788  data: 3.1113  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.9132 (0.8925)  acc1: 85.2000 (85.7455)  acc5: 97.6000 (97.2727)  time: 0.5214  data: 0.2832  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.9132 (0.9135)  acc1: 84.0000 (84.9143)  acc5: 97.6000 (97.2571)  time: 0.2373  data: 0.0015  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0418 (0.9988)  acc1: 80.8000 (82.8774)  acc5: 95.6000 (96.3742)  time: 0.2375  data: 0.0016  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1981 (1.0557)  acc1: 77.2000 (81.3659)  acc5: 93.6000 (95.7463)  time: 0.2359  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2068 (1.0724)  acc1: 76.4000 (80.7200)  acc5: 93.2000 (95.5440)  time: 0.2358  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3015 s / it)
* Acc@1 80.794 Acc@5 95.720 loss 1.071
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.79%
Epoch: [189]  [   0/2502]  eta: 1:34:06  lr: 0.001361  min_lr: 0.001361  loss: 3.5589 (3.5589)  weight_decay: 0.0500 (0.0500)  time: 2.2569  data: 1.7690  max mem: 42519
Epoch: [189]  [ 200/2502]  eta: 0:18:49  lr: 0.001360  min_lr: 0.001360  loss: 3.2455 (3.1530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8429 (0.8486)  time: 0.4774  data: 0.0005  max mem: 42519
Epoch: [189]  [ 400/2502]  eta: 0:17:00  lr: 0.001358  min_lr: 0.001358  loss: 3.2502 (3.1454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7926 (0.8430)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [189]  [ 600/2502]  eta: 0:15:19  lr: 0.001356  min_lr: 0.001356  loss: 3.3478 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8158 (0.8485)  time: 0.4788  data: 0.0006  max mem: 42519
Epoch: [189]  [ 800/2502]  eta: 0:13:41  lr: 0.001355  min_lr: 0.001355  loss: 3.2564 (3.1592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8877 (0.8641)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [189]  [1000/2502]  eta: 0:12:04  lr: 0.001353  min_lr: 0.001353  loss: 3.1115 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8390 (0.8625)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [189]  [1200/2502]  eta: 0:10:27  lr: 0.001351  min_lr: 0.001351  loss: 3.2011 (3.1832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8379 (0.8574)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [189]  [1400/2502]  eta: 0:08:51  lr: 0.001349  min_lr: 0.001349  loss: 3.4319 (3.1790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7937 (0.8534)  time: 0.4822  data: 0.0006  max mem: 42519
Epoch: [189]  [1600/2502]  eta: 0:07:14  lr: 0.001348  min_lr: 0.001348  loss: 2.9567 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8658 (0.8579)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [189]  [1800/2502]  eta: 0:05:38  lr: 0.001346  min_lr: 0.001346  loss: 3.2639 (3.1774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9023 (0.8613)  time: 0.4948  data: 0.0005  max mem: 42519
Epoch: [189]  [2000/2502]  eta: 0:04:01  lr: 0.001344  min_lr: 0.001344  loss: 2.9549 (3.1766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8891 (0.8604)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [189]  [2200/2502]  eta: 0:02:25  lr: 0.001343  min_lr: 0.001343  loss: 3.2565 (3.1719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8160 (0.8608)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [189]  [2400/2502]  eta: 0:00:49  lr: 0.001341  min_lr: 0.001341  loss: 3.3094 (3.1727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8506 (0.8626)  time: 0.4853  data: 0.0005  max mem: 42519
Epoch: [189]  [2501/2502]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 3.0854 (3.1713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8231 (0.8619)  time: 0.4351  data: 0.0007  max mem: 42519
Epoch: [189] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 3.0854 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8231 (0.8619)
Test:  [ 0/50]  eta: 0:04:32  loss: 0.5076 (0.5076)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 5.4464  data: 5.1682  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8014 (0.8041)  acc1: 85.2000 (85.7818)  acc5: 98.0000 (97.4546)  time: 0.7130  data: 0.4726  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8215 (0.8238)  acc1: 83.2000 (85.1429)  acc5: 97.6000 (97.4667)  time: 0.2407  data: 0.0018  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9723 (0.9157)  acc1: 80.8000 (82.9548)  acc5: 96.4000 (96.3613)  time: 0.2408  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1717 (0.9818)  acc1: 77.2000 (81.2098)  acc5: 93.2000 (95.6683)  time: 0.2386  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1717 (1.0007)  acc1: 76.0000 (80.6720)  acc5: 94.0000 (95.6160)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3460 s / it)
* Acc@1 80.850 Acc@5 95.674 loss 1.001
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.85%
Epoch: [190]  [   0/2502]  eta: 1:23:01  lr: 0.001340  min_lr: 0.001340  loss: 3.8388 (3.8388)  weight_decay: 0.0500 (0.0500)  time: 1.9910  data: 1.5003  max mem: 42519
Epoch: [190]  [ 200/2502]  eta: 0:18:41  lr: 0.001338  min_lr: 0.001338  loss: 3.2742 (3.1408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.8650)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [190]  [ 400/2502]  eta: 0:16:57  lr: 0.001337  min_lr: 0.001337  loss: 3.3677 (3.1640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8785 (0.8812)  time: 0.4891  data: 0.0004  max mem: 42519
Epoch: [190]  [ 600/2502]  eta: 0:15:19  lr: 0.001335  min_lr: 0.001335  loss: 3.4158 (3.1760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8018 (0.8867)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [190]  [ 800/2502]  eta: 0:13:41  lr: 0.001333  min_lr: 0.001333  loss: 3.3847 (3.1738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8260 (0.8804)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [190]  [1000/2502]  eta: 0:12:04  lr: 0.001332  min_lr: 0.001332  loss: 3.2381 (3.1760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.8686)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [190]  [1200/2502]  eta: 0:10:27  lr: 0.001330  min_lr: 0.001330  loss: 2.9428 (3.1731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8340 (0.8628)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [190]  [1400/2502]  eta: 0:08:50  lr: 0.001328  min_lr: 0.001328  loss: 3.2223 (3.1674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (0.8648)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [190]  [1600/2502]  eta: 0:07:14  lr: 0.001327  min_lr: 0.001327  loss: 3.4388 (3.1674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8366 (0.8644)  time: 0.4787  data: 0.0003  max mem: 42519
Epoch: [190]  [1800/2502]  eta: 0:05:37  lr: 0.001325  min_lr: 0.001325  loss: 3.1542 (3.1718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8738 (0.8825)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [190]  [2000/2502]  eta: 0:04:01  lr: 0.001323  min_lr: 0.001323  loss: 3.3343 (3.1709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8636 (0.8773)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [190]  [2200/2502]  eta: 0:02:25  lr: 0.001322  min_lr: 0.001322  loss: 2.9602 (3.1691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8413 (0.8756)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [190]  [2400/2502]  eta: 0:00:49  lr: 0.001320  min_lr: 0.001320  loss: 3.3382 (3.1715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7846 (0.8744)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [190]  [2501/2502]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 3.1074 (3.1706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8128 (0.8722)  time: 0.4347  data: 0.0006  max mem: 42519
Epoch: [190] Total time: 0:20:02 (0.4808 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 3.1074 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8128 (0.8722)
Test:  [ 0/50]  eta: 0:03:27  loss: 0.5590 (0.5590)  acc1: 94.0000 (94.0000)  acc5: 99.2000 (99.2000)  time: 4.1561  data: 3.8573  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 0.9123 (0.8873)  acc1: 85.6000 (85.8182)  acc5: 97.6000 (97.6000)  time: 0.6645  data: 0.4189  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9125 (0.9109)  acc1: 84.0000 (84.6476)  acc5: 97.6000 (97.5429)  time: 0.3048  data: 0.0661  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9860 (0.9860)  acc1: 80.8000 (83.0710)  acc5: 96.4000 (96.4387)  time: 0.2711  data: 0.0290  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1657 (1.0381)  acc1: 78.4000 (81.4829)  acc5: 93.6000 (95.8244)  time: 0.2702  data: 0.0259  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1633 (1.0545)  acc1: 78.0000 (80.8880)  acc5: 94.0000 (95.6800)  time: 0.2650  data: 0.0256  max mem: 42519
Test: Total time: 0:00:17 (0.3579 s / it)
* Acc@1 80.892 Acc@5 95.780 loss 1.051
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.89%
Epoch: [191]  [   0/2502]  eta: 1:39:40  lr: 0.001319  min_lr: 0.001319  loss: 3.4917 (3.4917)  weight_decay: 0.0500 (0.0500)  time: 2.3905  data: 1.8931  max mem: 42519
Epoch: [191]  [ 200/2502]  eta: 0:18:53  lr: 0.001317  min_lr: 0.001317  loss: 3.3528 (3.1005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8312 (0.8556)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [191]  [ 400/2502]  eta: 0:17:01  lr: 0.001316  min_lr: 0.001316  loss: 3.2843 (3.1262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8147 (0.8446)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [191]  [ 600/2502]  eta: 0:15:19  lr: 0.001314  min_lr: 0.001314  loss: 3.3328 (3.1509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8128 (0.8394)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [191]  [ 800/2502]  eta: 0:13:42  lr: 0.001312  min_lr: 0.001312  loss: 3.2614 (3.1666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9059 (0.8542)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [191]  [1000/2502]  eta: 0:12:04  lr: 0.001311  min_lr: 0.001311  loss: 3.3911 (3.1671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8220 (0.8574)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [191]  [1200/2502]  eta: 0:10:27  lr: 0.001309  min_lr: 0.001309  loss: 3.0883 (3.1622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8238 (0.8652)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [191]  [1400/2502]  eta: 0:08:50  lr: 0.001307  min_lr: 0.001307  loss: 3.1178 (3.1558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8236 (0.8608)  time: 0.4772  data: 0.0004  max mem: 42519
Epoch: [191]  [1600/2502]  eta: 0:07:13  lr: 0.001305  min_lr: 0.001305  loss: 3.1759 (3.1590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8950 (0.8634)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [191]  [1800/2502]  eta: 0:05:37  lr: 0.001304  min_lr: 0.001304  loss: 3.2591 (3.1604)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4848  data: 0.0004  max mem: 42519
Epoch: [191]  [2000/2502]  eta: 0:04:01  lr: 0.001302  min_lr: 0.001302  loss: 3.2174 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8371 (nan)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [191]  [2200/2502]  eta: 0:02:25  lr: 0.001300  min_lr: 0.001300  loss: 3.2993 (3.1706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8263 (nan)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [191]  [2400/2502]  eta: 0:00:49  lr: 0.001299  min_lr: 0.001299  loss: 3.2326 (3.1740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8222 (nan)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [191]  [2501/2502]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 3.3685 (3.1738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8266 (nan)  time: 0.4366  data: 0.0007  max mem: 42519
Epoch: [191] Total time: 0:20:02 (0.4808 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 3.3685 (3.1692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8266 (nan)
Test:  [ 0/50]  eta: 0:04:44  loss: 0.5976 (0.5976)  acc1: 92.4000 (92.4000)  acc5: 99.2000 (99.2000)  time: 5.6837  data: 5.3899  max mem: 42519
Test:  [10/50]  eta: 0:00:31  loss: 0.8506 (0.8377)  acc1: 86.0000 (86.2545)  acc5: 98.0000 (97.5273)  time: 0.7754  data: 0.5336  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8797 (0.8610)  acc1: 84.0000 (85.2952)  acc5: 98.0000 (97.6381)  time: 0.2612  data: 0.0243  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0313 (0.9508)  acc1: 80.8000 (83.2000)  acc5: 95.6000 (96.5677)  time: 0.2428  data: 0.0007  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1700 (1.0095)  acc1: 77.6000 (81.5707)  acc5: 94.0000 (95.9707)  time: 0.2426  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1700 (1.0264)  acc1: 77.2000 (81.1920)  acc5: 94.8000 (95.8720)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:18 (0.3602 s / it)
* Acc@1 81.092 Acc@5 95.782 loss 1.027
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.09%
Epoch: [192]  [   0/2502]  eta: 1:20:20  lr: 0.001298  min_lr: 0.001298  loss: 2.8490 (2.8490)  weight_decay: 0.0500 (0.0500)  time: 1.9267  data: 1.4371  max mem: 42519
Epoch: [192]  [ 200/2502]  eta: 0:18:42  lr: 0.001296  min_lr: 0.001296  loss: 3.4849 (3.1422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8504 (0.8525)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [192]  [ 400/2502]  eta: 0:16:57  lr: 0.001295  min_lr: 0.001295  loss: 3.1963 (3.1437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8806 (0.8769)  time: 0.4888  data: 0.0004  max mem: 42519
Epoch: [192]  [ 600/2502]  eta: 0:15:19  lr: 0.001293  min_lr: 0.001293  loss: 3.4641 (3.1621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8645 (0.8814)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [192]  [ 800/2502]  eta: 0:13:40  lr: 0.001291  min_lr: 0.001291  loss: 3.2676 (3.1794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8679 (0.8800)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [192]  [1000/2502]  eta: 0:12:04  lr: 0.001290  min_lr: 0.001290  loss: 3.1455 (3.1625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8160 (0.8749)  time: 0.4857  data: 0.0004  max mem: 42519
Epoch: [192]  [1200/2502]  eta: 0:10:27  lr: 0.001288  min_lr: 0.001288  loss: 3.3610 (3.1662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8193 (0.8768)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [192]  [1400/2502]  eta: 0:08:50  lr: 0.001286  min_lr: 0.001286  loss: 3.1938 (3.1586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8866 (0.8819)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [192]  [1600/2502]  eta: 0:07:14  lr: 0.001285  min_lr: 0.001285  loss: 3.3979 (3.1602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8661 (0.8831)  time: 0.4879  data: 0.0004  max mem: 42519
Epoch: [192]  [1800/2502]  eta: 0:05:38  lr: 0.001283  min_lr: 0.001283  loss: 3.2850 (3.1612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8615 (0.8824)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [192]  [2000/2502]  eta: 0:04:01  lr: 0.001281  min_lr: 0.001281  loss: 3.1701 (3.1641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9126 (0.8827)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [192]  [2200/2502]  eta: 0:02:25  lr: 0.001279  min_lr: 0.001279  loss: 3.1851 (3.1685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8957 (0.8850)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [192]  [2400/2502]  eta: 0:00:49  lr: 0.001278  min_lr: 0.001278  loss: 3.4171 (3.1722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9277 (0.8857)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [192]  [2501/2502]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 3.0010 (3.1718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8669 (0.8872)  time: 0.4320  data: 0.0009  max mem: 42519
Epoch: [192] Total time: 0:20:03 (0.4811 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 3.0010 (3.1578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8669 (0.8872)
Test:  [ 0/50]  eta: 0:04:19  loss: 0.5510 (0.5510)  acc1: 93.6000 (93.6000)  acc5: 99.2000 (99.2000)  time: 5.1843  data: 4.8946  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.8193 (0.8319)  acc1: 87.6000 (85.7818)  acc5: 98.4000 (97.6364)  time: 0.6869  data: 0.4454  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8292 (0.8659)  acc1: 83.6000 (84.9714)  acc5: 98.0000 (97.5238)  time: 0.2378  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9956 (0.9549)  acc1: 82.4000 (83.0968)  acc5: 95.6000 (96.3742)  time: 0.2387  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1681 (1.0120)  acc1: 76.0000 (81.3268)  acc5: 94.0000 (95.8244)  time: 0.2381  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1681 (1.0278)  acc1: 77.2000 (80.8640)  acc5: 94.8000 (95.7600)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3399 s / it)
* Acc@1 80.964 Acc@5 95.724 loss 1.026
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.09%
Epoch: [193]  [   0/2502]  eta: 1:35:00  lr: 0.001277  min_lr: 0.001277  loss: 3.4884 (3.4884)  weight_decay: 0.0500 (0.0500)  time: 2.2784  data: 1.5250  max mem: 42519
Epoch: [193]  [ 200/2502]  eta: 0:18:48  lr: 0.001275  min_lr: 0.001275  loss: 2.8067 (3.0763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.8499)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [193]  [ 400/2502]  eta: 0:17:01  lr: 0.001274  min_lr: 0.001274  loss: 3.0287 (3.0933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8386 (0.8675)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [193]  [ 600/2502]  eta: 0:15:19  lr: 0.001272  min_lr: 0.001272  loss: 3.3915 (3.1108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8514 (0.8661)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [193]  [ 800/2502]  eta: 0:13:42  lr: 0.001270  min_lr: 0.001270  loss: 3.2382 (3.1250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8872 (0.8706)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [193]  [1000/2502]  eta: 0:12:04  lr: 0.001269  min_lr: 0.001269  loss: 3.0374 (3.1261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8752 (0.8810)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [193]  [1200/2502]  eta: 0:10:27  lr: 0.001267  min_lr: 0.001267  loss: 3.1599 (3.1430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8406 (0.8962)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [193]  [1400/2502]  eta: 0:08:50  lr: 0.001265  min_lr: 0.001265  loss: 3.2120 (3.1449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8666 (0.8914)  time: 0.4783  data: 0.0003  max mem: 42519
Epoch: [193]  [1600/2502]  eta: 0:07:14  lr: 0.001264  min_lr: 0.001264  loss: 2.9755 (3.1427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8612 (0.8879)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [193]  [1800/2502]  eta: 0:05:37  lr: 0.001262  min_lr: 0.001262  loss: 3.0236 (3.1505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8196 (0.8867)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [193]  [2000/2502]  eta: 0:04:01  lr: 0.001260  min_lr: 0.001260  loss: 3.1918 (3.1555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8249 (0.8843)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [193]  [2200/2502]  eta: 0:02:25  lr: 0.001259  min_lr: 0.001259  loss: 3.1768 (3.1500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8759 (0.8847)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [193]  [2400/2502]  eta: 0:00:49  lr: 0.001257  min_lr: 0.001257  loss: 3.1411 (3.1509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8602 (0.8866)  time: 0.4859  data: 0.0004  max mem: 42519
Epoch: [193]  [2501/2502]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 3.3496 (3.1531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8496 (0.8861)  time: 0.4340  data: 0.0007  max mem: 42519
Epoch: [193] Total time: 0:20:03 (0.4809 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 3.3496 (3.1573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8496 (0.8861)
Test:  [ 0/50]  eta: 0:04:32  loss: 0.6791 (0.6791)  acc1: 92.4000 (92.4000)  acc5: 99.2000 (99.2000)  time: 5.4569  data: 5.1891  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.9456 (0.9742)  acc1: 86.4000 (85.5273)  acc5: 98.4000 (97.3455)  time: 0.7118  data: 0.4721  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9576 (0.9818)  acc1: 83.6000 (84.7619)  acc5: 97.6000 (97.3143)  time: 0.2394  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0813 (1.0640)  acc1: 81.2000 (82.9677)  acc5: 95.6000 (96.4387)  time: 0.2426  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2896 (1.1269)  acc1: 76.8000 (81.1317)  acc5: 94.4000 (95.8927)  time: 0.2624  data: 0.0209  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.3107 (1.1464)  acc1: 76.8000 (80.5680)  acc5: 94.4000 (95.7840)  time: 0.2591  data: 0.0207  max mem: 42519
Test: Total time: 0:00:17 (0.3545 s / it)
* Acc@1 80.824 Acc@5 95.742 loss 1.147
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.09%
Epoch: [194]  [   0/2502]  eta: 1:44:03  lr: 0.001256  min_lr: 0.001256  loss: 2.8611 (2.8611)  weight_decay: 0.0500 (0.0500)  time: 2.4953  data: 1.7995  max mem: 42519
Epoch: [194]  [ 200/2502]  eta: 0:18:49  lr: 0.001254  min_lr: 0.001254  loss: 3.0434 (3.0773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8587 (0.8514)  time: 0.4834  data: 0.0005  max mem: 42519
Epoch: [194]  [ 400/2502]  eta: 0:16:59  lr: 0.001253  min_lr: 0.001253  loss: 3.3155 (3.1200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8365 (0.8704)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [194]  [ 600/2502]  eta: 0:15:20  lr: 0.001251  min_lr: 0.001251  loss: 3.4298 (3.1215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9831 (0.8825)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [194]  [ 800/2502]  eta: 0:13:41  lr: 0.001249  min_lr: 0.001249  loss: 3.2092 (3.1210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.8799)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [194]  [1000/2502]  eta: 0:12:05  lr: 0.001248  min_lr: 0.001248  loss: 3.3049 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8434 (0.8827)  time: 0.4880  data: 0.0004  max mem: 42519
Epoch: [194]  [1200/2502]  eta: 0:10:28  lr: 0.001246  min_lr: 0.001246  loss: 3.0615 (3.1457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (0.8848)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [194]  [1400/2502]  eta: 0:08:51  lr: 0.001244  min_lr: 0.001244  loss: 3.3101 (3.1405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9109 (0.8857)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [194]  [1600/2502]  eta: 0:07:14  lr: 0.001243  min_lr: 0.001243  loss: 3.1342 (3.1364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8356 (0.8851)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [194]  [1800/2502]  eta: 0:05:38  lr: 0.001241  min_lr: 0.001241  loss: 3.3416 (3.1411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8289 (0.8880)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [194]  [2000/2502]  eta: 0:04:01  lr: 0.001239  min_lr: 0.001239  loss: 3.4199 (3.1482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (0.8869)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [194]  [2200/2502]  eta: 0:02:25  lr: 0.001238  min_lr: 0.001238  loss: 3.3906 (3.1546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8624 (0.8908)  time: 0.4889  data: 0.0005  max mem: 42519
Epoch: [194]  [2400/2502]  eta: 0:00:49  lr: 0.001236  min_lr: 0.001236  loss: 3.1507 (3.1502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8518 (0.8893)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [194]  [2501/2502]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.0772 (3.1514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8591 (0.8896)  time: 0.4340  data: 0.0007  max mem: 42519
Epoch: [194] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.0772 (3.1552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8591 (0.8896)
Test:  [ 0/50]  eta: 0:04:29  loss: 0.5686 (0.5686)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 5.3844  data: 5.0979  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8429 (0.8802)  acc1: 85.6000 (85.7818)  acc5: 97.6000 (97.4182)  time: 0.7093  data: 0.4681  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8694 (0.8918)  acc1: 84.0000 (85.2381)  acc5: 97.6000 (97.5238)  time: 0.2401  data: 0.0028  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0331 (0.9844)  acc1: 80.8000 (83.1484)  acc5: 95.6000 (96.5548)  time: 0.2664  data: 0.0261  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2099 (1.0541)  acc1: 77.2000 (81.3659)  acc5: 93.6000 (95.9512)  time: 0.2661  data: 0.0264  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2099 (1.0675)  acc1: 76.4000 (80.8640)  acc5: 94.0000 (95.8400)  time: 0.2616  data: 0.0250  max mem: 42519
Test: Total time: 0:00:17 (0.3550 s / it)
* Acc@1 81.008 Acc@5 95.786 loss 1.070
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.09%
Epoch: [195]  [   0/2502]  eta: 1:36:56  lr: 0.001235  min_lr: 0.001235  loss: 3.1035 (3.1035)  weight_decay: 0.0500 (0.0500)  time: 2.3248  data: 1.5865  max mem: 42519
Epoch: [195]  [ 200/2502]  eta: 0:18:51  lr: 0.001234  min_lr: 0.001234  loss: 3.3219 (3.1511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8931 (0.9142)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [195]  [ 400/2502]  eta: 0:17:01  lr: 0.001232  min_lr: 0.001232  loss: 3.0938 (3.1364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8700 (0.9011)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [195]  [ 600/2502]  eta: 0:15:21  lr: 0.001230  min_lr: 0.001230  loss: 3.3432 (3.1314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8765 (0.8937)  time: 0.4825  data: 0.0004  max mem: 42519
Epoch: [195]  [ 800/2502]  eta: 0:13:44  lr: 0.001229  min_lr: 0.001229  loss: 3.2560 (3.1436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8990 (0.8915)  time: 0.4975  data: 0.0004  max mem: 42519
Epoch: [195]  [1000/2502]  eta: 0:12:06  lr: 0.001227  min_lr: 0.001227  loss: 3.2334 (3.1425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8504 (0.8885)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [195]  [1200/2502]  eta: 0:10:29  lr: 0.001225  min_lr: 0.001225  loss: 3.2444 (3.1507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8879 (0.8979)  time: 0.4824  data: 0.0007  max mem: 42519
Epoch: [195]  [1400/2502]  eta: 0:08:52  lr: 0.001224  min_lr: 0.001224  loss: 3.1764 (3.1501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8424 (0.8992)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [195]  [1600/2502]  eta: 0:07:15  lr: 0.001222  min_lr: 0.001222  loss: 3.3725 (3.1492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8743 (0.8996)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [195]  [1800/2502]  eta: 0:05:38  lr: 0.001220  min_lr: 0.001220  loss: 3.3663 (3.1495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8546 (0.8977)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [195]  [2000/2502]  eta: 0:04:02  lr: 0.001219  min_lr: 0.001219  loss: 3.2430 (3.1522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8811 (0.8974)  time: 0.4833  data: 0.0004  max mem: 42519
Epoch: [195]  [2200/2502]  eta: 0:02:25  lr: 0.001217  min_lr: 0.001217  loss: 3.2528 (3.1544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8777 (0.8970)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [195]  [2400/2502]  eta: 0:00:49  lr: 0.001215  min_lr: 0.001215  loss: 3.1914 (3.1551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8874 (0.9004)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [195]  [2501/2502]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 3.1861 (3.1577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8587 (0.8985)  time: 0.4403  data: 0.0007  max mem: 42519
Epoch: [195] Total time: 0:20:07 (0.4825 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 3.1861 (3.1570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8587 (0.8985)
Test:  [ 0/50]  eta: 0:02:46  loss: 0.5264 (0.5264)  acc1: 94.4000 (94.4000)  acc5: 99.6000 (99.6000)  time: 3.3254  data: 3.0441  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.8793 (0.8470)  acc1: 87.6000 (86.2182)  acc5: 98.4000 (97.7818)  time: 0.5184  data: 0.2772  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8859 (0.8860)  acc1: 83.6000 (85.0857)  acc5: 98.0000 (97.6762)  time: 0.2395  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0473 (0.9824)  acc1: 80.0000 (82.8903)  acc5: 95.6000 (96.5419)  time: 0.2397  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2258 (1.0452)  acc1: 77.2000 (81.3854)  acc5: 94.0000 (95.8342)  time: 0.2376  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2211 (1.0651)  acc1: 76.8000 (80.7440)  acc5: 94.4000 (95.7200)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3031 s / it)
* Acc@1 81.166 Acc@5 95.784 loss 1.061
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.17%
Epoch: [196]  [   0/2502]  eta: 1:20:57  lr: 0.001215  min_lr: 0.001215  loss: 3.4100 (3.4100)  weight_decay: 0.0500 (0.0500)  time: 1.9413  data: 1.4388  max mem: 42519
Epoch: [196]  [ 200/2502]  eta: 0:18:41  lr: 0.001213  min_lr: 0.001213  loss: 3.2086 (3.1417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.8594)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [196]  [ 400/2502]  eta: 0:16:56  lr: 0.001211  min_lr: 0.001211  loss: 3.1137 (3.1286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8473 (0.8749)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [196]  [ 600/2502]  eta: 0:15:18  lr: 0.001210  min_lr: 0.001210  loss: 3.2755 (3.1385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8632 (0.8761)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [196]  [ 800/2502]  eta: 0:13:40  lr: 0.001208  min_lr: 0.001208  loss: 3.1135 (3.1369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8774 (0.8789)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [196]  [1000/2502]  eta: 0:12:04  lr: 0.001206  min_lr: 0.001206  loss: 3.2606 (3.1361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8292 (0.8745)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [196]  [1200/2502]  eta: 0:10:27  lr: 0.001205  min_lr: 0.001205  loss: 3.3131 (3.1288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8650 (0.8788)  time: 0.4783  data: 0.0005  max mem: 42519
Epoch: [196]  [1400/2502]  eta: 0:08:50  lr: 0.001203  min_lr: 0.001203  loss: 3.1526 (3.1269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9286 (0.8799)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [196]  [1600/2502]  eta: 0:07:14  lr: 0.001201  min_lr: 0.001201  loss: 3.1376 (3.1273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8395 (0.8830)  time: 0.4773  data: 0.0004  max mem: 42519
Epoch: [196]  [1800/2502]  eta: 0:05:37  lr: 0.001200  min_lr: 0.001200  loss: 3.2794 (3.1257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8830 (0.8870)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [196]  [2000/2502]  eta: 0:04:01  lr: 0.001198  min_lr: 0.001198  loss: 3.4392 (3.1338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9328 (0.8912)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [196]  [2200/2502]  eta: 0:02:25  lr: 0.001197  min_lr: 0.001197  loss: 3.3385 (3.1301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9634 (0.8950)  time: 0.4865  data: 0.0005  max mem: 42519
Epoch: [196]  [2400/2502]  eta: 0:00:49  lr: 0.001195  min_lr: 0.001195  loss: 3.0813 (3.1343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9044 (0.8972)  time: 0.4860  data: 0.0004  max mem: 42519
Epoch: [196]  [2501/2502]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 3.3907 (3.1339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.8983)  time: 0.4368  data: 0.0013  max mem: 42519
Epoch: [196] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 3.3907 (3.1419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.8983)
Test:  [ 0/50]  eta: 0:03:18  loss: 0.5975 (0.5975)  acc1: 94.0000 (94.0000)  acc5: 99.2000 (99.2000)  time: 3.9612  data: 3.6734  max mem: 42519
Test:  [10/50]  eta: 0:00:23  loss: 0.9308 (0.9043)  acc1: 86.8000 (86.4364)  acc5: 98.0000 (97.6000)  time: 0.5761  data: 0.3343  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9302 (0.9230)  acc1: 83.2000 (85.4286)  acc5: 98.0000 (97.5810)  time: 0.2388  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0413 (1.0091)  acc1: 80.8000 (83.2774)  acc5: 96.0000 (96.4387)  time: 0.2389  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2366 (1.0722)  acc1: 77.6000 (81.5512)  acc5: 93.6000 (95.7951)  time: 0.2375  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2501 (1.0918)  acc1: 76.0000 (80.9440)  acc5: 93.6000 (95.6560)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3150 s / it)
* Acc@1 81.120 Acc@5 95.838 loss 1.091
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.17%
Epoch: [197]  [   0/2502]  eta: 1:24:49  lr: 0.001194  min_lr: 0.001194  loss: 3.1277 (3.1277)  weight_decay: 0.0500 (0.0500)  time: 2.0343  data: 1.4653  max mem: 42519
Epoch: [197]  [ 200/2502]  eta: 0:18:48  lr: 0.001192  min_lr: 0.001192  loss: 3.3535 (3.1324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9223 (0.9325)  time: 0.4883  data: 0.0005  max mem: 42519
Epoch: [197]  [ 400/2502]  eta: 0:17:02  lr: 0.001191  min_lr: 0.001191  loss: 3.3365 (3.1326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9054 (0.9259)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [197]  [ 600/2502]  eta: 0:15:22  lr: 0.001189  min_lr: 0.001189  loss: 2.9837 (3.1190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8947 (0.9099)  time: 0.4853  data: 0.0004  max mem: 42519
Epoch: [197]  [ 800/2502]  eta: 0:13:44  lr: 0.001188  min_lr: 0.001188  loss: 3.2348 (3.1240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8637 (0.9125)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [197]  [1000/2502]  eta: 0:12:06  lr: 0.001186  min_lr: 0.001186  loss: 3.1863 (3.1345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9963 (0.9183)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [197]  [1200/2502]  eta: 0:10:28  lr: 0.001184  min_lr: 0.001184  loss: 3.3537 (3.1431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8804 (0.9201)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [197]  [1400/2502]  eta: 0:08:52  lr: 0.001183  min_lr: 0.001183  loss: 3.1096 (3.1370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9270 (0.9216)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [197]  [1600/2502]  eta: 0:07:15  lr: 0.001181  min_lr: 0.001181  loss: 2.8153 (3.1310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9344 (0.9195)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [197]  [1800/2502]  eta: 0:05:38  lr: 0.001179  min_lr: 0.001179  loss: 3.4215 (3.1280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8703 (0.9200)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [197]  [2000/2502]  eta: 0:04:02  lr: 0.001178  min_lr: 0.001178  loss: 3.2465 (3.1307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.9209)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [197]  [2200/2502]  eta: 0:02:25  lr: 0.001176  min_lr: 0.001176  loss: 3.1237 (3.1329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8696 (0.9242)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [197]  [2400/2502]  eta: 0:00:49  lr: 0.001174  min_lr: 0.001174  loss: 3.0733 (3.1315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.9212)  time: 0.4939  data: 0.0004  max mem: 42519
Epoch: [197]  [2501/2502]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 2.7495 (3.1306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8749 (0.9196)  time: 0.4351  data: 0.0009  max mem: 42519
Epoch: [197] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 2.7495 (3.1355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8749 (0.9196)
Test:  [ 0/50]  eta: 0:04:36  loss: 0.4904 (0.4904)  acc1: 92.4000 (92.4000)  acc5: 99.6000 (99.6000)  time: 5.5318  data: 5.2342  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.7740 (0.8085)  acc1: 86.0000 (85.4545)  acc5: 98.0000 (97.7091)  time: 0.7251  data: 0.4831  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8197 (0.8341)  acc1: 83.6000 (84.5333)  acc5: 97.6000 (97.6571)  time: 0.2410  data: 0.0042  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9446 (0.9179)  acc1: 80.4000 (82.8645)  acc5: 96.4000 (96.6065)  time: 0.2395  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1431 (0.9817)  acc1: 77.6000 (81.3951)  acc5: 93.6000 (95.9610)  time: 0.2393  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1442 (0.9951)  acc1: 76.0000 (80.9280)  acc5: 93.6000 (95.8320)  time: 0.2398  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3488 s / it)
* Acc@1 81.200 Acc@5 95.810 loss 0.993
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.20%
Epoch: [198]  [   0/2502]  eta: 1:16:16  lr: 0.001174  min_lr: 0.001174  loss: 2.9965 (2.9965)  weight_decay: 0.0500 (0.0500)  time: 1.8291  data: 1.3327  max mem: 42519
Epoch: [198]  [ 200/2502]  eta: 0:18:41  lr: 0.001172  min_lr: 0.001172  loss: 3.0838 (3.0802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8921 (0.9093)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [198]  [ 400/2502]  eta: 0:16:56  lr: 0.001170  min_lr: 0.001170  loss: 3.1907 (3.0803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9224 (0.9125)  time: 0.4816  data: 0.0005  max mem: 42519
Epoch: [198]  [ 600/2502]  eta: 0:15:20  lr: 0.001169  min_lr: 0.001169  loss: 3.1924 (3.1016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8846 (0.9120)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [198]  [ 800/2502]  eta: 0:13:41  lr: 0.001167  min_lr: 0.001167  loss: 3.2280 (3.1051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8743 (0.9117)  time: 0.4782  data: 0.0005  max mem: 42519
Epoch: [198]  [1000/2502]  eta: 0:12:04  lr: 0.001165  min_lr: 0.001165  loss: 3.2167 (3.1157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8898 (nan)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [198]  [1200/2502]  eta: 0:10:27  lr: 0.001164  min_lr: 0.001164  loss: 2.8899 (3.1187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9040 (nan)  time: 0.4828  data: 0.0004  max mem: 42519
Epoch: [198]  [1400/2502]  eta: 0:08:50  lr: 0.001162  min_lr: 0.001162  loss: 3.2558 (3.1156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9901 (nan)  time: 0.4782  data: 0.0005  max mem: 42519
Epoch: [198]  [1600/2502]  eta: 0:07:14  lr: 0.001161  min_lr: 0.001161  loss: 3.3253 (3.1191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8015 (nan)  time: 0.4895  data: 0.0004  max mem: 42519
Epoch: [198]  [1800/2502]  eta: 0:05:38  lr: 0.001159  min_lr: 0.001159  loss: 3.2175 (3.1167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9212 (nan)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [198]  [2000/2502]  eta: 0:04:01  lr: 0.001157  min_lr: 0.001157  loss: 3.2600 (3.1124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8437 (nan)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [198]  [2200/2502]  eta: 0:02:25  lr: 0.001156  min_lr: 0.001156  loss: 3.2670 (3.1123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8956 (nan)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [198]  [2400/2502]  eta: 0:00:49  lr: 0.001154  min_lr: 0.001154  loss: 3.0044 (3.1125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9271 (nan)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [198]  [2501/2502]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 3.1005 (3.1100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8849 (nan)  time: 0.4382  data: 0.0009  max mem: 42519
Epoch: [198] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 3.1005 (3.1238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8849 (nan)
Test:  [ 0/50]  eta: 0:03:36  loss: 0.5194 (0.5194)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 4.3323  data: 4.0689  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.8500 (0.8375)  acc1: 85.2000 (86.3636)  acc5: 98.0000 (97.4909)  time: 0.6107  data: 0.3704  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8881 (0.8686)  acc1: 84.4000 (85.3714)  acc5: 98.0000 (97.5429)  time: 0.2660  data: 0.0237  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0155 (0.9455)  acc1: 82.0000 (83.5097)  acc5: 96.8000 (96.6710)  time: 0.2977  data: 0.0559  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1464 (1.0092)  acc1: 77.2000 (81.6878)  acc5: 93.6000 (95.9610)  time: 0.2696  data: 0.0325  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1897 (1.0281)  acc1: 77.2000 (81.1600)  acc5: 93.6000 (95.8560)  time: 0.2661  data: 0.0292  max mem: 42519
Test: Total time: 0:00:17 (0.3457 s / it)
* Acc@1 81.224 Acc@5 95.772 loss 1.029
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.22%
Epoch: [199]  [   0/2502]  eta: 1:34:42  lr: 0.001153  min_lr: 0.001153  loss: 3.4081 (3.4081)  weight_decay: 0.0500 (0.0500)  time: 2.2712  data: 1.7708  max mem: 42519
Epoch: [199]  [ 200/2502]  eta: 0:18:49  lr: 0.001152  min_lr: 0.001152  loss: 3.2177 (3.0928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8735 (0.8866)  time: 0.4894  data: 0.0004  max mem: 42519
Epoch: [199]  [ 400/2502]  eta: 0:17:00  lr: 0.001150  min_lr: 0.001150  loss: 3.4072 (3.0930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8609 (0.8961)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [199]  [ 600/2502]  eta: 0:15:20  lr: 0.001148  min_lr: 0.001148  loss: 3.2885 (3.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (0.9004)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [199]  [ 800/2502]  eta: 0:13:42  lr: 0.001147  min_lr: 0.001147  loss: 2.9684 (3.1151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9106 (0.9005)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [199]  [1000/2502]  eta: 0:12:04  lr: 0.001145  min_lr: 0.001145  loss: 3.2679 (3.1238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9567 (0.9093)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [199]  [1200/2502]  eta: 0:10:27  lr: 0.001143  min_lr: 0.001143  loss: 3.1006 (3.1234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9003 (0.9124)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [199]  [1400/2502]  eta: 0:08:50  lr: 0.001142  min_lr: 0.001142  loss: 3.0229 (3.1204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8909 (0.9099)  time: 0.4792  data: 0.0006  max mem: 42519
Epoch: [199]  [1600/2502]  eta: 0:07:14  lr: 0.001140  min_lr: 0.001140  loss: 3.1063 (3.1233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8664 (0.9075)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [199]  [1800/2502]  eta: 0:05:37  lr: 0.001139  min_lr: 0.001139  loss: 3.4956 (3.1276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9385 (0.9067)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [199]  [2000/2502]  eta: 0:04:01  lr: 0.001137  min_lr: 0.001137  loss: 3.0485 (3.1325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8525 (0.9087)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [199]  [2200/2502]  eta: 0:02:25  lr: 0.001135  min_lr: 0.001135  loss: 3.3468 (3.1331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9043 (0.9114)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [199]  [2400/2502]  eta: 0:00:49  lr: 0.001134  min_lr: 0.001134  loss: 3.0864 (3.1366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8637 (0.9136)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [199]  [2501/2502]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 3.1960 (3.1341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9228 (0.9147)  time: 0.4483  data: 0.0009  max mem: 42519
Epoch: [199] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 3.1960 (3.1269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9228 (0.9147)
Test:  [ 0/50]  eta: 0:02:32  loss: 0.5803 (0.5803)  acc1: 93.2000 (93.2000)  acc5: 99.6000 (99.6000)  time: 3.0448  data: 2.7523  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8792 (0.8602)  acc1: 85.2000 (85.8909)  acc5: 98.4000 (97.9273)  time: 0.4939  data: 0.2507  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9030 (0.8989)  acc1: 83.6000 (84.8381)  acc5: 98.0000 (97.8857)  time: 0.2776  data: 0.0351  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0776 (0.9872)  acc1: 80.4000 (83.1484)  acc5: 96.8000 (96.9677)  time: 0.2768  data: 0.0351  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2336 (1.0574)  acc1: 77.2000 (81.3659)  acc5: 94.4000 (96.2342)  time: 0.2371  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2399 (1.0691)  acc1: 76.0000 (80.8560)  acc5: 94.4000 (96.1200)  time: 0.2369  data: 0.0001  max mem: 42519
Test: Total time: 0:00:15 (0.3115 s / it)
* Acc@1 80.952 Acc@5 95.906 loss 1.075
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.22%
Epoch: [200]  [   0/2502]  eta: 1:31:44  lr: 0.001133  min_lr: 0.001133  loss: 2.9827 (2.9827)  weight_decay: 0.0500 (0.0500)  time: 2.2002  data: 1.6486  max mem: 42519
Epoch: [200]  [ 200/2502]  eta: 0:18:45  lr: 0.001131  min_lr: 0.001131  loss: 3.1164 (3.0989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9284)  time: 0.4818  data: 0.0004  max mem: 42519
Epoch: [200]  [ 400/2502]  eta: 0:16:58  lr: 0.001130  min_lr: 0.001130  loss: 3.2695 (3.0851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8993 (0.9161)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [200]  [ 600/2502]  eta: 0:15:20  lr: 0.001128  min_lr: 0.001128  loss: 3.2492 (3.0857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (nan)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [200]  [ 800/2502]  eta: 0:13:41  lr: 0.001126  min_lr: 0.001126  loss: 3.2157 (3.1004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8808 (nan)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [200]  [1000/2502]  eta: 0:12:03  lr: 0.001125  min_lr: 0.001125  loss: 3.0259 (3.0970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9212 (nan)  time: 0.4782  data: 0.0006  max mem: 42519
Epoch: [200]  [1200/2502]  eta: 0:10:27  lr: 0.001123  min_lr: 0.001123  loss: 3.1829 (3.1004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9443 (nan)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [200]  [1400/2502]  eta: 0:08:50  lr: 0.001122  min_lr: 0.001122  loss: 3.4351 (3.0957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8748 (nan)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [200]  [1600/2502]  eta: 0:07:14  lr: 0.001120  min_lr: 0.001120  loss: 3.3317 (3.1094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8840 (nan)  time: 0.4852  data: 0.0005  max mem: 42519
Epoch: [200]  [1800/2502]  eta: 0:05:38  lr: 0.001118  min_lr: 0.001118  loss: 3.2264 (3.1162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8785 (nan)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [200]  [2000/2502]  eta: 0:04:01  lr: 0.001117  min_lr: 0.001117  loss: 3.0082 (3.1136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8650 (nan)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [200]  [2200/2502]  eta: 0:02:25  lr: 0.001115  min_lr: 0.001115  loss: 3.1320 (3.1160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9632 (nan)  time: 0.4865  data: 0.0004  max mem: 42519
Epoch: [200]  [2400/2502]  eta: 0:00:49  lr: 0.001114  min_lr: 0.001114  loss: 3.3809 (3.1205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9658 (nan)  time: 0.4823  data: 0.0005  max mem: 42519
Epoch: [200]  [2501/2502]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.8477 (3.1170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8818 (nan)  time: 0.4391  data: 0.0008  max mem: 42519
Epoch: [200] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.8477 (3.1204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8818 (nan)
Test:  [ 0/50]  eta: 0:02:39  loss: 0.4460 (0.4460)  acc1: 94.4000 (94.4000)  acc5: 99.6000 (99.6000)  time: 3.1848  data: 2.9028  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.7642 (0.7576)  acc1: 84.4000 (86.1091)  acc5: 97.6000 (97.7091)  time: 0.5071  data: 0.2659  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7840 (0.7977)  acc1: 84.0000 (84.8952)  acc5: 97.6000 (97.6762)  time: 0.2447  data: 0.0014  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9144 (0.8731)  acc1: 80.4000 (83.2516)  acc5: 96.8000 (96.7226)  time: 0.2504  data: 0.0071  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0697 (0.9260)  acc1: 78.0000 (81.7366)  acc5: 94.4000 (96.1951)  time: 0.2442  data: 0.0069  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0914 (0.9414)  acc1: 77.6000 (81.2320)  acc5: 95.2000 (96.0880)  time: 0.2375  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3038 s / it)
* Acc@1 81.280 Acc@5 96.024 loss 0.945
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.28%
Epoch: [201]  [   0/2502]  eta: 1:27:47  lr: 0.001113  min_lr: 0.001113  loss: 3.0837 (3.0837)  weight_decay: 0.0500 (0.0500)  time: 2.1055  data: 1.6085  max mem: 42519
Epoch: [201]  [ 200/2502]  eta: 0:18:48  lr: 0.001111  min_lr: 0.001111  loss: 3.2280 (3.1104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8862 (0.9383)  time: 0.4900  data: 0.0004  max mem: 42519
Epoch: [201]  [ 400/2502]  eta: 0:17:00  lr: 0.001110  min_lr: 0.001110  loss: 2.9260 (3.0821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9065 (0.9361)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [201]  [ 600/2502]  eta: 0:15:19  lr: 0.001108  min_lr: 0.001108  loss: 2.9055 (3.0950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9827 (0.9433)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [201]  [ 800/2502]  eta: 0:13:42  lr: 0.001106  min_lr: 0.001106  loss: 3.2808 (3.0907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8826 (0.9303)  time: 0.4848  data: 0.0004  max mem: 42519
Epoch: [201]  [1000/2502]  eta: 0:12:05  lr: 0.001105  min_lr: 0.001105  loss: 3.2152 (3.0981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9833 (0.9428)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [201]  [1200/2502]  eta: 0:10:28  lr: 0.001103  min_lr: 0.001103  loss: 3.2777 (3.1080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8975 (0.9378)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [201]  [1400/2502]  eta: 0:08:51  lr: 0.001102  min_lr: 0.001102  loss: 2.9647 (3.1053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9312 (0.9344)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [201]  [1600/2502]  eta: 0:07:14  lr: 0.001100  min_lr: 0.001100  loss: 3.2399 (3.1047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8613 (0.9302)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [201]  [1800/2502]  eta: 0:05:38  lr: 0.001098  min_lr: 0.001098  loss: 3.1626 (3.1079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9366 (0.9319)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [201]  [2000/2502]  eta: 0:04:01  lr: 0.001097  min_lr: 0.001097  loss: 3.1425 (3.1140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9025 (0.9347)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [201]  [2200/2502]  eta: 0:02:25  lr: 0.001095  min_lr: 0.001095  loss: 3.2159 (3.1127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8729 (0.9324)  time: 0.4820  data: 0.0004  max mem: 42519
Epoch: [201]  [2400/2502]  eta: 0:00:49  lr: 0.001094  min_lr: 0.001094  loss: 3.1515 (3.1116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.9331)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [201]  [2501/2502]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 3.2403 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9429 (0.9355)  time: 0.4373  data: 0.0015  max mem: 42519
Epoch: [201] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 3.2403 (3.1098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9429 (0.9355)
Test:  [ 0/50]  eta: 0:02:43  loss: 0.6362 (0.6362)  acc1: 93.2000 (93.2000)  acc5: 99.6000 (99.6000)  time: 3.2783  data: 2.9904  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.9308 (0.8994)  acc1: 85.6000 (86.0364)  acc5: 98.0000 (97.6000)  time: 0.5139  data: 0.2723  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.9642 (0.9261)  acc1: 83.6000 (84.8571)  acc5: 97.2000 (97.4857)  time: 0.2385  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0161 (1.0174)  acc1: 80.8000 (83.0194)  acc5: 96.0000 (96.4516)  time: 0.2501  data: 0.0099  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2311 (1.0691)  acc1: 77.6000 (81.5415)  acc5: 94.4000 (95.8927)  time: 0.2494  data: 0.0102  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2366 (1.0799)  acc1: 77.2000 (81.0080)  acc5: 94.4000 (95.7200)  time: 0.2384  data: 0.0015  max mem: 42519
Test: Total time: 0:00:15 (0.3057 s / it)
* Acc@1 81.314 Acc@5 95.886 loss nan
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.31%
Epoch: [202]  [   0/2502]  eta: 1:21:10  lr: 0.001093  min_lr: 0.001093  loss: 3.3213 (3.3213)  weight_decay: 0.0500 (0.0500)  time: 1.9464  data: 1.4550  max mem: 42519
Epoch: [202]  [ 200/2502]  eta: 0:18:39  lr: 0.001091  min_lr: 0.001091  loss: 3.1488 (3.0829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0189 (0.9840)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [202]  [ 400/2502]  eta: 0:16:56  lr: 0.001090  min_lr: 0.001090  loss: 3.0950 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8975 (0.9492)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [202]  [ 600/2502]  eta: 0:15:18  lr: 0.001088  min_lr: 0.001088  loss: 3.2306 (3.0839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9204 (0.9528)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [202]  [ 800/2502]  eta: 0:13:41  lr: 0.001086  min_lr: 0.001086  loss: 3.2485 (3.0882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8945 (0.9454)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [202]  [1000/2502]  eta: 0:12:03  lr: 0.001085  min_lr: 0.001085  loss: 3.3289 (3.0878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9275 (0.9451)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [202]  [1200/2502]  eta: 0:10:27  lr: 0.001083  min_lr: 0.001083  loss: 2.8284 (3.0890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9583 (0.9440)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [202]  [1400/2502]  eta: 0:08:50  lr: 0.001082  min_lr: 0.001082  loss: 3.1794 (3.0955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8935 (0.9389)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [202]  [1600/2502]  eta: 0:07:14  lr: 0.001080  min_lr: 0.001080  loss: 3.2868 (3.0983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8924 (0.9399)  time: 0.4894  data: 0.0004  max mem: 42519
Epoch: [202]  [1800/2502]  eta: 0:05:37  lr: 0.001078  min_lr: 0.001078  loss: 3.3770 (3.1028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8957 (0.9366)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [202]  [2000/2502]  eta: 0:04:01  lr: 0.001077  min_lr: 0.001077  loss: 3.1644 (3.1059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9123 (0.9369)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [202]  [2200/2502]  eta: 0:02:25  lr: 0.001075  min_lr: 0.001075  loss: 3.1416 (3.1062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9072 (0.9361)  time: 0.4886  data: 0.0005  max mem: 42519
Epoch: [202]  [2400/2502]  eta: 0:00:49  lr: 0.001074  min_lr: 0.001074  loss: 2.9191 (3.1044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9069 (0.9350)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [202]  [2501/2502]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 3.2745 (3.1020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9471 (0.9357)  time: 0.4347  data: 0.0007  max mem: 42519
Epoch: [202] Total time: 0:20:04 (0.4812 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 3.2745 (3.0979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9471 (0.9357)
Test:  [ 0/50]  eta: 0:03:36  loss: 0.5879 (0.5879)  acc1: 93.6000 (93.6000)  acc5: 99.6000 (99.6000)  time: 4.3399  data: 4.0570  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8164 (0.8505)  acc1: 86.4000 (86.3273)  acc5: 98.4000 (97.8182)  time: 0.7191  data: 0.4755  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8580 (0.8805)  acc1: 84.4000 (85.2762)  acc5: 98.0000 (97.7524)  time: 0.2974  data: 0.0589  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0170 (0.9642)  acc1: 81.6000 (83.4581)  acc5: 96.4000 (96.7484)  time: 0.2428  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2150 (1.0332)  acc1: 78.8000 (81.8537)  acc5: 94.0000 (96.0488)  time: 0.2629  data: 0.0197  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2178 (1.0505)  acc1: 77.6000 (81.2400)  acc5: 94.4000 (95.9120)  time: 0.2576  data: 0.0195  max mem: 42519
Test: Total time: 0:00:17 (0.3556 s / it)
* Acc@1 81.454 Acc@5 96.048 loss 1.045
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.45%
Epoch: [203]  [   0/2502]  eta: 1:20:43  lr: 0.001073  min_lr: 0.001073  loss: 3.1250 (3.1250)  weight_decay: 0.0500 (0.0500)  time: 1.9359  data: 1.4428  max mem: 42519
Epoch: [203]  [ 200/2502]  eta: 0:18:41  lr: 0.001071  min_lr: 0.001071  loss: 3.1862 (3.1134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8960 (0.9165)  time: 0.4815  data: 0.0005  max mem: 42519
Epoch: [203]  [ 400/2502]  eta: 0:16:59  lr: 0.001070  min_lr: 0.001070  loss: 3.0845 (3.1176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9122 (0.9376)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [203]  [ 600/2502]  eta: 0:15:19  lr: 0.001068  min_lr: 0.001068  loss: 3.1617 (3.1039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9124 (0.9321)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [203]  [ 800/2502]  eta: 0:13:41  lr: 0.001066  min_lr: 0.001066  loss: 3.4306 (3.1098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9328 (0.9417)  time: 0.4953  data: 0.0004  max mem: 42519
Epoch: [203]  [1000/2502]  eta: 0:12:04  lr: 0.001065  min_lr: 0.001065  loss: 3.1552 (3.1147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9248 (0.9391)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [203]  [1200/2502]  eta: 0:10:27  lr: 0.001063  min_lr: 0.001063  loss: 3.2619 (3.1085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9508 (0.9377)  time: 0.4834  data: 0.0004  max mem: 42519
Epoch: [203]  [1400/2502]  eta: 0:08:51  lr: 0.001062  min_lr: 0.001062  loss: 3.1751 (3.1056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9566 (0.9390)  time: 0.4815  data: 0.0004  max mem: 42519
Epoch: [203]  [1600/2502]  eta: 0:07:14  lr: 0.001060  min_lr: 0.001060  loss: 2.9884 (3.1062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (0.9426)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [203]  [1800/2502]  eta: 0:05:38  lr: 0.001059  min_lr: 0.001059  loss: 3.0059 (3.1005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9000 (0.9387)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [203]  [2000/2502]  eta: 0:04:01  lr: 0.001057  min_lr: 0.001057  loss: 3.2166 (3.1023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0269 (0.9438)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [203]  [2200/2502]  eta: 0:02:25  lr: 0.001055  min_lr: 0.001055  loss: 3.3979 (3.0973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8700 (0.9417)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [203]  [2400/2502]  eta: 0:00:49  lr: 0.001054  min_lr: 0.001054  loss: 3.2016 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8785 (0.9409)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [203]  [2501/2502]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 3.2398 (3.0991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9086 (0.9408)  time: 0.4368  data: 0.0007  max mem: 42519
Epoch: [203] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 3.2398 (3.1017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9086 (0.9408)
Test:  [ 0/50]  eta: 0:04:32  loss: 0.5850 (0.5850)  acc1: 92.8000 (92.8000)  acc5: 99.6000 (99.6000)  time: 5.4578  data: 5.1693  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8903 (0.9137)  acc1: 85.2000 (86.1818)  acc5: 97.6000 (97.6727)  time: 0.7119  data: 0.4704  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9546 (0.9597)  acc1: 84.0000 (85.0857)  acc5: 97.6000 (97.6762)  time: 0.2397  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0593 (1.0318)  acc1: 80.8000 (83.4968)  acc5: 96.4000 (96.7226)  time: 0.2412  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2413 (1.0883)  acc1: 77.2000 (81.8049)  acc5: 94.4000 (96.1561)  time: 0.2388  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2396 (1.1038)  acc1: 76.8000 (81.1760)  acc5: 94.4000 (95.9840)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3473 s / it)
* Acc@1 81.434 Acc@5 95.962 loss 1.104
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.45%
Epoch: [204]  [   0/2502]  eta: 1:53:00  lr: 0.001053  min_lr: 0.001053  loss: 3.0459 (3.0459)  weight_decay: 0.0500 (0.0500)  time: 2.7099  data: 1.7170  max mem: 42519
Epoch: [204]  [ 200/2502]  eta: 0:18:49  lr: 0.001051  min_lr: 0.001051  loss: 2.9529 (3.0581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9750 (0.9423)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [204]  [ 400/2502]  eta: 0:16:59  lr: 0.001050  min_lr: 0.001050  loss: 3.4369 (3.0690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9643 (0.9439)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [204]  [ 600/2502]  eta: 0:15:21  lr: 0.001048  min_lr: 0.001048  loss: 3.0136 (3.0835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9076 (0.9550)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [204]  [ 800/2502]  eta: 0:13:43  lr: 0.001047  min_lr: 0.001047  loss: 3.2830 (3.0845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9084 (0.9513)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [204]  [1000/2502]  eta: 0:12:05  lr: 0.001045  min_lr: 0.001045  loss: 3.3027 (3.0952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0473 (0.9541)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [204]  [1200/2502]  eta: 0:10:28  lr: 0.001044  min_lr: 0.001044  loss: 3.1104 (3.0947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8725 (0.9450)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [204]  [1400/2502]  eta: 0:08:51  lr: 0.001042  min_lr: 0.001042  loss: 3.1487 (3.0960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9093 (0.9428)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [204]  [1600/2502]  eta: 0:07:14  lr: 0.001040  min_lr: 0.001040  loss: 3.1489 (3.0952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8719 (0.9410)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [204]  [1800/2502]  eta: 0:05:38  lr: 0.001039  min_lr: 0.001039  loss: 3.1825 (3.0965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9010 (0.9377)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [204]  [2000/2502]  eta: 0:04:01  lr: 0.001037  min_lr: 0.001037  loss: 3.0295 (3.0959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (0.9417)  time: 0.4776  data: 0.0004  max mem: 42519
Epoch: [204]  [2200/2502]  eta: 0:02:25  lr: 0.001036  min_lr: 0.001036  loss: 2.9669 (3.0920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9237 (0.9430)  time: 0.4876  data: 0.0004  max mem: 42519
Epoch: [204]  [2400/2502]  eta: 0:00:49  lr: 0.001034  min_lr: 0.001034  loss: 3.3454 (3.0940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9688 (0.9468)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [204]  [2501/2502]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 3.3891 (3.0937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9664 (0.9478)  time: 0.4338  data: 0.0007  max mem: 42519
Epoch: [204] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 3.3891 (3.0975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9664 (0.9478)
Test:  [ 0/50]  eta: 0:04:20  loss: 0.5542 (0.5542)  acc1: 93.2000 (93.2000)  acc5: 99.6000 (99.6000)  time: 5.2136  data: 4.9467  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8138 (0.8185)  acc1: 87.2000 (86.6182)  acc5: 97.6000 (97.7455)  time: 0.7101  data: 0.4699  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8475 (0.8575)  acc1: 84.0000 (85.4857)  acc5: 97.6000 (97.7143)  time: 0.2526  data: 0.0114  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9822 (0.9367)  acc1: 81.2000 (83.6129)  acc5: 96.8000 (96.7097)  time: 0.2452  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1732 (0.9948)  acc1: 78.4000 (82.0390)  acc5: 94.0000 (96.0878)  time: 0.2715  data: 0.0310  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1715 (1.0091)  acc1: 78.4000 (81.4800)  acc5: 94.8000 (95.9200)  time: 0.2675  data: 0.0308  max mem: 42519
Test: Total time: 0:00:17 (0.3587 s / it)
* Acc@1 81.630 Acc@5 96.006 loss 1.007
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.63%
Epoch: [205]  [   0/2502]  eta: 1:31:45  lr: 0.001033  min_lr: 0.001033  loss: 2.6604 (2.6604)  weight_decay: 0.0500 (0.0500)  time: 2.2003  data: 1.7004  max mem: 42519
Epoch: [205]  [ 200/2502]  eta: 0:18:50  lr: 0.001032  min_lr: 0.001032  loss: 3.2874 (3.1160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9183 (0.9125)  time: 0.4826  data: 0.0005  max mem: 42519
Epoch: [205]  [ 400/2502]  eta: 0:17:03  lr: 0.001030  min_lr: 0.001030  loss: 3.2962 (3.1248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9626 (0.9426)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [205]  [ 600/2502]  eta: 0:15:22  lr: 0.001029  min_lr: 0.001029  loss: 3.2213 (3.1119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9263 (0.9363)  time: 0.4825  data: 0.0007  max mem: 42519
Epoch: [205]  [ 800/2502]  eta: 0:13:45  lr: 0.001027  min_lr: 0.001027  loss: 3.0519 (3.1093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9918 (0.9443)  time: 0.4893  data: 0.0004  max mem: 42519
Epoch: [205]  [1000/2502]  eta: 0:12:07  lr: 0.001025  min_lr: 0.001025  loss: 3.2338 (3.1027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9157 (0.9473)  time: 0.4781  data: 0.0005  max mem: 42519
Epoch: [205]  [1200/2502]  eta: 0:10:29  lr: 0.001024  min_lr: 0.001024  loss: 3.1310 (3.0918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8928 (0.9436)  time: 0.4802  data: 0.0006  max mem: 42519
Epoch: [205]  [1400/2502]  eta: 0:08:52  lr: 0.001022  min_lr: 0.001022  loss: 3.2312 (3.0890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9630 (0.9462)  time: 0.4780  data: 0.0005  max mem: 42519
Epoch: [205]  [1600/2502]  eta: 0:07:15  lr: 0.001021  min_lr: 0.001021  loss: 2.9866 (3.0828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8934 (0.9483)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [205]  [1800/2502]  eta: 0:05:38  lr: 0.001019  min_lr: 0.001019  loss: 3.1491 (3.0804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (0.9468)  time: 0.4819  data: 0.0005  max mem: 42519
Epoch: [205]  [2000/2502]  eta: 0:04:02  lr: 0.001018  min_lr: 0.001018  loss: 3.3450 (3.0857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9054 (0.9454)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [205]  [2200/2502]  eta: 0:02:25  lr: 0.001016  min_lr: 0.001016  loss: 3.1187 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9181 (0.9482)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [205]  [2400/2502]  eta: 0:00:49  lr: 0.001015  min_lr: 0.001015  loss: 3.1188 (3.0838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9570 (0.9488)  time: 0.4858  data: 0.0004  max mem: 42519
Epoch: [205]  [2501/2502]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 3.1048 (3.0860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9230 (0.9498)  time: 0.4473  data: 0.0010  max mem: 42519
Epoch: [205] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 3.1048 (3.0842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9230 (0.9498)
Test:  [ 0/50]  eta: 0:02:28  loss: 0.6476 (0.6476)  acc1: 93.2000 (93.2000)  acc5: 99.6000 (99.6000)  time: 2.9626  data: 2.6865  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.9313 (0.9024)  acc1: 85.6000 (86.5818)  acc5: 97.6000 (97.7091)  time: 0.4857  data: 0.2447  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.9526 (0.9359)  acc1: 84.4000 (85.5048)  acc5: 97.6000 (97.6762)  time: 0.2432  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0728 (1.0230)  acc1: 81.2000 (83.6258)  acc5: 95.6000 (96.6710)  time: 0.2492  data: 0.0062  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2708 (1.0853)  acc1: 77.2000 (81.7854)  acc5: 94.0000 (96.0390)  time: 0.2467  data: 0.0060  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2652 (1.0991)  acc1: 76.8000 (81.4080)  acc5: 94.0000 (95.8800)  time: 0.2404  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2998 s / it)
* Acc@1 81.416 Acc@5 95.960 loss 1.097
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.63%
Epoch: [206]  [   0/2502]  eta: 1:39:42  lr: 0.001014  min_lr: 0.001014  loss: 2.5164 (2.5164)  weight_decay: 0.0500 (0.0500)  time: 2.3913  data: 1.8738  max mem: 42519
Epoch: [206]  [ 200/2502]  eta: 0:18:45  lr: 0.001012  min_lr: 0.001012  loss: 3.2974 (3.0743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8860 (0.9446)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [206]  [ 400/2502]  eta: 0:16:59  lr: 0.001011  min_lr: 0.001011  loss: 2.8871 (3.0570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8997 (0.9415)  time: 0.4820  data: 0.0004  max mem: 42519
Epoch: [206]  [ 600/2502]  eta: 0:15:20  lr: 0.001009  min_lr: 0.001009  loss: 2.9476 (3.0574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9609 (0.9607)  time: 0.4778  data: 0.0005  max mem: 42519
Epoch: [206]  [ 800/2502]  eta: 0:13:41  lr: 0.001007  min_lr: 0.001007  loss: 3.1908 (3.0610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0156 (0.9754)  time: 0.4780  data: 0.0005  max mem: 42519
Epoch: [206]  [1000/2502]  eta: 0:12:04  lr: 0.001006  min_lr: 0.001006  loss: 3.3061 (3.0690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8725 (0.9652)  time: 0.4853  data: 0.0004  max mem: 42519
Epoch: [206]  [1200/2502]  eta: 0:10:27  lr: 0.001004  min_lr: 0.001004  loss: 3.0009 (3.0827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9082 (0.9619)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [206]  [1400/2502]  eta: 0:08:51  lr: 0.001003  min_lr: 0.001003  loss: 3.0213 (3.0773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9382 (0.9609)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [206]  [1600/2502]  eta: 0:07:14  lr: 0.001001  min_lr: 0.001001  loss: 3.3877 (3.0796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8815 (0.9600)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [206]  [1800/2502]  eta: 0:05:38  lr: 0.001000  min_lr: 0.001000  loss: 3.1328 (3.0771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9605 (0.9654)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [206]  [2000/2502]  eta: 0:04:01  lr: 0.000998  min_lr: 0.000998  loss: 3.3307 (3.0734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9232 (0.9638)  time: 0.4815  data: 0.0004  max mem: 42519
Epoch: [206]  [2200/2502]  eta: 0:02:25  lr: 0.000997  min_lr: 0.000997  loss: 3.2342 (3.0790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9770 (0.9674)  time: 0.4892  data: 0.0004  max mem: 42519
Epoch: [206]  [2400/2502]  eta: 0:00:49  lr: 0.000995  min_lr: 0.000995  loss: 3.2637 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8738 (0.9678)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [206]  [2501/2502]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 3.1616 (3.0803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9851 (0.9674)  time: 0.4402  data: 0.0010  max mem: 42519
Epoch: [206] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 3.1616 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9851 (0.9674)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.5781 (0.5781)  acc1: 92.8000 (92.8000)  acc5: 99.2000 (99.2000)  time: 2.8110  data: 2.5394  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8323 (0.8243)  acc1: 86.8000 (86.8364)  acc5: 98.4000 (97.7455)  time: 0.4781  data: 0.2380  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8512 (0.8575)  acc1: 83.6000 (85.5619)  acc5: 98.0000 (97.7905)  time: 0.2461  data: 0.0043  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9769 (0.9369)  acc1: 82.0000 (83.5097)  acc5: 96.4000 (96.7871)  time: 0.2428  data: 0.0007  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1816 (1.0002)  acc1: 77.6000 (81.8439)  acc5: 94.0000 (96.1073)  time: 0.2415  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1901 (1.0133)  acc1: 77.6000 (81.4160)  acc5: 94.0000 (95.9920)  time: 0.2413  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2963 s / it)
* Acc@1 81.592 Acc@5 96.010 loss 1.012
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.63%
Epoch: [207]  [   0/2502]  eta: 1:47:55  lr: 0.000994  min_lr: 0.000994  loss: 3.1180 (3.1180)  weight_decay: 0.0500 (0.0500)  time: 2.5880  data: 1.4496  max mem: 42519
Epoch: [207]  [ 200/2502]  eta: 0:18:49  lr: 0.000993  min_lr: 0.000993  loss: 3.2052 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0163 (1.0070)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [207]  [ 400/2502]  eta: 0:17:01  lr: 0.000991  min_lr: 0.000991  loss: 3.3407 (3.0571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9180 (0.9665)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [207]  [ 600/2502]  eta: 0:15:21  lr: 0.000990  min_lr: 0.000990  loss: 2.9073 (3.0715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9793 (0.9615)  time: 0.4816  data: 0.0004  max mem: 42519
Epoch: [207]  [ 800/2502]  eta: 0:13:43  lr: 0.000988  min_lr: 0.000988  loss: 3.2453 (3.0688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0160 (0.9680)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [207]  [1000/2502]  eta: 0:12:05  lr: 0.000987  min_lr: 0.000987  loss: 3.1851 (3.0662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9634 (0.9669)  time: 0.4816  data: 0.0004  max mem: 42519
Epoch: [207]  [1200/2502]  eta: 0:10:28  lr: 0.000985  min_lr: 0.000985  loss: 3.1727 (3.0646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9620 (inf)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [207]  [1400/2502]  eta: 0:08:51  lr: 0.000983  min_lr: 0.000983  loss: 3.3250 (3.0645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9795 (inf)  time: 0.4899  data: 0.0004  max mem: 42519
Epoch: [207]  [1600/2502]  eta: 0:07:14  lr: 0.000982  min_lr: 0.000982  loss: 3.0043 (3.0655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9643 (inf)  time: 0.4777  data: 0.0005  max mem: 42519
Epoch: [207]  [1800/2502]  eta: 0:05:38  lr: 0.000980  min_lr: 0.000980  loss: 2.9008 (3.0710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8663 (inf)  time: 0.4815  data: 0.0005  max mem: 42519
Epoch: [207]  [2000/2502]  eta: 0:04:02  lr: 0.000979  min_lr: 0.000979  loss: 3.2124 (3.0794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9873 (inf)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [207]  [2200/2502]  eta: 0:02:25  lr: 0.000977  min_lr: 0.000977  loss: 3.3208 (3.0860)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9607 (inf)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [207]  [2400/2502]  eta: 0:00:49  lr: 0.000976  min_lr: 0.000976  loss: 3.0062 (3.0894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0153 (inf)  time: 0.4866  data: 0.0005  max mem: 42519
Epoch: [207]  [2501/2502]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 2.9090 (3.0894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9596 (inf)  time: 0.4432  data: 0.0009  max mem: 42519
Epoch: [207] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 2.9090 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9596 (inf)
Test:  [ 0/50]  eta: 0:02:11  loss: 0.5104 (0.5104)  acc1: 94.0000 (94.0000)  acc5: 99.6000 (99.6000)  time: 2.6336  data: 2.3610  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.7945 (0.8130)  acc1: 88.8000 (86.9818)  acc5: 97.6000 (97.7091)  time: 0.4579  data: 0.2151  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8275 (0.8474)  acc1: 84.8000 (85.8286)  acc5: 97.6000 (97.6952)  time: 0.2736  data: 0.0303  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9695 (0.9343)  acc1: 82.4000 (83.9226)  acc5: 96.4000 (96.6710)  time: 0.2724  data: 0.0304  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1524 (0.9922)  acc1: 77.6000 (82.1561)  acc5: 94.0000 (96.1854)  time: 0.2376  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1312 (1.0089)  acc1: 77.2000 (81.5280)  acc5: 94.0000 (96.0880)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3015 s / it)
* Acc@1 81.632 Acc@5 96.114 loss 1.008
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.63%
Epoch: [208]  [   0/2502]  eta: 1:22:58  lr: 0.000975  min_lr: 0.000975  loss: 3.4406 (3.4406)  weight_decay: 0.0500 (0.0500)  time: 1.9897  data: 1.4876  max mem: 42519
Epoch: [208]  [ 200/2502]  eta: 0:18:42  lr: 0.000973  min_lr: 0.000973  loss: 2.9934 (3.0285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9707 (0.9551)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [208]  [ 400/2502]  eta: 0:16:56  lr: 0.000972  min_lr: 0.000972  loss: 2.9556 (3.0741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9067 (0.9410)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [208]  [ 600/2502]  eta: 0:15:19  lr: 0.000970  min_lr: 0.000970  loss: 3.1195 (3.0849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9553 (0.9533)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [208]  [ 800/2502]  eta: 0:13:42  lr: 0.000969  min_lr: 0.000969  loss: 3.0597 (3.0872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (0.9587)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [208]  [1000/2502]  eta: 0:12:04  lr: 0.000967  min_lr: 0.000967  loss: 3.3141 (3.0879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9048 (0.9583)  time: 0.4820  data: 0.0004  max mem: 42519
Epoch: [208]  [1200/2502]  eta: 0:10:28  lr: 0.000966  min_lr: 0.000966  loss: 3.2241 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (0.9617)  time: 0.4819  data: 0.0005  max mem: 42519
Epoch: [208]  [1400/2502]  eta: 0:08:51  lr: 0.000964  min_lr: 0.000964  loss: 3.0139 (3.0813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9051 (0.9613)  time: 0.4822  data: 0.0005  max mem: 42519
Epoch: [208]  [1600/2502]  eta: 0:07:15  lr: 0.000963  min_lr: 0.000963  loss: 3.3016 (3.0865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9506 (0.9657)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [208]  [1800/2502]  eta: 0:05:38  lr: 0.000961  min_lr: 0.000961  loss: 3.2370 (3.0896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0239 (0.9681)  time: 0.4879  data: 0.0004  max mem: 42519
Epoch: [208]  [2000/2502]  eta: 0:04:02  lr: 0.000960  min_lr: 0.000960  loss: 3.1543 (3.0888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0193 (0.9709)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [208]  [2200/2502]  eta: 0:02:25  lr: 0.000958  min_lr: 0.000958  loss: 3.0584 (3.0891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9451 (0.9723)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [208]  [2400/2502]  eta: 0:00:49  lr: 0.000957  min_lr: 0.000957  loss: 3.2747 (3.0873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9022 (0.9752)  time: 0.4816  data: 0.0005  max mem: 42519
Epoch: [208]  [2501/2502]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 3.0260 (3.0862)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0123 (0.9772)  time: 0.4359  data: 0.0011  max mem: 42519
Epoch: [208] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 3.0260 (3.0757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0123 (0.9772)
Test:  [ 0/50]  eta: 0:03:06  loss: 0.4923 (0.4923)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 3.7303  data: 3.3910  max mem: 42519
Test:  [10/50]  eta: 0:00:22  loss: 0.8372 (0.8246)  acc1: 88.0000 (87.0545)  acc5: 98.4000 (98.0000)  time: 0.5687  data: 0.3222  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8440 (0.8511)  acc1: 84.8000 (85.8095)  acc5: 97.6000 (97.7333)  time: 0.2489  data: 0.0079  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9508 (0.9339)  acc1: 81.6000 (83.7548)  acc5: 96.8000 (96.7484)  time: 0.2416  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1500 (0.9968)  acc1: 78.8000 (82.0195)  acc5: 94.4000 (96.2146)  time: 0.2376  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1500 (1.0109)  acc1: 77.6000 (81.4640)  acc5: 94.8000 (96.1280)  time: 0.2373  data: 0.0001  max mem: 42519
Test: Total time: 0:00:15 (0.3139 s / it)
* Acc@1 81.696 Acc@5 96.066 loss 1.009
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.70%
Epoch: [209]  [   0/2502]  eta: 1:40:08  lr: 0.000956  min_lr: 0.000956  loss: 3.4348 (3.4348)  weight_decay: 0.0500 (0.0500)  time: 2.4014  data: 1.9026  max mem: 42519
Epoch: [209]  [ 200/2502]  eta: 0:18:47  lr: 0.000954  min_lr: 0.000954  loss: 3.2014 (3.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9211 (1.0219)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [209]  [ 400/2502]  eta: 0:17:01  lr: 0.000953  min_lr: 0.000953  loss: 3.1663 (3.0876)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9348 (1.0093)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [209]  [ 600/2502]  eta: 0:15:20  lr: 0.000951  min_lr: 0.000951  loss: 3.2595 (3.0799)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9415 (1.0064)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [209]  [ 800/2502]  eta: 0:13:42  lr: 0.000950  min_lr: 0.000950  loss: 3.2629 (3.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9991 (1.0039)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [209]  [1000/2502]  eta: 0:12:05  lr: 0.000948  min_lr: 0.000948  loss: 3.3619 (3.0791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0002)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [209]  [1200/2502]  eta: 0:10:27  lr: 0.000947  min_lr: 0.000947  loss: 3.1879 (3.0750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (0.9980)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [209]  [1400/2502]  eta: 0:08:51  lr: 0.000945  min_lr: 0.000945  loss: 3.2689 (3.0644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0159 (1.0022)  time: 0.4872  data: 0.0004  max mem: 42519
Epoch: [209]  [1600/2502]  eta: 0:07:14  lr: 0.000944  min_lr: 0.000944  loss: 3.1791 (3.0670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (1.0010)  time: 0.4815  data: 0.0004  max mem: 42519
Epoch: [209]  [1800/2502]  eta: 0:05:38  lr: 0.000942  min_lr: 0.000942  loss: 3.2126 (3.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9655 (0.9986)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [209]  [2000/2502]  eta: 0:04:02  lr: 0.000941  min_lr: 0.000941  loss: 3.2266 (3.0728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8868 (0.9959)  time: 0.4832  data: 0.0004  max mem: 42519
Epoch: [209]  [2200/2502]  eta: 0:02:25  lr: 0.000939  min_lr: 0.000939  loss: 3.2588 (3.0699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9637 (0.9932)  time: 0.4863  data: 0.0005  max mem: 42519
Epoch: [209]  [2400/2502]  eta: 0:00:49  lr: 0.000937  min_lr: 0.000937  loss: 3.2908 (3.0675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9149 (0.9934)  time: 0.4903  data: 0.0005  max mem: 42519
Epoch: [209]  [2501/2502]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 3.3170 (3.0683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (0.9943)  time: 0.4452  data: 0.0008  max mem: 42519
Epoch: [209] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 3.3170 (3.0677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9706 (0.9943)
Test:  [ 0/50]  eta: 0:03:00  loss: 0.4967 (0.4967)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 3.6112  data: 3.3350  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.7846 (0.7832)  acc1: 88.8000 (87.3091)  acc5: 98.4000 (97.9636)  time: 0.5433  data: 0.3036  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8100 (0.8254)  acc1: 84.4000 (85.8667)  acc5: 98.0000 (98.0191)  time: 0.2363  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9835 (0.9193)  acc1: 82.0000 (83.9742)  acc5: 97.2000 (97.0323)  time: 0.2362  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1606 (0.9874)  acc1: 78.0000 (82.1463)  acc5: 94.0000 (96.2537)  time: 0.2359  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1874 (1.0080)  acc1: 77.2000 (81.5680)  acc5: 94.4000 (96.1600)  time: 0.2357  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3057 s / it)
* Acc@1 81.772 Acc@5 96.100 loss 1.004
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.77%
Epoch: [210]  [   0/2502]  eta: 1:17:13  lr: 0.000937  min_lr: 0.000937  loss: 2.0698 (2.0698)  weight_decay: 0.0500 (0.0500)  time: 1.8518  data: 1.3583  max mem: 42519
Epoch: [210]  [ 200/2502]  eta: 0:18:43  lr: 0.000935  min_lr: 0.000935  loss: 3.1052 (3.0162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9341 (0.9552)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [210]  [ 400/2502]  eta: 0:16:59  lr: 0.000934  min_lr: 0.000934  loss: 3.1826 (3.0418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9747 (0.9647)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [210]  [ 600/2502]  eta: 0:15:21  lr: 0.000932  min_lr: 0.000932  loss: 3.2415 (3.0465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9634 (0.9745)  time: 0.4785  data: 0.0003  max mem: 42519
Epoch: [210]  [ 800/2502]  eta: 0:13:42  lr: 0.000931  min_lr: 0.000931  loss: 3.0339 (3.0674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9910 (0.9805)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [210]  [1000/2502]  eta: 0:12:04  lr: 0.000929  min_lr: 0.000929  loss: 3.2099 (3.0766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9589 (0.9821)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [210]  [1200/2502]  eta: 0:10:28  lr: 0.000928  min_lr: 0.000928  loss: 3.1946 (3.0729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9264 (0.9808)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [210]  [1400/2502]  eta: 0:08:51  lr: 0.000926  min_lr: 0.000926  loss: 3.0286 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9361 (0.9804)  time: 0.4816  data: 0.0005  max mem: 42519
Epoch: [210]  [1600/2502]  eta: 0:07:15  lr: 0.000925  min_lr: 0.000925  loss: 3.2894 (3.0760)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0604 (0.9865)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [210]  [1800/2502]  eta: 0:05:38  lr: 0.000923  min_lr: 0.000923  loss: 3.2070 (3.0766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8427 (nan)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [210]  [2000/2502]  eta: 0:04:02  lr: 0.000922  min_lr: 0.000922  loss: 2.9658 (3.0768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9786 (nan)  time: 0.4844  data: 0.0005  max mem: 42519
Epoch: [210]  [2200/2502]  eta: 0:02:25  lr: 0.000920  min_lr: 0.000920  loss: 3.2879 (3.0757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9388 (nan)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [210]  [2400/2502]  eta: 0:00:49  lr: 0.000919  min_lr: 0.000919  loss: 3.2144 (3.0730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9742 (nan)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [210]  [2501/2502]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 2.9131 (3.0696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9713 (nan)  time: 0.4414  data: 0.0012  max mem: 42519
Epoch: [210] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 2.9131 (3.0638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9713 (nan)
Test:  [ 0/50]  eta: 0:03:41  loss: 0.4974 (0.4974)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 4.4217  data: 4.1320  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.8214 (0.8248)  acc1: 88.0000 (86.6546)  acc5: 98.4000 (97.8545)  time: 0.6179  data: 0.3766  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8315 (0.8492)  acc1: 84.0000 (85.6381)  acc5: 97.6000 (97.6952)  time: 0.2375  data: 0.0008  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9781 (0.9290)  acc1: 81.6000 (83.8968)  acc5: 95.6000 (96.6581)  time: 0.2376  data: 0.0004  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1675 (0.9895)  acc1: 78.0000 (82.2244)  acc5: 94.8000 (96.1366)  time: 0.2373  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1751 (1.0036)  acc1: 77.2000 (81.6480)  acc5: 94.8000 (96.0880)  time: 0.2370  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3236 s / it)
* Acc@1 81.858 Acc@5 96.106 loss 0.996
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.86%
Epoch: [211]  [   0/2502]  eta: 1:31:10  lr: 0.000918  min_lr: 0.000918  loss: 2.8796 (2.8796)  weight_decay: 0.0500 (0.0500)  time: 2.1866  data: 1.6889  max mem: 42519
Epoch: [211]  [ 200/2502]  eta: 0:18:46  lr: 0.000916  min_lr: 0.000916  loss: 3.2857 (3.0617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9645 (1.0009)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [211]  [ 400/2502]  eta: 0:16:58  lr: 0.000915  min_lr: 0.000915  loss: 3.1677 (3.0461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9642 (1.0091)  time: 0.4771  data: 0.0004  max mem: 42519
Epoch: [211]  [ 600/2502]  eta: 0:15:17  lr: 0.000913  min_lr: 0.000913  loss: 3.0761 (3.0506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9516 (1.0237)  time: 0.4771  data: 0.0004  max mem: 42519
Epoch: [211]  [ 800/2502]  eta: 0:13:39  lr: 0.000912  min_lr: 0.000912  loss: 3.1669 (3.0418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9792 (1.0102)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [211]  [1000/2502]  eta: 0:12:03  lr: 0.000910  min_lr: 0.000910  loss: 3.1381 (3.0374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9517 (0.9997)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [211]  [1200/2502]  eta: 0:10:26  lr: 0.000909  min_lr: 0.000909  loss: 2.9572 (3.0475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9769 (0.9962)  time: 0.4833  data: 0.0006  max mem: 42519
Epoch: [211]  [1400/2502]  eta: 0:08:50  lr: 0.000907  min_lr: 0.000907  loss: 3.2971 (3.0494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9236 (0.9946)  time: 0.4803  data: 0.0006  max mem: 42519
Epoch: [211]  [1600/2502]  eta: 0:07:14  lr: 0.000906  min_lr: 0.000906  loss: 3.2672 (3.0432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9763 (0.9958)  time: 0.4814  data: 0.0006  max mem: 42519
Epoch: [211]  [1800/2502]  eta: 0:05:38  lr: 0.000904  min_lr: 0.000904  loss: 3.2048 (3.0484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9865 (0.9952)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [211]  [2000/2502]  eta: 0:04:01  lr: 0.000903  min_lr: 0.000903  loss: 3.2227 (3.0470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9475 (0.9964)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [211]  [2200/2502]  eta: 0:02:25  lr: 0.000901  min_lr: 0.000901  loss: 3.0658 (3.0503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9826 (0.9978)  time: 0.4787  data: 0.0003  max mem: 42519
Epoch: [211]  [2400/2502]  eta: 0:00:49  lr: 0.000900  min_lr: 0.000900  loss: 2.9147 (3.0542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9960 (0.9991)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [211]  [2501/2502]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 3.1328 (3.0554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9274 (0.9984)  time: 0.4452  data: 0.0007  max mem: 42519
Epoch: [211] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 3.1328 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9274 (0.9984)
Test:  [ 0/50]  eta: 0:03:47  loss: 0.4933 (0.4933)  acc1: 94.0000 (94.0000)  acc5: 98.8000 (98.8000)  time: 4.5495  data: 4.2713  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.8319 (0.8363)  acc1: 87.2000 (86.4727)  acc5: 98.0000 (97.8182)  time: 0.6293  data: 0.3887  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8548 (0.8702)  acc1: 84.0000 (85.4286)  acc5: 98.0000 (97.7714)  time: 0.2376  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0366 (0.9656)  acc1: 82.4000 (83.7161)  acc5: 96.8000 (96.7355)  time: 0.2379  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2105 (1.0263)  acc1: 78.0000 (82.0000)  acc5: 94.4000 (96.2146)  time: 0.2376  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1882 (1.0369)  acc1: 77.2000 (81.5200)  acc5: 95.2000 (96.2000)  time: 0.2373  data: 0.0001  max mem: 42519
Test: Total time: 0:00:16 (0.3263 s / it)
* Acc@1 81.804 Acc@5 96.126 loss 1.029
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.86%
Epoch: [212]  [   0/2502]  eta: 1:45:44  lr: 0.000899  min_lr: 0.000899  loss: 3.2923 (3.2923)  weight_decay: 0.0500 (0.0500)  time: 2.5358  data: 2.0366  max mem: 42519
Epoch: [212]  [ 200/2502]  eta: 0:18:50  lr: 0.000897  min_lr: 0.000897  loss: 3.2573 (3.0778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.0617)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [212]  [ 400/2502]  eta: 0:17:01  lr: 0.000896  min_lr: 0.000896  loss: 3.1885 (3.0205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9859 (1.0411)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [212]  [ 600/2502]  eta: 0:15:22  lr: 0.000894  min_lr: 0.000894  loss: 3.1928 (3.0400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9321 (1.0251)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [212]  [ 800/2502]  eta: 0:13:43  lr: 0.000893  min_lr: 0.000893  loss: 3.2833 (3.0540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9712 (1.0216)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [212]  [1000/2502]  eta: 0:12:05  lr: 0.000892  min_lr: 0.000892  loss: 3.1326 (3.0434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9100 (1.0214)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [212]  [1200/2502]  eta: 0:10:29  lr: 0.000890  min_lr: 0.000890  loss: 3.1686 (3.0446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0012 (1.0237)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [212]  [1400/2502]  eta: 0:08:52  lr: 0.000889  min_lr: 0.000889  loss: 3.1820 (3.0396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9829 (1.0256)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [212]  [1600/2502]  eta: 0:07:15  lr: 0.000887  min_lr: 0.000887  loss: 2.7836 (3.0434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9904 (1.0199)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [212]  [1800/2502]  eta: 0:05:38  lr: 0.000886  min_lr: 0.000886  loss: 3.1215 (3.0439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9404 (1.0156)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [212]  [2000/2502]  eta: 0:04:02  lr: 0.000884  min_lr: 0.000884  loss: 3.0987 (3.0459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9537 (1.0125)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [212]  [2200/2502]  eta: 0:02:25  lr: 0.000883  min_lr: 0.000883  loss: 3.2068 (3.0491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9656 (1.0101)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [212]  [2400/2502]  eta: 0:00:49  lr: 0.000881  min_lr: 0.000881  loss: 3.3452 (3.0506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0035 (1.0079)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [212]  [2501/2502]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 3.1184 (3.0507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9699 (1.0090)  time: 0.4424  data: 0.0011  max mem: 42519
Epoch: [212] Total time: 0:20:06 (0.4820 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 3.1184 (3.0465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9699 (1.0090)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.4607 (0.4607)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 3.2416  data: 2.9616  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.7277 (0.7632)  acc1: 88.0000 (86.6545)  acc5: 98.4000 (97.8909)  time: 0.5115  data: 0.2712  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7296 (0.7828)  acc1: 85.2000 (86.0191)  acc5: 98.0000 (97.8857)  time: 0.2375  data: 0.0013  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9217 (0.8768)  acc1: 82.8000 (83.9613)  acc5: 96.4000 (96.8258)  time: 0.2365  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1300 (0.9426)  acc1: 77.2000 (82.2244)  acc5: 94.4000 (96.1659)  time: 0.2362  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1300 (0.9608)  acc1: 77.2000 (81.5760)  acc5: 95.2000 (96.0640)  time: 0.2359  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2998 s / it)
* Acc@1 81.920 Acc@5 96.152 loss 0.954
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.92%
Epoch: [213]  [   0/2502]  eta: 1:33:45  lr: 0.000880  min_lr: 0.000880  loss: 2.8609 (2.8609)  weight_decay: 0.0500 (0.0500)  time: 2.2484  data: 1.7516  max mem: 42519
Epoch: [213]  [ 200/2502]  eta: 0:18:44  lr: 0.000879  min_lr: 0.000879  loss: 3.0386 (3.0287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9520 (1.0373)  time: 0.4871  data: 0.0005  max mem: 42519
Epoch: [213]  [ 400/2502]  eta: 0:16:58  lr: 0.000877  min_lr: 0.000877  loss: 3.2830 (3.0337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9868 (1.0203)  time: 0.4879  data: 0.0004  max mem: 42519
Epoch: [213]  [ 600/2502]  eta: 0:15:17  lr: 0.000876  min_lr: 0.000876  loss: 2.9812 (3.0452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9596 (1.0252)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [213]  [ 800/2502]  eta: 0:13:40  lr: 0.000874  min_lr: 0.000874  loss: 3.1246 (3.0385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9886 (1.0182)  time: 0.4847  data: 0.0004  max mem: 42519
Epoch: [213]  [1000/2502]  eta: 0:12:04  lr: 0.000873  min_lr: 0.000873  loss: 3.3056 (3.0433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9661 (1.0167)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [213]  [1200/2502]  eta: 0:10:27  lr: 0.000871  min_lr: 0.000871  loss: 3.1507 (3.0450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9580 (1.0129)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [213]  [1400/2502]  eta: 0:08:50  lr: 0.000870  min_lr: 0.000870  loss: 2.9058 (3.0395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9605 (1.0120)  time: 0.4814  data: 0.0006  max mem: 42519
Epoch: [213]  [1600/2502]  eta: 0:07:14  lr: 0.000868  min_lr: 0.000868  loss: 3.0730 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9522 (1.0082)  time: 0.4845  data: 0.0006  max mem: 42519
Epoch: [213]  [1800/2502]  eta: 0:05:37  lr: 0.000867  min_lr: 0.000867  loss: 3.0115 (3.0386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9983 (1.0107)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [213]  [2000/2502]  eta: 0:04:01  lr: 0.000866  min_lr: 0.000866  loss: 3.3388 (3.0395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9370 (1.0094)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [213]  [2200/2502]  eta: 0:02:25  lr: 0.000864  min_lr: 0.000864  loss: 3.2091 (3.0371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9961 (1.0089)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [213]  [2400/2502]  eta: 0:00:49  lr: 0.000863  min_lr: 0.000863  loss: 3.2061 (3.0399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0400 (1.0079)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [213]  [2501/2502]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 3.0567 (3.0409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.0066)  time: 0.4399  data: 0.0010  max mem: 42519
Epoch: [213] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 3.0567 (3.0443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.0066)
Test:  [ 0/50]  eta: 0:02:24  loss: 0.5593 (0.5593)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 2.8801  data: 2.6137  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8237 (0.8436)  acc1: 88.4000 (86.7636)  acc5: 98.0000 (97.7818)  time: 0.4826  data: 0.2381  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8546 (0.8703)  acc1: 83.6000 (85.7143)  acc5: 98.0000 (97.9048)  time: 0.2467  data: 0.0032  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9572 (0.9593)  acc1: 82.8000 (83.8452)  acc5: 96.4000 (96.7742)  time: 0.2458  data: 0.0049  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1550 (1.0133)  acc1: 80.4000 (82.4683)  acc5: 93.6000 (96.1854)  time: 0.2591  data: 0.0219  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1710 (1.0293)  acc1: 80.4000 (81.6880)  acc5: 94.8000 (96.1280)  time: 0.2571  data: 0.0200  max mem: 42519
Test: Total time: 0:00:15 (0.3045 s / it)
* Acc@1 82.030 Acc@5 96.240 loss 1.022
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.03%
Epoch: [214]  [   0/2502]  eta: 1:16:08  lr: 0.000862  min_lr: 0.000862  loss: 2.9223 (2.9223)  weight_decay: 0.0500 (0.0500)  time: 1.8258  data: 1.3446  max mem: 42519
Epoch: [214]  [ 200/2502]  eta: 0:18:43  lr: 0.000860  min_lr: 0.000860  loss: 3.0936 (3.0092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9818 (0.9733)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [214]  [ 400/2502]  eta: 0:16:57  lr: 0.000859  min_lr: 0.000859  loss: 3.2268 (3.0373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9790 (1.0000)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [214]  [ 600/2502]  eta: 0:15:19  lr: 0.000857  min_lr: 0.000857  loss: 3.3186 (3.0251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (1.0005)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [214]  [ 800/2502]  eta: 0:13:40  lr: 0.000856  min_lr: 0.000856  loss: 3.0526 (3.0216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9622 (1.0140)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [214]  [1000/2502]  eta: 0:12:03  lr: 0.000854  min_lr: 0.000854  loss: 3.2101 (3.0236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9619 (1.0100)  time: 0.4822  data: 0.0005  max mem: 42519
Epoch: [214]  [1200/2502]  eta: 0:10:27  lr: 0.000853  min_lr: 0.000853  loss: 2.9385 (3.0164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9433 (1.0091)  time: 0.4830  data: 0.0005  max mem: 42519
Epoch: [214]  [1400/2502]  eta: 0:08:51  lr: 0.000852  min_lr: 0.000852  loss: 3.0969 (3.0200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9800 (1.0076)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [214]  [1600/2502]  eta: 0:07:14  lr: 0.000850  min_lr: 0.000850  loss: 3.2980 (3.0203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2595 (nan)  time: 0.4857  data: 0.0004  max mem: 42519
Epoch: [214]  [1800/2502]  eta: 0:05:38  lr: 0.000849  min_lr: 0.000849  loss: 3.1427 (3.0209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0251 (nan)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [214]  [2000/2502]  eta: 0:04:01  lr: 0.000847  min_lr: 0.000847  loss: 3.1396 (3.0217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9732 (nan)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [214]  [2200/2502]  eta: 0:02:25  lr: 0.000846  min_lr: 0.000846  loss: 3.3490 (3.0278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0485 (nan)  time: 0.4860  data: 0.0005  max mem: 42519
Epoch: [214]  [2400/2502]  eta: 0:00:49  lr: 0.000844  min_lr: 0.000844  loss: 3.1795 (3.0299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9344 (nan)  time: 0.4826  data: 0.0005  max mem: 42519
Epoch: [214]  [2501/2502]  eta: 0:00:00  lr: 0.000843  min_lr: 0.000843  loss: 3.2161 (3.0324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0980 (nan)  time: 0.4375  data: 0.0007  max mem: 42519
Epoch: [214] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.000843  min_lr: 0.000843  loss: 3.2161 (3.0436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0980 (nan)
Test:  [ 0/50]  eta: 0:02:14  loss: 0.5571 (0.5571)  acc1: 96.4000 (96.4000)  acc5: 99.2000 (99.2000)  time: 2.6975  data: 2.4231  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.8951 (0.8878)  acc1: 88.8000 (87.0545)  acc5: 97.6000 (97.6364)  time: 0.4652  data: 0.2245  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.9015 (0.9215)  acc1: 84.8000 (85.8095)  acc5: 97.6000 (97.6762)  time: 0.2435  data: 0.0026  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0225 (1.0052)  acc1: 82.4000 (84.1032)  acc5: 96.0000 (96.7097)  time: 0.2420  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2405 (1.0696)  acc1: 78.4000 (82.3317)  acc5: 94.0000 (96.0781)  time: 0.2550  data: 0.0172  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2353 (1.0821)  acc1: 77.6000 (81.8400)  acc5: 94.4000 (96.0320)  time: 0.2543  data: 0.0170  max mem: 42519
Test: Total time: 0:00:14 (0.2988 s / it)
* Acc@1 82.026 Acc@5 96.152 loss 1.078
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.03%
Epoch: [215]  [   0/2502]  eta: 1:38:58  lr: 0.000843  min_lr: 0.000843  loss: 3.4459 (3.4459)  weight_decay: 0.0500 (0.0500)  time: 2.3734  data: 1.8814  max mem: 42519
Epoch: [215]  [ 200/2502]  eta: 0:18:46  lr: 0.000842  min_lr: 0.000842  loss: 2.8093 (3.0317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9952 (1.0346)  time: 0.4875  data: 0.0004  max mem: 42519
Epoch: [215]  [ 400/2502]  eta: 0:17:01  lr: 0.000841  min_lr: 0.000841  loss: 3.2586 (3.0432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9879 (1.0184)  time: 0.4824  data: 0.0006  max mem: 42519
Epoch: [215]  [ 600/2502]  eta: 0:15:21  lr: 0.000839  min_lr: 0.000839  loss: 3.2405 (3.0433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0353 (1.0253)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [215]  [ 800/2502]  eta: 0:13:44  lr: 0.000838  min_lr: 0.000838  loss: 3.2824 (3.0475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9784 (1.0301)  time: 0.4980  data: 0.0006  max mem: 42519
Epoch: [215]  [1000/2502]  eta: 0:12:06  lr: 0.000836  min_lr: 0.000836  loss: 3.3500 (3.0510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9356 (1.0239)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [215]  [1200/2502]  eta: 0:10:28  lr: 0.000835  min_lr: 0.000835  loss: 3.0980 (3.0491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9762 (1.0232)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [215]  [1400/2502]  eta: 0:08:52  lr: 0.000833  min_lr: 0.000833  loss: 2.8979 (3.0383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0594 (1.0226)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [215]  [1600/2502]  eta: 0:07:15  lr: 0.000832  min_lr: 0.000832  loss: 3.1461 (3.0348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0275 (1.0233)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [215]  [1800/2502]  eta: 0:05:38  lr: 0.000830  min_lr: 0.000830  loss: 3.0730 (3.0366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0152 (1.0200)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [215]  [2000/2502]  eta: 0:04:02  lr: 0.000829  min_lr: 0.000829  loss: 3.2935 (3.0352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1008 (1.0249)  time: 0.4833  data: 0.0004  max mem: 42519
Epoch: [215]  [2200/2502]  eta: 0:02:25  lr: 0.000827  min_lr: 0.000827  loss: 3.1814 (3.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9652 (1.0242)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [215]  [2400/2502]  eta: 0:00:49  lr: 0.000826  min_lr: 0.000826  loss: 3.2144 (3.0357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0408 (1.0273)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [215]  [2501/2502]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 3.2227 (3.0356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9902 (1.0264)  time: 0.4436  data: 0.0007  max mem: 42519
Epoch: [215] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 3.2227 (3.0350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9902 (1.0264)
Test:  [ 0/50]  eta: 0:03:38  loss: 0.5816 (0.5816)  acc1: 93.6000 (93.6000)  acc5: 99.2000 (99.2000)  time: 4.3768  data: 4.1204  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.7684 (0.8102)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (97.8909)  time: 0.6147  data: 0.3751  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8734 (0.8552)  acc1: 84.4000 (85.9238)  acc5: 97.6000 (97.8095)  time: 0.2442  data: 0.0018  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0018 (0.9475)  acc1: 80.8000 (84.0129)  acc5: 96.4000 (96.7742)  time: 0.2473  data: 0.0037  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1867 (1.0076)  acc1: 78.4000 (82.5756)  acc5: 94.0000 (96.2146)  time: 0.2526  data: 0.0140  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1913 (1.0218)  acc1: 77.6000 (81.9920)  acc5: 94.4000 (96.1280)  time: 0.2506  data: 0.0138  max mem: 42519
Test: Total time: 0:00:16 (0.3316 s / it)
* Acc@1 82.096 Acc@5 96.226 loss 1.015
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.10%
Epoch: [216]  [   0/2502]  eta: 1:22:28  lr: 0.000825  min_lr: 0.000825  loss: 2.5349 (2.5349)  weight_decay: 0.0500 (0.0500)  time: 1.9780  data: 1.4975  max mem: 42519
Epoch: [216]  [ 200/2502]  eta: 0:18:42  lr: 0.000824  min_lr: 0.000824  loss: 2.9613 (3.0024)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0085 (1.0217)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [216]  [ 400/2502]  eta: 0:16:58  lr: 0.000822  min_lr: 0.000822  loss: 3.0131 (3.0299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0380 (1.0101)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [216]  [ 600/2502]  eta: 0:15:20  lr: 0.000821  min_lr: 0.000821  loss: 3.2244 (3.0241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0185 (1.0209)  time: 0.4862  data: 0.0004  max mem: 42519
Epoch: [216]  [ 800/2502]  eta: 0:13:42  lr: 0.000819  min_lr: 0.000819  loss: 2.9066 (3.0316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0756 (1.0321)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [216]  [1000/2502]  eta: 0:12:05  lr: 0.000818  min_lr: 0.000818  loss: 3.0258 (3.0356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0795 (1.0346)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [216]  [1200/2502]  eta: 0:10:28  lr: 0.000817  min_lr: 0.000817  loss: 3.3251 (3.0361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9702 (1.0306)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [216]  [1400/2502]  eta: 0:08:51  lr: 0.000815  min_lr: 0.000815  loss: 3.1690 (3.0370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0787 (1.0353)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [216]  [1600/2502]  eta: 0:07:14  lr: 0.000814  min_lr: 0.000814  loss: 3.1618 (3.0332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0619 (1.0367)  time: 0.4915  data: 0.0006  max mem: 42519
Epoch: [216]  [1800/2502]  eta: 0:05:38  lr: 0.000812  min_lr: 0.000812  loss: 2.9768 (3.0286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9813 (1.0317)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [216]  [2000/2502]  eta: 0:04:01  lr: 0.000811  min_lr: 0.000811  loss: 3.0633 (3.0316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0083 (1.0380)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [216]  [2200/2502]  eta: 0:02:25  lr: 0.000809  min_lr: 0.000809  loss: 2.8071 (3.0321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9756 (1.0396)  time: 0.4871  data: 0.0004  max mem: 42519
Epoch: [216]  [2400/2502]  eta: 0:00:49  lr: 0.000808  min_lr: 0.000808  loss: 3.0756 (3.0294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9776 (1.0393)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [216]  [2501/2502]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 3.1827 (3.0319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0371)  time: 0.4395  data: 0.0010  max mem: 42519
Epoch: [216] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 3.1827 (3.0273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9994 (1.0371)
Test:  [ 0/50]  eta: 0:03:18  loss: 0.5662 (0.5662)  acc1: 96.0000 (96.0000)  acc5: 99.2000 (99.2000)  time: 3.9601  data: 3.6763  max mem: 42519
Test:  [10/50]  eta: 0:00:23  loss: 0.8884 (0.8762)  acc1: 87.2000 (86.8000)  acc5: 98.4000 (97.9636)  time: 0.5756  data: 0.3347  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8928 (0.9038)  acc1: 84.0000 (85.4667)  acc5: 97.6000 (97.8476)  time: 0.2396  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0532 (0.9901)  acc1: 81.6000 (83.6516)  acc5: 96.4000 (96.8774)  time: 0.2495  data: 0.0103  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1932 (1.0473)  acc1: 78.4000 (82.2829)  acc5: 94.8000 (96.3512)  time: 0.2470  data: 0.0101  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1972 (1.0621)  acc1: 77.6000 (81.8080)  acc5: 95.2000 (96.2640)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3189 s / it)
* Acc@1 82.344 Acc@5 96.314 loss 1.056
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.34%
Epoch: [217]  [   0/2502]  eta: 1:22:11  lr: 0.000807  min_lr: 0.000807  loss: 2.3171 (2.3171)  weight_decay: 0.0500 (0.0500)  time: 1.9709  data: 1.4726  max mem: 42519
Epoch: [217]  [ 200/2502]  eta: 0:18:45  lr: 0.000806  min_lr: 0.000806  loss: 2.9796 (2.9989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9929 (1.0512)  time: 0.4902  data: 0.0004  max mem: 42519
Epoch: [217]  [ 400/2502]  eta: 0:16:59  lr: 0.000804  min_lr: 0.000804  loss: 3.0572 (3.0066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0258 (1.0622)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [217]  [ 600/2502]  eta: 0:15:19  lr: 0.000803  min_lr: 0.000803  loss: 3.0547 (3.0120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9733 (1.0323)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [217]  [ 800/2502]  eta: 0:13:41  lr: 0.000801  min_lr: 0.000801  loss: 2.8905 (3.0179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0794 (1.0375)  time: 0.4854  data: 0.0005  max mem: 42519
Epoch: [217]  [1000/2502]  eta: 0:12:04  lr: 0.000800  min_lr: 0.000800  loss: 3.0176 (3.0066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9405 (1.0393)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [217]  [1200/2502]  eta: 0:10:27  lr: 0.000799  min_lr: 0.000799  loss: 3.0142 (2.9979)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0222 (1.0417)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [217]  [1400/2502]  eta: 0:08:50  lr: 0.000797  min_lr: 0.000797  loss: 2.8672 (2.9951)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0838 (1.0461)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [217]  [1600/2502]  eta: 0:07:14  lr: 0.000796  min_lr: 0.000796  loss: 2.9751 (2.9988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0757 (1.0534)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [217]  [1800/2502]  eta: 0:05:38  lr: 0.000794  min_lr: 0.000794  loss: 3.0565 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0439 (1.0554)  time: 0.4822  data: 0.0005  max mem: 42519
Epoch: [217]  [2000/2502]  eta: 0:04:01  lr: 0.000793  min_lr: 0.000793  loss: 3.1662 (3.0067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0700 (1.0588)  time: 0.4893  data: 0.0005  max mem: 42519
Epoch: [217]  [2200/2502]  eta: 0:02:25  lr: 0.000791  min_lr: 0.000791  loss: 3.0487 (3.0085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0388 (1.0566)  time: 0.4815  data: 0.0006  max mem: 42519
Epoch: [217]  [2400/2502]  eta: 0:00:49  lr: 0.000790  min_lr: 0.000790  loss: 3.1539 (3.0141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0140 (1.0550)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [217]  [2501/2502]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 3.2591 (3.0125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0489 (1.0569)  time: 0.4388  data: 0.0009  max mem: 42519
Epoch: [217] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 3.2591 (3.0146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0489 (1.0569)
Test:  [ 0/50]  eta: 0:03:43  loss: 0.5916 (0.5916)  acc1: 94.4000 (94.4000)  acc5: 98.8000 (98.8000)  time: 4.4650  data: 4.1797  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.8629 (0.8651)  acc1: 88.4000 (87.1636)  acc5: 98.0000 (97.8546)  time: 0.6216  data: 0.3804  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8759 (0.8866)  acc1: 84.4000 (86.0381)  acc5: 97.6000 (97.8667)  time: 0.2545  data: 0.0151  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0455 (0.9719)  acc1: 82.4000 (84.3484)  acc5: 96.4000 (96.8516)  time: 0.2547  data: 0.0151  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1975 (1.0300)  acc1: 79.2000 (82.6829)  acc5: 94.4000 (96.2829)  time: 0.2375  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1777 (1.0432)  acc1: 78.0000 (82.1520)  acc5: 94.8000 (96.2080)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3306 s / it)
* Acc@1 82.338 Acc@5 96.326 loss 1.036
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.34%
Epoch: [218]  [   0/2502]  eta: 1:29:00  lr: 0.000789  min_lr: 0.000789  loss: 2.3826 (2.3826)  weight_decay: 0.0500 (0.0500)  time: 2.1347  data: 1.6689  max mem: 42519
Epoch: [218]  [ 200/2502]  eta: 0:18:47  lr: 0.000788  min_lr: 0.000788  loss: 3.0677 (3.0139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9617 (1.0152)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [218]  [ 400/2502]  eta: 0:17:01  lr: 0.000786  min_lr: 0.000786  loss: 3.0156 (2.9997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1202 (1.0344)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [218]  [ 600/2502]  eta: 0:15:21  lr: 0.000785  min_lr: 0.000785  loss: 3.0126 (3.0099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9679 (1.0480)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [218]  [ 800/2502]  eta: 0:13:42  lr: 0.000784  min_lr: 0.000784  loss: 3.1254 (3.0107)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0484 (1.0496)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [218]  [1000/2502]  eta: 0:12:04  lr: 0.000782  min_lr: 0.000782  loss: 2.8702 (3.0108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0327 (1.0512)  time: 0.4776  data: 0.0004  max mem: 42519
Epoch: [218]  [1200/2502]  eta: 0:10:28  lr: 0.000781  min_lr: 0.000781  loss: 3.2233 (3.0115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9843 (1.0461)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [218]  [1400/2502]  eta: 0:08:51  lr: 0.000779  min_lr: 0.000779  loss: 3.1349 (3.0015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9753 (1.0446)  time: 0.4808  data: 0.0005  max mem: 42519
Epoch: [218]  [1600/2502]  eta: 0:07:14  lr: 0.000778  min_lr: 0.000778  loss: 3.1325 (3.0034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (1.0444)  time: 0.4860  data: 0.0005  max mem: 42519
Epoch: [218]  [1800/2502]  eta: 0:05:38  lr: 0.000776  min_lr: 0.000776  loss: 2.9111 (3.0090)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1245 (1.0507)  time: 0.4781  data: 0.0005  max mem: 42519
Epoch: [218]  [2000/2502]  eta: 0:04:01  lr: 0.000775  min_lr: 0.000775  loss: 2.9896 (3.0059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9472 (1.0494)  time: 0.4775  data: 0.0004  max mem: 42519
Epoch: [218]  [2200/2502]  eta: 0:02:25  lr: 0.000774  min_lr: 0.000774  loss: 2.8275 (3.0048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1599 (1.0544)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [218]  [2400/2502]  eta: 0:00:49  lr: 0.000772  min_lr: 0.000772  loss: 2.9502 (3.0027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0369 (1.0525)  time: 0.4821  data: 0.0005  max mem: 42519
Epoch: [218]  [2501/2502]  eta: 0:00:00  lr: 0.000771  min_lr: 0.000771  loss: 3.0496 (3.0028)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2044 (1.0601)  time: 0.4385  data: 0.0009  max mem: 42519
Epoch: [218] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.000771  min_lr: 0.000771  loss: 3.0496 (3.0102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2044 (1.0601)
Test:  [ 0/50]  eta: 0:03:50  loss: 0.4714 (0.4714)  acc1: 96.4000 (96.4000)  acc5: 99.2000 (99.2000)  time: 4.6060  data: 4.3370  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.8397 (0.8011)  acc1: 88.4000 (87.2364)  acc5: 98.4000 (97.8545)  time: 0.6360  data: 0.3947  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8406 (0.8347)  acc1: 84.4000 (85.7714)  acc5: 98.0000 (97.8857)  time: 0.2669  data: 0.0262  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9555 (0.9173)  acc1: 81.6000 (84.0258)  acc5: 96.8000 (96.8903)  time: 0.2669  data: 0.0263  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1276 (0.9760)  acc1: 78.4000 (82.3707)  acc5: 94.8000 (96.4293)  time: 0.2381  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1064 (0.9899)  acc1: 78.8000 (81.8960)  acc5: 95.2000 (96.3600)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3393 s / it)
* Acc@1 82.242 Acc@5 96.330 loss 0.982
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.34%
Epoch: [219]  [   0/2502]  eta: 1:44:14  lr: 0.000771  min_lr: 0.000771  loss: 3.0690 (3.0690)  weight_decay: 0.0500 (0.0500)  time: 2.4999  data: 1.3586  max mem: 42519
Epoch: [219]  [ 200/2502]  eta: 0:18:48  lr: 0.000770  min_lr: 0.000770  loss: 3.0286 (2.9746)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0517 (1.1136)  time: 0.4898  data: 0.0003  max mem: 42519
Epoch: [219]  [ 400/2502]  eta: 0:17:01  lr: 0.000769  min_lr: 0.000769  loss: 3.2058 (2.9967)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0075 (1.0725)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [219]  [ 600/2502]  eta: 0:15:21  lr: 0.000767  min_lr: 0.000767  loss: 2.8896 (2.9930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0107 (1.0570)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [219]  [ 800/2502]  eta: 0:13:42  lr: 0.000766  min_lr: 0.000766  loss: 3.0656 (2.9865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9917 (1.0496)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [219]  [1000/2502]  eta: 0:12:05  lr: 0.000764  min_lr: 0.000764  loss: 3.0561 (2.9931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0945 (1.0654)  time: 0.4897  data: 0.0006  max mem: 42519
Epoch: [219]  [1200/2502]  eta: 0:10:28  lr: 0.000763  min_lr: 0.000763  loss: 2.8443 (2.9880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9852 (1.0610)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [219]  [1400/2502]  eta: 0:08:51  lr: 0.000762  min_lr: 0.000762  loss: 2.9086 (2.9870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0017 (1.0565)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [219]  [1600/2502]  eta: 0:07:15  lr: 0.000760  min_lr: 0.000760  loss: 3.2494 (2.9893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1609 (1.0549)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [219]  [1800/2502]  eta: 0:05:38  lr: 0.000759  min_lr: 0.000759  loss: 3.1672 (2.9889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0072 (1.0601)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [219]  [2000/2502]  eta: 0:04:01  lr: 0.000757  min_lr: 0.000757  loss: 3.0055 (2.9881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0792 (1.0622)  time: 0.4884  data: 0.0005  max mem: 42519
Epoch: [219]  [2200/2502]  eta: 0:02:25  lr: 0.000756  min_lr: 0.000756  loss: 2.9216 (2.9906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9890 (1.0652)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [219]  [2400/2502]  eta: 0:00:49  lr: 0.000755  min_lr: 0.000755  loss: 3.0760 (2.9897)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0599 (1.0650)  time: 0.4794  data: 0.0003  max mem: 42519
Epoch: [219]  [2501/2502]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 3.2805 (2.9889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0641 (1.0654)  time: 0.4466  data: 0.0009  max mem: 42519
Epoch: [219] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 3.2805 (3.0035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0641 (1.0654)
Test:  [ 0/50]  eta: 0:03:22  loss: 0.6084 (0.6084)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 4.0578  data: 3.7845  max mem: 42519
Test:  [10/50]  eta: 0:00:23  loss: 0.8477 (0.8742)  acc1: 86.8000 (86.6909)  acc5: 98.4000 (98.0000)  time: 0.5843  data: 0.3445  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.9124 (0.9015)  acc1: 84.8000 (85.7524)  acc5: 98.0000 (97.9238)  time: 0.2542  data: 0.0131  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0411 (0.9851)  acc1: 82.4000 (83.8839)  acc5: 96.4000 (96.8774)  time: 0.2577  data: 0.0166  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2088 (1.0442)  acc1: 79.2000 (82.3512)  acc5: 94.0000 (96.3122)  time: 0.2405  data: 0.0037  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2007 (1.0583)  acc1: 78.0000 (81.7920)  acc5: 94.8000 (96.1920)  time: 0.2397  data: 0.0030  max mem: 42519
Test: Total time: 0:00:16 (0.3241 s / it)
* Acc@1 82.132 Acc@5 96.310 loss 1.051
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.34%
Epoch: [220]  [   0/2502]  eta: 1:31:40  lr: 0.000754  min_lr: 0.000754  loss: 3.0340 (3.0340)  weight_decay: 0.0500 (0.0500)  time: 2.1983  data: 1.5005  max mem: 42519
Epoch: [220]  [ 200/2502]  eta: 0:18:48  lr: 0.000752  min_lr: 0.000752  loss: 3.0878 (2.9572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0920 (1.0884)  time: 0.4851  data: 0.0004  max mem: 42519
Epoch: [220]  [ 400/2502]  eta: 0:17:00  lr: 0.000751  min_lr: 0.000751  loss: 3.1399 (2.9663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0415 (1.0723)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [220]  [ 600/2502]  eta: 0:15:20  lr: 0.000750  min_lr: 0.000750  loss: 3.0976 (2.9839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0098 (1.0749)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [220]  [ 800/2502]  eta: 0:13:41  lr: 0.000748  min_lr: 0.000748  loss: 2.9735 (2.9843)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1286 (1.0795)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [220]  [1000/2502]  eta: 0:12:03  lr: 0.000747  min_lr: 0.000747  loss: 3.0979 (2.9890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0510 (1.0766)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [220]  [1200/2502]  eta: 0:10:27  lr: 0.000745  min_lr: 0.000745  loss: 2.9834 (2.9961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0256 (1.0724)  time: 0.4787  data: 0.0003  max mem: 42519
Epoch: [220]  [1400/2502]  eta: 0:08:50  lr: 0.000744  min_lr: 0.000744  loss: 3.2577 (2.9974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9790 (1.0647)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [220]  [1600/2502]  eta: 0:07:14  lr: 0.000743  min_lr: 0.000743  loss: 3.1253 (2.9983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0608 (1.0678)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [220]  [1800/2502]  eta: 0:05:38  lr: 0.000741  min_lr: 0.000741  loss: 3.2511 (2.9954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0644 (1.0708)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [220]  [2000/2502]  eta: 0:04:01  lr: 0.000740  min_lr: 0.000740  loss: 3.1203 (2.9997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0108 (1.0689)  time: 0.4771  data: 0.0004  max mem: 42519
Epoch: [220]  [2200/2502]  eta: 0:02:25  lr: 0.000738  min_lr: 0.000738  loss: 3.1240 (2.9974)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0509 (1.0702)  time: 0.4772  data: 0.0004  max mem: 42519
Epoch: [220]  [2400/2502]  eta: 0:00:49  lr: 0.000737  min_lr: 0.000737  loss: 3.2133 (2.9988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0217 (1.0721)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [220]  [2501/2502]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.9871 (2.9985)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0868 (1.0739)  time: 0.4387  data: 0.0009  max mem: 42519
Epoch: [220] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.9871 (2.9994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0868 (1.0739)
Test:  [ 0/50]  eta: 0:04:18  loss: 0.5385 (0.5385)  acc1: 96.4000 (96.4000)  acc5: 99.2000 (99.2000)  time: 5.1783  data: 4.8960  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.8375 (0.8180)  acc1: 86.4000 (87.5636)  acc5: 98.4000 (97.8546)  time: 0.6939  data: 0.4528  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8441 (0.8514)  acc1: 84.4000 (86.3238)  acc5: 98.0000 (97.7905)  time: 0.2433  data: 0.0058  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9501 (0.9341)  acc1: 80.8000 (84.3742)  acc5: 96.8000 (96.9936)  time: 0.2398  data: 0.0018  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1368 (0.9887)  acc1: 78.4000 (82.8293)  acc5: 94.4000 (96.4000)  time: 0.2378  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1366 (1.0052)  acc1: 79.2000 (82.2800)  acc5: 94.4000 (96.2640)  time: 0.2383  data: 0.0001  max mem: 42519
Test: Total time: 0:00:17 (0.3410 s / it)
* Acc@1 82.308 Acc@5 96.296 loss 1.002
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.34%
Epoch: [221]  [   0/2502]  eta: 1:26:20  lr: 0.000736  min_lr: 0.000736  loss: 2.4480 (2.4480)  weight_decay: 0.0500 (0.0500)  time: 2.0707  data: 1.5937  max mem: 42519
Epoch: [221]  [ 200/2502]  eta: 0:18:39  lr: 0.000735  min_lr: 0.000735  loss: 2.9235 (3.0285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9922 (1.0857)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [221]  [ 400/2502]  eta: 0:16:59  lr: 0.000734  min_lr: 0.000734  loss: 3.1482 (3.0126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0059 (1.0682)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [221]  [ 600/2502]  eta: 0:15:18  lr: 0.000732  min_lr: 0.000732  loss: 3.0677 (2.9992)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [221]  [ 800/2502]  eta: 0:13:41  lr: 0.000731  min_lr: 0.000731  loss: 3.1251 (3.0044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0908 (nan)  time: 0.4830  data: 0.0004  max mem: 42519
Epoch: [221]  [1000/2502]  eta: 0:12:05  lr: 0.000729  min_lr: 0.000729  loss: 2.8683 (2.9992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0546 (nan)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [221]  [1200/2502]  eta: 0:10:28  lr: 0.000728  min_lr: 0.000728  loss: 3.0130 (3.0004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0674 (nan)  time: 0.4831  data: 0.0005  max mem: 42519
Epoch: [221]  [1400/2502]  eta: 0:08:51  lr: 0.000727  min_lr: 0.000727  loss: 2.9672 (2.9947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0898 (nan)  time: 0.4860  data: 0.0004  max mem: 42519
Epoch: [221]  [1600/2502]  eta: 0:07:15  lr: 0.000725  min_lr: 0.000725  loss: 3.2292 (2.9917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1080 (nan)  time: 0.4817  data: 0.0005  max mem: 42519
Epoch: [221]  [1800/2502]  eta: 0:05:38  lr: 0.000724  min_lr: 0.000724  loss: 2.7806 (2.9852)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0274 (nan)  time: 0.4780  data: 0.0005  max mem: 42519
Epoch: [221]  [2000/2502]  eta: 0:04:02  lr: 0.000723  min_lr: 0.000723  loss: 3.0691 (2.9898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0175 (nan)  time: 0.4902  data: 0.0005  max mem: 42519
Epoch: [221]  [2200/2502]  eta: 0:02:25  lr: 0.000721  min_lr: 0.000721  loss: 3.2588 (2.9910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0432 (nan)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [221]  [2400/2502]  eta: 0:00:49  lr: 0.000720  min_lr: 0.000720  loss: 2.9397 (2.9919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0575 (nan)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [221]  [2501/2502]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 3.2351 (2.9957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1012 (nan)  time: 0.4354  data: 0.0007  max mem: 42519
Epoch: [221] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 3.2351 (3.0018)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1012 (nan)
Test:  [ 0/50]  eta: 0:04:13  loss: 0.6454 (0.6454)  acc1: 93.2000 (93.2000)  acc5: 99.2000 (99.2000)  time: 5.0703  data: 4.7819  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.9287 (0.9044)  acc1: 88.0000 (87.2727)  acc5: 98.4000 (97.7818)  time: 0.6764  data: 0.4351  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9536 (0.9332)  acc1: 85.2000 (86.2095)  acc5: 98.0000 (97.7714)  time: 0.2408  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0790 (1.0170)  acc1: 82.8000 (84.3355)  acc5: 96.0000 (96.7871)  time: 0.2615  data: 0.0198  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2320 (1.0739)  acc1: 78.0000 (82.6927)  acc5: 94.0000 (96.2634)  time: 0.2574  data: 0.0196  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2320 (1.0874)  acc1: 78.0000 (82.2080)  acc5: 94.8000 (96.1840)  time: 0.2553  data: 0.0189  max mem: 42519
Test: Total time: 0:00:17 (0.3450 s / it)
* Acc@1 82.266 Acc@5 96.186 loss 1.085
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.34%
Epoch: [222]  [   0/2502]  eta: 1:35:28  lr: 0.000719  min_lr: 0.000719  loss: 3.2609 (3.2609)  weight_decay: 0.0500 (0.0500)  time: 2.2895  data: 1.6914  max mem: 42519
Epoch: [222]  [ 200/2502]  eta: 0:18:48  lr: 0.000718  min_lr: 0.000718  loss: 2.7277 (2.9855)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.1104)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [222]  [ 400/2502]  eta: 0:17:00  lr: 0.000716  min_lr: 0.000716  loss: 3.1891 (3.0011)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0883 (1.1016)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [222]  [ 600/2502]  eta: 0:15:22  lr: 0.000715  min_lr: 0.000715  loss: 3.1914 (3.0086)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0712 (1.0949)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [222]  [ 800/2502]  eta: 0:13:44  lr: 0.000714  min_lr: 0.000714  loss: 3.1648 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0908 (1.0836)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [222]  [1000/2502]  eta: 0:12:06  lr: 0.000712  min_lr: 0.000712  loss: 3.1038 (3.0023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (1.0819)  time: 0.4838  data: 0.0005  max mem: 42519
Epoch: [222]  [1200/2502]  eta: 0:10:30  lr: 0.000711  min_lr: 0.000711  loss: 2.8678 (2.9991)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0705 (1.0831)  time: 0.4895  data: 0.0006  max mem: 42519
Epoch: [222]  [1400/2502]  eta: 0:08:52  lr: 0.000709  min_lr: 0.000709  loss: 3.2266 (3.0023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1216 (1.0883)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [222]  [1600/2502]  eta: 0:07:15  lr: 0.000708  min_lr: 0.000708  loss: 3.2110 (2.9905)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0098 (1.0890)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [222]  [1800/2502]  eta: 0:05:39  lr: 0.000707  min_lr: 0.000707  loss: 3.1464 (2.9930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9724 (1.0870)  time: 0.4805  data: 0.0006  max mem: 42519
Epoch: [222]  [2000/2502]  eta: 0:04:02  lr: 0.000705  min_lr: 0.000705  loss: 3.0746 (2.9935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.0873)  time: 0.4801  data: 0.0006  max mem: 42519
Epoch: [222]  [2200/2502]  eta: 0:02:25  lr: 0.000704  min_lr: 0.000704  loss: 2.9272 (2.9940)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0398 (1.0915)  time: 0.4876  data: 0.0005  max mem: 42519
Epoch: [222]  [2400/2502]  eta: 0:00:49  lr: 0.000703  min_lr: 0.000703  loss: 3.1787 (2.9957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1342 (1.0940)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [222]  [2501/2502]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.8976 (2.9925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0621 (1.0947)  time: 0.4404  data: 0.0010  max mem: 42519
Epoch: [222] Total time: 0:20:07 (0.4825 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.8976 (2.9853)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0621 (1.0947)
Test:  [ 0/50]  eta: 0:02:17  loss: 0.5268 (0.5268)  acc1: 94.8000 (94.8000)  acc5: 98.8000 (98.8000)  time: 2.7472  data: 2.4595  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8418 (0.8213)  acc1: 89.2000 (86.9818)  acc5: 98.0000 (97.7455)  time: 0.4832  data: 0.2398  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8518 (0.8433)  acc1: 84.8000 (85.9048)  acc5: 97.6000 (97.7905)  time: 0.2537  data: 0.0092  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9572 (0.9196)  acc1: 82.4000 (84.0387)  acc5: 96.4000 (96.9677)  time: 0.2484  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1518 (0.9774)  acc1: 78.0000 (82.4781)  acc5: 94.8000 (96.4683)  time: 0.2446  data: 0.0033  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1398 (0.9875)  acc1: 78.4000 (82.2080)  acc5: 94.8000 (96.3520)  time: 0.2400  data: 0.0031  max mem: 42519
Test: Total time: 0:00:14 (0.2989 s / it)
* Acc@1 82.324 Acc@5 96.280 loss 0.985
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.34%
Epoch: [223]  [   0/2502]  eta: 1:24:37  lr: 0.000702  min_lr: 0.000702  loss: 2.9984 (2.9984)  weight_decay: 0.0500 (0.0500)  time: 2.0292  data: 1.4154  max mem: 42519
Epoch: [223]  [ 200/2502]  eta: 0:18:43  lr: 0.000701  min_lr: 0.000701  loss: 2.9372 (2.9605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.0959)  time: 0.4857  data: 0.0004  max mem: 42519
Epoch: [223]  [ 400/2502]  eta: 0:16:58  lr: 0.000699  min_lr: 0.000699  loss: 3.0149 (2.9464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.0938)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [223]  [ 600/2502]  eta: 0:15:18  lr: 0.000698  min_lr: 0.000698  loss: 2.8494 (2.9427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0002 (1.0854)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [223]  [ 800/2502]  eta: 0:13:41  lr: 0.000696  min_lr: 0.000696  loss: 3.1281 (2.9488)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0039 (1.0789)  time: 0.4853  data: 0.0004  max mem: 42519
Epoch: [223]  [1000/2502]  eta: 0:12:04  lr: 0.000695  min_lr: 0.000695  loss: 3.0197 (2.9636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0834 (1.0806)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [223]  [1200/2502]  eta: 0:10:27  lr: 0.000694  min_lr: 0.000694  loss: 3.0209 (2.9600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0598 (1.0879)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [223]  [1400/2502]  eta: 0:08:51  lr: 0.000692  min_lr: 0.000692  loss: 3.0592 (2.9611)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0967 (1.0865)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [223]  [1600/2502]  eta: 0:07:14  lr: 0.000691  min_lr: 0.000691  loss: 3.1239 (2.9701)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0211 (1.0847)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [223]  [1800/2502]  eta: 0:05:38  lr: 0.000690  min_lr: 0.000690  loss: 3.0538 (2.9693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0802 (1.0868)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [223]  [2000/2502]  eta: 0:04:01  lr: 0.000688  min_lr: 0.000688  loss: 3.1610 (2.9740)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0956 (1.0951)  time: 0.4815  data: 0.0004  max mem: 42519
Epoch: [223]  [2200/2502]  eta: 0:02:25  lr: 0.000687  min_lr: 0.000687  loss: 3.0162 (2.9750)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0154 (1.0938)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [223]  [2400/2502]  eta: 0:00:49  lr: 0.000686  min_lr: 0.000686  loss: 3.0170 (2.9745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0598 (1.0950)  time: 0.4819  data: 0.0004  max mem: 42519
Epoch: [223]  [2501/2502]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 2.9986 (2.9743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0700 (1.0962)  time: 0.4428  data: 0.0008  max mem: 42519
Epoch: [223] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 2.9986 (2.9925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0700 (1.0962)
Test:  [ 0/50]  eta: 0:03:36  loss: 0.4906 (0.4906)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 4.3218  data: 4.0506  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.7859 (0.7706)  acc1: 86.0000 (87.4182)  acc5: 98.4000 (97.8182)  time: 0.6085  data: 0.3686  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8082 (0.7950)  acc1: 83.6000 (86.0000)  acc5: 98.0000 (97.8476)  time: 0.2378  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9008 (0.8729)  acc1: 82.4000 (84.2968)  acc5: 96.0000 (97.1226)  time: 0.2386  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0953 (0.9299)  acc1: 78.0000 (82.8390)  acc5: 95.6000 (96.5951)  time: 0.2380  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0819 (0.9421)  acc1: 78.0000 (82.3600)  acc5: 95.2000 (96.5040)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3225 s / it)
* Acc@1 82.578 Acc@5 96.394 loss 0.941
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.58%
Epoch: [224]  [   0/2502]  eta: 1:31:43  lr: 0.000685  min_lr: 0.000685  loss: 3.2510 (3.2510)  weight_decay: 0.0500 (0.0500)  time: 2.1996  data: 1.7139  max mem: 42519
Epoch: [224]  [ 200/2502]  eta: 0:18:48  lr: 0.000684  min_lr: 0.000684  loss: 3.0547 (2.9713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9976 (1.0690)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [224]  [ 400/2502]  eta: 0:17:01  lr: 0.000682  min_lr: 0.000682  loss: 3.0108 (2.9705)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1134 (1.0975)  time: 0.4823  data: 0.0004  max mem: 42519
Epoch: [224]  [ 600/2502]  eta: 0:15:21  lr: 0.000681  min_lr: 0.000681  loss: 2.9855 (2.9770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1136 (1.1194)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [224]  [ 800/2502]  eta: 0:13:42  lr: 0.000680  min_lr: 0.000680  loss: 2.9219 (2.9811)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0657 (1.1164)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [224]  [1000/2502]  eta: 0:12:04  lr: 0.000678  min_lr: 0.000678  loss: 3.2902 (2.9763)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0513 (1.1068)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [224]  [1200/2502]  eta: 0:10:28  lr: 0.000677  min_lr: 0.000677  loss: 3.1732 (2.9833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0906 (1.1099)  time: 0.4856  data: 0.0004  max mem: 42519
Epoch: [224]  [1400/2502]  eta: 0:08:51  lr: 0.000676  min_lr: 0.000676  loss: 3.0770 (2.9860)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0248 (1.1089)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [224]  [1600/2502]  eta: 0:07:14  lr: 0.000674  min_lr: 0.000674  loss: 3.0045 (2.9859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1851 (1.1096)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [224]  [1800/2502]  eta: 0:05:38  lr: 0.000673  min_lr: 0.000673  loss: 2.8959 (2.9876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0968 (1.1067)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [224]  [2000/2502]  eta: 0:04:01  lr: 0.000671  min_lr: 0.000671  loss: 3.3239 (2.9856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1298 (1.1122)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [224]  [2200/2502]  eta: 0:02:25  lr: 0.000670  min_lr: 0.000670  loss: 2.9398 (2.9801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0342 (1.1126)  time: 0.4898  data: 0.0005  max mem: 42519
Epoch: [224]  [2400/2502]  eta: 0:00:49  lr: 0.000669  min_lr: 0.000669  loss: 3.1518 (2.9828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0456 (1.1123)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [224]  [2501/2502]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 3.1374 (2.9829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.1136)  time: 0.4318  data: 0.0006  max mem: 42519
Epoch: [224] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 3.1374 (2.9726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.1136)
Test:  [ 0/50]  eta: 0:04:53  loss: 0.6029 (0.6029)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 5.8618  data: 5.5834  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.9342 (0.9191)  acc1: 88.8000 (87.7455)  acc5: 98.4000 (98.1818)  time: 0.7485  data: 0.5080  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.9368 (0.9458)  acc1: 84.4000 (86.4381)  acc5: 97.6000 (97.8667)  time: 0.2408  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.1027 (1.0285)  acc1: 82.0000 (84.5677)  acc5: 96.8000 (96.9548)  time: 0.2691  data: 0.0239  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2474 (1.0817)  acc1: 78.8000 (82.9659)  acc5: 94.8000 (96.5366)  time: 0.2655  data: 0.0237  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2474 (1.1014)  acc1: 78.0000 (82.3680)  acc5: 94.8000 (96.4400)  time: 0.2603  data: 0.0234  max mem: 42519
Test: Total time: 0:00:18 (0.3645 s / it)
* Acc@1 82.530 Acc@5 96.354 loss 1.104
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.58%
Epoch: [225]  [   0/2502]  eta: 1:34:10  lr: 0.000668  min_lr: 0.000668  loss: 3.3963 (3.3963)  weight_decay: 0.0500 (0.0500)  time: 2.2583  data: 1.6908  max mem: 42519
Epoch: [225]  [ 200/2502]  eta: 0:18:44  lr: 0.000667  min_lr: 0.000667  loss: 2.9950 (2.9987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0580 (1.1273)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [225]  [ 400/2502]  eta: 0:17:00  lr: 0.000665  min_lr: 0.000665  loss: 3.1602 (2.9998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0681 (1.1234)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [225]  [ 600/2502]  eta: 0:15:21  lr: 0.000664  min_lr: 0.000664  loss: 3.1713 (2.9881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0711 (1.1058)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [225]  [ 800/2502]  eta: 0:13:42  lr: 0.000663  min_lr: 0.000663  loss: 2.8980 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1567 (1.1270)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [225]  [1000/2502]  eta: 0:12:05  lr: 0.000661  min_lr: 0.000661  loss: 3.0383 (2.9802)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0246 (1.1107)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [225]  [1200/2502]  eta: 0:10:28  lr: 0.000660  min_lr: 0.000660  loss: 2.9854 (2.9732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0859 (1.1074)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [225]  [1400/2502]  eta: 0:08:51  lr: 0.000659  min_lr: 0.000659  loss: 2.9805 (2.9769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0390 (1.1117)  time: 0.4842  data: 0.0005  max mem: 42519
Epoch: [225]  [1600/2502]  eta: 0:07:14  lr: 0.000657  min_lr: 0.000657  loss: 3.0410 (2.9769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0217 (nan)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [225]  [1800/2502]  eta: 0:05:38  lr: 0.000656  min_lr: 0.000656  loss: 2.9567 (2.9777)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0092 (nan)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [225]  [2000/2502]  eta: 0:04:01  lr: 0.000655  min_lr: 0.000655  loss: 3.0155 (2.9739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0572 (nan)  time: 0.4774  data: 0.0005  max mem: 42519
Epoch: [225]  [2200/2502]  eta: 0:02:25  lr: 0.000653  min_lr: 0.000653  loss: 3.0361 (2.9720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0675 (nan)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [225]  [2400/2502]  eta: 0:00:49  lr: 0.000652  min_lr: 0.000652  loss: 2.9731 (2.9716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0772 (nan)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [225]  [2501/2502]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.8846 (2.9697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9797 (nan)  time: 0.4452  data: 0.0009  max mem: 42519
Epoch: [225] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.8846 (2.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9797 (nan)
Test:  [ 0/50]  eta: 0:03:21  loss: 0.5395 (0.5395)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 4.0320  data: 3.7665  max mem: 42519
Test:  [10/50]  eta: 0:00:23  loss: 0.7938 (0.8086)  acc1: 90.0000 (87.9273)  acc5: 98.4000 (97.7818)  time: 0.5935  data: 0.3543  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8286 (0.8450)  acc1: 85.2000 (86.4762)  acc5: 98.4000 (97.8667)  time: 0.2457  data: 0.0069  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9871 (0.9264)  acc1: 82.0000 (84.5290)  acc5: 96.4000 (97.0323)  time: 0.2447  data: 0.0011  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1435 (0.9800)  acc1: 79.2000 (83.0049)  acc5: 95.2000 (96.6342)  time: 0.2499  data: 0.0084  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1661 (0.9964)  acc1: 78.4000 (82.3680)  acc5: 95.2000 (96.5680)  time: 0.2448  data: 0.0077  max mem: 42519
Test: Total time: 0:00:16 (0.3236 s / it)
* Acc@1 82.604 Acc@5 96.382 loss 0.993
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.60%
Epoch: [226]  [   0/2502]  eta: 1:29:59  lr: 0.000651  min_lr: 0.000651  loss: 3.5392 (3.5392)  weight_decay: 0.0500 (0.0500)  time: 2.1580  data: 1.6613  max mem: 42519
Epoch: [226]  [ 200/2502]  eta: 0:18:46  lr: 0.000650  min_lr: 0.000650  loss: 3.1493 (2.9390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0097 (1.0456)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [226]  [ 400/2502]  eta: 0:17:00  lr: 0.000649  min_lr: 0.000649  loss: 3.0369 (2.9506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1232 (1.0747)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [226]  [ 600/2502]  eta: 0:15:21  lr: 0.000648  min_lr: 0.000648  loss: 3.2152 (2.9522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0930 (1.0819)  time: 0.4833  data: 0.0005  max mem: 42519
Epoch: [226]  [ 800/2502]  eta: 0:13:43  lr: 0.000646  min_lr: 0.000646  loss: 3.0210 (2.9438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1435 (1.1024)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [226]  [1000/2502]  eta: 0:12:05  lr: 0.000645  min_lr: 0.000645  loss: 3.0521 (2.9502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0897 (1.0988)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [226]  [1200/2502]  eta: 0:10:28  lr: 0.000644  min_lr: 0.000644  loss: 3.1500 (2.9525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0685 (1.1036)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [226]  [1400/2502]  eta: 0:08:51  lr: 0.000642  min_lr: 0.000642  loss: 3.1292 (2.9534)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1155 (1.1064)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [226]  [1600/2502]  eta: 0:07:14  lr: 0.000641  min_lr: 0.000641  loss: 3.0966 (2.9538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0442 (1.1062)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [226]  [1800/2502]  eta: 0:05:38  lr: 0.000640  min_lr: 0.000640  loss: 2.9739 (2.9591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1111 (1.1109)  time: 0.4880  data: 0.0004  max mem: 42519
Epoch: [226]  [2000/2502]  eta: 0:04:01  lr: 0.000638  min_lr: 0.000638  loss: 3.0705 (2.9627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (1.1102)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [226]  [2200/2502]  eta: 0:02:25  lr: 0.000637  min_lr: 0.000637  loss: 3.0413 (2.9621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1031 (1.1123)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [226]  [2400/2502]  eta: 0:00:49  lr: 0.000636  min_lr: 0.000636  loss: 3.3113 (2.9672)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0615 (1.1118)  time: 0.4776  data: 0.0004  max mem: 42519
Epoch: [226]  [2501/2502]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 2.9030 (2.9681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1051 (1.1134)  time: 0.4347  data: 0.0007  max mem: 42519
Epoch: [226] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 2.9030 (2.9675)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1051 (1.1134)
Test:  [ 0/50]  eta: 0:03:47  loss: 0.5102 (0.5102)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 4.5413  data: 4.2541  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.8405 (0.8266)  acc1: 88.0000 (87.4546)  acc5: 98.4000 (97.9636)  time: 0.6400  data: 0.4005  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8624 (0.8450)  acc1: 84.4000 (86.4191)  acc5: 98.0000 (97.9810)  time: 0.2440  data: 0.0079  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9859 (0.9267)  acc1: 81.2000 (84.3871)  acc5: 96.8000 (97.1613)  time: 0.2741  data: 0.0354  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1563 (0.9843)  acc1: 78.0000 (82.8390)  acc5: 95.2000 (96.6146)  time: 0.2725  data: 0.0352  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1563 (0.9971)  acc1: 78.0000 (82.3520)  acc5: 94.8000 (96.5040)  time: 0.2698  data: 0.0350  max mem: 42519
Test: Total time: 0:00:17 (0.3414 s / it)
* Acc@1 82.592 Acc@5 96.378 loss 0.997
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.60%
Epoch: [227]  [   0/2502]  eta: 1:37:24  lr: 0.000635  min_lr: 0.000635  loss: 1.9486 (1.9486)  weight_decay: 0.0500 (0.0500)  time: 2.3358  data: 1.8531  max mem: 42519
Epoch: [227]  [ 200/2502]  eta: 0:18:44  lr: 0.000634  min_lr: 0.000634  loss: 3.2081 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0174 (1.0974)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [227]  [ 400/2502]  eta: 0:17:02  lr: 0.000632  min_lr: 0.000632  loss: 2.9473 (2.9708)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0742 (1.1091)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [227]  [ 600/2502]  eta: 0:15:21  lr: 0.000631  min_lr: 0.000631  loss: 3.1069 (2.9771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.1100)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [227]  [ 800/2502]  eta: 0:13:42  lr: 0.000630  min_lr: 0.000630  loss: 2.9430 (2.9707)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0356 (1.1132)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [227]  [1000/2502]  eta: 0:12:06  lr: 0.000628  min_lr: 0.000628  loss: 3.0988 (2.9738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0261 (1.1083)  time: 0.4834  data: 0.0006  max mem: 42519
Epoch: [227]  [1200/2502]  eta: 0:10:28  lr: 0.000627  min_lr: 0.000627  loss: 2.8580 (2.9791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0368 (1.1084)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [227]  [1400/2502]  eta: 0:08:51  lr: 0.000626  min_lr: 0.000626  loss: 3.0759 (2.9857)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1482 (1.1142)  time: 0.4846  data: 0.0005  max mem: 42519
Epoch: [227]  [1600/2502]  eta: 0:07:15  lr: 0.000625  min_lr: 0.000625  loss: 2.9076 (2.9796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1480 (1.1158)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [227]  [1800/2502]  eta: 0:05:38  lr: 0.000623  min_lr: 0.000623  loss: 3.0739 (2.9783)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1069 (1.1196)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [227]  [2000/2502]  eta: 0:04:02  lr: 0.000622  min_lr: 0.000622  loss: 3.0939 (2.9719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0518 (1.1217)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [227]  [2200/2502]  eta: 0:02:25  lr: 0.000621  min_lr: 0.000621  loss: 3.0574 (2.9717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1237 (1.1207)  time: 0.4821  data: 0.0004  max mem: 42519
Epoch: [227]  [2400/2502]  eta: 0:00:49  lr: 0.000619  min_lr: 0.000619  loss: 2.9503 (2.9705)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1359 (1.1222)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [227]  [2501/2502]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 3.2096 (2.9739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2169 (1.1262)  time: 0.4498  data: 0.0009  max mem: 42519
Epoch: [227] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 3.2096 (2.9597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2169 (1.1262)
Test:  [ 0/50]  eta: 0:02:27  loss: 0.5868 (0.5868)  acc1: 96.0000 (96.0000)  acc5: 99.2000 (99.2000)  time: 2.9514  data: 2.6752  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.9147 (0.8803)  acc1: 87.6000 (88.1091)  acc5: 98.8000 (97.9273)  time: 0.4918  data: 0.2506  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.9197 (0.9027)  acc1: 84.4000 (86.8000)  acc5: 98.4000 (97.9429)  time: 0.2454  data: 0.0044  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0515 (0.9824)  acc1: 82.8000 (84.7613)  acc5: 96.8000 (97.1097)  time: 0.2414  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1949 (1.0393)  acc1: 78.4000 (83.3463)  acc5: 95.2000 (96.6342)  time: 0.2376  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1908 (1.0531)  acc1: 77.6000 (82.5680)  acc5: 95.6000 (96.5760)  time: 0.2396  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2979 s / it)
* Acc@1 82.566 Acc@5 96.378 loss 1.056
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.60%
Epoch: [228]  [   0/2502]  eta: 1:42:38  lr: 0.000619  min_lr: 0.000619  loss: 3.4942 (3.4942)  weight_decay: 0.0500 (0.0500)  time: 2.4613  data: 1.5923  max mem: 42519
Epoch: [228]  [ 200/2502]  eta: 0:18:54  lr: 0.000617  min_lr: 0.000617  loss: 3.0523 (2.9271)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (nan)  time: 0.4812  data: 0.0008  max mem: 42519
Epoch: [228]  [ 400/2502]  eta: 0:17:03  lr: 0.000616  min_lr: 0.000616  loss: 3.1028 (2.9406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2513 (nan)  time: 0.4798  data: 0.0006  max mem: 42519
Epoch: [228]  [ 600/2502]  eta: 0:15:23  lr: 0.000615  min_lr: 0.000615  loss: 3.0672 (2.9503)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1530 (nan)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [228]  [ 800/2502]  eta: 0:13:44  lr: 0.000614  min_lr: 0.000614  loss: 2.8711 (2.9565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0932 (nan)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [228]  [1000/2502]  eta: 0:12:06  lr: 0.000612  min_lr: 0.000612  loss: 3.0272 (2.9513)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1279 (nan)  time: 0.4804  data: 0.0006  max mem: 42519
Epoch: [228]  [1200/2502]  eta: 0:10:29  lr: 0.000611  min_lr: 0.000611  loss: 3.1337 (2.9569)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1194 (nan)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [228]  [1400/2502]  eta: 0:08:52  lr: 0.000610  min_lr: 0.000610  loss: 3.0767 (2.9575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0904 (nan)  time: 0.4809  data: 0.0007  max mem: 42519
Epoch: [228]  [1600/2502]  eta: 0:07:15  lr: 0.000608  min_lr: 0.000608  loss: 3.1840 (2.9635)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1943 (nan)  time: 0.4791  data: 0.0006  max mem: 42519
Epoch: [228]  [1800/2502]  eta: 0:05:38  lr: 0.000607  min_lr: 0.000607  loss: 2.8784 (2.9628)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1643 (nan)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [228]  [2000/2502]  eta: 0:04:02  lr: 0.000606  min_lr: 0.000606  loss: 2.7867 (2.9626)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0330 (nan)  time: 0.4820  data: 0.0009  max mem: 42519
Epoch: [228]  [2200/2502]  eta: 0:02:25  lr: 0.000605  min_lr: 0.000605  loss: 3.0178 (2.9606)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1476 (nan)  time: 0.4794  data: 0.0007  max mem: 42519
Epoch: [228]  [2400/2502]  eta: 0:00:49  lr: 0.000603  min_lr: 0.000603  loss: 2.9873 (2.9563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1353 (nan)  time: 0.4800  data: 0.0006  max mem: 42519
Epoch: [228]  [2501/2502]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 3.0659 (2.9569)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1248 (nan)  time: 0.4383  data: 0.0014  max mem: 42519
Epoch: [228] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 3.0659 (2.9515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1248 (nan)
Test:  [ 0/50]  eta: 0:02:32  loss: 0.5292 (0.5292)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 3.0402  data: 2.7586  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.8610 (0.8423)  acc1: 86.8000 (87.4182)  acc5: 98.0000 (98.0727)  time: 0.5029  data: 0.2579  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8854 (0.8710)  acc1: 84.4000 (86.0952)  acc5: 98.0000 (98.0571)  time: 0.2458  data: 0.0055  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0337 (0.9570)  acc1: 81.6000 (84.4516)  acc5: 96.4000 (97.1613)  time: 0.2401  data: 0.0018  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1488 (1.0058)  acc1: 79.2000 (83.0829)  acc5: 95.2000 (96.5756)  time: 0.2378  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1412 (1.0199)  acc1: 78.0000 (82.5440)  acc5: 95.6000 (96.5520)  time: 0.2407  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3006 s / it)
* Acc@1 82.524 Acc@5 96.444 loss 1.022
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.60%
Epoch: [229]  [   0/2502]  eta: 1:33:46  lr: 0.000603  min_lr: 0.000603  loss: 3.4410 (3.4410)  weight_decay: 0.0500 (0.0500)  time: 2.2490  data: 1.2919  max mem: 42519
Epoch: [229]  [ 200/2502]  eta: 0:18:46  lr: 0.000601  min_lr: 0.000601  loss: 3.0907 (2.9297)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0579 (1.1690)  time: 0.4815  data: 0.0005  max mem: 42519
Epoch: [229]  [ 400/2502]  eta: 0:17:02  lr: 0.000600  min_lr: 0.000600  loss: 3.0317 (2.9263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0648 (1.1249)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [229]  [ 600/2502]  eta: 0:15:21  lr: 0.000599  min_lr: 0.000599  loss: 2.9211 (2.9244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0691 (1.1375)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [229]  [ 800/2502]  eta: 0:13:43  lr: 0.000597  min_lr: 0.000597  loss: 3.1100 (2.9426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1606 (1.1578)  time: 0.4883  data: 0.0005  max mem: 42519
Epoch: [229]  [1000/2502]  eta: 0:12:05  lr: 0.000596  min_lr: 0.000596  loss: 2.8952 (2.9429)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1631 (1.1722)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [229]  [1200/2502]  eta: 0:10:28  lr: 0.000595  min_lr: 0.000595  loss: 3.1333 (2.9418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1995 (1.1690)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [229]  [1400/2502]  eta: 0:08:51  lr: 0.000594  min_lr: 0.000594  loss: 2.9710 (2.9430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1350 (1.1662)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [229]  [1600/2502]  eta: 0:07:15  lr: 0.000592  min_lr: 0.000592  loss: 3.0293 (2.9510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1168 (1.1606)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [229]  [1800/2502]  eta: 0:05:38  lr: 0.000591  min_lr: 0.000591  loss: 3.0934 (2.9517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0911 (1.1593)  time: 0.4792  data: 0.0006  max mem: 42519
Epoch: [229]  [2000/2502]  eta: 0:04:02  lr: 0.000590  min_lr: 0.000590  loss: 3.1899 (2.9592)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1216 (1.1625)  time: 0.4806  data: 0.0007  max mem: 42519
Epoch: [229]  [2200/2502]  eta: 0:02:25  lr: 0.000589  min_lr: 0.000589  loss: 3.0513 (2.9595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1434 (1.1619)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [229]  [2400/2502]  eta: 0:00:49  lr: 0.000587  min_lr: 0.000587  loss: 2.7691 (2.9577)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0860 (1.1588)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [229]  [2501/2502]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.9569 (2.9561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (1.1588)  time: 0.4441  data: 0.0011  max mem: 42519
Epoch: [229] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.9569 (2.9466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1627 (1.1588)
Test:  [ 0/50]  eta: 0:02:30  loss: 0.4740 (0.4740)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 3.0017  data: 2.7006  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8121 (0.7978)  acc1: 88.4000 (87.2727)  acc5: 98.4000 (97.8182)  time: 0.4887  data: 0.2460  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8637 (0.8277)  acc1: 84.4000 (86.4571)  acc5: 97.6000 (97.7905)  time: 0.2391  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9488 (0.9161)  acc1: 82.4000 (84.5419)  acc5: 96.4000 (96.9161)  time: 0.2393  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1285 (0.9707)  acc1: 77.6000 (82.8781)  acc5: 94.4000 (96.4195)  time: 0.2375  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1285 (0.9819)  acc1: 77.2000 (82.3920)  acc5: 94.8000 (96.4080)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2955 s / it)
* Acc@1 82.782 Acc@5 96.418 loss 0.977
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.78%
Epoch: [230]  [   0/2502]  eta: 1:18:23  lr: 0.000587  min_lr: 0.000587  loss: 3.0701 (3.0701)  weight_decay: 0.0500 (0.0500)  time: 1.8799  data: 1.3897  max mem: 42519
Epoch: [230]  [ 200/2502]  eta: 0:18:42  lr: 0.000585  min_lr: 0.000585  loss: 3.0099 (2.9276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1259 (1.1792)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [230]  [ 400/2502]  eta: 0:16:58  lr: 0.000584  min_lr: 0.000584  loss: 3.1637 (2.9581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1005 (1.1526)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [230]  [ 600/2502]  eta: 0:15:20  lr: 0.000583  min_lr: 0.000583  loss: 3.0997 (2.9533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2002 (1.1591)  time: 0.4831  data: 0.0004  max mem: 42519
Epoch: [230]  [ 800/2502]  eta: 0:13:42  lr: 0.000582  min_lr: 0.000582  loss: 2.7690 (2.9531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1628 (1.1537)  time: 0.4781  data: 0.0005  max mem: 42519
Epoch: [230]  [1000/2502]  eta: 0:12:04  lr: 0.000580  min_lr: 0.000580  loss: 2.9074 (2.9524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1135 (1.1483)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [230]  [1200/2502]  eta: 0:10:27  lr: 0.000579  min_lr: 0.000579  loss: 2.9686 (2.9404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1132 (1.1465)  time: 0.4826  data: 0.0004  max mem: 42519
Epoch: [230]  [1400/2502]  eta: 0:08:51  lr: 0.000578  min_lr: 0.000578  loss: 3.1963 (2.9436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1292 (1.1535)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [230]  [1600/2502]  eta: 0:07:14  lr: 0.000577  min_lr: 0.000577  loss: 3.0911 (2.9471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2299 (1.1606)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [230]  [1800/2502]  eta: 0:05:38  lr: 0.000575  min_lr: 0.000575  loss: 3.1011 (2.9419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1016 (1.1575)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [230]  [2000/2502]  eta: 0:04:01  lr: 0.000574  min_lr: 0.000574  loss: 3.2182 (2.9394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2108 (1.1569)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [230]  [2200/2502]  eta: 0:02:25  lr: 0.000573  min_lr: 0.000573  loss: 3.1053 (2.9342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2097 (1.1625)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [230]  [2400/2502]  eta: 0:00:49  lr: 0.000571  min_lr: 0.000571  loss: 2.9411 (2.9372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (1.1608)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [230]  [2501/2502]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 3.1375 (2.9408)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0533 (1.1603)  time: 0.4408  data: 0.0010  max mem: 42519
Epoch: [230] Total time: 0:20:04 (0.4812 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 3.1375 (2.9344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0533 (1.1603)
Test:  [ 0/50]  eta: 0:02:56  loss: 0.5862 (0.5862)  acc1: 94.8000 (94.8000)  acc5: 99.6000 (99.6000)  time: 3.5285  data: 3.2620  max mem: 42519
Test:  [10/50]  eta: 0:00:22  loss: 0.8391 (0.8455)  acc1: 86.0000 (87.4546)  acc5: 98.4000 (98.0000)  time: 0.5740  data: 0.3333  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9036 (0.8813)  acc1: 84.4000 (86.3048)  acc5: 98.0000 (97.8095)  time: 0.2623  data: 0.0206  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0368 (0.9643)  acc1: 82.4000 (84.5806)  acc5: 96.8000 (96.9032)  time: 0.2528  data: 0.0105  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1686 (1.0230)  acc1: 78.4000 (82.7707)  acc5: 94.8000 (96.4976)  time: 0.2546  data: 0.0166  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1558 (1.0313)  acc1: 78.4000 (82.4960)  acc5: 95.6000 (96.4720)  time: 0.2481  data: 0.0111  max mem: 42519
Test: Total time: 0:00:16 (0.3221 s / it)
* Acc@1 82.678 Acc@5 96.432 loss 1.026
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.78%
Epoch: [231]  [   0/2502]  eta: 1:37:12  lr: 0.000571  min_lr: 0.000571  loss: 3.1508 (3.1508)  weight_decay: 0.0500 (0.0500)  time: 2.3313  data: 1.8376  max mem: 42519
Epoch: [231]  [ 200/2502]  eta: 0:18:48  lr: 0.000570  min_lr: 0.000570  loss: 3.1619 (2.9216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0760 (1.1055)  time: 0.4799  data: 0.0011  max mem: 42519
Epoch: [231]  [ 400/2502]  eta: 0:17:02  lr: 0.000568  min_lr: 0.000568  loss: 3.1340 (2.9531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1439 (1.1431)  time: 0.4877  data: 0.0010  max mem: 42519
Epoch: [231]  [ 600/2502]  eta: 0:15:21  lr: 0.000567  min_lr: 0.000567  loss: 3.0541 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1459 (1.1514)  time: 0.4791  data: 0.0012  max mem: 42519
Epoch: [231]  [ 800/2502]  eta: 0:13:43  lr: 0.000566  min_lr: 0.000566  loss: 3.0509 (2.9099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1433 (1.1496)  time: 0.4800  data: 0.0011  max mem: 42519
Epoch: [231]  [1000/2502]  eta: 0:12:06  lr: 0.000565  min_lr: 0.000565  loss: 3.0204 (2.9170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1659 (1.1639)  time: 0.4808  data: 0.0010  max mem: 42519
Epoch: [231]  [1200/2502]  eta: 0:10:28  lr: 0.000563  min_lr: 0.000563  loss: 2.8631 (2.9307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1223 (1.1617)  time: 0.4805  data: 0.0012  max mem: 42519
Epoch: [231]  [1400/2502]  eta: 0:08:52  lr: 0.000562  min_lr: 0.000562  loss: 3.0741 (2.9371)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1645 (1.1654)  time: 0.4877  data: 0.0010  max mem: 42519
Epoch: [231]  [1600/2502]  eta: 0:07:15  lr: 0.000561  min_lr: 0.000561  loss: 3.1327 (2.9405)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1382 (1.1634)  time: 0.4820  data: 0.0008  max mem: 42519
Epoch: [231]  [1800/2502]  eta: 0:05:38  lr: 0.000560  min_lr: 0.000560  loss: 3.1933 (2.9438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0752 (1.1655)  time: 0.4787  data: 0.0010  max mem: 42519
Epoch: [231]  [2000/2502]  eta: 0:04:02  lr: 0.000558  min_lr: 0.000558  loss: 3.2103 (2.9442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1479 (1.1746)  time: 0.4876  data: 0.0010  max mem: 42519
Epoch: [231]  [2200/2502]  eta: 0:02:25  lr: 0.000557  min_lr: 0.000557  loss: 3.1371 (2.9434)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0634 (1.1692)  time: 0.4813  data: 0.0011  max mem: 42519
Epoch: [231]  [2400/2502]  eta: 0:00:49  lr: 0.000556  min_lr: 0.000556  loss: 2.9298 (2.9378)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0847 (1.1693)  time: 0.4787  data: 0.0008  max mem: 42519
Epoch: [231]  [2501/2502]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 3.0005 (2.9377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1889 (1.1704)  time: 0.4434  data: 0.0007  max mem: 42519
Epoch: [231] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 3.0005 (2.9382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1889 (1.1704)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.5425 (0.5425)  acc1: 94.0000 (94.0000)  acc5: 99.6000 (99.6000)  time: 3.2509  data: 2.9821  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.8148 (0.8356)  acc1: 88.8000 (87.3818)  acc5: 98.4000 (98.3273)  time: 0.5112  data: 0.2715  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8517 (0.8639)  acc1: 84.4000 (86.4381)  acc5: 98.0000 (98.1143)  time: 0.2416  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0262 (0.9553)  acc1: 81.6000 (84.3226)  acc5: 96.8000 (97.1355)  time: 0.2683  data: 0.0273  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1779 (1.0066)  acc1: 79.2000 (82.7707)  acc5: 95.2000 (96.6634)  time: 0.2959  data: 0.0594  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1400 (1.0200)  acc1: 78.4000 (82.2640)  acc5: 95.6000 (96.6240)  time: 0.2689  data: 0.0325  max mem: 42519
Test: Total time: 0:00:16 (0.3253 s / it)
* Acc@1 82.762 Acc@5 96.502 loss 1.015
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.78%
Epoch: [232]  [   0/2502]  eta: 1:43:18  lr: 0.000555  min_lr: 0.000555  loss: 3.2450 (3.2450)  weight_decay: 0.0500 (0.0500)  time: 2.4775  data: 1.3690  max mem: 42519
Epoch: [232]  [ 200/2502]  eta: 0:18:49  lr: 0.000554  min_lr: 0.000554  loss: 3.1542 (2.9564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1665 (1.1771)  time: 0.4780  data: 0.0005  max mem: 42519
Epoch: [232]  [ 400/2502]  eta: 0:16:59  lr: 0.000553  min_lr: 0.000553  loss: 3.0511 (2.9519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1965 (1.1931)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [232]  [ 600/2502]  eta: 0:15:20  lr: 0.000552  min_lr: 0.000552  loss: 2.9871 (2.9500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1921 (1.2129)  time: 0.4890  data: 0.0005  max mem: 42519
Epoch: [232]  [ 800/2502]  eta: 0:13:43  lr: 0.000550  min_lr: 0.000550  loss: 3.0988 (2.9512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0925 (1.1986)  time: 0.4890  data: 0.0004  max mem: 42519
Epoch: [232]  [1000/2502]  eta: 0:12:04  lr: 0.000549  min_lr: 0.000549  loss: 2.9059 (2.9451)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1208 (1.1811)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [232]  [1200/2502]  eta: 0:10:28  lr: 0.000548  min_lr: 0.000548  loss: 3.0292 (2.9417)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1619 (1.1687)  time: 0.4826  data: 0.0005  max mem: 42519
Epoch: [232]  [1400/2502]  eta: 0:08:51  lr: 0.000547  min_lr: 0.000547  loss: 3.0199 (2.9461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1605 (1.1683)  time: 0.4802  data: 0.0006  max mem: 42519
Epoch: [232]  [1600/2502]  eta: 0:07:15  lr: 0.000545  min_lr: 0.000545  loss: 3.0159 (2.9339)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1509 (1.1723)  time: 0.4827  data: 0.0005  max mem: 42519
Epoch: [232]  [1800/2502]  eta: 0:05:38  lr: 0.000544  min_lr: 0.000544  loss: 2.9785 (2.9293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1456 (1.1741)  time: 0.4862  data: 0.0005  max mem: 42519
Epoch: [232]  [2000/2502]  eta: 0:04:02  lr: 0.000543  min_lr: 0.000543  loss: 3.0191 (2.9262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1183 (1.1795)  time: 0.4827  data: 0.0005  max mem: 42519
Epoch: [232]  [2200/2502]  eta: 0:02:25  lr: 0.000542  min_lr: 0.000542  loss: 2.9313 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1263 (1.1798)  time: 0.4820  data: 0.0006  max mem: 42519
Epoch: [232]  [2400/2502]  eta: 0:00:49  lr: 0.000540  min_lr: 0.000540  loss: 3.0763 (2.9336)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1504 (1.1841)  time: 0.4809  data: 0.0006  max mem: 42519
Epoch: [232]  [2501/2502]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 3.0410 (2.9359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2062 (1.1851)  time: 0.4454  data: 0.0015  max mem: 42519
Epoch: [232] Total time: 0:20:06 (0.4823 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 3.0410 (2.9307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2062 (1.1851)
Test:  [ 0/50]  eta: 0:02:34  loss: 0.5915 (0.5915)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 3.0869  data: 2.7963  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.7921 (0.8425)  acc1: 89.6000 (87.7091)  acc5: 98.8000 (98.1818)  time: 0.5033  data: 0.2579  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.9066 (0.8813)  acc1: 84.8000 (86.5143)  acc5: 98.4000 (98.1143)  time: 0.2686  data: 0.0297  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0085 (0.9639)  acc1: 82.0000 (84.7871)  acc5: 96.8000 (97.2000)  time: 0.2650  data: 0.0279  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1772 (1.0200)  acc1: 79.6000 (83.1122)  acc5: 94.8000 (96.5659)  time: 0.2407  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1772 (1.0386)  acc1: 77.6000 (82.5360)  acc5: 95.2000 (96.4960)  time: 0.2409  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3103 s / it)
* Acc@1 82.744 Acc@5 96.538 loss 1.036
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.78%
Epoch: [233]  [   0/2502]  eta: 1:32:45  lr: 0.000540  min_lr: 0.000540  loss: 2.8838 (2.8838)  weight_decay: 0.0500 (0.0500)  time: 2.2243  data: 1.7193  max mem: 42519
Epoch: [233]  [ 200/2502]  eta: 0:18:44  lr: 0.000539  min_lr: 0.000539  loss: 3.1965 (2.9691)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1787 (1.1752)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [233]  [ 400/2502]  eta: 0:17:00  lr: 0.000537  min_lr: 0.000537  loss: 3.0650 (2.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2564 (1.2001)  time: 0.4886  data: 0.0005  max mem: 42519
Epoch: [233]  [ 600/2502]  eta: 0:15:21  lr: 0.000536  min_lr: 0.000536  loss: 2.8566 (2.9150)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2151 (1.1906)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [233]  [ 800/2502]  eta: 0:13:42  lr: 0.000535  min_lr: 0.000535  loss: 2.8358 (2.9142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1740 (1.1894)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [233]  [1000/2502]  eta: 0:12:06  lr: 0.000534  min_lr: 0.000534  loss: 3.0198 (2.9134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1368 (1.1919)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [233]  [1200/2502]  eta: 0:10:29  lr: 0.000533  min_lr: 0.000533  loss: 2.9502 (2.9224)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1464 (1.1906)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [233]  [1400/2502]  eta: 0:08:51  lr: 0.000531  min_lr: 0.000531  loss: 3.0225 (2.9199)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1389 (1.1946)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [233]  [1600/2502]  eta: 0:07:15  lr: 0.000530  min_lr: 0.000530  loss: 2.9647 (2.9265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1903 (1.1958)  time: 0.4781  data: 0.0005  max mem: 42519
Epoch: [233]  [1800/2502]  eta: 0:05:38  lr: 0.000529  min_lr: 0.000529  loss: 3.0606 (2.9235)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1761 (1.1933)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [233]  [2000/2502]  eta: 0:04:01  lr: 0.000528  min_lr: 0.000528  loss: 2.9379 (2.9231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1996 (1.1962)  time: 0.4878  data: 0.0005  max mem: 42519
Epoch: [233]  [2200/2502]  eta: 0:02:25  lr: 0.000526  min_lr: 0.000526  loss: 3.0054 (2.9151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1621 (1.1946)  time: 0.4807  data: 0.0006  max mem: 42519
Epoch: [233]  [2400/2502]  eta: 0:00:49  lr: 0.000525  min_lr: 0.000525  loss: 2.9932 (2.9126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2170 (1.1954)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [233]  [2501/2502]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 3.1129 (2.9115)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1269 (1.1939)  time: 0.4380  data: 0.0014  max mem: 42519
Epoch: [233] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 3.1129 (2.9261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1269 (1.1939)
Test:  [ 0/50]  eta: 0:02:27  loss: 0.5938 (0.5938)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 2.9444  data: 2.6541  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8440 (0.8649)  acc1: 88.8000 (87.7091)  acc5: 98.4000 (97.9273)  time: 0.4882  data: 0.2440  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8919 (0.9033)  acc1: 85.6000 (86.5143)  acc5: 98.4000 (98.0191)  time: 0.2445  data: 0.0058  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0615 (0.9889)  acc1: 82.4000 (84.6581)  acc5: 97.6000 (97.0710)  time: 0.2640  data: 0.0266  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1659 (1.0397)  acc1: 80.0000 (83.2195)  acc5: 94.8000 (96.5561)  time: 0.2603  data: 0.0232  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1659 (1.0558)  acc1: 78.0000 (82.5440)  acc5: 95.6000 (96.5600)  time: 0.2380  data: 0.0010  max mem: 42519
Test: Total time: 0:00:15 (0.3053 s / it)
* Acc@1 82.904 Acc@5 96.468 loss 1.053
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.90%
Epoch: [234]  [   0/2502]  eta: 1:25:30  lr: 0.000525  min_lr: 0.000525  loss: 3.2137 (3.2137)  weight_decay: 0.0500 (0.0500)  time: 2.0506  data: 1.5504  max mem: 42519
Epoch: [234]  [ 200/2502]  eta: 0:18:46  lr: 0.000523  min_lr: 0.000523  loss: 2.9693 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0944 (1.1986)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [234]  [ 400/2502]  eta: 0:16:58  lr: 0.000522  min_lr: 0.000522  loss: 2.9735 (2.8995)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1394 (1.1963)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [234]  [ 600/2502]  eta: 0:15:19  lr: 0.000521  min_lr: 0.000521  loss: 3.0440 (2.9102)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1168 (1.1904)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [234]  [ 800/2502]  eta: 0:13:41  lr: 0.000520  min_lr: 0.000520  loss: 2.8355 (2.9075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1897 (1.1944)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [234]  [1000/2502]  eta: 0:12:04  lr: 0.000519  min_lr: 0.000519  loss: 2.8024 (2.8975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1511 (1.1966)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [234]  [1200/2502]  eta: 0:10:27  lr: 0.000517  min_lr: 0.000517  loss: 3.0655 (2.9009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1929 (1.1979)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [234]  [1400/2502]  eta: 0:08:51  lr: 0.000516  min_lr: 0.000516  loss: 2.9327 (2.9015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1824 (1.2018)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [234]  [1600/2502]  eta: 0:07:14  lr: 0.000515  min_lr: 0.000515  loss: 2.9095 (2.9046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1904 (nan)  time: 0.4820  data: 0.0004  max mem: 42519
Epoch: [234]  [1800/2502]  eta: 0:05:38  lr: 0.000514  min_lr: 0.000514  loss: 3.0483 (2.9067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1978 (nan)  time: 0.4873  data: 0.0005  max mem: 42519
Epoch: [234]  [2000/2502]  eta: 0:04:01  lr: 0.000513  min_lr: 0.000513  loss: 3.1106 (2.9076)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1421 (nan)  time: 0.4825  data: 0.0005  max mem: 42519
Epoch: [234]  [2200/2502]  eta: 0:02:25  lr: 0.000511  min_lr: 0.000511  loss: 2.9619 (2.9080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1086 (nan)  time: 0.4783  data: 0.0005  max mem: 42519
Epoch: [234]  [2400/2502]  eta: 0:00:49  lr: 0.000510  min_lr: 0.000510  loss: 2.9589 (2.9059)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1960 (nan)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [234]  [2501/2502]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 3.0201 (2.9050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1100 (nan)  time: 0.4416  data: 0.0014  max mem: 42519
Epoch: [234] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 3.0201 (2.9061)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1100 (nan)
Test:  [ 0/50]  eta: 0:02:47  loss: 0.5911 (0.5911)  acc1: 94.8000 (94.8000)  acc5: 98.8000 (98.8000)  time: 3.3600  data: 3.0976  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.8638 (0.8328)  acc1: 87.2000 (88.0364)  acc5: 98.0000 (98.0000)  time: 0.5245  data: 0.2821  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8863 (0.8687)  acc1: 84.0000 (86.7429)  acc5: 98.4000 (97.9619)  time: 0.2393  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0312 (0.9597)  acc1: 82.0000 (84.9290)  acc5: 95.6000 (97.0194)  time: 0.2377  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2011 (1.0211)  acc1: 78.8000 (83.3073)  acc5: 95.2000 (96.5854)  time: 0.2374  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2036 (1.0417)  acc1: 77.6000 (82.5120)  acc5: 95.2000 (96.4480)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3032 s / it)
* Acc@1 82.796 Acc@5 96.402 loss 1.041
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.90%
Epoch: [235]  [   0/2502]  eta: 1:31:16  lr: 0.000510  min_lr: 0.000510  loss: 2.8102 (2.8102)  weight_decay: 0.0500 (0.0500)  time: 2.1890  data: 1.5195  max mem: 42519
Epoch: [235]  [ 200/2502]  eta: 0:18:46  lr: 0.000508  min_lr: 0.000508  loss: 3.0011 (2.9157)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1497 (1.2423)  time: 0.4825  data: 0.0004  max mem: 42519
Epoch: [235]  [ 400/2502]  eta: 0:17:03  lr: 0.000507  min_lr: 0.000507  loss: 2.9170 (2.8935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1337 (1.2193)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [235]  [ 600/2502]  eta: 0:15:22  lr: 0.000506  min_lr: 0.000506  loss: 3.0737 (2.8898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1269 (1.2322)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [235]  [ 800/2502]  eta: 0:13:43  lr: 0.000505  min_lr: 0.000505  loss: 3.1378 (2.8994)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2165 (1.2225)  time: 0.4867  data: 0.0006  max mem: 42519
Epoch: [235]  [1000/2502]  eta: 0:12:06  lr: 0.000504  min_lr: 0.000504  loss: 2.8510 (2.8948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1730 (1.2244)  time: 0.4835  data: 0.0005  max mem: 42519
Epoch: [235]  [1200/2502]  eta: 0:10:28  lr: 0.000502  min_lr: 0.000502  loss: 2.9840 (2.8982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2877 (1.2256)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [235]  [1400/2502]  eta: 0:08:51  lr: 0.000501  min_lr: 0.000501  loss: 3.0269 (2.8995)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2235 (1.2302)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [235]  [1600/2502]  eta: 0:07:15  lr: 0.000500  min_lr: 0.000500  loss: 2.7321 (2.8977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2503 (1.2279)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [235]  [1800/2502]  eta: 0:05:38  lr: 0.000499  min_lr: 0.000499  loss: 3.0036 (2.8935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2481 (1.2358)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [235]  [2000/2502]  eta: 0:04:01  lr: 0.000498  min_lr: 0.000498  loss: 3.0577 (2.9003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1047 (1.2356)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [235]  [2200/2502]  eta: 0:02:25  lr: 0.000497  min_lr: 0.000497  loss: 2.9425 (2.9028)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1548 (1.2286)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [235]  [2400/2502]  eta: 0:00:49  lr: 0.000495  min_lr: 0.000495  loss: 2.9916 (2.9026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1738 (1.2256)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [235]  [2501/2502]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 3.0613 (2.9023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1647 (1.2255)  time: 0.4374  data: 0.0009  max mem: 42519
Epoch: [235] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 3.0613 (2.9071)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1647 (1.2255)
Test:  [ 0/50]  eta: 0:02:18  loss: 0.5388 (0.5388)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 2.7624  data: 2.4755  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.8427 (0.8391)  acc1: 90.4000 (88.0000)  acc5: 98.4000 (98.0364)  time: 0.4654  data: 0.2255  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8668 (0.8575)  acc1: 84.8000 (86.9524)  acc5: 98.4000 (98.0381)  time: 0.2377  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9764 (0.9387)  acc1: 83.2000 (84.9032)  acc5: 96.8000 (97.1613)  time: 0.2378  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1203 (0.9914)  acc1: 79.2000 (83.4829)  acc5: 94.8000 (96.6049)  time: 0.2407  data: 0.0053  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1176 (1.0040)  acc1: 78.8000 (82.8720)  acc5: 95.2000 (96.5280)  time: 0.2405  data: 0.0051  max mem: 42519
Test: Total time: 0:00:14 (0.2911 s / it)
* Acc@1 83.130 Acc@5 96.504 loss 1.003
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.13%
Epoch: [236]  [   0/2502]  eta: 1:17:32  lr: 0.000495  min_lr: 0.000495  loss: 3.1169 (3.1169)  weight_decay: 0.0500 (0.0500)  time: 1.8595  data: 1.3617  max mem: 42519
Epoch: [236]  [ 200/2502]  eta: 0:18:42  lr: 0.000494  min_lr: 0.000494  loss: 3.0564 (2.9136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3187 (1.2413)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [236]  [ 400/2502]  eta: 0:16:57  lr: 0.000492  min_lr: 0.000492  loss: 2.9707 (2.8879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1781 (1.2515)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [236]  [ 600/2502]  eta: 0:15:18  lr: 0.000491  min_lr: 0.000491  loss: 2.9673 (2.8955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1553 (1.2272)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [236]  [ 800/2502]  eta: 0:13:40  lr: 0.000490  min_lr: 0.000490  loss: 3.1997 (2.9088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1956 (1.2263)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [236]  [1000/2502]  eta: 0:12:03  lr: 0.000489  min_lr: 0.000489  loss: 3.0074 (2.9080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1718 (1.2224)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [236]  [1200/2502]  eta: 0:10:27  lr: 0.000488  min_lr: 0.000488  loss: 3.0255 (2.9146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2075 (1.2236)  time: 0.4872  data: 0.0004  max mem: 42519
Epoch: [236]  [1400/2502]  eta: 0:08:50  lr: 0.000486  min_lr: 0.000486  loss: 3.0720 (2.9119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2621 (1.2295)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [236]  [1600/2502]  eta: 0:07:14  lr: 0.000485  min_lr: 0.000485  loss: 2.9474 (2.9135)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1760 (1.2269)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [236]  [1800/2502]  eta: 0:05:38  lr: 0.000484  min_lr: 0.000484  loss: 3.0917 (2.9128)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1132 (1.2310)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [236]  [2000/2502]  eta: 0:04:01  lr: 0.000483  min_lr: 0.000483  loss: 2.9497 (2.9090)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2396 (1.2303)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [236]  [2200/2502]  eta: 0:02:25  lr: 0.000482  min_lr: 0.000482  loss: 2.8657 (2.9046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3067 (1.2352)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [236]  [2400/2502]  eta: 0:00:49  lr: 0.000481  min_lr: 0.000481  loss: 2.9467 (2.9038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0643 (1.2302)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [236]  [2501/2502]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 3.1359 (2.9036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2401 (1.2308)  time: 0.4422  data: 0.0021  max mem: 42519
Epoch: [236] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 3.1359 (2.9014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2401 (1.2308)
Test:  [ 0/50]  eta: 0:02:29  loss: 0.5769 (0.5769)  acc1: 94.0000 (94.0000)  acc5: 99.6000 (99.6000)  time: 2.9886  data: 2.7288  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.8533 (0.8576)  acc1: 89.6000 (87.3091)  acc5: 98.0000 (98.1455)  time: 0.5306  data: 0.2917  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8895 (0.8788)  acc1: 84.8000 (86.3048)  acc5: 98.0000 (98.0571)  time: 0.2645  data: 0.0243  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0553 (0.9737)  acc1: 81.6000 (84.3484)  acc5: 97.6000 (97.1742)  time: 0.2647  data: 0.0226  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1799 (1.0307)  acc1: 79.6000 (83.0244)  acc5: 95.2000 (96.6732)  time: 0.2894  data: 0.0508  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1799 (1.0440)  acc1: 78.8000 (82.4080)  acc5: 95.6000 (96.5760)  time: 0.2654  data: 0.0286  max mem: 42519
Test: Total time: 0:00:16 (0.3258 s / it)
* Acc@1 82.802 Acc@5 96.516 loss 1.039
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 83.13%
Epoch: [237]  [   0/2502]  eta: 1:26:23  lr: 0.000480  min_lr: 0.000480  loss: 2.1636 (2.1636)  weight_decay: 0.0500 (0.0500)  time: 2.0718  data: 1.6011  max mem: 42519
Epoch: [237]  [ 200/2502]  eta: 0:18:42  lr: 0.000479  min_lr: 0.000479  loss: 3.2053 (2.8764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1711 (1.2246)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [237]  [ 400/2502]  eta: 0:16:58  lr: 0.000478  min_lr: 0.000478  loss: 2.7265 (2.8591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1726 (1.1987)  time: 0.4791  data: 0.0003  max mem: 42519
Epoch: [237]  [ 600/2502]  eta: 0:15:18  lr: 0.000477  min_lr: 0.000477  loss: 2.8048 (2.8502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1617 (1.2053)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [237]  [ 800/2502]  eta: 0:13:40  lr: 0.000475  min_lr: 0.000475  loss: 3.0401 (2.8508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2049 (1.2119)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [237]  [1000/2502]  eta: 0:12:03  lr: 0.000474  min_lr: 0.000474  loss: 2.8680 (2.8588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1714 (1.2152)  time: 0.4808  data: 0.0005  max mem: 42519
Epoch: [237]  [1200/2502]  eta: 0:10:27  lr: 0.000473  min_lr: 0.000473  loss: 2.7090 (2.8627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2381 (1.2146)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [237]  [1400/2502]  eta: 0:08:50  lr: 0.000472  min_lr: 0.000472  loss: 2.9958 (2.8640)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2246 (1.2219)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [237]  [1600/2502]  eta: 0:07:14  lr: 0.000471  min_lr: 0.000471  loss: 2.9762 (2.8724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2073 (1.2234)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [237]  [1800/2502]  eta: 0:05:37  lr: 0.000470  min_lr: 0.000470  loss: 3.0769 (2.8782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1967 (1.2245)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [237]  [2000/2502]  eta: 0:04:01  lr: 0.000468  min_lr: 0.000468  loss: 3.1289 (2.8844)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2534 (1.2260)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [237]  [2200/2502]  eta: 0:02:25  lr: 0.000467  min_lr: 0.000467  loss: 2.9616 (2.8861)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4816  data: 0.0004  max mem: 42519
Epoch: [237]  [2400/2502]  eta: 0:00:49  lr: 0.000466  min_lr: 0.000466  loss: 3.0662 (2.8879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2017 (nan)  time: 0.4824  data: 0.0005  max mem: 42519
Epoch: [237]  [2501/2502]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.8737 (2.8930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3211 (nan)  time: 0.4431  data: 0.0009  max mem: 42519
Epoch: [237] Total time: 0:20:03 (0.4811 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.8737 (2.8932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3211 (nan)
Test:  [ 0/50]  eta: 0:03:47  loss: 0.5768 (0.5768)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 4.5443  data: 4.2805  max mem: 42519
Test:  [10/50]  eta: 0:00:25  loss: 0.8303 (0.8466)  acc1: 88.8000 (87.2727)  acc5: 98.4000 (98.1455)  time: 0.6268  data: 0.3896  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8747 (0.8792)  acc1: 84.8000 (86.5714)  acc5: 98.0000 (98.1333)  time: 0.2377  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0579 (0.9621)  acc1: 82.8000 (84.6323)  acc5: 97.2000 (97.1097)  time: 0.2820  data: 0.0409  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1364 (1.0164)  acc1: 78.8000 (83.1610)  acc5: 94.8000 (96.6146)  time: 0.2841  data: 0.0456  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1737 (1.0318)  acc1: 78.4000 (82.6240)  acc5: 95.6000 (96.5520)  time: 0.2751  data: 0.0405  max mem: 42519
Test: Total time: 0:00:17 (0.3442 s / it)
* Acc@1 83.108 Acc@5 96.630 loss 1.026
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.13%
Epoch: [238]  [   0/2502]  eta: 1:37:37  lr: 0.000466  min_lr: 0.000466  loss: 3.1323 (3.1323)  weight_decay: 0.0500 (0.0500)  time: 2.3413  data: 1.4973  max mem: 42519
Epoch: [238]  [ 200/2502]  eta: 0:18:50  lr: 0.000464  min_lr: 0.000464  loss: 3.0820 (2.9193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1992 (1.2695)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [238]  [ 400/2502]  eta: 0:17:02  lr: 0.000463  min_lr: 0.000463  loss: 3.2102 (2.9084)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1378 (1.2361)  time: 0.4843  data: 0.0005  max mem: 42519
Epoch: [238]  [ 600/2502]  eta: 0:15:21  lr: 0.000462  min_lr: 0.000462  loss: 2.8905 (2.8942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2450 (1.2321)  time: 0.4809  data: 0.0006  max mem: 42519
Epoch: [238]  [ 800/2502]  eta: 0:13:44  lr: 0.000461  min_lr: 0.000461  loss: 3.0002 (2.8877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2210 (1.2529)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [238]  [1000/2502]  eta: 0:12:06  lr: 0.000460  min_lr: 0.000460  loss: 3.1641 (2.8916)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2007 (1.2542)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [238]  [1200/2502]  eta: 0:10:28  lr: 0.000459  min_lr: 0.000459  loss: 2.7538 (2.8909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2173 (1.2486)  time: 0.4861  data: 0.0004  max mem: 42519
Epoch: [238]  [1400/2502]  eta: 0:08:52  lr: 0.000458  min_lr: 0.000458  loss: 2.7240 (2.9003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1606 (1.2433)  time: 0.4864  data: 0.0005  max mem: 42519
Epoch: [238]  [1600/2502]  eta: 0:07:15  lr: 0.000456  min_lr: 0.000456  loss: 2.9509 (2.8914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1765 (1.2414)  time: 0.4817  data: 0.0005  max mem: 42519
Epoch: [238]  [1800/2502]  eta: 0:05:38  lr: 0.000455  min_lr: 0.000455  loss: 2.9628 (2.8928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2975 (1.2449)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [238]  [2000/2502]  eta: 0:04:02  lr: 0.000454  min_lr: 0.000454  loss: 3.0512 (2.8906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1620 (1.2453)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [238]  [2200/2502]  eta: 0:02:25  lr: 0.000453  min_lr: 0.000453  loss: 2.9717 (2.8909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2117 (1.2440)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [238]  [2400/2502]  eta: 0:00:49  lr: 0.000452  min_lr: 0.000452  loss: 3.0160 (2.8898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1675 (1.2428)  time: 0.4870  data: 0.0005  max mem: 42519
Epoch: [238]  [2501/2502]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.6955 (2.8884)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1520 (1.2412)  time: 0.4370  data: 0.0008  max mem: 42519
Epoch: [238] Total time: 0:20:05 (0.4819 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.6955 (2.8909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1520 (1.2412)
Test:  [ 0/50]  eta: 0:03:16  loss: 0.5252 (0.5252)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 3.9250  data: 3.6471  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.7286 (0.7678)  acc1: 89.6000 (87.7455)  acc5: 98.4000 (98.0727)  time: 0.6016  data: 0.3614  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.7774 (0.7998)  acc1: 84.8000 (86.6286)  acc5: 98.0000 (98.0762)  time: 0.2551  data: 0.0167  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9411 (0.8826)  acc1: 82.4000 (84.8000)  acc5: 97.2000 (97.2258)  time: 0.2394  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0932 (0.9415)  acc1: 78.4000 (83.2293)  acc5: 95.2000 (96.6244)  time: 0.2374  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0672 (0.9548)  acc1: 77.6000 (82.6080)  acc5: 95.2000 (96.5680)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3201 s / it)
* Acc@1 83.000 Acc@5 96.558 loss 0.944
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.13%
Epoch: [239]  [   0/2502]  eta: 1:20:35  lr: 0.000451  min_lr: 0.000451  loss: 2.4750 (2.4750)  weight_decay: 0.0500 (0.0500)  time: 1.9325  data: 1.3447  max mem: 42519
Epoch: [239]  [ 200/2502]  eta: 0:18:43  lr: 0.000450  min_lr: 0.000450  loss: 2.6428 (2.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1768 (1.2677)  time: 0.4845  data: 0.0005  max mem: 42519
Epoch: [239]  [ 400/2502]  eta: 0:16:59  lr: 0.000449  min_lr: 0.000449  loss: 2.9304 (2.8423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1908 (1.2599)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [239]  [ 600/2502]  eta: 0:15:20  lr: 0.000448  min_lr: 0.000448  loss: 3.0676 (2.8478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2174 (1.2669)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [239]  [ 800/2502]  eta: 0:13:42  lr: 0.000447  min_lr: 0.000447  loss: 3.1486 (2.8513)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2706 (1.2751)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [239]  [1000/2502]  eta: 0:12:05  lr: 0.000446  min_lr: 0.000446  loss: 2.7879 (2.8538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1619 (1.2802)  time: 0.4908  data: 0.0005  max mem: 42519
Epoch: [239]  [1200/2502]  eta: 0:10:28  lr: 0.000445  min_lr: 0.000445  loss: 2.9031 (2.8470)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2355 (1.2773)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [239]  [1400/2502]  eta: 0:08:51  lr: 0.000443  min_lr: 0.000443  loss: 3.1042 (2.8575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1876 (1.2728)  time: 0.4861  data: 0.0004  max mem: 42519
Epoch: [239]  [1600/2502]  eta: 0:07:14  lr: 0.000442  min_lr: 0.000442  loss: 2.7836 (2.8587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2209 (1.2778)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [239]  [1800/2502]  eta: 0:05:38  lr: 0.000441  min_lr: 0.000441  loss: 2.9515 (2.8617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2148 (1.2786)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [239]  [2000/2502]  eta: 0:04:01  lr: 0.000440  min_lr: 0.000440  loss: 3.1009 (2.8621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2767 (1.2729)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [239]  [2200/2502]  eta: 0:02:25  lr: 0.000439  min_lr: 0.000439  loss: 2.9530 (2.8657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2058 (1.2721)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [239]  [2400/2502]  eta: 0:00:49  lr: 0.000438  min_lr: 0.000438  loss: 3.1309 (2.8693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2099 (1.2686)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [239]  [2501/2502]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.8158 (2.8701)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2584 (1.2695)  time: 0.4336  data: 0.0008  max mem: 42519
Epoch: [239] Total time: 0:20:03 (0.4808 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.8158 (2.8758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2584 (1.2695)
Test:  [ 0/50]  eta: 0:04:35  loss: 0.4957 (0.4957)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 5.5025  data: 5.2033  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7478 (0.7573)  acc1: 90.0000 (88.3273)  acc5: 98.4000 (98.2909)  time: 0.7219  data: 0.4809  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.7902 (0.7996)  acc1: 86.0000 (86.8191)  acc5: 98.4000 (98.1524)  time: 0.2397  data: 0.0045  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9985 (0.8892)  acc1: 81.6000 (84.9032)  acc5: 97.2000 (97.2387)  time: 0.2377  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1258 (0.9470)  acc1: 78.4000 (83.2195)  acc5: 94.8000 (96.5659)  time: 0.2560  data: 0.0165  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1262 (0.9644)  acc1: 77.6000 (82.6880)  acc5: 95.2000 (96.4800)  time: 0.2542  data: 0.0169  max mem: 42519
Test: Total time: 0:00:17 (0.3535 s / it)
* Acc@1 83.006 Acc@5 96.614 loss 0.956
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.13%
Epoch: [240]  [   0/2502]  eta: 1:41:41  lr: 0.000437  min_lr: 0.000437  loss: 3.0218 (3.0218)  weight_decay: 0.0500 (0.0500)  time: 2.4385  data: 1.9443  max mem: 42519
Epoch: [240]  [ 200/2502]  eta: 0:18:54  lr: 0.000436  min_lr: 0.000436  loss: 3.0938 (2.8514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2436 (1.2781)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [240]  [ 400/2502]  eta: 0:17:02  lr: 0.000435  min_lr: 0.000435  loss: 2.7525 (2.8477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3335 (1.2842)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [240]  [ 600/2502]  eta: 0:15:21  lr: 0.000434  min_lr: 0.000434  loss: 3.1192 (2.8606)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2034 (1.2693)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [240]  [ 800/2502]  eta: 0:13:43  lr: 0.000433  min_lr: 0.000433  loss: 2.6850 (2.8560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2505 (1.2563)  time: 0.4816  data: 0.0004  max mem: 42519
Epoch: [240]  [1000/2502]  eta: 0:12:04  lr: 0.000432  min_lr: 0.000432  loss: 3.0563 (2.8681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3059 (1.2812)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [240]  [1200/2502]  eta: 0:10:27  lr: 0.000431  min_lr: 0.000431  loss: 2.9742 (2.8726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1224 (1.2758)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [240]  [1400/2502]  eta: 0:08:51  lr: 0.000429  min_lr: 0.000429  loss: 3.0345 (2.8795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3057 (1.2726)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [240]  [1600/2502]  eta: 0:07:14  lr: 0.000428  min_lr: 0.000428  loss: 3.0572 (2.8778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2070 (1.2693)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [240]  [1800/2502]  eta: 0:05:38  lr: 0.000427  min_lr: 0.000427  loss: 3.0750 (2.8801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1607 (1.2701)  time: 0.4849  data: 0.0005  max mem: 42519
Epoch: [240]  [2000/2502]  eta: 0:04:02  lr: 0.000426  min_lr: 0.000426  loss: 2.9509 (2.8786)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1843 (1.2658)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [240]  [2200/2502]  eta: 0:02:25  lr: 0.000425  min_lr: 0.000425  loss: 2.8821 (2.8790)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2698 (1.2650)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [240]  [2400/2502]  eta: 0:00:49  lr: 0.000424  min_lr: 0.000424  loss: 2.8821 (2.8790)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2294 (1.2652)  time: 0.4882  data: 0.0005  max mem: 42519
Epoch: [240]  [2501/2502]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.6121 (2.8778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3353 (1.2716)  time: 0.4340  data: 0.0009  max mem: 42519
Epoch: [240] Total time: 0:20:06 (0.4820 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.6121 (2.8721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3353 (1.2716)
Test:  [ 0/50]  eta: 0:04:43  loss: 0.4887 (0.4887)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 5.6672  data: 5.3727  max mem: 42519
Test:  [10/50]  eta: 0:00:30  loss: 0.7712 (0.7559)  acc1: 88.8000 (88.2182)  acc5: 98.4000 (98.0364)  time: 0.7610  data: 0.5191  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8072 (0.7941)  acc1: 86.0000 (86.7429)  acc5: 98.4000 (98.0191)  time: 0.2539  data: 0.0171  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9231 (0.8731)  acc1: 83.2000 (85.0065)  acc5: 96.8000 (97.1613)  time: 0.2685  data: 0.0307  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0852 (0.9334)  acc1: 78.8000 (83.3073)  acc5: 95.6000 (96.7317)  time: 0.2683  data: 0.0306  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0864 (0.9445)  acc1: 78.4000 (82.8000)  acc5: 96.0000 (96.7040)  time: 0.2672  data: 0.0304  max mem: 42519
Test: Total time: 0:00:18 (0.3668 s / it)
* Acc@1 83.156 Acc@5 96.632 loss 0.937
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.16%
Epoch: [241]  [   0/2502]  eta: 1:24:41  lr: 0.000423  min_lr: 0.000423  loss: 1.9458 (1.9458)  weight_decay: 0.0500 (0.0500)  time: 2.0310  data: 1.5335  max mem: 42519
Epoch: [241]  [ 200/2502]  eta: 0:18:41  lr: 0.000422  min_lr: 0.000422  loss: 2.9772 (2.8373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1356 (1.2585)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [241]  [ 400/2502]  eta: 0:17:00  lr: 0.000421  min_lr: 0.000421  loss: 2.8715 (2.8608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2831 (1.2908)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [241]  [ 600/2502]  eta: 0:15:20  lr: 0.000420  min_lr: 0.000420  loss: 2.7809 (2.8505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2557 (1.2842)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [241]  [ 800/2502]  eta: 0:13:42  lr: 0.000419  min_lr: 0.000419  loss: 3.0982 (2.8716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3107 (1.2838)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [241]  [1000/2502]  eta: 0:12:05  lr: 0.000418  min_lr: 0.000418  loss: 3.0564 (2.8711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2185 (1.2816)  time: 0.4888  data: 0.0005  max mem: 42519
Epoch: [241]  [1200/2502]  eta: 0:10:28  lr: 0.000417  min_lr: 0.000417  loss: 3.0171 (2.8775)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [241]  [1400/2502]  eta: 0:08:51  lr: 0.000416  min_lr: 0.000416  loss: 2.9884 (2.8779)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1859 (nan)  time: 0.4900  data: 0.0006  max mem: 42519
Epoch: [241]  [1600/2502]  eta: 0:07:14  lr: 0.000415  min_lr: 0.000415  loss: 2.8702 (2.8741)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2532 (nan)  time: 0.4794  data: 0.0007  max mem: 42519
Epoch: [241]  [1800/2502]  eta: 0:05:38  lr: 0.000413  min_lr: 0.000413  loss: 3.0541 (2.8754)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2369 (nan)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [241]  [2000/2502]  eta: 0:04:01  lr: 0.000412  min_lr: 0.000412  loss: 2.9504 (2.8719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2313 (nan)  time: 0.4876  data: 0.0005  max mem: 42519
Epoch: [241]  [2200/2502]  eta: 0:02:25  lr: 0.000411  min_lr: 0.000411  loss: 2.9877 (2.8728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1963 (nan)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [241]  [2400/2502]  eta: 0:00:49  lr: 0.000410  min_lr: 0.000410  loss: 2.8835 (2.8723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2852 (nan)  time: 0.4782  data: 0.0005  max mem: 42519
Epoch: [241]  [2501/2502]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 3.0735 (2.8746)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4344  data: 0.0007  max mem: 42519
Epoch: [241] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 3.0735 (2.8754)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/50]  eta: 0:04:41  loss: 0.6359 (0.6359)  acc1: 95.6000 (95.6000)  acc5: 98.8000 (98.8000)  time: 5.6339  data: 5.3364  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.9209 (0.8874)  acc1: 87.2000 (87.7091)  acc5: 98.0000 (97.9636)  time: 0.7284  data: 0.4857  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.9284 (0.9192)  acc1: 85.2000 (86.6857)  acc5: 98.0000 (97.8476)  time: 0.2382  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0813 (1.0050)  acc1: 82.4000 (84.7742)  acc5: 96.4000 (97.0065)  time: 0.2391  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2043 (1.0611)  acc1: 80.0000 (83.2585)  acc5: 94.8000 (96.5463)  time: 0.2404  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2058 (1.0763)  acc1: 78.8000 (82.8480)  acc5: 95.6000 (96.4880)  time: 0.2392  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3491 s / it)
* Acc@1 83.118 Acc@5 96.570 loss 1.073
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.16%
Epoch: [242]  [   0/2502]  eta: 1:39:12  lr: 0.000410  min_lr: 0.000410  loss: 2.7003 (2.7003)  weight_decay: 0.0500 (0.0500)  time: 2.3789  data: 1.7040  max mem: 42519
Epoch: [242]  [ 200/2502]  eta: 0:18:52  lr: 0.000409  min_lr: 0.000409  loss: 2.8587 (2.8307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2417 (1.3034)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [242]  [ 400/2502]  eta: 0:17:03  lr: 0.000407  min_lr: 0.000407  loss: 2.9395 (2.8445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3181 (1.2954)  time: 0.4832  data: 0.0004  max mem: 42519
Epoch: [242]  [ 600/2502]  eta: 0:15:22  lr: 0.000406  min_lr: 0.000406  loss: 3.0378 (2.8461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1796 (1.2814)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [242]  [ 800/2502]  eta: 0:13:44  lr: 0.000405  min_lr: 0.000405  loss: 2.7870 (2.8295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2401 (1.2795)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [242]  [1000/2502]  eta: 0:12:06  lr: 0.000404  min_lr: 0.000404  loss: 2.6236 (2.8258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2530 (1.2794)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [242]  [1200/2502]  eta: 0:10:29  lr: 0.000403  min_lr: 0.000403  loss: 2.8383 (2.8298)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3423 (1.2852)  time: 0.4840  data: 0.0005  max mem: 42519
Epoch: [242]  [1400/2502]  eta: 0:08:52  lr: 0.000402  min_lr: 0.000402  loss: 2.5743 (2.8248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2595 (1.2806)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [242]  [1600/2502]  eta: 0:07:15  lr: 0.000401  min_lr: 0.000401  loss: 2.7918 (2.8222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3595 (1.2856)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [242]  [1800/2502]  eta: 0:05:38  lr: 0.000400  min_lr: 0.000400  loss: 2.8761 (2.8280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3567 (1.2901)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [242]  [2000/2502]  eta: 0:04:02  lr: 0.000399  min_lr: 0.000399  loss: 2.9801 (2.8313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2392 (1.2873)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [242]  [2200/2502]  eta: 0:02:25  lr: 0.000398  min_lr: 0.000398  loss: 3.2162 (2.8368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1893 (1.2830)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [242]  [2400/2502]  eta: 0:00:49  lr: 0.000397  min_lr: 0.000397  loss: 2.9986 (2.8403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2756 (1.2852)  time: 0.4871  data: 0.0005  max mem: 42519
Epoch: [242]  [2501/2502]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.9124 (2.8406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2556 (1.2849)  time: 0.4370  data: 0.0009  max mem: 42519
Epoch: [242] Total time: 0:20:07 (0.4825 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.9124 (2.8629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2556 (1.2849)
Test:  [ 0/50]  eta: 0:04:37  loss: 0.5920 (0.5920)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 5.5572  data: 5.2517  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8218 (0.8434)  acc1: 88.0000 (87.8182)  acc5: 98.0000 (98.0000)  time: 0.7225  data: 0.4781  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8694 (0.8798)  acc1: 85.2000 (86.6286)  acc5: 98.0000 (98.0000)  time: 0.2390  data: 0.0012  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9905 (0.9525)  acc1: 83.2000 (84.9807)  acc5: 97.2000 (97.1097)  time: 0.2388  data: 0.0016  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1624 (1.0080)  acc1: 79.2000 (83.3659)  acc5: 94.8000 (96.6342)  time: 0.2396  data: 0.0008  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1624 (1.0181)  acc1: 78.0000 (82.8800)  acc5: 95.6000 (96.5600)  time: 0.2420  data: 0.0033  max mem: 42519
Test: Total time: 0:00:17 (0.3490 s / it)
* Acc@1 83.220 Acc@5 96.570 loss 1.010
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.22%
Epoch: [243]  [   0/2502]  eta: 1:25:51  lr: 0.000396  min_lr: 0.000396  loss: 2.9813 (2.9813)  weight_decay: 0.0500 (0.0500)  time: 2.0589  data: 1.5690  max mem: 42519
Epoch: [243]  [ 200/2502]  eta: 0:18:42  lr: 0.000395  min_lr: 0.000395  loss: 2.7731 (2.8278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2306 (1.2584)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [243]  [ 400/2502]  eta: 0:16:57  lr: 0.000394  min_lr: 0.000394  loss: 2.8553 (2.8125)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2565 (1.2520)  time: 0.4816  data: 0.0006  max mem: 42519
Epoch: [243]  [ 600/2502]  eta: 0:15:18  lr: 0.000393  min_lr: 0.000393  loss: 3.0535 (2.8317)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1776 (1.2921)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [243]  [ 800/2502]  eta: 0:13:40  lr: 0.000392  min_lr: 0.000392  loss: 2.9500 (2.8464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2047 (1.2950)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [243]  [1000/2502]  eta: 0:12:03  lr: 0.000391  min_lr: 0.000391  loss: 2.7717 (2.8548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2057 (1.2826)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [243]  [1200/2502]  eta: 0:10:27  lr: 0.000390  min_lr: 0.000390  loss: 2.9686 (2.8560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2519 (1.2791)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [243]  [1400/2502]  eta: 0:08:50  lr: 0.000389  min_lr: 0.000389  loss: 2.8796 (2.8636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2695 (1.2796)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [243]  [1600/2502]  eta: 0:07:14  lr: 0.000388  min_lr: 0.000388  loss: 2.8665 (2.8637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3080 (1.2821)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [243]  [1800/2502]  eta: 0:05:37  lr: 0.000387  min_lr: 0.000387  loss: 3.0616 (2.8662)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2553 (1.2852)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [243]  [2000/2502]  eta: 0:04:01  lr: 0.000386  min_lr: 0.000386  loss: 2.8078 (2.8628)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2604 (1.2901)  time: 0.4890  data: 0.0004  max mem: 42519
Epoch: [243]  [2200/2502]  eta: 0:02:25  lr: 0.000384  min_lr: 0.000384  loss: 2.8299 (2.8633)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1166 (1.2907)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [243]  [2400/2502]  eta: 0:00:49  lr: 0.000383  min_lr: 0.000383  loss: 3.0019 (2.8663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3045 (1.2893)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [243]  [2501/2502]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.6973 (2.8620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3019 (1.2917)  time: 0.4380  data: 0.0008  max mem: 42519
Epoch: [243] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.6973 (2.8540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3019 (1.2917)
Test:  [ 0/50]  eta: 0:04:24  loss: 0.4656 (0.4656)  acc1: 94.4000 (94.4000)  acc5: 99.6000 (99.6000)  time: 5.2898  data: 4.9814  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.6917 (0.7107)  acc1: 88.8000 (88.1091)  acc5: 98.4000 (98.1455)  time: 0.7169  data: 0.4671  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.7471 (0.7515)  acc1: 86.0000 (87.0286)  acc5: 98.0000 (97.8857)  time: 0.2506  data: 0.0081  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.8753 (0.8326)  acc1: 83.6000 (84.9936)  acc5: 96.4000 (97.0452)  time: 0.2418  data: 0.0014  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0523 (0.8863)  acc1: 79.6000 (83.5317)  acc5: 95.2000 (96.6146)  time: 0.2402  data: 0.0012  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0252 (0.8998)  acc1: 78.8000 (82.9840)  acc5: 95.6000 (96.5200)  time: 0.2379  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3479 s / it)
* Acc@1 83.290 Acc@5 96.606 loss 0.891
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.29%
Epoch: [244]  [   0/2502]  eta: 1:34:53  lr: 0.000383  min_lr: 0.000383  loss: 3.1217 (3.1217)  weight_decay: 0.0500 (0.0500)  time: 2.2757  data: 1.7838  max mem: 42519
Epoch: [244]  [ 200/2502]  eta: 0:18:52  lr: 0.000382  min_lr: 0.000382  loss: 2.8894 (2.8670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2884 (1.2570)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [244]  [ 400/2502]  eta: 0:17:01  lr: 0.000381  min_lr: 0.000381  loss: 2.9176 (2.8747)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1957 (1.2599)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [244]  [ 600/2502]  eta: 0:15:20  lr: 0.000380  min_lr: 0.000380  loss: 2.8372 (2.8625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2384 (1.2628)  time: 0.4865  data: 0.0004  max mem: 42519
Epoch: [244]  [ 800/2502]  eta: 0:13:42  lr: 0.000379  min_lr: 0.000379  loss: 3.0221 (2.8579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3196 (1.2846)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [244]  [1000/2502]  eta: 0:12:05  lr: 0.000378  min_lr: 0.000378  loss: 2.9096 (2.8646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2976 (1.2995)  time: 0.4818  data: 0.0004  max mem: 42519
Epoch: [244]  [1200/2502]  eta: 0:10:28  lr: 0.000377  min_lr: 0.000377  loss: 3.0732 (2.8625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2799 (1.3022)  time: 0.4878  data: 0.0005  max mem: 42519
Epoch: [244]  [1400/2502]  eta: 0:08:51  lr: 0.000376  min_lr: 0.000376  loss: 3.1074 (2.8647)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2664 (1.3017)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [244]  [1600/2502]  eta: 0:07:14  lr: 0.000374  min_lr: 0.000374  loss: 3.0268 (2.8629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2227 (1.2973)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [244]  [1800/2502]  eta: 0:05:38  lr: 0.000373  min_lr: 0.000373  loss: 2.9740 (2.8643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4051 (1.3047)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [244]  [2000/2502]  eta: 0:04:02  lr: 0.000372  min_lr: 0.000372  loss: 2.8011 (2.8641)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3892 (1.3072)  time: 0.4845  data: 0.0004  max mem: 42519
Epoch: [244]  [2200/2502]  eta: 0:02:25  lr: 0.000371  min_lr: 0.000371  loss: 3.0743 (2.8696)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2665 (1.3040)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [244]  [2400/2502]  eta: 0:00:49  lr: 0.000370  min_lr: 0.000370  loss: 2.7169 (2.8634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3600 (1.3085)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [244]  [2501/2502]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.5117 (2.8614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2896 (1.3122)  time: 0.4342  data: 0.0007  max mem: 42519
Epoch: [244] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.5117 (2.8580)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2896 (1.3122)
Test:  [ 0/50]  eta: 0:04:24  loss: 0.5152 (0.5152)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 5.2953  data: 5.0216  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.7645 (0.7433)  acc1: 88.8000 (88.1818)  acc5: 98.4000 (98.1455)  time: 0.6960  data: 0.4569  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.7655 (0.7755)  acc1: 85.6000 (87.0286)  acc5: 98.0000 (98.0381)  time: 0.2367  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9046 (0.8574)  acc1: 82.4000 (84.9936)  acc5: 96.4000 (97.2129)  time: 0.2377  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0623 (0.9135)  acc1: 79.6000 (83.5512)  acc5: 95.2000 (96.7805)  time: 0.2371  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0382 (0.9267)  acc1: 78.4000 (82.9840)  acc5: 95.2000 (96.6320)  time: 0.2359  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3397 s / it)
* Acc@1 83.442 Acc@5 96.632 loss 0.919
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.44%
Epoch: [245]  [   0/2502]  eta: 1:29:06  lr: 0.000370  min_lr: 0.000370  loss: 2.8808 (2.8808)  weight_decay: 0.0500 (0.0500)  time: 2.1368  data: 1.6524  max mem: 42519
Epoch: [245]  [ 200/2502]  eta: 0:18:44  lr: 0.000369  min_lr: 0.000369  loss: 2.8114 (2.8253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2159 (1.3115)  time: 0.4820  data: 0.0005  max mem: 42519
Epoch: [245]  [ 400/2502]  eta: 0:16:58  lr: 0.000368  min_lr: 0.000368  loss: 2.9810 (2.8199)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2911 (1.3142)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [245]  [ 600/2502]  eta: 0:15:19  lr: 0.000367  min_lr: 0.000367  loss: 2.9510 (2.8337)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2787 (inf)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [245]  [ 800/2502]  eta: 0:13:41  lr: 0.000366  min_lr: 0.000366  loss: 3.0046 (2.8345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1879 (inf)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [245]  [1000/2502]  eta: 0:12:04  lr: 0.000365  min_lr: 0.000365  loss: 2.9489 (2.8328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2505 (inf)  time: 0.4863  data: 0.0004  max mem: 42519
Epoch: [245]  [1200/2502]  eta: 0:10:27  lr: 0.000364  min_lr: 0.000364  loss: 2.8550 (2.8370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2754 (inf)  time: 0.4817  data: 0.0005  max mem: 42519
Epoch: [245]  [1400/2502]  eta: 0:08:51  lr: 0.000363  min_lr: 0.000363  loss: 3.0021 (2.8455)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2110 (inf)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [245]  [1600/2502]  eta: 0:07:14  lr: 0.000362  min_lr: 0.000362  loss: 2.9739 (2.8479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2991 (inf)  time: 0.4791  data: 0.0006  max mem: 42519
Epoch: [245]  [1800/2502]  eta: 0:05:38  lr: 0.000360  min_lr: 0.000360  loss: 2.8980 (2.8486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1519 (inf)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [245]  [2000/2502]  eta: 0:04:01  lr: 0.000359  min_lr: 0.000359  loss: 2.9592 (2.8499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2691 (inf)  time: 0.4853  data: 0.0004  max mem: 42519
Epoch: [245]  [2200/2502]  eta: 0:02:25  lr: 0.000358  min_lr: 0.000358  loss: 3.0826 (2.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3469 (inf)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [245]  [2400/2502]  eta: 0:00:49  lr: 0.000357  min_lr: 0.000357  loss: 2.9898 (2.8502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2881 (inf)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [245]  [2501/2502]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 3.0619 (2.8494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2538 (inf)  time: 0.4340  data: 0.0008  max mem: 42519
Epoch: [245] Total time: 0:20:04 (0.4812 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 3.0619 (2.8485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2538 (inf)
Test:  [ 0/50]  eta: 0:03:38  loss: 0.5533 (0.5533)  acc1: 95.2000 (95.2000)  acc5: 99.6000 (99.6000)  time: 4.3628  data: 4.0728  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.7684 (0.8103)  acc1: 89.6000 (87.9636)  acc5: 98.4000 (98.1091)  time: 0.6104  data: 0.3707  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8350 (0.8471)  acc1: 85.6000 (86.8381)  acc5: 98.0000 (98.0000)  time: 0.2396  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9437 (0.9207)  acc1: 82.8000 (85.1097)  acc5: 97.6000 (97.1742)  time: 0.2522  data: 0.0119  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1237 (0.9752)  acc1: 80.4000 (83.6488)  acc5: 95.2000 (96.7220)  time: 0.2532  data: 0.0171  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1237 (0.9855)  acc1: 78.8000 (83.1920)  acc5: 95.6000 (96.6080)  time: 0.2517  data: 0.0169  max mem: 42519
Test: Total time: 0:00:16 (0.3307 s / it)
* Acc@1 83.412 Acc@5 96.688 loss 0.974
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.44%
Epoch: [246]  [   0/2502]  eta: 1:20:29  lr: 0.000357  min_lr: 0.000357  loss: 3.0727 (3.0727)  weight_decay: 0.0500 (0.0500)  time: 1.9302  data: 1.4021  max mem: 42519
Epoch: [246]  [ 200/2502]  eta: 0:18:45  lr: 0.000356  min_lr: 0.000356  loss: 2.9791 (2.8050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2595 (1.3264)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [246]  [ 400/2502]  eta: 0:16:59  lr: 0.000355  min_lr: 0.000355  loss: 2.9212 (2.8114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2527 (1.2959)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [246]  [ 600/2502]  eta: 0:15:20  lr: 0.000354  min_lr: 0.000354  loss: 2.6597 (2.8212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2213 (1.2983)  time: 0.4931  data: 0.0005  max mem: 42519
Epoch: [246]  [ 800/2502]  eta: 0:13:42  lr: 0.000353  min_lr: 0.000353  loss: 2.7640 (2.8121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2610 (1.2924)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [246]  [1000/2502]  eta: 0:12:04  lr: 0.000352  min_lr: 0.000352  loss: 3.1676 (2.8251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3256 (1.3009)  time: 0.4786  data: 0.0006  max mem: 42519
Epoch: [246]  [1200/2502]  eta: 0:10:27  lr: 0.000351  min_lr: 0.000351  loss: 3.0187 (2.8294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2543 (1.3047)  time: 0.4861  data: 0.0005  max mem: 42519
Epoch: [246]  [1400/2502]  eta: 0:08:51  lr: 0.000350  min_lr: 0.000350  loss: 3.1210 (2.8227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3166 (1.3085)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [246]  [1600/2502]  eta: 0:07:14  lr: 0.000349  min_lr: 0.000349  loss: 2.8761 (2.8211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2945 (1.3016)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [246]  [1800/2502]  eta: 0:05:38  lr: 0.000348  min_lr: 0.000348  loss: 3.0121 (2.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3844 (1.3039)  time: 0.4838  data: 0.0005  max mem: 42519
Epoch: [246]  [2000/2502]  eta: 0:04:01  lr: 0.000347  min_lr: 0.000347  loss: 3.1895 (2.8309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2495 (1.3047)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [246]  [2200/2502]  eta: 0:02:25  lr: 0.000346  min_lr: 0.000346  loss: 2.9116 (2.8321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2355 (1.3064)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [246]  [2400/2502]  eta: 0:00:49  lr: 0.000345  min_lr: 0.000345  loss: 2.9823 (2.8323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2040 (1.3039)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [246]  [2501/2502]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 2.9774 (2.8334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3158 (1.3047)  time: 0.4358  data: 0.0007  max mem: 42519
Epoch: [246] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 2.9774 (2.8283)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3158 (1.3047)
Test:  [ 0/50]  eta: 0:02:41  loss: 0.6152 (0.6152)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 3.2226  data: 2.9449  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.8583 (0.8571)  acc1: 87.6000 (87.8545)  acc5: 98.8000 (98.2182)  time: 0.5307  data: 0.2900  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8781 (0.8899)  acc1: 85.6000 (87.0095)  acc5: 98.0000 (98.0952)  time: 0.2528  data: 0.0126  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9881 (0.9680)  acc1: 83.2000 (85.1484)  acc5: 97.2000 (97.3677)  time: 0.2491  data: 0.0084  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1707 (1.0238)  acc1: 79.6000 (83.6976)  acc5: 95.6000 (96.8585)  time: 0.2727  data: 0.0352  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1543 (1.0362)  acc1: 79.2000 (83.2240)  acc5: 95.6000 (96.7680)  time: 0.2643  data: 0.0273  max mem: 42519
Test: Total time: 0:00:15 (0.3194 s / it)
* Acc@1 83.422 Acc@5 96.698 loss 1.032
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.44%
Epoch: [247]  [   0/2502]  eta: 1:33:10  lr: 0.000344  min_lr: 0.000344  loss: 2.7035 (2.7035)  weight_decay: 0.0500 (0.0500)  time: 2.2344  data: 1.7507  max mem: 42519
Epoch: [247]  [ 200/2502]  eta: 0:18:45  lr: 0.000343  min_lr: 0.000343  loss: 2.7920 (2.8093)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2055 (1.3237)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [247]  [ 400/2502]  eta: 0:17:01  lr: 0.000342  min_lr: 0.000342  loss: 2.9337 (2.8342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3250 (1.3827)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [247]  [ 600/2502]  eta: 0:15:21  lr: 0.000341  min_lr: 0.000341  loss: 2.9705 (2.8312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3296 (1.3681)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [247]  [ 800/2502]  eta: 0:13:42  lr: 0.000340  min_lr: 0.000340  loss: 2.8302 (2.8237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2460 (1.3517)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [247]  [1000/2502]  eta: 0:12:05  lr: 0.000339  min_lr: 0.000339  loss: 2.9965 (2.8323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3148 (1.3544)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [247]  [1200/2502]  eta: 0:10:28  lr: 0.000338  min_lr: 0.000338  loss: 2.9013 (2.8287)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2998 (1.3547)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [247]  [1400/2502]  eta: 0:08:51  lr: 0.000337  min_lr: 0.000337  loss: 2.8589 (2.8249)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4007 (1.3560)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [247]  [1600/2502]  eta: 0:07:15  lr: 0.000336  min_lr: 0.000336  loss: 2.9972 (2.8215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3122 (1.3619)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [247]  [1800/2502]  eta: 0:05:38  lr: 0.000335  min_lr: 0.000335  loss: 2.8695 (2.8199)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2592 (1.3557)  time: 0.4824  data: 0.0005  max mem: 42519
Epoch: [247]  [2000/2502]  eta: 0:04:02  lr: 0.000334  min_lr: 0.000334  loss: 2.8099 (2.8225)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3304 (1.3585)  time: 0.4913  data: 0.0005  max mem: 42519
Epoch: [247]  [2200/2502]  eta: 0:02:25  lr: 0.000333  min_lr: 0.000333  loss: 2.9986 (2.8258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2498 (1.3529)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [247]  [2400/2502]  eta: 0:00:49  lr: 0.000332  min_lr: 0.000332  loss: 2.9218 (2.8261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3045 (1.3525)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [247]  [2501/2502]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.8827 (2.8254)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2699 (1.3511)  time: 0.4367  data: 0.0008  max mem: 42519
Epoch: [247] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.8827 (2.8364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2699 (1.3511)
Test:  [ 0/50]  eta: 0:04:07  loss: 0.4492 (0.4492)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 4.9480  data: 4.6613  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 0.7694 (0.7371)  acc1: 88.8000 (87.9636)  acc5: 98.8000 (98.2909)  time: 0.6661  data: 0.4241  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.7799 (0.7637)  acc1: 84.8000 (87.0857)  acc5: 98.4000 (98.1143)  time: 0.2408  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9137 (0.8430)  acc1: 83.2000 (85.2774)  acc5: 96.8000 (97.3548)  time: 0.2410  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0569 (0.9015)  acc1: 79.6000 (83.6585)  acc5: 95.6000 (96.8000)  time: 0.2378  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0347 (0.9166)  acc1: 77.6000 (83.0880)  acc5: 95.6000 (96.7440)  time: 0.2374  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3356 s / it)
* Acc@1 83.368 Acc@5 96.770 loss 0.907
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.44%
Epoch: [248]  [   0/2502]  eta: 1:42:02  lr: 0.000332  min_lr: 0.000332  loss: 2.1690 (2.1690)  weight_decay: 0.0500 (0.0500)  time: 2.4472  data: 1.9355  max mem: 42519
Epoch: [248]  [ 200/2502]  eta: 0:18:51  lr: 0.000331  min_lr: 0.000331  loss: 2.8873 (2.8423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2843 (1.3219)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [248]  [ 400/2502]  eta: 0:17:01  lr: 0.000330  min_lr: 0.000330  loss: 2.9712 (2.8025)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2810 (1.2921)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [248]  [ 600/2502]  eta: 0:15:20  lr: 0.000329  min_lr: 0.000329  loss: 3.0060 (2.7985)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3425 (1.3271)  time: 0.4775  data: 0.0004  max mem: 42519
Epoch: [248]  [ 800/2502]  eta: 0:13:42  lr: 0.000328  min_lr: 0.000328  loss: 2.8293 (2.7998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3369 (1.3391)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [248]  [1000/2502]  eta: 0:12:04  lr: 0.000327  min_lr: 0.000327  loss: 2.8842 (2.8039)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3269 (1.3488)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [248]  [1200/2502]  eta: 0:10:28  lr: 0.000326  min_lr: 0.000326  loss: 2.9431 (2.8109)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2833 (1.3480)  time: 0.4789  data: 0.0003  max mem: 42519
Epoch: [248]  [1400/2502]  eta: 0:08:51  lr: 0.000325  min_lr: 0.000325  loss: 2.9946 (2.8081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2790 (1.3433)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [248]  [1600/2502]  eta: 0:07:14  lr: 0.000324  min_lr: 0.000324  loss: 2.9409 (2.8064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2844 (1.3423)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [248]  [1800/2502]  eta: 0:05:38  lr: 0.000323  min_lr: 0.000323  loss: 2.6446 (2.8074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3896 (1.3423)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [248]  [2000/2502]  eta: 0:04:02  lr: 0.000322  min_lr: 0.000322  loss: 2.8884 (2.8136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3385 (inf)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [248]  [2200/2502]  eta: 0:02:25  lr: 0.000321  min_lr: 0.000321  loss: 3.0015 (2.8177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2894 (inf)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [248]  [2400/2502]  eta: 0:00:49  lr: 0.000320  min_lr: 0.000320  loss: 2.9487 (2.8221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2709 (inf)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [248]  [2501/2502]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 2.8702 (2.8221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2553 (inf)  time: 0.4388  data: 0.0010  max mem: 42519
Epoch: [248] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 2.8702 (2.8231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2553 (inf)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.5271 (0.5271)  acc1: 93.6000 (93.6000)  acc5: 99.2000 (99.2000)  time: 2.7946  data: 2.5279  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.7986 (0.7734)  acc1: 88.4000 (88.2182)  acc5: 98.4000 (98.2182)  time: 0.4699  data: 0.2302  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8130 (0.8080)  acc1: 84.8000 (87.2952)  acc5: 98.4000 (98.0952)  time: 0.2384  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9162 (0.8898)  acc1: 82.8000 (85.3290)  acc5: 97.2000 (97.2903)  time: 0.2385  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1114 (0.9453)  acc1: 79.6000 (83.6585)  acc5: 95.2000 (96.7805)  time: 0.2374  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0823 (0.9608)  acc1: 78.8000 (83.0800)  acc5: 95.2000 (96.6480)  time: 0.2371  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2905 s / it)
* Acc@1 83.462 Acc@5 96.688 loss 0.950
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.46%
Epoch: [249]  [   0/2502]  eta: 1:17:27  lr: 0.000320  min_lr: 0.000320  loss: 2.8917 (2.8917)  weight_decay: 0.0500 (0.0500)  time: 1.8576  data: 1.3696  max mem: 42519
Epoch: [249]  [ 200/2502]  eta: 0:18:40  lr: 0.000319  min_lr: 0.000319  loss: 2.8651 (2.7989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2721 (1.3057)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [249]  [ 400/2502]  eta: 0:16:58  lr: 0.000318  min_lr: 0.000318  loss: 2.9953 (2.8162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2712 (1.3209)  time: 0.4885  data: 0.0005  max mem: 42519
Epoch: [249]  [ 600/2502]  eta: 0:15:19  lr: 0.000317  min_lr: 0.000317  loss: 2.8855 (2.8217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2886 (1.3216)  time: 0.4778  data: 0.0005  max mem: 42519
Epoch: [249]  [ 800/2502]  eta: 0:13:41  lr: 0.000316  min_lr: 0.000316  loss: 2.8815 (2.8211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2254 (1.3244)  time: 0.4782  data: 0.0006  max mem: 42519
Epoch: [249]  [1000/2502]  eta: 0:12:04  lr: 0.000315  min_lr: 0.000315  loss: 3.0014 (2.8152)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2795 (1.3279)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [249]  [1200/2502]  eta: 0:10:27  lr: 0.000314  min_lr: 0.000314  loss: 2.7945 (2.8166)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3273 (1.3321)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [249]  [1400/2502]  eta: 0:08:51  lr: 0.000313  min_lr: 0.000313  loss: 2.9772 (2.8172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3867 (1.3339)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [249]  [1600/2502]  eta: 0:07:14  lr: 0.000312  min_lr: 0.000312  loss: 2.8591 (2.8202)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2769 (1.3294)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [249]  [1800/2502]  eta: 0:05:38  lr: 0.000311  min_lr: 0.000311  loss: 2.7490 (2.8176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2518 (1.3321)  time: 0.4905  data: 0.0005  max mem: 42519
Epoch: [249]  [2000/2502]  eta: 0:04:01  lr: 0.000310  min_lr: 0.000310  loss: 3.0759 (2.8206)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2952 (1.3318)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [249]  [2200/2502]  eta: 0:02:25  lr: 0.000309  min_lr: 0.000309  loss: 2.6170 (2.8178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4567 (1.3339)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [249]  [2400/2502]  eta: 0:00:49  lr: 0.000308  min_lr: 0.000308  loss: 2.9784 (2.8159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2911 (1.3354)  time: 0.4804  data: 0.0006  max mem: 42519
Epoch: [249]  [2501/2502]  eta: 0:00:00  lr: 0.000307  min_lr: 0.000307  loss: 3.0467 (2.8139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3818 (1.3386)  time: 0.4325  data: 0.0008  max mem: 42519
Epoch: [249] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000307  min_lr: 0.000307  loss: 3.0467 (2.8151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3818 (1.3386)
Test:  [ 0/50]  eta: 0:04:33  loss: 0.5593 (0.5593)  acc1: 94.0000 (94.0000)  acc5: 99.2000 (99.2000)  time: 5.4663  data: 5.1689  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.8143 (0.8011)  acc1: 87.6000 (88.0364)  acc5: 98.4000 (98.2545)  time: 0.7141  data: 0.4704  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8307 (0.8360)  acc1: 85.2000 (87.1048)  acc5: 98.4000 (98.2476)  time: 0.2383  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9498 (0.9155)  acc1: 84.0000 (85.2774)  acc5: 97.2000 (97.4452)  time: 0.2387  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1218 (0.9705)  acc1: 79.6000 (83.8049)  acc5: 95.2000 (96.9659)  time: 0.2409  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1218 (0.9872)  acc1: 79.6000 (83.2080)  acc5: 96.0000 (96.8880)  time: 0.2397  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3465 s / it)
* Acc@1 83.516 Acc@5 96.776 loss 0.983
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.52%
Epoch: [250]  [   0/2502]  eta: 1:37:30  lr: 0.000307  min_lr: 0.000307  loss: 3.4132 (3.4132)  weight_decay: 0.0500 (0.0500)  time: 2.3383  data: 1.8471  max mem: 42519
Epoch: [250]  [ 200/2502]  eta: 0:18:47  lr: 0.000307  min_lr: 0.000307  loss: 2.6827 (2.7731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1977 (1.2912)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [250]  [ 400/2502]  eta: 0:17:01  lr: 0.000306  min_lr: 0.000306  loss: 2.6960 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2953 (1.3473)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [250]  [ 600/2502]  eta: 0:15:20  lr: 0.000305  min_lr: 0.000305  loss: 2.9340 (2.8103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3672 (1.3545)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [250]  [ 800/2502]  eta: 0:13:42  lr: 0.000304  min_lr: 0.000304  loss: 2.5790 (2.8088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2833 (1.3617)  time: 0.4862  data: 0.0005  max mem: 42519
Epoch: [250]  [1000/2502]  eta: 0:12:04  lr: 0.000303  min_lr: 0.000303  loss: 2.8039 (2.8083)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3189 (1.3578)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [250]  [1200/2502]  eta: 0:10:27  lr: 0.000302  min_lr: 0.000302  loss: 2.7959 (2.8056)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3651 (1.3597)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [250]  [1400/2502]  eta: 0:08:51  lr: 0.000301  min_lr: 0.000301  loss: 3.0123 (2.8020)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3567 (1.3665)  time: 0.4920  data: 0.0004  max mem: 42519
Epoch: [250]  [1600/2502]  eta: 0:07:14  lr: 0.000300  min_lr: 0.000300  loss: 2.9981 (2.8002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3527 (1.3657)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [250]  [1800/2502]  eta: 0:05:38  lr: 0.000299  min_lr: 0.000299  loss: 2.8802 (2.8014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2749 (1.3668)  time: 0.4879  data: 0.0004  max mem: 42519
Epoch: [250]  [2000/2502]  eta: 0:04:01  lr: 0.000298  min_lr: 0.000298  loss: 2.8742 (2.8030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3104 (1.3626)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [250]  [2200/2502]  eta: 0:02:25  lr: 0.000297  min_lr: 0.000297  loss: 2.8231 (2.8073)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2427 (1.3573)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [250]  [2400/2502]  eta: 0:00:49  lr: 0.000296  min_lr: 0.000296  loss: 2.9944 (2.8065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3500 (1.3570)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [250]  [2501/2502]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.9991 (2.8063)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3151 (1.3564)  time: 0.4347  data: 0.0007  max mem: 42519
Epoch: [250] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.9991 (2.8133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3151 (1.3564)
Test:  [ 0/50]  eta: 0:04:57  loss: 0.5673 (0.5673)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 5.9452  data: 5.6569  max mem: 42519
Test:  [10/50]  eta: 0:00:30  loss: 0.8504 (0.8317)  acc1: 88.4000 (87.6000)  acc5: 98.4000 (98.1455)  time: 0.7561  data: 0.5147  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8636 (0.8574)  acc1: 84.8000 (86.8952)  acc5: 98.4000 (98.0952)  time: 0.2374  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9567 (0.9348)  acc1: 83.2000 (85.0452)  acc5: 96.8000 (97.3161)  time: 0.2429  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1186 (0.9841)  acc1: 80.0000 (83.6976)  acc5: 95.2000 (96.8293)  time: 0.2444  data: 0.0005  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1175 (0.9991)  acc1: 78.8000 (83.1360)  acc5: 95.2000 (96.7280)  time: 0.2389  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3562 s / it)
* Acc@1 83.562 Acc@5 96.714 loss 0.994
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.56%
Epoch: [251]  [   0/2502]  eta: 1:27:39  lr: 0.000296  min_lr: 0.000296  loss: 2.4565 (2.4565)  weight_decay: 0.0500 (0.0500)  time: 2.1020  data: 1.6017  max mem: 42519
Epoch: [251]  [ 200/2502]  eta: 0:18:43  lr: 0.000295  min_lr: 0.000295  loss: 3.0042 (2.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3377 (1.3733)  time: 0.4796  data: 0.0006  max mem: 42519
Epoch: [251]  [ 400/2502]  eta: 0:16:58  lr: 0.000294  min_lr: 0.000294  loss: 2.8656 (2.7787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2936 (1.3714)  time: 0.4881  data: 0.0004  max mem: 42519
Epoch: [251]  [ 600/2502]  eta: 0:15:20  lr: 0.000293  min_lr: 0.000293  loss: 2.9932 (2.7929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3646 (1.3646)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [251]  [ 800/2502]  eta: 0:13:42  lr: 0.000292  min_lr: 0.000292  loss: 2.9546 (2.7848)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2394 (1.3619)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [251]  [1000/2502]  eta: 0:12:05  lr: 0.000291  min_lr: 0.000291  loss: 2.8601 (2.7957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2391 (1.3602)  time: 0.4883  data: 0.0005  max mem: 42519
Epoch: [251]  [1200/2502]  eta: 0:10:28  lr: 0.000290  min_lr: 0.000290  loss: 2.9632 (2.7966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3790 (1.3644)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [251]  [1400/2502]  eta: 0:08:51  lr: 0.000289  min_lr: 0.000289  loss: 3.0801 (2.8015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3379 (1.3598)  time: 0.4802  data: 0.0006  max mem: 42519
Epoch: [251]  [1600/2502]  eta: 0:07:15  lr: 0.000288  min_lr: 0.000288  loss: 3.0301 (2.8036)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3690 (1.3672)  time: 0.4843  data: 0.0005  max mem: 42519
Epoch: [251]  [1800/2502]  eta: 0:05:38  lr: 0.000287  min_lr: 0.000287  loss: 2.8769 (2.7990)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3994 (1.3653)  time: 0.4810  data: 0.0006  max mem: 42519
Epoch: [251]  [2000/2502]  eta: 0:04:02  lr: 0.000286  min_lr: 0.000286  loss: 2.9592 (2.8009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4504 (1.3732)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [251]  [2200/2502]  eta: 0:02:25  lr: 0.000285  min_lr: 0.000285  loss: 2.9777 (2.7998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3260 (1.3710)  time: 0.4893  data: 0.0005  max mem: 42519
Epoch: [251]  [2400/2502]  eta: 0:00:49  lr: 0.000284  min_lr: 0.000284  loss: 3.0080 (2.7947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3705 (1.3738)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [251]  [2501/2502]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.9440 (2.7947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3494 (1.3735)  time: 0.4367  data: 0.0007  max mem: 42519
Epoch: [251] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.9440 (2.8078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3494 (1.3735)
Test:  [ 0/50]  eta: 0:03:16  loss: 0.5831 (0.5831)  acc1: 94.8000 (94.8000)  acc5: 99.6000 (99.6000)  time: 3.9377  data: 3.6498  max mem: 42519
Test:  [10/50]  eta: 0:00:24  loss: 0.8351 (0.8281)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 0.6161  data: 0.3757  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8784 (0.8607)  acc1: 85.6000 (87.3524)  acc5: 98.0000 (97.8667)  time: 0.2641  data: 0.0245  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9684 (0.9356)  acc1: 84.4000 (85.6387)  acc5: 97.2000 (97.2258)  time: 0.2412  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1343 (0.9861)  acc1: 80.4000 (84.1366)  acc5: 95.6000 (96.7512)  time: 0.2370  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1260 (1.0012)  acc1: 79.2000 (83.5280)  acc5: 95.6000 (96.6880)  time: 0.2359  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3240 s / it)
* Acc@1 83.626 Acc@5 96.796 loss 0.996
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.63%
Epoch: [252]  [   0/2502]  eta: 1:20:33  lr: 0.000284  min_lr: 0.000284  loss: 3.0481 (3.0481)  weight_decay: 0.0500 (0.0500)  time: 1.9318  data: 1.4530  max mem: 42519
Epoch: [252]  [ 200/2502]  eta: 0:18:42  lr: 0.000283  min_lr: 0.000283  loss: 2.8628 (2.7847)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2451 (1.3315)  time: 0.4776  data: 0.0004  max mem: 42519
Epoch: [252]  [ 400/2502]  eta: 0:16:57  lr: 0.000282  min_lr: 0.000282  loss: 2.8641 (2.7832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2715 (1.3387)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [252]  [ 600/2502]  eta: 0:15:18  lr: 0.000281  min_lr: 0.000281  loss: 2.9319 (2.7796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3757 (1.3454)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [252]  [ 800/2502]  eta: 0:13:40  lr: 0.000280  min_lr: 0.000280  loss: 2.7390 (2.7799)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3302 (1.3508)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [252]  [1000/2502]  eta: 0:12:03  lr: 0.000279  min_lr: 0.000279  loss: 3.0620 (2.7887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2526 (1.3494)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [252]  [1200/2502]  eta: 0:10:27  lr: 0.000279  min_lr: 0.000279  loss: 2.8210 (2.7872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3158 (1.3532)  time: 0.4844  data: 0.0004  max mem: 42519
Epoch: [252]  [1400/2502]  eta: 0:08:50  lr: 0.000278  min_lr: 0.000278  loss: 3.0310 (2.7925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3216 (1.3497)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [252]  [1600/2502]  eta: 0:07:14  lr: 0.000277  min_lr: 0.000277  loss: 2.9632 (2.8035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3405 (1.3552)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [252]  [1800/2502]  eta: 0:05:37  lr: 0.000276  min_lr: 0.000276  loss: 2.7644 (2.7993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2984 (1.3660)  time: 0.4877  data: 0.0005  max mem: 42519
Epoch: [252]  [2000/2502]  eta: 0:04:01  lr: 0.000275  min_lr: 0.000275  loss: 2.8248 (2.8025)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3824 (1.3707)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [252]  [2200/2502]  eta: 0:02:25  lr: 0.000274  min_lr: 0.000274  loss: 2.8435 (2.7992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3517 (1.3750)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [252]  [2400/2502]  eta: 0:00:49  lr: 0.000273  min_lr: 0.000273  loss: 2.9386 (2.7977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2765 (1.3758)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [252]  [2501/2502]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.9906 (2.7973)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2532 (1.3746)  time: 0.4323  data: 0.0007  max mem: 42519
Epoch: [252] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.9906 (2.7962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2532 (1.3746)
Test:  [ 0/50]  eta: 0:04:15  loss: 0.5609 (0.5609)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 5.1135  data: 4.8116  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.7861 (0.8161)  acc1: 89.2000 (88.0727)  acc5: 98.4000 (98.2546)  time: 0.6840  data: 0.4430  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8578 (0.8458)  acc1: 85.6000 (87.2191)  acc5: 98.0000 (98.0762)  time: 0.2392  data: 0.0033  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9803 (0.9284)  acc1: 83.6000 (85.4710)  acc5: 96.8000 (97.3936)  time: 0.2386  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1273 (0.9855)  acc1: 79.6000 (83.9122)  acc5: 95.6000 (96.9366)  time: 0.2521  data: 0.0149  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1471 (1.0023)  acc1: 78.8000 (83.2880)  acc5: 95.6000 (96.8400)  time: 0.2497  data: 0.0147  max mem: 42519
Test: Total time: 0:00:17 (0.3448 s / it)
* Acc@1 83.448 Acc@5 96.784 loss 0.997
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.63%
Epoch: [253]  [   0/2502]  eta: 1:46:12  lr: 0.000273  min_lr: 0.000273  loss: 2.0701 (2.0701)  weight_decay: 0.0500 (0.0500)  time: 2.5468  data: 1.5981  max mem: 42519
Epoch: [253]  [ 200/2502]  eta: 0:18:53  lr: 0.000272  min_lr: 0.000272  loss: 2.7773 (2.7684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4303 (1.3901)  time: 0.4828  data: 0.0005  max mem: 42519
Epoch: [253]  [ 400/2502]  eta: 0:17:04  lr: 0.000271  min_lr: 0.000271  loss: 2.9102 (2.7796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4064 (1.4011)  time: 0.4795  data: 0.0006  max mem: 42519
Epoch: [253]  [ 600/2502]  eta: 0:15:24  lr: 0.000270  min_lr: 0.000270  loss: 2.9113 (2.8138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3433 (1.3826)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [253]  [ 800/2502]  eta: 0:13:45  lr: 0.000269  min_lr: 0.000269  loss: 2.8854 (2.8119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3201 (1.3854)  time: 0.4798  data: 0.0006  max mem: 42519
Epoch: [253]  [1000/2502]  eta: 0:12:07  lr: 0.000268  min_lr: 0.000268  loss: 2.7581 (2.7952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3780 (1.3937)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [253]  [1200/2502]  eta: 0:10:29  lr: 0.000267  min_lr: 0.000267  loss: 2.9857 (2.7978)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3566 (1.3914)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [253]  [1400/2502]  eta: 0:08:52  lr: 0.000266  min_lr: 0.000266  loss: 2.9012 (2.7999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2836 (1.3871)  time: 0.4823  data: 0.0006  max mem: 42519
Epoch: [253]  [1600/2502]  eta: 0:07:15  lr: 0.000265  min_lr: 0.000265  loss: 2.9163 (2.8016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3334 (1.3907)  time: 0.4779  data: 0.0006  max mem: 42519
Epoch: [253]  [1800/2502]  eta: 0:05:38  lr: 0.000265  min_lr: 0.000265  loss: 2.9038 (2.8029)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3416 (1.3887)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [253]  [2000/2502]  eta: 0:04:02  lr: 0.000264  min_lr: 0.000264  loss: 2.8160 (2.7997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3135 (1.3900)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [253]  [2200/2502]  eta: 0:02:25  lr: 0.000263  min_lr: 0.000263  loss: 2.7720 (2.8006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4226 (1.3928)  time: 0.4872  data: 0.0004  max mem: 42519
Epoch: [253]  [2400/2502]  eta: 0:00:49  lr: 0.000262  min_lr: 0.000262  loss: 2.9212 (2.7998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3030 (1.3877)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [253]  [2501/2502]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.9765 (2.8009)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3084 (1.3861)  time: 0.4363  data: 0.0008  max mem: 42519
Epoch: [253] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.9765 (2.7950)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3084 (1.3861)
Test:  [ 0/50]  eta: 0:02:17  loss: 0.4832 (0.4832)  acc1: 94.8000 (94.8000)  acc5: 99.6000 (99.6000)  time: 2.7563  data: 2.4771  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.7403 (0.7389)  acc1: 88.8000 (88.2909)  acc5: 98.8000 (98.4364)  time: 0.4699  data: 0.2256  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.7775 (0.7886)  acc1: 85.6000 (86.9905)  acc5: 98.0000 (98.1524)  time: 0.2456  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9076 (0.8705)  acc1: 83.6000 (85.2774)  acc5: 96.8000 (97.2000)  time: 0.2471  data: 0.0023  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0616 (0.9182)  acc1: 80.4000 (83.8634)  acc5: 94.8000 (96.7317)  time: 0.2470  data: 0.0081  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0601 (0.9311)  acc1: 78.4000 (83.2960)  acc5: 95.2000 (96.6880)  time: 0.2455  data: 0.0078  max mem: 42519
Test: Total time: 0:00:14 (0.2979 s / it)
* Acc@1 83.690 Acc@5 96.768 loss 0.922
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.69%
Epoch: [254]  [   0/2502]  eta: 1:15:42  lr: 0.000261  min_lr: 0.000261  loss: 3.1661 (3.1661)  weight_decay: 0.0500 (0.0500)  time: 1.8155  data: 1.3374  max mem: 42519
Epoch: [254]  [ 200/2502]  eta: 0:18:45  lr: 0.000261  min_lr: 0.000261  loss: 2.9908 (2.7526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3645 (1.3770)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [254]  [ 400/2502]  eta: 0:17:00  lr: 0.000260  min_lr: 0.000260  loss: 2.9615 (2.7733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4356 (1.3859)  time: 0.4835  data: 0.0004  max mem: 42519
Epoch: [254]  [ 600/2502]  eta: 0:15:20  lr: 0.000259  min_lr: 0.000259  loss: 2.9973 (2.7902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3819 (1.3970)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [254]  [ 800/2502]  eta: 0:13:43  lr: 0.000258  min_lr: 0.000258  loss: 2.9589 (2.8026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3419 (1.3892)  time: 0.4898  data: 0.0005  max mem: 42519
Epoch: [254]  [1000/2502]  eta: 0:12:05  lr: 0.000257  min_lr: 0.000257  loss: 2.6965 (2.7998)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3769 (1.3874)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [254]  [1200/2502]  eta: 0:10:27  lr: 0.000256  min_lr: 0.000256  loss: 2.8817 (2.7930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3156 (1.3920)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [254]  [1400/2502]  eta: 0:08:51  lr: 0.000255  min_lr: 0.000255  loss: 2.9887 (2.7988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4859 (1.3970)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [254]  [1600/2502]  eta: 0:07:14  lr: 0.000254  min_lr: 0.000254  loss: 2.9335 (2.7953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5312 (1.4068)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [254]  [1800/2502]  eta: 0:05:38  lr: 0.000254  min_lr: 0.000254  loss: 2.8104 (2.7942)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [254]  [2000/2502]  eta: 0:04:01  lr: 0.000253  min_lr: 0.000253  loss: 2.8105 (2.7932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3146 (nan)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [254]  [2200/2502]  eta: 0:02:25  lr: 0.000252  min_lr: 0.000252  loss: 3.0357 (2.7963)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4036 (nan)  time: 0.4808  data: 0.0005  max mem: 42519
Epoch: [254]  [2400/2502]  eta: 0:00:49  lr: 0.000251  min_lr: 0.000251  loss: 2.7781 (2.7961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4505 (nan)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [254]  [2501/2502]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.9949 (2.7977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4047 (nan)  time: 0.4356  data: 0.0008  max mem: 42519
Epoch: [254] Total time: 0:20:05 (0.4816 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.9949 (2.7803)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4047 (nan)
Test:  [ 0/50]  eta: 0:02:47  loss: 0.5555 (0.5555)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 3.3483  data: 3.0727  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.8124 (0.8079)  acc1: 88.8000 (88.0364)  acc5: 98.8000 (98.1818)  time: 0.5436  data: 0.3019  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8307 (0.8382)  acc1: 84.4000 (87.1810)  acc5: 98.4000 (98.0571)  time: 0.2534  data: 0.0127  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9898 (0.9214)  acc1: 82.8000 (85.2774)  acc5: 96.8000 (97.2774)  time: 0.2612  data: 0.0210  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1004 (0.9676)  acc1: 80.4000 (84.0390)  acc5: 96.0000 (96.9171)  time: 0.2688  data: 0.0296  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0987 (0.9797)  acc1: 79.6000 (83.4880)  acc5: 96.0000 (96.8560)  time: 0.2481  data: 0.0090  max mem: 42519
Test: Total time: 0:00:16 (0.3206 s / it)
* Acc@1 83.700 Acc@5 96.808 loss 0.972
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.70%
Epoch: [255]  [   0/2502]  eta: 1:22:40  lr: 0.000250  min_lr: 0.000250  loss: 3.0570 (3.0570)  weight_decay: 0.0500 (0.0500)  time: 1.9824  data: 1.4868  max mem: 42519
Epoch: [255]  [ 200/2502]  eta: 0:18:42  lr: 0.000250  min_lr: 0.000250  loss: 2.8585 (2.8159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4850 (1.5000)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [255]  [ 400/2502]  eta: 0:16:57  lr: 0.000249  min_lr: 0.000249  loss: 2.9251 (2.8048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3473 (1.4295)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [255]  [ 600/2502]  eta: 0:15:20  lr: 0.000248  min_lr: 0.000248  loss: 2.6620 (2.8062)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2640 (1.3989)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [255]  [ 800/2502]  eta: 0:13:42  lr: 0.000247  min_lr: 0.000247  loss: 2.9193 (2.7926)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3864 (1.4012)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [255]  [1000/2502]  eta: 0:12:04  lr: 0.000246  min_lr: 0.000246  loss: 2.8928 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2984 (1.3937)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [255]  [1200/2502]  eta: 0:10:28  lr: 0.000245  min_lr: 0.000245  loss: 2.6203 (2.7893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4387 (1.4022)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [255]  [1400/2502]  eta: 0:08:51  lr: 0.000244  min_lr: 0.000244  loss: 2.7286 (2.7789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3810 (1.3992)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [255]  [1600/2502]  eta: 0:07:15  lr: 0.000244  min_lr: 0.000244  loss: 2.9872 (2.7828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4019 (1.4022)  time: 0.4885  data: 0.0004  max mem: 42519
Epoch: [255]  [1800/2502]  eta: 0:05:38  lr: 0.000243  min_lr: 0.000243  loss: 2.7465 (2.7831)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5357 (1.4106)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [255]  [2000/2502]  eta: 0:04:02  lr: 0.000242  min_lr: 0.000242  loss: 3.0405 (2.7845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4888 (1.4180)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [255]  [2200/2502]  eta: 0:02:25  lr: 0.000241  min_lr: 0.000241  loss: 2.9395 (2.7834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3925 (1.4181)  time: 0.4917  data: 0.0007  max mem: 42519
Epoch: [255]  [2400/2502]  eta: 0:00:49  lr: 0.000240  min_lr: 0.000240  loss: 2.9139 (2.7782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3905 (1.4203)  time: 0.4839  data: 0.0005  max mem: 42519
Epoch: [255]  [2501/2502]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.9728 (2.7797)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4821 (1.4244)  time: 0.4403  data: 0.0014  max mem: 42519
Epoch: [255] Total time: 0:20:05 (0.4820 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.9728 (2.7830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4821 (1.4244)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.5554 (0.5554)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 3.2543  data: 2.9918  max mem: 42519
Test:  [10/50]  eta: 0:00:22  loss: 0.7660 (0.8085)  acc1: 90.4000 (88.0364)  acc5: 98.4000 (98.1091)  time: 0.5649  data: 0.3259  max mem: 42519
Test:  [20/50]  eta: 0:00:12  loss: 0.8493 (0.8419)  acc1: 84.8000 (86.8381)  acc5: 98.0000 (98.0000)  time: 0.2686  data: 0.0310  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9889 (0.9203)  acc1: 83.2000 (85.2387)  acc5: 96.4000 (97.2387)  time: 0.2745  data: 0.0323  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1283 (0.9778)  acc1: 81.2000 (83.6976)  acc5: 95.2000 (96.7707)  time: 0.2723  data: 0.0311  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1323 (0.9960)  acc1: 78.4000 (83.1600)  acc5: 95.2000 (96.6880)  time: 0.2676  data: 0.0309  max mem: 42519
Test: Total time: 0:00:16 (0.3262 s / it)
* Acc@1 83.728 Acc@5 96.754 loss 0.986
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [256]  [   0/2502]  eta: 1:29:35  lr: 0.000240  min_lr: 0.000240  loss: 2.8358 (2.8358)  weight_decay: 0.0500 (0.0500)  time: 2.1485  data: 1.6673  max mem: 42519
Epoch: [256]  [ 200/2502]  eta: 0:18:49  lr: 0.000239  min_lr: 0.000239  loss: 2.9217 (2.7452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4718 (1.5232)  time: 0.4794  data: 0.0007  max mem: 42519
Epoch: [256]  [ 400/2502]  eta: 0:17:01  lr: 0.000238  min_lr: 0.000238  loss: 2.7176 (2.7762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3978 (1.4857)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [256]  [ 600/2502]  eta: 0:15:20  lr: 0.000237  min_lr: 0.000237  loss: 3.0838 (2.7867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4001 (1.4948)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [256]  [ 800/2502]  eta: 0:13:43  lr: 0.000236  min_lr: 0.000236  loss: 2.5620 (2.7712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3008 (1.4681)  time: 0.4897  data: 0.0004  max mem: 42519
Epoch: [256]  [1000/2502]  eta: 0:12:06  lr: 0.000236  min_lr: 0.000236  loss: 2.9275 (2.7834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3350 (1.4607)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [256]  [1200/2502]  eta: 0:10:28  lr: 0.000235  min_lr: 0.000235  loss: 2.9805 (2.7770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3873 (1.4555)  time: 0.4846  data: 0.0006  max mem: 42519
Epoch: [256]  [1400/2502]  eta: 0:08:52  lr: 0.000234  min_lr: 0.000234  loss: 3.0695 (2.7762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3705 (1.4511)  time: 0.4786  data: 0.0007  max mem: 42519
Epoch: [256]  [1600/2502]  eta: 0:07:15  lr: 0.000233  min_lr: 0.000233  loss: 2.8478 (2.7771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3075 (1.4427)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [256]  [1800/2502]  eta: 0:05:38  lr: 0.000232  min_lr: 0.000232  loss: 2.9093 (2.7769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3163 (1.4350)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [256]  [2000/2502]  eta: 0:04:02  lr: 0.000231  min_lr: 0.000231  loss: 2.5823 (2.7719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3621 (1.4329)  time: 0.4835  data: 0.0005  max mem: 42519
Epoch: [256]  [2200/2502]  eta: 0:02:25  lr: 0.000230  min_lr: 0.000230  loss: 2.7068 (2.7689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3538 (1.4365)  time: 0.4801  data: 0.0006  max mem: 42519
Epoch: [256]  [2400/2502]  eta: 0:00:49  lr: 0.000230  min_lr: 0.000230  loss: 2.7933 (2.7703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4547 (1.4359)  time: 0.4819  data: 0.0008  max mem: 42519
Epoch: [256]  [2501/2502]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.8536 (2.7674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4803 (1.4382)  time: 0.4408  data: 0.0010  max mem: 42519
Epoch: [256] Total time: 0:20:06 (0.4823 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.8536 (2.7720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4803 (1.4382)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.5019 (0.5019)  acc1: 94.8000 (94.8000)  acc5: 99.6000 (99.6000)  time: 2.7885  data: 2.5023  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7133 (0.7482)  acc1: 90.0000 (88.2545)  acc5: 98.8000 (98.2546)  time: 0.4962  data: 0.2542  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7874 (0.7848)  acc1: 86.0000 (87.3524)  acc5: 98.0000 (98.0000)  time: 0.2553  data: 0.0150  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9113 (0.8591)  acc1: 83.6000 (85.5613)  acc5: 96.8000 (97.3290)  time: 0.2406  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0446 (0.9129)  acc1: 80.0000 (84.0293)  acc5: 95.6000 (96.8976)  time: 0.2419  data: 0.0048  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0623 (0.9253)  acc1: 79.6000 (83.4560)  acc5: 95.6000 (96.8800)  time: 0.2416  data: 0.0047  max mem: 42519
Test: Total time: 0:00:14 (0.2997 s / it)
* Acc@1 83.768 Acc@5 96.836 loss 0.918
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.77%
Epoch: [257]  [   0/2502]  eta: 1:31:00  lr: 0.000229  min_lr: 0.000229  loss: 2.8271 (2.8271)  weight_decay: 0.0500 (0.0500)  time: 2.1825  data: 1.6929  max mem: 42519
Epoch: [257]  [ 200/2502]  eta: 0:18:44  lr: 0.000228  min_lr: 0.000228  loss: 2.8442 (2.7823)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3738 (1.4198)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [257]  [ 400/2502]  eta: 0:16:57  lr: 0.000228  min_lr: 0.000228  loss: 2.9450 (2.7775)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3364 (1.4215)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [257]  [ 600/2502]  eta: 0:15:19  lr: 0.000227  min_lr: 0.000227  loss: 2.6708 (2.7651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4090 (1.4187)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [257]  [ 800/2502]  eta: 0:13:41  lr: 0.000226  min_lr: 0.000226  loss: 2.9161 (2.7663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3415 (1.4155)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [257]  [1000/2502]  eta: 0:12:04  lr: 0.000225  min_lr: 0.000225  loss: 2.6865 (2.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3526 (1.4247)  time: 0.4852  data: 0.0004  max mem: 42519
Epoch: [257]  [1200/2502]  eta: 0:10:27  lr: 0.000224  min_lr: 0.000224  loss: 2.8078 (2.7617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3897 (1.4209)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [257]  [1400/2502]  eta: 0:08:51  lr: 0.000223  min_lr: 0.000223  loss: 2.2599 (2.7594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3834 (1.4142)  time: 0.4805  data: 0.0003  max mem: 42519
Epoch: [257]  [1600/2502]  eta: 0:07:14  lr: 0.000223  min_lr: 0.000223  loss: 2.9965 (2.7609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4884 (1.4280)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [257]  [1800/2502]  eta: 0:05:38  lr: 0.000222  min_lr: 0.000222  loss: 3.0569 (2.7682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2897 (1.4240)  time: 0.4815  data: 0.0005  max mem: 42519
Epoch: [257]  [2000/2502]  eta: 0:04:01  lr: 0.000221  min_lr: 0.000221  loss: 3.0202 (2.7678)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3405 (1.4289)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [257]  [2200/2502]  eta: 0:02:25  lr: 0.000220  min_lr: 0.000220  loss: 2.9516 (2.7693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3030 (1.4273)  time: 0.4771  data: 0.0005  max mem: 42519
Epoch: [257]  [2400/2502]  eta: 0:00:49  lr: 0.000219  min_lr: 0.000219  loss: 2.8048 (2.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3868 (1.4277)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [257]  [2501/2502]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.7107 (2.7644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3829 (1.4287)  time: 0.4383  data: 0.0010  max mem: 42519
Epoch: [257] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.7107 (2.7688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3829 (1.4287)
Test:  [ 0/50]  eta: 0:02:44  loss: 0.5474 (0.5474)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 3.2854  data: 3.0144  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.7864 (0.7994)  acc1: 89.2000 (88.6182)  acc5: 98.8000 (98.2182)  time: 0.5148  data: 0.2745  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8206 (0.8366)  acc1: 86.8000 (87.7143)  acc5: 98.4000 (97.9619)  time: 0.2382  data: 0.0010  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9904 (0.9198)  acc1: 84.0000 (85.8323)  acc5: 96.8000 (97.2516)  time: 0.2382  data: 0.0010  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1100 (0.9728)  acc1: 80.0000 (84.4000)  acc5: 95.2000 (96.8390)  time: 0.2375  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1100 (0.9880)  acc1: 80.0000 (83.8240)  acc5: 95.6000 (96.7680)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3010 s / it)
* Acc@1 84.068 Acc@5 96.806 loss 0.980
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.07%
Epoch: [258]  [   0/2502]  eta: 1:28:20  lr: 0.000219  min_lr: 0.000219  loss: 2.9275 (2.9275)  weight_decay: 0.0500 (0.0500)  time: 2.1184  data: 1.6248  max mem: 42519
Epoch: [258]  [ 200/2502]  eta: 0:18:46  lr: 0.000218  min_lr: 0.000218  loss: 2.9325 (2.7762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4010 (1.4326)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [258]  [ 400/2502]  eta: 0:17:00  lr: 0.000217  min_lr: 0.000217  loss: 2.8244 (2.7600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3979 (1.4031)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [258]  [ 600/2502]  eta: 0:15:20  lr: 0.000216  min_lr: 0.000216  loss: 3.0176 (2.7554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3279 (1.3926)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [258]  [ 800/2502]  eta: 0:13:41  lr: 0.000216  min_lr: 0.000216  loss: 2.8120 (2.7524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4317 (1.3960)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [258]  [1000/2502]  eta: 0:12:04  lr: 0.000215  min_lr: 0.000215  loss: 2.8510 (2.7568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3916 (1.4017)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [258]  [1200/2502]  eta: 0:10:27  lr: 0.000214  min_lr: 0.000214  loss: 2.7987 (2.7670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4135 (1.4052)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [258]  [1400/2502]  eta: 0:08:51  lr: 0.000213  min_lr: 0.000213  loss: 2.7858 (2.7660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5485 (1.4138)  time: 0.4869  data: 0.0005  max mem: 42519
Epoch: [258]  [1600/2502]  eta: 0:07:14  lr: 0.000212  min_lr: 0.000212  loss: 2.8558 (2.7653)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4067 (1.4156)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [258]  [1800/2502]  eta: 0:05:38  lr: 0.000212  min_lr: 0.000212  loss: 2.8023 (2.7602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5663 (1.4292)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [258]  [2000/2502]  eta: 0:04:01  lr: 0.000211  min_lr: 0.000211  loss: 2.8549 (2.7559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3855 (1.4345)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [258]  [2200/2502]  eta: 0:02:25  lr: 0.000210  min_lr: 0.000210  loss: 2.8579 (2.7584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3609 (1.4372)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [258]  [2400/2502]  eta: 0:00:49  lr: 0.000209  min_lr: 0.000209  loss: 2.8688 (2.7572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5053 (1.4414)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [258]  [2501/2502]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.8526 (2.7583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3497 (1.4394)  time: 0.4378  data: 0.0010  max mem: 42519
Epoch: [258] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.8526 (2.7627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3497 (1.4394)
Test:  [ 0/50]  eta: 0:02:16  loss: 0.5866 (0.5866)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 2.7213  data: 2.4537  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.8369 (0.8484)  acc1: 91.2000 (89.0546)  acc5: 98.8000 (98.1818)  time: 0.4620  data: 0.2237  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8758 (0.8795)  acc1: 86.4000 (87.6762)  acc5: 98.0000 (97.9429)  time: 0.2368  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9741 (0.9531)  acc1: 84.0000 (85.9226)  acc5: 96.4000 (97.2000)  time: 0.2366  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:02  loss: 1.1499 (1.0042)  acc1: 81.6000 (84.4585)  acc5: 95.6000 (96.7512)  time: 0.2356  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1547 (1.0180)  acc1: 80.0000 (83.8400)  acc5: 95.6000 (96.7360)  time: 0.2355  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2878 s / it)
* Acc@1 84.040 Acc@5 96.814 loss 1.011
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.07%
Epoch: [259]  [   0/2502]  eta: 1:26:27  lr: 0.000209  min_lr: 0.000209  loss: 2.4015 (2.4015)  weight_decay: 0.0500 (0.0500)  time: 2.0733  data: 1.3421  max mem: 42519
Epoch: [259]  [ 200/2502]  eta: 0:18:40  lr: 0.000208  min_lr: 0.000208  loss: 2.8510 (2.7050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3627 (1.4431)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [259]  [ 400/2502]  eta: 0:16:56  lr: 0.000207  min_lr: 0.000207  loss: 2.7710 (2.7254)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3993 (1.4996)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [259]  [ 600/2502]  eta: 0:15:20  lr: 0.000206  min_lr: 0.000206  loss: 2.7313 (2.7356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2786 (1.4676)  time: 0.4846  data: 0.0005  max mem: 42519
Epoch: [259]  [ 800/2502]  eta: 0:13:42  lr: 0.000206  min_lr: 0.000206  loss: 2.8601 (2.7347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4228 (1.4497)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [259]  [1000/2502]  eta: 0:12:04  lr: 0.000205  min_lr: 0.000205  loss: 2.9206 (2.7401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4188 (1.4547)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [259]  [1200/2502]  eta: 0:10:28  lr: 0.000204  min_lr: 0.000204  loss: 2.8971 (2.7353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3882 (1.4526)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [259]  [1400/2502]  eta: 0:08:51  lr: 0.000203  min_lr: 0.000203  loss: 2.7201 (2.7331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3908 (1.4497)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [259]  [1600/2502]  eta: 0:07:14  lr: 0.000203  min_lr: 0.000203  loss: 2.9226 (2.7375)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4729 (1.4562)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [259]  [1800/2502]  eta: 0:05:38  lr: 0.000202  min_lr: 0.000202  loss: 2.6300 (2.7379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4060 (1.4494)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [259]  [2000/2502]  eta: 0:04:01  lr: 0.000201  min_lr: 0.000201  loss: 2.7840 (2.7409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5565 (1.4601)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [259]  [2200/2502]  eta: 0:02:25  lr: 0.000200  min_lr: 0.000200  loss: 3.0057 (2.7457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5113 (1.4657)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [259]  [2400/2502]  eta: 0:00:49  lr: 0.000199  min_lr: 0.000199  loss: 2.9923 (2.7448)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3903 (1.4663)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [259]  [2501/2502]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.8497 (2.7487)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4102 (1.4657)  time: 0.4342  data: 0.0009  max mem: 42519
Epoch: [259] Total time: 0:20:05 (0.4816 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.8497 (2.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4102 (1.4657)
Test:  [ 0/50]  eta: 0:04:31  loss: 0.5216 (0.5216)  acc1: 95.2000 (95.2000)  acc5: 99.6000 (99.6000)  time: 5.4230  data: 5.1594  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7585 (0.7992)  acc1: 89.6000 (88.2182)  acc5: 98.8000 (98.2909)  time: 0.7068  data: 0.4694  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8355 (0.8343)  acc1: 86.0000 (87.3524)  acc5: 98.0000 (98.0952)  time: 0.2407  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9285 (0.9108)  acc1: 84.4000 (85.6000)  acc5: 96.8000 (97.2774)  time: 0.2523  data: 0.0057  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1229 (0.9623)  acc1: 79.6000 (84.0585)  acc5: 95.6000 (96.9366)  time: 0.2519  data: 0.0106  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1020 (0.9749)  acc1: 79.6000 (83.4960)  acc5: 95.6000 (96.9280)  time: 0.2452  data: 0.0104  max mem: 42519
Test: Total time: 0:00:17 (0.3518 s / it)
* Acc@1 83.944 Acc@5 96.898 loss 0.966
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 84.07%
Epoch: [260]  [   0/2502]  eta: 1:26:20  lr: 0.000199  min_lr: 0.000199  loss: 1.8522 (1.8522)  weight_decay: 0.0500 (0.0500)  time: 2.0704  data: 1.3224  max mem: 42519
Epoch: [260]  [ 200/2502]  eta: 0:18:47  lr: 0.000198  min_lr: 0.000198  loss: 2.9483 (2.7003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4089 (1.4304)  time: 0.4897  data: 0.0004  max mem: 42519
Epoch: [260]  [ 400/2502]  eta: 0:17:01  lr: 0.000197  min_lr: 0.000197  loss: 2.8098 (2.7059)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4237 (1.4513)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [260]  [ 600/2502]  eta: 0:15:21  lr: 0.000197  min_lr: 0.000197  loss: 2.7655 (2.6990)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2970 (1.4296)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [260]  [ 800/2502]  eta: 0:13:43  lr: 0.000196  min_lr: 0.000196  loss: 2.9347 (2.7177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4993 (1.4575)  time: 0.4895  data: 0.0004  max mem: 42519
Epoch: [260]  [1000/2502]  eta: 0:12:06  lr: 0.000195  min_lr: 0.000195  loss: 2.8032 (2.7348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3963 (1.4554)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [260]  [1200/2502]  eta: 0:10:29  lr: 0.000194  min_lr: 0.000194  loss: 2.8848 (2.7319)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3768 (1.4546)  time: 0.4813  data: 0.0005  max mem: 42519
Epoch: [260]  [1400/2502]  eta: 0:08:52  lr: 0.000194  min_lr: 0.000194  loss: 3.0331 (2.7453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3774 (1.4528)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [260]  [1600/2502]  eta: 0:07:15  lr: 0.000193  min_lr: 0.000193  loss: 2.8091 (2.7460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3910 (1.4597)  time: 0.4784  data: 0.0007  max mem: 42519
Epoch: [260]  [1800/2502]  eta: 0:05:38  lr: 0.000192  min_lr: 0.000192  loss: 2.9110 (2.7497)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3398 (1.4569)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [260]  [2000/2502]  eta: 0:04:02  lr: 0.000191  min_lr: 0.000191  loss: 2.5655 (2.7472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4224 (1.4598)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [260]  [2200/2502]  eta: 0:02:25  lr: 0.000191  min_lr: 0.000191  loss: 2.9176 (2.7504)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3629 (1.4558)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [260]  [2400/2502]  eta: 0:00:49  lr: 0.000190  min_lr: 0.000190  loss: 2.6114 (2.7468)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4430 (1.4564)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [260]  [2501/2502]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.8754 (2.7489)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4086 (1.4572)  time: 0.4368  data: 0.0009  max mem: 42519
Epoch: [260] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.8754 (2.7487)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4086 (1.4572)
Test:  [ 0/50]  eta: 0:04:05  loss: 0.5986 (0.5986)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 4.9124  data: 4.6412  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 0.8662 (0.8551)  acc1: 87.6000 (88.0000)  acc5: 98.8000 (98.2182)  time: 0.6623  data: 0.4223  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8710 (0.8883)  acc1: 85.6000 (87.1048)  acc5: 98.4000 (98.1333)  time: 0.2429  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0075 (0.9659)  acc1: 84.0000 (85.5742)  acc5: 96.8000 (97.3677)  time: 0.2497  data: 0.0060  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1810 (1.0188)  acc1: 80.8000 (84.1854)  acc5: 95.2000 (96.8781)  time: 0.2440  data: 0.0058  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1325 (1.0347)  acc1: 80.0000 (83.5280)  acc5: 95.2000 (96.7840)  time: 0.2425  data: 0.0056  max mem: 42519
Test: Total time: 0:00:16 (0.3387 s / it)
* Acc@1 83.922 Acc@5 96.868 loss 1.027
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 84.07%
Epoch: [261]  [   0/2502]  eta: 1:36:31  lr: 0.000189  min_lr: 0.000189  loss: 2.9229 (2.9229)  weight_decay: 0.0500 (0.0500)  time: 2.3148  data: 1.6882  max mem: 42519
Epoch: [261]  [ 200/2502]  eta: 0:18:47  lr: 0.000189  min_lr: 0.000189  loss: 2.8694 (2.7559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4683 (1.5163)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [261]  [ 400/2502]  eta: 0:16:59  lr: 0.000188  min_lr: 0.000188  loss: 2.9088 (2.7629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4745 (1.4891)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [261]  [ 600/2502]  eta: 0:15:20  lr: 0.000187  min_lr: 0.000187  loss: 2.8288 (2.7446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4676 (nan)  time: 0.4781  data: 0.0005  max mem: 42519
Epoch: [261]  [ 800/2502]  eta: 0:13:41  lr: 0.000186  min_lr: 0.000186  loss: 2.8630 (2.7462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3826 (nan)  time: 0.4775  data: 0.0006  max mem: 42519
Epoch: [261]  [1000/2502]  eta: 0:12:04  lr: 0.000186  min_lr: 0.000186  loss: 2.5859 (2.7481)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4181 (nan)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [261]  [1200/2502]  eta: 0:10:27  lr: 0.000185  min_lr: 0.000185  loss: 2.7736 (2.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4044 (nan)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [261]  [1400/2502]  eta: 0:08:50  lr: 0.000184  min_lr: 0.000184  loss: 2.8703 (2.7419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4679 (nan)  time: 0.4788  data: 0.0006  max mem: 42519
Epoch: [261]  [1600/2502]  eta: 0:07:14  lr: 0.000183  min_lr: 0.000183  loss: 2.8648 (2.7409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5757 (nan)  time: 0.4850  data: 0.0005  max mem: 42519
Epoch: [261]  [1800/2502]  eta: 0:05:38  lr: 0.000183  min_lr: 0.000183  loss: 2.9219 (2.7397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3334 (nan)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [261]  [2000/2502]  eta: 0:04:01  lr: 0.000182  min_lr: 0.000182  loss: 2.8319 (2.7376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3881 (nan)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [261]  [2200/2502]  eta: 0:02:25  lr: 0.000181  min_lr: 0.000181  loss: 2.7988 (2.7397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4560 (nan)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [261]  [2400/2502]  eta: 0:00:49  lr: 0.000180  min_lr: 0.000180  loss: 2.9462 (2.7428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4879 (nan)  time: 0.4795  data: 0.0006  max mem: 42519
Epoch: [261]  [2501/2502]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.7653 (2.7402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4666 (nan)  time: 0.4373  data: 0.0009  max mem: 42519
Epoch: [261] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.7653 (2.7460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4666 (nan)
Test:  [ 0/50]  eta: 0:04:10  loss: 0.5215 (0.5215)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 5.0179  data: 4.7377  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 0.7904 (0.7834)  acc1: 88.0000 (88.1455)  acc5: 98.8000 (98.2909)  time: 0.6718  data: 0.4311  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.8488 (0.8183)  acc1: 85.6000 (87.2952)  acc5: 98.0000 (98.0571)  time: 0.2381  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9517 (0.8987)  acc1: 84.8000 (85.5742)  acc5: 97.2000 (97.3161)  time: 0.2401  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1191 (0.9530)  acc1: 80.0000 (84.0878)  acc5: 95.2000 (96.8781)  time: 0.2393  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0810 (0.9672)  acc1: 79.2000 (83.5440)  acc5: 96.0000 (96.8480)  time: 0.2374  data: 0.0002  max mem: 42519
Test: Total time: 0:00:16 (0.3367 s / it)
* Acc@1 83.942 Acc@5 96.882 loss 0.957
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 84.07%
Epoch: [262]  [   0/2502]  eta: 1:30:57  lr: 0.000180  min_lr: 0.000180  loss: 2.6957 (2.6957)  weight_decay: 0.0500 (0.0500)  time: 2.1811  data: 1.4847  max mem: 42519
Epoch: [262]  [ 200/2502]  eta: 0:18:49  lr: 0.000179  min_lr: 0.000179  loss: 2.8535 (2.6550)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3815 (1.4981)  time: 0.4860  data: 0.0005  max mem: 42519
Epoch: [262]  [ 400/2502]  eta: 0:17:01  lr: 0.000179  min_lr: 0.000179  loss: 2.8473 (2.6920)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4159 (1.4953)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [262]  [ 600/2502]  eta: 0:15:20  lr: 0.000178  min_lr: 0.000178  loss: 2.7396 (2.7025)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4575 (1.4871)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [262]  [ 800/2502]  eta: 0:13:42  lr: 0.000177  min_lr: 0.000177  loss: 2.7506 (2.7084)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4833 (1.4861)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [262]  [1000/2502]  eta: 0:12:05  lr: 0.000176  min_lr: 0.000176  loss: 2.8842 (2.7179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3891 (1.4855)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [262]  [1200/2502]  eta: 0:10:28  lr: 0.000176  min_lr: 0.000176  loss: 2.8337 (2.7212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4972 (1.4776)  time: 0.4888  data: 0.0005  max mem: 42519
Epoch: [262]  [1400/2502]  eta: 0:08:51  lr: 0.000175  min_lr: 0.000175  loss: 2.9047 (2.7265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4737 (nan)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [262]  [1600/2502]  eta: 0:07:14  lr: 0.000174  min_lr: 0.000174  loss: 3.0600 (2.7376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4044 (nan)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [262]  [1800/2502]  eta: 0:05:38  lr: 0.000173  min_lr: 0.000173  loss: 2.8468 (2.7382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4603 (nan)  time: 0.4817  data: 0.0006  max mem: 42519
Epoch: [262]  [2000/2502]  eta: 0:04:01  lr: 0.000173  min_lr: 0.000173  loss: 2.7543 (2.7433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4822 (nan)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [262]  [2200/2502]  eta: 0:02:25  lr: 0.000172  min_lr: 0.000172  loss: 2.7893 (2.7414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4590 (nan)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [262]  [2400/2502]  eta: 0:00:49  lr: 0.000171  min_lr: 0.000171  loss: 2.9631 (2.7447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3772 (nan)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [262]  [2501/2502]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.6928 (2.7428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3786 (nan)  time: 0.4343  data: 0.0008  max mem: 42519
Epoch: [262] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.6928 (2.7370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3786 (nan)
Test:  [ 0/50]  eta: 0:04:35  loss: 0.5136 (0.5136)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 5.5029  data: 5.2216  max mem: 42519
Test:  [10/50]  eta: 0:00:31  loss: 0.7809 (0.7762)  acc1: 89.2000 (88.3636)  acc5: 98.8000 (98.5091)  time: 0.7772  data: 0.5355  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.7993 (0.8138)  acc1: 86.4000 (87.5810)  acc5: 98.4000 (98.1905)  time: 0.2830  data: 0.0419  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9219 (0.8988)  acc1: 84.0000 (85.8194)  acc5: 97.2000 (97.3677)  time: 0.2494  data: 0.0087  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1034 (0.9502)  acc1: 80.8000 (84.4878)  acc5: 95.2000 (96.9268)  time: 0.2485  data: 0.0118  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0727 (0.9647)  acc1: 80.4000 (83.9600)  acc5: 96.0000 (96.8480)  time: 0.2482  data: 0.0117  max mem: 42519
Test: Total time: 0:00:18 (0.3669 s / it)
* Acc@1 84.270 Acc@5 96.932 loss 0.957
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.27%
Epoch: [263]  [   0/2502]  eta: 1:34:47  lr: 0.000171  min_lr: 0.000171  loss: 3.1618 (3.1618)  weight_decay: 0.0500 (0.0500)  time: 2.2733  data: 1.7858  max mem: 42519
Epoch: [263]  [ 200/2502]  eta: 0:18:47  lr: 0.000170  min_lr: 0.000170  loss: 2.7910 (2.7379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4528 (1.4703)  time: 0.4871  data: 0.0004  max mem: 42519
Epoch: [263]  [ 400/2502]  eta: 0:17:00  lr: 0.000169  min_lr: 0.000169  loss: 2.8892 (2.7485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3633 (1.5018)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [263]  [ 600/2502]  eta: 0:15:20  lr: 0.000169  min_lr: 0.000169  loss: 2.7933 (2.7419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4626 (1.5089)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [263]  [ 800/2502]  eta: 0:13:42  lr: 0.000168  min_lr: 0.000168  loss: 2.8836 (2.7515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4486 (1.4896)  time: 0.4819  data: 0.0004  max mem: 42519
Epoch: [263]  [1000/2502]  eta: 0:12:05  lr: 0.000167  min_lr: 0.000167  loss: 2.6755 (2.7443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4915 (1.4822)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [263]  [1200/2502]  eta: 0:10:28  lr: 0.000167  min_lr: 0.000167  loss: 2.6668 (2.7317)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4976 (1.4925)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [263]  [1400/2502]  eta: 0:08:51  lr: 0.000166  min_lr: 0.000166  loss: 2.7946 (2.7354)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4643 (1.4958)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [263]  [1600/2502]  eta: 0:07:14  lr: 0.000165  min_lr: 0.000165  loss: 2.8573 (2.7398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5522 (1.5070)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [263]  [1800/2502]  eta: 0:05:38  lr: 0.000164  min_lr: 0.000164  loss: 2.7277 (2.7382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4796 (1.5074)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [263]  [2000/2502]  eta: 0:04:01  lr: 0.000164  min_lr: 0.000164  loss: 2.9062 (2.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3370 (1.5014)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [263]  [2200/2502]  eta: 0:02:25  lr: 0.000163  min_lr: 0.000163  loss: 2.8747 (2.7382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4041 (1.4997)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [263]  [2400/2502]  eta: 0:00:49  lr: 0.000162  min_lr: 0.000162  loss: 2.7638 (2.7415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4735 (1.5020)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [263]  [2501/2502]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.5200 (2.7402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4360 (1.5022)  time: 0.4350  data: 0.0007  max mem: 42519
Epoch: [263] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.5200 (2.7328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4360 (1.5022)
Test:  [ 0/50]  eta: 0:04:51  loss: 0.4936 (0.4936)  acc1: 94.4000 (94.4000)  acc5: 99.2000 (99.2000)  time: 5.8263  data: 5.5162  max mem: 42519
Test:  [10/50]  eta: 0:00:30  loss: 0.7141 (0.7409)  acc1: 90.8000 (88.5818)  acc5: 98.8000 (98.1455)  time: 0.7556  data: 0.5123  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.7746 (0.7769)  acc1: 85.6000 (87.3333)  acc5: 98.4000 (98.0191)  time: 0.2430  data: 0.0062  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9129 (0.8587)  acc1: 84.0000 (85.6516)  acc5: 96.8000 (97.2129)  time: 0.2426  data: 0.0018  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0656 (0.9071)  acc1: 80.4000 (84.2829)  acc5: 95.2000 (96.8488)  time: 0.2470  data: 0.0027  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0324 (0.9219)  acc1: 80.0000 (83.8080)  acc5: 95.6000 (96.7840)  time: 0.2422  data: 0.0013  max mem: 42519
Test: Total time: 0:00:17 (0.3569 s / it)
* Acc@1 84.164 Acc@5 96.838 loss 0.910
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [264]  [   0/2502]  eta: 1:36:04  lr: 0.000162  min_lr: 0.000162  loss: 2.1343 (2.1343)  weight_decay: 0.0500 (0.0500)  time: 2.3038  data: 1.3070  max mem: 42519
Epoch: [264]  [ 200/2502]  eta: 0:18:46  lr: 0.000161  min_lr: 0.000161  loss: 2.7909 (2.7390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4980 (1.5101)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [264]  [ 400/2502]  eta: 0:17:01  lr: 0.000160  min_lr: 0.000160  loss: 2.8354 (2.7387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4336 (1.5249)  time: 0.4856  data: 0.0004  max mem: 42519
Epoch: [264]  [ 600/2502]  eta: 0:15:20  lr: 0.000160  min_lr: 0.000160  loss: 2.8579 (2.7632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5960 (1.5543)  time: 0.4782  data: 0.0003  max mem: 42519
Epoch: [264]  [ 800/2502]  eta: 0:13:42  lr: 0.000159  min_lr: 0.000159  loss: 2.8098 (2.7645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5015 (1.5513)  time: 0.4860  data: 0.0004  max mem: 42519
Epoch: [264]  [1000/2502]  eta: 0:12:04  lr: 0.000158  min_lr: 0.000158  loss: 2.9172 (2.7639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3905 (1.5302)  time: 0.4771  data: 0.0004  max mem: 42519
Epoch: [264]  [1200/2502]  eta: 0:10:26  lr: 0.000158  min_lr: 0.000158  loss: 2.8442 (2.7549)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5187 (1.5377)  time: 0.4770  data: 0.0004  max mem: 42519
Epoch: [264]  [1400/2502]  eta: 0:08:50  lr: 0.000157  min_lr: 0.000157  loss: 2.8812 (2.7573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4102 (1.5320)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [264]  [1600/2502]  eta: 0:07:13  lr: 0.000156  min_lr: 0.000156  loss: 2.9112 (2.7596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3777 (1.5311)  time: 0.4809  data: 0.0004  max mem: 42519
Epoch: [264]  [1800/2502]  eta: 0:05:37  lr: 0.000156  min_lr: 0.000156  loss: 2.4273 (2.7551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3908 (1.5173)  time: 0.4818  data: 0.0004  max mem: 42519
Epoch: [264]  [2000/2502]  eta: 0:04:01  lr: 0.000155  min_lr: 0.000155  loss: 2.7050 (2.7536)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4707 (1.5135)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [264]  [2200/2502]  eta: 0:02:25  lr: 0.000154  min_lr: 0.000154  loss: 2.8521 (2.7500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4312 (1.5099)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [264]  [2400/2502]  eta: 0:00:49  lr: 0.000154  min_lr: 0.000154  loss: 2.8792 (2.7474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3646 (1.5087)  time: 0.4808  data: 0.0005  max mem: 42519
Epoch: [264]  [2501/2502]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.8090 (2.7465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4569 (1.5076)  time: 0.4353  data: 0.0009  max mem: 42519
Epoch: [264] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.8090 (2.7324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4569 (1.5076)
Test:  [ 0/50]  eta: 0:04:50  loss: 0.5337 (0.5337)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 5.8070  data: 5.4878  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.7663 (0.7866)  acc1: 89.6000 (88.1818)  acc5: 98.4000 (98.0727)  time: 0.7484  data: 0.5044  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8227 (0.8183)  acc1: 85.2000 (87.2571)  acc5: 98.0000 (97.9048)  time: 0.2400  data: 0.0033  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9387 (0.9007)  acc1: 84.0000 (85.7419)  acc5: 96.4000 (97.2000)  time: 0.2412  data: 0.0018  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1045 (0.9527)  acc1: 80.8000 (84.3220)  acc5: 95.2000 (96.7512)  time: 0.2411  data: 0.0016  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0871 (0.9653)  acc1: 79.2000 (83.8720)  acc5: 95.6000 (96.6560)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3533 s / it)
* Acc@1 84.214 Acc@5 96.856 loss 0.955
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [265]  [   0/2502]  eta: 1:47:41  lr: 0.000153  min_lr: 0.000153  loss: 2.5203 (2.5203)  weight_decay: 0.0500 (0.0500)  time: 2.5825  data: 1.3139  max mem: 42519
Epoch: [265]  [ 200/2502]  eta: 0:18:55  lr: 0.000153  min_lr: 0.000153  loss: 2.7589 (2.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4597 (1.5300)  time: 0.4827  data: 0.0004  max mem: 42519
Epoch: [265]  [ 400/2502]  eta: 0:17:02  lr: 0.000152  min_lr: 0.000152  loss: 2.8166 (2.7343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4486 (1.5212)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [265]  [ 600/2502]  eta: 0:15:21  lr: 0.000151  min_lr: 0.000151  loss: 2.7154 (2.7353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5337 (1.5241)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [265]  [ 800/2502]  eta: 0:13:42  lr: 0.000150  min_lr: 0.000150  loss: 2.8277 (2.7212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4649 (1.5221)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [265]  [1000/2502]  eta: 0:12:04  lr: 0.000150  min_lr: 0.000150  loss: 2.9206 (2.7160)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4583 (1.5141)  time: 0.4822  data: 0.0004  max mem: 42519
Epoch: [265]  [1200/2502]  eta: 0:10:28  lr: 0.000149  min_lr: 0.000149  loss: 2.7671 (2.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4436 (1.5036)  time: 0.4862  data: 0.0005  max mem: 42519
Epoch: [265]  [1400/2502]  eta: 0:08:50  lr: 0.000148  min_lr: 0.000148  loss: 2.8215 (2.7262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5486 (1.5105)  time: 0.4769  data: 0.0004  max mem: 42519
Epoch: [265]  [1600/2502]  eta: 0:07:14  lr: 0.000148  min_lr: 0.000148  loss: 2.6409 (2.7261)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4503 (1.5079)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [265]  [1800/2502]  eta: 0:05:37  lr: 0.000147  min_lr: 0.000147  loss: 2.6547 (2.7285)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4911 (1.5128)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [265]  [2000/2502]  eta: 0:04:01  lr: 0.000146  min_lr: 0.000146  loss: 2.6433 (2.7312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4916 (1.5161)  time: 0.4781  data: 0.0003  max mem: 42519
Epoch: [265]  [2200/2502]  eta: 0:02:25  lr: 0.000146  min_lr: 0.000146  loss: 2.7191 (2.7248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3992 (1.5113)  time: 0.4932  data: 0.0003  max mem: 42519
Epoch: [265]  [2400/2502]  eta: 0:00:49  lr: 0.000145  min_lr: 0.000145  loss: 2.6217 (2.7256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5630 (1.5134)  time: 0.4789  data: 0.0003  max mem: 42519
Epoch: [265]  [2501/2502]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.8843 (2.7273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4941 (1.5157)  time: 0.4315  data: 0.0006  max mem: 42519
Epoch: [265] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.8843 (2.7226)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4941 (1.5157)
Test:  [ 0/50]  eta: 0:03:30  loss: 0.5030 (0.5030)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 4.2123  data: 3.9159  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7391 (0.7829)  acc1: 90.0000 (88.5818)  acc5: 98.4000 (98.3273)  time: 0.7057  data: 0.4556  max mem: 42519
Test:  [20/50]  eta: 0:00:16  loss: 0.8150 (0.8170)  acc1: 86.0000 (87.5810)  acc5: 98.0000 (98.0952)  time: 0.3686  data: 0.1239  max mem: 42519
Test:  [30/50]  eta: 0:00:09  loss: 0.9448 (0.9019)  acc1: 83.6000 (85.6645)  acc5: 96.8000 (97.2903)  time: 0.3421  data: 0.1018  max mem: 42519
Test:  [40/50]  eta: 0:00:04  loss: 1.1098 (0.9531)  acc1: 80.0000 (84.3610)  acc5: 95.2000 (96.8390)  time: 0.2694  data: 0.0327  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0796 (0.9683)  acc1: 79.6000 (83.7120)  acc5: 95.6000 (96.7440)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:19 (0.3845 s / it)
* Acc@1 84.028 Acc@5 96.862 loss 0.960
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.27%
Epoch: [266]  [   0/2502]  eta: 1:33:12  lr: 0.000145  min_lr: 0.000145  loss: 2.3884 (2.3884)  weight_decay: 0.0500 (0.0500)  time: 2.2352  data: 1.6276  max mem: 42519
Epoch: [266]  [ 200/2502]  eta: 0:18:46  lr: 0.000144  min_lr: 0.000144  loss: 2.8159 (2.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4201 (1.4542)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [266]  [ 400/2502]  eta: 0:17:01  lr: 0.000143  min_lr: 0.000143  loss: 2.9347 (2.7183)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3851 (1.5060)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [266]  [ 600/2502]  eta: 0:15:20  lr: 0.000143  min_lr: 0.000143  loss: 2.8145 (2.7002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4216 (1.5170)  time: 0.4814  data: 0.0005  max mem: 42519
Epoch: [266]  [ 800/2502]  eta: 0:13:43  lr: 0.000142  min_lr: 0.000142  loss: 2.8820 (2.7032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5264 (1.5233)  time: 0.4860  data: 0.0004  max mem: 42519
Epoch: [266]  [1000/2502]  eta: 0:12:05  lr: 0.000141  min_lr: 0.000141  loss: 2.7109 (2.6993)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5309 (1.5261)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [266]  [1200/2502]  eta: 0:10:28  lr: 0.000141  min_lr: 0.000141  loss: 2.5379 (2.6983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4097 (1.5295)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [266]  [1400/2502]  eta: 0:08:51  lr: 0.000140  min_lr: 0.000140  loss: 2.8161 (2.7014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4520 (1.5190)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [266]  [1600/2502]  eta: 0:07:15  lr: 0.000139  min_lr: 0.000139  loss: 2.8943 (2.7110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4255 (1.5096)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [266]  [1800/2502]  eta: 0:05:38  lr: 0.000139  min_lr: 0.000139  loss: 2.7642 (2.7081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4007 (1.5045)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [266]  [2000/2502]  eta: 0:04:02  lr: 0.000138  min_lr: 0.000138  loss: 2.7969 (2.7091)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5297 (1.5026)  time: 0.4880  data: 0.0004  max mem: 42519
Epoch: [266]  [2200/2502]  eta: 0:02:25  lr: 0.000137  min_lr: 0.000137  loss: 2.7148 (2.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5359 (1.5119)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [266]  [2400/2502]  eta: 0:00:49  lr: 0.000137  min_lr: 0.000137  loss: 2.8267 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5628 (1.5170)  time: 0.4780  data: 0.0005  max mem: 42519
Epoch: [266]  [2501/2502]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.8669 (2.7124)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5063 (1.5189)  time: 0.4327  data: 0.0009  max mem: 42519
Epoch: [266] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.8669 (2.7113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5063 (1.5189)
Test:  [ 0/50]  eta: 0:04:29  loss: 0.5963 (0.5963)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 5.3973  data: 5.1092  max mem: 42519
Test:  [10/50]  eta: 0:00:30  loss: 0.8328 (0.8335)  acc1: 87.6000 (88.0727)  acc5: 98.8000 (98.4364)  time: 0.7706  data: 0.5265  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8700 (0.8705)  acc1: 85.6000 (87.1048)  acc5: 98.0000 (98.0952)  time: 0.2896  data: 0.0459  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9929 (0.9593)  acc1: 84.4000 (85.3806)  acc5: 96.4000 (97.2774)  time: 0.2587  data: 0.0121  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1396 (1.0079)  acc1: 80.0000 (84.0000)  acc5: 95.6000 (96.8585)  time: 0.2551  data: 0.0142  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1385 (1.0228)  acc1: 79.6000 (83.5120)  acc5: 95.6000 (96.7600)  time: 0.2496  data: 0.0140  max mem: 42519
Test: Total time: 0:00:18 (0.3704 s / it)
* Acc@1 84.080 Acc@5 96.792 loss 1.011
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.27%
Epoch: [267]  [   0/2502]  eta: 1:45:08  lr: 0.000136  min_lr: 0.000136  loss: 3.0046 (3.0046)  weight_decay: 0.0500 (0.0500)  time: 2.5215  data: 1.4754  max mem: 42519
Epoch: [267]  [ 200/2502]  eta: 0:18:50  lr: 0.000136  min_lr: 0.000136  loss: 2.7461 (2.7067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4820 (1.5245)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [267]  [ 400/2502]  eta: 0:17:02  lr: 0.000135  min_lr: 0.000135  loss: 2.9293 (2.6984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4178 (1.5297)  time: 0.4799  data: 0.0004  max mem: 42519
Epoch: [267]  [ 600/2502]  eta: 0:15:21  lr: 0.000135  min_lr: 0.000135  loss: 2.7325 (2.7144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5057 (1.5279)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [267]  [ 800/2502]  eta: 0:13:42  lr: 0.000134  min_lr: 0.000134  loss: 2.8952 (2.7081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4478 (1.5321)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [267]  [1000/2502]  eta: 0:12:05  lr: 0.000133  min_lr: 0.000133  loss: 2.7191 (2.7137)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4800 (1.5363)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [267]  [1200/2502]  eta: 0:10:28  lr: 0.000133  min_lr: 0.000133  loss: 2.7983 (2.7195)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5521 (1.5399)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [267]  [1400/2502]  eta: 0:08:51  lr: 0.000132  min_lr: 0.000132  loss: 2.8828 (2.7180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4939 (1.5367)  time: 0.4814  data: 0.0006  max mem: 42519
Epoch: [267]  [1600/2502]  eta: 0:07:15  lr: 0.000131  min_lr: 0.000131  loss: 2.7046 (2.7112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4656 (1.5282)  time: 0.4802  data: 0.0006  max mem: 42519
Epoch: [267]  [1800/2502]  eta: 0:05:38  lr: 0.000131  min_lr: 0.000131  loss: 2.7980 (2.7087)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5060 (1.5263)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [267]  [2000/2502]  eta: 0:04:02  lr: 0.000130  min_lr: 0.000130  loss: 3.0235 (2.7108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4304 (1.5262)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [267]  [2200/2502]  eta: 0:02:25  lr: 0.000129  min_lr: 0.000129  loss: 2.7535 (2.7120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4407 (1.5220)  time: 0.4882  data: 0.0004  max mem: 42519
Epoch: [267]  [2400/2502]  eta: 0:00:49  lr: 0.000129  min_lr: 0.000129  loss: 2.8078 (2.7123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5814 (1.5306)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [267]  [2501/2502]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.5926 (2.7114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6177 (1.5328)  time: 0.4356  data: 0.0006  max mem: 42519
Epoch: [267] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.5926 (2.7175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6177 (1.5328)
Test:  [ 0/50]  eta: 0:03:55  loss: 0.4797 (0.4797)  acc1: 94.4000 (94.4000)  acc5: 99.6000 (99.6000)  time: 4.7119  data: 4.4199  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.7002 (0.7424)  acc1: 88.4000 (88.3273)  acc5: 98.8000 (98.4364)  time: 0.7313  data: 0.4857  max mem: 42519
Test:  [20/50]  eta: 0:00:16  loss: 0.7797 (0.7755)  acc1: 86.0000 (87.3524)  acc5: 98.4000 (98.2476)  time: 0.3298  data: 0.0908  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.8851 (0.8584)  acc1: 84.4000 (85.7032)  acc5: 97.2000 (97.3936)  time: 0.2820  data: 0.0449  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0657 (0.9090)  acc1: 80.8000 (84.1951)  acc5: 95.6000 (96.9463)  time: 0.2376  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0644 (0.9223)  acc1: 78.8000 (83.7280)  acc5: 95.6000 (96.8480)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:18 (0.3658 s / it)
* Acc@1 84.108 Acc@5 96.890 loss 0.911
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.27%
Epoch: [268]  [   0/2502]  eta: 1:26:31  lr: 0.000128  min_lr: 0.000128  loss: 3.0751 (3.0751)  weight_decay: 0.0500 (0.0500)  time: 2.0751  data: 1.2682  max mem: 42519
Epoch: [268]  [ 200/2502]  eta: 0:18:47  lr: 0.000128  min_lr: 0.000128  loss: 2.7938 (2.6666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5271 (1.5254)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [268]  [ 400/2502]  eta: 0:17:00  lr: 0.000127  min_lr: 0.000127  loss: 2.8817 (2.6984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5024 (1.5502)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [268]  [ 600/2502]  eta: 0:15:18  lr: 0.000127  min_lr: 0.000127  loss: 2.5308 (2.6790)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4606 (1.5491)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [268]  [ 800/2502]  eta: 0:13:41  lr: 0.000126  min_lr: 0.000126  loss: 2.9157 (2.6795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4884 (1.5550)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [268]  [1000/2502]  eta: 0:12:04  lr: 0.000125  min_lr: 0.000125  loss: 2.9070 (2.6889)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4549 (1.5537)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [268]  [1200/2502]  eta: 0:10:27  lr: 0.000125  min_lr: 0.000125  loss: 2.7281 (2.6918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4493 (1.5505)  time: 0.4869  data: 0.0004  max mem: 42519
Epoch: [268]  [1400/2502]  eta: 0:08:50  lr: 0.000124  min_lr: 0.000124  loss: 2.8859 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5217 (1.5530)  time: 0.4806  data: 0.0006  max mem: 42519
Epoch: [268]  [1600/2502]  eta: 0:07:14  lr: 0.000124  min_lr: 0.000124  loss: 2.8002 (2.6906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5178 (1.5466)  time: 0.4825  data: 0.0005  max mem: 42519
Epoch: [268]  [1800/2502]  eta: 0:05:38  lr: 0.000123  min_lr: 0.000123  loss: 2.5751 (2.6934)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5005 (nan)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [268]  [2000/2502]  eta: 0:04:01  lr: 0.000122  min_lr: 0.000122  loss: 2.7538 (2.6907)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4991 (nan)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [268]  [2200/2502]  eta: 0:02:25  lr: 0.000122  min_lr: 0.000122  loss: 2.6625 (2.6945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5903 (nan)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [268]  [2400/2502]  eta: 0:00:49  lr: 0.000121  min_lr: 0.000121  loss: 2.7666 (2.6946)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4855 (nan)  time: 0.4818  data: 0.0004  max mem: 42519
Epoch: [268]  [2501/2502]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.6121 (2.6953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4921 (nan)  time: 0.4405  data: 0.0008  max mem: 42519
Epoch: [268] Total time: 0:20:05 (0.4816 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.6121 (2.7034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4921 (nan)
Test:  [ 0/50]  eta: 0:04:35  loss: 0.5445 (0.5445)  acc1: 94.8000 (94.8000)  acc5: 99.2000 (99.2000)  time: 5.5196  data: 5.2177  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7559 (0.7956)  acc1: 89.2000 (88.7273)  acc5: 98.8000 (98.2182)  time: 0.7180  data: 0.4750  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8221 (0.8308)  acc1: 86.0000 (87.5619)  acc5: 98.0000 (98.0381)  time: 0.2467  data: 0.0096  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9343 (0.9125)  acc1: 84.4000 (85.6903)  acc5: 97.6000 (97.3161)  time: 0.2751  data: 0.0382  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1455 (0.9622)  acc1: 80.0000 (84.2732)  acc5: 95.6000 (96.9463)  time: 0.2721  data: 0.0356  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0840 (0.9768)  acc1: 79.2000 (83.7120)  acc5: 95.6000 (96.8560)  time: 0.2554  data: 0.0189  max mem: 42519
Test: Total time: 0:00:18 (0.3624 s / it)
* Acc@1 84.190 Acc@5 96.870 loss 0.967
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [269]  [   0/2502]  eta: 1:39:31  lr: 0.000121  min_lr: 0.000121  loss: 2.8100 (2.8100)  weight_decay: 0.0500 (0.0500)  time: 2.3866  data: 1.4166  max mem: 42519
Epoch: [269]  [ 200/2502]  eta: 0:18:45  lr: 0.000120  min_lr: 0.000120  loss: 2.8712 (2.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5701 (1.4773)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [269]  [ 400/2502]  eta: 0:16:59  lr: 0.000120  min_lr: 0.000120  loss: 2.7964 (2.7263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3511 (1.4555)  time: 0.4887  data: 0.0005  max mem: 42519
Epoch: [269]  [ 600/2502]  eta: 0:15:19  lr: 0.000119  min_lr: 0.000119  loss: 2.8350 (2.7204)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3670 (1.4544)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [269]  [ 800/2502]  eta: 0:13:41  lr: 0.000118  min_lr: 0.000118  loss: 2.8574 (2.7327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5247 (1.4801)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [269]  [1000/2502]  eta: 0:12:04  lr: 0.000118  min_lr: 0.000118  loss: 2.8679 (2.7238)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5635 (1.5116)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [269]  [1200/2502]  eta: 0:10:27  lr: 0.000117  min_lr: 0.000117  loss: 2.8705 (2.7267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5344 (1.5224)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [269]  [1400/2502]  eta: 0:08:50  lr: 0.000116  min_lr: 0.000116  loss: 2.6779 (2.7230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6104 (1.5371)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [269]  [1600/2502]  eta: 0:07:14  lr: 0.000116  min_lr: 0.000116  loss: 2.7424 (2.7243)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5563 (1.5390)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [269]  [1800/2502]  eta: 0:05:38  lr: 0.000115  min_lr: 0.000115  loss: 2.8124 (2.7206)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4528 (1.5449)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [269]  [2000/2502]  eta: 0:04:01  lr: 0.000115  min_lr: 0.000115  loss: 2.8328 (2.7232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4149 (1.5462)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [269]  [2200/2502]  eta: 0:02:25  lr: 0.000114  min_lr: 0.000114  loss: 2.8808 (2.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4759 (1.5446)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [269]  [2400/2502]  eta: 0:00:49  lr: 0.000114  min_lr: 0.000114  loss: 2.8054 (2.7206)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4903 (1.5531)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [269]  [2501/2502]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.9697 (2.7211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5730 (1.5554)  time: 0.4328  data: 0.0007  max mem: 42519
Epoch: [269] Total time: 0:20:03 (0.4809 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.9697 (2.7129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5730 (1.5554)
Test:  [ 0/50]  eta: 0:03:38  loss: 0.6906 (0.6906)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 4.3615  data: 4.0600  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.9128 (0.9363)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.2546)  time: 0.7476  data: 0.4986  max mem: 42519
Test:  [20/50]  eta: 0:00:16  loss: 0.9751 (0.9850)  acc1: 86.0000 (87.5619)  acc5: 98.0000 (98.0000)  time: 0.3552  data: 0.1140  max mem: 42519
Test:  [30/50]  eta: 0:00:09  loss: 1.1576 (1.0749)  acc1: 84.4000 (85.7032)  acc5: 96.8000 (97.2516)  time: 0.2923  data: 0.0546  max mem: 42519
Test:  [40/50]  eta: 0:00:04  loss: 1.2686 (1.1275)  acc1: 80.0000 (84.2634)  acc5: 95.6000 (96.8488)  time: 0.2486  data: 0.0120  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.2686 (1.1422)  acc1: 80.0000 (83.6720)  acc5: 95.6000 (96.7840)  time: 0.2367  data: 0.0001  max mem: 42519
Test: Total time: 0:00:18 (0.3733 s / it)
* Acc@1 84.196 Acc@5 96.872 loss 1.131
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [270]  [   0/2502]  eta: 1:21:54  lr: 0.000113  min_lr: 0.000113  loss: 2.5469 (2.5469)  weight_decay: 0.0500 (0.0500)  time: 1.9643  data: 1.4878  max mem: 42519
Epoch: [270]  [ 200/2502]  eta: 0:18:45  lr: 0.000113  min_lr: 0.000113  loss: 2.7953 (2.6999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5277 (1.5713)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [270]  [ 400/2502]  eta: 0:17:00  lr: 0.000112  min_lr: 0.000112  loss: 2.7626 (2.6845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5145 (1.6125)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [270]  [ 600/2502]  eta: 0:15:20  lr: 0.000111  min_lr: 0.000111  loss: 2.8006 (2.6815)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4752 (1.5795)  time: 0.4822  data: 0.0004  max mem: 42519
Epoch: [270]  [ 800/2502]  eta: 0:13:42  lr: 0.000111  min_lr: 0.000111  loss: 2.2350 (2.6761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4899 (1.5765)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [270]  [1000/2502]  eta: 0:12:05  lr: 0.000110  min_lr: 0.000110  loss: 2.5223 (2.6728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5317 (1.5767)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [270]  [1200/2502]  eta: 0:10:28  lr: 0.000110  min_lr: 0.000110  loss: 2.7427 (2.6748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4678 (1.5640)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [270]  [1400/2502]  eta: 0:08:51  lr: 0.000109  min_lr: 0.000109  loss: 2.7024 (2.6829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4861 (1.5711)  time: 0.4896  data: 0.0004  max mem: 42519
Epoch: [270]  [1600/2502]  eta: 0:07:14  lr: 0.000109  min_lr: 0.000109  loss: 2.8387 (2.6891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4504 (1.5756)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [270]  [1800/2502]  eta: 0:05:38  lr: 0.000108  min_lr: 0.000108  loss: 2.7878 (2.6891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5099 (1.5726)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [270]  [2000/2502]  eta: 0:04:01  lr: 0.000107  min_lr: 0.000107  loss: 2.6696 (2.6912)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3744 (1.5646)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [270]  [2200/2502]  eta: 0:02:25  lr: 0.000107  min_lr: 0.000107  loss: 2.7038 (2.6936)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4428 (1.5662)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [270]  [2400/2502]  eta: 0:00:49  lr: 0.000106  min_lr: 0.000106  loss: 2.7180 (2.6922)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4313 (1.5590)  time: 0.4807  data: 0.0006  max mem: 42519
Epoch: [270]  [2501/2502]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.6534 (2.6915)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5283 (1.5573)  time: 0.4443  data: 0.0009  max mem: 42519
Epoch: [270] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.6534 (2.6991)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5283 (1.5573)
Test:  [ 0/50]  eta: 0:04:14  loss: 0.4662 (0.4662)  acc1: 94.8000 (94.8000)  acc5: 99.6000 (99.6000)  time: 5.0858  data: 4.7964  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.6926 (0.7230)  acc1: 88.0000 (88.6546)  acc5: 98.8000 (98.4000)  time: 0.7360  data: 0.4940  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.7641 (0.7630)  acc1: 86.0000 (87.5619)  acc5: 98.0000 (98.1143)  time: 0.2844  data: 0.0452  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.8876 (0.8455)  acc1: 84.0000 (85.7677)  acc5: 96.8000 (97.3161)  time: 0.2564  data: 0.0136  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0443 (0.8955)  acc1: 80.0000 (84.3707)  acc5: 95.2000 (96.8585)  time: 0.2411  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0251 (0.9112)  acc1: 79.2000 (83.7760)  acc5: 95.6000 (96.8320)  time: 0.2372  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3567 s / it)
* Acc@1 84.254 Acc@5 96.910 loss 0.902
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.27%
Epoch: [271]  [   0/2502]  eta: 1:25:02  lr: 0.000106  min_lr: 0.000106  loss: 2.9859 (2.9859)  weight_decay: 0.0500 (0.0500)  time: 2.0393  data: 1.4929  max mem: 42519
Epoch: [271]  [ 200/2502]  eta: 0:18:43  lr: 0.000105  min_lr: 0.000105  loss: 2.9296 (2.6410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5345 (1.6087)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [271]  [ 400/2502]  eta: 0:17:00  lr: 0.000105  min_lr: 0.000105  loss: 2.7750 (2.6684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5008 (1.5737)  time: 0.4926  data: 0.0004  max mem: 42519
Epoch: [271]  [ 600/2502]  eta: 0:15:20  lr: 0.000104  min_lr: 0.000104  loss: 2.6961 (2.6840)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5242 (1.5866)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [271]  [ 800/2502]  eta: 0:13:42  lr: 0.000104  min_lr: 0.000104  loss: 2.7641 (2.6858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4975 (1.5678)  time: 0.4807  data: 0.0005  max mem: 42519
Epoch: [271]  [1000/2502]  eta: 0:12:05  lr: 0.000103  min_lr: 0.000103  loss: 2.6679 (2.6886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5034 (1.5468)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [271]  [1200/2502]  eta: 0:10:28  lr: 0.000102  min_lr: 0.000102  loss: 2.7938 (2.6917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4203 (1.5329)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [271]  [1400/2502]  eta: 0:08:51  lr: 0.000102  min_lr: 0.000102  loss: 2.8054 (2.6918)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4624 (1.5415)  time: 0.4900  data: 0.0004  max mem: 42519
Epoch: [271]  [1600/2502]  eta: 0:07:15  lr: 0.000101  min_lr: 0.000101  loss: 2.7827 (2.6935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5748 (1.5446)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [271]  [1800/2502]  eta: 0:05:38  lr: 0.000101  min_lr: 0.000101  loss: 2.5802 (2.6936)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4329 (1.5361)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [271]  [2000/2502]  eta: 0:04:01  lr: 0.000100  min_lr: 0.000100  loss: 2.8087 (2.6910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5365 (1.5369)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [271]  [2200/2502]  eta: 0:02:25  lr: 0.000100  min_lr: 0.000100  loss: 2.7442 (2.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4734 (1.5416)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [271]  [2400/2502]  eta: 0:00:49  lr: 0.000099  min_lr: 0.000099  loss: 2.8200 (2.6928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4433 (nan)  time: 0.4784  data: 0.0003  max mem: 42519
Epoch: [271]  [2501/2502]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.7341 (2.6908)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4700 (nan)  time: 0.4331  data: 0.0007  max mem: 42519
Epoch: [271] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.7341 (2.6950)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4700 (nan)
Test:  [ 0/50]  eta: 0:04:06  loss: 0.5122 (0.5122)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 4.9383  data: 4.6344  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.7380 (0.7775)  acc1: 89.6000 (88.6545)  acc5: 98.8000 (98.4000)  time: 0.7336  data: 0.4860  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8236 (0.8139)  acc1: 85.6000 (87.5810)  acc5: 98.4000 (98.1905)  time: 0.2784  data: 0.0372  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9349 (0.8998)  acc1: 84.0000 (85.7032)  acc5: 97.2000 (97.3677)  time: 0.2612  data: 0.0225  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0886 (0.9502)  acc1: 80.0000 (84.3122)  acc5: 95.6000 (96.9756)  time: 0.2579  data: 0.0210  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0886 (0.9672)  acc1: 78.8000 (83.7200)  acc5: 96.0000 (96.8960)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3582 s / it)
* Acc@1 84.142 Acc@5 96.862 loss 0.958
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.27%
Epoch: [272]  [   0/2502]  eta: 1:37:23  lr: 0.000099  min_lr: 0.000099  loss: 1.9954 (1.9954)  weight_decay: 0.0500 (0.0500)  time: 2.3357  data: 1.3683  max mem: 42519
Epoch: [272]  [ 200/2502]  eta: 0:18:48  lr: 0.000098  min_lr: 0.000098  loss: 2.4773 (2.6467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5904 (1.6495)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [272]  [ 400/2502]  eta: 0:17:02  lr: 0.000098  min_lr: 0.000098  loss: 2.4521 (2.6810)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7195 (nan)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [272]  [ 600/2502]  eta: 0:15:20  lr: 0.000097  min_lr: 0.000097  loss: 2.8271 (2.6925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5306 (nan)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [272]  [ 800/2502]  eta: 0:13:42  lr: 0.000097  min_lr: 0.000097  loss: 2.7571 (2.6830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5072 (nan)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [272]  [1000/2502]  eta: 0:12:05  lr: 0.000096  min_lr: 0.000096  loss: 2.8537 (2.6905)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5341 (nan)  time: 0.4798  data: 0.0008  max mem: 42519
Epoch: [272]  [1200/2502]  eta: 0:10:28  lr: 0.000096  min_lr: 0.000096  loss: 2.7240 (2.6934)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4786 (nan)  time: 0.4828  data: 0.0005  max mem: 42519
Epoch: [272]  [1400/2502]  eta: 0:08:51  lr: 0.000095  min_lr: 0.000095  loss: 2.8179 (2.6898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5099 (nan)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [272]  [1600/2502]  eta: 0:07:14  lr: 0.000094  min_lr: 0.000094  loss: 2.7898 (2.6917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5571 (nan)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [272]  [1800/2502]  eta: 0:05:38  lr: 0.000094  min_lr: 0.000094  loss: 2.7844 (2.6913)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4477 (nan)  time: 0.4870  data: 0.0006  max mem: 42519
Epoch: [272]  [2000/2502]  eta: 0:04:01  lr: 0.000093  min_lr: 0.000093  loss: 2.7155 (2.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5224 (nan)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [272]  [2200/2502]  eta: 0:02:25  lr: 0.000093  min_lr: 0.000093  loss: 2.8175 (2.6944)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6375 (nan)  time: 0.4817  data: 0.0005  max mem: 42519
Epoch: [272]  [2400/2502]  eta: 0:00:49  lr: 0.000092  min_lr: 0.000092  loss: 2.8307 (2.6892)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5257 (nan)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [272]  [2501/2502]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.5564 (2.6905)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4784 (nan)  time: 0.4401  data: 0.0008  max mem: 42519
Epoch: [272] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.5564 (2.6898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4784 (nan)
Test:  [ 0/50]  eta: 0:04:34  loss: 0.5007 (0.5007)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 5.4969  data: 5.1887  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.7027 (0.7441)  acc1: 89.6000 (88.5091)  acc5: 98.8000 (98.3273)  time: 0.7459  data: 0.5028  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.7828 (0.7837)  acc1: 85.6000 (87.3905)  acc5: 98.4000 (98.1333)  time: 0.2742  data: 0.0386  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.8997 (0.8690)  acc1: 84.4000 (85.6774)  acc5: 97.2000 (97.4323)  time: 0.2564  data: 0.0217  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0625 (0.9222)  acc1: 80.0000 (84.3317)  acc5: 95.6000 (97.0342)  time: 0.2350  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0463 (0.9366)  acc1: 80.0000 (83.8160)  acc5: 96.0000 (97.0160)  time: 0.2348  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3576 s / it)
* Acc@1 84.234 Acc@5 96.952 loss 0.928
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.27%
Epoch: [273]  [   0/2502]  eta: 1:41:07  lr: 0.000092  min_lr: 0.000092  loss: 2.5836 (2.5836)  weight_decay: 0.0500 (0.0500)  time: 2.4250  data: 1.7129  max mem: 42519
Epoch: [273]  [ 200/2502]  eta: 0:18:49  lr: 0.000092  min_lr: 0.000092  loss: 2.6891 (2.6544)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5552 (1.5661)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [273]  [ 400/2502]  eta: 0:16:58  lr: 0.000091  min_lr: 0.000091  loss: 2.7107 (2.6442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4150 (1.5925)  time: 0.4864  data: 0.0004  max mem: 42519
Epoch: [273]  [ 600/2502]  eta: 0:15:18  lr: 0.000090  min_lr: 0.000090  loss: 2.8361 (2.6559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5204 (1.5741)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [273]  [ 800/2502]  eta: 0:13:40  lr: 0.000090  min_lr: 0.000090  loss: 2.8694 (2.6541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4752 (1.5686)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [273]  [1000/2502]  eta: 0:12:03  lr: 0.000089  min_lr: 0.000089  loss: 2.6816 (2.6474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6005 (1.6103)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [273]  [1200/2502]  eta: 0:10:27  lr: 0.000089  min_lr: 0.000089  loss: 2.8102 (2.6589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5525 (1.6093)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [273]  [1400/2502]  eta: 0:08:50  lr: 0.000088  min_lr: 0.000088  loss: 2.7556 (2.6553)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5221 (1.6060)  time: 0.4789  data: 0.0003  max mem: 42519
Epoch: [273]  [1600/2502]  eta: 0:07:14  lr: 0.000088  min_lr: 0.000088  loss: 2.6985 (2.6582)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5149 (1.5942)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [273]  [1800/2502]  eta: 0:05:37  lr: 0.000087  min_lr: 0.000087  loss: 2.7230 (2.6598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5376 (1.5911)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [273]  [2000/2502]  eta: 0:04:01  lr: 0.000087  min_lr: 0.000087  loss: 2.8251 (2.6687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3797 (1.5858)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [273]  [2200/2502]  eta: 0:02:25  lr: 0.000086  min_lr: 0.000086  loss: 2.8325 (2.6745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6132 (1.5847)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [273]  [2400/2502]  eta: 0:00:49  lr: 0.000086  min_lr: 0.000086  loss: 2.8422 (2.6723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5583 (1.5838)  time: 0.4800  data: 0.0003  max mem: 42519
Epoch: [273]  [2501/2502]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.7007 (2.6755)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4668 (1.5815)  time: 0.4326  data: 0.0006  max mem: 42519
Epoch: [273] Total time: 0:20:03 (0.4809 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.7007 (2.6866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4668 (1.5815)
Test:  [ 0/50]  eta: 0:04:39  loss: 0.6041 (0.6041)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 5.5889  data: 5.2890  max mem: 42519
Test:  [10/50]  eta: 0:00:31  loss: 0.8476 (0.8623)  acc1: 88.0000 (88.1818)  acc5: 98.8000 (98.2909)  time: 0.7822  data: 0.5399  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.9077 (0.9006)  acc1: 84.8000 (87.3524)  acc5: 98.0000 (98.0381)  time: 0.2740  data: 0.0334  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0219 (0.9825)  acc1: 84.8000 (85.6774)  acc5: 97.2000 (97.3290)  time: 0.2465  data: 0.0012  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1938 (1.0358)  acc1: 80.0000 (84.4000)  acc5: 95.6000 (96.9171)  time: 0.2491  data: 0.0039  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1938 (1.0498)  acc1: 80.0000 (83.8480)  acc5: 96.0000 (96.8640)  time: 0.2449  data: 0.0037  max mem: 42519
Test: Total time: 0:00:18 (0.3657 s / it)
* Acc@1 84.280 Acc@5 96.880 loss 1.041
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.28%
Epoch: [274]  [   0/2502]  eta: 1:19:39  lr: 0.000085  min_lr: 0.000085  loss: 3.0657 (3.0657)  weight_decay: 0.0500 (0.0500)  time: 1.9104  data: 1.4255  max mem: 42519
Epoch: [274]  [ 200/2502]  eta: 0:18:44  lr: 0.000085  min_lr: 0.000085  loss: 2.7975 (2.6471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5892 (1.5895)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [274]  [ 400/2502]  eta: 0:17:00  lr: 0.000084  min_lr: 0.000084  loss: 2.3672 (2.6490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5215 (1.5805)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [274]  [ 600/2502]  eta: 0:15:20  lr: 0.000084  min_lr: 0.000084  loss: 2.8972 (2.6730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5340 (1.5804)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [274]  [ 800/2502]  eta: 0:13:42  lr: 0.000083  min_lr: 0.000083  loss: 2.6819 (2.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5198 (1.5723)  time: 0.4788  data: 0.0003  max mem: 42519
Epoch: [274]  [1000/2502]  eta: 0:12:04  lr: 0.000083  min_lr: 0.000083  loss: 2.5383 (2.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5093 (1.5684)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [274]  [1200/2502]  eta: 0:10:27  lr: 0.000082  min_lr: 0.000082  loss: 2.8460 (2.6787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5394 (1.5768)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [274]  [1400/2502]  eta: 0:08:50  lr: 0.000082  min_lr: 0.000082  loss: 2.7973 (2.6770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5561 (1.5851)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [274]  [1600/2502]  eta: 0:07:14  lr: 0.000081  min_lr: 0.000081  loss: 2.8929 (2.6721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5166 (1.5824)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [274]  [1800/2502]  eta: 0:05:37  lr: 0.000081  min_lr: 0.000081  loss: 2.6485 (2.6656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5175 (1.5738)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [274]  [2000/2502]  eta: 0:04:01  lr: 0.000080  min_lr: 0.000080  loss: 2.6646 (2.6645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5680 (1.5736)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [274]  [2200/2502]  eta: 0:02:25  lr: 0.000080  min_lr: 0.000080  loss: 2.8168 (2.6664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4962 (1.5678)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [274]  [2400/2502]  eta: 0:00:49  lr: 0.000079  min_lr: 0.000079  loss: 2.7868 (2.6672)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5354 (1.5752)  time: 0.4870  data: 0.0004  max mem: 42519
Epoch: [274]  [2501/2502]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.8972 (2.6700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5026 (1.5762)  time: 0.4334  data: 0.0008  max mem: 42519
Epoch: [274] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.8972 (2.6738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5026 (1.5762)
Test:  [ 0/50]  eta: 0:04:40  loss: 0.6040 (0.6040)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 5.6138  data: 5.3195  max mem: 42519
Test:  [10/50]  eta: 0:00:30  loss: 0.8536 (0.8695)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.3636)  time: 0.7596  data: 0.5183  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.9053 (0.9061)  acc1: 85.2000 (87.3143)  acc5: 98.0000 (98.1524)  time: 0.2553  data: 0.0193  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0212 (0.9892)  acc1: 84.4000 (85.5871)  acc5: 97.2000 (97.3807)  time: 0.2374  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1903 (1.0413)  acc1: 80.8000 (84.2146)  acc5: 95.6000 (96.9463)  time: 0.2406  data: 0.0031  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1903 (1.0548)  acc1: 80.4000 (83.7280)  acc5: 95.6000 (96.8480)  time: 0.2397  data: 0.0030  max mem: 42519
Test: Total time: 0:00:17 (0.3554 s / it)
* Acc@1 84.322 Acc@5 96.906 loss 1.043
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.32%
Epoch: [275]  [   0/2502]  eta: 1:19:45  lr: 0.000079  min_lr: 0.000079  loss: 2.4249 (2.4249)  weight_decay: 0.0500 (0.0500)  time: 1.9127  data: 1.4235  max mem: 42519
Epoch: [275]  [ 200/2502]  eta: 0:18:45  lr: 0.000079  min_lr: 0.000079  loss: 2.6816 (2.6755)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5253 (1.5381)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [275]  [ 400/2502]  eta: 0:16:58  lr: 0.000078  min_lr: 0.000078  loss: 2.8380 (2.6781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4210 (1.5452)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [275]  [ 600/2502]  eta: 0:15:20  lr: 0.000078  min_lr: 0.000078  loss: 2.6580 (2.6731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4615 (1.5478)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [275]  [ 800/2502]  eta: 0:13:43  lr: 0.000077  min_lr: 0.000077  loss: 2.9432 (2.6700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5674 (1.5612)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [275]  [1000/2502]  eta: 0:12:05  lr: 0.000077  min_lr: 0.000077  loss: 2.8467 (2.6805)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6115 (1.5707)  time: 0.4958  data: 0.0005  max mem: 42519
Epoch: [275]  [1200/2502]  eta: 0:10:28  lr: 0.000076  min_lr: 0.000076  loss: 2.7551 (2.6880)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5773 (1.5732)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [275]  [1400/2502]  eta: 0:08:51  lr: 0.000076  min_lr: 0.000076  loss: 2.6852 (2.6852)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5029 (1.5710)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [275]  [1600/2502]  eta: 0:07:14  lr: 0.000075  min_lr: 0.000075  loss: 2.8460 (2.6887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4950 (1.5691)  time: 0.4803  data: 0.0006  max mem: 42519
Epoch: [275]  [1800/2502]  eta: 0:05:38  lr: 0.000075  min_lr: 0.000075  loss: 2.5499 (2.6880)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4482 (1.5632)  time: 0.4778  data: 0.0005  max mem: 42519
Epoch: [275]  [2000/2502]  eta: 0:04:01  lr: 0.000074  min_lr: 0.000074  loss: 2.7985 (2.6881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4904 (1.5636)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [275]  [2200/2502]  eta: 0:02:25  lr: 0.000074  min_lr: 0.000074  loss: 2.5032 (2.6869)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4778 (1.5640)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [275]  [2400/2502]  eta: 0:00:49  lr: 0.000073  min_lr: 0.000073  loss: 2.6339 (2.6888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6153 (1.5653)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [275]  [2501/2502]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.6608 (2.6867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5792 (1.5658)  time: 0.4359  data: 0.0008  max mem: 42519
Epoch: [275] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.6608 (2.6828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5792 (1.5658)
Test:  [ 0/50]  eta: 0:04:36  loss: 0.5357 (0.5357)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 5.5279  data: 5.2348  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7811 (0.7849)  acc1: 89.2000 (88.5455)  acc5: 98.8000 (98.4727)  time: 0.7179  data: 0.4763  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8283 (0.8226)  acc1: 86.0000 (87.6762)  acc5: 98.4000 (98.2286)  time: 0.2387  data: 0.0017  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9540 (0.9084)  acc1: 84.0000 (85.7806)  acc5: 96.8000 (97.3290)  time: 0.2452  data: 0.0031  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1026 (0.9596)  acc1: 80.8000 (84.4390)  acc5: 95.6000 (96.9463)  time: 0.2437  data: 0.0017  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1005 (0.9749)  acc1: 79.2000 (83.8320)  acc5: 95.6000 (96.8320)  time: 0.2373  data: 0.0002  max mem: 42519
Test: Total time: 0:00:17 (0.3486 s / it)
* Acc@1 84.306 Acc@5 96.872 loss 0.966
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.32%
Epoch: [276]  [   0/2502]  eta: 1:29:23  lr: 0.000073  min_lr: 0.000073  loss: 3.0594 (3.0594)  weight_decay: 0.0500 (0.0500)  time: 2.1437  data: 1.6750  max mem: 42519
Epoch: [276]  [ 200/2502]  eta: 0:18:48  lr: 0.000073  min_lr: 0.000073  loss: 2.7624 (2.6801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4684 (1.6079)  time: 0.4804  data: 0.0004  max mem: 42519
Epoch: [276]  [ 400/2502]  eta: 0:17:00  lr: 0.000072  min_lr: 0.000072  loss: 2.6495 (2.6575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5633 (1.6045)  time: 0.4786  data: 0.0005  max mem: 42519
Epoch: [276]  [ 600/2502]  eta: 0:15:19  lr: 0.000072  min_lr: 0.000072  loss: 2.8362 (2.6762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6129 (1.6317)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [276]  [ 800/2502]  eta: 0:13:42  lr: 0.000071  min_lr: 0.000071  loss: 2.3225 (2.6717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6110 (1.6333)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [276]  [1000/2502]  eta: 0:12:05  lr: 0.000071  min_lr: 0.000071  loss: 2.8025 (2.6792)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4601 (1.6431)  time: 0.4793  data: 0.0005  max mem: 42519
Epoch: [276]  [1200/2502]  eta: 0:10:28  lr: 0.000070  min_lr: 0.000070  loss: 2.8729 (2.6796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5573 (1.6406)  time: 0.4817  data: 0.0004  max mem: 42519
Epoch: [276]  [1400/2502]  eta: 0:08:51  lr: 0.000070  min_lr: 0.000070  loss: 2.8933 (2.6845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5366 (1.6296)  time: 0.4807  data: 0.0004  max mem: 42519
Epoch: [276]  [1600/2502]  eta: 0:07:15  lr: 0.000069  min_lr: 0.000069  loss: 2.6266 (2.6770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5686 (1.6173)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [276]  [1800/2502]  eta: 0:05:38  lr: 0.000069  min_lr: 0.000069  loss: 2.6379 (2.6789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5145 (1.6195)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [276]  [2000/2502]  eta: 0:04:02  lr: 0.000068  min_lr: 0.000068  loss: 2.8163 (2.6735)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4488 (1.6182)  time: 0.4868  data: 0.0005  max mem: 42519
Epoch: [276]  [2200/2502]  eta: 0:02:25  lr: 0.000068  min_lr: 0.000068  loss: 2.6666 (2.6737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4215 (1.6285)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [276]  [2400/2502]  eta: 0:00:49  lr: 0.000067  min_lr: 0.000067  loss: 2.7001 (2.6700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5895 (1.6201)  time: 0.4817  data: 0.0004  max mem: 42519
Epoch: [276]  [2501/2502]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.7926 (2.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5560 (1.6178)  time: 0.4421  data: 0.0008  max mem: 42519
Epoch: [276] Total time: 0:20:06 (0.4821 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.7926 (2.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5560 (1.6178)
Test:  [ 0/50]  eta: 0:02:26  loss: 0.5553 (0.5553)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 2.9252  data: 2.6585  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8006 (0.8208)  acc1: 88.0000 (88.4364)  acc5: 98.8000 (98.4000)  time: 0.4818  data: 0.2421  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8627 (0.8532)  acc1: 85.6000 (87.5429)  acc5: 98.0000 (98.1333)  time: 0.2547  data: 0.0138  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.9768 (0.9398)  acc1: 84.8000 (85.8839)  acc5: 96.8000 (97.3290)  time: 0.2788  data: 0.0381  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1428 (0.9948)  acc1: 80.4000 (84.3707)  acc5: 95.6000 (96.9366)  time: 0.2612  data: 0.0246  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1280 (1.0113)  acc1: 78.8000 (83.8240)  acc5: 95.6000 (96.8160)  time: 0.2619  data: 0.0244  max mem: 42519
Test: Total time: 0:00:15 (0.3103 s / it)
* Acc@1 84.306 Acc@5 96.884 loss 1.002
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.32%
Epoch: [277]  [   0/2502]  eta: 1:24:15  lr: 0.000067  min_lr: 0.000067  loss: 2.4608 (2.4608)  weight_decay: 0.0500 (0.0500)  time: 2.0204  data: 1.5523  max mem: 42519
Epoch: [277]  [ 200/2502]  eta: 0:18:47  lr: 0.000067  min_lr: 0.000067  loss: 2.7153 (2.7082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4728 (1.5806)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [277]  [ 400/2502]  eta: 0:17:01  lr: 0.000066  min_lr: 0.000066  loss: 2.6477 (2.6789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5101 (1.5732)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [277]  [ 600/2502]  eta: 0:15:22  lr: 0.000066  min_lr: 0.000066  loss: 2.6911 (2.6515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4296 (1.5517)  time: 0.4862  data: 0.0004  max mem: 42519
Epoch: [277]  [ 800/2502]  eta: 0:13:43  lr: 0.000065  min_lr: 0.000065  loss: 2.8711 (2.6597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4844 (1.6003)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [277]  [1000/2502]  eta: 0:12:05  lr: 0.000065  min_lr: 0.000065  loss: 2.6838 (2.6564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4929 (1.5866)  time: 0.4867  data: 0.0006  max mem: 42519
Epoch: [277]  [1200/2502]  eta: 0:10:28  lr: 0.000064  min_lr: 0.000064  loss: 2.8676 (2.6526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5250 (1.5793)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [277]  [1400/2502]  eta: 0:08:51  lr: 0.000064  min_lr: 0.000064  loss: 2.8197 (2.6582)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6247 (1.5793)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [277]  [1600/2502]  eta: 0:07:14  lr: 0.000064  min_lr: 0.000064  loss: 2.5249 (2.6665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5468 (1.5804)  time: 0.4782  data: 0.0005  max mem: 42519
Epoch: [277]  [1800/2502]  eta: 0:05:38  lr: 0.000063  min_lr: 0.000063  loss: 2.6880 (2.6654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5350 (1.5807)  time: 0.4859  data: 0.0004  max mem: 42519
Epoch: [277]  [2000/2502]  eta: 0:04:01  lr: 0.000063  min_lr: 0.000063  loss: 2.7078 (2.6653)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6239 (1.5850)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [277]  [2200/2502]  eta: 0:02:25  lr: 0.000062  min_lr: 0.000062  loss: 2.8488 (2.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5360 (1.5874)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [277]  [2400/2502]  eta: 0:00:49  lr: 0.000062  min_lr: 0.000062  loss: 2.7313 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6185 (1.5922)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [277]  [2501/2502]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.5175 (2.6621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5367 (1.5908)  time: 0.4379  data: 0.0012  max mem: 42519
Epoch: [277] Total time: 0:20:04 (0.4816 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.5175 (2.6691)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5367 (1.5908)
Test:  [ 0/50]  eta: 0:02:34  loss: 0.4589 (0.4589)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 3.0957  data: 2.8328  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.6848 (0.7217)  acc1: 88.8000 (88.9091)  acc5: 98.8000 (98.4364)  time: 0.4981  data: 0.2579  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7741 (0.7571)  acc1: 86.0000 (87.8095)  acc5: 98.4000 (98.2476)  time: 0.2429  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.8587 (0.8390)  acc1: 84.4000 (85.9226)  acc5: 97.2000 (97.4581)  time: 0.2425  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0270 (0.8901)  acc1: 80.8000 (84.5561)  acc5: 95.6000 (97.0342)  time: 0.2376  data: 0.0006  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0037 (0.9054)  acc1: 79.6000 (83.9280)  acc5: 96.0000 (96.9600)  time: 0.2372  data: 0.0004  max mem: 42519
Test: Total time: 0:00:14 (0.2987 s / it)
* Acc@1 84.434 Acc@5 96.944 loss 0.895
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.43%
Epoch: [278]  [   0/2502]  eta: 1:30:27  lr: 0.000062  min_lr: 0.000062  loss: 2.8694 (2.8694)  weight_decay: 0.0500 (0.0500)  time: 2.1693  data: 1.6854  max mem: 42519
Epoch: [278]  [ 200/2502]  eta: 0:18:44  lr: 0.000061  min_lr: 0.000061  loss: 2.6701 (2.6541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5251 (1.5654)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [278]  [ 400/2502]  eta: 0:16:58  lr: 0.000061  min_lr: 0.000061  loss: 2.2918 (2.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4878 (1.5427)  time: 0.4863  data: 0.0004  max mem: 42519
Epoch: [278]  [ 600/2502]  eta: 0:15:18  lr: 0.000060  min_lr: 0.000060  loss: 2.7875 (2.6708)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5406 (1.5525)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [278]  [ 800/2502]  eta: 0:13:40  lr: 0.000060  min_lr: 0.000060  loss: 2.5199 (2.6704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5973 (1.5597)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [278]  [1000/2502]  eta: 0:12:03  lr: 0.000059  min_lr: 0.000059  loss: 2.7667 (2.6741)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5723 (1.6061)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [278]  [1200/2502]  eta: 0:10:26  lr: 0.000059  min_lr: 0.000059  loss: 2.8213 (2.6664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5460 (1.5995)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [278]  [1400/2502]  eta: 0:08:50  lr: 0.000059  min_lr: 0.000059  loss: 2.7067 (2.6652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5297 (1.5968)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [278]  [1600/2502]  eta: 0:07:14  lr: 0.000058  min_lr: 0.000058  loss: 2.7607 (2.6718)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4913 (1.5957)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [278]  [1800/2502]  eta: 0:05:38  lr: 0.000058  min_lr: 0.000058  loss: 2.8100 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5895 (1.5951)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [278]  [2000/2502]  eta: 0:04:01  lr: 0.000057  min_lr: 0.000057  loss: 2.6743 (2.6844)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4840 (1.5951)  time: 0.4882  data: 0.0004  max mem: 42519
Epoch: [278]  [2200/2502]  eta: 0:02:25  lr: 0.000057  min_lr: 0.000057  loss: 2.8520 (2.6816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5045 (1.5958)  time: 0.4868  data: 0.0006  max mem: 42519
Epoch: [278]  [2400/2502]  eta: 0:00:49  lr: 0.000056  min_lr: 0.000056  loss: 2.8717 (2.6805)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5127 (1.5990)  time: 0.4798  data: 0.0005  max mem: 42519
Epoch: [278]  [2501/2502]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.8850 (2.6828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4712 (1.5974)  time: 0.4391  data: 0.0011  max mem: 42519
Epoch: [278] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.8850 (2.6719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4712 (1.5974)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.6145 (0.6145)  acc1: 95.2000 (95.2000)  acc5: 99.2000 (99.2000)  time: 2.8024  data: 2.5264  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8205 (0.8457)  acc1: 89.2000 (89.1273)  acc5: 98.8000 (98.4000)  time: 0.4937  data: 0.2515  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8848 (0.8846)  acc1: 86.0000 (87.7524)  acc5: 98.4000 (98.1714)  time: 0.2551  data: 0.0123  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0288 (0.9699)  acc1: 84.4000 (86.0387)  acc5: 96.8000 (97.4065)  time: 0.2629  data: 0.0211  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1644 (1.0214)  acc1: 80.8000 (84.6342)  acc5: 95.2000 (97.0049)  time: 0.2578  data: 0.0208  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1644 (1.0374)  acc1: 79.2000 (84.0320)  acc5: 96.0000 (96.9280)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:15 (0.3056 s / it)
* Acc@1 84.412 Acc@5 96.926 loss 1.027
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.43%
Epoch: [279]  [   0/2502]  eta: 1:41:37  lr: 0.000056  min_lr: 0.000056  loss: 2.6643 (2.6643)  weight_decay: 0.0500 (0.0500)  time: 2.4370  data: 1.9479  max mem: 42519
Epoch: [279]  [ 200/2502]  eta: 0:18:52  lr: 0.000056  min_lr: 0.000056  loss: 2.8443 (2.7174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5383 (inf)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [279]  [ 400/2502]  eta: 0:17:03  lr: 0.000055  min_lr: 0.000055  loss: 2.4882 (2.7145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5959 (inf)  time: 0.4824  data: 0.0004  max mem: 42519
Epoch: [279]  [ 600/2502]  eta: 0:15:22  lr: 0.000055  min_lr: 0.000055  loss: 2.6444 (2.6931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4859 (inf)  time: 0.4839  data: 0.0004  max mem: 42519
Epoch: [279]  [ 800/2502]  eta: 0:13:43  lr: 0.000055  min_lr: 0.000055  loss: 2.7960 (2.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5418 (inf)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [279]  [1000/2502]  eta: 0:12:05  lr: 0.000054  min_lr: 0.000054  loss: 2.7451 (2.6857)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5406 (inf)  time: 0.4864  data: 0.0004  max mem: 42519
Epoch: [279]  [1200/2502]  eta: 0:10:28  lr: 0.000054  min_lr: 0.000054  loss: 2.7035 (2.6859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5046 (inf)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [279]  [1400/2502]  eta: 0:08:51  lr: 0.000053  min_lr: 0.000053  loss: 2.7931 (2.6819)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5834 (inf)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [279]  [1600/2502]  eta: 0:07:15  lr: 0.000053  min_lr: 0.000053  loss: 2.5617 (2.6746)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5735 (inf)  time: 0.4795  data: 0.0005  max mem: 42519
Epoch: [279]  [1800/2502]  eta: 0:05:38  lr: 0.000053  min_lr: 0.000053  loss: 2.8643 (2.6773)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5607 (inf)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [279]  [2000/2502]  eta: 0:04:01  lr: 0.000052  min_lr: 0.000052  loss: 2.5960 (2.6696)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5504 (inf)  time: 0.4787  data: 0.0006  max mem: 42519
Epoch: [279]  [2200/2502]  eta: 0:02:25  lr: 0.000052  min_lr: 0.000052  loss: 2.8354 (2.6724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6782 (inf)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [279]  [2400/2502]  eta: 0:00:49  lr: 0.000051  min_lr: 0.000051  loss: 2.7834 (2.6710)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6046 (inf)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [279]  [2501/2502]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.8606 (2.6722)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6361 (inf)  time: 0.4365  data: 0.0012  max mem: 42519
Epoch: [279] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.8606 (2.6720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6361 (inf)
Test:  [ 0/50]  eta: 0:02:54  loss: 0.5988 (0.5988)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 3.4913  data: 3.2059  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.8379 (0.8585)  acc1: 88.0000 (88.6909)  acc5: 98.8000 (98.3273)  time: 0.5346  data: 0.2919  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.9108 (0.8923)  acc1: 86.0000 (87.6191)  acc5: 98.4000 (98.0571)  time: 0.2444  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 1.0191 (0.9770)  acc1: 84.4000 (85.8452)  acc5: 96.8000 (97.2903)  time: 0.2493  data: 0.0057  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1733 (1.0286)  acc1: 80.4000 (84.5268)  acc5: 95.6000 (96.9171)  time: 0.2430  data: 0.0055  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1643 (1.0430)  acc1: 80.0000 (83.9120)  acc5: 95.6000 (96.8640)  time: 0.2421  data: 0.0052  max mem: 42519
Test: Total time: 0:00:15 (0.3094 s / it)
* Acc@1 84.402 Acc@5 96.942 loss 1.033
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.43%
Epoch: [280]  [   0/2502]  eta: 1:33:20  lr: 0.000051  min_lr: 0.000051  loss: 2.1297 (2.1297)  weight_decay: 0.0500 (0.0500)  time: 2.2385  data: 1.3371  max mem: 42519
Epoch: [280]  [ 200/2502]  eta: 0:18:47  lr: 0.000051  min_lr: 0.000051  loss: 2.3235 (2.6534)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4509 (1.5620)  time: 0.4810  data: 0.0004  max mem: 42519
Epoch: [280]  [ 400/2502]  eta: 0:17:01  lr: 0.000050  min_lr: 0.000050  loss: 2.4056 (2.6649)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6101 (1.6313)  time: 0.4992  data: 0.0005  max mem: 42519
Epoch: [280]  [ 600/2502]  eta: 0:15:20  lr: 0.000050  min_lr: 0.000050  loss: 2.8094 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6304 (1.6486)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [280]  [ 800/2502]  eta: 0:13:41  lr: 0.000050  min_lr: 0.000050  loss: 2.5629 (2.6603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5119 (1.6640)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [280]  [1000/2502]  eta: 0:12:04  lr: 0.000049  min_lr: 0.000049  loss: 2.7560 (2.6625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5000 (1.6370)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [280]  [1200/2502]  eta: 0:10:27  lr: 0.000049  min_lr: 0.000049  loss: 2.8578 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6506 (1.6388)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [280]  [1400/2502]  eta: 0:08:50  lr: 0.000048  min_lr: 0.000048  loss: 2.8320 (2.6589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4679 (1.6242)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [280]  [1600/2502]  eta: 0:07:14  lr: 0.000048  min_lr: 0.000048  loss: 2.5551 (2.6581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6410 (1.6261)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [280]  [1800/2502]  eta: 0:05:37  lr: 0.000048  min_lr: 0.000048  loss: 2.7867 (2.6596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4975 (1.6216)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [280]  [2000/2502]  eta: 0:04:01  lr: 0.000047  min_lr: 0.000047  loss: 2.7886 (2.6608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4829 (1.6217)  time: 0.4865  data: 0.0005  max mem: 42519
Epoch: [280]  [2200/2502]  eta: 0:02:25  lr: 0.000047  min_lr: 0.000047  loss: 2.6872 (2.6612)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5067 (1.6224)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [280]  [2400/2502]  eta: 0:00:49  lr: 0.000046  min_lr: 0.000046  loss: 2.7185 (2.6601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5710 (1.6185)  time: 0.4853  data: 0.0004  max mem: 42519
Epoch: [280]  [2501/2502]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.7166 (2.6601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4821 (1.6170)  time: 0.4352  data: 0.0008  max mem: 42519
Epoch: [280] Total time: 0:20:03 (0.4810 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.7166 (2.6642)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4821 (1.6170)
Test:  [ 0/50]  eta: 0:02:34  loss: 0.4990 (0.4990)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 3.0929  data: 2.8307  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.7331 (0.7507)  acc1: 88.4000 (88.9091)  acc5: 99.2000 (98.4000)  time: 0.5209  data: 0.2774  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7892 (0.7891)  acc1: 86.4000 (87.7333)  acc5: 98.4000 (98.2095)  time: 0.2563  data: 0.0114  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9207 (0.8734)  acc1: 84.8000 (85.9871)  acc5: 97.2000 (97.4065)  time: 0.2494  data: 0.0007  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0823 (0.9277)  acc1: 80.4000 (84.5561)  acc5: 95.6000 (96.9854)  time: 0.2589  data: 0.0160  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0703 (0.9423)  acc1: 79.2000 (83.9680)  acc5: 95.6000 (96.8720)  time: 0.2526  data: 0.0157  max mem: 42519
Test: Total time: 0:00:15 (0.3121 s / it)
* Acc@1 84.366 Acc@5 96.950 loss 0.931
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.43%
Epoch: [281]  [   0/2502]  eta: 1:30:38  lr: 0.000046  min_lr: 0.000046  loss: 2.4981 (2.4981)  weight_decay: 0.0500 (0.0500)  time: 2.1735  data: 1.6891  max mem: 42519
Epoch: [281]  [ 200/2502]  eta: 0:18:47  lr: 0.000046  min_lr: 0.000046  loss: 2.7637 (2.6332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5407 (1.6110)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [281]  [ 400/2502]  eta: 0:17:00  lr: 0.000046  min_lr: 0.000046  loss: 2.8442 (2.6690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5459 (1.6085)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [281]  [ 600/2502]  eta: 0:15:20  lr: 0.000045  min_lr: 0.000045  loss: 2.8500 (2.6806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5655 (1.6115)  time: 0.4955  data: 0.0004  max mem: 42519
Epoch: [281]  [ 800/2502]  eta: 0:13:42  lr: 0.000045  min_lr: 0.000045  loss: 2.7315 (2.6708)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4960 (1.5941)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [281]  [1000/2502]  eta: 0:12:04  lr: 0.000044  min_lr: 0.000044  loss: 2.8032 (2.6704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5364 (1.5876)  time: 0.4779  data: 0.0003  max mem: 42519
Epoch: [281]  [1200/2502]  eta: 0:10:27  lr: 0.000044  min_lr: 0.000044  loss: 2.8698 (2.6736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5640 (1.5847)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [281]  [1400/2502]  eta: 0:08:51  lr: 0.000044  min_lr: 0.000044  loss: 2.8652 (2.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5176 (1.5835)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [281]  [1600/2502]  eta: 0:07:14  lr: 0.000043  min_lr: 0.000043  loss: 2.7902 (2.6676)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5539 (1.5897)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [281]  [1800/2502]  eta: 0:05:38  lr: 0.000043  min_lr: 0.000043  loss: 2.7720 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5378 (1.5884)  time: 0.4864  data: 0.0005  max mem: 42519
Epoch: [281]  [2000/2502]  eta: 0:04:01  lr: 0.000043  min_lr: 0.000043  loss: 2.7674 (2.6754)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5603 (1.5923)  time: 0.4784  data: 0.0003  max mem: 42519
Epoch: [281]  [2200/2502]  eta: 0:02:25  lr: 0.000042  min_lr: 0.000042  loss: 2.6270 (2.6748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5385 (1.5880)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [281]  [2400/2502]  eta: 0:00:49  lr: 0.000042  min_lr: 0.000042  loss: 2.4446 (2.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5777 (1.5872)  time: 0.4813  data: 0.0004  max mem: 42519
Epoch: [281]  [2501/2502]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.8511 (2.6700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4961 (1.5863)  time: 0.4337  data: 0.0007  max mem: 42519
Epoch: [281] Total time: 0:20:04 (0.4814 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.8511 (2.6678)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4961 (1.5863)
Test:  [ 0/50]  eta: 0:04:07  loss: 0.6218 (0.6218)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 4.9556  data: 4.6702  max mem: 42519
Test:  [10/50]  eta: 0:00:26  loss: 0.8848 (0.8785)  acc1: 88.0000 (88.5455)  acc5: 98.8000 (98.4000)  time: 0.6641  data: 0.4249  max mem: 42519
Test:  [20/50]  eta: 0:00:13  loss: 0.9064 (0.9155)  acc1: 86.0000 (87.5429)  acc5: 98.0000 (98.0762)  time: 0.2368  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 1.0495 (1.0021)  acc1: 84.4000 (85.7936)  acc5: 96.4000 (97.2645)  time: 0.2612  data: 0.0209  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2043 (1.0561)  acc1: 80.0000 (84.3317)  acc5: 95.2000 (96.8683)  time: 0.2592  data: 0.0206  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1871 (1.0692)  acc1: 79.2000 (83.7680)  acc5: 96.0000 (96.7920)  time: 0.2549  data: 0.0204  max mem: 42519
Test: Total time: 0:00:17 (0.3424 s / it)
* Acc@1 84.276 Acc@5 96.916 loss 1.059
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.43%
Epoch: [282]  [   0/2502]  eta: 1:20:57  lr: 0.000042  min_lr: 0.000042  loss: 2.7266 (2.7266)  weight_decay: 0.0500 (0.0500)  time: 1.9414  data: 1.3154  max mem: 42519
Epoch: [282]  [ 200/2502]  eta: 0:18:40  lr: 0.000041  min_lr: 0.000041  loss: 2.7737 (2.6771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5813 (1.6898)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [282]  [ 400/2502]  eta: 0:16:58  lr: 0.000041  min_lr: 0.000041  loss: 2.5197 (2.6728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5439 (1.6510)  time: 0.4907  data: 0.0003  max mem: 42519
Epoch: [282]  [ 600/2502]  eta: 0:15:18  lr: 0.000041  min_lr: 0.000041  loss: 2.6941 (2.6645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5863 (1.6367)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [282]  [ 800/2502]  eta: 0:13:40  lr: 0.000040  min_lr: 0.000040  loss: 2.6359 (2.6667)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5967 (1.6367)  time: 0.4814  data: 0.0004  max mem: 42519
Epoch: [282]  [1000/2502]  eta: 0:12:04  lr: 0.000040  min_lr: 0.000040  loss: 2.7727 (2.6651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5024 (1.6218)  time: 0.4790  data: 0.0003  max mem: 42519
Epoch: [282]  [1200/2502]  eta: 0:10:27  lr: 0.000040  min_lr: 0.000040  loss: 2.5687 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5668 (inf)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [282]  [1400/2502]  eta: 0:08:50  lr: 0.000039  min_lr: 0.000039  loss: 2.8624 (2.6651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5914 (inf)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [282]  [1600/2502]  eta: 0:07:14  lr: 0.000039  min_lr: 0.000039  loss: 2.8611 (2.6631)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6669 (inf)  time: 0.4785  data: 0.0003  max mem: 42519
Epoch: [282]  [1800/2502]  eta: 0:05:37  lr: 0.000038  min_lr: 0.000038  loss: 2.6792 (2.6623)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5132 (inf)  time: 0.4787  data: 0.0003  max mem: 42519
Epoch: [282]  [2000/2502]  eta: 0:04:01  lr: 0.000038  min_lr: 0.000038  loss: 2.6863 (2.6629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5702 (inf)  time: 0.4880  data: 0.0004  max mem: 42519
Epoch: [282]  [2200/2502]  eta: 0:02:25  lr: 0.000038  min_lr: 0.000038  loss: 2.7772 (2.6595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5632 (inf)  time: 0.4818  data: 0.0004  max mem: 42519
Epoch: [282]  [2400/2502]  eta: 0:00:49  lr: 0.000037  min_lr: 0.000037  loss: 2.7632 (2.6621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4914 (inf)  time: 0.4816  data: 0.0004  max mem: 42519
Epoch: [282]  [2501/2502]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.7316 (2.6614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5042 (inf)  time: 0.4359  data: 0.0007  max mem: 42519
Epoch: [282] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.7316 (2.6581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5042 (inf)
Test:  [ 0/50]  eta: 0:03:19  loss: 0.4858 (0.4858)  acc1: 96.8000 (96.8000)  acc5: 99.2000 (99.2000)  time: 3.9838  data: 3.6957  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7297 (0.7384)  acc1: 89.2000 (89.0909)  acc5: 98.8000 (98.4000)  time: 0.7161  data: 0.4751  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.7878 (0.7807)  acc1: 86.0000 (87.8095)  acc5: 98.0000 (98.2286)  time: 0.3187  data: 0.0817  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9109 (0.8641)  acc1: 84.4000 (86.0774)  acc5: 97.2000 (97.4968)  time: 0.2679  data: 0.0270  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0606 (0.9151)  acc1: 80.4000 (84.7610)  acc5: 95.6000 (97.0732)  time: 0.2854  data: 0.0448  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0492 (0.9291)  acc1: 79.2000 (84.1360)  acc5: 96.0000 (97.0000)  time: 0.2605  data: 0.0232  max mem: 42519
Test: Total time: 0:00:18 (0.3659 s / it)
* Acc@1 84.468 Acc@5 97.022 loss 0.919
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.47%
Epoch: [283]  [   0/2502]  eta: 1:17:47  lr: 0.000037  min_lr: 0.000037  loss: 2.9930 (2.9930)  weight_decay: 0.0500 (0.0500)  time: 1.8654  data: 1.3857  max mem: 42519
Epoch: [283]  [ 200/2502]  eta: 0:18:43  lr: 0.000037  min_lr: 0.000037  loss: 2.8243 (2.6712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5336 (1.5385)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [283]  [ 400/2502]  eta: 0:16:55  lr: 0.000037  min_lr: 0.000037  loss: 2.7778 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6071 (1.6031)  time: 0.4772  data: 0.0004  max mem: 42519
Epoch: [283]  [ 600/2502]  eta: 0:15:17  lr: 0.000036  min_lr: 0.000036  loss: 2.7132 (2.6437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6571 (1.6200)  time: 0.4838  data: 0.0004  max mem: 42519
Epoch: [283]  [ 800/2502]  eta: 0:13:39  lr: 0.000036  min_lr: 0.000036  loss: 2.7453 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5545 (1.6231)  time: 0.4802  data: 0.0004  max mem: 42519
Epoch: [283]  [1000/2502]  eta: 0:12:02  lr: 0.000036  min_lr: 0.000036  loss: 2.8355 (2.6589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5881 (1.6140)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [283]  [1200/2502]  eta: 0:10:27  lr: 0.000035  min_lr: 0.000035  loss: 2.6738 (2.6539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4312 (1.6078)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [283]  [1400/2502]  eta: 0:08:50  lr: 0.000035  min_lr: 0.000035  loss: 2.6830 (2.6464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4598 (1.5975)  time: 0.4834  data: 0.0004  max mem: 42519
Epoch: [283]  [1600/2502]  eta: 0:07:14  lr: 0.000035  min_lr: 0.000035  loss: 2.8658 (2.6503)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4692 (1.6065)  time: 0.4792  data: 0.0005  max mem: 42519
Epoch: [283]  [1800/2502]  eta: 0:05:38  lr: 0.000034  min_lr: 0.000034  loss: 2.8010 (2.6506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5024 (1.6034)  time: 0.4845  data: 0.0004  max mem: 42519
Epoch: [283]  [2000/2502]  eta: 0:04:01  lr: 0.000034  min_lr: 0.000034  loss: 2.8190 (2.6518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5400 (1.6001)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [283]  [2200/2502]  eta: 0:02:25  lr: 0.000034  min_lr: 0.000034  loss: 2.5245 (2.6510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6267 (1.5986)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [283]  [2400/2502]  eta: 0:00:49  lr: 0.000033  min_lr: 0.000033  loss: 2.6999 (2.6495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4683 (1.5953)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [283]  [2501/2502]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.7229 (2.6531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5815 (1.5977)  time: 0.4353  data: 0.0007  max mem: 42519
Epoch: [283] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.7229 (2.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5815 (1.5977)
Test:  [ 0/50]  eta: 0:04:42  loss: 0.5677 (0.5677)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 5.6576  data: 5.3537  max mem: 42519
Test:  [10/50]  eta: 0:00:29  loss: 0.7876 (0.8131)  acc1: 88.0000 (88.3636)  acc5: 98.8000 (98.3273)  time: 0.7335  data: 0.4923  max mem: 42519
Test:  [20/50]  eta: 0:00:14  loss: 0.8548 (0.8445)  acc1: 86.0000 (87.5429)  acc5: 98.4000 (98.1333)  time: 0.2388  data: 0.0033  max mem: 42519
Test:  [30/50]  eta: 0:00:08  loss: 0.9768 (0.9297)  acc1: 84.4000 (85.8452)  acc5: 96.8000 (97.3548)  time: 0.2513  data: 0.0144  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1360 (0.9855)  acc1: 80.4000 (84.4878)  acc5: 95.6000 (96.9659)  time: 0.2505  data: 0.0142  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1308 (1.0000)  acc1: 80.0000 (83.8880)  acc5: 96.0000 (96.8720)  time: 0.2491  data: 0.0140  max mem: 42519
Test: Total time: 0:00:17 (0.3533 s / it)
* Acc@1 84.446 Acc@5 96.948 loss 0.989
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.47%
Epoch: [284]  [   0/2502]  eta: 1:39:13  lr: 0.000033  min_lr: 0.000033  loss: 3.0103 (3.0103)  weight_decay: 0.0500 (0.0500)  time: 2.3796  data: 1.3725  max mem: 42519
Epoch: [284]  [ 200/2502]  eta: 0:18:49  lr: 0.000033  min_lr: 0.000033  loss: 2.6461 (2.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5093 (1.5199)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [284]  [ 400/2502]  eta: 0:16:59  lr: 0.000032  min_lr: 0.000032  loss: 2.8139 (2.6519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5798 (1.5722)  time: 0.4792  data: 0.0006  max mem: 42519
Epoch: [284]  [ 600/2502]  eta: 0:15:18  lr: 0.000032  min_lr: 0.000032  loss: 2.8552 (2.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5533 (1.5960)  time: 0.4773  data: 0.0004  max mem: 42519
Epoch: [284]  [ 800/2502]  eta: 0:13:40  lr: 0.000032  min_lr: 0.000032  loss: 2.6734 (2.6509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5794 (1.6234)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [284]  [1000/2502]  eta: 0:12:03  lr: 0.000032  min_lr: 0.000032  loss: 2.5450 (2.6391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5080 (1.6158)  time: 0.4779  data: 0.0005  max mem: 42519
Epoch: [284]  [1200/2502]  eta: 0:10:26  lr: 0.000031  min_lr: 0.000031  loss: 2.8259 (2.6528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6328 (1.6352)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [284]  [1400/2502]  eta: 0:08:50  lr: 0.000031  min_lr: 0.000031  loss: 2.7059 (2.6520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4231 (1.6279)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [284]  [1600/2502]  eta: 0:07:13  lr: 0.000031  min_lr: 0.000031  loss: 2.7612 (2.6590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6583 (1.6305)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [284]  [1800/2502]  eta: 0:05:37  lr: 0.000030  min_lr: 0.000030  loss: 2.8003 (2.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5851 (1.6335)  time: 0.4888  data: 0.0004  max mem: 42519
Epoch: [284]  [2000/2502]  eta: 0:04:01  lr: 0.000030  min_lr: 0.000030  loss: 2.5915 (2.6532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5520 (1.6335)  time: 0.4800  data: 0.0004  max mem: 42519
Epoch: [284]  [2200/2502]  eta: 0:02:25  lr: 0.000030  min_lr: 0.000030  loss: 2.9407 (2.6568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5992 (1.6350)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [284]  [2400/2502]  eta: 0:00:49  lr: 0.000029  min_lr: 0.000029  loss: 2.7942 (2.6543)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5773 (1.6290)  time: 0.4812  data: 0.0005  max mem: 42519
Epoch: [284]  [2501/2502]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.7260 (2.6533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5725 (1.6302)  time: 0.4308  data: 0.0008  max mem: 42519
Epoch: [284] Total time: 0:20:02 (0.4807 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.7260 (2.6493)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5725 (1.6302)
Test:  [ 0/50]  eta: 0:04:09  loss: 0.5028 (0.5028)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 4.9912  data: 4.7087  max mem: 42519
Test:  [10/50]  eta: 0:00:28  loss: 0.7350 (0.7648)  acc1: 89.6000 (88.9455)  acc5: 98.8000 (98.2909)  time: 0.7168  data: 0.4688  max mem: 42519
Test:  [20/50]  eta: 0:00:16  loss: 0.8213 (0.8028)  acc1: 86.4000 (87.8476)  acc5: 98.4000 (98.1524)  time: 0.3346  data: 0.0898  max mem: 42519
Test:  [30/50]  eta: 0:00:09  loss: 0.9153 (0.8860)  acc1: 84.8000 (86.1548)  acc5: 96.8000 (97.3806)  time: 0.3329  data: 0.0911  max mem: 42519
Test:  [40/50]  eta: 0:00:04  loss: 1.0895 (0.9370)  acc1: 81.2000 (84.8000)  acc5: 95.6000 (97.0049)  time: 0.2614  data: 0.0238  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0580 (0.9498)  acc1: 80.4000 (84.1520)  acc5: 96.0000 (96.9360)  time: 0.2371  data: 0.0002  max mem: 42519
Test: Total time: 0:00:19 (0.3833 s / it)
* Acc@1 84.458 Acc@5 96.988 loss 0.938
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.47%
Epoch: [285]  [   0/2502]  eta: 1:29:43  lr: 0.000029  min_lr: 0.000029  loss: 3.1418 (3.1418)  weight_decay: 0.0500 (0.0500)  time: 2.1517  data: 1.6129  max mem: 42519
Epoch: [285]  [ 200/2502]  eta: 0:18:47  lr: 0.000029  min_lr: 0.000029  loss: 2.7437 (2.6817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4631 (1.5590)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [285]  [ 400/2502]  eta: 0:16:58  lr: 0.000029  min_lr: 0.000029  loss: 2.7904 (2.6507)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5259 (1.5920)  time: 0.4771  data: 0.0004  max mem: 42519
Epoch: [285]  [ 600/2502]  eta: 0:15:18  lr: 0.000028  min_lr: 0.000028  loss: 2.7991 (2.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5369 (1.6041)  time: 0.4777  data: 0.0004  max mem: 42519
Epoch: [285]  [ 800/2502]  eta: 0:13:40  lr: 0.000028  min_lr: 0.000028  loss: 2.6760 (2.6615)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5982 (1.6139)  time: 0.4801  data: 0.0005  max mem: 42519
Epoch: [285]  [1000/2502]  eta: 0:12:03  lr: 0.000028  min_lr: 0.000028  loss: 2.8045 (2.6419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4872 (1.6070)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [285]  [1200/2502]  eta: 0:10:27  lr: 0.000027  min_lr: 0.000027  loss: 2.6942 (2.6473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5450 (1.6108)  time: 0.4879  data: 0.0004  max mem: 42519
Epoch: [285]  [1400/2502]  eta: 0:08:50  lr: 0.000027  min_lr: 0.000027  loss: 2.6299 (2.6467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4593 (1.5983)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [285]  [1600/2502]  eta: 0:07:14  lr: 0.000027  min_lr: 0.000027  loss: 2.7142 (2.6469)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5310 (1.5949)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [285]  [1800/2502]  eta: 0:05:37  lr: 0.000027  min_lr: 0.000027  loss: 2.8173 (2.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6465 (1.6103)  time: 0.4851  data: 0.0004  max mem: 42519
Epoch: [285]  [2000/2502]  eta: 0:04:01  lr: 0.000026  min_lr: 0.000026  loss: 2.8599 (2.6454)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7062 (1.6146)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [285]  [2200/2502]  eta: 0:02:25  lr: 0.000026  min_lr: 0.000026  loss: 2.7759 (2.6481)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5904 (1.6200)  time: 0.4812  data: 0.0004  max mem: 42519
Epoch: [285]  [2400/2502]  eta: 0:00:49  lr: 0.000026  min_lr: 0.000026  loss: 2.6284 (2.6537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6029 (1.6174)  time: 0.4817  data: 0.0005  max mem: 42519
Epoch: [285]  [2501/2502]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.4741 (2.6541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6723 (1.6192)  time: 0.4302  data: 0.0006  max mem: 42519
Epoch: [285] Total time: 0:20:02 (0.4807 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.4741 (2.6495)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6723 (1.6192)
Test:  [ 0/50]  eta: 0:04:08  loss: 0.5111 (0.5111)  acc1: 96.0000 (96.0000)  acc5: 99.2000 (99.2000)  time: 4.9690  data: 4.6898  max mem: 42519
Test:  [10/50]  eta: 0:00:27  loss: 0.7218 (0.7557)  acc1: 89.6000 (89.0909)  acc5: 98.8000 (98.3273)  time: 0.6765  data: 0.4354  max mem: 42519
Test:  [20/50]  eta: 0:00:15  loss: 0.8049 (0.7934)  acc1: 85.6000 (87.8857)  acc5: 98.4000 (98.1143)  time: 0.3106  data: 0.0687  max mem: 42519
Test:  [30/50]  eta: 0:00:09  loss: 0.9072 (0.8720)  acc1: 85.2000 (86.1032)  acc5: 96.8000 (97.3290)  time: 0.3866  data: 0.1400  max mem: 42519
Test:  [40/50]  eta: 0:00:04  loss: 1.0602 (0.9215)  acc1: 80.4000 (84.7220)  acc5: 95.6000 (96.9366)  time: 0.3404  data: 0.0985  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0386 (0.9353)  acc1: 80.0000 (84.1520)  acc5: 95.6000 (96.8560)  time: 0.3353  data: 0.0982  max mem: 42519
Test: Total time: 0:00:20 (0.4045 s / it)
* Acc@1 84.462 Acc@5 96.972 loss 0.924
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.47%
Epoch: [286]  [   0/2502]  eta: 1:51:37  lr: 0.000026  min_lr: 0.000026  loss: 2.6881 (2.6881)  weight_decay: 0.0500 (0.0500)  time: 2.6769  data: 1.5829  max mem: 42519
Epoch: [286]  [ 200/2502]  eta: 0:18:51  lr: 0.000025  min_lr: 0.000025  loss: 2.6038 (2.6112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6346 (1.6923)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [286]  [ 400/2502]  eta: 0:17:01  lr: 0.000025  min_lr: 0.000025  loss: 2.7975 (2.6206)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5924 (1.6485)  time: 0.4801  data: 0.0004  max mem: 42519
Epoch: [286]  [ 600/2502]  eta: 0:15:21  lr: 0.000025  min_lr: 0.000025  loss: 2.7456 (2.6403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5720 (1.6310)  time: 0.4789  data: 0.0003  max mem: 42519
Epoch: [286]  [ 800/2502]  eta: 0:13:42  lr: 0.000025  min_lr: 0.000025  loss: 2.7235 (2.6530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5753 (1.6354)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [286]  [1000/2502]  eta: 0:12:04  lr: 0.000024  min_lr: 0.000024  loss: 2.5886 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5757 (1.6332)  time: 0.4773  data: 0.0004  max mem: 42519
Epoch: [286]  [1200/2502]  eta: 0:10:27  lr: 0.000024  min_lr: 0.000024  loss: 2.6200 (2.6432)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5403 (1.6336)  time: 0.4778  data: 0.0004  max mem: 42519
Epoch: [286]  [1400/2502]  eta: 0:08:50  lr: 0.000024  min_lr: 0.000024  loss: 2.6415 (2.6575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5930 (1.6426)  time: 0.4816  data: 0.0004  max mem: 42519
Epoch: [286]  [1600/2502]  eta: 0:07:14  lr: 0.000023  min_lr: 0.000023  loss: 2.5153 (2.6477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4695 (1.6373)  time: 0.4823  data: 0.0005  max mem: 42519
Epoch: [286]  [1800/2502]  eta: 0:05:37  lr: 0.000023  min_lr: 0.000023  loss: 2.6628 (2.6443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5855 (1.6363)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [286]  [2000/2502]  eta: 0:04:01  lr: 0.000023  min_lr: 0.000023  loss: 2.7163 (2.6398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6077 (1.6290)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [286]  [2200/2502]  eta: 0:02:25  lr: 0.000023  min_lr: 0.000023  loss: 2.8028 (2.6457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5582 (1.6324)  time: 0.4776  data: 0.0004  max mem: 42519
Epoch: [286]  [2400/2502]  eta: 0:00:49  lr: 0.000022  min_lr: 0.000022  loss: 2.6772 (2.6485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4820 (1.6347)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [286]  [2501/2502]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.7562 (2.6469)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5258 (1.6347)  time: 0.4299  data: 0.0007  max mem: 42519
Epoch: [286] Total time: 0:20:02 (0.4807 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.7562 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5258 (1.6347)
Test:  [ 0/50]  eta: 0:02:27  loss: 0.4833 (0.4833)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 2.9555  data: 2.5312  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.6994 (0.7380)  acc1: 88.0000 (88.6546)  acc5: 98.8000 (98.4000)  time: 0.5108  data: 0.2568  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7768 (0.7775)  acc1: 86.0000 (87.6381)  acc5: 98.0000 (98.1524)  time: 0.2568  data: 0.0150  max mem: 42519
Test:  [30/50]  eta: 0:00:07  loss: 0.8936 (0.8595)  acc1: 84.8000 (86.0516)  acc5: 96.8000 (97.3677)  time: 0.3017  data: 0.0567  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0609 (0.9112)  acc1: 80.8000 (84.6537)  acc5: 95.6000 (96.9951)  time: 0.3399  data: 0.0992  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0583 (0.9259)  acc1: 80.8000 (83.9600)  acc5: 96.0000 (96.9440)  time: 0.2811  data: 0.0430  max mem: 42519
Test: Total time: 0:00:17 (0.3437 s / it)
* Acc@1 84.456 Acc@5 96.958 loss 0.915
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.47%
Epoch: [287]  [   0/2502]  eta: 2:04:38  lr: 0.000022  min_lr: 0.000022  loss: 2.7638 (2.7638)  weight_decay: 0.0500 (0.0500)  time: 2.9889  data: 2.0301  max mem: 42519
Epoch: [287]  [ 200/2502]  eta: 0:18:57  lr: 0.000022  min_lr: 0.000022  loss: 2.6800 (2.6498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5131 (1.5764)  time: 0.4789  data: 0.0003  max mem: 42519
Epoch: [287]  [ 400/2502]  eta: 0:17:04  lr: 0.000022  min_lr: 0.000022  loss: 2.7266 (2.6372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5415 (1.5777)  time: 0.4793  data: 0.0003  max mem: 42519
Epoch: [287]  [ 600/2502]  eta: 0:15:22  lr: 0.000021  min_lr: 0.000021  loss: 2.6752 (2.6405)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5405 (1.5734)  time: 0.4797  data: 0.0005  max mem: 42519
Epoch: [287]  [ 800/2502]  eta: 0:13:43  lr: 0.000021  min_lr: 0.000021  loss: 2.8108 (2.6422)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5279 (1.5912)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [287]  [1000/2502]  eta: 0:12:05  lr: 0.000021  min_lr: 0.000021  loss: 2.5551 (2.6476)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4779  data: 0.0004  max mem: 42519
Epoch: [287]  [1200/2502]  eta: 0:10:28  lr: 0.000021  min_lr: 0.000021  loss: 2.7569 (2.6508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5770 (nan)  time: 0.4877  data: 0.0004  max mem: 42519
Epoch: [287]  [1400/2502]  eta: 0:08:51  lr: 0.000020  min_lr: 0.000020  loss: 2.5911 (2.6452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5373 (nan)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [287]  [1600/2502]  eta: 0:07:14  lr: 0.000020  min_lr: 0.000020  loss: 2.7204 (2.6403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5114 (nan)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [287]  [1800/2502]  eta: 0:05:38  lr: 0.000020  min_lr: 0.000020  loss: 2.7247 (2.6415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5690 (nan)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [287]  [2000/2502]  eta: 0:04:01  lr: 0.000020  min_lr: 0.000020  loss: 2.4543 (2.6395)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6037 (nan)  time: 0.4795  data: 0.0006  max mem: 42519
Epoch: [287]  [2200/2502]  eta: 0:02:25  lr: 0.000019  min_lr: 0.000019  loss: 2.6326 (2.6378)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4893 (nan)  time: 0.4789  data: 0.0005  max mem: 42519
Epoch: [287]  [2400/2502]  eta: 0:00:49  lr: 0.000019  min_lr: 0.000019  loss: 2.9058 (2.6419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5625 (nan)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [287]  [2501/2502]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.7940 (2.6412)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5647 (nan)  time: 0.4309  data: 0.0006  max mem: 42519
Epoch: [287] Total time: 0:20:04 (0.4815 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.7940 (2.6473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5647 (nan)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.5667 (0.5667)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 2.8189  data: 2.5293  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7743 (0.8141)  acc1: 90.0000 (88.7636)  acc5: 98.8000 (98.4364)  time: 0.4784  data: 0.2365  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8568 (0.8515)  acc1: 86.0000 (87.6571)  acc5: 98.4000 (98.2476)  time: 0.2414  data: 0.0038  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9739 (0.9347)  acc1: 84.4000 (86.0645)  acc5: 97.2000 (97.4839)  time: 0.2403  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1423 (0.9850)  acc1: 80.4000 (84.7122)  acc5: 95.6000 (97.0732)  time: 0.2400  data: 0.0004  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0974 (0.9998)  acc1: 80.0000 (84.0560)  acc5: 96.0000 (97.0000)  time: 0.2393  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2937 s / it)
* Acc@1 84.542 Acc@5 96.988 loss 0.989
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.54%
Epoch: [288]  [   0/2502]  eta: 2:19:09  lr: 0.000019  min_lr: 0.000019  loss: 1.6814 (1.6814)  weight_decay: 0.0500 (0.0500)  time: 3.3369  data: 2.8494  max mem: 42519
Epoch: [288]  [ 200/2502]  eta: 0:19:03  lr: 0.000019  min_lr: 0.000019  loss: 2.7932 (2.6303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5629 (1.6509)  time: 0.4927  data: 0.0006  max mem: 42519
Epoch: [288]  [ 400/2502]  eta: 0:17:09  lr: 0.000019  min_lr: 0.000019  loss: 2.5554 (2.6106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6677 (1.6762)  time: 0.4827  data: 0.0006  max mem: 42519
Epoch: [288]  [ 600/2502]  eta: 0:15:25  lr: 0.000018  min_lr: 0.000018  loss: 2.7353 (2.6337)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5956 (1.6561)  time: 0.4785  data: 0.0005  max mem: 42519
Epoch: [288]  [ 800/2502]  eta: 0:13:45  lr: 0.000018  min_lr: 0.000018  loss: 2.4102 (2.6282)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6592 (1.6623)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [288]  [1000/2502]  eta: 0:12:07  lr: 0.000018  min_lr: 0.000018  loss: 2.8704 (2.6324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5726 (1.6676)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [288]  [1200/2502]  eta: 0:10:29  lr: 0.000018  min_lr: 0.000018  loss: 2.8096 (2.6407)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5828 (1.6682)  time: 0.4822  data: 0.0006  max mem: 42519
Epoch: [288]  [1400/2502]  eta: 0:08:52  lr: 0.000017  min_lr: 0.000017  loss: 2.7733 (2.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6168 (1.6632)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [288]  [1600/2502]  eta: 0:07:15  lr: 0.000017  min_lr: 0.000017  loss: 2.6520 (2.6371)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5100 (1.6580)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [288]  [1800/2502]  eta: 0:05:39  lr: 0.000017  min_lr: 0.000017  loss: 2.6820 (2.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5501 (1.6522)  time: 0.4794  data: 0.0005  max mem: 42519
Epoch: [288]  [2000/2502]  eta: 0:04:02  lr: 0.000017  min_lr: 0.000017  loss: 2.7247 (2.6398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5652 (1.6436)  time: 0.4811  data: 0.0005  max mem: 42519
Epoch: [288]  [2200/2502]  eta: 0:02:25  lr: 0.000017  min_lr: 0.000017  loss: 2.8883 (2.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5845 (1.6445)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [288]  [2400/2502]  eta: 0:00:49  lr: 0.000016  min_lr: 0.000016  loss: 2.7340 (2.6449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5532 (1.6441)  time: 0.4840  data: 0.0005  max mem: 42519
Epoch: [288]  [2501/2502]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.7069 (2.6434)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5361 (1.6592)  time: 0.4316  data: 0.0008  max mem: 42519
Epoch: [288] Total time: 0:20:07 (0.4825 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.7069 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5361 (1.6592)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.5863 (0.5863)  acc1: 95.2000 (95.2000)  acc5: 99.6000 (99.6000)  time: 2.7853  data: 2.5179  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.8076 (0.8362)  acc1: 88.4000 (88.7273)  acc5: 98.4000 (98.1818)  time: 0.4693  data: 0.2293  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8716 (0.8747)  acc1: 86.0000 (87.5619)  acc5: 98.0000 (97.9810)  time: 0.2379  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0007 (0.9585)  acc1: 84.8000 (85.7936)  acc5: 97.2000 (97.2516)  time: 0.2403  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1577 (1.0085)  acc1: 80.0000 (84.4781)  acc5: 95.2000 (96.8976)  time: 0.2414  data: 0.0005  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1398 (1.0227)  acc1: 80.0000 (83.8640)  acc5: 96.0000 (96.7760)  time: 0.2401  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2930 s / it)
* Acc@1 84.356 Acc@5 96.894 loss 1.011
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.54%
Epoch: [289]  [   0/2502]  eta: 2:20:05  lr: 0.000016  min_lr: 0.000016  loss: 1.9566 (1.9566)  weight_decay: 0.0500 (0.0500)  time: 3.3594  data: 2.8170  max mem: 42519
Epoch: [289]  [ 200/2502]  eta: 0:19:06  lr: 0.000016  min_lr: 0.000016  loss: 2.5568 (2.6063)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6669 (1.6460)  time: 0.4941  data: 0.0004  max mem: 42519
Epoch: [289]  [ 400/2502]  eta: 0:17:09  lr: 0.000016  min_lr: 0.000016  loss: 2.8106 (2.6290)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4917 (1.6110)  time: 0.4790  data: 0.0004  max mem: 42519
Epoch: [289]  [ 600/2502]  eta: 0:15:26  lr: 0.000016  min_lr: 0.000016  loss: 2.7098 (2.6164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6314 (1.6123)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [289]  [ 800/2502]  eta: 0:13:46  lr: 0.000015  min_lr: 0.000015  loss: 2.8224 (2.6142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5027 (1.5955)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [289]  [1000/2502]  eta: 0:12:08  lr: 0.000015  min_lr: 0.000015  loss: 2.6624 (2.6100)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4462 (1.5884)  time: 0.4822  data: 0.0004  max mem: 42519
Epoch: [289]  [1200/2502]  eta: 0:10:30  lr: 0.000015  min_lr: 0.000015  loss: 2.7860 (2.6178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4731 (1.5905)  time: 0.4806  data: 0.0005  max mem: 42519
Epoch: [289]  [1400/2502]  eta: 0:08:53  lr: 0.000015  min_lr: 0.000015  loss: 2.7901 (2.6248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5886 (1.5986)  time: 0.4809  data: 0.0006  max mem: 42519
Epoch: [289]  [1600/2502]  eta: 0:07:16  lr: 0.000014  min_lr: 0.000014  loss: 2.7186 (2.6309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4861 (1.5947)  time: 0.4876  data: 0.0005  max mem: 42519
Epoch: [289]  [1800/2502]  eta: 0:05:39  lr: 0.000014  min_lr: 0.000014  loss: 2.5706 (2.6297)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5148 (1.5976)  time: 0.4828  data: 0.0004  max mem: 42519
Epoch: [289]  [2000/2502]  eta: 0:04:02  lr: 0.000014  min_lr: 0.000014  loss: 2.4861 (2.6281)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5077 (1.5948)  time: 0.4793  data: 0.0004  max mem: 42519
Epoch: [289]  [2200/2502]  eta: 0:02:25  lr: 0.000014  min_lr: 0.000014  loss: 2.7362 (2.6298)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5518 (1.5994)  time: 0.4800  data: 0.0005  max mem: 42519
Epoch: [289]  [2400/2502]  eta: 0:00:49  lr: 0.000014  min_lr: 0.000014  loss: 2.5615 (2.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5540 (1.6016)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [289]  [2501/2502]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.6507 (2.6346)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6037 (1.6062)  time: 0.4312  data: 0.0007  max mem: 42519
Epoch: [289] Total time: 0:20:07 (0.4827 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.6507 (2.6428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6037 (1.6062)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.4810 (0.4810)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 2.8042  data: 2.5210  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7103 (0.7349)  acc1: 89.2000 (89.0545)  acc5: 98.8000 (98.2545)  time: 0.4777  data: 0.2363  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.7725 (0.7698)  acc1: 86.0000 (87.9238)  acc5: 98.4000 (98.1905)  time: 0.2416  data: 0.0041  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.8994 (0.8520)  acc1: 84.4000 (86.1548)  acc5: 96.8000 (97.4839)  time: 0.2382  data: 0.0004  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0561 (0.9041)  acc1: 80.4000 (84.7415)  acc5: 96.0000 (97.1024)  time: 0.2380  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0345 (0.9182)  acc1: 79.2000 (84.1280)  acc5: 96.0000 (97.0240)  time: 0.2377  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2929 s / it)
* Acc@1 84.540 Acc@5 96.998 loss 0.907
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.54%
Epoch: [290]  [   0/2502]  eta: 2:18:57  lr: 0.000014  min_lr: 0.000014  loss: 3.0319 (3.0319)  weight_decay: 0.0500 (0.0500)  time: 3.3324  data: 2.7609  max mem: 42519
Epoch: [290]  [ 200/2502]  eta: 0:19:00  lr: 0.000013  min_lr: 0.000013  loss: 2.8368 (2.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6199 (1.7885)  time: 0.4870  data: 0.0003  max mem: 42519
Epoch: [290]  [ 400/2502]  eta: 0:17:07  lr: 0.000013  min_lr: 0.000013  loss: 2.7531 (2.6562)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5178 (1.6804)  time: 0.4881  data: 0.0005  max mem: 42519
Epoch: [290]  [ 600/2502]  eta: 0:15:24  lr: 0.000013  min_lr: 0.000013  loss: 2.6886 (2.6555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5611 (1.6565)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [290]  [ 800/2502]  eta: 0:13:44  lr: 0.000013  min_lr: 0.000013  loss: 2.5605 (2.6572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5730 (1.6565)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [290]  [1000/2502]  eta: 0:12:06  lr: 0.000013  min_lr: 0.000013  loss: 2.8855 (2.6525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6222 (1.6461)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [290]  [1200/2502]  eta: 0:10:28  lr: 0.000012  min_lr: 0.000012  loss: 2.7847 (2.6480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5162 (1.6452)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [290]  [1400/2502]  eta: 0:08:51  lr: 0.000012  min_lr: 0.000012  loss: 2.5082 (2.6526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5895 (1.6538)  time: 0.4775  data: 0.0004  max mem: 42519
Epoch: [290]  [1600/2502]  eta: 0:07:14  lr: 0.000012  min_lr: 0.000012  loss: 2.6214 (2.6544)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5893 (1.6457)  time: 0.4872  data: 0.0003  max mem: 42519
Epoch: [290]  [1800/2502]  eta: 0:05:38  lr: 0.000012  min_lr: 0.000012  loss: 2.6919 (2.6524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5480 (1.6418)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [290]  [2000/2502]  eta: 0:04:01  lr: 0.000012  min_lr: 0.000012  loss: 2.5969 (2.6514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6403 (1.6380)  time: 0.4771  data: 0.0003  max mem: 42519
Epoch: [290]  [2200/2502]  eta: 0:02:25  lr: 0.000011  min_lr: 0.000011  loss: 2.9168 (2.6533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5042 (1.6302)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [290]  [2400/2502]  eta: 0:00:49  lr: 0.000011  min_lr: 0.000011  loss: 2.8379 (2.6558)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5379 (1.6311)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [290]  [2501/2502]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.7139 (2.6556)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5625 (1.6310)  time: 0.4317  data: 0.0009  max mem: 42519
Epoch: [290] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.7139 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5625 (1.6310)
Test:  [ 0/50]  eta: 0:02:36  loss: 0.5295 (0.5295)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 3.1302  data: 2.8488  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7611 (0.7793)  acc1: 89.2000 (88.9091)  acc5: 98.8000 (98.3636)  time: 0.4984  data: 0.2594  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8267 (0.8177)  acc1: 86.0000 (87.8286)  acc5: 98.4000 (98.1524)  time: 0.2353  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9494 (0.9010)  acc1: 84.4000 (86.1548)  acc5: 96.8000 (97.3936)  time: 0.2354  data: 0.0004  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0987 (0.9530)  acc1: 80.4000 (84.7415)  acc5: 95.6000 (97.0146)  time: 0.2352  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0670 (0.9675)  acc1: 80.0000 (84.1360)  acc5: 95.6000 (96.9360)  time: 0.2351  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2959 s / it)
* Acc@1 84.556 Acc@5 97.004 loss 0.955
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.56%
Epoch: [291]  [   0/2502]  eta: 1:28:21  lr: 0.000011  min_lr: 0.000011  loss: 3.1542 (3.1542)  weight_decay: 0.0500 (0.0500)  time: 2.1189  data: 1.6394  max mem: 42519
Epoch: [291]  [ 200/2502]  eta: 0:18:45  lr: 0.000011  min_lr: 0.000011  loss: 2.9702 (2.6579)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5542 (1.6825)  time: 0.4849  data: 0.0004  max mem: 42519
Epoch: [291]  [ 400/2502]  eta: 0:16:58  lr: 0.000011  min_lr: 0.000011  loss: 2.6319 (2.6458)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4727 (1.6467)  time: 0.4893  data: 0.0005  max mem: 42519
Epoch: [291]  [ 600/2502]  eta: 0:15:18  lr: 0.000011  min_lr: 0.000011  loss: 2.9059 (2.6566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6222 (1.6402)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [291]  [ 800/2502]  eta: 0:13:41  lr: 0.000010  min_lr: 0.000010  loss: 2.6897 (2.6437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5695 (1.6276)  time: 0.4804  data: 0.0003  max mem: 42519
Epoch: [291]  [1000/2502]  eta: 0:12:04  lr: 0.000010  min_lr: 0.000010  loss: 2.7574 (2.6372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5268 (1.6306)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [291]  [1200/2502]  eta: 0:10:27  lr: 0.000010  min_lr: 0.000010  loss: 2.9179 (2.6433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4790 (1.6251)  time: 0.4790  data: 0.0003  max mem: 42519
Epoch: [291]  [1400/2502]  eta: 0:08:50  lr: 0.000010  min_lr: 0.000010  loss: 2.5511 (2.6415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5904 (1.6202)  time: 0.4782  data: 0.0003  max mem: 42519
Epoch: [291]  [1600/2502]  eta: 0:07:14  lr: 0.000010  min_lr: 0.000010  loss: 2.7918 (2.6387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5430 (1.6188)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [291]  [1800/2502]  eta: 0:05:37  lr: 0.000010  min_lr: 0.000010  loss: 2.4870 (2.6373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6289 (1.6274)  time: 0.4780  data: 0.0004  max mem: 42519
Epoch: [291]  [2000/2502]  eta: 0:04:01  lr: 0.000009  min_lr: 0.000009  loss: 2.5924 (2.6347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5728 (1.6304)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [291]  [2200/2502]  eta: 0:02:25  lr: 0.000009  min_lr: 0.000009  loss: 2.8898 (2.6303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4304 (1.6178)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [291]  [2400/2502]  eta: 0:00:49  lr: 0.000009  min_lr: 0.000009  loss: 2.7597 (2.6283)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5046 (1.6135)  time: 0.4796  data: 0.0005  max mem: 42519
Epoch: [291]  [2501/2502]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.7842 (2.6296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6256 (1.6126)  time: 0.4331  data: 0.0006  max mem: 42519
Epoch: [291] Total time: 0:20:03 (0.4812 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.7842 (2.6420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6256 (1.6126)
Test:  [ 0/50]  eta: 0:02:35  loss: 0.5357 (0.5357)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 3.1191  data: 2.8399  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7562 (0.7903)  acc1: 88.8000 (88.9091)  acc5: 98.8000 (98.3273)  time: 0.4998  data: 0.2587  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8371 (0.8318)  acc1: 86.0000 (87.7905)  acc5: 98.4000 (98.1524)  time: 0.2380  data: 0.0006  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9567 (0.9148)  acc1: 84.4000 (86.0516)  acc5: 96.8000 (97.4194)  time: 0.2382  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1011 (0.9656)  acc1: 80.8000 (84.7317)  acc5: 95.6000 (97.0439)  time: 0.2379  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0835 (0.9789)  acc1: 79.2000 (84.1440)  acc5: 96.0000 (96.9760)  time: 0.2377  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2992 s / it)
* Acc@1 84.536 Acc@5 97.002 loss 0.967
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.56%
Epoch: [292]  [   0/2502]  eta: 2:24:31  lr: 0.000009  min_lr: 0.000009  loss: 2.6056 (2.6056)  weight_decay: 0.0500 (0.0500)  time: 3.4658  data: 2.1622  max mem: 42519
Epoch: [292]  [ 200/2502]  eta: 0:19:03  lr: 0.000009  min_lr: 0.000009  loss: 2.8367 (2.6574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4485 (1.6121)  time: 0.4871  data: 0.0004  max mem: 42519
Epoch: [292]  [ 400/2502]  eta: 0:17:06  lr: 0.000009  min_lr: 0.000009  loss: 2.9498 (2.6629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5978 (1.6269)  time: 0.4906  data: 0.0004  max mem: 42519
Epoch: [292]  [ 600/2502]  eta: 0:15:25  lr: 0.000009  min_lr: 0.000009  loss: 2.7409 (2.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5508 (1.6518)  time: 0.4856  data: 0.0004  max mem: 42519
Epoch: [292]  [ 800/2502]  eta: 0:13:45  lr: 0.000008  min_lr: 0.000008  loss: 2.6021 (2.6434)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4919 (1.6359)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [292]  [1000/2502]  eta: 0:12:06  lr: 0.000008  min_lr: 0.000008  loss: 2.8675 (2.6518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5432 (1.6347)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [292]  [1200/2502]  eta: 0:10:29  lr: 0.000008  min_lr: 0.000008  loss: 2.8444 (2.6475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5699 (1.6330)  time: 0.4791  data: 0.0005  max mem: 42519
Epoch: [292]  [1400/2502]  eta: 0:08:52  lr: 0.000008  min_lr: 0.000008  loss: 2.5621 (2.6415)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5921 (1.6292)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [292]  [1600/2502]  eta: 0:07:15  lr: 0.000008  min_lr: 0.000008  loss: 2.3729 (2.6393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5718 (1.6247)  time: 0.4821  data: 0.0004  max mem: 42519
Epoch: [292]  [1800/2502]  eta: 0:05:38  lr: 0.000008  min_lr: 0.000008  loss: 2.5419 (2.6392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5422 (1.6211)  time: 0.4798  data: 0.0006  max mem: 42519
Epoch: [292]  [2000/2502]  eta: 0:04:02  lr: 0.000008  min_lr: 0.000008  loss: 2.7597 (2.6359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6291 (1.6238)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [292]  [2200/2502]  eta: 0:02:25  lr: 0.000007  min_lr: 0.000007  loss: 2.6677 (2.6370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5439 (1.6265)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [292]  [2400/2502]  eta: 0:00:49  lr: 0.000007  min_lr: 0.000007  loss: 2.6611 (2.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5197 (1.6209)  time: 0.4809  data: 0.0003  max mem: 42519
Epoch: [292]  [2501/2502]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.7815 (2.6419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5548 (1.6206)  time: 0.4338  data: 0.0007  max mem: 42519
Epoch: [292] Total time: 0:20:06 (0.4822 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.7815 (2.6464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5548 (1.6206)
Test:  [ 0/50]  eta: 0:02:17  loss: 0.6022 (0.6022)  acc1: 96.0000 (96.0000)  acc5: 99.2000 (99.2000)  time: 2.7580  data: 2.4906  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.8315 (0.8595)  acc1: 89.2000 (88.9455)  acc5: 98.8000 (98.3636)  time: 0.4670  data: 0.2269  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8950 (0.8955)  acc1: 86.0000 (87.8667)  acc5: 98.0000 (98.1714)  time: 0.2379  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0319 (0.9830)  acc1: 84.4000 (86.1806)  acc5: 96.8000 (97.4323)  time: 0.2381  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:02  loss: 1.1974 (1.0353)  acc1: 80.8000 (84.8098)  acc5: 95.6000 (97.0537)  time: 0.2378  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1582 (1.0483)  acc1: 80.4000 (84.1840)  acc5: 95.6000 (96.9600)  time: 0.2375  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2910 s / it)
* Acc@1 84.562 Acc@5 97.014 loss 1.038
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.56%
Epoch: [293]  [   0/2502]  eta: 2:03:45  lr: 0.000007  min_lr: 0.000007  loss: 1.7021 (1.7021)  weight_decay: 0.0500 (0.0500)  time: 2.9678  data: 2.4722  max mem: 42519
Epoch: [293]  [ 200/2502]  eta: 0:19:00  lr: 0.000007  min_lr: 0.000007  loss: 2.8555 (2.6630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4805 (1.4934)  time: 0.4806  data: 0.0004  max mem: 42519
Epoch: [293]  [ 400/2502]  eta: 0:17:06  lr: 0.000007  min_lr: 0.000007  loss: 2.6740 (2.6339)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5138 (1.5429)  time: 0.4785  data: 0.0003  max mem: 42519
Epoch: [293]  [ 600/2502]  eta: 0:15:23  lr: 0.000007  min_lr: 0.000007  loss: 2.6485 (2.6293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5065 (1.5690)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [293]  [ 800/2502]  eta: 0:13:45  lr: 0.000007  min_lr: 0.000007  loss: 2.6956 (2.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5205 (1.5641)  time: 0.4821  data: 0.0006  max mem: 42519
Epoch: [293]  [1000/2502]  eta: 0:12:07  lr: 0.000006  min_lr: 0.000006  loss: 2.4947 (2.6387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5859 (1.5705)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [293]  [1200/2502]  eta: 0:10:29  lr: 0.000006  min_lr: 0.000006  loss: 2.7034 (2.6472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5247 (1.5671)  time: 0.4826  data: 0.0006  max mem: 42519
Epoch: [293]  [1400/2502]  eta: 0:08:52  lr: 0.000006  min_lr: 0.000006  loss: 2.8854 (2.6548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5713 (1.5680)  time: 0.4797  data: 0.0004  max mem: 42519
Epoch: [293]  [1600/2502]  eta: 0:07:15  lr: 0.000006  min_lr: 0.000006  loss: 2.7369 (2.6520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6636 (1.5862)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [293]  [1800/2502]  eta: 0:05:39  lr: 0.000006  min_lr: 0.000006  loss: 2.7877 (2.6468)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5089 (1.5872)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [293]  [2000/2502]  eta: 0:04:02  lr: 0.000006  min_lr: 0.000006  loss: 2.6085 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5186 (1.5880)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [293]  [2200/2502]  eta: 0:02:25  lr: 0.000006  min_lr: 0.000006  loss: 2.8789 (2.6505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5011 (1.5894)  time: 0.4808  data: 0.0004  max mem: 42519
Epoch: [293]  [2400/2502]  eta: 0:00:49  lr: 0.000006  min_lr: 0.000006  loss: 2.6175 (2.6459)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4914 (1.5865)  time: 0.4783  data: 0.0005  max mem: 42519
Epoch: [293]  [2501/2502]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.6533 (2.6456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5597 (1.5922)  time: 0.4300  data: 0.0006  max mem: 42519
Epoch: [293] Total time: 0:20:06 (0.4824 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.6533 (2.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5597 (1.5922)
Test:  [ 0/50]  eta: 0:02:30  loss: 0.5283 (0.5283)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 3.0135  data: 2.7529  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7485 (0.7760)  acc1: 89.6000 (88.7636)  acc5: 98.8000 (98.3636)  time: 0.4879  data: 0.2507  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8264 (0.8149)  acc1: 86.4000 (87.7333)  acc5: 98.4000 (98.1714)  time: 0.2355  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9427 (0.9007)  acc1: 84.4000 (86.0645)  acc5: 96.8000 (97.4452)  time: 0.2356  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1061 (0.9532)  acc1: 80.8000 (84.6732)  acc5: 95.6000 (97.0537)  time: 0.2354  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0757 (0.9688)  acc1: 79.2000 (84.1120)  acc5: 96.0000 (96.9680)  time: 0.2353  data: 0.0002  max mem: 42519
Test: Total time: 0:00:14 (0.2945 s / it)
* Acc@1 84.502 Acc@5 97.006 loss 0.958
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.56%
Epoch: [294]  [   0/2502]  eta: 2:19:16  lr: 0.000006  min_lr: 0.000006  loss: 2.8067 (2.8067)  weight_decay: 0.0500 (0.0500)  time: 3.3399  data: 2.5273  max mem: 42519
Epoch: [294]  [ 200/2502]  eta: 0:18:58  lr: 0.000005  min_lr: 0.000005  loss: 2.9072 (2.6247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5957 (nan)  time: 0.4791  data: 0.0004  max mem: 42519
Epoch: [294]  [ 400/2502]  eta: 0:17:07  lr: 0.000005  min_lr: 0.000005  loss: 2.6060 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5234 (nan)  time: 0.4880  data: 0.0004  max mem: 42519
Epoch: [294]  [ 600/2502]  eta: 0:15:24  lr: 0.000005  min_lr: 0.000005  loss: 2.6655 (2.6470)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4737 (nan)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [294]  [ 800/2502]  eta: 0:13:45  lr: 0.000005  min_lr: 0.000005  loss: 2.6068 (2.6394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5651 (nan)  time: 0.4911  data: 0.0004  max mem: 42519
Epoch: [294]  [1000/2502]  eta: 0:12:07  lr: 0.000005  min_lr: 0.000005  loss: 2.6921 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4796 (nan)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [294]  [1200/2502]  eta: 0:10:29  lr: 0.000005  min_lr: 0.000005  loss: 2.7653 (2.6391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4789 (nan)  time: 0.4809  data: 0.0005  max mem: 42519
Epoch: [294]  [1400/2502]  eta: 0:08:52  lr: 0.000005  min_lr: 0.000005  loss: 2.6311 (2.6392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5248 (nan)  time: 0.4815  data: 0.0004  max mem: 42519
Epoch: [294]  [1600/2502]  eta: 0:07:15  lr: 0.000005  min_lr: 0.000005  loss: 2.6701 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4485 (nan)  time: 0.4799  data: 0.0005  max mem: 42519
Epoch: [294]  [1800/2502]  eta: 0:05:39  lr: 0.000005  min_lr: 0.000005  loss: 2.8025 (2.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5095 (nan)  time: 0.4825  data: 0.0004  max mem: 42519
Epoch: [294]  [2000/2502]  eta: 0:04:02  lr: 0.000004  min_lr: 0.000004  loss: 2.5829 (2.6419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5397 (nan)  time: 0.4787  data: 0.0003  max mem: 42519
Epoch: [294]  [2200/2502]  eta: 0:02:25  lr: 0.000004  min_lr: 0.000004  loss: 2.8272 (2.6397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5137 (nan)  time: 0.4786  data: 0.0004  max mem: 42519
Epoch: [294]  [2400/2502]  eta: 0:00:49  lr: 0.000004  min_lr: 0.000004  loss: 2.6463 (2.6376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5534 (nan)  time: 0.4782  data: 0.0005  max mem: 42519
Epoch: [294]  [2501/2502]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.6787 (2.6360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5538 (nan)  time: 0.4314  data: 0.0009  max mem: 42519
Epoch: [294] Total time: 0:20:06 (0.4823 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.6787 (2.6382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5538 (nan)
Test:  [ 0/50]  eta: 0:02:23  loss: 0.6110 (0.6110)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 2.8757  data: 2.5967  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.8498 (0.8702)  acc1: 89.2000 (88.8727)  acc5: 98.8000 (98.5091)  time: 0.4768  data: 0.2366  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8894 (0.9075)  acc1: 86.0000 (87.7714)  acc5: 98.4000 (98.3238)  time: 0.2370  data: 0.0005  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 1.0434 (0.9935)  acc1: 84.4000 (86.0387)  acc5: 97.2000 (97.5484)  time: 0.2370  data: 0.0004  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.2080 (1.0447)  acc1: 80.4000 (84.6732)  acc5: 96.0000 (97.1610)  time: 0.2368  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.1729 (1.0590)  acc1: 79.6000 (84.0160)  acc5: 96.0000 (97.0720)  time: 0.2366  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2932 s / it)
* Acc@1 84.480 Acc@5 97.012 loss 1.049
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.56%
Epoch: [295]  [   0/2502]  eta: 2:12:10  lr: 0.000004  min_lr: 0.000004  loss: 2.4362 (2.4362)  weight_decay: 0.0500 (0.0500)  time: 3.1698  data: 2.5289  max mem: 42519
Epoch: [295]  [ 200/2502]  eta: 0:19:02  lr: 0.000004  min_lr: 0.000004  loss: 2.6033 (2.6566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6537 (1.6317)  time: 0.4806  data: 0.0006  max mem: 42519
Epoch: [295]  [ 400/2502]  eta: 0:17:06  lr: 0.000004  min_lr: 0.000004  loss: 2.6671 (2.6446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5438 (1.6413)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [295]  [ 600/2502]  eta: 0:15:25  lr: 0.000004  min_lr: 0.000004  loss: 2.7474 (2.6494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5485 (1.6366)  time: 0.4817  data: 0.0004  max mem: 42519
Epoch: [295]  [ 800/2502]  eta: 0:13:46  lr: 0.000004  min_lr: 0.000004  loss: 2.6482 (2.6300)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5394 (1.6352)  time: 0.4885  data: 0.0006  max mem: 42519
Epoch: [295]  [1000/2502]  eta: 0:12:07  lr: 0.000004  min_lr: 0.000004  loss: 2.5396 (2.6181)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6023 (1.6301)  time: 0.4787  data: 0.0004  max mem: 42519
Epoch: [295]  [1200/2502]  eta: 0:10:29  lr: 0.000004  min_lr: 0.000004  loss: 2.7774 (2.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5739 (1.6226)  time: 0.4851  data: 0.0005  max mem: 42519
Epoch: [295]  [1400/2502]  eta: 0:08:52  lr: 0.000003  min_lr: 0.000003  loss: 2.6171 (2.6327)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5199 (1.6180)  time: 0.4787  data: 0.0005  max mem: 42519
Epoch: [295]  [1600/2502]  eta: 0:07:15  lr: 0.000003  min_lr: 0.000003  loss: 2.6023 (2.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4948 (1.6110)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [295]  [1800/2502]  eta: 0:05:38  lr: 0.000003  min_lr: 0.000003  loss: 2.5068 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5666 (1.6152)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [295]  [2000/2502]  eta: 0:04:02  lr: 0.000003  min_lr: 0.000003  loss: 2.6964 (2.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4698 (1.6136)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [295]  [2200/2502]  eta: 0:02:25  lr: 0.000003  min_lr: 0.000003  loss: 2.7876 (2.6450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5078 (1.6151)  time: 0.4857  data: 0.0006  max mem: 42519
Epoch: [295]  [2400/2502]  eta: 0:00:49  lr: 0.000003  min_lr: 0.000003  loss: 2.5631 (2.6449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4873 (1.6150)  time: 0.4775  data: 0.0004  max mem: 42519
Epoch: [295]  [2501/2502]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.8031 (2.6440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4899 (1.6125)  time: 0.4314  data: 0.0007  max mem: 42519
Epoch: [295] Total time: 0:20:05 (0.4817 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.8031 (2.6431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4899 (1.6125)
Test:  [ 0/50]  eta: 0:02:14  loss: 0.5032 (0.5032)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 2.6952  data: 2.4243  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.7242 (0.7587)  acc1: 89.2000 (88.8364)  acc5: 98.8000 (98.3636)  time: 0.5308  data: 0.2824  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7921 (0.7953)  acc1: 86.0000 (87.7905)  acc5: 98.4000 (98.1905)  time: 0.2777  data: 0.0344  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9318 (0.8799)  acc1: 84.4000 (86.0903)  acc5: 96.8000 (97.4452)  time: 0.2387  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0747 (0.9333)  acc1: 80.8000 (84.7122)  acc5: 95.6000 (97.0537)  time: 0.2357  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0627 (0.9487)  acc1: 80.4000 (84.0800)  acc5: 96.0000 (96.9520)  time: 0.2349  data: 0.0001  max mem: 42519
Test: Total time: 0:00:15 (0.3031 s / it)
* Acc@1 84.542 Acc@5 96.996 loss 0.938
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.56%
Epoch: [296]  [   0/2502]  eta: 1:55:51  lr: 0.000003  min_lr: 0.000003  loss: 2.7685 (2.7685)  weight_decay: 0.0500 (0.0500)  time: 2.7782  data: 2.0903  max mem: 42519
Epoch: [296]  [ 200/2502]  eta: 0:18:53  lr: 0.000003  min_lr: 0.000003  loss: 2.8990 (2.6714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4588 (1.5809)  time: 0.4810  data: 0.0005  max mem: 42519
Epoch: [296]  [ 400/2502]  eta: 0:17:03  lr: 0.000003  min_lr: 0.000003  loss: 2.7075 (2.6538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5703 (1.5819)  time: 0.4784  data: 0.0006  max mem: 42519
Epoch: [296]  [ 600/2502]  eta: 0:15:21  lr: 0.000003  min_lr: 0.000003  loss: 2.7911 (2.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5207 (1.5814)  time: 0.4803  data: 0.0005  max mem: 42519
Epoch: [296]  [ 800/2502]  eta: 0:13:42  lr: 0.000003  min_lr: 0.000003  loss: 2.6901 (2.6443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6286 (1.6020)  time: 0.4833  data: 0.0005  max mem: 42519
Epoch: [296]  [1000/2502]  eta: 0:12:04  lr: 0.000003  min_lr: 0.000003  loss: 2.8674 (2.6516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5672 (1.6081)  time: 0.4780  data: 0.0006  max mem: 42519
Epoch: [296]  [1200/2502]  eta: 0:10:27  lr: 0.000003  min_lr: 0.000003  loss: 2.2317 (2.6438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5178 (1.6057)  time: 0.4792  data: 0.0006  max mem: 42519
Epoch: [296]  [1400/2502]  eta: 0:08:50  lr: 0.000002  min_lr: 0.000002  loss: 2.8134 (2.6441)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5810 (1.6019)  time: 0.4820  data: 0.0005  max mem: 42519
Epoch: [296]  [1600/2502]  eta: 0:07:14  lr: 0.000002  min_lr: 0.000002  loss: 2.6123 (2.6426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6313 (1.6076)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [296]  [1800/2502]  eta: 0:05:38  lr: 0.000002  min_lr: 0.000002  loss: 2.3717 (2.6418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5692 (1.6104)  time: 0.4790  data: 0.0005  max mem: 42519
Epoch: [296]  [2000/2502]  eta: 0:04:01  lr: 0.000002  min_lr: 0.000002  loss: 2.7396 (2.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5323 (1.6081)  time: 0.4804  data: 0.0005  max mem: 42519
Epoch: [296]  [2200/2502]  eta: 0:02:25  lr: 0.000002  min_lr: 0.000002  loss: 2.6509 (2.6409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4984 (1.6011)  time: 0.4781  data: 0.0005  max mem: 42519
Epoch: [296]  [2400/2502]  eta: 0:00:49  lr: 0.000002  min_lr: 0.000002  loss: 2.6959 (2.6443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5020 (1.6002)  time: 0.4802  data: 0.0005  max mem: 42519
Epoch: [296]  [2501/2502]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.4952 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4505 (1.5966)  time: 0.4311  data: 0.0008  max mem: 42519
Epoch: [296] Total time: 0:20:03 (0.4811 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.4952 (2.6427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4505 (1.5966)
Test:  [ 0/50]  eta: 0:02:20  loss: 0.5086 (0.5086)  acc1: 95.6000 (95.6000)  acc5: 99.2000 (99.2000)  time: 2.8070  data: 2.5293  max mem: 42519
Test:  [10/50]  eta: 0:00:21  loss: 0.7353 (0.7618)  acc1: 89.6000 (88.9455)  acc5: 98.8000 (98.2909)  time: 0.5290  data: 0.2882  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.8069 (0.7994)  acc1: 86.0000 (87.7905)  acc5: 98.4000 (98.1524)  time: 0.2726  data: 0.0323  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9187 (0.8841)  acc1: 84.4000 (86.0645)  acc5: 97.2000 (97.4323)  time: 0.2418  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1044 (0.9353)  acc1: 80.8000 (84.7024)  acc5: 95.6000 (97.0634)  time: 0.2385  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0573 (0.9505)  acc1: 80.0000 (84.0880)  acc5: 95.6000 (96.9840)  time: 0.2374  data: 0.0001  max mem: 42519
Test: Total time: 0:00:15 (0.3052 s / it)
* Acc@1 84.552 Acc@5 97.002 loss 0.940
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.56%
Epoch: [297]  [   0/2502]  eta: 1:27:15  lr: 0.000002  min_lr: 0.000002  loss: 2.0246 (2.0246)  weight_decay: 0.0500 (0.0500)  time: 2.0927  data: 1.5748  max mem: 42519
Epoch: [297]  [ 200/2502]  eta: 0:18:48  lr: 0.000002  min_lr: 0.000002  loss: 2.7422 (2.6211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4106 (1.5507)  time: 0.4805  data: 0.0004  max mem: 42519
Epoch: [297]  [ 400/2502]  eta: 0:17:01  lr: 0.000002  min_lr: 0.000002  loss: 2.5822 (2.6477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4917 (1.5714)  time: 0.4818  data: 0.0005  max mem: 42519
Epoch: [297]  [ 600/2502]  eta: 0:15:21  lr: 0.000002  min_lr: 0.000002  loss: 2.7624 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5370 (1.5894)  time: 0.4798  data: 0.0011  max mem: 42519
Epoch: [297]  [ 800/2502]  eta: 0:13:44  lr: 0.000002  min_lr: 0.000002  loss: 2.7083 (2.6420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5935 (1.5883)  time: 0.4928  data: 0.0004  max mem: 42519
Epoch: [297]  [1000/2502]  eta: 0:12:06  lr: 0.000002  min_lr: 0.000002  loss: 2.7046 (2.6433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5976 (1.5905)  time: 0.4823  data: 0.0004  max mem: 42519
Epoch: [297]  [1200/2502]  eta: 0:10:29  lr: 0.000002  min_lr: 0.000002  loss: 2.7755 (2.6471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5944 (1.5935)  time: 0.4796  data: 0.0004  max mem: 42519
Epoch: [297]  [1400/2502]  eta: 0:08:52  lr: 0.000002  min_lr: 0.000002  loss: 2.7059 (2.6483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5542 (1.6004)  time: 0.4798  data: 0.0004  max mem: 42519
Epoch: [297]  [1600/2502]  eta: 0:07:15  lr: 0.000002  min_lr: 0.000002  loss: 2.7378 (2.6471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5148 (1.6003)  time: 0.4782  data: 0.0004  max mem: 42519
Epoch: [297]  [1800/2502]  eta: 0:05:38  lr: 0.000002  min_lr: 0.000002  loss: 2.4419 (2.6440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4141 (1.5915)  time: 0.4781  data: 0.0004  max mem: 42519
Epoch: [297]  [2000/2502]  eta: 0:04:02  lr: 0.000002  min_lr: 0.000002  loss: 2.6639 (2.6423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4589 (1.5907)  time: 0.4794  data: 0.0004  max mem: 42519
Epoch: [297]  [2200/2502]  eta: 0:02:25  lr: 0.000002  min_lr: 0.000002  loss: 2.9320 (2.6462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5206 (nan)  time: 0.4859  data: 0.0005  max mem: 42519
Epoch: [297]  [2400/2502]  eta: 0:00:49  lr: 0.000002  min_lr: 0.000002  loss: 2.4167 (2.6434)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5614 (nan)  time: 0.4788  data: 0.0004  max mem: 42519
Epoch: [297]  [2501/2502]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.8328 (2.6422)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4984 (nan)  time: 0.4315  data: 0.0009  max mem: 42519
Epoch: [297] Total time: 0:20:06 (0.4823 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.8328 (2.6438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4984 (nan)
Test:  [ 0/50]  eta: 0:02:19  loss: 0.5248 (0.5248)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 2.7993  data: 2.5171  max mem: 42519
Test:  [10/50]  eta: 0:00:19  loss: 0.7634 (0.7815)  acc1: 88.4000 (88.9091)  acc5: 98.8000 (98.4364)  time: 0.4767  data: 0.2354  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.8298 (0.8216)  acc1: 86.0000 (87.7524)  acc5: 98.4000 (98.2286)  time: 0.2413  data: 0.0039  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9506 (0.9039)  acc1: 84.8000 (86.1161)  acc5: 97.2000 (97.4968)  time: 0.2382  data: 0.0006  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.1133 (0.9550)  acc1: 80.8000 (84.7512)  acc5: 95.6000 (97.1317)  time: 0.2377  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0855 (0.9691)  acc1: 80.4000 (84.1520)  acc5: 96.0000 (97.0000)  time: 0.2374  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2925 s / it)
* Acc@1 84.530 Acc@5 97.022 loss 0.958
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.56%
Epoch: [298]  [   0/2502]  eta: 1:27:27  lr: 0.000002  min_lr: 0.000002  loss: 2.9274 (2.9274)  weight_decay: 0.0500 (0.0500)  time: 2.0975  data: 1.6065  max mem: 42519
Epoch: [298]  [ 200/2502]  eta: 0:18:46  lr: 0.000001  min_lr: 0.000001  loss: 2.5523 (2.5960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5164 (1.5597)  time: 0.4790  data: 0.0006  max mem: 42519
Epoch: [298]  [ 400/2502]  eta: 0:17:01  lr: 0.000001  min_lr: 0.000001  loss: 2.9149 (2.6283)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5624 (1.5718)  time: 0.4825  data: 0.0004  max mem: 42519
Epoch: [298]  [ 600/2502]  eta: 0:15:20  lr: 0.000001  min_lr: 0.000001  loss: 2.8063 (2.6352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5263 (1.5891)  time: 0.4785  data: 0.0004  max mem: 42519
Epoch: [298]  [ 800/2502]  eta: 0:13:42  lr: 0.000001  min_lr: 0.000001  loss: 2.6805 (2.6433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6033 (1.6121)  time: 0.4842  data: 0.0004  max mem: 42519
Epoch: [298]  [1000/2502]  eta: 0:12:04  lr: 0.000001  min_lr: 0.000001  loss: 2.6699 (2.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5763 (1.6077)  time: 0.4833  data: 0.0004  max mem: 42519
Epoch: [298]  [1200/2502]  eta: 0:10:27  lr: 0.000001  min_lr: 0.000001  loss: 2.7761 (2.6312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5306 (1.6043)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [298]  [1400/2502]  eta: 0:08:51  lr: 0.000001  min_lr: 0.000001  loss: 2.6900 (2.6335)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5843 (1.5998)  time: 0.4805  data: 0.0005  max mem: 42519
Epoch: [298]  [1600/2502]  eta: 0:07:14  lr: 0.000001  min_lr: 0.000001  loss: 2.8032 (2.6278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4763 (1.5968)  time: 0.4784  data: 0.0005  max mem: 42519
Epoch: [298]  [1800/2502]  eta: 0:05:38  lr: 0.000001  min_lr: 0.000001  loss: 2.8794 (2.6290)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5191 (1.5998)  time: 0.4794  data: 0.0006  max mem: 42519
Epoch: [298]  [2000/2502]  eta: 0:04:01  lr: 0.000001  min_lr: 0.000001  loss: 2.8542 (2.6273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5102 (1.5956)  time: 0.4788  data: 0.0005  max mem: 42519
Epoch: [298]  [2200/2502]  eta: 0:02:25  lr: 0.000001  min_lr: 0.000001  loss: 2.6103 (2.6267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5492 (inf)  time: 0.4876  data: 0.0006  max mem: 42519
Epoch: [298]  [2400/2502]  eta: 0:00:49  lr: 0.000001  min_lr: 0.000001  loss: 2.7724 (2.6256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4759 (inf)  time: 0.4830  data: 0.0005  max mem: 42519
Epoch: [298]  [2501/2502]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.4808 (2.6253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5465 (inf)  time: 0.4318  data: 0.0009  max mem: 42519
Epoch: [298] Total time: 0:20:04 (0.4813 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.4808 (2.6371)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5465 (inf)
Test:  [ 0/50]  eta: 0:02:15  loss: 0.5046 (0.5046)  acc1: 95.6000 (95.6000)  acc5: 99.6000 (99.6000)  time: 2.7047  data: 2.4418  max mem: 42519
Test:  [10/50]  eta: 0:00:18  loss: 0.7352 (0.7615)  acc1: 88.8000 (88.8727)  acc5: 98.8000 (98.3636)  time: 0.4667  data: 0.2270  max mem: 42519
Test:  [20/50]  eta: 0:00:10  loss: 0.7994 (0.7997)  acc1: 86.0000 (87.7524)  acc5: 98.0000 (98.1524)  time: 0.2405  data: 0.0030  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.9327 (0.8835)  acc1: 84.4000 (86.0645)  acc5: 96.8000 (97.3936)  time: 0.2380  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:02  loss: 1.0863 (0.9345)  acc1: 80.8000 (84.7024)  acc5: 95.6000 (97.0146)  time: 0.2377  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0598 (0.9496)  acc1: 80.0000 (84.0880)  acc5: 96.0000 (96.9200)  time: 0.2374  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2899 s / it)
* Acc@1 84.554 Acc@5 96.964 loss 0.938
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.56%
Epoch: [299]  [   0/2502]  eta: 1:26:06  lr: 0.000001  min_lr: 0.000001  loss: 2.7969 (2.7969)  weight_decay: 0.0500 (0.0500)  time: 2.0649  data: 1.3875  max mem: 42519
Epoch: [299]  [ 200/2502]  eta: 0:18:49  lr: 0.000001  min_lr: 0.000001  loss: 2.8909 (2.6411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5425 (1.6238)  time: 0.4783  data: 0.0004  max mem: 42519
Epoch: [299]  [ 400/2502]  eta: 0:17:02  lr: 0.000001  min_lr: 0.000001  loss: 2.4122 (2.6307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6743 (1.6366)  time: 0.4891  data: 0.0004  max mem: 42519
Epoch: [299]  [ 600/2502]  eta: 0:15:20  lr: 0.000001  min_lr: 0.000001  loss: 2.8710 (2.6288)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5856 (1.6279)  time: 0.4795  data: 0.0004  max mem: 42519
Epoch: [299]  [ 800/2502]  eta: 0:13:41  lr: 0.000001  min_lr: 0.000001  loss: 2.5813 (2.6445)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4783  data: 0.0005  max mem: 42519
Epoch: [299]  [1000/2502]  eta: 0:12:04  lr: 0.000001  min_lr: 0.000001  loss: 2.7861 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5047 (nan)  time: 0.4775  data: 0.0004  max mem: 42519
Epoch: [299]  [1200/2502]  eta: 0:10:28  lr: 0.000001  min_lr: 0.000001  loss: 2.7812 (2.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5271 (nan)  time: 0.4821  data: 0.0004  max mem: 42519
Epoch: [299]  [1400/2502]  eta: 0:08:51  lr: 0.000001  min_lr: 0.000001  loss: 2.7526 (2.6429)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5397 (nan)  time: 0.4792  data: 0.0004  max mem: 42519
Epoch: [299]  [1600/2502]  eta: 0:07:15  lr: 0.000001  min_lr: 0.000001  loss: 2.7164 (2.6456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4771 (nan)  time: 0.4803  data: 0.0004  max mem: 42519
Epoch: [299]  [1800/2502]  eta: 0:05:38  lr: 0.000001  min_lr: 0.000001  loss: 2.7455 (2.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4922 (nan)  time: 0.4823  data: 0.0005  max mem: 42519
Epoch: [299]  [2000/2502]  eta: 0:04:02  lr: 0.000001  min_lr: 0.000001  loss: 2.6585 (2.6430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5139 (nan)  time: 0.4789  data: 0.0004  max mem: 42519
Epoch: [299]  [2200/2502]  eta: 0:02:25  lr: 0.000001  min_lr: 0.000001  loss: 2.6766 (2.6392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4719 (nan)  time: 0.4811  data: 0.0004  max mem: 42519
Epoch: [299]  [2400/2502]  eta: 0:00:49  lr: 0.000001  min_lr: 0.000001  loss: 2.6718 (2.6386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4800 (nan)  time: 0.4784  data: 0.0004  max mem: 42519
Epoch: [299]  [2501/2502]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.6649 (2.6400)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5472 (nan)  time: 0.4311  data: 0.0009  max mem: 42519
Epoch: [299] Total time: 0:20:05 (0.4818 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.6649 (2.6442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5472 (nan)
Test:  [ 0/50]  eta: 0:02:42  loss: 0.4627 (0.4627)  acc1: 96.0000 (96.0000)  acc5: 99.6000 (99.6000)  time: 3.2421  data: 2.9614  max mem: 42519
Test:  [10/50]  eta: 0:00:20  loss: 0.6881 (0.7182)  acc1: 89.2000 (88.9818)  acc5: 98.8000 (98.3636)  time: 0.5107  data: 0.2696  max mem: 42519
Test:  [20/50]  eta: 0:00:11  loss: 0.7663 (0.7533)  acc1: 86.0000 (87.8476)  acc5: 98.4000 (98.1524)  time: 0.2378  data: 0.0004  max mem: 42519
Test:  [30/50]  eta: 0:00:06  loss: 0.8773 (0.8374)  acc1: 84.8000 (86.1548)  acc5: 97.2000 (97.4452)  time: 0.2380  data: 0.0005  max mem: 42519
Test:  [40/50]  eta: 0:00:03  loss: 1.0685 (0.8889)  acc1: 80.8000 (84.7317)  acc5: 95.6000 (97.0146)  time: 0.2377  data: 0.0003  max mem: 42519
Test:  [49/50]  eta: 0:00:00  loss: 1.0244 (0.9034)  acc1: 79.6000 (84.1280)  acc5: 96.0000 (96.9360)  time: 0.2374  data: 0.0001  max mem: 42519
Test: Total time: 0:00:14 (0.2998 s / it)
* Acc@1 84.566 Acc@5 96.992 loss 0.893
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.57%
Training time 1 day, 22:55:00
