# Image classification reference training scripts


###### ResNet-50 ###############################

### SGD ###

```
# Values passed to --lr / --wd below and embedded in the log suffix.
lr=0.3
wd=1e-4

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9900 train.py \
    --model resnet50 --epochs 100 --batch-size 128 --accum_steps 1 --opt sgd --lr "${lr}" --wd "${wd}" --print-freq 100 \
    --output-dir ./outputs --data-path /the/path/to/dataset/ \
    --log-suffix "sgd_lr_${lr}_wd_${wd}" \
    --tb-vis
```

### AdamW ###

```
# Values passed to --lr / --wd / --epochs below and embedded in the log suffix.
lr=0.003
wd=0.1
epoch=150

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9900 train.py \
    --model resnet50 --epochs "${epoch}" --batch-size 128 --accum_steps 1 --opt adamw --lr "${lr}" --wd "${wd}" --print-freq 100 \
    --lr-warmup-method linear --lr-warmup-epochs 30 --lr-scheduler cosineannealinglr \
    --output-dir ./outputs --data-path /the/path/to/dataset/ \
    --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra --ra-sampler --cutmix-alpha 0.2 \
    --log-suffix "adamw_lr_${lr}_wd_${wd}_warmup_30_epoch_${epoch}" \
    --tb-vis
```

### Adan ####

```
# Values passed to --lr / --wd / --epochs below; lr and wd also appear in the log suffix.
lr=0.003
wd=0.1
epoch=150

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9900 train.py \
    --model resnet50 --epochs "${epoch}" --batch-size 128 --accum_steps 1 --opt adan --lr "${lr}" --wd "${wd}" --print-freq 100 \
    --lr-warmup-method linear --lr-warmup-epochs 20 --lr-scheduler cosineannealinglr \
    --output-dir ./outputs --data-path /the/path/to/dataset/ \
    --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra --ra-sampler --cutmix-alpha 0.2 \
    --log-suffix "adan_lr_${lr}_wd_${wd}" \
    --tb-vis
```

### Lion ####


```
# Values passed to --lr / --wd / --epochs below; lr and wd also appear in the log suffix.
lr=0.001
wd=0.3
epoch=150

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9900 train.py \
    --model resnet50 --epochs "${epoch}" --batch-size 128 --accum_steps 1 --opt lion --lr "${lr}" --wd "${wd}" --print-freq 100 \
    --lr-warmup-method linear --lr-warmup-epochs 20 --lr-scheduler cosineannealinglr \
    --output-dir ./outputs --data-path /the/path/to/datasets/ \
    --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra --ra-sampler --cutmix-alpha 0.2 \
    --log-suffix "lion_lr_${lr}_wd_${wd}" \
    --tb-vis
```

### SoftSignSGD ####

```
# Values passed to --lr / --wd / --momentum / --sss_power below and embedded in the log suffix.
lr=0.006
wd=0.005
mom=0.95
pow=3.0

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9900 train.py \
    --model resnet50 --epochs 150 --batch-size 128 --accum_steps 1 --opt softsignsgd --lr "${lr}" --wd "${wd}" --momentum "${mom}" --sss_power "${pow}" --print-freq 100 \
    --output-dir ./outputs --data-path /the/path/to/datasets/ \
    --lr-warmup-method linear --lr-warmup-epochs 20 --lr-scheduler cosineannealinglr \
    --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra --ra-sampler --cutmix-alpha 0.2 \
    --log-suffix "softsignsgd_lr_${lr}_wd_${wd}_mom_${mom}_pow_${pow}" \
    --tb-vis
```

###### vit_b_16 #################################

### AdamW ####

```
# Values passed to --lr / --wd / --lr-warmup-epochs / --epochs below;
# lr, wd and epoch also appear in the log suffix.
lr=0.003
wd=0.3
wm_epoch=30
epoch=150

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# The last line carries no trailing backslash, so nothing that follows
# the snippet can be appended to the command by accident.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=8907 train.py \
    --model vit_b_16 --epochs "${epoch}" --batch-size 16 --accum_steps 2 --opt adamw --lr "${lr}" --wd "${wd}" --print-freq 50 \
    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs "${wm_epoch}" \
    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra \
    --ra-sampler --cutmix-alpha 1.0 \
    --output-dir ./outputs --data-path /the/path/to/datasets/ \
    --log-suffix "adamW_lr_${lr}_wd_${wd}_epoch_${epoch}" \
    --tb-vis
```

### Adan ####
```
# Values passed to --lr / --wd / --lr-warmup-epochs below and embedded in the log suffix.
lr=0.006
wd=0.15
wm_epoch=30

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# The log suffix only references variables defined above; no momentum
# value is set or passed for this run, so it is not part of the suffix.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9907 train.py \
    --model vit_b_16 --epochs 150 --batch-size 256 --accum_steps 2 --opt adan --lr "${lr}" --wd "${wd}" --print-freq 50 \
    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs "${wm_epoch}" \
    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra \
    --ra-sampler --cutmix-alpha 1.0 \
    --output-dir ./outputs --data-path /the/path/to/datasets \
    --log-suffix "adan_lr_${lr}_wd_${wd}_wm_epoch_${wm_epoch}" \
    --tb-vis
```

### Lion ####
```
# Values passed to --lr / --wd / --lr-warmup-epochs below and embedded in the log suffix.
lr=0.001
wd=0.09
wm_epoch=30

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=8907 train.py \
    --model vit_b_16 --epochs 150 --batch-size 256 --accum_steps 2 --opt lion --lr "${lr}" --wd "${wd}" --print-freq 50 \
    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs "${wm_epoch}" \
    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra \
    --ra-sampler --cutmix-alpha 1.0 \
    --output-dir ./outputs --data-path /the/path/to/datasets/ \
    --log-suffix "lion_lr_${lr}_wd_${wd}_wm_epoch_${wm_epoch}" \
    --tb-vis \
    --use-deterministic-algorithms
```

### SoftSignSGD ####

```
# Values passed to --lr / --wd / --momentum / --sss_power / --lr-warmup-epochs
# below and embedded in the log suffix.
lr=0.006
wd=0.15
mom=0.95
pow=3.0
wm_epoch=30

# Launch 8 processes through torchrun with GPUs 0-7 visible.
# Every line of the command except the last must end in a backslash;
# otherwise the remaining flags are run as a separate (failing) command.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 --master_port=9907 train.py \
    --model vit_b_16 --epochs 150 --batch-size 256 --accum_steps 2 --opt softsignsgd --lr "${lr}" --wd "${wd}" --momentum "${mom}" --sss_power "${pow}" --print-freq 50 \
    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs "${wm_epoch}" \
    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra \
    --ra-sampler --cutmix-alpha 1.0 \
    --output-dir ./outputs --data-path /the/path/to/datasets/ \
    --log-suffix "softsignsgd_lr_${lr}_wd_${wd}_momentum_${mom}_power_${pow}_wm_${wm_epoch}" \
    --tb-vis
```





Note that each of the above commands corresponds to training on a single node with 8 GPUs.
