_base_ = [
    '../_base_/models/vit-base-p16.py',
    '../_base_/datasets/imagenet_bs64_pil_resize_autoaug.py',
    '../_base_/default_runtime.py'
]

# specific to vit pretrain
paramwise_cfg = dict(custom_keys={
    '.cls_token': dict(decay_mult=0.0),
    '.pos_embed': dict(decay_mult=0.0)
})

pretrained = 'https://download.openmmlab.com/mmclassification/v0/vit/pretrain/vit-base-p16_3rdparty_pt-64xb64_in1k-224_20210928-02284250.pth'  # noqa

model = dict(
    head=dict(
        loss=dict(type='CrossEntropyLoss', loss_weight=1.0, _delete_=True), ),
    backbone=dict(
        img_size=224,
        init_cfg=dict(
            type='Pretrained',
            checkpoint=pretrained,
            _delete_=True,
            prefix='backbone')))

img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='RandomResizedCrop', scale=224, backend='pillow'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='ToHalf', keys=['img']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(224, -1), keep_ratio=True, backend='pillow'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToHalf', keys=['img']),
    dict(type='Collect', keys=['img'])
]

# change batch size
data = dict(
    samples_per_gpu=17,
    workers_per_gpu=16,
    drop_last=True,
    train=dict(pipeline=train_pipeline),
    train_dataloader=dict(mode='async'),
    val=dict(pipeline=test_pipeline, ),
    val_dataloader=dict(samples_per_gpu=4, workers_per_gpu=1),
    test=dict(pipeline=test_pipeline),
    test_dataloader=dict(samples_per_gpu=4, workers_per_gpu=1))

# optimizer
optimizer = dict(
    type='SGD',
    lr=0.08,
    weight_decay=1e-5,
    momentum=0.9,
    paramwise_cfg=paramwise_cfg,
)

# learning policy
param_scheduler = [
    dict(type='LinearLR', start_factor=0.02, by_epoch=False, begin=0, end=800),
    dict(
        type='CosineAnnealingLR',
        T_max=4200,
        by_epoch=False,
        begin=800,
        end=5000)
]

# ipu cfg
# model partition config
ipu_model_cfg = dict(
    train_split_edges=[
        dict(layer_to_call='backbone.patch_embed', ipu_id=0),
        dict(layer_to_call='backbone.layers.3', ipu_id=1),
        dict(layer_to_call='backbone.layers.6', ipu_id=2),
        dict(layer_to_call='backbone.layers.9', ipu_id=3)
    ],
    train_ckpt_nodes=['backbone.layers.{}'.format(i) for i in range(12)])

# device config
options_cfg = dict(
    randomSeed=42,
    partialsType='half',
    train_cfg=dict(
        executionStrategy='SameAsIpu',
        Training=dict(gradientAccumulation=32),
        availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
    ),
    eval_cfg=dict(deviceIterations=1, ),
)

# add model partition config and device config to runner
runner = dict(
    type='IterBasedRunner',
    ipu_model_cfg=ipu_model_cfg,
    options_cfg=options_cfg,
    max_iters=5000)

default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1000))

fp16 = dict(loss_scale=256.0, velocity_accum_type='half', accum_type='half')
