# Collection-level metadata. Model entries in this file attach themselves to
# this collection through `In Collection: Vision Transformer`, so the `Name`
# below must stay in sync with those references.
# NOTE(review): layout looks like a model-index style metafile - confirm
# against the consuming tool before adding or renaming keys.
Collections:
  - Name: Vision Transformer
    Metadata:
      # Building blocks listed for this architecture.
      Architecture:
        - Attention Dropout
        - Convolution
        - Dense Connections
        - Dropout
        - GELU
        - Layer Normalization
        - Multi-Head Attention
        - Scaled Dot-Product Attention
        - Tanh Activation
    Paper:
      # Single-quoted scalar wrapped over two lines; YAML folds the line break
      # into one space, so the parsed value is a single continuous title.
      Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at
        Scale'
      URL: https://arxiv.org/abs/2010.11929
    # Relative path; presumably resolved from the repository root - confirm.
    README: configs/vision_transformer/README.md
    Code:
      # The link is pinned to the v0.17.0 tag, matching `Version` below.
      URL: https://github.com/open-mmlab/mmpretrain/blob/v0.17.0/mmcls/models/backbones/vision_transformer.py
      Version: v0.17.0

# One list entry per published checkpoint. Every entry carries its own
# metadata, reported results, weights URL and config path.
Models:
  # Entry for the `vit-base-p32 ... 384px` checkpoint.
  - Name: vit-base-p32_in21k-pre_3rdparty_in1k-384px
    Metadata:
      # Plain integer counts.
      # NOTE(review): FLOPs presumably measured at the 384px input - confirm.
      FLOPs: 13056716544
      Parameters: 88297192
      # Two datasets are listed; the `in21k-pre ... in1k` naming suggests
      # ImageNet-21k pre-training followed by ImageNet-1k - TODO confirm.
      Training Data:
        - ImageNet-21k
        - ImageNet-1k
    # Must match the `Name` of an entry under `Collections`.
    In Collection: Vision Transformer
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        # Presumably percentages on the ImageNet-1k validation set - confirm.
        Metrics:
          Top 1 Accuracy: 84.01
          Top 5 Accuracy: 97.08
    # Checkpoint download URL and the config file for this entry.
    Weights: https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth
    Config: configs/vision_transformer/vit-base-p32_64xb64_in1k-384px.py
    # Provenance of the third-party weights: the upstream `.npz` checkpoint
    # and the upstream model code, pinned to a commit hash and line anchor.
    Converted From:
      Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz
      Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
  # Entry for the `vit-base-p16 ... mae_in1k` checkpoint. Unlike the
  # `3rdparty` entries in this file it has no `Converted From` block and
  # lists a single training dataset.
  # NOTE(review): `mae` / `pt` in the name and weights filename presumably
  # refer to MAE pre-training - confirm against the referenced config.
  - Name: vit-base-p16_32xb128-mae_in1k
    Metadata:
      # Plain integer counts.
      # NOTE(review): input resolution used for FLOPs is not stated - confirm.
      FLOPs: 17581972224
      Parameters: 86567656
      Training Data:
        - ImageNet-1k
    # Must match the `Name` of an entry under `Collections`.
    In Collection: Vision Transformer
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        # Presumably percentages on the ImageNet-1k validation set - confirm.
        Metrics:
          Top 1 Accuracy: 82.37
          Top 5 Accuracy: 96.15
    # Checkpoint download URL and the config file for this entry.
    Weights: https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth
    Config: configs/vision_transformer/vit-base-p16_32xb128-mae_in1k.py
  # Entry for the `vit-base-p16 ... 384px` checkpoint.
  - Name: vit-base-p16_in21k-pre_3rdparty_in1k-384px
    Metadata:
      # Plain integer counts.
      # NOTE(review): FLOPs presumably measured at the 384px input - confirm.
      FLOPs: 55538974464
      Parameters: 86859496
      # Two datasets are listed; the `in21k-pre ... in1k` naming suggests
      # ImageNet-21k pre-training followed by ImageNet-1k - TODO confirm.
      Training Data:
        - ImageNet-21k
        - ImageNet-1k
    # Must match the `Name` of an entry under `Collections`.
    In Collection: Vision Transformer
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        # Presumably percentages on the ImageNet-1k validation set - confirm.
        Metrics:
          Top 1 Accuracy: 85.43
          Top 5 Accuracy: 97.77
    # Checkpoint download URL and the config file for this entry.
    Weights: https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth
    Config: configs/vision_transformer/vit-base-p16_64xb64_in1k-384px.py
    # Provenance of the third-party weights: the upstream `.npz` checkpoint
    # and the upstream model code, pinned to a commit hash and line anchor.
    Converted From:
      Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz
      Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
  # Entry for the `vit-large-p16 ... 384px` checkpoint.
  - Name: vit-large-p16_in21k-pre_3rdparty_in1k-384px
    Metadata:
      # Plain integer counts.
      # NOTE(review): FLOPs presumably measured at the 384px input - confirm.
      FLOPs: 191210034176
      Parameters: 304715752
      # Two datasets are listed; the `in21k-pre ... in1k` naming suggests
      # ImageNet-21k pre-training followed by ImageNet-1k - TODO confirm.
      Training Data:
        - ImageNet-21k
        - ImageNet-1k
    # Must match the `Name` of an entry under `Collections`.
    In Collection: Vision Transformer
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        # Presumably percentages on the ImageNet-1k validation set - confirm.
        Metrics:
          Top 1 Accuracy: 85.63
          Top 5 Accuracy: 97.63
    # Checkpoint download URL and the config file for this entry.
    Weights: https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth
    Config: configs/vision_transformer/vit-large-p16_64xb64_in1k-384px.py
    # Provenance of the third-party weights: the upstream `.npz` checkpoint
    # and the upstream model code, pinned to a commit hash and line anchor.
    Converted From:
      Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz
      Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
