{
    "vit_small_patch16_224_cls": {
        "model_name": "vit_small_patch16_224.augreg_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "cls_token"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 384,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 22050664,
        "size_fmt": "22.1M",
        "size_class": "small",
        "set_length": 13
    },
    "vit_base_patch16_224_cls": {
        "model_name": "vit_base_patch16_224.augreg2_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "cls_token"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 86567656,
        "size_fmt": "86.6M",
        "size_class": "small",
        "set_length": 13
    },
    "vit_large_patch16_224_cls": {
        "model_name": "vit_large_patch16_224.augreg_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "cls_token"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "blocks.12.norm2",
            "blocks.13.norm2",
            "blocks.14.norm2",
            "blocks.15.norm2",
            "blocks.16.norm2",
            "blocks.17.norm2",
            "blocks.18.norm2",
            "blocks.19.norm2",
            "blocks.20.norm2",
            "blocks.21.norm2",
            "blocks.22.norm2",
            "blocks.23.norm2",
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 304326632,
        "size_fmt": "304.3M",
        "size_class": "large",
        "set_length": 25
    },
    "vit_small_patch16_224_ap": {
        "model_name": "vit_small_patch16_224.augreg_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 384,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 22050664,
        "size_fmt": "22.1M",
        "size_class": "small",
        "set_length": 13
    },
    "vit_base_patch16_224_ap": {
        "model_name": "vit_base_patch16_224.augreg2_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 86567656,
        "size_fmt": "86.6M",
        "size_class": "small",
        "set_length": 13
    },
    "vit_large_patch16_224_ap": {
        "model_name": "vit_large_patch16_224.augreg_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "blocks.12.norm2",
            "blocks.13.norm2",
            "blocks.14.norm2",
            "blocks.15.norm2",
            "blocks.16.norm2",
            "blocks.17.norm2",
            "blocks.18.norm2",
            "blocks.19.norm2",
            "blocks.20.norm2",
            "blocks.21.norm2",
            "blocks.22.norm2",
            "blocks.23.norm2",
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 304326632,
        "size_fmt": "304.3M",
        "size_class": "large",
        "set_length": 25
    },
    "vit_small_patch16_224_at": {
        "model_name": "vit_small_patch16_224.augreg_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 384,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 22050664,
        "size_fmt": "22.1M",
        "size_class": "small",
        "set_length": 197
    },
    "vit_base_patch16_224_at": {
        "model_name": "vit_base_patch16_224.augreg2_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 86567656,
        "size_fmt": "86.6M",
        "size_class": "small",
        "set_length": 197
    },
    "vit_large_patch16_224_at": {
        "model_name": "vit_large_patch16_224.augreg_in21k_ft_in1k",
        "source": "timm",
        "model_parameters": {
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "norm"
        ],
        "objective": "Supervised",
        "dataset": "ImageNet21k + finetuned on ImageNet1k",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "ImageNet21k + finetuned on ImageNet1k",
        "size": 304326632,
        "size_fmt": "304.3M",
        "size_class": "large",
        "set_length": 197
    },
    "dinov2-vit-small-p14_cls": {
        "model_name": "dinov2-vit-small-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "cls_token"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "alignment": null,
        "dataset_class": "Large",
        "embedding_dim": 384,
        "size": 22056576,
        "size_fmt": "22.1M",
        "size_class": "small",
        "set_length": 13
    },
    "dinov2-vit-base-p14_cls": {
        "model_name": "dinov2-vit-base-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "cls_token"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "alignment": null,
        "dataset_class": "Large",
        "embedding_dim": 768,
        "size": 86580480,
        "size_fmt": "86.6M",
        "size_class": "small",
        "set_length": 13
    },
    "dinov2-vit-large-p14_cls": {
        "model_name": "dinov2-vit-large-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "cls_token"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "blocks.12.norm2",
            "blocks.13.norm2",
            "blocks.14.norm2",
            "blocks.15.norm2",
            "blocks.16.norm2",
            "blocks.17.norm2",
            "blocks.18.norm2",
            "blocks.19.norm2",
            "blocks.20.norm2",
            "blocks.21.norm2",
            "blocks.22.norm2",
            "blocks.23.norm2",
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "Large",
        "size": 304368640,
        "size_fmt": "304.4M",
        "size_class": "large",
        "set_length": 25
    },
    "dinov2-vit-small-p14_ap": {
        "model_name": "dinov2-vit-small-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "alignment": null,
        "dataset_class": "Large",
        "embedding_dim": 384,
        "size": 22056576,
        "size_fmt": "22.1M",
        "size_class": "small",
        "set_length": 13
    },
    "dinov2-vit-base-p14_ap": {
        "model_name": "dinov2-vit-base-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "alignment": null,
        "dataset_class": "Large",
        "embedding_dim": 768,
        "size": 86580480,
        "size_fmt": "86.6M",
        "size_class": "small",
        "set_length": 13
    },
    "dinov2-vit-large-p14_ap": {
        "model_name": "dinov2-vit-large-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "blocks.0.norm2",
            "blocks.1.norm2",
            "blocks.2.norm2",
            "blocks.3.norm2",
            "blocks.4.norm2",
            "blocks.5.norm2",
            "blocks.6.norm2",
            "blocks.7.norm2",
            "blocks.8.norm2",
            "blocks.9.norm2",
            "blocks.10.norm2",
            "blocks.11.norm2",
            "blocks.12.norm2",
            "blocks.13.norm2",
            "blocks.14.norm2",
            "blocks.15.norm2",
            "blocks.16.norm2",
            "blocks.17.norm2",
            "blocks.18.norm2",
            "blocks.19.norm2",
            "blocks.20.norm2",
            "blocks.21.norm2",
            "blocks.22.norm2",
            "blocks.23.norm2",
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "Large",
        "size": 304368640,
        "size_fmt": "304.4M",
        "size_class": "large",
        "set_length": 25
    },
    "dinov2-vit-small-p14_at": {
        "model_name": "dinov2-vit-small-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "alignment": null,
        "dataset_class": "Large",
        "embedding_dim": 384,
        "size": 22056576,
        "size_fmt": "22.1M",
        "size_class": "small",
        "set_length": 257
    },
    "dinov2-vit-base-p14_at": {
        "model_name": "dinov2-vit-base-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "alignment": null,
        "dataset_class": "Large",
        "embedding_dim": 768,
        "size": 86580480,
        "size_fmt": "86.6M",
        "size_class": "small",
        "set_length": 257
    },
    "dinov2-vit-large-p14_at": {
        "model_name": "dinov2-vit-large-p14",
        "source": "ssl",
        "model_parameters": {
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "norm"
        ],
        "objective": "Self-Supervised",
        "dataset": "LVD-142M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "Large",
        "size": 304368640,
        "size_fmt": "304.4M",
        "size_class": "large",
        "set_length": 257
    },
    "OpenCLIP_ViT-B-32_openai_cls": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-B-32",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 12,
                "width": 768,
                "patch_size": 32,
                "pool_type": "none"
            },
            "token_extraction": "cls_token"
        },
        "module_names": [
            "visual.transformer.resblocks.0.ln_2",
            "visual.transformer.resblocks.1.ln_2",
            "visual.transformer.resblocks.2.ln_2",
            "visual.transformer.resblocks.3.ln_2",
            "visual.transformer.resblocks.4.ln_2",
            "visual.transformer.resblocks.5.ln_2",
            "visual.transformer.resblocks.6.ln_2",
            "visual.transformer.resblocks.7.ln_2",
            "visual.transformer.resblocks.8.ln_2",
            "visual.transformer.resblocks.9.ln_2",
            "visual.transformer.resblocks.10.ln_2",
            "visual.transformer.resblocks.11.ln_2",
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "Large",
        "size": 151277313,
        "size_fmt": "151.3M",
        "size_class": "medium",
        "set_length": 13
    },
    "OpenCLIP_ViT-B-16_openai_cls": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-B-16",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 12,
                "width": 768,
                "patch_size": 16,
                "pool_type": "none"
            },
            "token_extraction": "cls_token"
        },
        "module_names": [
            "visual.transformer.resblocks.0.ln_2",
            "visual.transformer.resblocks.1.ln_2",
            "visual.transformer.resblocks.2.ln_2",
            "visual.transformer.resblocks.3.ln_2",
            "visual.transformer.resblocks.4.ln_2",
            "visual.transformer.resblocks.5.ln_2",
            "visual.transformer.resblocks.6.ln_2",
            "visual.transformer.resblocks.7.ln_2",
            "visual.transformer.resblocks.8.ln_2",
            "visual.transformer.resblocks.9.ln_2",
            "visual.transformer.resblocks.10.ln_2",
            "visual.transformer.resblocks.11.ln_2",
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "Large",
        "size": 149620737,
        "size_fmt": "149.6M",
        "size_class": "medium",
        "set_length": 13
    },
    "OpenCLIP_ViT-L-14_openai_cls": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-L-14",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 24,
                "width": 1024,
                "patch_size": 14,
                "pool_type": "none"
            },
            "token_extraction": "cls_token"
        },
        "module_names": [
            "visual.transformer.resblocks.0.ln_2",
            "visual.transformer.resblocks.1.ln_2",
            "visual.transformer.resblocks.2.ln_2",
            "visual.transformer.resblocks.3.ln_2",
            "visual.transformer.resblocks.4.ln_2",
            "visual.transformer.resblocks.5.ln_2",
            "visual.transformer.resblocks.6.ln_2",
            "visual.transformer.resblocks.7.ln_2",
            "visual.transformer.resblocks.8.ln_2",
            "visual.transformer.resblocks.9.ln_2",
            "visual.transformer.resblocks.10.ln_2",
            "visual.transformer.resblocks.11.ln_2",
            "visual.transformer.resblocks.12.ln_2",
            "visual.transformer.resblocks.13.ln_2",
            "visual.transformer.resblocks.14.ln_2",
            "visual.transformer.resblocks.15.ln_2",
            "visual.transformer.resblocks.16.ln_2",
            "visual.transformer.resblocks.17.ln_2",
            "visual.transformer.resblocks.18.ln_2",
            "visual.transformer.resblocks.19.ln_2",
            "visual.transformer.resblocks.20.ln_2",
            "visual.transformer.resblocks.21.ln_2",
            "visual.transformer.resblocks.22.ln_2",
            "visual.transformer.resblocks.23.ln_2",
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "Large",
        "size": 427616513,
        "size_fmt": "427.6M",
        "size_class": "xlarge",
        "set_length": 25
    },
    "OpenCLIP_ViT-B-32_openai_ap": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-B-32",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 12,
                "width": 768,
                "patch_size": 32,
                "pool_type": "none"
            },
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "visual.transformer.resblocks.0.ln_2",
            "visual.transformer.resblocks.1.ln_2",
            "visual.transformer.resblocks.2.ln_2",
            "visual.transformer.resblocks.3.ln_2",
            "visual.transformer.resblocks.4.ln_2",
            "visual.transformer.resblocks.5.ln_2",
            "visual.transformer.resblocks.6.ln_2",
            "visual.transformer.resblocks.7.ln_2",
            "visual.transformer.resblocks.8.ln_2",
            "visual.transformer.resblocks.9.ln_2",
            "visual.transformer.resblocks.10.ln_2",
            "visual.transformer.resblocks.11.ln_2",
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "Large",
        "size": 151277313,
        "size_fmt": "151.3M",
        "size_class": "medium",
        "set_length": 13
    },
    "OpenCLIP_ViT-B-16_openai_ap": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-B-16",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 12,
                "width": 768,
                "patch_size": 16,
                "pool_type": "none"
            },
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "visual.transformer.resblocks.0.ln_2",
            "visual.transformer.resblocks.1.ln_2",
            "visual.transformer.resblocks.2.ln_2",
            "visual.transformer.resblocks.3.ln_2",
            "visual.transformer.resblocks.4.ln_2",
            "visual.transformer.resblocks.5.ln_2",
            "visual.transformer.resblocks.6.ln_2",
            "visual.transformer.resblocks.7.ln_2",
            "visual.transformer.resblocks.8.ln_2",
            "visual.transformer.resblocks.9.ln_2",
            "visual.transformer.resblocks.10.ln_2",
            "visual.transformer.resblocks.11.ln_2",
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "Large",
        "size": 149620737,
        "size_fmt": "149.6M",
        "size_class": "medium",
        "set_length": 13
    },
    "OpenCLIP_ViT-L-14_openai_ap": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-L-14",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 24,
                "width": 1024,
                "patch_size": 14,
                "pool_type": "none"
            },
            "token_extraction": "avg_pool"
        },
        "module_names": [
            "visual.transformer.resblocks.0.ln_2",
            "visual.transformer.resblocks.1.ln_2",
            "visual.transformer.resblocks.2.ln_2",
            "visual.transformer.resblocks.3.ln_2",
            "visual.transformer.resblocks.4.ln_2",
            "visual.transformer.resblocks.5.ln_2",
            "visual.transformer.resblocks.6.ln_2",
            "visual.transformer.resblocks.7.ln_2",
            "visual.transformer.resblocks.8.ln_2",
            "visual.transformer.resblocks.9.ln_2",
            "visual.transformer.resblocks.10.ln_2",
            "visual.transformer.resblocks.11.ln_2",
            "visual.transformer.resblocks.12.ln_2",
            "visual.transformer.resblocks.13.ln_2",
            "visual.transformer.resblocks.14.ln_2",
            "visual.transformer.resblocks.15.ln_2",
            "visual.transformer.resblocks.16.ln_2",
            "visual.transformer.resblocks.17.ln_2",
            "visual.transformer.resblocks.18.ln_2",
            "visual.transformer.resblocks.19.ln_2",
            "visual.transformer.resblocks.20.ln_2",
            "visual.transformer.resblocks.21.ln_2",
            "visual.transformer.resblocks.22.ln_2",
            "visual.transformer.resblocks.23.ln_2",
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 1024,
        "alignment": null,
        "dataset_class": "Large",
        "size": 427616513,
        "size_fmt": "427.6M",
        "size_class": "xlarge",
        "set_length": 25
    },
    "OpenCLIP_ViT-B-32_openai_at": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-B-32",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 12,
                "width": 768,
                "patch_size": 32,
                "pool_type": "none"
            },
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 512,
        "alignment": null,
        "dataset_class": "Large",
        "size": 151277313,
        "size_fmt": "151.3M",
        "size_class": "medium",
        "set_length": 50
    },
    "OpenCLIP_ViT-B-16_openai_at": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-B-16",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 12,
                "width": 768,
                "patch_size": 16,
                "pool_type": "none"
            },
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 512,
        "alignment": null,
        "dataset_class": "Large",
        "size": 149620737,
        "size_fmt": "149.6M",
        "size_class": "medium",
        "set_length": 197
    },
    "OpenCLIP_ViT-L-14_openai_at": {
        "model_name": "OpenCLIP",
        "source": "custom",
        "model_parameters": {
            "variant": "ViT-L-14",
            "dataset": "openai",
            "vision_cfg": {
                "image_size": 224,
                "layers": 24,
                "width": 1024,
                "patch_size": 14,
                "pool_type": "none"
            },
            "token_extraction": "all_tokens"
        },
        "module_names": [
            "visual"
        ],
        "objective": "Image-Text",
        "dataset": "WIT-400M",
        "architecture_class": "Transformer",
        "architecture": "ViT",
        "embedding_dim": 768,
        "alignment": null,
        "dataset_class": "Large",
        "size": 427616513,
        "size_fmt": "427.6M",
        "size_class": "xlarge",
        "set_length": 257
    }
}