# Model configuration named "CLIP": one vision encoder and one language
# encoder, each described by a nested mapping under `model`.
# NOTE(review): the `_target_` keys look like Hydra-style instantiation
# targets (dotted import paths) — confirm against the loader that reads this.
name: CLIP

model:
  _target_: models.clip.CLIP

  # Vision model to be trained.
  # Presumably the keys other than `_target_` are forwarded as keyword
  # arguments to timm.create_model — confirm.
  visual:
    _target_: timm.create_model
    model_name: vit_base_patch32_clip_224.openai
    pretrained: true  # request pretrained weights for the named timm model

  # Language model to be trained.
  # The options below are handed to the project's LanguageEncoder; their exact
  # semantics are defined there, not in this file.
  language:
    _target_: models.transformer.LanguageEncoder
    model_name: clip-ViT-B-32-multilingual-v1
    freeze: false  # presumably leaves the text encoder trainable — verify
    output_value: "sentence_embedding"
    normalize_embeddings: true
    use_dataset_cache: false

  # Visual projection. Set to null here; how models.clip.CLIP treats a null
  # projection is not visible from this file — TODO confirm.
  image_projection: null

  # Language projection. Set to null here, same caveat as image_projection.
  text_projection: null



