# Model configuration whose `_target_` is `src.model.ttnet.TTNetLSTM`.
# The `defaults` / `_self_` / `_target_` keys follow Hydra conventions.

# Defaults list: `_self_` comes first, so under Hydra's composition order the
# entries listed after it are merged after (and take precedence over) the keys
# defined in this file.
defaults:
  - _self_
  # Selects the `top_k_top_p` option from the `generate_cfg` config group.
  - generate_cfg: top_k_top_p

# Dotted import path of the model class. Presumably the remaining keys are
# passed to its constructor as keyword arguments -- TODO confirm against
# the TTNetLSTM signature.
_target_: src.model.ttnet.TTNetLSTM
# NOTE(review): looks like a CLIP-style backbone identifier -- confirm how the
# model resolves this name.
image_encoder: ViT-B/32
# NOTE(review): presumably the shared embedding/hidden width; verify that it
# matches the output width of `image_encoder`.
dim: 512
# Flag controlling fine-tuning of the image encoder; disabled here.
finetune_image_encoder: false
# Number of context layers (exact layer type is defined by the model class).
num_context_layers: 2
# Positional-embedding mode for the context layers. Other accepted values are
# not visible from this file -- check the model class.
context_pos_emb: "relative"
# Interpolation: resolved at access time from the dataset config's
# `dataset.dataset_cfg.train.max_transformations`, so this value always follows
# the training dataset setting rather than being duplicated here.
max_transformations: ${dataset.dataset_cfg.train.max_transformations}
# Number of stacked LSTM layers.
num_lstm_layers: 2
# Dropout rates. NOTE(review): presumably applied to the embeddings and the
# LSTM respectively, as the names suggest -- confirm in the model code.
embed_dropout: 0.1
lstm_dropout: 0.5
