# Image settings
# Presumably the dimensionality of the image embeddings fed to the model.
# Note that it differs from hidden_size (768) in the transformer settings,
# so some projection between the two is likely — TODO confirm in model code.
input_image_embed_size: 1024

# Text settings
# Tokenizer note (comment only, not a parsed key): "roberta-base".
# vocab_size 50265 matches the roberta-base vocabulary size; keep the two in
# sync if the tokenizer is ever changed.
vocab_size: 50265
# Presumably the masked-language-modeling masking probability (15% of
# tokens) — confirm against the training code.
mlm_prob: 0.15
# Same value as hidden_size (768) in the transformer settings.
input_text_embed_size: 768

# Transformer settings
hidden_size: 768
# 768 / 12 = 64 dimensions per head, assuming heads split hidden_size evenly.
num_heads: 12
num_layers: 12
# Presumably the MLP width multiplier relative to hidden_size
# (4 * 768 = 3072) — confirm against the model code.
mlp_ratio: 4
# Presumably a dropout probability — confirm where it is applied.
drop_rate: 0.1
# Presumably the number of fusion blocks; how this relates to num_layers is
# not visible in this file — confirm against the model code.
num_fuse_block: 6
