# Per-branch input sampling settings. The branch names used here
# (technical, aesthetic) are the same names used under model.backbone.
data:
  # NOTE(review): the fragments_* / fsize_* keys presumably describe a grid
  # of small patches sampled from each frame — confirm against the loader.
  technical:
    # 7 x 32 = 224 in each dimension, the same value as the aesthetic
    # branch's size_h / size_w.
    fragments_h: 7
    fragments_w: 7
    fsize_h: 32
    fsize_w: 32
    # Same value as clip_len below; presumably a temporal alignment
    # length in frames — TODO confirm.
    aligned: 32
    clip_len: 32
    frame_interval: 2
    num_clips: 3
  # NOTE(review): presumably whole frames resized to size_h x size_w —
  # confirm against the loader.
  aesthetic:
    size_h: 224
    size_w: 224
    clip_len: 32
    frame_interval: 2
    # Same value as clip_len above; exact meaning is not visible from this
    # file — TODO confirm.
    t_frag: 32
    # Differs from the technical branch, which sets num_clips: 3.
    num_clips: 1

# Model definition: one backbone per branch plus a shared head config.
# The backbone names (technical, aesthetic) are the same branch names used
# under the top-level data mapping.
model:
  backbone:
    technical:
      type: swin_tiny_grpb
      # NOTE(review): presumably toggles gradient/activation checkpointing
      # rather than weight loading — confirm with the model builder.
      checkpoint: true
      # Explicit null instead of a bare empty value: both parse to null,
      # but the explicit form makes it clear the omission is intentional.
      pretrained: null
    aesthetic:
      type: conv_tiny
  # Kept as a single comma-joined plain string (not a YAML list).
  # NOTE(review): presumably the consumer splits this on ',' — verify before
  # converting it to a sequence.
  backbone_preserve_keys: technical,aesthetic
  # NOTE(review): presumably builds a separate head per backbone — confirm.
  divide_head: true
  vqa_head:
    # NOTE(review): presumably must equal the backbones' output feature
    # width — confirm for both backbone types.
    in_channels: 768
    hidden_channels: 64