LlavaQwenForCausalLM(
  (model): LlavaQwenModel(
    (embed_tokens): Embedding(151646, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2FlashAttention2(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
          (dy_yarn_rotary_emb): QwenDynamicYaRNScaledRotaryEmbedding()
          (pi_rotary_emb): LinearScaledRotaryEmbedding()
          (ntk_rotary_emb): NTKScaledRotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
    (vision_tower): SigLipVisionTower(
      (vision_tower): SigLipVisionModel(
        (vision_model): SigLipVisionTransformer(
          (embeddings): SigLipVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
            (position_embedding): Embedding(729, 1152)
          )
          (encoder): SigLipEncoder(
            (layers): ModuleList(
              (0-25): 26 x SigLipEncoderLayer(
                (self_attn): SigLipAttention(
                  (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
                )
                (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
                (mlp): SigLipMLP(
                  (activation_fn): PytorchGELUTanh()
                  (fc1): Linear(in_features=1152, out_features=4304, bias=True)
                  (fc2): Linear(in_features=4304, out_features=1152, bias=True)
                )
                (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              )
            )
          )
          (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
          (head): Identity()
        )
      )
    )
    (vision_resampler): IdentityMap()
    (mm_projector): Sequential(
      (0): Linear(in_features=1152, out_features=3584, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=3584, out_features=3584, bias=True)
    )
  )
  (lm_head): Linear(in_features=3584, out_features=151646, bias=False)
)