# Training settings. The only key set here is the optimizer name.
TRAIN:
  # Optimizer identifier; how it is resolved is up to the config consumer.
  OPTIMIZER_NAME: adamw
# Quantization settings, split into four sub-sections: P, G, M and SQM.
# NOTE(review): P/G/M/SQM presumably stand for parameters, gradients, first
# moment and second moment of the optimizer - confirm against the consumer.
QUANT:
  # Sub-section P: BITS is 8 and the default quantization type is dim01.
  P:
    # Tensor-name suffixes to exclude (presumably matched against the end of
    # each tensor name - TODO confirm).
    # NOTE(review): head.weight and patch_embed.proj.weight are listed here
    # but also have explicit QUANT_TYPE entries below - confirm which wins.
    EXCLUDE_SUFFIX: [norm.weight, norm1.weight, norm2.weight, head.weight, patch_embed.proj.weight, relative_position_bias_table]
    BITS: 8
    # Per-tensor quantization type, keyed by tensor name. Values used in this
    # sub-section: dim0, dim1, dim01, dim10.
    QUANT_TYPE:
      # Presumably the type used for tensors without an explicit entry below
      # - TODO confirm.
      DEFAULT: dim01
      # NOTE(review): presumably False means the per-tensor entries below are
      # honoured instead of using DEFAULT everywhere - confirm.
      DEFAULT_ONLY: False
      patch_embed.proj.weight: dim0
      layers.0.blocks.0.attn.qkv.weight: dim01
      layers.0.blocks.0.attn.proj.weight: dim01
      layers.0.blocks.0.mlp.fc1.weight: dim0
      layers.0.blocks.0.mlp.fc2.weight: dim1
      layers.0.blocks.1.attn.qkv.weight: dim10
      layers.0.blocks.1.attn.proj.weight: dim10
      layers.0.blocks.1.mlp.fc1.weight: dim0
      layers.0.blocks.1.mlp.fc2.weight: dim10
      layers.0.downsample.reduction.weight: dim01
      layers.1.blocks.0.attn.qkv.weight: dim10
      layers.1.blocks.0.attn.proj.weight: dim01
      layers.1.blocks.0.mlp.fc1.weight: dim0
      layers.1.blocks.0.mlp.fc2.weight: dim1
      layers.1.blocks.1.attn.qkv.weight: dim10
      layers.1.blocks.1.attn.proj.weight: dim01
      layers.1.blocks.1.mlp.fc1.weight: dim01
      layers.1.blocks.1.mlp.fc2.weight: dim1
      layers.1.downsample.reduction.weight: dim01
      layers.2.blocks.0.attn.qkv.weight: dim01
      layers.2.blocks.0.attn.proj.weight: dim01
      layers.2.blocks.0.mlp.fc1.weight: dim01
      layers.2.blocks.0.mlp.fc2.weight: dim1
      layers.2.blocks.1.attn.qkv.weight: dim01
      layers.2.blocks.1.attn.proj.weight: dim01
      layers.2.blocks.1.mlp.fc1.weight: dim10
      layers.2.blocks.1.mlp.fc2.weight: dim1
      layers.2.blocks.2.attn.qkv.weight: dim0
      layers.2.blocks.2.attn.proj.weight: dim01
      layers.2.blocks.2.mlp.fc1.weight: dim0
      layers.2.blocks.2.mlp.fc2.weight: dim1
      layers.2.blocks.3.attn.qkv.weight: dim10
      layers.2.blocks.3.attn.proj.weight: dim01
      layers.2.blocks.3.mlp.fc1.weight: dim0
      layers.2.blocks.3.mlp.fc2.weight: dim10
      layers.2.blocks.4.attn.qkv.weight: dim10
      layers.2.blocks.4.attn.proj.weight: dim01
      layers.2.blocks.4.mlp.fc1.weight: dim01
      layers.2.blocks.4.mlp.fc2.weight: dim1
      layers.2.blocks.5.attn.qkv.weight: dim0
      layers.2.blocks.5.attn.proj.weight: dim01
      layers.2.blocks.5.mlp.fc1.weight: dim0
      layers.2.blocks.5.mlp.fc2.weight: dim01
      layers.2.downsample.reduction.weight: dim01
      layers.3.blocks.0.attn.qkv.weight: dim10
      layers.3.blocks.0.attn.proj.weight: dim01
      layers.3.blocks.0.mlp.fc1.weight: dim01
      layers.3.blocks.0.mlp.fc2.weight: dim01
      layers.3.blocks.1.attn.qkv.weight: dim10
      layers.3.blocks.1.attn.proj.weight: dim01
      layers.3.blocks.1.mlp.fc1.weight: dim10
      layers.3.blocks.1.mlp.fc2.weight: dim01
      head.weight: dim01
    # Per-tensor transform code, keyed by tensor name. Every entry in this
    # sub-section is 2; the meaning of the code is defined by the consumer.
    # Only a subset of the tensors listed under QUANT_TYPE has an entry here.
    TRANSFORM:
      DEFAULT_ONLY: False
      layers.0.blocks.0.attn.qkv.weight: 2
      layers.0.blocks.0.attn.proj.weight: 2
      layers.0.blocks.0.mlp.fc1.weight: 2
      layers.0.blocks.0.mlp.fc2.weight: 2
      layers.0.blocks.1.mlp.fc1.weight: 2
      layers.1.blocks.0.attn.qkv.weight: 2
      layers.1.blocks.0.attn.proj.weight: 2
      layers.1.blocks.0.mlp.fc1.weight: 2
      layers.1.blocks.0.mlp.fc2.weight: 2
      layers.1.blocks.1.attn.qkv.weight: 2
      layers.1.blocks.1.mlp.fc1.weight: 2
      layers.1.blocks.1.mlp.fc2.weight: 2
      layers.1.downsample.reduction.weight: 2
      layers.2.blocks.0.attn.qkv.weight: 2
      layers.2.blocks.0.attn.proj.weight: 2
      layers.2.blocks.0.mlp.fc1.weight: 2
      layers.2.blocks.0.mlp.fc2.weight: 2
      layers.2.blocks.1.attn.qkv.weight: 2
      layers.2.blocks.1.attn.proj.weight: 2
      layers.2.blocks.1.mlp.fc1.weight: 2
      layers.2.blocks.1.mlp.fc2.weight: 2
      layers.2.blocks.2.attn.qkv.weight: 2
      layers.2.blocks.2.attn.proj.weight: 2
      layers.2.blocks.2.mlp.fc1.weight: 2
      layers.2.blocks.2.mlp.fc2.weight: 2
      layers.2.blocks.3.attn.qkv.weight: 2
      layers.2.blocks.3.attn.proj.weight: 2
      layers.2.blocks.3.mlp.fc1.weight: 2
      layers.2.blocks.3.mlp.fc2.weight: 2
      layers.2.blocks.4.attn.qkv.weight: 2
      layers.2.blocks.4.attn.proj.weight: 2
      layers.2.blocks.4.mlp.fc1.weight: 2
      layers.2.blocks.4.mlp.fc2.weight: 2
      layers.2.blocks.5.attn.qkv.weight: 2
      layers.2.blocks.5.attn.proj.weight: 2
      layers.2.blocks.5.mlp.fc1.weight: 2
      layers.2.blocks.5.mlp.fc2.weight: 2
      layers.2.downsample.reduction.weight: 2
      layers.3.blocks.0.attn.qkv.weight: 2
      layers.3.blocks.0.attn.proj.weight: 2
      layers.3.blocks.0.mlp.fc1.weight: 2
      layers.3.blocks.0.mlp.fc2.weight: 2
      layers.3.blocks.1.attn.qkv.weight: 2
      layers.3.blocks.1.attn.proj.weight: 2
      layers.3.blocks.1.mlp.fc1.weight: 2
      layers.3.blocks.1.mlp.fc2.weight: 2
  # Sub-section G: ENABLE is False and no other settings are given.
  G:
    ENABLE: False
  # Sub-section M: BITS is 4 and the default quantization type is dim10.
  M:
    # Tensor-name suffixes to exclude (presumably matched against the end of
    # each tensor name - TODO confirm).
    # NOTE(review): head.weight and patch_embed.proj.weight are listed here
    # but also have explicit QUANT_TYPE entries below (head.weight has a
    # TRANSFORM entry as well) - confirm which wins.
    EXCLUDE_SUFFIX: [norm.weight, norm1.weight, norm2.weight, head.weight, patch_embed.proj.weight, relative_position_bias_table]
    BITS: 4
    # Per-tensor quantization type, keyed by tensor name. Values used in this
    # sub-section: dim0, dim01, dim10.
    QUANT_TYPE:
      # Presumably the type used for tensors without an explicit entry below
      # - TODO confirm.
      DEFAULT: dim10
      # NOTE(review): presumably False means the per-tensor entries below are
      # honoured instead of using DEFAULT everywhere - confirm.
      DEFAULT_ONLY: False
      patch_embed.proj.weight: dim0
      layers.0.blocks.0.attn.qkv.weight: dim10
      layers.0.blocks.0.attn.proj.weight: dim01
      layers.0.blocks.0.mlp.fc1.weight: dim10
      layers.0.blocks.0.mlp.fc2.weight: dim01
      layers.0.blocks.1.attn.qkv.weight: dim10
      layers.0.blocks.1.attn.proj.weight: dim01
      layers.0.blocks.1.mlp.fc1.weight: dim10
      layers.0.blocks.1.mlp.fc2.weight: dim01
      layers.0.downsample.reduction.weight: dim01
      layers.1.blocks.0.attn.qkv.weight: dim10
      layers.1.blocks.0.attn.proj.weight: dim10
      layers.1.blocks.0.mlp.fc1.weight: dim10
      layers.1.blocks.0.mlp.fc2.weight: dim01
      layers.1.blocks.1.attn.qkv.weight: dim10
      layers.1.blocks.1.attn.proj.weight: dim01
      layers.1.blocks.1.mlp.fc1.weight: dim10
      layers.1.blocks.1.mlp.fc2.weight: dim01
      layers.1.downsample.reduction.weight: dim10
      layers.2.blocks.0.attn.qkv.weight: dim10
      layers.2.blocks.0.attn.proj.weight: dim10
      layers.2.blocks.0.mlp.fc1.weight: dim10
      layers.2.blocks.0.mlp.fc2.weight: dim10
      layers.2.blocks.1.attn.qkv.weight: dim10
      layers.2.blocks.1.attn.proj.weight: dim01
      layers.2.blocks.1.mlp.fc1.weight: dim10
      layers.2.blocks.1.mlp.fc2.weight: dim10
      layers.2.blocks.2.attn.qkv.weight: dim10
      layers.2.blocks.2.attn.proj.weight: dim01
      layers.2.blocks.2.mlp.fc1.weight: dim10
      layers.2.blocks.2.mlp.fc2.weight: dim01
      layers.2.blocks.3.attn.qkv.weight: dim10
      layers.2.blocks.3.attn.proj.weight: dim01
      layers.2.blocks.3.mlp.fc1.weight: dim10
      layers.2.blocks.3.mlp.fc2.weight: dim10
      layers.2.blocks.4.attn.qkv.weight: dim10
      layers.2.blocks.4.attn.proj.weight: dim01
      layers.2.blocks.4.mlp.fc1.weight: dim10
      layers.2.blocks.4.mlp.fc2.weight: dim10
      layers.2.blocks.5.attn.qkv.weight: dim10
      layers.2.blocks.5.attn.proj.weight: dim10
      layers.2.blocks.5.mlp.fc1.weight: dim01
      layers.2.blocks.5.mlp.fc2.weight: dim10
      layers.2.downsample.reduction.weight: dim10
      layers.3.blocks.0.attn.qkv.weight: dim01
      layers.3.blocks.0.attn.proj.weight: dim01
      layers.3.blocks.0.mlp.fc1.weight: dim10
      layers.3.blocks.0.mlp.fc2.weight: dim01
      layers.3.blocks.1.attn.qkv.weight: dim10
      layers.3.blocks.1.attn.proj.weight: dim01
      layers.3.blocks.1.mlp.fc1.weight: dim10
      layers.3.blocks.1.mlp.fc2.weight: dim10
      head.weight: dim10
    # Per-tensor transform code, keyed by tensor name. Every entry in this
    # sub-section is 2; the meaning of the code is defined by the consumer.
    # Only a subset of the tensors listed under QUANT_TYPE has an entry here.
    TRANSFORM:
      DEFAULT_ONLY: False
      layers.0.blocks.0.attn.qkv.weight: 2
      layers.0.blocks.0.attn.proj.weight: 2
      layers.0.blocks.1.mlp.fc2.weight: 2
      layers.0.downsample.reduction.weight: 2
      layers.1.blocks.0.attn.qkv.weight: 2
      layers.1.blocks.0.mlp.fc2.weight: 2
      layers.1.blocks.1.attn.qkv.weight: 2
      layers.1.blocks.1.mlp.fc1.weight: 2
      layers.2.blocks.0.mlp.fc1.weight: 2
      layers.2.blocks.0.mlp.fc2.weight: 2
      layers.2.blocks.1.attn.qkv.weight: 2
      layers.2.blocks.1.attn.proj.weight: 2
      layers.2.blocks.1.mlp.fc1.weight: 2
      layers.2.blocks.1.mlp.fc2.weight: 2
      layers.2.blocks.2.attn.qkv.weight: 2
      layers.2.blocks.2.attn.proj.weight: 2
      layers.2.blocks.2.mlp.fc1.weight: 2
      layers.2.blocks.2.mlp.fc2.weight: 2
      layers.2.blocks.3.attn.qkv.weight: 2
      layers.2.blocks.3.attn.proj.weight: 2
      layers.2.blocks.3.mlp.fc1.weight: 2
      layers.2.blocks.3.mlp.fc2.weight: 2
      layers.2.blocks.4.attn.qkv.weight: 2
      layers.2.blocks.4.attn.proj.weight: 2
      layers.2.blocks.4.mlp.fc1.weight: 2
      layers.2.blocks.4.mlp.fc2.weight: 2
      layers.2.blocks.5.attn.qkv.weight: 2
      layers.2.blocks.5.attn.proj.weight: 2
      layers.2.blocks.5.mlp.fc1.weight: 2
      layers.2.blocks.5.mlp.fc2.weight: 2
      layers.2.downsample.reduction.weight: 2
      layers.3.blocks.0.attn.qkv.weight: 2
      layers.3.blocks.0.attn.proj.weight: 2
      layers.3.blocks.0.mlp.fc1.weight: 2
      layers.3.blocks.0.mlp.fc2.weight: 2
      layers.3.blocks.1.attn.qkv.weight: 2
      layers.3.blocks.1.attn.proj.weight: 2
      layers.3.blocks.1.mlp.fc1.weight: 2
      layers.3.blocks.1.mlp.fc2.weight: 2
      head.weight: 2
  # Sub-section SQM: BITS is 4 and the default quantization type is dim10.
  SQM:
    # Tensor-name suffixes to exclude (presumably matched against the end of
    # each tensor name - TODO confirm).
    # NOTE(review): head.weight and patch_embed.proj.weight are listed here
    # but also have explicit QUANT_TYPE entries below - confirm which wins.
    EXCLUDE_SUFFIX: [norm.weight, norm1.weight, norm2.weight, head.weight, patch_embed.proj.weight, relative_position_bias_table]
    BITS: 4
    # Per-tensor quantization type, keyed by tensor name. Values used in this
    # sub-section: dim0, dim01, dim10.
    QUANT_TYPE:
      # Presumably the type used for tensors without an explicit entry below
      # - TODO confirm.
      DEFAULT: dim10
      # NOTE(review): presumably False means the per-tensor entries below are
      # honoured instead of using DEFAULT everywhere - confirm.
      DEFAULT_ONLY: False
      patch_embed.proj.weight: dim0
      layers.0.blocks.0.attn.qkv.weight: dim01
      layers.0.blocks.0.attn.proj.weight: dim10
      layers.0.blocks.0.mlp.fc1.weight: dim10
      layers.0.blocks.0.mlp.fc2.weight: dim01
      layers.0.blocks.1.attn.qkv.weight: dim10
      layers.0.blocks.1.attn.proj.weight: dim01
      layers.0.blocks.1.mlp.fc1.weight: dim01
      layers.0.blocks.1.mlp.fc2.weight: dim01
      layers.0.downsample.reduction.weight: dim01
      layers.1.blocks.0.attn.qkv.weight: dim10
      layers.1.blocks.0.attn.proj.weight: dim01
      layers.1.blocks.0.mlp.fc1.weight: dim01
      layers.1.blocks.0.mlp.fc2.weight: dim01
      layers.1.blocks.1.attn.qkv.weight: dim10
      layers.1.blocks.1.attn.proj.weight: dim01
      layers.1.blocks.1.mlp.fc1.weight: dim10
      layers.1.blocks.1.mlp.fc2.weight: dim01
      layers.1.downsample.reduction.weight: dim01
      layers.2.blocks.0.attn.qkv.weight: dim10
      layers.2.blocks.0.attn.proj.weight: dim01
      layers.2.blocks.0.mlp.fc1.weight: dim10
      layers.2.blocks.0.mlp.fc2.weight: dim01
      layers.2.blocks.1.attn.qkv.weight: dim10
      layers.2.blocks.1.attn.proj.weight: dim01
      layers.2.blocks.1.mlp.fc1.weight: dim10
      layers.2.blocks.1.mlp.fc2.weight: dim01
      layers.2.blocks.2.attn.qkv.weight: dim10
      layers.2.blocks.2.attn.proj.weight: dim01
      layers.2.blocks.2.mlp.fc1.weight: dim10
      layers.2.blocks.2.mlp.fc2.weight: dim01
      layers.2.blocks.3.attn.qkv.weight: dim10
      layers.2.blocks.3.attn.proj.weight: dim10
      layers.2.blocks.3.mlp.fc1.weight: dim10
      layers.2.blocks.3.mlp.fc2.weight: dim10
      layers.2.blocks.4.attn.qkv.weight: dim10
      layers.2.blocks.4.attn.proj.weight: dim01
      layers.2.blocks.4.mlp.fc1.weight: dim01
      layers.2.blocks.4.mlp.fc2.weight: dim01
      layers.2.blocks.5.attn.qkv.weight: dim10
      layers.2.blocks.5.attn.proj.weight: dim01
      layers.2.blocks.5.mlp.fc1.weight: dim10
      layers.2.blocks.5.mlp.fc2.weight: dim01
      layers.2.downsample.reduction.weight: dim10
      layers.3.blocks.0.attn.qkv.weight: dim10
      layers.3.blocks.0.attn.proj.weight: dim01
      layers.3.blocks.0.mlp.fc1.weight: dim01
      layers.3.blocks.0.mlp.fc2.weight: dim01
      layers.3.blocks.1.attn.qkv.weight: dim10
      layers.3.blocks.1.attn.proj.weight: dim01
      layers.3.blocks.1.mlp.fc1.weight: dim10
      layers.3.blocks.1.mlp.fc2.weight: dim01
      head.weight: dim10
    # Per-tensor transform code, keyed by tensor name; the meaning of the code
    # is defined by the consumer. Only four tensors have an entry here.
    TRANSFORM:
      DEFAULT_ONLY: False
      # NOTE(review): 3 is the only TRANSFORM value in this file that is not 2
      # - confirm it is intentional and not a typo.
      layers.0.blocks.0.attn.qkv.weight: 3
      layers.0.blocks.1.mlp.fc1.weight: 2
      layers.1.blocks.0.mlp.fc1.weight: 2
      layers.3.blocks.0.mlp.fc1.weight: 2
