# Training options.
TRAIN:
  # Name of the optimizer to use.
  # NOTE(review): the set of accepted names is defined by the consumer of
  # this file, not here — confirm before changing.
  OPTIMIZER_NAME: adamw
# Quantization settings, split into the P, G, M, SQM and DEBUG subsections.
QUANT:
  # P: 8-bit settings.
  # NOTE(review): "P" presumably stands for model parameters — confirm
  # against the code that consumes this file.
  P:
    # Name suffixes for this group.
    # NOTE(review): presumably tensors whose names end with one of these
    # suffixes are skipped by the quantizer — confirm. head.weight and
    # patch_embed.proj.weight are listed here and also have SCALE_TYPE
    # entries below.
    EXCLUDE_SUFFIX:
      - norm.weight
      - norm1.weight
      - norm2.weight
      - head.weight
      - patch_embed.proj.weight
      - relative_position_bias_table
    BITS: 8
    # Scale type per tensor name. Each dotted name below is a single YAML
    # key, not a nested path.
    # NOTE(review): with DEFAULT_ONLY set to True, presumably only DEFAULT
    # is used and the per-tensor entries are ignored — confirm.
    # NOTE(review): the names look like Swin-Transformer-style parameter
    # names (stages with 2/2/6/2 blocks) — confirm against the model.
    SCALE_TYPE:
      DEFAULT: sm3
      DEFAULT_ONLY: True
      patch_embed.proj.weight: dim01
      layers.0.blocks.0.attn.qkv.weight: dim10
      layers.0.blocks.0.attn.proj.weight: dim01
      layers.0.blocks.0.mlp.fc1.weight: dim10
      layers.0.blocks.0.mlp.fc2.weight: dim10
      layers.0.blocks.1.attn.qkv.weight: dim10
      layers.0.blocks.1.attn.proj.weight: dim01
      layers.0.blocks.1.mlp.fc1.weight: dim10
      layers.0.blocks.1.mlp.fc2.weight: dim01
      layers.0.downsample.reduction.weight: dim01
      layers.1.blocks.0.attn.qkv.weight: dim10
      layers.1.blocks.0.attn.proj.weight: dim01
      layers.1.blocks.0.mlp.fc1.weight: dim01
      layers.1.blocks.0.mlp.fc2.weight: dim10
      layers.1.blocks.1.attn.qkv.weight: dim10
      layers.1.blocks.1.attn.proj.weight: dim10
      layers.1.blocks.1.mlp.fc1.weight: dim01
      layers.1.blocks.1.mlp.fc2.weight: dim10
      layers.1.downsample.reduction.weight: dim01
      layers.2.blocks.0.attn.qkv.weight: dim10
      layers.2.blocks.0.attn.proj.weight: dim01
      layers.2.blocks.0.mlp.fc1.weight: dim10
      layers.2.blocks.0.mlp.fc2.weight: dim10
      layers.2.blocks.1.attn.qkv.weight: dim01
      layers.2.blocks.1.attn.proj.weight: dim01
      layers.2.blocks.1.mlp.fc1.weight: dim10
      layers.2.blocks.1.mlp.fc2.weight: dim10
      layers.2.blocks.2.attn.qkv.weight: dim01
      layers.2.blocks.2.attn.proj.weight: dim01
      layers.2.blocks.2.mlp.fc1.weight: dim01
      layers.2.blocks.2.mlp.fc2.weight: dim10
      layers.2.blocks.3.attn.qkv.weight: dim10
      layers.2.blocks.3.attn.proj.weight: dim01
      layers.2.blocks.3.mlp.fc1.weight: dim01
      layers.2.blocks.3.mlp.fc2.weight: dim10
      layers.2.blocks.4.attn.qkv.weight: dim01
      layers.2.blocks.4.attn.proj.weight: dim01
      layers.2.blocks.4.mlp.fc1.weight: dim01
      layers.2.blocks.4.mlp.fc2.weight: dim10
      layers.2.blocks.5.attn.qkv.weight: dim01
      layers.2.blocks.5.attn.proj.weight: dim01
      layers.2.blocks.5.mlp.fc1.weight: dim01
      layers.2.blocks.5.mlp.fc2.weight: dim01
      layers.2.downsample.reduction.weight: dim01
      layers.3.blocks.0.attn.qkv.weight: dim10
      layers.3.blocks.0.attn.proj.weight: dim01
      layers.3.blocks.0.mlp.fc1.weight: dim10
      layers.3.blocks.0.mlp.fc2.weight: dim01
      layers.3.blocks.1.attn.qkv.weight: dim10
      layers.3.blocks.1.attn.proj.weight: dim01
      layers.3.blocks.1.mlp.fc1.weight: dim10
      layers.3.blocks.1.mlp.fc2.weight: dim01
      head.weight: dim01
    # Quantization mapping; only a DEFAULT is given for this group.
    QUANT_TYPE:
      DEFAULT: nonlinear
    # NOTE(review): "sr" presumably means stochastic rounding — confirm.
    ROUND_TYPE: sr
  # G: this group is switched off via ENABLE.
  # NOTE(review): "G" presumably stands for gradients — confirm against the
  # code that consumes this file.
  G:
    ENABLE: False
  # M: 4-bit settings.
  # NOTE(review): "M" presumably stands for the optimizer's first-moment
  # state — confirm against the code that consumes this file.
  M:
    # Name suffixes for this group.
    # NOTE(review): presumably tensors whose names end with one of these
    # suffixes are skipped by the quantizer — confirm. head.weight and
    # patch_embed.proj.weight are listed here and also have SCALE_TYPE
    # entries below.
    EXCLUDE_SUFFIX:
      - norm.weight
      - norm1.weight
      - norm2.weight
      - head.weight
      - patch_embed.proj.weight
      - relative_position_bias_table
    BITS: 4
    # Scale type per tensor name. Each dotted name below is a single YAML
    # key, not a nested path.
    # NOTE(review): with DEFAULT_ONLY set to True, presumably only DEFAULT
    # is used and the per-tensor entries are ignored — confirm.
    SCALE_TYPE:
      DEFAULT: sm3
      DEFAULT_ONLY: True
      patch_embed.proj.weight: dim0
      layers.0.blocks.0.attn.qkv.weight: dim01
      layers.0.blocks.0.attn.proj.weight: dim10
      layers.0.blocks.0.mlp.fc1.weight: dim10
      layers.0.blocks.0.mlp.fc2.weight: dim01
      layers.0.blocks.1.attn.qkv.weight: dim10
      layers.0.blocks.1.attn.proj.weight: dim10
      layers.0.blocks.1.mlp.fc1.weight: dim10
      layers.0.blocks.1.mlp.fc2.weight: dim01
      layers.0.downsample.reduction.weight: dim10
      layers.1.blocks.0.attn.qkv.weight: dim10
      layers.1.blocks.0.attn.proj.weight: dim10
      layers.1.blocks.0.mlp.fc1.weight: dim10
      layers.1.blocks.0.mlp.fc2.weight: dim01
      layers.1.blocks.1.attn.qkv.weight: dim10
      layers.1.blocks.1.attn.proj.weight: dim01
      layers.1.blocks.1.mlp.fc1.weight: dim10
      layers.1.blocks.1.mlp.fc2.weight: dim01
      layers.1.downsample.reduction.weight: dim10
      layers.2.blocks.0.attn.qkv.weight: dim10
      layers.2.blocks.0.attn.proj.weight: dim10
      layers.2.blocks.0.mlp.fc1.weight: dim10
      layers.2.blocks.0.mlp.fc2.weight: dim10
      layers.2.blocks.1.attn.qkv.weight: dim10
      layers.2.blocks.1.attn.proj.weight: dim10
      layers.2.blocks.1.mlp.fc1.weight: dim10
      layers.2.blocks.1.mlp.fc2.weight: dim01
      layers.2.blocks.2.attn.qkv.weight: dim10
      layers.2.blocks.2.attn.proj.weight: dim01
      layers.2.blocks.2.mlp.fc1.weight: dim10
      layers.2.blocks.2.mlp.fc2.weight: dim01
      layers.2.blocks.3.attn.qkv.weight: dim10
      layers.2.blocks.3.attn.proj.weight: dim01
      layers.2.blocks.3.mlp.fc1.weight: dim10
      layers.2.blocks.3.mlp.fc2.weight: dim10
      layers.2.blocks.4.attn.qkv.weight: dim10
      layers.2.blocks.4.attn.proj.weight: dim01
      layers.2.blocks.4.mlp.fc1.weight: dim10
      layers.2.blocks.4.mlp.fc2.weight: dim10
      layers.2.blocks.5.attn.qkv.weight: dim10
      layers.2.blocks.5.attn.proj.weight: dim01
      layers.2.blocks.5.mlp.fc1.weight: dim10
      layers.2.blocks.5.mlp.fc2.weight: dim10
      layers.2.downsample.reduction.weight: dim10
      layers.3.blocks.0.attn.qkv.weight: dim10
      layers.3.blocks.0.attn.proj.weight: dim01
      layers.3.blocks.0.mlp.fc1.weight: dim10
      layers.3.blocks.0.mlp.fc2.weight: dim10
      layers.3.blocks.1.attn.qkv.weight: dim10
      layers.3.blocks.1.attn.proj.weight: dim01
      layers.3.blocks.1.mlp.fc1.weight: dim10
      layers.3.blocks.1.mlp.fc2.weight: dim10
      head.weight: dim10
    # Quantization mapping; only a DEFAULT is given for this group.
    QUANT_TYPE:
      DEFAULT: customized-2
    ROUND_TYPE: nearest
  # SQM: 4-bit settings.
  # NOTE(review): "SQM" presumably stands for the optimizer's second-moment
  # (squared-gradient) state — confirm against the code that consumes this
  # file.
  SQM:
    # Name suffixes for this group.
    # NOTE(review): presumably tensors whose names end with one of these
    # suffixes are skipped by the quantizer — confirm. head.weight and
    # patch_embed.proj.weight are listed here and also have SCALE_TYPE
    # entries below.
    EXCLUDE_SUFFIX:
      - norm.weight
      - norm1.weight
      - norm2.weight
      - head.weight
      - patch_embed.proj.weight
      - relative_position_bias_table
    BITS: 4
    TRANSFORM:
      # Plain `None` is loaded by YAML as the string "None", not as null.
      # NOTE(review): presumably the consumer compares against that string
      # to mean "no transform" — confirm before rewriting it as null.
      DEFAULT: None
    # Scale type per tensor name. Each dotted name below is a single YAML
    # key, not a nested path.
    # NOTE(review): with DEFAULT_ONLY set to True, presumably only DEFAULT
    # is used and the per-tensor entries are ignored — confirm.
    SCALE_TYPE:
      DEFAULT: sm3
      DEFAULT_ONLY: True
      patch_embed.proj.weight: tensor
      layers.0.blocks.0.attn.qkv.weight: dim10
      layers.0.blocks.0.attn.proj.weight: dim10
      layers.0.blocks.0.mlp.fc1.weight: dim10
      layers.0.blocks.0.mlp.fc2.weight: dim01
      layers.0.blocks.1.attn.qkv.weight: dim10
      layers.0.blocks.1.attn.proj.weight: dim10
      layers.0.blocks.1.mlp.fc1.weight: dim10
      layers.0.blocks.1.mlp.fc2.weight: dim10
      layers.0.downsample.reduction.weight: dim01
      layers.1.blocks.0.attn.qkv.weight: dim01
      layers.1.blocks.0.attn.proj.weight: dim0
      layers.1.blocks.0.mlp.fc1.weight: dim10
      layers.1.blocks.0.mlp.fc2.weight: dim01
      layers.1.blocks.1.attn.qkv.weight: dim01
      layers.1.blocks.1.attn.proj.weight: dim0
      layers.1.blocks.1.mlp.fc1.weight: dim10
      layers.1.blocks.1.mlp.fc2.weight: dim01
      layers.1.downsample.reduction.weight: dim01
      layers.2.blocks.0.attn.qkv.weight: dim01
      layers.2.blocks.0.attn.proj.weight: dim10
      layers.2.blocks.0.mlp.fc1.weight: dim01
      layers.2.blocks.0.mlp.fc2.weight: dim10
      layers.2.blocks.1.attn.qkv.weight: dim01
      layers.2.blocks.1.attn.proj.weight: dim0
      layers.2.blocks.1.mlp.fc1.weight: dim10
      layers.2.blocks.1.mlp.fc2.weight: dim10
      layers.2.blocks.2.attn.qkv.weight: dim01
      layers.2.blocks.2.attn.proj.weight: dim0
      layers.2.blocks.2.mlp.fc1.weight: dim01
      layers.2.blocks.2.mlp.fc2.weight: dim01
      layers.2.blocks.3.attn.qkv.weight: dim01
      layers.2.blocks.3.attn.proj.weight: dim0
      layers.2.blocks.3.mlp.fc1.weight: dim01
      layers.2.blocks.3.mlp.fc2.weight: dim01
      layers.2.blocks.4.attn.qkv.weight: dim01
      layers.2.blocks.4.attn.proj.weight: dim0
      layers.2.blocks.4.mlp.fc1.weight: dim01
      layers.2.blocks.4.mlp.fc2.weight: dim01
      layers.2.blocks.5.attn.qkv.weight: dim01
      layers.2.blocks.5.attn.proj.weight: dim0
      layers.2.blocks.5.mlp.fc1.weight: dim10
      layers.2.blocks.5.mlp.fc2.weight: dim01
      layers.2.downsample.reduction.weight: dim10
      layers.3.blocks.0.attn.qkv.weight: dim10
      layers.3.blocks.0.attn.proj.weight: dim10
      layers.3.blocks.0.mlp.fc1.weight: dim01
      layers.3.blocks.0.mlp.fc2.weight: dim01
      layers.3.blocks.1.attn.qkv.weight: dim10
      layers.3.blocks.1.attn.proj.weight: dim10
      layers.3.blocks.1.mlp.fc1.weight: dim10
      layers.3.blocks.1.mlp.fc2.weight: dim01
      head.weight: dim1
    # Quantization mapping; only a DEFAULT is given for this group.
    QUANT_TYPE:
      DEFAULT: customized-log
    # NOTE(review): "sr" presumably means stochastic rounding — confirm.
    ROUND_TYPE: sr
  # Debug switches.
  DEBUG:
    # NOTE(review): presumably enables per-iteration truncation-rate
    # statistics — confirm against the code that reads this flag.
    TRUNCATED_RATE_STAT_ITER: False