# Quantization settings: the algorithm name plus its keyword options.
quant_config:
  algo: smoothquant
  kwargs:
    # Weight (w_) and activation (a_) data types.
    # Select from ['int2', 'int3', 'int4', 'int8', 'float16'].
    w_dtype: int8
    a_dtype: int8
    device: 'cuda'
    offload: 'cpu'
    # NOTE(review): presumably only relevant to group-wise weight
    # quantization; confirm it has an effect with w_qtype 'per_channel'.
    w_groupsize: 128
    w_qtype: 'per_channel'
    a_qtype: 'per_token'
    block_sequential: false
    layer_sequential: false
    # NOTE(review): presumably the SmoothQuant smoothing strength; confirm.
    alpha: 0.5
    quant_out: false
    # Linear layers to skip; each entry should match the module name.
    skip_layers:
      - lm_head
  # Calibration dataset settings.
  # NOTE(review): nsample is 1; confirm a single sample is intended.
  calibrate_config:
    name: 'wikitext2'
    split: 'train'
    nsample: 1
    seqlen: 2048
