def make_quant(
    module,
    qcfg: QuantizeConfig,
    quant_result: Dict[str, Dict[str, Any]],
    backend: BACKEND,
    lm_head_name: str,
    pack: bool = False,
    device: DEVICE = None,
    from_quantized: bool = False,
) -> Type[BaseQuantLinear]:
    # Excerpt of make_quant() showing only the BPDQ-specific changes.
    # Sections marked "# ..." are elided, so names such as `cls`, `bits`,
    # `desc_act`, `dynamic`, `group_size`, `sym` and `pack_dtype` used further
    # down are bound in code that is not shown here.
    
    # ...
    # gptqmodel_5.6.0 
    # if not pack and format == FORMAT.GPTQ and backend == BACKEND.BITBLAS:
    #     backend = BACKEND.TORCH


    # BPDQ
    # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
    # Extra keyword arguments forwarded to create_quant_layer(); this stays
    # empty unless a BPDQ checkpoint is being loaded.
    bpdq_kwargs = {}
    # The BPDQ path is taken only when the config carries a truthy `bpdq_flag`
    # AND the model is being loaded from an already-quantized checkpoint.
    is_bpdq_load = getattr(qcfg, "bpdq_flag", False) and from_quantized
    if is_bpdq_load:
        log.info("BPDQ model flag detected. Forcing backend to TORCH and passing BPDQ params.")
        # NOTE(review): despite the wording of the log message above, the
        # backend is switched to TORCH only when it was left as AUTO; an
        # explicitly requested backend is kept unchanged.
        if backend == BACKEND.AUTO:
             backend = BACKEND.TORCH
        bpdq_kwargs["bpdq_flag"] = True
        # Config attributes that are absent default to None.
        msb_num = getattr(qcfg, "msb_num", None)
        n_iters = getattr(qcfg, "n_iters", None)
        alpha = getattr(qcfg, "alpha", None)

        # `msb_num` is mandatory and is forwarded under the keyword
        # `bpdq_k_bits`. `n_iters` and `alpha` are forwarded as-is, even when
        # they are None.
        if msb_num is not None:
            bpdq_kwargs["bpdq_k_bits"] = msb_num
            bpdq_kwargs["n_iters"] = n_iters
            bpdq_kwargs["alpha"] = alpha
        else:
            # NOTE(review): the f-string below has no placeholders.
            raise ValueError(f"Check msb_num and bpdq_k_bits setting, it's TODO format.")
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



    # ...
    # gptqmodel_5.6.0 
    # Upstream call, kept commented out for comparison with the BPDQ variant
    # below (identical except for the trailing **bpdq_kwargs):
            # linear_cls = create_quant_layer(
            #     linear_cls=cls,
            #     bits=bits,
            #     desc_act=desc_act,
            #     dynamic=dynamic,
            #     group_size=group_size,
            #     module=module,
            #     sym=sym,
            #     device=device,
            #     quant_result=quant_result,
            #     lm_head_name=lm_head_name,
            #     pack_dtype=pack_dtype,
            #     backend=backend,
            #     adapter=qcfg.adapter,
            # )


    # BPDQ
    # Same call as upstream, plus the BPDQ keyword arguments collected above.
    # The deeper indentation reflects enclosing control flow that is elided
    # from this excerpt.
            linear_cls = create_quant_layer(
                linear_cls=cls,
                bits=bits,
                desc_act=desc_act,
                dynamic=dynamic,
                group_size=group_size,
                module=module,
                sym=sym,
                device=device,
                quant_result=quant_result,
                lm_head_name=lm_head_name,
                pack_dtype=pack_dtype,
                backend=backend,
                adapter=qcfg.adapter,
                **bpdq_kwargs 
            )


def create_quant_module(
    name: str,
    linear_cls: Type[BaseQuantLinear],
    bits: int,
    desc_act: bool,
    dynamic,
    group_size: int,
    module: nn.Module,
    submodule: nn.Module,
    sym: bool,
    device: DEVICE,
    lm_head_name: str,
    pack_dtype: torch.dtype,
    backend: BACKEND = BACKEND.AUTO,
    register_buffers: bool = True,
    adapter: Optional[Adapter] = None,
    # NOTE BPDQ
    **kwargs
    # NOTE BPDQ
):
    """Construct a ``linear_cls`` layer for the submodule called ``name`` (excerpt).

    Only the BPDQ-relevant part of the function is shown. The code that binds
    ``tmp_bits``, ``tmp_group_size``, ``tmp_desc_act``, ``tmp_sym``,
    ``tmp_pack_dtype``, ``in_features``, ``out_features`` and ``bias`` is
    elided, as is whatever is done with ``new_layer`` after it is built.

    Args:
        **kwargs: Extra keyword arguments passed unchanged to the
            ``linear_cls`` constructor (BPDQ addition).
    """

# ... 

    # The tmp_* values are bound in the elided section above; presumably they
    # are per-module overrides of bits/group_size/desc_act/sym/pack_dtype --
    # TODO confirm against the full source.
    new_layer = linear_cls(
        bits=tmp_bits,
        group_size=tmp_group_size,
        desc_act=tmp_desc_act,
        sym=tmp_sym,
        in_features=in_features,
        out_features=out_features,
        pack_dtype=tmp_pack_dtype,
        bias=bias,
        #weight_dtype=submodule.qweight.dtype if isinstance(submodule, BaseQuantLinear) else submodule.weight.dtype,
        name=name,
        lm_head_name=lm_head_name,
        backend=backend,
        register_buffers=register_buffers,
        adapter=adapter,
        # NOTE BPDQ
        # Forward the caller-supplied extra keyword arguments to the layer
        # constructor.
        **kwargs
        # NOTE BPDQ
    )


def create_quant_layer(
        linear_cls: Type[BaseQuantLinear],
        bits: int,
        desc_act: bool,
        dynamic,
        group_size: int,
        quant_result: Dict[str, Dict[str, Any]],
        module,
        sym: bool,
        device: DEVICE,
        lm_head_name: str,
        pack_dtype: torch.dtype,
        backend: BACKEND,
        adapter: Optional[Adapter] = None,
        # NOTE BPDQ
        **kwargs
        # NOTE BPDQ
) -> Type[BaseQuantLinear]:
    """Call ``create_quant_module`` for every quantized submodule of ``module``.

    A submodule counts as quantized when its name (as yielded by
    ``module.named_modules()``) is a key of ``quant_result``; all other
    submodules are skipped.

    Args:
        linear_cls: Quantized linear class to build for each selected
            submodule.
        bits, desc_act, dynamic, group_size, sym, device, lm_head_name,
            pack_dtype, backend, adapter: Forwarded unchanged to
            ``create_quant_module``.
        quant_result: Mapping whose keys are the names of the submodules to
            process.
        module: Root module to walk. If it is already an instance of
            ``linear_cls`` nothing is done.
        **kwargs: Extra keyword arguments (BPDQ addition) forwarded unchanged
            to ``create_quant_module``.

    Returns:
        ``linear_cls``, the class that was passed in.
    """
    # The root itself is already the target layer type: nothing to do.
    if isinstance(module, linear_cls):
        return linear_cls

    for name, submodule in module.named_modules():
        # skip non-quantized modules
        if name not in quant_result:
            continue

        create_quant_module(
            name=name,
            linear_cls=linear_cls,
            bits=bits,
            desc_act=desc_act,
            dynamic=dynamic,
            group_size=group_size,
            module=module,
            submodule=submodule,
            sym=sym,
            device=device,
            lm_head_name=lm_head_name,
            pack_dtype=pack_dtype,
            backend=backend,
            adapter=adapter,
            # NOTE BPDQ
            **kwargs
            # NOTE BPDQ
        )

    # Return the class that was passed in (the original returned the
    # undefined name `linear_cl`, which raised NameError).
    return linear_cls



@torch.inference_mode()
def pack_module(
    name,
    qModules,
    q_scales,
    q_zeros,
    q_g_idx,
    layers,
    quant_linear_cls,
    lock: threading.Lock,
    q_scales_extra=None,
    quantize_config: Optional[QuantizeConfig] = None,
    quant_result: Optional[Dict[str, Any]] = None,
    # NOTE BPDQ
    my_c: Optional[torch.Tensor] = None,
    my_b: Optional[torch.Tensor] = None
    # NOTE BPDQ
):
    """Pack one quantized module (excerpt showing the BPDQ additions).

    Parts of the function are elided: ``layer`` and ``module`` are bound in
    code that is not shown, and the regular (non-BPDQ) packing path is only
    hinted at by the trailing fragment. ``q_scales``, ``q_zeros``,
    ``q_g_idx``, ``q_scales_extra`` and ``quant_result`` are not referenced in
    the visible portion.

    Args:
        name: Key under which ``layer`` / ``module`` are stored back into
            ``layers`` / ``qModules``.
        lock: Optional lock held while ``layers`` and ``qModules`` are
            updated; ``None`` means the update is done without locking.
        quantize_config: Its ``bpdq_flag`` attribute (if truthy) enables the
            BPDQ path.
        my_c, my_b: BPDQ tensors registered on the module as the buffers
            ``"c"`` and ``"B"``. Both must be given for the BPDQ path to run.

    Returns:
        On the BPDQ path, the label string ``"module.pack_bpdq"``. The return
        value of the regular path lies in elided code.
    """


    # Write `layer` and `module` back under `name`, holding the lock when one
    # was supplied.
    if lock is not None:
        with lock:
            layers[name] = layer
            qModules[name] = module
    else:
        layers[name] = layer
        qModules[name] = module
    # NOTE BPDQ
    # BPDQ packing needs all of: a config with a truthy `bpdq_flag`, and both
    # BPDQ tensors.
    is_bpdq = (
        quantize_config is not None
        and getattr(quantize_config, "bpdq_flag", False)
        and my_c is not None
        and my_b is not None
    )
    if is_bpdq:
        packer_label = "module.pack_bpdq"
        with log_time_block(
            packer_label,
            logger=log,
            module_name=name,
        ):
            my_c = my_c.to(CPU)
            my_b = my_b.to(CPU)

            # Drop the standard GPTQ buffers; on the BPDQ path the module
            # carries the "c" / "B" buffers registered below instead.
            # NOTE(review): `dict.pop` with a default does not raise on a
            # missing key, so the `except` fallback is only reachable in
            # unusual cases (e.g. `_buffers` missing). If `attr` exists but is
            # not a registered buffer, the pop is a no-op and the attribute
            # is left in place.
            for attr in ("qweight", "qzeros", "scales", "g_idx"):
                if hasattr(module, attr):
                    try:
                        module._buffers.pop(attr, None)
                    except Exception:
                        setattr(module, attr, None)

            module.register_buffer("c", my_c)
            module.register_buffer("B", my_b)

            # if getattr(layer, "bias", None) is not None and not hasattr(module, "bias"):
            #     module.register_buffer("bias", layer.bias.detach().to(torch.float16))
            
            # Mirror the source layer's bias onto the packed module.
            layer_bias = getattr(layer, "bias", None)
            if layer_bias is not None:
                # Detached, contiguous float16 copy on CPU.
                b = layer_bias.detach().to(device=CPU, dtype=torch.float16).contiguous()
                # Reuse an existing tensor buffer named "bias" via an in-place
                # copy; otherwise register a new buffer.
                if isinstance(getattr(module, "_buffers", None), dict) and isinstance(module._buffers.get("bias", None), torch.Tensor):
                    module._buffers["bias"].copy_(b)
                else:
                    module.register_buffer("bias", b)
            else:
                # No bias on the source layer: remove any "bias" buffer and
                # set the attribute to None.
                if isinstance(getattr(module, "_buffers", None), dict):
                    module._buffers.pop("bias", None)
                module.bias = None

        # BPDQ path ends here; the regular packing code is skipped.
        return packer_label
    # NOTE BPDQ

    # ...
        # NOTE(review): fragment of the regular packing path; its enclosing
        # control flow is elided. As laid out in this excerpt it would sit
        # after the `return` above and be unreachable.
        # The v2 -> v1 conversion runs only for GPTQ method + GPTQ format with
        # a kernel class that sets REQUIRES_FORMAT_V2, and (BPDQ addition)
        # only when neither BPDQ tensor was supplied.
        if (
            quantize_config is not None
            and quantize_config.quant_method == METHOD.GPTQ
            and quantize_config.format == FORMAT.GPTQ
            and getattr(quant_linear_cls, "REQUIRES_FORMAT_V2", False)
            # NOTE BPDQ
            and (my_c is None and my_b is None) 
            # NOTE BPDQ
        ):
            with log_time_block(
                "convert_v2_to_v1",
                logger=log,
                module_name=name,
            ):
                convert_gptq_v2_to_v1_format_module(
                    module=module,
                    quantize_config=quantize_config,
                )





