# Model selection for this run.
model:
    model_id: "flux"  # string identifier of the target model
    n_text_tokens: 512  # presumably the number of text tokens in the sequence — TODO confirm
sparse_plan:  # the offline sparse plan
    # Each sub-section (empty / permute / sparse) carries its own thresholds.
    empty:
        max_threshold: 1.5
    permute:
        max_threshold: 0.01
        sparse_percentage: 0.9
    sparse:
        sparse_type: "threshold"  # one of ['threshold', 'mean']
        sparse_block_size: 1  # the base block_sparse_size is [17K//13, 17K//6, 17K//9]
        # Only used together with `online`; it overrides sparse_block_size above.
        # (!) It is the actual block size and does not take attn_ds_rate into
        # account, since it is unrelated to how the offline calib_data is downsampled.
        # online_sparse_block_size: 64
        max_threshold: 0.015
        sum_threshold: 0.015
        timestep_wise: True
        block_sum_k: 5
# Calibration-data settings: two on/off flags plus downsampling parameters.
calib_data:
    qkv: False  # presumably toggles collecting q/k/v calibration data — TODO confirm
    attn_map: False  # presumably toggles collecting attention-map calibration data — TODO confirm
    attn_ds_rate: 16  # attention downsample rate (see the sparse_plan note about offline calib_data)
    qkv_ds_rate: 4  # presumably the downsample rate applied to q/k/v — TODO confirm
    qkv_ds_type: "part"  # NOTE(review): the set of valid values is not visible in this file
# Regex alternation of layer-name patterns; the key name suggests matching
# layers are kept in floating point — TODO confirm against the consumer.
# NOTE(review): if the consumer uses a substring search, the bare `norm`
# alternative already covers norm1 / norm2 / norm_out — confirm before simplifying.
remain_fp_regex: "patch_embed|time_embedding|norm_out|proj_out|norm1|norm2|timestep_embedder|norm|linear_1|linear_2" 
#weight:
    #n_bits: 8
    #sym: False
#act:
    #n_bits: 8
    #sym: True
# Attention settings: general flags, the `sparse` sub-section (plan file paths
# and sparsity switches), and per-tensor quantization settings (qk, v, attn_map).
attn:
    # exp_of_two_softmax: False
    head_split_num: 6
    skip_text_quant: True  # presumably skips quantization for the text-token part — TODO confirm
    FP8: False
    sparse:
        # Paths to the plan files; both point into logs/calib_data/test/.
        permute_plan: "logs/calib_data/test/permute_plan.pth"
        sparse_plan: "logs/calib_data/test/sparse_plan.pth"
        permute: True
        block_sparse: True
        pre_softmax: True
        rescale_text_embeds: False  # rescale to balance the rate between masked_image / text
        online: False
        empty_head: True
        # NOTE(review): the key below is spelled `empty_head_procssor` (sic). It is a
        # runtime key, so do not rename it here without updating the code that reads it.
        empty_head_procssor: 'zero'  # ['zero','uniform']
        skip_timestep_percentage: 0.0
    # -----------------------------
    # Quantization settings per tensor: bit width (n_bits) and symmetric flag (sym).
    qk:
        n_bits: 8
        sym: True
        smooth: True
        onlyKsmooth: False
    v:
        n_bits: 8
        per_group: 4096  # presumably the quantization group size — TODO confirm
        sym: True
    attn_map:  # V*attn_map_post_softmax quantized
        n_bits: 8
        sym: True
        block_size: 64  # usually the same as "online_sparse_block_size"
        group: 'block'
    # -----------------------------



