# Settings describing the target model.
model:
    model_id: "flux"  # identifier of the model this config targets
    n_text_tokens: 512  # presumably the number of text tokens in the sequence; TODO confirm against the consumer
sparse_plan:  # the offline sparse_plan
    # Per-stage thresholds of the offline plan. Indentation is normalized to the
    # 4-space step used by the rest of this file; keys and values are unchanged.
    empty:
        max_threshold: 1.0
    permute:
        max_threshold: 0.01
        sparse_percentage: 0.9
    sparse:
        sparse_type: "threshold"  # one of: 'threshold', 'mean'
        sparse_block_size: 1  # the base block_sparse_size is [17K//13, 17K//6, 17K//9]
        # online_sparse_block_size is only used together with online mode, where
        # it overrides sparse_block_size in this sparse_plan.
        # (!) It is the actual block size and does not take attn_ds_rate into
        # account, since it is unrelated to the downsampled offline calib_data.
        # online_sparse_block_size: 128
        max_threshold: 0.01
        sum_threshold: 0.01
        block_sum_k: 5
calib_data:  # IMPORTANT: the permute plan is downsampled with a different ds_rate than fp
    # NOTE(review): qkv / attn_map look like switches selecting which
    # calibration tensors are used; confirm against the consumer.
    qkv: False
    attn_map: True
    attn_ds_rate: 16  # fp: [1,5,5], permute: block_size(64)
    qkv_ds_rate: 4
    qkv_ds_type: "part"  # NOTE(review): accepted values are not visible in this file
# Regex alternation of name fragments; presumably modules whose names match
# are kept in fp (not quantized). TODO confirm against the consumer.
# NOTE(review): if the consumer uses substring search, the bare `norm`
# alternative already covers `norm1`, `norm2` and `norm_out`.
remain_fp_regex: "patch_embed|time_embedding|norm_out|proj_out|norm1|norm2|timestep_embedder|norm|linear_1|linear_2"   
#weight:
    #n_bits: 8
    #sym: False
#act:
    #n_bits: 8
    #sym: True
# Attention settings. Only head_split_num and sparse.permute are active;
# every other option below is commented out.
attn:
    # exp_of_two_softmax: False
    head_split_num: 6   # 8 for sparse_only
    sparse:
        permute: True
        # Optional sparse switches, currently disabled:
        #block_sparse: True
        #online: True
        #empty_head: True
        # NOTE(review): "procssor" looks like a misspelling of "processor";
        # confirm the key name the consumer expects before enabling it.
        #empty_head_procssor: 'zero'  # ['zero','uniform']
    # -----------------------------
    # Disabled quantization settings for the attention operands:
    # qk:
    #     n_bits: 8
    #     sym: True
    # v:
    #     n_bits: 4
    #     sym: True
    # attn_map:  # V*attn_map_post_softmax quantized
    #     n_bits: 4
    #     sym: True
    #     block_size: 64  # usually the same as "online_sparse_block_size"
    #     group: 'block'
    # -----------------------------