# Model selection.
model:
    model_id: 'flux'
    # NOTE(review): presumably the number of text tokens in the attention
    # sequence; confirm against the code that reads this key.
    n_text_tokens: 512
# The offline sparse plan.
# Nested keys use the same 4-space indentation step as the rest of this file.
sparse_plan:
    empty:
        max_threshold: 1.0
    permute:
        max_threshold: 0.01
        sparse_percentage: 0.9
    sparse:
        sparse_type: 'threshold'  # one of: 'threshold', 'mean'
        # The base block_sparse_size is [17K//13, 17K//6, 17K//9].
        sparse_block_size: 1
        # online_sparse_block_size is only used together with online mode and
        # overrides sparse_block_size from this sparse_plan.
        # (!) It is the actual block size and does not take attn_ds_rate into
        # account, since it is unrelated to the downsampled offline calib_data.
        # online_sparse_block_size: 64
        max_threshold: 0.01
        sum_threshold: 0.01
        timestep_wise: true
        block_sum_k: 5
# IMPORTANT: the permute plan is downsampled with a different ds_rate than fp.
calib_data:
    qkv: false
    attn_map: true
    attn_ds_rate: 16  # fp: [1,5,5], permute: block_size(64)
    qkv_ds_rate: 16
# NOTE(review): judging by the key name, names matching this regex presumably
# remain in floating point; confirm against the consumer.
# NOTE(review): if the consumer matches by substring search, the bare 'norm'
# alternative already covers norm_out/norm1/norm2; confirm the matching mode
# before simplifying the pattern.
remain_fp_regex: 'patch_embed|time_embedding|norm_out|proj_out|norm1|norm2|timestep_embedder|norm|linear_1|linear_2'
#weight:
    #n_bits: 8
    #sym: False
#act:
    #n_bits: 8
    #sym: True
# Attention settings.
attn:
    # exp_of_two_softmax: false
    head_split_num: 6  # 8 for sparse_only
    sparse:
        permute: true
        block_sparse: true
        pre_softmax: true
        # Rescale to balance the rate between masked_image / text.
        rescale_text_embeds: false
        online: false
        empty_head: true
        # NOTE(review): the key name looks like a misspelling of
        # "empty_head_processor". It is left unchanged because it must match
        # the key the consuming code reads; rename both together if desired.
        empty_head_procssor: 'zero'  # one of: 'zero', 'uniform'
        skip_timestep_percentage: 0.0
    # -----------------------------
    # qk:
    #     n_bits: 8
    #     sym: True
    # v:
    #     n_bits: 4
    #     sym: True
    # attn_map:  # V*attn_map_post_softmax quantized
    #     n_bits: 4
    #     sym: True
    #     block_size: 64  # usually the same as "online_sparse_block_size"
    #     group: 'block'
    # -----------------------------