# Model selection for this config.
model:
    # Identifier string of the target model.
    model_id: 'cogvideox'
    # presumably the number of text-prompt tokens in the attention sequence — TODO confirm
    n_text_tokens: 226
# Settings for the offline sparse_plan.
# Nested sections use the same 4-space indentation step as the rest of the file.
sparse_plan:
    empty:
        # Keep the "." in the mantissa: YAML 1.1 loaders such as PyYAML only
        # resolve exponent notation as a float when a dot is present.
        max_threshold: 5.e-3
    permute:
        max_threshold: 1.e-3
        sparse_percentage: 0.9
    sparse:
        sparse_type: "threshold"  # one of: 'threshold', 'mean'
        # The base block_sparse_size is [17K//13, 17K//6, 17K//9].
        sparse_block_size: 1
        # online_sparse_block_size only pairs with online mode and overrides
        # the sparse_block_size in sparse_plan.
        # (!) It is the actual block size and does not take attn_ds_rate into
        # account, since it is unrelated to the downsampled offline calib_data.
        #  online_sparse_block_size: 64
        max_threshold: 0.15
        sum_threshold: 0.15
        timestep_wise: true
        block_sum_k: 5
# IMPORTANT: the permute plan is downsampled with a different ds_rate than fp.
calib_data:
    qkv: false
    attn_map: true
    attn_ds_rate: 64  # fp: [1,5,5], permute: block_size(64)
    qkv_ds_rate: 16
# "|"-separated regex alternatives; presumably layers whose names match are
# kept in full precision (fp) — TODO confirm against the consumer.
remain_fp_regex: 'patch_embed|time_embedding|norm_out|proj_out|norm1|norm2'
#weight:
    #n_bits: 8
    #sym: False
#act:
    #n_bits: 8
    #sym: True
# Attention settings: sparse-plan inputs plus the quantization parameters
# for qk, v and the post-softmax attention map.
attn:
    # exp_of_two_softmax: False
    head_split_num: 16  # 8 for sparse_only
    skip_text_quant: true
    FP8: false
    sparse:
        # Files under logs/calib_data/gen_sparse/ holding the two plans.
        permute_plan: "logs/calib_data/gen_sparse/permute_plan.pth"
        sparse_plan: "logs/calib_data/gen_sparse/sparse_plan.pth"
        permute: true
        block_sparse: true
        online: false
        empty_head: true
        # NOTE(review): the key is spelled "procssor" (sic). It is kept as-is
        # because the consumer presumably looks up this exact spelling —
        # rename only together with the code that reads it.
        empty_head_procssor: "zero"  # one of: 'zero', 'uniform'
        skip_timestep_percentage: 0.0
    # -----------------------------
    qk:
        n_bits: 8
        sym: true
    v:
        n_bits: 8
        per_group: 4096
        sym: true
    # V * attn_map (post-softmax) is quantized.
    attn_map:
        n_bits: 8
        sym: true
        block_size: 64  # usually the same as "online_sparse_block_size"
        group: "block"
    # -----------------------------



