workload,variant,status,timing_method,median_ms,mean_ms,std_ms,min_ms,xla_flops,tflops,utilization_pct,output_shape,error
1p_Flash_Attention,baseline,success,device_profiler,22.9762,22.947,0.165,22.0933,2246282575872,97.77,10.6,"[4, 64, 4096, 128]",
1p_Flash_Attention,optimized,success,device_profiler,6.5106,6.5106,0.0018,6.506,2199023255552,337.76,36.8,"[4, 64, 4096, 128]",
2p_GQA_Attention,baseline,success,device_profiler,50.3364,50.3267,0.1488,49.8333,4449607090176,88.4,9.6,"[4, 4096, 128, 128]",
2p_GQA_Attention,optimized,success,device_profiler,15.2735,15.2734,0.0018,15.2693,4398046511104,287.95,31.4,"[4, 4096, 128, 128]",
3p_MLA_Attention,baseline,success,device_profiler,19.8843,19.8692,0.052,19.7405,4464620601344,224.53,24.5,"[4, 2048, 7168]",
3p_MLA_Attention,optimized,success,device_profiler,20.6213,20.5991,0.1213,19.7932,5276837609472,255.89,27.9,"[4, 2048, 7168]",
4p_Sparse_Attention,baseline,success,device_profiler,5.6623,5.6881,0.4376,5.0326,558362001408,98.61,10.7,"[64, 4096, 128]",
4p_Sparse_Attention,optimized,success,device_profiler,6.949,6.9492,0.0017,6.9451,2199023255552,316.45,34.5,"[4, 64, 4096, 128]",
5p_Flex_Attention,baseline,success,device_profiler,37.1101,37.0655,0.2155,36.2302,2259167477760,60.88,6.6,"[4, 64, 4096, 128]",
6p_Paged_Attention,baseline,success,device_profiler,7.7828,7.7826,0.0026,7.7772,8589934592,1.1,0.1,"[64, 64, 128]",
6p_Paged_Attention,optimized,success,device_profiler,3.1142,3.1143,0.0001,3.1142,8589934592,2.76,0.3,"[64, 64, 128]",
7p_Ragged_Paged_Attention,baseline,success,device_profiler,48.4853,48.442,1.0108,46.808,549755813888,11.34,1.2,"[4096, 64, 128]",
7p_Ragged_Paged_Attention,optimized,success,device_profiler,2.9829,2.9829,0.0002,2.9825,549755813888,184.3,20.1,"[4096, 64, 128]",
8p_GEMM,baseline,success,device_profiler,5.3218,5.3222,0.0027,5.3161,3848290697216,723.12,78.8,"[8192, 28672]",
8p_GEMM,optimized,success,device_profiler,5.4052,5.4493,0.3076,5.087,3848290697216,711.96,77.6,"[8192, 28672]",
9p_SwiGLU_MLP,baseline,success,device_profiler,16.4977,16.5079,0.0294,16.4832,11546046496768,699.86,76.2,"[2, 4096, 8192]",
10p_Sparse_MoE,baseline,success,device_profiler,30.7251,30.7302,0.0412,30.5624,23095750426624,751.69,81.9,"[2, 4096, 4096]",
11p_Megablox_GMM,baseline,success,device_profiler,3.0404,3.1998,0.2479,3.0334,412316860416,135.61,14.8,"[32768, 1536]",
11p_Megablox_GMM,optimized,success,device_profiler,1.9168,1.9879,0.156,1.9045,412316860416,215.11,23.4,"[32768, 1536]",
12p_RMSNorm,baseline,success,device_profiler,1.1385,1.2658,0.2605,1.1144,1342177280,1.18,0.1,"[8, 4096, 8192]",
13p_Cross_Entropy,baseline,success,device_profiler,14.2096,14.2094,0.0031,14.199,8616570519552,606.39,66.1,[],
14p_Ragged_Dot,baseline,success,device_profiler,1.2445,1.2446,0.0013,1.2418,962072674304,773.06,84.2,"[8, 1024, 14336]",
15p_RetNet_Retention,baseline,success,device_profiler,12.2,12.1598,0.2938,10.113,1106558058496,90.7,9.9,"[4, 16, 4096, 256]",
16p_Mamba2_SSD,baseline,success,device_profiler,28.2603,28.3128,0.2571,27.6284,1138325979136,40.28,4.4,"[4, 64, 4096, 64]",
17p_Triangle_Multiplication,baseline,success,device_profiler,14.5211,14.4921,0.1556,13.4893,1397915910144,96.27,10.5,"[1536, 1536, 128]",
18k_Conv2D_ReLU_BiasAdd,baseline,success,device_profiler,1.7671,1.767,0.0011,1.7646,300429803520,170.01,18.5,"[128, 128, 126, 126]",
19k_Matmul_Subtract_Multiply_ReLU,baseline,success,device_profiler,0.6297,0.6298,0.0001,0.6297,549890031616,873.19,95.1,"[4096, 8192]",
20k_Gemm_Multiply_LeakyReLU,baseline,success,device_profiler,0.6337,0.6337,0.0001,0.6336,549923586048,867.76,94.5,"[4096, 8192]",
21k_Gemm_Divide_Sum_Scaling,baseline,success,device_profiler,0.6788,0.6788,0.0006,0.6772,549822922752,809.95,88.2,"[4096, 1]",
22k_Conv2d_InstanceNorm_Divide,baseline,success,device_profiler,2.8542,2.854,0.0017,2.8501,303291105280,106.26,11.6,"[128, 128, 126, 126]",
23k_Matmul_Sum_Max_AvgPool_LogSumExp_LogSumExp,baseline,success,device_profiler,0.7273,0.7273,0.0001,0.7271,549822922752,756.02,82.4,"[4096, 1]",
24k_Matmul_Scale_ResidualAdd_Clamp_LogSumExp_Mish,baseline,success,device_profiler,0.7905,0.7905,0.0001,0.7903,550124978176,695.9,75.8,"[4096, 1]",
25k_Conv3d_GroupNorm_Mean,baseline,success,device_profiler,0.3383,0.3384,0.0001,0.3381,10766134272,31.82,3.5,[128],
26k_BMM_InstanceNorm_Sum_ResidualAdd_Multiply,baseline,success,device_profiler,0.6856,0.6857,0.0005,0.6849,550158467072,802.47,87.4,"[4096, 8192]",
27k_Matmul_Mish_Mish,baseline,success,device_profiler,0.7829,0.7829,0.0001,0.7828,550259130368,702.85,76.6,"[4096, 8192]",
28k_ConvTranspose3d_LayerNorm_GELU_Scaling,baseline,success,device_profiler,14.7864,14.7866,0.0083,14.7622,1005074186240,67.97,7.4,"[32, 64, 32, 64, 64]",
29k_Matmul_Swish_Sum_GroupNorm,baseline,success,device_profiler,1.1193,1.1193,0.0012,1.1167,275517014016,246.14,26.8,"[8192, 4096]",
30k_Matmul_Scaling_ResidualAdd,baseline,success,device_profiler,0.644,0.644,0.0001,0.6439,549957140480,853.96,93.0,"[16384, 4096]",
31k_Gemm_BatchNorm_GELU_ReLU,baseline,success,device_profiler,3.7396,3.7506,0.0425,3.6741,2201976045568,588.83,64.1,"[16384, 8192]",
32k_Gemm_Sigmoid_LogSumExp,baseline,success,device_profiler,0.5503,0.5504,0.0005,0.5497,412669247488,749.91,81.7,[16384],
33k_Conv3d_Mish_Tanh,baseline,success,device_profiler,2.4173,2.4173,0.0017,2.4136,204409798656,84.56,9.2,"[16, 64, 30, 62, 62]",
34k_Conv2d_Activation_BatchNorm,baseline,success,device_profiler,1.5584,1.5586,0.001,1.5559,152165744640,97.64,10.6,"[64, 128, 126, 126]",
35k_Gemm_Scaling_Hardtanh_GELU,baseline,success,device_profiler,0.6636,0.6637,0.0001,0.6636,550124912640,828.96,90.3,"[4096, 8192]",
36k_Matmul_Sigmoid_Sum,baseline,success,device_profiler,0.7202,0.7202,0.0001,0.7201,549923586048,763.53,83.2,"[4096, 1]",
37k_Matmul_Swish_Scaling,baseline,success,device_profiler,0.6426,0.6426,0.0001,0.6424,549957140480,855.88,93.2,"[4096, 8192]",
38k_Matmul_Dropout_Softmax,baseline,success,device_profiler,0.8448,0.8447,0.0002,0.8443,550024249344,651.09,70.9,"[4096, 8192]",
39k_Conv2d_GELU_GlobalAvgPool,baseline,success,device_profiler,1.3322,1.3324,0.0007,1.3313,81919844352,61.49,6.7,"[128, 64]",
40k_Gemm_GroupNorm_Min_BiasAdd,baseline,success,device_profiler,1.417,1.4172,0.001,1.416,550198312960,388.29,42.3,"[1, 8192, 4096, 1]",
41k_Gemm_Add_ReLU,baseline,success,device_profiler,0.6292,0.6293,0.0001,0.6291,549822922752,873.79,95.2,"[4096, 8192]",
42k_Gemm_Max_Subtract_GELU,baseline,success,device_profiler,0.6274,0.6275,0.0001,0.6273,549822922752,876.3,95.5,"[4096, 1]",
43k_Gemm_BatchNorm_Scaling_Softmax,baseline,success,device_profiler,0.9111,0.9111,0.0004,0.9104,550124912640,603.79,65.8,"[4096, 8192]",
44k_Matmul_Divide_GELU,baseline,success,device_profiler,0.6599,0.66,0.0001,0.6599,550091358208,833.54,90.8,"[4096, 8192]",
45k_Gemm_GroupNorm_Swish_Multiply_Swish,baseline,success,device_profiler,1.4063,1.4065,0.0007,1.4056,550430048256,391.4,42.6,"[4096, 8192]",
46k_Conv2d_GroupNorm_Tanh_HardSwish_ResidualAdd_LogSumExp,baseline,success,device_profiler,1.011,1.0111,0.0012,1.0091,21467408384,21.23,2.3,"[128, 1, 126, 126]",
47k_Matmul_Add_Swish_Tanh_GELU_Hardtanh,baseline,success,device_profiler,0.695,0.6951,0.0001,0.695,550259130368,791.7,86.2,"[4096, 8192]",
48k_Matmul_BatchNorm_BiasAdd_Divide_Swish,baseline,success,device_profiler,0.6642,0.6642,0.0001,0.6641,550091358208,828.2,90.2,"[4096, 8192]",
49k_Matmul_AvgPool_GELU_Scale_Max,baseline,success,device_profiler,0.8563,0.8564,0.0007,0.8547,549843894272,642.09,69.9,[4096],
50k_Matmul_GELU_Softmax,baseline,success,device_profiler,0.8537,0.8538,0.0003,0.8532,550225575936,644.49,70.2,"[4096, 8192]",
