-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
pcclKernel_AllReduce_RING_LL_Sum___ali_bfloat16(nccl...           NaN       0.000us           NaN       0.000us       0.000us        6.398s        43.29%        6.398s        6.398s           0 b           0 b           0 b           0 b             1  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us        2.767s        18.72%        2.767s       1.920ms           0 b           0 b           0 b           0 b          1441  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us        1.620s        10.96%        1.620s       1.445ms           0 b           0 b           0 b           0 b          1121  
void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8,...           NaN       0.000us           NaN       0.000us       0.000us        1.015s         6.87%        1.015s     604.211us           0 b           0 b           0 b           0 b          1680  
void flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel<Fl...           NaN       0.000us           NaN       0.000us       0.000us     361.437ms         2.45%     361.437ms       4.518ms           0 b           0 b           0 b           0 b            80  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     302.445ms         2.05%     302.445ms     110.786us           0 b           0 b           0 b           0 b          2730  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us     233.524ms         1.58%     233.524ms       1.460ms           0 b           0 b           0 b           0 b           160  
void flash_fwd_kernel<Flash_fwd_kernel_traits<128, 1...           NaN       0.000us           NaN       0.000us       0.000us     219.554ms         1.49%     219.554ms       1.372ms           0 b           0 b           0 b           0 b           160  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     173.768ms         1.18%     173.768ms     103.433us           0 b           0 b           0 b           0 b          1680  
void at::native::elementwise_kernel<128, 4, at::nati...           NaN       0.000us           NaN       0.000us       0.000us     160.092ms         1.08%     160.092ms     111.021us           0 b           0 b           0 b           0 b          1442  
void at::native::(anonymous namespace)::fused_dropou...           NaN       0.000us           NaN       0.000us       0.000us     129.036ms         0.87%     129.036ms     115.210us           0 b           0 b           0 b           0 b          1120  
                         Memcpy DtoD (Device -> Device)           NaN       0.000us           NaN       0.000us       0.000us     128.849ms         0.87%     128.849ms      45.707us           0 b           0 b           0 b           0 b          2819  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     107.202ms         0.73%     107.202ms     335.008us           0 b           0 b           0 b           0 b           320  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us      92.507ms         0.63%      92.507ms     190.345us           0 b           0 b           0 b           0 b           486  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us      91.448ms         0.62%      91.448ms     187.011us           0 b           0 b           0 b           0 b           489  
void at::native::elementwise_kernel<128, 2, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      79.884ms         0.54%      79.884ms     165.391us           0 b           0 b           0 b           0 b           483  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      74.600ms         0.50%      74.600ms      66.608us           0 b           0 b           0 b           0 b          1120  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      71.163ms         0.48%      71.163ms     127.076us           0 b           0 b           0 b           0 b           560  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      66.049ms         0.45%      66.049ms     206.403us           0 b           0 b           0 b           0 b           320  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      65.257ms         0.44%      65.257ms     135.108us           0 b           0 b           0 b           0 b           483  
void at::native::elementwise_kernel<128, 4, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      48.279ms         0.33%      48.279ms     100.580us           0 b           0 b           0 b           0 b           480  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      44.449ms         0.30%      44.449ms     277.804us           0 b           0 b           0 b           0 b           160  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      44.442ms         0.30%      44.442ms     138.450us           0 b           0 b           0 b           0 b           321  
void at::native::(anonymous namespace)::CatArrayBatc...           NaN       0.000us           NaN       0.000us       0.000us      39.666ms         0.27%      39.666ms     123.955us           0 b           0 b           0 b           0 b           320  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      33.100ms         0.22%      33.100ms      68.959us           0 b           0 b           0 b           0 b           480  
void at::native::reduce_kernel<128, 4, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us      32.688ms         0.22%      32.688ms      51.075us           0 b           0 b           0 b           0 b           640  
void at::native::elementwise_kernel<128, 4, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      31.628ms         0.21%      31.628ms      65.892us           0 b           0 b           0 b           0 b           480  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      30.393ms         0.21%      30.393ms     184.201us           0 b           0 b           0 b           0 b           165  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      28.379ms         0.19%      28.379ms     354.743us           0 b           0 b           0 b           0 b            80  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us      28.172ms         0.19%      28.172ms      87.492us           0 b           0 b           0 b           0 b           322  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      28.133ms         0.19%      28.133ms      70.331us           0 b           0 b           0 b           0 b           400  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      22.617ms         0.15%      22.617ms     141.359us           0 b           0 b           0 b           0 b           160  
void at::native::elementwise_kernel<128, 4, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      21.645ms         0.15%      21.645ms      45.095us           0 b           0 b           0 b           0 b           480  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      21.262ms         0.14%      21.262ms      64.823us           0 b           0 b           0 b           0 b           328  
void at::native::elementwise_kernel<128, 2, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      18.329ms         0.12%      18.329ms     113.846us           0 b           0 b           0 b           0 b           161  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      16.612ms         0.11%      16.612ms      69.217us           0 b           0 b           0 b           0 b           240  
void at::native::elementwise_kernel<128, 2, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      15.949ms         0.11%      15.949ms       9.493us           0 b           0 b           0 b           0 b          1680  
void kDequantizeBlockwise<float, 512, 64, 8, 0>(floa...           NaN       0.000us           NaN       0.000us       0.000us      15.511ms         0.10%      15.511ms       9.233us           0 b           0 b           0 b           0 b          1680  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us      14.420ms         0.10%      14.420ms      89.567us           0 b           0 b           0 b           0 b           161  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us      13.193ms         0.09%      13.193ms      40.221us           0 b           0 b           0 b           0 b           328  
void flash_bwd_dot_do_o_kernel<true, Flash_bwd_kerne...           NaN       0.000us           NaN       0.000us       0.000us      12.622ms         0.09%      12.622ms     157.773us           0 b           0 b           0 b           0 b            80  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us      10.511ms         0.07%      10.511ms     131.389us           0 b           0 b           0 b           0 b            80  
void flash_bwd_convert_dq_kernel<Flash_bwd_kernel_tr...           NaN       0.000us           NaN       0.000us       0.000us      10.263ms         0.07%      10.263ms     128.288us           0 b           0 b           0 b           0 b            80  
pcclKernel_AllGather_RING_LL_Sum_int8_t(ncclWorkElem...           NaN       0.000us           NaN       0.000us       0.000us       8.106ms         0.05%       8.106ms       8.106ms           0 b           0 b           0 b           0 b             1  
reduction_dtypeFP32xBF16_align8_fusion0_prefetch0_co...           NaN       0.000us           NaN       0.000us       0.000us       7.217ms         0.05%       7.217ms       4.296us           0 b           0 b           0 b           0 b          1680  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us       3.438ms         0.02%       3.438ms      10.743us           0 b           0 b           0 b           0 b           320  
void at::native::(anonymous namespace)::cunn_SoftMax...           NaN       0.000us           NaN       0.000us       0.000us       2.342ms         0.02%       2.342ms       2.342ms           0 b           0 b           0 b           0 b             1  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us       2.142ms         0.01%       2.142ms      13.387us           0 b           0 b           0 b           0 b           160  
void at::native::(anonymous namespace)::CatArrayBatc...           NaN       0.000us           NaN       0.000us       0.000us       1.665ms         0.01%       1.665ms     554.897us           0 b           0 b           0 b           0 b             3  
void at::native::(anonymous namespace)::cunn_SoftMax...           NaN       0.000us           NaN       0.000us       0.000us       1.662ms         0.01%       1.662ms       1.662ms           0 b           0 b           0 b           0 b             1  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us       1.487ms         0.01%       1.487ms       8.750us           0 b           0 b           0 b           0 b           170  
gemm_ktype0_aiu1_mtype1_dtypeBF16xBF16xFP32xBF16_til...           NaN       0.000us           NaN       0.000us       0.000us       1.332ms         0.01%       1.332ms       8.327us           0 b           0 b           0 b           0 b           160  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us       1.213ms         0.01%       1.213ms       7.135us           0 b           0 b           0 b           0 b           170  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us       1.063ms         0.01%       1.063ms       6.644us           0 b           0 b           0 b           0 b           160  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     811.206us         0.01%     811.206us     811.206us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     677.207us         0.00%     677.207us       2.110us           0 b           0 b           0 b           0 b           321  
pcclKernel_AllReduce_RING_LL_Sum_float(ncclWorkElem,...           NaN       0.000us           NaN       0.000us       0.000us     649.837us         0.00%     649.837us     649.837us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     617.843us         0.00%     617.843us       1.919us           0 b           0 b           0 b           0 b           322  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     563.638us         0.00%     563.638us     187.879us           0 b           0 b           0 b           0 b             3  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     507.718us         0.00%     507.718us      84.620us           0 b           0 b           0 b           0 b             6  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     501.998us         0.00%     501.998us     167.333us           0 b           0 b           0 b           0 b             3  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     423.918us         0.00%     423.918us     141.306us           0 b           0 b           0 b           0 b             3  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     382.405us         0.00%     382.405us       2.375us           0 b           0 b           0 b           0 b           161  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     368.721us         0.00%     368.721us       2.290us           0 b           0 b           0 b           0 b           161  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us     353.244us         0.00%     353.244us       2.208us           0 b           0 b           0 b           0 b           160  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     305.439us         0.00%     305.439us     101.813us           0 b           0 b           0 b           0 b             3  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     285.839us         0.00%     285.839us      95.280us           0 b           0 b           0 b           0 b             3  
void at::native::(anonymous namespace)::multi_tensor...           NaN       0.000us           NaN       0.000us       0.000us     275.159us         0.00%     275.159us      91.720us           0 b           0 b           0 b           0 b             3  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us     227.199us         0.00%     227.199us      75.733us           0 b           0 b           0 b           0 b             3  
void at::native::(anonymous namespace)::indexSelectL...           NaN       0.000us           NaN       0.000us       0.000us     211.962us         0.00%     211.962us     211.962us           0 b           0 b           0 b           0 b             1  
                                        Memset (Device)           NaN       0.000us           NaN       0.000us       0.000us      75.360us         0.00%      75.360us       0.209us           0 b           0 b           0 b           0 b           360  
                         Memcpy DtoH (Device -> Pinned)           NaN       0.000us           NaN       0.000us       0.000us      70.721us         0.00%      70.721us       0.437us           0 b           0 b           0 b           0 b           162  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us      19.720us         0.00%      19.720us       9.860us           0 b           0 b           0 b           0 b             2  
void at::native::elementwise_kernel<128, 4, at::nati...           NaN       0.000us           NaN       0.000us       0.000us      16.320us         0.00%      16.320us      16.320us           0 b           0 b           0 b           0 b             1  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us      14.160us         0.00%      14.160us       7.080us           0 b           0 b           0 b           0 b             2  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us      13.400us         0.00%      13.400us      13.400us           0 b           0 b           0 b           0 b             1  
void at::native::(anonymous namespace)::CatArrayBatc...           NaN       0.000us           NaN       0.000us       0.000us      12.440us         0.00%      12.440us      12.440us           0 b           0 b           0 b           0 b             1  
void at::native::(anonymous namespace)::nll_loss_for...           NaN       0.000us           NaN       0.000us       0.000us      11.560us         0.00%      11.560us      11.560us           0 b           0 b           0 b           0 b             1  
void at::native::(anonymous namespace)::CatArrayBatc...           NaN       0.000us           NaN       0.000us       0.000us      11.200us         0.00%      11.200us       5.600us           0 b           0 b           0 b           0 b             2  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       8.720us         0.00%       8.720us       1.744us           0 b           0 b           0 b           0 b             5  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us       8.681us         0.00%       8.681us       4.341us           0 b           0 b           0 b           0 b             2  
void at::native::(anonymous namespace)::nll_loss_bac...           NaN       0.000us           NaN       0.000us       0.000us       8.320us         0.00%       8.320us       8.320us           0 b           0 b           0 b           0 b             1  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us       7.480us         0.00%       7.480us       7.480us           0 b           0 b           0 b           0 b             1  
void at::native::unrolled_elementwise_kernel<at::nat...           NaN       0.000us           NaN       0.000us       0.000us       6.600us         0.00%       6.600us       3.300us           0 b           0 b           0 b           0 b             2  
void at::native::reduce_kernel<512, 1, at::native::R...           NaN       0.000us           NaN       0.000us       0.000us       6.321us         0.00%       6.321us       6.321us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       5.760us         0.00%       5.760us       5.760us           0 b           0 b           0 b           0 b             1  
_ZN7cutlass6KernelINS_4gemm6kernel4GemmINS1_11thread...           NaN       0.000us           NaN       0.000us       0.000us       5.560us         0.00%       5.560us       5.560us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       5.400us         0.00%       5.400us       5.400us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       2.800us         0.00%       2.800us       1.400us           0 b           0 b           0 b           0 b             2  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       2.760us         0.00%       2.760us       2.760us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       2.480us         0.00%       2.480us       2.480us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       2.200us         0.00%       2.200us       2.200us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       2.080us         0.00%       2.080us       2.080us           0 b           0 b           0 b           0 b             1  
void (anonymous namespace)::elementwise_kernel_with_...           NaN       0.000us           NaN       0.000us       0.000us       1.880us         0.00%       1.880us       1.880us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       1.880us         0.00%       1.880us       1.880us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       1.880us         0.00%       1.880us       1.880us           0 b           0 b           0 b           0 b             1  
                       Memcpy HtoD (Pageable -> Device)           NaN       0.000us           NaN       0.000us       0.000us       1.680us         0.00%       1.680us       1.680us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       1.680us         0.00%       1.680us       1.680us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       1.560us         0.00%       1.560us       1.560us           0 b           0 b           0 b           0 b             1  
void at::native::vectorized_elementwise_kernel<4, at...           NaN       0.000us           NaN       0.000us       0.000us       1.440us         0.00%       1.440us       1.440us           0 b           0 b           0 b           0 b             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CUDA time total: 14.778s
