seq_len:  2544
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us        1.506s        84.13%        1.506s     376.451ms             4  
                                        model_inference        69.22%        1.746s       100.00%        2.523s        2.523s       0.000us         0.00%     312.491ms     312.491ms             1  
                                           MatMul8bitLt         4.80%     121.044ms        22.59%     569.927ms       2.544ms     111.648ms         6.24%     197.185ms     880.290us           224  
                                            aten::copy_         0.54%      13.735ms         1.60%      40.435ms      18.531us      72.590ms         4.06%      76.601ms      35.106us          2182  
                                         aten::_to_copy         0.26%       6.456ms         1.78%      44.915ms      34.418us       0.000us         0.00%      66.040ms      50.605us          1305  
                                               aten::to         0.11%       2.788ms         1.85%      46.657ms      16.711us       0.000us         0.00%      65.686ms      23.527us          2792  
ampere_int32_i16832gemm_int8_128x128_ldg16_stages_64...         0.00%       0.000us         0.00%       0.000us       0.000us      63.784ms         3.56%      63.784ms     284.750us           224  
                                            aten::fill_         0.57%      14.413ms         1.05%      26.580ms      10.700us      25.700ms         1.44%      29.386ms      11.830us          2484  
                         Memcpy HtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      28.864ms         1.61%      28.864ms     721.600us            40  
                         Memcpy DtoH (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      28.856ms         1.61%      28.856ms     721.400us            40  
                                            aten::zero_         0.23%       5.881ms         1.06%      26.797ms      13.162us       0.000us         0.00%      27.692ms      13.601us          2036  
                                            aten::zeros         0.31%       7.881ms         2.41%      60.790ms      29.858us       0.000us         0.00%      26.993ms      13.258us          2036  
                                              aten::mul         0.26%       6.477ms         0.59%      14.867ms      32.110us      15.899ms         0.89%      17.106ms      36.946us           463  
void kdequant_mm_int32_fp16<4, 128, 512>(int*, float...         0.00%       0.000us         0.00%       0.000us       0.000us      15.700ms         0.88%      15.700ms      70.089us           224  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.545ms         0.87%      15.545ms      14.542us          1069  
                                           aten::matmul         0.04%       1.042ms         4.31%     108.842ms     528.359us       0.000us         0.00%      15.065ms      73.131us           206  
                                               aten::mm         0.28%       7.130ms         4.09%     103.216ms     593.195us       9.625ms         0.54%      15.032ms      86.391us           174  
                     aten::scaled_dot_product_attention         0.02%     380.000us         0.26%       6.525ms     203.906us       0.000us         0.00%      14.465ms     452.031us            32  
              aten::_scaled_dot_product_flash_attention         0.02%     561.000us         0.24%       6.145ms     192.031us       0.000us         0.00%      14.465ms     452.031us            32  
                         aten::_flash_attention_forward         0.03%     772.000us         0.21%       5.319ms     166.219us      13.958ms         0.78%      14.465ms     452.031us            32  
void pytorch_flash::flash_fwd_kernel<pytorch_flash::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.958ms         0.78%      13.958ms     436.188us            32  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.316ms         0.69%      12.316ms      51.966us           237  
                                       cudaLaunchKernel         9.45%     238.285ms         9.45%     238.285ms      26.666us      11.257ms         0.63%      11.257ms       1.260us          8936  
void kTransformRowToFormat<256, 8, 32, 256, 0, 4>(ch...         0.00%       0.000us         0.00%       0.000us       0.000us      10.737ms         0.60%      10.737ms      47.933us           224  
                                             aten::add_         0.08%       2.049ms         0.12%       2.919ms      16.873us       9.994ms         0.56%      10.000ms      57.803us           173  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       9.862ms         0.55%       9.862ms      12.421us           794  
                                            aten::clone         0.08%       1.947ms         0.58%      14.734ms      40.928us       0.000us         0.00%       9.586ms      26.628us           360  
void kgetColRowStats<__half, 64, 4, 16, 256, 1>(__ha...         0.00%       0.000us         0.00%       0.000us       0.000us       9.451ms         0.53%       9.451ms      42.192us           224  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       9.026ms         0.50%       9.026ms      12.181us           741  
                                    cudaPeekAtLastError         0.00%       0.000us         0.00%       0.000us       0.000us       6.116ms         0.34%       6.116ms       1.889us          3237  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.795ms         0.32%       5.795ms     181.094us            32  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.559ms         0.31%       5.559ms      28.803us           193  
                                              aten::add         0.11%       2.765ms         0.51%      12.793ms      66.285us       4.836ms         0.27%       5.322ms      27.575us           193  
void kDoubleRowColQuant<64, 4, 16, 256, 1>(__half*, ...         0.00%       0.000us         0.00%       0.000us       0.000us       5.071ms         0.28%       5.071ms      29.312us           173  
                                   cudaFuncSetAttribute         2.38%      59.985ms         2.38%      59.985ms      78.720us       4.730ms         0.26%       4.730ms       6.207us           762  
                                           aten::linear         0.00%       4.000us         0.35%       8.894ms       8.894ms       0.000us         0.00%       4.277ms       4.277ms             1  
void cutlass::Kernel<cutlass_80_tensorop_f16_s16816g...         0.00%       0.000us         0.00%       0.000us       0.000us       4.277ms         0.24%       4.277ms       4.277ms             1  
void kTransformRowToFormat<256, 8, 32, 256, 0, 2>(ch...         0.00%       0.000us         0.00%       0.000us       0.000us       4.100ms         0.23%       4.100ms      18.304us           224  
void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       3.984ms         0.22%       3.984ms      61.292us            65  
                                             aten::sort         0.21%       5.361ms         2.06%      51.866ms     299.803us       1.722ms         0.10%       3.288ms      19.006us           173  
                                             aten::silu         0.02%     582.000us         0.20%       4.936ms     154.250us       3.201ms         0.18%       3.252ms     101.625us            32  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.201ms         0.18%       3.201ms     100.031us            32  
                                              aten::pow         0.06%       1.589ms         1.04%      26.335ms     405.154us       3.067ms         0.17%       3.142ms      48.338us            65  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.067ms         0.17%       3.067ms      47.185us            65  
                                 aten::_index_put_impl_         0.20%       5.098ms         0.71%      17.831ms      51.535us       1.272ms         0.07%       2.964ms       8.566us           346  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       2.950ms         0.16%       2.950ms      30.102us            98  
                                         aten::_unique2         0.31%       7.766ms         1.04%      26.175ms     151.301us       2.108ms         0.12%       2.903ms      16.780us           173  
                                       aten::index_put_         0.06%       1.427ms         0.74%      18.744ms      54.173us       0.000us         0.00%       2.829ms       8.176us           346  
                                            aten::index         0.33%       8.401ms         0.98%      24.769ms      47.724us       1.977ms         0.11%       2.616ms       5.040us           519  
                                        cudaMemcpyAsync         0.54%      13.692ms         0.54%      13.692ms       8.068us       2.548ms         0.14%       2.548ms       1.501us          1697  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.449ms         0.14%       2.449ms      38.266us            64  
                                              aten::cat         0.07%       1.649ms         0.22%       5.659ms      58.948us       2.398ms         0.13%       2.434ms      25.354us            96  
                                          aten::cumsum_         0.22%       5.573ms         0.99%      24.932ms     111.304us     466.000us         0.03%       2.346ms      10.473us           224  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us       2.287ms         0.13%       2.287ms      35.734us            64  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       2.060ms         0.12%       2.060ms       6.821us           302  
void at_cuda_detail::cub::DeviceRadixSortSingleTileK...         0.00%       0.000us         0.00%       0.000us       0.000us       1.759ms         0.10%       1.759ms      10.227us           172  
void kDoubleRowColQuant<64, 4, 16, 256, 0>(__half*, ...         0.00%       0.000us         0.00%       0.000us       0.000us       1.625ms         0.09%       1.625ms      31.863us            51  
                       ampere_fp16_sgemm_fp16_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.579ms         0.09%       1.579ms      26.763us            59  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.540ms         0.09%       1.540ms       4.451us           346  
void gemmk1_kernel<int, float, 256, 5, false, false,...         0.00%       0.000us         0.00%       0.000us       0.000us       1.492ms         0.08%       1.492ms      40.324us            37  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.272ms         0.07%       1.272ms       3.676us           346  
                                             aten::mean         0.05%       1.352ms         0.35%       8.867ms     136.415us       1.172ms         0.07%       1.210ms      18.615us            65  
void kExtractOutliers<4>(char*, int*, char*, int, in...         0.00%       0.000us         0.00%       0.000us       0.000us       1.180ms         0.07%       1.180ms       6.821us           173  
void at::native::reduce_kernel<512, 1, at::native::R...         0.00%       0.000us         0.00%       0.000us       0.000us       1.172ms         0.07%       1.172ms      18.031us            65  
                               aten::cross_entropy_loss         0.00%      40.000us         0.49%      12.271ms      12.271ms       0.000us         0.00%       1.135ms       1.135ms             1  
                                      aten::log_softmax         0.00%      12.000us         0.33%       8.344ms       8.344ms       0.000us         0.00%       1.086ms       1.086ms             1  
                                     aten::_log_softmax         0.00%      47.000us         0.33%       8.332ms       8.332ms       1.086ms         0.06%       1.086ms       1.086ms             1  
void at::native::(anonymous namespace)::cunn_SoftMax...         0.00%       0.000us         0.00%       0.000us       0.000us       1.086ms         0.06%       1.086ms       1.086ms             1  
void at::native::radixSortKVInPlace<-2, -1, 128, 32,...         0.00%       0.000us         0.00%       0.000us       0.000us       1.023ms         0.06%       1.023ms      15.984us            64  
                                  cudaStreamIsCapturing         0.01%     367.000us         0.01%     367.000us       0.666us     957.000us         0.05%     957.000us       1.737us           551  
                                              aten::neg         0.03%     841.000us         0.37%       9.433ms     147.391us     578.000us         0.03%     895.000us      13.984us            64  
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.56%      39.413ms         1.56%      39.413ms      49.328us     859.000us         0.05%     859.000us       1.075us           799  
                                 cudaDeviceGetAttribute         0.00%       5.000us         0.00%       5.000us       0.007us     685.000us         0.04%     685.000us       1.024us           669  
void cutlass::Kernel<cutlass_80_tensorop_f16_s16816g...         0.00%       0.000us         0.00%       0.000us       0.000us     631.000us         0.04%     631.000us      45.071us            14  
                                  cudaStreamSynchronize         1.12%      28.201ms         1.12%      28.201ms      30.787us     620.000us         0.03%     620.000us       0.677us           916  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     578.000us         0.03%     578.000us       9.031us            64  
void cutlass::Kernel<cutlass_80_wmma_tensorop_f16_s1...         0.00%       0.000us         0.00%       0.000us       0.000us     527.000us         0.03%     527.000us      21.958us            24  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     520.000us         0.03%     520.000us       1.002us           519  
                                    aten::empty_strided         0.34%       8.655ms         0.63%      15.929ms      10.204us       0.000us         0.00%     510.000us       0.327us          1561  
                                             aten::item         0.06%       1.388ms         1.46%      36.838ms      64.628us       0.000us         0.00%     506.000us       0.888us           570  
                              aten::_local_scalar_dense         0.16%       3.941ms         1.43%      36.052ms      63.249us     144.000us         0.01%     506.000us       0.888us           570  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     460.000us         0.03%     460.000us       3.382us           136  
                                             cudaMalloc         2.08%      52.370ms         2.08%      52.370ms      98.625us     453.000us         0.03%     453.000us       0.853us           531  
void at_cuda_detail::cub::DeviceScanKernel<at_cuda_d...         0.00%       0.000us         0.00%       0.000us       0.000us     453.000us         0.03%     453.000us       2.022us           224  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     437.000us         0.02%     437.000us       2.526us           173  
void cutlass::Kernel<cutlass_75_tensorop_f16_s1688ge...         0.00%       0.000us         0.00%       0.000us       0.000us     396.000us         0.02%     396.000us      26.400us            15  
                                            aten::empty         0.96%      24.143ms         2.00%      50.361ms      11.294us       0.000us         0.00%     331.000us       0.074us          4459  
                                       aten::contiguous         0.01%     219.000us         0.15%       3.751ms      27.581us       0.000us         0.00%     328.000us       2.412us           136  
void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     314.000us         0.02%     314.000us       2.309us           136  
                                   cudaGetSymbolAddress         0.01%     139.000us         0.01%     139.000us      69.500us     297.000us         0.02%     297.000us     148.500us             2  
void cutlass::Kernel<cutlass_80_tensorop_f16_s16816g...         0.00%       0.000us         0.00%       0.000us       0.000us     291.000us         0.02%     291.000us      48.500us             6  
void at::native::radixSortKVInPlace<-2, -1, 32, 32, ...         0.00%       0.000us         0.00%       0.000us       0.000us     283.000us         0.02%     283.000us      12.864us            22  
                                              aten::div         0.08%       2.100ms         0.48%      12.057ms      69.694us     193.000us         0.01%     281.000us       1.624us           173  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     269.000us         0.02%     269.000us       0.600us           448  
void at_cuda_detail::cub::DeviceSelectSweepKernel<at...         0.00%       0.000us         0.00%       0.000us       0.000us     246.000us         0.01%     246.000us       1.422us           173  
                                           aten::arange         0.09%       2.220ms         0.46%      11.503ms      33.246us      32.000us         0.00%     240.000us       0.694us           346  
                                              aten::sin         0.01%     340.000us         0.07%       1.695ms      52.969us      34.000us         0.00%     240.000us       7.500us            32  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     194.000us         0.01%     194.000us       1.121us           173  
ampere_fp16_s1688gemm_fp16_128x128_ldg8_f2f_stages_3...         0.00%       0.000us         0.00%       0.000us       0.000us     191.000us         0.01%     191.000us      15.917us            12  
void at::native::bitonicSortKVInPlace<-2, -1, 16, 16...         0.00%       0.000us         0.00%       0.000us       0.000us     181.000us         0.01%     181.000us       3.017us            60  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.523s
Self CUDA time total: 1.790s

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        69.22%        1.746s       100.00%        2.523s        2.523s       0.000us         0.00%     312.491ms     312.491ms             1  
                                               aten::to         0.11%       2.788ms         1.85%      46.657ms      16.711us       0.000us         0.00%      65.686ms      23.527us          2792  
                                        aten::embedding         0.00%     109.000us         1.86%      46.809ms      46.809ms       0.000us         0.00%      54.000us      54.000us             1  
                                          aten::reshape         0.06%       1.586ms         0.09%       2.306ms       1.997us       0.000us         0.00%       0.000us       0.000us          1155  
                                             aten::view         0.10%       2.490ms         0.10%       2.490ms       1.342us       0.000us         0.00%       0.000us       0.000us          1855  
                                     aten::index_select         0.07%       1.870ms         1.85%      46.680ms      46.680ms      54.000us         0.00%      54.000us      54.000us             1  
                                            aten::empty         0.96%      24.143ms         2.00%      50.361ms      11.294us       0.000us         0.00%     331.000us       0.074us          4459  
                                          aten::resize_         0.05%       1.200ms         0.05%       1.329ms       3.819us       0.000us         0.00%       0.000us       0.000us           348  
                                  cudaStreamIsCapturing         0.01%     367.000us         0.01%     367.000us       0.666us     957.000us         0.05%     957.000us       1.737us           551  
                                             cudaMalloc         2.08%      52.370ms         2.08%      52.370ms      98.625us     453.000us         0.03%     453.000us       0.853us           531  
                                       cudaLaunchKernel         9.45%     238.285ms         9.45%     238.285ms      26.666us      11.257ms         0.63%      11.257ms       1.260us          8936  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us        1.506s        84.13%        1.506s     376.451ms             4  
void at::native::(anonymous namespace)::indexSelectL...         0.00%       0.000us         0.00%       0.000us       0.000us      54.000us         0.00%      54.000us      54.000us             1  
                                           aten::arange         0.09%       2.220ms         0.46%      11.503ms      33.246us      32.000us         0.00%     240.000us       0.694us           346  
void (anonymous namespace)::elementwise_kernel_with_...         0.00%       0.000us         0.00%       0.000us       0.000us      32.000us         0.00%      32.000us       0.185us           173  
                                        aten::unsqueeze         0.02%     464.000us         0.02%     509.000us       3.161us       0.000us         0.00%       0.000us       0.000us           161  
                                       aten::as_strided         0.03%     846.000us         0.03%     846.000us       0.310us       0.000us         0.00%       0.000us       0.000us          2727  
                                         aten::_to_copy         0.26%       6.456ms         1.78%      44.915ms      34.418us       0.000us         0.00%      66.040ms      50.605us          1305  
                                    aten::empty_strided         0.34%       8.655ms         0.63%      15.929ms      10.204us       0.000us         0.00%     510.000us       0.327us          1561  
                                            aten::copy_         0.54%      13.735ms         1.60%      40.435ms      18.531us      72.590ms         4.06%      76.601ms      35.106us          2182  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       2.950ms         0.16%       2.950ms      30.102us            98  
                                              aten::pow         0.06%       1.589ms         1.04%      26.335ms     405.154us       3.067ms         0.17%       3.142ms      48.338us            65  
                                      aten::result_type         0.00%       1.000us         0.00%       1.000us       0.015us       0.000us         0.00%       0.000us       0.000us            65  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.067ms         0.17%       3.067ms      47.185us            65  
                                             aten::mean         0.05%       1.352ms         0.35%       8.867ms     136.415us       1.172ms         0.07%       1.210ms      18.615us            65  
void at::native::reduce_kernel<512, 1, at::native::R...         0.00%       0.000us         0.00%       0.000us       0.000us       1.172ms         0.07%       1.172ms      18.031us            65  
                                              aten::add         0.11%       2.765ms         0.51%      12.793ms      66.285us       4.836ms         0.27%       5.322ms      27.575us           193  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.000us         0.00%      65.000us       1.000us            65  
                                            aten::rsqrt         0.04%     985.000us         0.59%      14.826ms     228.092us      65.000us         0.00%     142.000us       2.185us            65  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.000us         0.00%      65.000us       1.000us            65  
                                              aten::mul         0.26%       6.477ms         0.59%      14.867ms      32.110us      15.899ms         0.89%      17.106ms      36.946us           463  
void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       3.984ms         0.22%       3.984ms      61.292us            65  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       2.060ms         0.12%       2.060ms       6.821us           302  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.559ms         0.31%       5.559ms      28.803us           193  
                                           MatMul8bitLt         4.80%     121.044ms        22.59%     569.927ms       2.544ms     111.648ms         6.24%     197.185ms     880.290us           224  
                                            aten::fill_         0.57%      14.413ms         1.05%      26.580ms      10.700us      25.700ms         1.44%      29.386ms      11.830us          2484  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     269.000us         0.02%     269.000us       0.600us           448  
                                            aten::zeros         0.31%       7.881ms         2.41%      60.790ms      29.858us       0.000us         0.00%      26.993ms      13.258us          2036  
                                            aten::zero_         0.23%       5.881ms         1.06%      26.797ms      13.162us       0.000us         0.00%      27.692ms      13.601us          2036  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       9.862ms         0.55%       9.862ms      12.421us           794  
void kgetColRowStats<__half, 64, 4, 16, 256, 1>(__ha...         0.00%       0.000us         0.00%       0.000us       0.000us       9.451ms         0.53%       9.451ms      42.192us           224  
                                    cudaPeekAtLastError         0.00%       0.000us         0.00%       0.000us       0.000us       6.116ms         0.34%       6.116ms       1.889us          3237  
                                          aten::cumsum_         0.22%       5.573ms         0.99%      24.932ms     111.304us     466.000us         0.03%       2.346ms      10.473us           224  
                                     cudaGetDeviceCount         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us             1  
                                  cudaFuncGetAttributes         0.76%      19.255ms         0.76%      19.255ms     310.565us      17.000us         0.00%      64.000us       1.032us            62  
void at_cuda_detail::cub::DeviceScanInitKernel<at_cu...         0.00%       0.000us         0.00%       0.000us       0.000us      13.000us         0.00%      13.000us       0.058us           224  
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         1.56%      39.413ms         1.56%      39.413ms      49.328us     859.000us         0.05%     859.000us       1.075us           799  
                                 cudaDeviceGetAttribute         0.00%       5.000us         0.00%       5.000us       0.007us     685.000us         0.04%     685.000us       1.024us           669  
void at_cuda_detail::cub::DeviceScanKernel<at_cuda_d...         0.00%       0.000us         0.00%       0.000us       0.000us     453.000us         0.03%     453.000us       2.022us           224  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.545ms         0.87%      15.545ms      14.542us          1069  
                                           aten::select         0.07%       1.711ms         0.08%       1.948ms       4.907us       0.000us         0.00%       0.000us       0.000us           397  
                                             aten::item         0.06%       1.388ms         1.46%      36.838ms      64.628us       0.000us         0.00%     506.000us       0.888us           570  
                              aten::_local_scalar_dense         0.16%       3.941ms         1.43%      36.052ms      63.249us     144.000us         0.01%     506.000us       0.888us           570  
                                          cudaHostAlloc         0.04%     896.000us         0.04%     896.000us     448.000us       0.000us         0.00%       0.000us       0.000us             2  
                                        cudaMemcpyAsync         0.54%      13.692ms         0.54%      13.692ms       8.068us       2.548ms         0.14%       2.548ms       1.501us          1697  
                                  cudaStreamSynchronize         1.12%      28.201ms         1.12%      28.201ms      30.787us     620.000us         0.03%     620.000us       0.677us           916  
                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     144.000us         0.01%     144.000us       0.253us           570  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.000us         0.00%      24.000us       0.139us           173  
void kDoubleRowColQuant<64, 4, 16, 256, 1>(__half*, ...         0.00%       0.000us         0.00%       0.000us       0.000us       5.071ms         0.28%       5.071ms      29.312us           173  
                                             aten::sort         0.21%       5.361ms         2.06%      51.866ms     299.803us       1.722ms         0.10%       3.288ms      19.006us           173  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       9.026ms         0.50%       9.026ms      12.181us           741  
void at::native::bitonicSortKVInPlace<-2, -1, 16, 16...         0.00%       0.000us         0.00%       0.000us       0.000us     181.000us         0.01%     181.000us       3.017us            60  
                                            aten::index         0.33%       8.401ms         0.98%      24.769ms      47.724us       1.977ms         0.11%       2.616ms       5.040us           519  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     437.000us         0.02%     437.000us       2.526us           173  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.540ms         0.09%       1.540ms       4.451us           346  
void kTransformRowToFormat<256, 8, 32, 256, 0, 4>(ch...         0.00%       0.000us         0.00%       0.000us       0.000us      10.737ms         0.60%      10.737ms      47.933us           224  
                                         aten::_unique2         0.31%       7.766ms         1.04%      26.175ms     151.301us       2.108ms         0.12%       2.903ms      16.780us           173  
void at_cuda_detail::cub::DeviceRadixSortSingleTileK...         0.00%       0.000us         0.00%       0.000us       0.000us       1.759ms         0.10%       1.759ms      10.227us           172  
void at_cuda_detail::cub::DeviceCompactInitKernel<at...         0.00%       0.000us         0.00%       0.000us       0.000us      74.000us         0.00%      74.000us       0.428us           173  
void at_cuda_detail::cub::DeviceSelectSweepKernel<at...         0.00%       0.000us         0.00%       0.000us       0.000us     246.000us         0.01%     246.000us       1.422us           173  
void kExtractOutliers<4>(char*, int*, char*, int, in...         0.00%       0.000us         0.00%       0.000us       0.000us       1.180ms         0.07%       1.180ms       6.821us           173  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     101.000us         0.01%     101.000us       2.730us            37  
                                              aten::div         0.08%       2.100ms         0.48%      12.057ms      69.694us     193.000us         0.01%     281.000us       1.624us           173  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     194.000us         0.01%     194.000us       1.121us           173  
                                                aten::t         0.02%     486.000us         0.04%       1.133ms       6.511us       0.000us         0.00%       0.000us       0.000us           174  
                                        aten::transpose         0.05%       1.357ms         0.06%       1.589ms       3.439us       0.000us         0.00%       0.000us       0.000us           462  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     520.000us         0.03%     520.000us       1.002us           519  
                                       aten::lift_fresh         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           346  
                                            aten::slice         0.12%       3.077ms         0.13%       3.222ms       4.319us       0.000us         0.00%       0.000us       0.000us           746  
                                       aten::index_put_         0.06%       1.427ms         0.74%      18.744ms      54.173us       0.000us         0.00%       2.829ms       8.176us           346  
                                 aten::_index_put_impl_         0.20%       5.098ms         0.71%      17.831ms      51.535us       1.272ms         0.07%       2.964ms       8.566us           346  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           346  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.272ms         0.07%       1.272ms       3.676us           346  
void kTransformRowToFormat<256, 8, 32, 256, 0, 2>(ch...         0.00%       0.000us         0.00%       0.000us       0.000us       4.100ms         0.23%       4.100ms      18.304us           224  
                                               cudaFree         0.93%      23.416ms         0.93%      23.416ms       3.903ms       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaGetSymbolAddress         0.01%     139.000us         0.01%     139.000us      69.500us     297.000us         0.02%     297.000us     148.500us             2  
                                   cudaFuncSetAttribute         2.38%      59.985ms         2.38%      59.985ms      78.720us       4.730ms         0.26%       4.730ms       6.207us           762  
          cudaOccupancyMaxActiveBlocksPerMultiprocessor         0.14%       3.548ms         0.14%       3.548ms      95.892us       6.000us         0.00%       6.000us       0.162us            37  
ampere_int32_i16832gemm_int8_128x128_ldg16_stages_64...         0.00%       0.000us         0.00%       0.000us       0.000us      63.784ms         3.56%      63.784ms     284.750us           224  
void kdequant_mm_int32_fp16<4, 128, 512>(int*, float...         0.00%       0.000us         0.00%       0.000us       0.000us      15.700ms         0.88%      15.700ms      70.089us           224  
                                           aten::matmul         0.04%       1.042ms         4.31%     108.842ms     528.359us       0.000us         0.00%      15.065ms      73.131us           206  
                                               aten::mm         0.28%       7.130ms         4.09%     103.216ms     593.195us       9.625ms         0.54%      15.032ms      86.391us           174  
void gemmk1_kernel<int, float, 256, 5, false, false,...         0.00%       0.000us         0.00%       0.000us       0.000us       1.492ms         0.08%       1.492ms      40.324us            37  
                                             aten::add_         0.08%       2.049ms         0.12%       2.919ms      16.873us       9.994ms         0.56%      10.000ms      57.803us           173  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.316ms         0.69%      12.316ms      51.966us           237  
                                            aten::clone         0.08%       1.947ms         0.58%      14.734ms      40.928us       0.000us         0.00%       9.586ms      26.628us           360  
                aten::_has_compatible_shallow_copy_type         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           224  
                                           aten::expand         0.01%     320.000us         0.01%     328.000us       3.417us       0.000us         0.00%       0.000us       0.000us            96  
                                              aten::bmm         0.04%     980.000us         0.17%       4.185ms     130.781us      33.000us         0.00%      33.000us       1.031us            32  
void gemmk1_kernel<int, float, 256, 5, false, false,...         0.00%       0.000us         0.00%       0.000us       0.000us      33.000us         0.00%      33.000us       1.031us            32  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.523s
Self CUDA time total: 1.790s

