|
Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
|
Template performing a matrix multiply-add operation within a thread.
#include <igemm_multiply_add.h>
Public Types | |
| typedef Shape< 4, 1, 1 > | InstructionShape |
| The shape of the instruction. More... | |
| typedef ThreadGemmShape_ | ThreadGemmShape |
| Shape of the thread-level GEMM (K-by-N-by-M) More... | |
| typedef ThreadGemmShape | AccumulatorsPerThread |
| Aliased for compatibility. Will be removed in CUTLASS v2.0. More... | |
| typedef ThreadsPerWarp_ | ThreadsPerWarp |
| The number of threads per warp. More... | |
| typedef ShapeMul< ThreadGemmShape, ThreadsPerWarp >::Shape | AccumulatorsPerWarp |
| The number of accumulators per warp. More... | |
| typedef int8_t | ScalarA |
| The type for A. More... | |
| typedef Fragment< ScalarA, AccumulatorsPerThread::kW *4 > | FragmentA |
| The fragment for A. More... | |
| typedef int8_t | ScalarB |
| The type for B. More... | |
| typedef Fragment< ScalarB, AccumulatorsPerThread::kH *4 > | FragmentB |
| The fragment for B. More... | |
| typedef int | ScalarC |
| The type for C and D. More... | |
| typedef Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW > | Accumulators |
| The accumulators. More... | |
Public Member Functions | |
| CUTLASS_DEVICE | ThreadMultiplyAdd () |
| Ctor. More... | |
| CUTLASS_DEVICE void | multiply_add (FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d) |
| Multiply-add: d = a*b + c. More... | |
| typedef Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::Accumulators |
| typedef ThreadGemmShape cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::AccumulatorsPerThread |
| typedef ShapeMul<ThreadGemmShape, ThreadsPerWarp>::Shape cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::AccumulatorsPerWarp |
| typedef Fragment<ScalarA, AccumulatorsPerThread::kW * 4> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::FragmentA |
| typedef Fragment<ScalarB, AccumulatorsPerThread::kH * 4> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::FragmentB |
| typedef Shape<4, 1, 1> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::InstructionShape |
| typedef int8_t cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ScalarA |
| typedef int8_t cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ScalarB |
| typedef int cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ScalarC |
| typedef ThreadGemmShape_ cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ThreadGemmShape |
| typedef ThreadsPerWarp_ cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ThreadsPerWarp |
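| CUTLASS_DEVICE cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ThreadMultiplyAdd () | inline |
| CUTLASS_DEVICE void cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::multiply_add (FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d) | inline |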
1.8.14