41 template <
typename ThreadGemmShape_,
typename ThreadsPerWarp_>
75 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610)
77 int const* a_int = reinterpret_cast<int const*>(&a[0]);
78 int const* b_int = reinterpret_cast<int const*>(&b[0]);
80 for (int j = 0; j < AccumulatorsPerThread::kH; ++j) {
81 for (int i = 0; i < AccumulatorsPerThread::kW; ++i) {
82 asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
83 : "=r"(d[j * AccumulatorsPerThread::kW + i])
84 : "r"(a_int[i]), "r"(b_int[j]), "r"(c[j * AccumulatorsPerThread::kW + i]));
Fragment< ScalarA, AccumulatorsPerThread::kW *4 > FragmentA
The fragment for A.
Definition: igemm_multiply_add.h:56
Shape< A_::kD *B_::kD, A_::kH *B_::kH, A_::kW *B_::kW, A_::kC *B_::kC > Shape
Definition: shape.h:119
A template defining Fragment Concept.
Definition: fragment.h:99
Template implementing matrix multiply-add operations on fragments.
CUTLASS_DEVICE ThreadMultiplyAdd()
Constructor.
Definition: igemm_multiply_add.h:67
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Multiply-add: d = a*b + c.
Definition: igemm_multiply_add.h:70
int ScalarC
The type for C and D.
Definition: igemm_multiply_add.h:62
Shape< 4, 1, 1 > InstructionShape
The shape of the instruction.
Definition: igemm_multiply_add.h:44
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: igemm_multiply_add.h:50
ThreadGemmShape_ ThreadGemmShape
The shape of the thread-level GEMM (K-by-N-by-M).
Definition: igemm_multiply_add.h:46
Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW > Accumulators
The accumulators.
Definition: igemm_multiply_add.h:64
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
ShapeMul< ThreadGemmShape, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: igemm_multiply_add.h:52
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:44
ThreadGemmShape AccumulatorsPerThread
Aliased for compatibility. Will be removed in CUTLASS v2.0.
Definition: igemm_multiply_add.h:48
Fragment< ScalarB, AccumulatorsPerThread::kH *4 > FragmentB
The fragment for B.
Definition: igemm_multiply_add.h:60
int8_t ScalarA
The type for A.
Definition: igemm_multiply_add.h:54
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...
int8_t ScalarB
The type for B.
Definition: igemm_multiply_add.h:58