Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
|
Template performing matrix multiply-add operation within a thread.
#include <fp16_sgemm_multiply_add.h>
Public Types | |
typedef Shape< 1, 1, 1, 1 > | InstructionShape |
The shape of the instruction. More... | |
typedef ThreadGemmShape_ | ThreadGemmShape |
The shape of a thread-level matrix multiply-accumulate. More... | |
typedef ThreadGemmShape | AccumulatorsPerThread |
Alias of ThreadGemmShape, kept under the name "AccumulatorsPerThread" for compatibility. Expect to be renamed in CUTLASS v2.0. More... | |
typedef ThreadsPerWarp_ | ThreadsPerWarp |
The number of threads per warp. More... | |
typedef ShapeMul< ThreadGemmShape, ThreadsPerWarp >::Shape | AccumulatorsPerWarp |
The number of accumulators per warp. More... | |
typedef half | ScalarA |
The type for A, specialized to half. More... | |
typedef Fragment< ScalarA, AccumulatorsPerThread::kW > | FragmentA |
The fragment for A. More... | |
typedef half | ScalarB |
The type for B, specialized to half. More... | |
typedef Fragment< ScalarB, AccumulatorsPerThread::kH > | FragmentB |
The fragment for B. More... | |
typedef float | ScalarC |
The type for C and D, specialized to float. More... | |
typedef Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > | Accumulators |
The accumulators. More... | |
Public Member Functions | |
CUTLASS_DEVICE | ThreadMultiplyAdd () |
Constructor. More... | |
CUTLASS_DEVICE void | multiply_add (FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d) |
Multiply-add: d = a*b + c. More... | |
typedef Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::Accumulators |
typedef ThreadGemmShape cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::AccumulatorsPerThread |
typedef ShapeMul<ThreadGemmShape, ThreadsPerWarp>::Shape cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::AccumulatorsPerWarp |
typedef Fragment<ScalarA, AccumulatorsPerThread::kW> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::FragmentA |
typedef Fragment<ScalarB, AccumulatorsPerThread::kH> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::FragmentB |
typedef Shape<1, 1, 1, 1> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::InstructionShape |
typedef half cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::ScalarA |
typedef half cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::ScalarB |
typedef float cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::ScalarC |
typedef ThreadGemmShape_ cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::ThreadGemmShape |
typedef ThreadsPerWarp_ cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >::ThreadsPerWarp |
|
inline |
|
inline |