Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
igemm_traits.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
30 #pragma once
31 
32 #include "cutlass/convert.h"
33 #include "cutlass/gemm/gemm.h"
43 #include "cutlass/reshape_tile.h"
44 
45 namespace cutlass {
46 namespace gemm {
47 
49 
// Default IGEMM configuration: derives from GemmConfig with a fixed argument list.
// The first two type arguments are int8_t, ScalarD_ is forwarded as the third and
// fourth type arguments, and the multiply-add functor is
// ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, int8_t, int8_t, int>.
// NOTE(review): the per-argument doc comments (original odd-numbered lines 51-94)
// are absent from this listing. The meaning of each integer/bool argument is set by
// GemmConfig (gemm_config.h) -- confirm there before changing any value.
50 template <
52  typename OutputTile_,
54  typename ScalarD_,
56  typename ThreadGemmShape_>
57 struct IgemmConfig : public GemmConfig<
59  int8_t,
61  int8_t,
63  ScalarD_,
65  ScalarD_,
67  OutputTile_,
69  ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, int8_t, int8_t, int>,
71  4,
73  4,
75  16,
77  4,
79  4,
81  16,
83  1,
85  4,
87  1,
89  2,
91  false,
93  false,
95  false> {};
96 
98 
// Partial specialization of IgemmConfig for ScalarD_ == int8_t.
// Differences from the primary template visible in this file: the third and fourth
// type arguments are int8_t, the three integer arguments following the second `16`
// are 4, 4, 4 (primary: 1, 4, 1), and the middle boolean is true (primary: false).
// NOTE(review): the per-argument doc comments are absent from this listing; see
// GemmConfig (gemm_config.h) for what each positional argument controls.
99 template <typename OutputTile_, typename ThreadGemmShape_>
100 struct IgemmConfig<OutputTile_, int8_t, ThreadGemmShape_>
101  : public GemmConfig<
103  int8_t,
105  int8_t,
107  int8_t,
109  int8_t,
111  OutputTile_,
113  ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, int8_t, int8_t, int>,
115  4,
117  4,
119  16,
121  4,
123  4,
125  16,
127  4,
129  4,
131  4,
133  2,
135  false,
137  true,
139  false> {};
140 
142 
// Generic tile-traits helper for operand A: inherits GemmTileTraitsHelperA for the
// same layout and config and adds nothing. Index_ is unused in this primary template;
// the layout-specific specializations below replace or extend this definition.
143 template <enum MatrixLayout::Kind kLayout_, typename GemmConfig_, typename Index_>
144 struct IgemmTileTraitsHelperA : public GemmTileTraitsHelperA<kLayout_, GemmConfig_> {};
145 
147 
// Specialization of the A-operand tile-traits helper for column-major A. It extends
// GemmTileTraitsHelperA<kColumnMajor, GemmConfig_> and uses 16 scalars per shared store.
// NOTE(review): this listing drops the lines that name each typedef (original lines
// 152, 170, 173, 185) and some template arguments. Per the cross-reference index at
// the end of this page the complete typedefs are:
//   Base                  = GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_>
//   GlobalTileTraits      = IgemmGlobalTileTraits<GemmOperand::kA, MatrixLayout::kColumnMajor,
//                             int8_t const,
//                             Shape<1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW>,
//                             Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount,
//                                   GemmConfig_::kWarpSize>,
//                             GemmConfig_::kScalarsPerLdgA>
//   GlobalLoadIterator    = GemmGlobalIteratorAb<GlobalTileTraits, Index_>
//   SharedStoreTileTraits = GemmSharedStoreTileAbTraits<int8_t, Shape<...>, Threads, kScalarsPerStsA>
148 template <typename GemmConfig_, typename Index_>
149 struct IgemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_, Index_>
150  : public GemmTileTraitsHelperA<MatrixLayout::kColumnMajor, GemmConfig_> {
153 
155  static int const kScalarsPerStsA = 16;
156 
158  typedef IgemmGlobalTileTraits<
160  // The layout.
162  // The pointer is int8_t const.
163  int8_t const,
164  // The tile has size KxM in GEMM's terminology.
166  // The threads are distributed as warps x 32 (the traits may reorganize).
168  // The number of scalars per LDG (LDG.32 or LDG.128, etc).
169  GemmConfig_::kScalarsPerLdgA>
171 
174 
177  // The pointer is int8_t.
178  int8_t,
179  // The tile has size KxM in GEMM's terminology.
180  Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kW * 4>,
181  // The threads are distributed as warps x 32 (the traits may reorganize).
182  typename GlobalTileTraits::Threads,
183  // The number of scalars per STS (STS.32 or STS.128, etc).
184  kScalarsPerStsA>
186 };
187 
189 
// Specialization of the A-operand tile-traits helper for row-major A. Unlike the
// column-major case it does not inherit from GemmTileTraitsHelperA; it defines the
// scalar types and all tile traits itself, and stores to shared memory with a skew of 16.
// NOTE(review): this listing drops the lines that name each typedef (original lines
// 216, 219, 233, 253) and some template arguments. Per the cross-reference index at
// the end of this page the complete typedefs are:
//   GlobalTileTraits      = IgemmGlobalTileTraits<GemmOperand::kA, MatrixLayout::kRowMajor,
//                             int8_t const,
//                             Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD>,
//                             Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount,
//                                   GemmConfig_::kWarpSize>,
//                             GemmConfig_::kScalarsPerLdgA>
//   GlobalLoadIterator    = IgemmGlobalIteratorAb<GlobalTileTraits, Index_>
//   SharedStoreTileTraits = GemmSharedStoreWithSkewTileAbTraits<int8_t, Shape<...>, Threads,
//                             kScalarsPerStsA, 16>
//   SharedLoadTileTraits  = GemmSharedLoadTileATraits<int8_t const, OutputTile, Warps,
//                             ThreadsPerWarp, InstructionShape, kStages, 16,
//                             SharedStoreTileTraits::kSkew>
190 template <typename GemmConfig_, typename Index_>
191 struct IgemmTileTraitsHelperA<MatrixLayout::kRowMajor, GemmConfig_, Index_> {
194 
196  typedef int8_t Scalar;
198  typedef int8_t MultiplyAddScalar;
199 
201  static int const kScalarsPerStsA = 16;
202 
204  typedef IgemmGlobalTileTraits<
206  // The layout.
208  // The pointer is int8_t const.
209  int8_t const,
210  // The tile has size MxK in GEMM's terminology.
212  // The threads are distributed as warps x 32 (the traits may reorganize).
214  // The number of scalars per LDG (LDG.32 or LDG.128, etc).
215  GemmConfig_::kScalarsPerLdgA>
217 
220 
223  // The pointer is int8.
224  int8_t,
225  // The tile has size KxM in GEMM's terminology.
226  Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kW * 4>,
227  // The threads are distributed as (threads / K) x K (the traits may reorganize).
228  typename GlobalTileTraits::Threads,
229  // The number of scalars per STS.
230  kScalarsPerStsA,
231  // The skew to avoid bank conflicts added in the tile W dimension.
232  16>
234 
237  // The pointer is int8_t const.
238  int8_t const,
239  // The output tile size.
240  typename GemmConfig_::OutputTile,
241  // The number of warps.
242  typename GemmConfig_::Warps,
243  // The number of threads per warp.
244  typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
245  // The shape of the FMA instruction.
246  typename GemmConfig_::InstructionShape,
247  // The number of stages.
248  GemmConfig_::kStages,
249  // The number of scalars per LDS.
250  16,
251  // The skew (kept equal to the skew used by the shared-memory store traits).
252  SharedStoreTileTraits::kSkew>
254 };
255 
257 
// Generic tile-traits helper for operand B: inherits GemmTileTraitsHelperB for the
// same layout and config and adds nothing. Index_ is unused in this primary template;
// the layout-specific specializations below replace or extend this definition.
258 template <enum MatrixLayout::Kind kLayout_, typename GemmConfig_, typename Index_>
259 struct IgemmTileTraitsHelperB : public GemmTileTraitsHelperB<kLayout_, GemmConfig_> {};
260 
262 
// Specialization of the B-operand tile-traits helper for column-major B. It does not
// inherit from GemmTileTraitsHelperB; it defines the scalar types and all tile traits
// itself, and stores to shared memory with a skew of 16.
// NOTE(review): this listing drops the lines that name each typedef (original lines
// 289, 292, 306, 326) and some template arguments. Per the cross-reference index at
// the end of this page the complete typedefs are:
//   GlobalTileTraits      = IgemmGlobalTileTraits<GemmOperand::kB, MatrixLayout::kColumnMajor,
//                             int8_t const,
//                             Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD>,
//                             Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount,
//                                   GemmConfig_::kWarpSize>,
//                             GemmConfig_::kScalarsPerLdgB>
//   GlobalLoadIterator    = IgemmGlobalIteratorAb<GlobalTileTraits, Index_>
//   SharedStoreTileTraits = GemmSharedStoreWithSkewTileAbTraits<int8_t, Shape<...>, Threads,
//                             kScalarsPerStsB, 16>
//   SharedLoadTileTraits  = GemmSharedLoadTileBTraits<int8_t const, OutputTile, Warps,
//                             ThreadsPerWarp, InstructionShape, kStages, 16,
//                             SharedStoreTileTraits::kSkew>
263 template <typename GemmConfig_, typename Index_>
264 struct IgemmTileTraitsHelperB<MatrixLayout::kColumnMajor, GemmConfig_, Index_> {
267 
269  typedef int8_t Scalar;
271  typedef int8_t MultiplyAddScalar;
272 
274  static int const kScalarsPerStsB = 16;
275 
277  typedef IgemmGlobalTileTraits<
279  // The layout.
281  // The pointer is int8_t const.
282  int8_t const,
283  // The tile has size NxK in GEMM's terminology.
285  // The threads are distributed as warps x 32 (the traits may reorganize).
287  // The number of scalars per LDG (LDG.32 or LDG.128, etc).
288  GemmConfig_::kScalarsPerLdgB>
290 
293 
296  // The pointer is int8.
297  int8_t,
298  // The tile has size KxN in GEMM's terminology.
299  Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kH * 4>,
300  // The threads are distributed as (threads / K) x K (the traits may reorganize).
301  typename GlobalTileTraits::Threads,
302  // The number of scalars per STS.
303  kScalarsPerStsB,
304  // The skew to avoid bank conflicts added in the tile W dimension.
305  16>
307 
310  // The pointer is int8_t const.
311  int8_t const,
312  // The output tile size.
313  typename GemmConfig_::OutputTile,
314  // The number of warps.
315  typename GemmConfig_::Warps,
316  // The number of threads per warp.
317  typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
318  // The shape of the FMA instruction.
319  typename GemmConfig_::InstructionShape,
320  // The number of stages.
321  GemmConfig_::kStages,
322  // The number of scalars per LDS.
323  16,
324  // The skew (kept equal to the skew used by the shared-memory store traits).
325  SharedStoreTileTraits::kSkew>
327 };
328 
330 
// Specialization of the B-operand tile-traits helper for row-major B. It extends
// GemmTileTraitsHelperB<kRowMajor, GemmConfig_> and uses 16 scalars per shared store.
// NOTE(review): this listing drops the lines that name each typedef (original lines
// 335, 353, 356, 368) and some template arguments. Per the cross-reference index at
// the end of this page the complete typedefs are:
//   Base                  = GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_>
//   GlobalTileTraits      = IgemmGlobalTileTraits<GemmOperand::kB, MatrixLayout::kRowMajor,
//                             int8_t const,
//                             Shape<1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH>,
//                             Shape<1, ShapeCount<typename GemmConfig_::Warps>::kCount,
//                                   GemmConfig_::kWarpSize>,
//                             GemmConfig_::kScalarsPerLdgB>
//   GlobalLoadIterator    = GemmGlobalIteratorAb<GlobalTileTraits, Index_>
//   SharedStoreTileTraits = GemmSharedStoreTileAbTraits<int8_t, Shape<...>, Threads, kScalarsPerStsB>
331 template <typename GemmConfig_, typename Index_>
332 struct IgemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_, Index_>
333  : public GemmTileTraitsHelperB<MatrixLayout::kRowMajor, GemmConfig_> {
336 
338  static int const kScalarsPerStsB = 16;
339 
341  typedef IgemmGlobalTileTraits<
343  // The layout.
345  // The pointer is int8_t const.
346  int8_t const,
347  // The tile has size KxN in GEMM's terminology.
349  // The threads are distributed as warps x 32 (the traits may reorganize).
351  // The number of scalars per LDG (LDG.32 or LDG.128, etc).
352  GemmConfig_::kScalarsPerLdgB>
354 
357 
360  // The pointer is int8_t.
361  int8_t,
362  // The tile has size KxN in GEMM's terminology.
363  Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kH * 4>,
364  // The threads are distributed as warps x 32 (the traits may reorganize).
365  typename GlobalTileTraits::Threads,
366  // The number of scalars per STS (STS.32 or STS.128, etc).
367  kScalarsPerStsB>
369 };
370 
372 
// IgemmTransformerA picks, per layout of A, the Transformer type that the traits
// helper uses as GlobalTransformerA.
// NOTE(review): the primary declaration (original line 374) and the Transformer
// typedefs (original lines 378 and 383) are missing from this listing. Per the
// cross-reference index at the end of this page:
//   row-major A    -> typedef Copy<typename Iterator_::Fragment> Transformer;
//   column-major A -> typedef IgemmSwizzle<Iterator_> Transformer;
373 template <enum MatrixLayout::Kind kLayout_, typename Iterator_>
375 
376 template <typename Iterator_>
377 struct IgemmTransformerA<MatrixLayout::kRowMajor, Iterator_> {
379 };
380 
381 template <typename Iterator_>
382 struct IgemmTransformerA<MatrixLayout::kColumnMajor, Iterator_> {
384 };
385 
387 
// IgemmTransformerB picks, per layout of B, the Transformer type that the traits
// helper uses as GlobalTransformerB. The mapping is the mirror image of the one for A.
// NOTE(review): the primary declaration (original line 389) and the Transformer
// typedefs (original lines 393 and 398) are missing from this listing. Per the
// cross-reference index at the end of this page:
//   column-major B -> typedef Copy<typename Iterator_::Fragment> Transformer;
//   row-major B    -> typedef IgemmSwizzle<Iterator_> Transformer;
388 template <enum MatrixLayout::Kind kLayout_, typename Iterator_>
390 
391 template <typename Iterator_>
392 struct IgemmTransformerB<MatrixLayout::kColumnMajor, Iterator_> {
394 };
395 
396 template <typename Iterator_>
397 struct IgemmTransformerB<MatrixLayout::kRowMajor, Iterator_> {
399 };
400 
402 
// Helper that assembles the types consumed by IgemmTraits (config, iterators,
// transformers, streams, multiply-add, accumulator clearing and epilogue).
// NOTE(review): the struct header (original line 418 -- presumably
// `struct IgemmTraitsHelper {`, the name IgemmTraits uses for its default Helper_)
// and the tail of most typedefs are missing from this listing. Per the
// cross-reference index at the end of this page the members are:
//   GemmConfig             = IgemmConfig<OutputTile_, ScalarD_, ThreadGemmShape_>
//   GemmTileTraitsHelperA  = IgemmTileTraitsHelperA<kLayoutA_, GemmConfig, Index_>
//   GemmTileTraitsHelperB  = IgemmTileTraitsHelperB<kLayoutB_, GemmConfig, Index_>
//   GlobalTransformerA/B   = IgemmTransformerA/B<Helper::kLayout, GlobalLoadIteratorA/B>::Transformer
//   SharedStoreIteratorA/B = TileStoreIterator<SharedStoreTileTraits, Scalar,
//                              IteratorAdvance::kH, MemorySpace::kShared>
//   GlobalLoadStreamA/B    = GlobalLoadStream<GemmOperand::kA/kB, GlobalLoadIterator,
//                              SharedStoreIterator, GlobalTransformer>
//   SharedLoadIteratorA/B  = TileLoadIterator<SharedLoadTileTraits, Scalar,
//                              IteratorAdvance::kH, MemorySpace::kShared>
//   SharedLoadStreamA/B    = SharedLoadStream<SharedLoadIterator,
//                              Copy<typename SharedLoadIterator::Fragment> >
//   MultiplyAdd            = GemmConfig::MultiplyAdd
//   ClearAccumulators      = ClearAccumulators<typename MultiplyAdd::ScalarC>
//   Epilogue               = IgemmEpilogue<IgemmEpilogueTraits<GemmConfig, EpilogueFunctor_> >
403 template <
405  MatrixLayout::Kind kLayoutA_,
407  MatrixLayout::Kind kLayoutB_,
409  typename OutputTile_,
411  typename ScalarD_,
413  typename EpilogueFunctor_,
415  typename ThreadGemmShape_ = Shape<32, 8, 8>,
417  typename Index_ = int>
425 
427  typedef typename GemmTileTraitsHelperA::GlobalLoadIterator GlobalLoadIteratorA;
// The default transformer for A.
429  typedef typename IgemmTransformerA<GemmTileTraitsHelperA::kLayout,
432  typedef TileStoreIterator<typename GemmTileTraitsHelperA::SharedStoreTileTraits,
433  typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar,
443 
445  typedef typename GemmTileTraitsHelperB::GlobalLoadIterator GlobalLoadIteratorB;
446  // The default transformer for B.
447  typedef typename IgemmTransformerB<GemmTileTraitsHelperB::kLayout,
450  typedef TileStoreIterator<typename GemmTileTraitsHelperB::SharedStoreTileTraits,
451  typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar,
461 
463  typedef TileLoadIterator<typename GemmTileTraitsHelperA::SharedLoadTileTraits,
464  typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar,
472  typedef TileLoadIterator<typename GemmTileTraitsHelperB::SharedLoadTileTraits,
473  typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar,
480 
485 
488 };
489 
491 
// IgemmEpilogueScalar maps the output scalar type ScalarD_ to a nested Scalar type:
// float for the generic case, int when ScalarD_ is int.
// NOTE(review): the primary struct header (original line 493, presumably
// `struct IgemmEpilogueScalar {`) is missing from this listing. How Scalar is used
// by the epilogue is not visible here -- confirm in the epilogue functor/traits.
492 template <typename ScalarD_>
494  typedef float Scalar;
495 };
496 
// Full specialization: an int output keeps an int Scalar.
497 template <>
498 struct IgemmEpilogueScalar<int> {
499  typedef int Scalar;
500 };
501 
503 
// IgemmTraits: the user-facing traits for an int8 GEMM. It derives from GemmTraits
// and takes every component type from Helper_, which defaults to IgemmTraitsHelper
// built from the other template parameters. The body is empty.
// Defaults visible here: OutputTile_ = Shape<32, 128, 128>, ScalarD_ = int,
// ThreadGemmShape_ = Shape<32, 8, 8>, Index_ = int; the block swizzle is fixed to
// IdentityBlockSwizzle.
// NOTE(review): the parameter at original line 514 is missing from this listing; it
// is presumably `typename EpilogueFunctor_` (with a default), since EpilogueFunctor_
// is used in the default Helper_ below. The per-parameter doc comments (original
// odd-numbered lines 505-519) are missing as well.
504 template <
506  MatrixLayout::Kind kLayoutA_,
508  MatrixLayout::Kind kLayoutB_,
510  typename OutputTile_ = Shape<32, 128, 128>,
512  typename ScalarD_ = int,
516  typename ThreadGemmShape_ = Shape<32, 8, 8>,
518  typename Index_ = int,
520  typename Helper_ = IgemmTraitsHelper<kLayoutA_,
521  kLayoutB_,
522  OutputTile_,
523  ScalarD_,
524  EpilogueFunctor_,
525  ThreadGemmShape_,
526  Index_> >
527 struct IgemmTraits : public GemmTraits<
528  // The config.
529  typename Helper_::GemmConfig,
530  // The stream to load A from global memory to shared memory.
531  typename Helper_::GlobalLoadStreamA,
532  // The stream to load B from global memory to shared memory.
533  typename Helper_::GlobalLoadStreamB,
534  // The stream to load A from shared memory.
535  typename Helper_::SharedLoadStreamA,
536  // The stream to load B from shared memory.
537  typename Helper_::SharedLoadStreamB,
538  // The epilogue.
539  typename Helper_::Epilogue,
540  // The block swizzle to reorganize the grid.
541  IdentityBlockSwizzle,
542  // The index.
543  Index_,
544  // The tool used to clear accumulators.
545  typename Helper_::ClearAccumulators> {};
546 
548 
549 } // namespace gemm
550 } // namespace cutlass
IgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
Definition: igemm_traits.h:448
Definition: load_store.h:41
GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ > Base
The base config.
Definition: igemm_traits.h:335
Definition: convert.h:33
Definition: gemm_shared_tile.h:128
Base::Threads Threads
The threads.
Definition: igemm_global_tile.h:66
IgemmTileTraitsHelperB< kLayoutB_, GemmConfig, Index_ > GemmTileTraitsHelperB
The GEMM config for B.
Definition: igemm_traits.h:424
IgemmSwizzle< Iterator_ > Transformer
Definition: igemm_traits.h:383
Defines iterators for efficiently loading and storing to global memory.
Transposes a fragment of data containing packed 8-bit integer elements.
Copy< typename Iterator_::Fragment > Transformer
Definition: igemm_traits.h:393
GemmSharedStoreWithSkewTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kW *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsA, 16 > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: igemm_traits.h:233
IgemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^T.
Definition: igemm_traits.h:289
Defines structural properties of complete GEMM computation.
IgemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:219
Definition: igemm_traits.h:144
Definition: igemm_epilogue.h:290
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
Definition: convert.h:69
IgemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kRowMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^T.
Definition: igemm_traits.h:353
Definition: gemm_shared_tile.h:38
Definition: tile_iterator.h:65
int8_t MultiplyAddScalar
The scalar stored in shared memory.
Definition: igemm_traits.h:198
GemmTileTraitsHelperB::GlobalLoadIterator GlobalLoadIteratorB
The iterator to load B from global memory.
Definition: igemm_traits.h:445
Implements matrix multiply accumulate operation of 8-bit integer data using DP4A instruction.
Definition: gemm_shared_tile.h:200
TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: igemm_traits.h:454
GemmSharedLoadTileBTraits< int8_t const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 16, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^N.
Definition: igemm_traits.h:326
Definition: gemm_global_tile.h:163
int8_t MultiplyAddScalar
The scalar stored in shared memory.
Definition: igemm_traits.h:271
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...
IgemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: igemm_traits.h:216
Definition: gemm_global_stream.h:52
Definition: gemm_traits.h:191
IgemmEpilogue< IgemmEpilogueTraits< GemmConfig, EpilogueFunctor_ > > Epilogue
The epilogue.
Definition: igemm_traits.h:487
int Scalar
Definition: igemm_traits.h:499
IgemmSwizzle< Iterator_ > Transformer
Definition: igemm_traits.h:398
GemmSharedStoreTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kW *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsA > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: igemm_traits.h:185
Definition: igemm_swizzle.h:38
Definition: igemm_traits.h:259
Definition: igemm_traits.h:418
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:402
IgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
The default transformer for A.
Definition: igemm_traits.h:430
Defines iterators for efficiently loading and storing tiles to and from shared memory.
GlobalLoadStream< GemmOperand::kB, GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: igemm_traits.h:460
Definition: gemm_shared_stream.h:45
Definition: igemm_global_tile.h:50
Defines a type for restructuring a tile.
GemmTileTraitsHelperA::GlobalLoadIterator GlobalLoadIteratorA
The iterator to load A from global memory.
Definition: igemm_traits.h:427
Definition: gemm_config.h:76
TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: igemm_traits.h:436
Definition: gemm_traits.h:52
Definition: matrix_traits.h:357
Definition: igemm_traits.h:57
IgemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kColumnMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^N.
Definition: igemm_traits.h:170
Definition: igemm_global_tile.h:95
Definition: igemm_traits.h:374
float Scalar
Definition: igemm_traits.h:494
Definition: gemm_traits.h:349
Copy< typename Iterator_::Fragment > Transformer
Definition: igemm_traits.h:378
Definition: igemm_traits.h:527
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
Definition: matrix_traits.h:159
TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: igemm_traits.h:476
GemmSharedStoreTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kH *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsB > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^N.
Definition: igemm_traits.h:368
Definition: matrix_traits.h:159
ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:88
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:44
IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ > GemmConfig
The IGEMM config.
Definition: igemm_traits.h:420
IgemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:292
GemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:173
GemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:356
GemmConfig::MultiplyAdd MultiplyAdd
The multiply-add functor.
Definition: igemm_traits.h:482
Definition: igemm_traits.h:389
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:51
SharedLoadStream< SharedLoadIteratorA, Copy< typename SharedLoadIteratorA::Fragment > > SharedLoadStreamA
The stream to load A from shared memory.
Definition: igemm_traits.h:470
Definition: matrix_traits.h:357
GlobalLoadStream< GemmOperand::kA, GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: igemm_traits.h:442
IgemmTileTraitsHelperA< kLayoutA_, GemmConfig, Index_ > GemmTileTraitsHelperA
The GEMM config for A.
Definition: igemm_traits.h:422
Implements a software-pipelined efficient GEMM.
GemmSharedLoadTileATraits< int8_t const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 16, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^N.
Definition: igemm_traits.h:253
SharedLoadStream< SharedLoadIteratorB, Copy< typename SharedLoadIteratorB::Fragment > > SharedLoadStreamB
The stream to load B from shared memory.
Definition: igemm_traits.h:479
Defines structural properties of the GEMM epilogue.
Definition: igemm_traits.h:493
Defines the epilogue phase of the GEMM computation for IGEMM, supporting integer and floating-point o...
Defines conversion operations among Fragments of different base type.
GemmSharedStoreWithSkewTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kH *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsB, 16 > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^N.
Definition: igemm_traits.h:306
Implements tile iterators to partition the thread block tile into 2D subtiles and efficiently load ea...
TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: igemm_traits.h:467
GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ > Base
The base config.
Definition: igemm_traits.h:152
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:841
ClearAccumulators< typename MultiplyAdd::ScalarC > ClearAccumulators
The object to clear accumulators.
Definition: igemm_traits.h:484