70 typedef Shape<Base::VectorizedTile::kH / Base::Threads::kH / 4,
72 Base::VectorizedTile::kW / Base::Threads::kW,
83 return make_Coord(0, thread_offset_h, thread_offset_w, 0);
94 template <
typename TileTraits_,
typename Index_ =
int>
105 :
Base(_params, threadblock_offset, thread_offset_func),
mask_(0xffffffff) { }
111 int const kBlock = TileTraits_::Tile::kW;
113 int const kResidue = (int)(bounds[1] % kBlock);
117 if (left > 0 && left < 4) {
118 mask_ = (1u << (8 * left)) - 1u;
125 reinterpret_cast<uint32_t&
>(value) &=
mask_;
Base::Threads Threads
The threads.
Definition: igemm_global_tile.h:66
Computes the thread offset in (H, W) based on thread ID.
Definition: igemm_global_tile.h:77
Defines iterators for efficiently loading and storing to global memory.
Definition: gemm_global_tile.h:70
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
Shape< Base::VectorizedTile::kH/Base::Threads::kH/4, 4, Base::VectorizedTile::kW/Base::Threads::kW, Base::VectorizedTile::kC/Base::kAccessSize > Iterations
The number of iterations needed to load/store the tile.
Definition: igemm_global_tile.h:74
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:368
CUTLASS_HOST_DEVICE void load_element(typename Base::AccessType &value, int d, int h, int w, int c) const
Loads a single fragment element from memory.
Definition: gemm_global_tile.h:267
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: igemm_global_tile.h:79
Definition: gemm_global_tile.h:163
static int const kH
The height of the cube.
Definition: shape.h:68
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:402
Definition: igemm_global_tile.h:50
CUTLASS_DEVICE void load_element(typename Base::AccessType &value, int d, int h, int w, int c) const
Definition: igemm_global_tile.h:122
GemmGlobalIteratorAb< TileTraits_, Index_ > Base
The base class.
Definition: igemm_global_tile.h:97
Definition: igemm_global_tile.h:95
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
TileTraits_::ThreadOffset ThreadOffset
The functor to compute the thread offset.
Definition: igemm_global_tile.h:99
uint32_t mask_
The mask to clean up the values.
Definition: igemm_global_tile.h:129
ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:88
Shape< 1, 4, Base::VectorizedTile::kC > ThreadsDelta
The threads strides.
Definition: igemm_global_tile.h:89
TileTraits_::ThreadOffset ThreadOffset
The thread offset.
Definition: gemm_global_tile.h:194
static int const kW
The width of the cube.
Definition: shape.h:70
CUTLASS_DEVICE void initialize_predicates(const Coord< 3 > &bounds, const Coord< 3 > &threadblock_offset)
Definition: igemm_global_tile.h:107
Parameters.
Definition: tile_iterator.h:497
static int const kAccessSize
The number of scalars per LDG/STG.
Definition: gemm_global_tile.h:80
Kind
Definition: matrix_traits.h:357
Shape< Base::Threads::kH *4, 1, Base::Threads::kW, Base::kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: igemm_global_tile.h:68
Defines properties of matrices used to denote layout and operands to GEMM kernels.
CUTLASS_HOST_DEVICE void initialize_predicates(const Coord< 3 > &bounds, const Coord< 3 > &block_offset)
Definition: gemm_global_tile.h:219
Coord< 4 > thread_offset
Offset of an individual lane from the start of the tile.
Definition: gemm_global_tile.h:213
CUTLASS_DEVICE IgemmGlobalIteratorAb(typename Base::Params const &_params, const Coord< 3 > &threadblock_offset, ThreadOffset thread_offset_func=ThreadOffset())
Constructor.
Definition: igemm_global_tile.h:102
GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ > Base
The base class.
Definition: igemm_global_tile.h:64