Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
Namespaces | Classes | Functions
cutlass Namespace Reference

Namespaces

 detail
 
 gemm
 
 MatrixLayout
 Defines data layouts of various matrix formats usable by TensorRef and other classes.
 
 platform
 
 reduction
 

Classes

struct  AlignedStruct
 
struct  bin1_t
 
struct  ComputeOffsetFromShape
 Compute the offset for the given coordinates in a cube. More...
 
struct  ComputeOffsetFromStrides
 Compute the offset for the given coordinates in a cube. More...
 
struct  ComputeThreadOffsetFromStrides
 Decompose threadIdx.x into the coordinates of a cube whose dimensions are specified by Threads_. Afterwards compute the offset of those coordinates using Strides_. More...
 
struct  ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, 1 >, Shape< 1, S_h_, S_w_, 1 > >
 Specialization for D=1 and C=1. More...
 
struct  ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, T_c_ >, Shape< 1, S_h_, S_w_, S_c_ > >
 Specialization for D=1. More...
 
struct  ConstPredicateTileAdapter
 Adapter to enable random access to predicates via logical coordinate within a tile. More...
 
struct  Convert
 
struct  Convert< Fragment< InputScalar_, kScalars_ >, Fragment< OutputScalar_, kScalars_ > >
 
struct  Coord
 Statically-sized array specifying Coords within a tensor. More...
 
struct  Copy
 
struct  divide_assert
 
struct  DumpType
 
struct  Extent
 Returns the extent of a scalar or vector. More...
 
struct  Extent< Vector< T, Lanes > >
 Returns the number of lanes of a vector if need be. More...
 
struct  Extent< Vector< T, Lanes > const >
 Returns the number of lanes of a vector if need be. More...
 
struct  Fragment
 A template defining Fragment Concept. More...
 
struct  FragmentConstIterator
 
struct  FragmentElementType
 Specifies whether iterator storage fragment consists of Scalar values or WMMA matrix. More...
 
struct  FragmentIterator
 A template defining Fragment Iterator Concept. More...
 
struct  GemmOperand
 Gemm operand - D = A * B + C. More...
 
struct  Identity
 Describes identity elements. More...
 
struct  IdentityTensorMapFunc
 
struct  int4_t
 
struct  is_pow2
 
struct  IteratorAdvance
 Specifies dimension in which post-increment accesses advance. More...
 
struct  KernelLaunchConfiguration
 Structure containing the basic launch configuration of a CUDA kernel. More...
 
struct  Load
 
struct  Load< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 >
 
struct  Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 >
 Partial specialization for 16b loads. More...
 
struct  Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 >
 
struct  Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 >
 
struct  Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 >
 
struct  Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >
 
struct  Load< Vector< bin1_t, 32 >, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >
 
struct  Load< Vector< int4_t, 8 >, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >
 
struct  Load< Vector< uint4_t, 8 >, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >
 
struct  log2_down
 
struct  log2_down< N, 1, Count >
 
struct  log2_up
 
struct  log2_up< N, 1, Count >
 
struct  MatrixCoord
 
struct  MatrixTransform
 Transformation applied to matrix operands. More...
 
struct  Max
 
struct  MemorySpace
 Enum to specify which memory space data resides in. More...
 
struct  Min
 
struct  PredicatedTileLoadStream
 Generic stream for loading and transforming fragments. More...
 
struct  PredicatedTileStoreStream
 Generic stream for transforming and storing fragments. More...
 
struct  PredicateTileAdapter
 Adapter to enable random access to predicates via logical coordinate within a tile. More...
 
struct  PredicateVector
 Statically sized array of bits implementing the Predicate Vector Concept. More...
 
struct  RegularTilePredicateFunctor
 Functor computing a predicate given the logical position of an access. More...
 
struct  ReshapeTile
 
struct  ReshapeTile< Tile_, kAccessSize_, true >
 
struct  ScalarIO
 Helper to enable formatted printing of CUTLASS scalar types to an ostream. More...
 
struct  Shape
 A Shape implementing Layout Concept describing the dimensions of a cube. More...
 
struct  ShapeAdd
 
struct  ShapeCount
 Computes derived counts for a class based on the Layout Concept. More...
 
struct  ShapeDiv
 
struct  ShapeDivCeiling
 
struct  ShapeMax
 
struct  ShapeMin
 
struct  ShapeMul
 
struct  ShapeScale
 
struct  ShapeStrides
 
struct  ShapeSub
 
struct  sqrt_est
 
struct  StorageType
 
struct  StorageType< 1 >
 
struct  StorageType< 2 >
 
struct  StorageType< 4 >
 
struct  Store
 
struct  Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 >
 
struct  Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 >
 
struct  Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 >
 
struct  Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 >
 
struct  Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 >
 
struct  Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >
 
class  TensorRef
 
class  TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
 Specialization for rank=1 case with no internal StrideVector. More...
 
struct  TensorRefArray
 
struct  TensorRefBatchStrided
 
class  TensorView
 Defines a view into a logical tensor. More...
 
struct  TileAllocation
 Class for storing a tile in memory and accessing it through a tensor ref. More...
 
struct  TileCoord
 
struct  TiledThreadOffset
 Basic thread offset function computed from a thread shape. More...
 
struct  TileIteratorBase
 Iterator for accessing a stripmined tile in memory. More...
 
struct  TileLoadIterator
 An iterator implementing Tile Load Iterator Concept for loading a tile from memory. More...
 
struct  TileLoadStream
 Generic stream for loading and transforming fragments. More...
 
struct  TileStoreIterator
 An iterator implementing Tile Store Iterator Concept for storing a tile to memory. More...
 
struct  TileStoreStream
 Generic stream for transforming and storing fragments. More...
 
struct  TileTraits
 A template defining Tile Traits Concept. More...
 
struct  TileTraitsContiguousMajor
 
struct  TileTraitsStandard
 Chooses 'best' shape to enable warp raking along contiguous dimension if possible. More...
 
struct  TileTraitsStrideMajor
 
struct  TileTraitsWarpRake
 Tiling in which warps rake across the contiguous dimension. More...
 
struct  TrivialPredicateTileAdapter
 Always returns true predicate. More...
 
struct  uint4_t
 
union  Vector
 
union  Vector< bin1_t, kLanes_ >
 Vector definition for 1-bit binary datatype. More...
 
union  Vector< half, 1 >
 
union  Vector< half, kLanes_ >
 
union  Vector< int4_t, kLanes_ >
 Vector definition for 4-bit signed integer datatype. More...
 
union  Vector< uint4_t, kLanes_ >
 Vector definition for 4-bit unsigned integer datatype. More...
 
struct  Vectorize
 
struct  Vectorize< Vector< bin1_t, 32 >, kLanes_ >
 
struct  Vectorize< Vector< int4_t, 8 >, kLanes_ >
 
struct  Vectorize< Vector< uint4_t, 8 >, kLanes_ >
 
struct  VectorTraits
 Traits describing properties of vectors and scalar-as-vectors. More...
 
struct  VectorTraits< Vector< T, Lanes > >
 Partial specialization for actual cutlass::Vector. More...
 
struct  VectorTraits< Vector< T, Lanes > const >
 Partial specialization for actual cutlass::Vector. More...
 
struct  WmmaReshapeTile
 
struct  WmmaReshapeTile< Tile_, kAccessSize_, kLdsPerAccess_, true >
 
struct  ZipConvert
 Zips two convert operations. More...
 
struct  ZipFragment
 A template defining Fragment Concept. More...
 
struct  ZipTensorRef
 
struct  ZipTileAllocation
 Manages a pair of tile allocations as if they are one allocation. More...
 
class  ZipTileIterator
 Constructs an iterator from a pair of iterators. More...
 

Functions

template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord< Rank, Index > operator* (T s, Coord< Rank, Index > coord)
 Scalar multiplication. More...
 
template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord< Rank, Index > operator* (Coord< Rank, Index > coord, T s)
 Scalar multiplication. More...
 
template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord< Rank, Index > operator/ (T s, Coord< Rank, Index > coord)
 Scalar division. More...
 
template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord< Rank, Index > operator/ (Coord< Rank, Index > coord, T s)
 Scalar division. More...
 
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord (int _0)
 Helper to make a 1-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > make_Coord (int _0, int _1)
 Helper to make a 2-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > make_Coord (int _0, int _1, int _2)
 Helper to make a 3-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 4 > make_Coord (int _0, int _1, int _2, int _3)
 Helper to make a 4-element coordinate. More...
 
template<typename Shape_ >
CUTLASS_HOST_DEVICE Coord< 3 > make_Coord_from_shape ()
 
template<int Rank>
std::ostream & operator<< (std::ostream &out, Coord< Rank > const &coord)
 
template<typename T >
std::ostream & operator<< (std::ostream &out, ScalarIO< T > const &scalar)
 Default printing to ostream. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< int8_t > const &scalar)
 Printing to ostream of int8_t as integer rather than character. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< uint8_t > const &scalar)
 Printing to ostream of uint8_t as integer rather than character. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< cutlass::Vector< cutlass::bin1_t, 32 > > const &scalar)
 Printing to ostream of vector of 1b elements. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< cutlass::Vector< cutlass::int4_t, 8 > > const &scalar)
 Printing to ostream of vector of 4b signed integer elements. More...
 
template<>
std::ostream & operator<< (std::ostream &out, ScalarIO< cutlass::Vector< cutlass::uint4_t, 8 > > const &scalar)
 Printing to ostream of vector of 4b unsigned integer elements. More...
 
template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_load (InputIterator &iterator, Fragment &fragment)
 
template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void iterator_store (OutputIterator &iterator, Fragment &fragment)
 
template<typename TensorRef_ >
CUTLASS_HOST_DEVICE TensorRefBatchStrided< typename TensorRef_::Storage, TensorRef_::kRank, typename TensorRef_::MapFunc, TensorRef_::kStorageGrank, typename TensorRef_::Index, typename TensorRef_::LongIndex > make_TensorRefBatchStrided (TensorRef_ const &ref, typename TensorRef_::LongIndex batch_stride=0)
 Helper to construct a TensorRefBatchStrided<> object using type deduction. More...
 
template<typename dividend_t , typename divisor_t >
CUTLASS_HOST_DEVICE dividend_t round_nearest (dividend_t dividend, divisor_t divisor)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t gcd (value_t a, value_t b)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t lcm (value_t a, value_t b)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t clz (value_t x)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t find_log2 (value_t x)
 
__host__ CUTLASS_DEVICE cudaError_t cuda_perror_impl (cudaError_t error, const char *filename, int line)
 The corresponding error message is printed to stderr (or stdout in device code) along with the supplied source context. More...
 
template<>
struct __align__ (1) AlignedStruct< 1 >
 
template<>
struct __align__ (2) AlignedStruct< 2 >
 
template<>
struct __align__ (4) AlignedStruct< 4 >
 
template<>
struct __align__ (8) AlignedStruct< 8 >
 
template<>
struct __align__ (16) AlignedStruct< 16 >
 
template<>
struct __align__ (32) AlignedStruct< 32 >
 
template<>
struct __align__ (64) AlignedStruct< 64 >
 
template<typename Scalar_ >
CUTLASS_HOST_DEVICE void make_zero (Scalar_ &x)
 
template<typename Scalar_ , int kLanes_>
CUTLASS_HOST_DEVICE void make_zero (Vector< Scalar_, kLanes_ > &vec)
 
template<typename First , typename Second >
CUTLASS_HOST_DEVICE ZipFragment< First, Second > make_ZipFragment (First const &first, Second const &second)
 Helper to construct a ZipFragment object. More...
 
template<typename First , typename Second >
CUTLASS_HOST_DEVICE ZipConvert< First, Second > make_ZipConvert (First const &first, Second const &second)
 Helper to construct a ZipConvert object. More...
 
template<typename First , typename Second >
CUTLASS_HOST_DEVICE ZipTensorRef< First, Second > make_ZipTensorRef (First const &first, Second const &second)
 Constructs a ZipTensorRef. More...
 

Function Documentation

◆ __align__() [1/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [2/7]

template<>
struct cutlass::__align__ ( 64  )

◆ __align__() [3/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [4/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [5/7]

template<>
struct cutlass::__align__ ( )

◆ __align__() [6/7]

template<>
struct cutlass::__align__ ( 16  )

◆ __align__() [7/7]

template<>
struct cutlass::__align__ ( 32  )

◆ clz()

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::clz ( value_t  x)

log2 computation. What is the difference between the code below and the log2_up / log2_down templates?

◆ cuda_perror_impl()

__host__ CUTLASS_DEVICE cudaError_t cutlass::cuda_perror_impl ( cudaError_t  error,
const char *  filename,
int  line 
)
Returns
The CUDA error.

◆ find_log2()

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::find_log2 ( value_t  x)

◆ gcd()

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::gcd ( value_t  a,
value_t  b 
)

Greatest common divisor

◆ iterator_load()

template<typename InputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_load ( InputIterator &  iterator,
Fragment &  fragment 
)

◆ iterator_store()

template<typename OutputIterator , typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::iterator_store ( OutputIterator &  iterator,
Fragment &  fragment 
)

◆ lcm()

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::lcm ( value_t  a,
value_t  b 
)

Least common multiple

◆ make_Coord() [1/4]

CUTLASS_HOST_DEVICE Coord<1> cutlass::make_Coord ( int  _0)

◆ make_Coord() [2/4]

CUTLASS_HOST_DEVICE Coord<2> cutlass::make_Coord ( int  _0,
int  _1 
)

◆ make_Coord() [3/4]

CUTLASS_HOST_DEVICE Coord<3> cutlass::make_Coord ( int  _0,
int  _1,
int  _2 
)

◆ make_Coord() [4/4]

CUTLASS_HOST_DEVICE Coord<4> cutlass::make_Coord ( int  _0,
int  _1,
int  _2,
int  _3 
)

◆ make_Coord_from_shape()

template<typename Shape_ >
CUTLASS_HOST_DEVICE Coord<3> cutlass::make_Coord_from_shape ( )

◆ make_TensorRefBatchStrided()

template<typename TensorRef_ >
CUTLASS_HOST_DEVICE TensorRefBatchStrided< typename TensorRef_::Storage, TensorRef_::kRank, typename TensorRef_::MapFunc, TensorRef_::kStorageGrank, typename TensorRef_::Index, typename TensorRef_::LongIndex> cutlass::make_TensorRefBatchStrided ( TensorRef_ const &  ref,
typename TensorRef_::LongIndex  batch_stride = 0 
)

◆ make_zero() [1/2]

template<typename Scalar_ >
CUTLASS_HOST_DEVICE void cutlass::make_zero ( Scalar_ &  x)

◆ make_zero() [2/2]

template<typename Scalar_ , int kLanes_>
CUTLASS_HOST_DEVICE void cutlass::make_zero ( Vector< Scalar_, kLanes_ > &  vec)

◆ make_ZipConvert()

template<typename First , typename Second >
CUTLASS_HOST_DEVICE ZipConvert<First, Second> cutlass::make_ZipConvert ( First const &  first,
Second const &  second 
)

◆ make_ZipFragment()

template<typename First , typename Second >
CUTLASS_HOST_DEVICE ZipFragment<First, Second> cutlass::make_ZipFragment ( First const &  first,
Second const &  second 
)

◆ make_ZipTensorRef()

template<typename First , typename Second >
CUTLASS_HOST_DEVICE ZipTensorRef<First, Second> cutlass::make_ZipTensorRef ( First const &  first,
Second const &  second 
)

◆ operator*() [1/2]

template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord<Rank, Index> cutlass::operator* ( T  s,
Coord< Rank, Index >  coord 
)

◆ operator*() [2/2]

template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord<Rank, Index> cutlass::operator* ( Coord< Rank, Index >  coord,
T  s 
)

◆ operator/() [1/2]

template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord<Rank, Index> cutlass::operator/ ( T  s,
Coord< Rank, Index >  coord 
)

◆ operator/() [2/2]

template<typename T , int Rank, typename Index >
CUTLASS_HOST_DEVICE Coord<Rank, Index> cutlass::operator/ ( Coord< Rank, Index >  coord,
T  s 
)

◆ operator<<() [1/7]

template<int Rank>
std::ostream& cutlass::operator<< ( std::ostream &  out,
Coord< Rank > const &  coord 
)

◆ operator<<() [2/7]

template<typename T >
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< T > const &  scalar 
)
inline

◆ operator<<() [3/7]

template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< int8_t > const &  scalar 
)
inline

◆ operator<<() [4/7]

template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< uint8_t > const &  scalar 
)
inline

◆ operator<<() [5/7]

template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< cutlass::Vector< cutlass::bin1_t, 32 > > const &  scalar 
)
inline

◆ operator<<() [6/7]

template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< cutlass::Vector< cutlass::int4_t, 8 > > const &  scalar 
)
inline

◆ operator<<() [7/7]

template<>
std::ostream& cutlass::operator<< ( std::ostream &  out,
ScalarIO< cutlass::Vector< cutlass::uint4_t, 8 > > const &  scalar 
)
inline

◆ round_nearest()

template<typename dividend_t , typename divisor_t >
CUTLASS_HOST_DEVICE dividend_t cutlass::round_nearest ( dividend_t  dividend,
divisor_t  divisor 
)

Round dividend up to the nearest multiple of divisor