Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
load_store.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include "cutlass/vector.h"
31 namespace cutlass {
32 
34 
/// Enum to specify which memory space data resides in.
/// Carried as a template parameter by Load/Store below so iterators can tag
/// the address space of the pointers they traffic in.
struct MemorySpace {
  enum Kind {
    kGeneric,  ///< Data accessed through pointer dereferencing
    kShared,   ///< Data resides in shared memory
    kGlobal    ///< Data resides in global memory
  };
};
45 
49 };
50 
52 
/// Generic load: reads kAccessSize elements of Scalar_ with a single
/// AccessType-sized dereference.
///
/// \tparam Scalar_              element type in memory
/// \tparam kAccessSize          number of scalars per access
/// \tparam Memory_              memory space of the pointer (informational; the
///                              generic path dereferences it directly)
/// \tparam kFragmentElementType whether the destination fragment holds scalar
///                              values or WMMA matrices; selects the WMMA
///                              partial specializations below
/// \tparam FragmentElement_     element type of the destination fragment
/// \tparam kStride              leading-dimension stride (used by the WMMA
///                              specializations; unused here)
/// \tparam size                 access width in bytes; dispatches to the
///                              2/4/8/16-byte specializations below
template <typename Scalar_,
          int kAccessSize,
          MemorySpace::Kind Memory_,
          FragmentElementType::Kind kFragmentElementType = FragmentElementType::kScalar,
          typename FragmentElement_ = Scalar_,
          int kStride = 1,
          size_t size = (sizeof(Scalar_) * kAccessSize)>
struct Load {
  /// The output type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) {
    dst = *reinterpret_cast<AccessType const*>(pointer + offset);
  }
};
70 
72 
/// Partial specialization: 2-byte access, stride 1. Loads the two bytes as a
/// single uint16_t to guarantee one 16-bit transaction.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_>
struct Load<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2> {
  /// The output type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) {
    reinterpret_cast<uint16_t&>(dst) = reinterpret_cast<uint16_t const*>(&pointer[offset])[0];
  }
};
84 
86 
/// Partial specialization: 4-byte access. Loads one 32-bit word directly into
/// the fragment's register storage.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_, int kStride>
struct Load<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4> {
  /// The output type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) {
    dst.registers[0] = reinterpret_cast<uint32_t const*>(&pointer[offset])[0];
  }
};
98 
100 
/// Partial specialization: 8-byte access. Loads one uint2 (a single 64-bit
/// transaction) and scatters it into the fragment's two registers.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_, int kStride>
struct Load<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8> {
  /// The output type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) {
    uint2 tmp = reinterpret_cast<uint2 const*>(&pointer[offset])[0];
    dst.registers[0] = tmp.x;
    dst.registers[1] = tmp.y;
  }
};
113 
115 
/// Partial specialization: two doubles (16 bytes). Loads one double2 (a single
/// 128-bit transaction) and writes the two lanes element-wise.
template <MemorySpace::Kind Memory_, int kStride>
struct Load<double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16> {
  /// The output type.
  typedef typename Vectorize<double, 2>::Type AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, double const* pointer, int offset) {
    double2 tmp = reinterpret_cast<double2 const*>(&pointer[offset])[0];
    dst[0] = tmp.x;
    dst[1] = tmp.y;
  }
};
128 
130 
#if defined(__CUDACC_VERSION_MAJOR) && __CUDACC_VERSION_MAJOR < 10
// WAR bug in NVCC where the upper and lower half of the register end up being the same
// NOTE(review): NVCC's documented version macros are __CUDACC_VER_MAJOR__ /
// __CUDACC_VER_MINOR__; verify that __CUDACC_VERSION_MAJOR is defined somewhere
// in this project, otherwise this workaround is never compiled in.
template <MemorySpace::Kind Memory_, int kStride>
struct Load<half, 8, Memory_, FragmentElementType::kScalar, half, kStride, 16> {
  /// The output type.
  typedef typename Vectorize<half, 8>::Type AccessType;

  /// The load function: splits the 16-byte access into two 8-byte (int2) loads
  /// so the compiler cannot merge the halves into one mis-handled 128-bit load.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, half const* pointer, int offset) {
    int2 tmp = reinterpret_cast<int2 const*>(&pointer[offset])[0];
    dst.registers[0] = tmp.x;
    dst.registers[1] = tmp.y;

    // Second half: 4 halfs past the first load (8 bytes further along).
    tmp = reinterpret_cast<int2 const*>(&pointer[offset + 4])[0];
    dst.registers[2] = tmp.x;
    dst.registers[3] = tmp.y;
  }
};

#endif
151 
153 
/// Partial specialization: 16-byte access. Loads one uint4 (a single 128-bit
/// transaction) and scatters it into the fragment's four registers.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_, int kStride>
struct Load<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16> {
  /// The output type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) {
    uint4 tmp = reinterpret_cast<uint4 const*>(&pointer[offset])[0];
    dst.registers[0] = tmp.x;
    dst.registers[1] = tmp.y;
    dst.registers[2] = tmp.z;
    dst.registers[3] = tmp.w;
  }
};
168 
170 
/// Generic store: writes a fragment back to memory. Mirrors the primary Load
/// template; the byte-size specializations below handle the vectorized cases.
///
/// \tparam Scalar_              element type in memory
/// \tparam kAccessSize          number of scalars per access
/// \tparam Memory_              memory space of the destination pointer
/// \tparam kFragmentElementType whether the source fragment holds scalar values
///                              or WMMA matrices
/// \tparam FragmentElement_     element type of the source fragment
/// \tparam kStride              leading-dimension stride (used by the WMMA
///                              specialization; unused here)
/// \tparam size                 access width in bytes; dispatches to the
///                              2/4/8/16-byte specializations below
template <typename Scalar_,
          int kAccessSize,
          MemorySpace::Kind Memory_,
          FragmentElementType::Kind kFragmentElementType = FragmentElementType::kScalar,
          typename FragmentElement_ = Scalar_,
          int kStride = 1,
          size_t size = (sizeof(Scalar_) * kAccessSize)>
struct Store {
  /// The input type.
  typedef typename Vectorize<FragmentElement_, kAccessSize>::Type AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) {
    pointer[offset] = *reinterpret_cast<Scalar_ const*>(&src);
  }
};
187 
189 
/// Partial specialization: 2-byte access, stride 1. Stores the fragment's two
/// bytes as a single uint16_t write.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_>
struct Store<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2> {
  /// The input type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) {
    uint16_t* addr = reinterpret_cast<uint16_t*>(&pointer[offset]);
    addr[0] = reinterpret_cast<uint16_t const&>(src);
  }
};
201 
203 
/// Partial specialization: 4-byte access. Stores the fragment's first register
/// as a single 32-bit write.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_, int kStride>
struct Store<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4> {
  /// The input type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) {
    uint32_t* addr = reinterpret_cast<uint32_t*>(&pointer[offset]);
    addr[0] = src.registers[0];
  }
};
215 
217 
/// Partial specialization: 8-byte access. Packs two registers into a uint2 for
/// a single 64-bit write.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_, int kStride>
struct Store<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8> {
  /// The input type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) {
    uint2* addr = reinterpret_cast<uint2*>(&pointer[offset]);
    addr[0] = make_uint2(src.registers[0], src.registers[1]);
  }
};
229 
231 
/// Partial specialization: two doubles (16 bytes). Packs the pair into a
/// double2 for a single 128-bit write.
template <MemorySpace::Kind Memory_, int kStride>
struct Store<double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16> {
  /// The input type.
  typedef typename Vectorize<double, 2>::Type AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& src, double* pointer, int offset) {
    double2* addr = reinterpret_cast<double2*>(&pointer[offset]);
    addr[0] = make_double2(src[0], src[1]);
  }
};
243 
245 
/// Partial specialization: 16-byte access. Packs four registers into a uint4
/// for a single 128-bit write.
template <typename Scalar_, int kAccessSize, MemorySpace::Kind Memory_, int kStride>
struct Store<Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16> {
  /// The input type.
  typedef typename Vectorize<Scalar_, kAccessSize>::Type AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) {
    uint4* addr = reinterpret_cast<uint4*>(&pointer[offset]);
    addr[0] = make_uint4(src.registers[0], src.registers[1], src.registers[2], src.registers[3]);
  }
};
257 
259 
/// Partial specialization of Load for WMMA matrix fragments: delegates to the
/// fragment's own load() member, passing the leading dimension through.
template <typename Element_,
          int kAccess,
          MemorySpace::Kind Space_,
          typename WmmaFragment_,
          int kLdm,
          size_t kBytes>
struct Load<Element_,
            kAccess,
            Space_,
            FragmentElementType::kWmmaMatrix,
            WmmaFragment_,
            kLdm,
            kBytes> {
  /// The output type is the WMMA fragment itself.
  typedef WmmaFragment_ AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& value, Element_ const* pointer, int offset) {
    value.load(pointer + offset, kLdm);
  }
};
281 
283 
/// Partial specialization of Load for WMMA fragments over Vector<bin1_t, 32>:
/// each vector element packs 32 binary values, so the leading dimension handed
/// to the fragment's load() is scaled by 32.
template <int kAccess,
          MemorySpace::Kind Space_,
          typename WmmaFragment_,
          int kLdm,
          size_t kBytes>
struct Load<Vector<bin1_t, 32>,
            kAccess,
            Space_,
            FragmentElementType::kWmmaMatrix,
            WmmaFragment_,
            kLdm,
            kBytes> {
  /// The output type is the WMMA fragment itself.
  typedef WmmaFragment_ AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& value,
                                       Vector<bin1_t, 32> const* pointer,
                                       int offset) {
    int const ldm = kLdm * 32;
    value.load(pointer + offset, ldm);
  }
};
305 
307 
/// Partial specialization of Load for WMMA fragments over Vector<int4_t, 8>:
/// each vector element packs 8 signed 4-bit values, so the leading dimension
/// handed to the fragment's load() is scaled by 8.
template <int kAccess,
          MemorySpace::Kind Space_,
          typename WmmaFragment_,
          int kLdm,
          size_t kBytes>
struct Load<Vector<int4_t, 8>,
            kAccess,
            Space_,
            FragmentElementType::kWmmaMatrix,
            WmmaFragment_,
            kLdm,
            kBytes> {
  /// The output type is the WMMA fragment itself.
  typedef WmmaFragment_ AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& value,
                                       Vector<int4_t, 8> const* pointer,
                                       int offset) {
    int const ldm = kLdm * 8;
    value.load(pointer + offset, ldm);
  }
};
329 
331 
/// Partial specialization of Load for WMMA fragments over Vector<uint4_t, 8>:
/// each vector element packs 8 unsigned 4-bit values, so the leading dimension
/// handed to the fragment's load() is scaled by 8.
template <int kAccess,
          MemorySpace::Kind Space_,
          typename WmmaFragment_,
          int kLdm,
          size_t kBytes>
struct Load<Vector<uint4_t, 8>,
            kAccess,
            Space_,
            FragmentElementType::kWmmaMatrix,
            WmmaFragment_,
            kLdm,
            kBytes> {
  /// The output type is the WMMA fragment itself.
  typedef WmmaFragment_ AccessType;

  /// The load function.
  static CUTLASS_HOST_DEVICE void load(AccessType& value,
                                       Vector<uint4_t, 8> const* pointer,
                                       int offset) {
    int const ldm = kLdm * 8;
    value.load(pointer + offset, ldm);
  }
};
353 
/// Partial specialization of Store for WMMA matrix fragments: delegates to the
/// fragment's own store() member, passing the leading dimension through.
template <typename Element_,
          int kAccess,
          MemorySpace::Kind Space_,
          typename WmmaFragment_,
          int kLdm,
          size_t kBytes>
struct Store<Element_,
             kAccess,
             Space_,
             FragmentElementType::kWmmaMatrix,
             WmmaFragment_,
             kLdm,
             kBytes> {
  /// The input type is the WMMA fragment itself.
  typedef WmmaFragment_ AccessType;

  /// The store function.
  static CUTLASS_HOST_DEVICE void store(AccessType const& value, Element_* pointer, int offset) {
    value.store(pointer + offset, kLdm);
  }
};
376 
378 
379 } // namespace cutlass
static CUTLASS_HOST_DEVICE void load(AccessType &value, Vector< bin1_t, 32 > const *pointer, int offset)
The load function.
Definition: load_store.h:300
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:157
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:77
static CUTLASS_HOST_DEVICE void store(AccessType const &src, double *pointer, int offset)
The store function.
Definition: load_store.h:238
static CUTLASS_HOST_DEVICE void load(AccessType &value, Vector< int4_t, 8 > const *pointer, int offset)
The load function.
Definition: load_store.h:324
Definition: load_store.h:41
Definition: convert.h:33
Definition: numeric_types.h:39
Enum to specify which memory space data resides in.
Definition: load_store.h:38
static CUTLASS_HOST_DEVICE void store(AccessType const &src, Scalar_ *pointer, int offset)
The store function.
Definition: load_store.h:196
static CUTLASS_HOST_DEVICE void store(AccessType const &src, Scalar_ *pointer, int offset)
The store function.
Definition: load_store.h:252
Specifies whether iterator storage fragment consists of Scalar values or WMMA matrix.
Definition: load_store.h:47
Definition: load_store.h:42
Vectorize< double, 2 >::Type AccessType
The output type.
Definition: load_store.h:119
Vectorize< FragmentElement_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:180
Kind
Definition: load_store.h:39
Definition: load_store.h:178
static CUTLASS_HOST_DEVICE void load(AccessType &dst, Scalar_ const *pointer, int offset)
The load function.
Definition: load_store.h:160
uint32_t registers[kRegisters]
The data in registers.
Definition: vector.h:81
static CUTLASS_HOST_DEVICE void load(AccessType &value, Scalar_ const *pointer, int offset)
The load function.
Definition: load_store.h:277
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:193
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:104
Kind
Definition: load_store.h:48
Definition: load_store.h:40
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:62
static CUTLASS_HOST_DEVICE void load(AccessType &dst, Scalar_ const *pointer, int offset)
The load function.
Definition: load_store.h:107
Definition: vector.h:62
Definition: load_store.h:60
static CUTLASS_HOST_DEVICE void load(AccessType &dst, Scalar_ const *pointer, int offset)
The load function.
Definition: load_store.h:93
Definition: load_store.h:48
Vector< Element_, kLanes_ > Type
Definition: vector.h:271
Defines a 1D vector of elements held in the registers of each thread.
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:249
static CUTLASS_HOST_DEVICE void load(AccessType &value, Vector< uint4_t, 8 > const *pointer, int offset)
The load function.
Definition: load_store.h:348
Definition: numeric_types.h:43
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:90
static CUTLASS_HOST_DEVICE void store(AccessType const &src, Scalar_ *pointer, int offset)
The store function.
Definition: load_store.h:183
static CUTLASS_HOST_DEVICE void store(AccessType const &src, Scalar_ *pointer, int offset)
The store function.
Definition: load_store.h:224
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:221
Vectorize< Scalar_, kAccessSize >::Type AccessType
The output type.
Definition: load_store.h:207
static CUTLASS_HOST_DEVICE void store(AccessType const &value, Scalar_ *pointer, int offset)
The store function.
Definition: load_store.h:372
Definition: numeric_types.h:41
static CUTLASS_HOST_DEVICE void store(AccessType const &src, Scalar_ *pointer, int offset)
The store function.
Definition: load_store.h:210
Vectorize< double, 2 >::Type AccessType
The output type.
Definition: load_store.h:235
static CUTLASS_HOST_DEVICE void load(AccessType &dst, Scalar_ const *pointer, int offset)
The load function.
Definition: load_store.h:65
static CUTLASS_HOST_DEVICE void load(AccessType &dst, Scalar_ const *pointer, int offset)
The load function.
Definition: load_store.h:80
static CUTLASS_HOST_DEVICE void load(AccessType &dst, double const *pointer, int offset)
The load function.
Definition: load_store.h:122