Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
fragment_multiply_add.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include "cutlass/fragment.h"
31 
32 namespace cutlass {
33 namespace gemm {
34 
36 
37 template < typename ScalarAlphaBeta_,
38  typename ScalarAccum_,
39  bool fragMul2 = true /*number of element per fragment is multiple of 2*/
40 >
45  typedef ScalarAlphaBeta_ ScalarAlphaBeta;
47  typedef ScalarAccum_ ScalarAccum;
48 
50  CUTLASS_DEVICE FragmentMultiplyAdd() {}
51 
53  template <typename FragmentB_, typename FragmentCd_>
54  CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const& b, FragmentCd_& d) {
55 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
56  int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
57  for (int j = 0; j < FragmentCd_::kElements; ++j) {
58  d[j] = b[j * kReduction + 0];
59  for (int k = 1; k < kReduction; ++k) {
60  d[j] += b[j * kReduction + k];
61  }
62  d[j] = a * ScalarAlphaBeta(d[j]);
63  }
64 #endif
65  }
66 
68  template <typename FragmentB_, typename FragmentCd_>
69  CUTLASS_DEVICE void multiply_add(ScalarAlphaBeta a,
70  FragmentB_ const& b,
71  FragmentCd_ const& c,
72  FragmentCd_& d) {
73 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
74  int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
75  for (int j = 0; j < FragmentCd_::kElements; ++j) {
76  d[j] = b[j * kReduction + 0];
77  for (int k = 1; k < kReduction; ++k) {
78  d[j] += b[j * kReduction + k];
79  }
80  d[j] = a * ScalarAlphaBeta(d[j]) + ScalarAlphaBeta(c[j]);
81  }
82 #endif
83  }
84 };
85 
87 
88 #if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
89 template <>
90 struct FragmentMultiplyAdd<half, half, true> {
94  typedef half ScalarAlphaBeta;
96  typedef half ScalarAccum;
97 
99  CUTLASS_DEVICE FragmentMultiplyAdd() {}
100 
102  template <typename FragmentB_, typename FragmentCd_>
103  CUTLASS_DEVICE void multiply(half a, FragmentB_ const& b, FragmentCd_& d) {
104 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
105  // The input.
106  __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
107  // The output.
108  __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);
109 
110  // Assemble a half2 from a.
111  __half2 const a_half2 = __half2half2(a);
112 
113  int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements);
114 
115  for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
116  d_half2[j] = __hmul2(a_half2, b_half2[j * kReduction + 0]);
117 
118  for (int k = 1; k < kReduction; ++k) {
119  d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
120  }
121  }
122 #endif
123  }
124 
125 
127  template <typename FragmentB_, typename FragmentCd_>
128  CUTLASS_DEVICE void multiply_add(half a,
129  FragmentB_ const& b,
130  FragmentCd_ const& c,
131  FragmentCd_& d) {
132 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
133  // The inputs.
134  __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
135  __half2 const* c_half2 = reinterpret_cast<__half2 const*>(&c[0]);
136  // The output.
137  __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);
138 
139  // Assemble a half2 from a.
140  __half2 const a_half2 = __half2half2(a);
141 
142  int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements);
143  for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
144  d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + 0], c_half2[j]);
145 
146  for (int k = 1; k < kReduction; ++k) {
147  d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
148  }
149  }
150 #endif
151  }
152 };
153 
154 #endif
155 
157 
158 } // namespace gemm
159 } // namespace cutlass
CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ &d)
Multiply : d = a*b.
Definition: fragment_multiply_add.h:54
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fragment_multiply_add.h:92
Definition: convert.h:33
half ScalarAlphaBeta
The type for alpha and beta.
Definition: fragment_multiply_add.h:94
CUTLASS_DEVICE FragmentMultiplyAdd()
Ctor.
Definition: fragment_multiply_add.h:50
CUTLASS_DEVICE FragmentMultiplyAdd()
Ctor.
Definition: fragment_multiply_add.h:99
CUTLASS_DEVICE void multiply(half a, FragmentB_ const &b, FragmentCd_ &d)
Multiply : d = a*b.
Definition: fragment_multiply_add.h:103
ScalarAccum_ ScalarAccum
The type for the accumulator.
Definition: fragment_multiply_add.h:47
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
ScalarAlphaBeta_ ScalarAlphaBeta
The type for alpha and beta.
Definition: fragment_multiply_add.h:45
CUTLASS_DEVICE void multiply_add(half a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)
Multiply : d = a*b + c.
Definition: fragment_multiply_add.h:128
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fragment_multiply_add.h:43
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...
half ScalarAccum
The type for the accumulator.
Definition: fragment_multiply_add.h:96
CUTLASS_DEVICE void multiply_add(ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)
Multiply : d = a*b + c.
Definition: fragment_multiply_add.h:69
Definition: fragment_multiply_add.h:41