37 template <
typename GlobalIterator_>
42 typedef typename GlobalIterator::Fragment
Fragment;
64 int const* src_int =
reinterpret_cast<int const*
>(&src[0]);
65 int* dst_int =
reinterpret_cast<int*
>(&dst[0]);
68 for (
int d = 0; d < FragmentShape::kD; ++d) {
69 for (
int h = 0; h < FragmentShape::kH / 4; ++h) {
70 for (
int w = 0; w < ShapeCount<FragmentShape>::kWc / 4; ++w) {
90 int b0, b1, b2, b3, c0;
91 asm volatile(
"prmt.b32 %0, %1, %2, 0x0040;" :
"=r"(b0) :
"r"(a0),
"r"(a1));
92 asm volatile(
"prmt.b32 %0, %1, %2, 0x0040;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
93 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b0) :
"r"(b0),
"r"(c0));
95 asm volatile(
"prmt.b32 %0, %1, %2, 0x0051;" :
"=r"(b1) :
"r"(a0),
"r"(a1));
96 asm volatile(
"prmt.b32 %0, %1, %2, 0x0051;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
97 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b1) :
"r"(b1),
"r"(c0));
99 asm volatile(
"prmt.b32 %0, %1, %2, 0x0062;" :
"=r"(b2) :
"r"(a0),
"r"(a1));
100 asm volatile(
"prmt.b32 %0, %1, %2, 0x0062;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
101 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b2) :
"r"(b2),
"r"(c0));
103 asm volatile(
"prmt.b32 %0, %1, %2, 0x0073;" :
"=r"(b3) :
"r"(a0),
"r"(a1));
104 asm volatile(
"prmt.b32 %0, %1, %2, 0x0073;" :
"=r"(c0) :
"r"(a2),
"r"(a3));
105 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b3) :
"r"(b3),
"r"(c0));
GlobalIterator::FragmentShape FragmentShape
The shape of the source fragment.
Definition: igemm_swizzle.h:44
Definition: igemm_swizzle.h:38
GlobalIterator_ GlobalIterator
The global iterator.
Definition: igemm_swizzle.h:40
CUTLASS_DEVICE void transform(Fragment const &src, Fragment &dst)
Transform a fragment.
Definition: igemm_swizzle.h:62
Fragment OutputFragment
The destination fragment.
Definition: igemm_swizzle.h:49
Fragment InputFragment
The source fragment.
Definition: igemm_swizzle.h:47
GlobalIterator::Fragment Fragment
The source fragment.
Definition: igemm_swizzle.h:42
CUTLASS_DEVICE IgemmSwizzle()
The src/dst must be int8 fragments.
Definition: igemm_swizzle.h:59
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...
Computes derived counts of a Layout Concept-based class.
Definition: shape.h:79