#ifndef CUFFTDX_FFT_10000_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_10000_FP32_INV_PTX_HPP



// Specialization of cufftdx_private_function for <397, float, 1>, implemented
// as a single inline-PTX block. Judging by the include guard this header
// belongs to a size-10000 FP32 inverse FFT and is presumably generated code --
// TODO confirm before hand-editing any instruction below.
//
// Parameters
//   rmem  Per-thread array of complex<float>. rmem[0..9] (.x and .y) are read
//         as inputs and all twenty components are overwritten with the result.
//   smem  32-bit base address used for every st.shared / ld.shared access.
//         Each thread works inside the slice starting at
//         smem + 80000 * (tid.y + tid.x / 1000) bytes and indexes it with
//         t = tid.x % 1000.
//
// Outline of the PTX body
//   1. A 10-point butterfly on the register inputs, built from two 5-point
//      groups (even-indexed and odd-indexed inputs) that are then combined.
//      The float literals decode to ~0.309017, ~0.809017, ~0.951057 and
//      ~0.587785, i.e. cos(72deg), cos(36deg), sin(72deg), sin(36deg).
//   2. Three exchange rounds through shared memory. In each round one complex
//      factor w is loaded with ld.global from a lookup table, its powers
//      w^1..w^9 are formed by repeated complex multiplication, and butterfly
//      output k (k = 1..9) is stored as (w^k.x*a.x + w^k.y*a.y,
//      w^k.x*a.y - w^k.y*a.x), i.e. multiplied by the conjugate of w^k.
//      Output 0 is stored unscaled.
//        round 1: w = lut_sp_10_10000[t];       store at slice + 80*t + 8*k
//        round 2: w = lut_sp_10_1000[t / 10];   store at slice + 800*(t/10)
//                                                + 8*(t%10) + 80*k
//        round 3: w = lut_sp_10_100[t / 100];   store at slice + 8000*(t/100)
//                                                + 8*(t%100) + 800*k
//      After every round the ten values at slice + 8*t + 8000*k are loaded
//      back and fed to the next 10-point butterfly.
//   3. The last butterfly writes its results directly to the output operands
//      %0..%19 (rmem[0..9]).
//
// Synchronization: every exchange round executes "barrier.sync 0" once before
// its stores and once between its stores and loads (six barriers in total),
// all unconditionally. Callers therefore presumably must reach this function
// from every thread of the block -- verify against the call site.
//
// Operand map: %0..%19 are the outputs rmem[0..9].{x,y}; %20 is smem;
// %21..%23 are lut_sp_10_10000, lut_sp_10_1000 and lut_sp_10_100 (declared
// outside this view); %24..%49 are the rmem inputs. Several .y inputs are
// bound twice and one copy of each (%27, %30, %35, %38, %43, %46) is never
// referenced by the PTX text.
//
// NOTE(review): the asm statement declares no clobber list (in particular no
// "memory") although it stores to and loads from shared memory -- confirm
// this matches what the generator intends.
template<> __forceinline__ __device__ void cufftdx_private_function<397, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<924>;
.reg .b32 r<23>;
.reg .b64 rd<17>;
mov.u32 r1, %tid.y;
mov.u32 r2, %20;
mad.lo.s32 r3, r1, 80000, r2;
mov.u32 r4, %tid.x;
add.f32 f41, %29, %45;
add.f32 f42, %24, f41;
add.f32 f43, %34, %40;
add.f32 f44, f43, f42;
add.f32 f45, %31, %47;
add.f32 f46, %25, f45;
add.f32 f47, %36, %41;
add.f32 f48, f47, f46;
fma.rn.f32 f49, f41, 0f3E9E377A, %24;
mul.f32 f50, f43, 0f3F4F1BBD;
sub.f32 f51, f49, f50;
sub.f32 f52, %31, %47;
mul.f32 f53, f52, 0f3F737871;
sub.f32 f54, %36, %41;
fma.rn.f32 f55, f54, 0f3F167918, f53;
sub.f32 f56, f51, f55;
add.f32 f57, f55, f51;
mul.f32 f58, f41, 0f3F4F1BBD;
sub.f32 f59, %24, f58;
fma.rn.f32 f60, f43, 0f3E9E377A, f59;
mul.f32 f61, f52, 0f3F167918;
mul.f32 f62, f54, 0f3F737871;
sub.f32 f63, f61, f62;
sub.f32 f64, f60, f63;
add.f32 f65, f63, f60;
fma.rn.f32 f66, f45, 0f3E9E377A, %25;
mul.f32 f67, f47, 0f3F4F1BBD;
sub.f32 f68, f66, f67;
sub.f32 f69, %29, %45;
mul.f32 f70, f69, 0f3F737871;
sub.f32 f71, %34, %40;
fma.rn.f32 f72, f71, 0f3F167918, f70;
add.f32 f73, f72, f68;
sub.f32 f74, f68, f72;
mul.f32 f75, f45, 0f3F4F1BBD;
sub.f32 f76, %25, f75;
fma.rn.f32 f77, f47, 0f3E9E377A, f76;
mul.f32 f78, f69, 0f3F167918;
mul.f32 f79, f71, 0f3F737871;
sub.f32 f80, f78, f79;
add.f32 f81, f80, f77;
sub.f32 f82, f77, f80;
add.f32 f83, %32, %48;
add.f32 f84, %26, f83;
add.f32 f85, %37, %42;
add.f32 f86, f85, f84;
add.f32 f87, %33, %49;
add.f32 f88, %28, f87;
add.f32 f89, %39, %44;
add.f32 f90, f89, f88;
fma.rn.f32 f91, f83, 0f3E9E377A, %26;
mul.f32 f92, f85, 0f3F4F1BBD;
sub.f32 f93, f91, f92;
sub.f32 f94, %33, %49;
mul.f32 f95, f94, 0f3F737871;
sub.f32 f96, %39, %44;
fma.rn.f32 f97, f96, 0f3F167918, f95;
sub.f32 f98, f93, f97;
add.f32 f99, f97, f93;
mul.f32 f100, f83, 0f3F4F1BBD;
sub.f32 f101, %26, f100;
fma.rn.f32 f102, f85, 0f3E9E377A, f101;
mul.f32 f103, f94, 0f3F167918;
mul.f32 f104, f96, 0f3F737871;
sub.f32 f105, f103, f104;
sub.f32 f106, f102, f105;
add.f32 f107, f105, f102;
fma.rn.f32 f108, f87, 0f3E9E377A, %28;
mul.f32 f109, f89, 0f3F4F1BBD;
sub.f32 f110, f108, f109;
sub.f32 f111, %32, %48;
mul.f32 f112, f111, 0f3F737871;
sub.f32 f113, %37, %42;
fma.rn.f32 f114, f113, 0f3F167918, f112;
add.f32 f115, f114, f110;
sub.f32 f116, f110, f114;
mul.f32 f117, f87, 0f3F4F1BBD;
sub.f32 f118, %28, f117;
fma.rn.f32 f119, f89, 0f3E9E377A, f118;
mul.f32 f120, f111, 0f3F167918;
mul.f32 f121, f113, 0f3F737871;
sub.f32 f122, f120, f121;
add.f32 f123, f122, f119;
sub.f32 f124, f119, f122;
mul.f32 f125, f98, 0f3F4F1BBD;
mul.f32 f126, f115, 0f3F167918;
sub.f32 f127, f125, f126;
mul.f32 f128, f115, 0f3F4F1BBD;
fma.rn.f32 f129, f98, 0f3F167918, f128;
mul.f32 f130, f106, 0f3E9E377A;
mul.f32 f131, f123, 0f3F737871;
sub.f32 f132, f130, f131;
mul.f32 f133, f123, 0f3E9E377A;
fma.rn.f32 f134, f106, 0f3F737871, f133;
mul.f32 f135, f107, 0fBE9E377A;
mul.f32 f136, f124, 0f3F737871;
sub.f32 f137, f135, f136;
mul.f32 f138, f124, 0fBE9E377A;
fma.rn.f32 f139, f107, 0f3F737871, f138;
mul.f32 f140, f99, 0fBF4F1BBD;
mul.f32 f141, f116, 0f3F167918;
sub.f32 f142, f140, f141;
mul.f32 f143, f116, 0fBF4F1BBD;
fma.rn.f32 f144, f99, 0f3F167918, f143;
sub.f32 f145, f44, f86;
sub.f32 f146, f48, f90;
add.f32 f147, f56, f127;
add.f32 f148, f73, f129;
sub.f32 f149, f56, f127;
sub.f32 f150, f73, f129;
add.f32 f151, f64, f132;
add.f32 f152, f81, f134;
sub.f32 f153, f64, f132;
sub.f32 f154, f81, f134;
add.f32 f155, f65, f137;
add.f32 f156, f82, f139;
sub.f32 f157, f65, f137;
sub.f32 f158, f82, f139;
add.f32 f159, f57, f142;
add.f32 f160, f74, f144;
sub.f32 f161, f57, f142;
sub.f32 f162, f74, f144;
mul.wide.u32 rd2, r4, 274877907;
shr.u64 rd3, rd2, 38;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 1000;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 80000, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %21;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f163, f164}, [rd6];
mul.f32 f167, f148, f164;
mul.f32 f168, f147, f164;
mul.f32 f169, f163, f148;
mul.f32 f170, f163, f163;
mul.f32 f171, f164, f164;
sub.f32 f172, f170, f171;
mul.f32 f173, f164, f163;
fma.rn.f32 f174, f164, f163, f173;
mul.f32 f175, f152, f174;
mul.f32 f176, f151, f174;
mul.f32 f177, f172, f152;
mul.f32 f178, f163, f172;
mul.f32 f179, f164, f174;
sub.f32 f180, f178, f179;
mul.f32 f181, f163, f174;
fma.rn.f32 f182, f164, f172, f181;
mul.f32 f183, f156, f182;
mul.f32 f184, f155, f182;
mul.f32 f185, f180, f156;
mul.f32 f186, f163, f180;
mul.f32 f187, f164, f182;
sub.f32 f188, f186, f187;
mul.f32 f189, f163, f182;
fma.rn.f32 f190, f164, f180, f189;
mul.f32 f191, f160, f190;
mul.f32 f192, f159, f190;
mul.f32 f193, f188, f160;
mul.f32 f194, f163, f188;
mul.f32 f195, f164, f190;
sub.f32 f196, f194, f195;
mul.f32 f197, f163, f190;
fma.rn.f32 f198, f164, f188, f197;
mul.f32 f199, f146, f198;
mul.f32 f200, f145, f198;
mul.f32 f201, f196, f146;
mul.f32 f202, f163, f196;
mul.f32 f203, f164, f198;
sub.f32 f204, f202, f203;
mul.f32 f205, f163, f198;
fma.rn.f32 f206, f164, f196, f205;
mul.f32 f207, f150, f206;
mul.f32 f208, f149, f206;
mul.f32 f209, f204, f150;
mul.f32 f210, f163, f204;
mul.f32 f211, f164, f206;
sub.f32 f212, f210, f211;
mul.f32 f213, f163, f206;
fma.rn.f32 f214, f164, f204, f213;
mul.f32 f215, f154, f214;
mul.f32 f216, f153, f214;
mul.f32 f217, f212, f154;
mul.f32 f218, f163, f212;
mul.f32 f219, f164, f214;
sub.f32 f220, f218, f219;
mul.f32 f221, f163, f214;
fma.rn.f32 f222, f164, f212, f221;
mul.f32 f223, f158, f222;
mul.f32 f224, f157, f222;
mul.f32 f225, f220, f158;
mul.f32 f226, f163, f220;
mul.f32 f227, f164, f222;
sub.f32 f228, f226, f227;
mul.f32 f229, f163, f222;
fma.rn.f32 f230, f164, f220, f229;
mul.f32 f231, f162, f230;
mul.f32 f232, f161, f230;
mul.f32 f233, f228, f162;
barrier.sync 0;
mad.lo.s32 r9, r7, 80, r8;
add.f32 f234, f48, f90;
add.f32 f235, f44, f86;
st.shared.v2.f32 [r9], {f235, f234};
fma.rn.f32 f236, f163, f147, f167;
sub.f32 f237, f169, f168;
st.shared.v2.f32 [r9+8], {f236, f237};
fma.rn.f32 f238, f172, f151, f175;
sub.f32 f239, f177, f176;
st.shared.v2.f32 [r9+16], {f238, f239};
fma.rn.f32 f240, f180, f155, f183;
sub.f32 f241, f185, f184;
st.shared.v2.f32 [r9+24], {f240, f241};
sub.f32 f242, f193, f192;
fma.rn.f32 f243, f188, f159, f191;
st.shared.v2.f32 [r9+32], {f243, f242};
fma.rn.f32 f244, f196, f145, f199;
sub.f32 f245, f201, f200;
st.shared.v2.f32 [r9+40], {f244, f245};
fma.rn.f32 f246, f204, f149, f207;
sub.f32 f247, f209, f208;
st.shared.v2.f32 [r9+48], {f246, f247};
fma.rn.f32 f248, f212, f153, f215;
sub.f32 f249, f217, f216;
st.shared.v2.f32 [r9+56], {f248, f249};
fma.rn.f32 f250, f220, f157, f223;
sub.f32 f251, f225, f224;
st.shared.v2.f32 [r9+64], {f250, f251};
fma.rn.f32 f252, f228, f161, f231;
sub.f32 f253, f233, f232;
st.shared.v2.f32 [r9+72], {f252, f253};
barrier.sync 0;
mad.lo.s32 r10, r7, -72, r9;
ld.shared.v2.f32 {f254, f255}, [r10];
ld.shared.v2.f32 {f258, f259}, [r10+8000];
ld.shared.v2.f32 {f262, f263}, [r10+16000];
ld.shared.v2.f32 {f266, f267}, [r10+24000];
ld.shared.v2.f32 {f270, f271}, [r10+32000];
ld.shared.v2.f32 {f274, f275}, [r10+40000];
ld.shared.v2.f32 {f278, f279}, [r10+48000];
ld.shared.v2.f32 {f282, f283}, [r10+56000];
ld.shared.v2.f32 {f286, f287}, [r10+64000];
ld.shared.v2.f32 {f290, f291}, [r10+72000];
add.f32 f294, f262, f286;
add.f32 f295, f254, f294;
add.f32 f296, f270, f278;
add.f32 f297, f296, f295;
add.f32 f298, f263, f287;
add.f32 f299, f255, f298;
add.f32 f300, f271, f279;
add.f32 f301, f300, f299;
fma.rn.f32 f302, f294, 0f3E9E377A, f254;
mul.f32 f303, f296, 0f3F4F1BBD;
sub.f32 f304, f302, f303;
sub.f32 f305, f263, f287;
mul.f32 f306, f305, 0f3F737871;
sub.f32 f307, f271, f279;
fma.rn.f32 f308, f307, 0f3F167918, f306;
sub.f32 f309, f304, f308;
add.f32 f310, f308, f304;
mul.f32 f311, f294, 0f3F4F1BBD;
sub.f32 f312, f254, f311;
fma.rn.f32 f313, f296, 0f3E9E377A, f312;
mul.f32 f314, f305, 0f3F167918;
mul.f32 f315, f307, 0f3F737871;
sub.f32 f316, f314, f315;
sub.f32 f317, f313, f316;
add.f32 f318, f316, f313;
fma.rn.f32 f319, f298, 0f3E9E377A, f255;
mul.f32 f320, f300, 0f3F4F1BBD;
sub.f32 f321, f319, f320;
sub.f32 f322, f262, f286;
mul.f32 f323, f322, 0f3F737871;
sub.f32 f324, f270, f278;
fma.rn.f32 f325, f324, 0f3F167918, f323;
add.f32 f326, f325, f321;
sub.f32 f327, f321, f325;
mul.f32 f328, f298, 0f3F4F1BBD;
sub.f32 f329, f255, f328;
fma.rn.f32 f330, f300, 0f3E9E377A, f329;
mul.f32 f331, f322, 0f3F167918;
mul.f32 f332, f324, 0f3F737871;
sub.f32 f333, f331, f332;
add.f32 f334, f333, f330;
sub.f32 f335, f330, f333;
add.f32 f336, f266, f290;
add.f32 f337, f258, f336;
add.f32 f338, f274, f282;
add.f32 f339, f338, f337;
add.f32 f340, f267, f291;
add.f32 f341, f259, f340;
add.f32 f342, f275, f283;
add.f32 f343, f342, f341;
fma.rn.f32 f344, f336, 0f3E9E377A, f258;
mul.f32 f345, f338, 0f3F4F1BBD;
sub.f32 f346, f344, f345;
sub.f32 f347, f267, f291;
mul.f32 f348, f347, 0f3F737871;
sub.f32 f349, f275, f283;
fma.rn.f32 f350, f349, 0f3F167918, f348;
sub.f32 f351, f346, f350;
add.f32 f352, f350, f346;
mul.f32 f353, f336, 0f3F4F1BBD;
sub.f32 f354, f258, f353;
fma.rn.f32 f355, f338, 0f3E9E377A, f354;
mul.f32 f356, f347, 0f3F167918;
mul.f32 f357, f349, 0f3F737871;
sub.f32 f358, f356, f357;
sub.f32 f359, f355, f358;
add.f32 f360, f358, f355;
fma.rn.f32 f361, f340, 0f3E9E377A, f259;
mul.f32 f362, f342, 0f3F4F1BBD;
sub.f32 f363, f361, f362;
sub.f32 f364, f266, f290;
mul.f32 f365, f364, 0f3F737871;
sub.f32 f366, f274, f282;
fma.rn.f32 f367, f366, 0f3F167918, f365;
add.f32 f368, f367, f363;
sub.f32 f369, f363, f367;
mul.f32 f370, f340, 0f3F4F1BBD;
sub.f32 f371, f259, f370;
fma.rn.f32 f372, f342, 0f3E9E377A, f371;
mul.f32 f373, f364, 0f3F167918;
mul.f32 f374, f366, 0f3F737871;
sub.f32 f375, f373, f374;
add.f32 f376, f375, f372;
sub.f32 f377, f372, f375;
mul.f32 f378, f351, 0f3F4F1BBD;
mul.f32 f379, f368, 0f3F167918;
sub.f32 f380, f378, f379;
mul.f32 f381, f368, 0f3F4F1BBD;
fma.rn.f32 f382, f351, 0f3F167918, f381;
mul.f32 f383, f359, 0f3E9E377A;
mul.f32 f384, f376, 0f3F737871;
sub.f32 f385, f383, f384;
mul.f32 f386, f376, 0f3E9E377A;
fma.rn.f32 f387, f359, 0f3F737871, f386;
mul.f32 f388, f360, 0fBE9E377A;
mul.f32 f389, f377, 0f3F737871;
sub.f32 f390, f388, f389;
mul.f32 f391, f377, 0fBE9E377A;
fma.rn.f32 f392, f360, 0f3F737871, f391;
mul.f32 f393, f352, 0fBF4F1BBD;
mul.f32 f394, f369, 0f3F167918;
sub.f32 f395, f393, f394;
mul.f32 f396, f369, 0fBF4F1BBD;
fma.rn.f32 f397, f352, 0f3F167918, f396;
sub.f32 f398, f297, f339;
sub.f32 f399, f301, f343;
add.f32 f400, f309, f380;
add.f32 f401, f326, f382;
sub.f32 f402, f309, f380;
sub.f32 f403, f326, f382;
add.f32 f404, f317, f385;
add.f32 f405, f334, f387;
sub.f32 f406, f317, f385;
sub.f32 f407, f334, f387;
add.f32 f408, f318, f390;
add.f32 f409, f335, f392;
sub.f32 f410, f318, f390;
sub.f32 f411, f335, f392;
add.f32 f412, f310, f395;
add.f32 f413, f327, f397;
sub.f32 f414, f310, f395;
sub.f32 f415, f327, f397;
mul.wide.u32 rd7, r7, -858993459;
shr.u64 rd8, rd7, 35;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 10;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 8;
mov.u64 rd10, %22;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f416, f417}, [rd11];
mul.f32 f420, f401, f417;
mul.f32 f421, f400, f417;
mul.f32 f422, f416, f401;
mul.f32 f423, f416, f416;
mul.f32 f424, f417, f417;
sub.f32 f425, f423, f424;
mul.f32 f426, f417, f416;
fma.rn.f32 f427, f417, f416, f426;
mul.f32 f428, f405, f427;
mul.f32 f429, f404, f427;
mul.f32 f430, f425, f405;
mul.f32 f431, f416, f425;
mul.f32 f432, f417, f427;
sub.f32 f433, f431, f432;
mul.f32 f434, f416, f427;
fma.rn.f32 f435, f417, f425, f434;
mul.f32 f436, f409, f435;
mul.f32 f437, f408, f435;
mul.f32 f438, f433, f409;
mul.f32 f439, f416, f433;
mul.f32 f440, f417, f435;
sub.f32 f441, f439, f440;
mul.f32 f442, f416, f435;
fma.rn.f32 f443, f417, f433, f442;
mul.f32 f444, f413, f443;
mul.f32 f445, f412, f443;
mul.f32 f446, f441, f413;
mul.f32 f447, f416, f441;
mul.f32 f448, f417, f443;
sub.f32 f449, f447, f448;
mul.f32 f450, f416, f443;
fma.rn.f32 f451, f417, f441, f450;
mul.f32 f452, f399, f451;
mul.f32 f453, f398, f451;
mul.f32 f454, f449, f399;
mul.f32 f455, f416, f449;
mul.f32 f456, f417, f451;
sub.f32 f457, f455, f456;
mul.f32 f458, f416, f451;
fma.rn.f32 f459, f417, f449, f458;
mul.f32 f460, f403, f459;
mul.f32 f461, f402, f459;
mul.f32 f462, f457, f403;
mul.f32 f463, f416, f457;
mul.f32 f464, f417, f459;
sub.f32 f465, f463, f464;
mul.f32 f466, f416, f459;
fma.rn.f32 f467, f417, f457, f466;
mul.f32 f468, f407, f467;
mul.f32 f469, f406, f467;
mul.f32 f470, f465, f407;
mul.f32 f471, f416, f465;
mul.f32 f472, f417, f467;
sub.f32 f473, f471, f472;
mul.f32 f474, f416, f467;
fma.rn.f32 f475, f417, f465, f474;
mul.f32 f476, f411, f475;
mul.f32 f477, f410, f475;
mul.f32 f478, f473, f411;
mul.f32 f479, f416, f473;
mul.f32 f480, f417, f475;
sub.f32 f481, f479, f480;
mul.f32 f482, f416, f475;
fma.rn.f32 f483, f417, f473, f482;
mul.f32 f484, f415, f483;
mul.f32 f485, f414, f483;
mul.f32 f486, f481, f415;
shl.b32 r14, r13, 3;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 800, r15;
add.f32 f487, f301, f343;
add.f32 f488, f297, f339;
st.shared.v2.f32 [r16], {f488, f487};
fma.rn.f32 f489, f416, f400, f420;
sub.f32 f490, f422, f421;
st.shared.v2.f32 [r16+80], {f489, f490};
fma.rn.f32 f491, f425, f404, f428;
sub.f32 f492, f430, f429;
st.shared.v2.f32 [r16+160], {f491, f492};
fma.rn.f32 f493, f433, f408, f436;
sub.f32 f494, f438, f437;
st.shared.v2.f32 [r16+240], {f493, f494};
fma.rn.f32 f495, f441, f412, f444;
sub.f32 f496, f446, f445;
st.shared.v2.f32 [r16+320], {f495, f496};
sub.f32 f497, f454, f453;
fma.rn.f32 f498, f449, f398, f452;
st.shared.v2.f32 [r16+400], {f498, f497};
sub.f32 f499, f462, f461;
fma.rn.f32 f500, f457, f402, f460;
st.shared.v2.f32 [r16+480], {f500, f499};
fma.rn.f32 f501, f465, f406, f468;
sub.f32 f502, f470, f469;
st.shared.v2.f32 [r16+560], {f501, f502};
fma.rn.f32 f503, f473, f410, f476;
sub.f32 f504, f478, f477;
st.shared.v2.f32 [r16+640], {f503, f504};
fma.rn.f32 f505, f481, f414, f484;
sub.f32 f506, f486, f485;
st.shared.v2.f32 [r16+720], {f505, f506};
barrier.sync 0;
ld.shared.v2.f32 {f507, f508}, [r10];
ld.shared.v2.f32 {f511, f512}, [r10+8000];
ld.shared.v2.f32 {f515, f516}, [r10+16000];
ld.shared.v2.f32 {f519, f520}, [r10+24000];
ld.shared.v2.f32 {f523, f524}, [r10+32000];
ld.shared.v2.f32 {f527, f528}, [r10+40000];
ld.shared.v2.f32 {f531, f532}, [r10+48000];
ld.shared.v2.f32 {f535, f536}, [r10+56000];
ld.shared.v2.f32 {f539, f540}, [r10+64000];
ld.shared.v2.f32 {f543, f544}, [r10+72000];
add.f32 f547, f515, f539;
add.f32 f548, f507, f547;
add.f32 f549, f523, f531;
add.f32 f550, f549, f548;
add.f32 f551, f516, f540;
add.f32 f552, f508, f551;
add.f32 f553, f524, f532;
add.f32 f554, f553, f552;
fma.rn.f32 f555, f547, 0f3E9E377A, f507;
mul.f32 f556, f549, 0f3F4F1BBD;
sub.f32 f557, f555, f556;
sub.f32 f558, f516, f540;
mul.f32 f559, f558, 0f3F737871;
sub.f32 f560, f524, f532;
fma.rn.f32 f561, f560, 0f3F167918, f559;
sub.f32 f562, f557, f561;
add.f32 f563, f561, f557;
mul.f32 f564, f547, 0f3F4F1BBD;
sub.f32 f565, f507, f564;
fma.rn.f32 f566, f549, 0f3E9E377A, f565;
mul.f32 f567, f558, 0f3F167918;
mul.f32 f568, f560, 0f3F737871;
sub.f32 f569, f567, f568;
sub.f32 f570, f566, f569;
add.f32 f571, f569, f566;
fma.rn.f32 f572, f551, 0f3E9E377A, f508;
mul.f32 f573, f553, 0f3F4F1BBD;
sub.f32 f574, f572, f573;
sub.f32 f575, f515, f539;
mul.f32 f576, f575, 0f3F737871;
sub.f32 f577, f523, f531;
fma.rn.f32 f578, f577, 0f3F167918, f576;
add.f32 f579, f578, f574;
sub.f32 f580, f574, f578;
mul.f32 f581, f551, 0f3F4F1BBD;
sub.f32 f582, f508, f581;
fma.rn.f32 f583, f553, 0f3E9E377A, f582;
mul.f32 f584, f575, 0f3F167918;
mul.f32 f585, f577, 0f3F737871;
sub.f32 f586, f584, f585;
add.f32 f587, f586, f583;
sub.f32 f588, f583, f586;
add.f32 f589, f519, f543;
add.f32 f590, f511, f589;
add.f32 f591, f527, f535;
add.f32 f592, f591, f590;
add.f32 f593, f520, f544;
add.f32 f594, f512, f593;
add.f32 f595, f528, f536;
add.f32 f596, f595, f594;
fma.rn.f32 f597, f589, 0f3E9E377A, f511;
mul.f32 f598, f591, 0f3F4F1BBD;
sub.f32 f599, f597, f598;
sub.f32 f600, f520, f544;
mul.f32 f601, f600, 0f3F737871;
sub.f32 f602, f528, f536;
fma.rn.f32 f603, f602, 0f3F167918, f601;
sub.f32 f604, f599, f603;
add.f32 f605, f603, f599;
mul.f32 f606, f589, 0f3F4F1BBD;
sub.f32 f607, f511, f606;
fma.rn.f32 f608, f591, 0f3E9E377A, f607;
mul.f32 f609, f600, 0f3F167918;
mul.f32 f610, f602, 0f3F737871;
sub.f32 f611, f609, f610;
sub.f32 f612, f608, f611;
add.f32 f613, f611, f608;
fma.rn.f32 f614, f593, 0f3E9E377A, f512;
mul.f32 f615, f595, 0f3F4F1BBD;
sub.f32 f616, f614, f615;
sub.f32 f617, f519, f543;
mul.f32 f618, f617, 0f3F737871;
sub.f32 f619, f527, f535;
fma.rn.f32 f620, f619, 0f3F167918, f618;
add.f32 f621, f620, f616;
sub.f32 f622, f616, f620;
mul.f32 f623, f593, 0f3F4F1BBD;
sub.f32 f624, f512, f623;
fma.rn.f32 f625, f595, 0f3E9E377A, f624;
mul.f32 f626, f617, 0f3F167918;
mul.f32 f627, f619, 0f3F737871;
sub.f32 f628, f626, f627;
add.f32 f629, f628, f625;
sub.f32 f630, f625, f628;
mul.f32 f631, f604, 0f3F4F1BBD;
mul.f32 f632, f621, 0f3F167918;
sub.f32 f633, f631, f632;
mul.f32 f634, f621, 0f3F4F1BBD;
fma.rn.f32 f635, f604, 0f3F167918, f634;
mul.f32 f636, f612, 0f3E9E377A;
mul.f32 f637, f629, 0f3F737871;
sub.f32 f638, f636, f637;
mul.f32 f639, f629, 0f3E9E377A;
fma.rn.f32 f640, f612, 0f3F737871, f639;
mul.f32 f641, f613, 0fBE9E377A;
mul.f32 f642, f630, 0f3F737871;
sub.f32 f643, f641, f642;
mul.f32 f644, f630, 0fBE9E377A;
fma.rn.f32 f645, f613, 0f3F737871, f644;
mul.f32 f646, f605, 0fBF4F1BBD;
mul.f32 f647, f622, 0f3F167918;
sub.f32 f648, f646, f647;
mul.f32 f649, f622, 0fBF4F1BBD;
fma.rn.f32 f650, f605, 0f3F167918, f649;
sub.f32 f651, f550, f592;
sub.f32 f652, f554, f596;
add.f32 f653, f562, f633;
add.f32 f654, f579, f635;
sub.f32 f655, f562, f633;
sub.f32 f656, f579, f635;
add.f32 f657, f570, f638;
add.f32 f658, f587, f640;
sub.f32 f659, f570, f638;
sub.f32 f660, f587, f640;
add.f32 f661, f571, f643;
add.f32 f662, f588, f645;
sub.f32 f663, f571, f643;
sub.f32 f664, f588, f645;
add.f32 f665, f563, f648;
add.f32 f666, f580, f650;
sub.f32 f667, f563, f648;
sub.f32 f668, f580, f650;
mul.wide.u32 rd12, r7, 1374389535;
shr.u64 rd13, rd12, 37;
cvt.u32.u64 r17, rd13;
mul.lo.s32 r18, r17, 100;
sub.s32 r19, r7, r18;
mul.wide.u32 rd14, r17, 8;
mov.u64 rd15, %23;
add.s64 rd16, rd15, rd14;
ld.global.v2.f32 {f669, f670}, [rd16];
mul.f32 f673, f654, f670;
mul.f32 f674, f653, f670;
mul.f32 f675, f669, f654;
mul.f32 f676, f669, f669;
mul.f32 f677, f670, f670;
sub.f32 f678, f676, f677;
mul.f32 f679, f670, f669;
fma.rn.f32 f680, f670, f669, f679;
mul.f32 f681, f658, f680;
mul.f32 f682, f657, f680;
mul.f32 f683, f678, f658;
mul.f32 f684, f669, f678;
mul.f32 f685, f670, f680;
sub.f32 f686, f684, f685;
mul.f32 f687, f669, f680;
fma.rn.f32 f688, f670, f678, f687;
mul.f32 f689, f662, f688;
mul.f32 f690, f661, f688;
mul.f32 f691, f686, f662;
mul.f32 f692, f669, f686;
mul.f32 f693, f670, f688;
sub.f32 f694, f692, f693;
mul.f32 f695, f669, f688;
fma.rn.f32 f696, f670, f686, f695;
mul.f32 f697, f666, f696;
mul.f32 f698, f665, f696;
mul.f32 f699, f694, f666;
mul.f32 f700, f669, f694;
mul.f32 f701, f670, f696;
sub.f32 f702, f700, f701;
mul.f32 f703, f669, f696;
fma.rn.f32 f704, f670, f694, f703;
mul.f32 f705, f652, f704;
mul.f32 f706, f651, f704;
mul.f32 f707, f702, f652;
mul.f32 f708, f669, f702;
mul.f32 f709, f670, f704;
sub.f32 f710, f708, f709;
mul.f32 f711, f669, f704;
fma.rn.f32 f712, f670, f702, f711;
mul.f32 f713, f656, f712;
mul.f32 f714, f655, f712;
mul.f32 f715, f710, f656;
mul.f32 f716, f669, f710;
mul.f32 f717, f670, f712;
sub.f32 f718, f716, f717;
mul.f32 f719, f669, f712;
fma.rn.f32 f720, f670, f710, f719;
mul.f32 f721, f660, f720;
mul.f32 f722, f659, f720;
mul.f32 f723, f718, f660;
mul.f32 f724, f669, f718;
mul.f32 f725, f670, f720;
sub.f32 f726, f724, f725;
mul.f32 f727, f669, f720;
fma.rn.f32 f728, f670, f718, f727;
mul.f32 f729, f664, f728;
mul.f32 f730, f663, f728;
mul.f32 f731, f726, f664;
mul.f32 f732, f669, f726;
mul.f32 f733, f670, f728;
sub.f32 f734, f732, f733;
mul.f32 f735, f669, f728;
fma.rn.f32 f736, f670, f726, f735;
mul.f32 f737, f668, f736;
mul.f32 f738, f667, f736;
mul.f32 f739, f734, f668;
shl.b32 r20, r19, 3;
add.s32 r21, r8, r20;
barrier.sync 0;
mad.lo.s32 r22, r17, 8000, r21;
add.f32 f740, f554, f596;
add.f32 f741, f550, f592;
st.shared.v2.f32 [r22], {f741, f740};
fma.rn.f32 f742, f669, f653, f673;
sub.f32 f743, f675, f674;
st.shared.v2.f32 [r22+800], {f742, f743};
fma.rn.f32 f744, f678, f657, f681;
sub.f32 f745, f683, f682;
st.shared.v2.f32 [r22+1600], {f744, f745};
fma.rn.f32 f746, f686, f661, f689;
sub.f32 f747, f691, f690;
st.shared.v2.f32 [r22+2400], {f746, f747};
fma.rn.f32 f748, f694, f665, f697;
sub.f32 f749, f699, f698;
st.shared.v2.f32 [r22+3200], {f748, f749};
sub.f32 f750, f707, f706;
fma.rn.f32 f751, f702, f651, f705;
st.shared.v2.f32 [r22+4000], {f751, f750};
sub.f32 f752, f715, f714;
fma.rn.f32 f753, f710, f655, f713;
st.shared.v2.f32 [r22+4800], {f753, f752};
fma.rn.f32 f754, f718, f659, f721;
sub.f32 f755, f723, f722;
st.shared.v2.f32 [r22+5600], {f754, f755};
fma.rn.f32 f756, f726, f663, f729;
sub.f32 f757, f731, f730;
st.shared.v2.f32 [r22+6400], {f756, f757};
fma.rn.f32 f758, f734, f667, f737;
sub.f32 f759, f739, f738;
st.shared.v2.f32 [r22+7200], {f758, f759};
barrier.sync 0;
ld.shared.v2.f32 {f760, f761}, [r10];
ld.shared.v2.f32 {f764, f765}, [r10+8000];
ld.shared.v2.f32 {f768, f769}, [r10+16000];
ld.shared.v2.f32 {f772, f773}, [r10+24000];
ld.shared.v2.f32 {f776, f777}, [r10+32000];
ld.shared.v2.f32 {f780, f781}, [r10+40000];
ld.shared.v2.f32 {f784, f785}, [r10+48000];
ld.shared.v2.f32 {f788, f789}, [r10+56000];
ld.shared.v2.f32 {f792, f793}, [r10+64000];
ld.shared.v2.f32 {f796, f797}, [r10+72000];
add.f32 f800, f768, f792;
add.f32 f801, f760, f800;
add.f32 f802, f776, f784;
add.f32 f803, f802, f801;
add.f32 f804, f769, f793;
add.f32 f805, f761, f804;
add.f32 f806, f777, f785;
add.f32 f807, f806, f805;
fma.rn.f32 f808, f800, 0f3E9E377A, f760;
mul.f32 f809, f802, 0f3F4F1BBD;
sub.f32 f810, f808, f809;
sub.f32 f811, f769, f793;
mul.f32 f812, f811, 0f3F737871;
sub.f32 f813, f777, f785;
fma.rn.f32 f814, f813, 0f3F167918, f812;
sub.f32 f815, f810, f814;
add.f32 f816, f814, f810;
mul.f32 f817, f800, 0f3F4F1BBD;
sub.f32 f818, f760, f817;
fma.rn.f32 f819, f802, 0f3E9E377A, f818;
mul.f32 f820, f811, 0f3F167918;
mul.f32 f821, f813, 0f3F737871;
sub.f32 f822, f820, f821;
sub.f32 f823, f819, f822;
add.f32 f824, f822, f819;
fma.rn.f32 f825, f804, 0f3E9E377A, f761;
mul.f32 f826, f806, 0f3F4F1BBD;
sub.f32 f827, f825, f826;
sub.f32 f828, f768, f792;
mul.f32 f829, f828, 0f3F737871;
sub.f32 f830, f776, f784;
fma.rn.f32 f831, f830, 0f3F167918, f829;
add.f32 f832, f831, f827;
sub.f32 f833, f827, f831;
mul.f32 f834, f804, 0f3F4F1BBD;
sub.f32 f835, f761, f834;
fma.rn.f32 f836, f806, 0f3E9E377A, f835;
mul.f32 f837, f828, 0f3F167918;
mul.f32 f838, f830, 0f3F737871;
sub.f32 f839, f837, f838;
add.f32 f840, f839, f836;
sub.f32 f841, f836, f839;
add.f32 f842, f772, f796;
add.f32 f843, f764, f842;
add.f32 f844, f780, f788;
add.f32 f845, f844, f843;
add.f32 f846, f773, f797;
add.f32 f847, f765, f846;
add.f32 f848, f781, f789;
add.f32 f849, f848, f847;
fma.rn.f32 f850, f842, 0f3E9E377A, f764;
mul.f32 f851, f844, 0f3F4F1BBD;
sub.f32 f852, f850, f851;
sub.f32 f853, f773, f797;
mul.f32 f854, f853, 0f3F737871;
sub.f32 f855, f781, f789;
fma.rn.f32 f856, f855, 0f3F167918, f854;
sub.f32 f857, f852, f856;
add.f32 f858, f856, f852;
mul.f32 f859, f842, 0f3F4F1BBD;
sub.f32 f860, f764, f859;
fma.rn.f32 f861, f844, 0f3E9E377A, f860;
mul.f32 f862, f853, 0f3F167918;
mul.f32 f863, f855, 0f3F737871;
sub.f32 f864, f862, f863;
sub.f32 f865, f861, f864;
add.f32 f866, f864, f861;
fma.rn.f32 f867, f846, 0f3E9E377A, f765;
mul.f32 f868, f848, 0f3F4F1BBD;
sub.f32 f869, f867, f868;
sub.f32 f870, f772, f796;
mul.f32 f871, f870, 0f3F737871;
sub.f32 f872, f780, f788;
fma.rn.f32 f873, f872, 0f3F167918, f871;
add.f32 f874, f873, f869;
sub.f32 f875, f869, f873;
mul.f32 f876, f846, 0f3F4F1BBD;
sub.f32 f877, f765, f876;
fma.rn.f32 f878, f848, 0f3E9E377A, f877;
mul.f32 f879, f870, 0f3F167918;
mul.f32 f880, f872, 0f3F737871;
sub.f32 f881, f879, f880;
add.f32 f882, f881, f878;
sub.f32 f883, f878, f881;
mul.f32 f884, f857, 0f3F4F1BBD;
mul.f32 f885, f874, 0f3F167918;
sub.f32 f886, f884, f885;
mul.f32 f887, f874, 0f3F4F1BBD;
fma.rn.f32 f888, f857, 0f3F167918, f887;
mul.f32 f889, f865, 0f3E9E377A;
mul.f32 f890, f882, 0f3F737871;
sub.f32 f891, f889, f890;
mul.f32 f892, f882, 0f3E9E377A;
fma.rn.f32 f893, f865, 0f3F737871, f892;
mul.f32 f894, f866, 0fBE9E377A;
mul.f32 f895, f883, 0f3F737871;
sub.f32 f896, f894, f895;
mul.f32 f897, f883, 0fBE9E377A;
fma.rn.f32 f898, f866, 0f3F737871, f897;
mul.f32 f899, f858, 0fBF4F1BBD;
mul.f32 f900, f875, 0f3F167918;
sub.f32 f901, f899, f900;
mul.f32 f902, f875, 0fBF4F1BBD;
fma.rn.f32 f903, f858, 0f3F167918, f902;
add.f32 %1, f807, f849;
add.f32 %0, f803, f845;
add.f32 %3, f832, f888;
add.f32 %2, f815, f886;
add.f32 %5, f840, f893;
add.f32 %4, f823, f891;
add.f32 %7, f841, f898;
add.f32 %6, f824, f896;
add.f32 %9, f833, f903;
add.f32 %8, f816, f901;
sub.f32 %11, f807, f849;
sub.f32 %10, f803, f845;
sub.f32 %13, f832, f888;
sub.f32 %12, f815, f886;
sub.f32 %15, f840, f893;
sub.f32 %14, f823, f891;
sub.f32 %17, f841, f898;
sub.f32 %16, f824, f896;
sub.f32 %19, f833, f903;
sub.f32 %18, f816, f901;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_10000), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y));
};




// Specialization <398, float, 1> of cufftdx_private_function.
//
// The whole body is one inline-PTX block that transforms the ten complex<float>
// values held in rmem[0..9] in place, exchanging data with other threads through
// shared memory. Judging by the header guard (FFT_10000_FP32_INV) this is
// presumably one variant of a 10000-point single-precision inverse FFT with ten
// elements per thread -- TODO confirm against the generator.
//
// Parameters:
//   rmem - per-thread array; rmem[0..9].x/.y are read as inputs and all twenty
//          components are overwritten with the results.
//   smem - 32-bit value used only as the base byte address for the
//          st.shared / ld.shared instructions below.
//
// External data: lut_sp_10_10000, lut_sp_10_1000 and lut_sp_10_100 (declared
// elsewhere) are passed as 64-bit addresses and read with ld.global.v2.f32,
// one 8-byte (two-float) entry per thread and per round.
//
// Structure of the PTX, in instruction order:
//   1. A 10-point butterfly on the register inputs. The float constants
//      0f3E9E377A, 0f3F4F1BBD, 0f3F737871, 0f3F167918 are approximately
//      0.309, 0.809, 0.951 and 0.588 (cos/sin of 2*pi/5 and pi/5).
//   2. Three rounds, each consisting of:
//        a. load one complex factor w = (w.x, w.y) from a lookup table;
//        b. build w^2 .. w^9 by repeated complex multiplication and scale
//           butterfly outputs 1..9 as re = w.x*a.x + w.y*a.y,
//           im = w.x*a.y - w.y*a.x (output 0 is left unscaled);
//        c. store the ten real parts to shared memory, barrier, load ten
//           floats at a 4000-byte stride, barrier, then repeat the same
//           store/barrier/load sequence for the imaginary parts, reusing the
//           same shared-memory locations;
//        d. run another 10-point butterfly on the loaded values.
//      The butterfly that follows the third round writes the outputs %0..%19.
//
// Indexing (t = %tid.x % 1000, q = %tid.x / 1000, both obtained with
// multiply-high/shift sequences instead of a divide):
//   shared base  r8 = smem + %tid.y*40000 + q*40000
//   loads        r10 + k*4000, k = 0..9, with r10 = r8 + 4*t (all rounds)
//   round 1      table lut_sp_10_10000[t];     stores at r8 + 40*t + 4*k
//   round 2      table lut_sp_10_1000[t / 10]; stores at r8 + 400*(t/10) + 4*(t%10) + 40*k
//   round 3      table lut_sp_10_100[t / 100]; stores at r8 + 4000*(t/100) + 4*(t%100) + 400*k
// Each (%tid.y, q) group therefore touches a 40000-byte window of shared memory.
//
// Synchronization: the block contains twelve unconditional `barrier.sync 0`
// instructions, so presumably every thread taking part in barrier 0 has to
// call this function together -- TODO confirm against the caller.
template<> __forceinline__ __device__ void cufftdx_private_function<398, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map for the asm statement below:
//   %0  - %19 : outputs, rmem[0].x, rmem[0].y, ..., rmem[9].x, rmem[9].y
//   %20       : smem
//   %21 - %23 : lut_sp_10_10000, lut_sp_10_1000, lut_sp_10_100
//   %24 - %49 : inputs from rmem[0..9]; the .y component of rmem[1], [2], [4],
//               [5], [7] and [8] is listed twice, so input numbers are NOT
//               simply 24 + 2*i. The PTX only references one operand of each
//               duplicated pair (%28, %31, %36, %39, %44, %47).
// NOTE(review): no "memory" clobber is declared although the PTX reads and
// writes shared memory; the statement is `volatile` -- confirm this is intended.
asm volatile (R"({
.reg .f32 f<864>;
.reg .b32 r<23>;
.reg .b64 rd<17>;
mov.u32 r1, %tid.y;
mov.u32 r2, %20;
mad.lo.s32 r3, r1, 40000, r2;
mov.u32 r4, %tid.x;
add.f32 f41, %29, %45;
add.f32 f42, %24, f41;
add.f32 f43, %34, %40;
add.f32 f44, f43, f42;
add.f32 f45, %31, %47;
add.f32 f46, %25, f45;
add.f32 f47, %36, %41;
add.f32 f48, f47, f46;
fma.rn.f32 f49, f41, 0f3E9E377A, %24;
mul.f32 f50, f43, 0f3F4F1BBD;
sub.f32 f51, f49, f50;
sub.f32 f52, %31, %47;
mul.f32 f53, f52, 0f3F737871;
sub.f32 f54, %36, %41;
fma.rn.f32 f55, f54, 0f3F167918, f53;
sub.f32 f56, f51, f55;
add.f32 f57, f55, f51;
mul.f32 f58, f41, 0f3F4F1BBD;
sub.f32 f59, %24, f58;
fma.rn.f32 f60, f43, 0f3E9E377A, f59;
mul.f32 f61, f52, 0f3F167918;
mul.f32 f62, f54, 0f3F737871;
sub.f32 f63, f61, f62;
sub.f32 f64, f60, f63;
add.f32 f65, f63, f60;
fma.rn.f32 f66, f45, 0f3E9E377A, %25;
mul.f32 f67, f47, 0f3F4F1BBD;
sub.f32 f68, f66, f67;
sub.f32 f69, %29, %45;
mul.f32 f70, f69, 0f3F737871;
sub.f32 f71, %34, %40;
fma.rn.f32 f72, f71, 0f3F167918, f70;
add.f32 f73, f72, f68;
sub.f32 f74, f68, f72;
mul.f32 f75, f45, 0f3F4F1BBD;
sub.f32 f76, %25, f75;
fma.rn.f32 f77, f47, 0f3E9E377A, f76;
mul.f32 f78, f69, 0f3F167918;
mul.f32 f79, f71, 0f3F737871;
sub.f32 f80, f78, f79;
add.f32 f81, f80, f77;
sub.f32 f82, f77, f80;
add.f32 f83, %32, %48;
add.f32 f84, %26, f83;
add.f32 f85, %37, %42;
add.f32 f86, f85, f84;
add.f32 f87, %33, %49;
add.f32 f88, %28, f87;
add.f32 f89, %39, %44;
add.f32 f90, f89, f88;
fma.rn.f32 f91, f83, 0f3E9E377A, %26;
mul.f32 f92, f85, 0f3F4F1BBD;
sub.f32 f93, f91, f92;
sub.f32 f94, %33, %49;
mul.f32 f95, f94, 0f3F737871;
sub.f32 f96, %39, %44;
fma.rn.f32 f97, f96, 0f3F167918, f95;
sub.f32 f98, f93, f97;
add.f32 f99, f97, f93;
mul.f32 f100, f83, 0f3F4F1BBD;
sub.f32 f101, %26, f100;
fma.rn.f32 f102, f85, 0f3E9E377A, f101;
mul.f32 f103, f94, 0f3F167918;
mul.f32 f104, f96, 0f3F737871;
sub.f32 f105, f103, f104;
sub.f32 f106, f102, f105;
add.f32 f107, f105, f102;
fma.rn.f32 f108, f87, 0f3E9E377A, %28;
mul.f32 f109, f89, 0f3F4F1BBD;
sub.f32 f110, f108, f109;
sub.f32 f111, %32, %48;
mul.f32 f112, f111, 0f3F737871;
sub.f32 f113, %37, %42;
fma.rn.f32 f114, f113, 0f3F167918, f112;
add.f32 f115, f114, f110;
sub.f32 f116, f110, f114;
mul.f32 f117, f87, 0f3F4F1BBD;
sub.f32 f118, %28, f117;
fma.rn.f32 f119, f89, 0f3E9E377A, f118;
mul.f32 f120, f111, 0f3F167918;
mul.f32 f121, f113, 0f3F737871;
sub.f32 f122, f120, f121;
add.f32 f123, f122, f119;
sub.f32 f124, f119, f122;
mul.f32 f125, f98, 0f3F4F1BBD;
mul.f32 f126, f115, 0f3F167918;
sub.f32 f127, f125, f126;
mul.f32 f128, f115, 0f3F4F1BBD;
fma.rn.f32 f129, f98, 0f3F167918, f128;
mul.f32 f130, f106, 0f3E9E377A;
mul.f32 f131, f123, 0f3F737871;
sub.f32 f132, f130, f131;
mul.f32 f133, f123, 0f3E9E377A;
fma.rn.f32 f134, f106, 0f3F737871, f133;
mul.f32 f135, f107, 0fBE9E377A;
mul.f32 f136, f124, 0f3F737871;
sub.f32 f137, f135, f136;
mul.f32 f138, f124, 0fBE9E377A;
fma.rn.f32 f139, f107, 0f3F737871, f138;
mul.f32 f140, f99, 0fBF4F1BBD;
mul.f32 f141, f116, 0f3F167918;
sub.f32 f142, f140, f141;
mul.f32 f143, f116, 0fBF4F1BBD;
fma.rn.f32 f144, f99, 0f3F167918, f143;
add.f32 f145, f44, f86;
add.f32 f146, f48, f90;
sub.f32 f147, f44, f86;
sub.f32 f148, f48, f90;
add.f32 f149, f56, f127;
add.f32 f150, f73, f129;
sub.f32 f151, f56, f127;
sub.f32 f152, f73, f129;
add.f32 f153, f64, f132;
add.f32 f154, f81, f134;
sub.f32 f155, f64, f132;
sub.f32 f156, f81, f134;
add.f32 f157, f65, f137;
add.f32 f158, f82, f139;
sub.f32 f159, f65, f137;
sub.f32 f160, f82, f139;
add.f32 f161, f57, f142;
add.f32 f162, f74, f144;
sub.f32 f163, f57, f142;
sub.f32 f164, f74, f144;
mul.wide.u32 rd2, r4, 274877907;
shr.u64 rd3, rd2, 38;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 1000;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %21;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f165, f166}, [rd6];
mul.f32 f169, f150, f166;
fma.rn.f32 f170, f165, f149, f169;
mul.f32 f171, f149, f166;
mul.f32 f172, f165, f150;
sub.f32 f173, f172, f171;
mul.f32 f174, f165, f165;
mul.f32 f175, f166, f166;
sub.f32 f176, f174, f175;
mul.f32 f177, f166, f165;
fma.rn.f32 f178, f166, f165, f177;
mul.f32 f179, f154, f178;
fma.rn.f32 f180, f176, f153, f179;
mul.f32 f181, f153, f178;
mul.f32 f182, f176, f154;
sub.f32 f183, f182, f181;
mul.f32 f184, f165, f176;
mul.f32 f185, f166, f178;
sub.f32 f186, f184, f185;
mul.f32 f187, f165, f178;
fma.rn.f32 f188, f166, f176, f187;
mul.f32 f189, f158, f188;
fma.rn.f32 f190, f186, f157, f189;
mul.f32 f191, f157, f188;
mul.f32 f192, f186, f158;
sub.f32 f193, f192, f191;
mul.f32 f194, f165, f186;
mul.f32 f195, f166, f188;
sub.f32 f196, f194, f195;
mul.f32 f197, f165, f188;
fma.rn.f32 f198, f166, f186, f197;
mul.f32 f199, f162, f198;
fma.rn.f32 f200, f196, f161, f199;
mul.f32 f201, f161, f198;
mul.f32 f202, f196, f162;
sub.f32 f203, f202, f201;
mul.f32 f204, f165, f196;
mul.f32 f205, f166, f198;
sub.f32 f206, f204, f205;
mul.f32 f207, f165, f198;
fma.rn.f32 f208, f166, f196, f207;
mul.f32 f209, f148, f208;
fma.rn.f32 f210, f206, f147, f209;
mul.f32 f211, f147, f208;
mul.f32 f212, f206, f148;
sub.f32 f213, f212, f211;
mul.f32 f214, f165, f206;
mul.f32 f215, f166, f208;
sub.f32 f216, f214, f215;
mul.f32 f217, f165, f208;
fma.rn.f32 f218, f166, f206, f217;
mul.f32 f219, f152, f218;
fma.rn.f32 f220, f216, f151, f219;
mul.f32 f221, f151, f218;
mul.f32 f222, f216, f152;
sub.f32 f223, f222, f221;
mul.f32 f224, f165, f216;
mul.f32 f225, f166, f218;
sub.f32 f226, f224, f225;
mul.f32 f227, f165, f218;
fma.rn.f32 f228, f166, f216, f227;
mul.f32 f229, f156, f228;
fma.rn.f32 f230, f226, f155, f229;
mul.f32 f231, f155, f228;
mul.f32 f232, f226, f156;
sub.f32 f233, f232, f231;
mul.f32 f234, f165, f226;
mul.f32 f235, f166, f228;
sub.f32 f236, f234, f235;
mul.f32 f237, f165, f228;
fma.rn.f32 f238, f166, f226, f237;
mul.f32 f239, f160, f238;
fma.rn.f32 f240, f236, f159, f239;
mul.f32 f241, f159, f238;
mul.f32 f242, f236, f160;
sub.f32 f243, f242, f241;
mul.f32 f244, f165, f236;
mul.f32 f245, f166, f238;
sub.f32 f246, f244, f245;
mul.f32 f247, f165, f238;
fma.rn.f32 f248, f166, f236, f247;
mul.f32 f249, f164, f248;
fma.rn.f32 f250, f246, f163, f249;
mul.f32 f251, f163, f248;
mul.f32 f252, f246, f164;
sub.f32 f253, f252, f251;
mad.lo.s32 r8, r5, 40000, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 40, r8;
st.shared.v2.f32 [r9], {f145, f170};
st.shared.v2.f32 [r9+8], {f180, f190};
st.shared.v2.f32 [r9+16], {f200, f210};
st.shared.v2.f32 [r9+24], {f220, f230};
st.shared.v2.f32 [r9+32], {f240, f250};
barrier.sync 0;
mad.lo.s32 r10, r7, -36, r9;
ld.shared.f32 f254, [r10];
ld.shared.f32 f255, [r10+4000];
ld.shared.f32 f256, [r10+8000];
ld.shared.f32 f257, [r10+12000];
ld.shared.f32 f258, [r10+16000];
ld.shared.f32 f259, [r10+20000];
ld.shared.f32 f260, [r10+24000];
ld.shared.f32 f261, [r10+28000];
ld.shared.f32 f262, [r10+32000];
ld.shared.f32 f263, [r10+36000];
barrier.sync 0;
st.shared.v2.f32 [r9], {f146, f173};
st.shared.v2.f32 [r9+8], {f183, f193};
st.shared.v2.f32 [r9+16], {f203, f213};
st.shared.v2.f32 [r9+24], {f223, f233};
st.shared.v2.f32 [r9+32], {f243, f253};
barrier.sync 0;
ld.shared.f32 f264, [r10];
ld.shared.f32 f265, [r10+4000];
ld.shared.f32 f266, [r10+8000];
ld.shared.f32 f267, [r10+12000];
ld.shared.f32 f268, [r10+16000];
ld.shared.f32 f269, [r10+20000];
ld.shared.f32 f270, [r10+24000];
ld.shared.f32 f271, [r10+28000];
ld.shared.f32 f272, [r10+32000];
ld.shared.f32 f273, [r10+36000];
add.f32 f274, f256, f262;
add.f32 f275, f254, f274;
add.f32 f276, f258, f260;
add.f32 f277, f276, f275;
add.f32 f278, f266, f272;
add.f32 f279, f264, f278;
add.f32 f280, f268, f270;
add.f32 f281, f280, f279;
fma.rn.f32 f282, f274, 0f3E9E377A, f254;
mul.f32 f283, f276, 0f3F4F1BBD;
sub.f32 f284, f282, f283;
sub.f32 f285, f266, f272;
mul.f32 f286, f285, 0f3F737871;
sub.f32 f287, f268, f270;
fma.rn.f32 f288, f287, 0f3F167918, f286;
sub.f32 f289, f284, f288;
add.f32 f290, f288, f284;
mul.f32 f291, f274, 0f3F4F1BBD;
sub.f32 f292, f254, f291;
fma.rn.f32 f293, f276, 0f3E9E377A, f292;
mul.f32 f294, f285, 0f3F167918;
mul.f32 f295, f287, 0f3F737871;
sub.f32 f296, f294, f295;
sub.f32 f297, f293, f296;
add.f32 f298, f296, f293;
fma.rn.f32 f299, f278, 0f3E9E377A, f264;
mul.f32 f300, f280, 0f3F4F1BBD;
sub.f32 f301, f299, f300;
sub.f32 f302, f256, f262;
mul.f32 f303, f302, 0f3F737871;
sub.f32 f304, f258, f260;
fma.rn.f32 f305, f304, 0f3F167918, f303;
add.f32 f306, f305, f301;
sub.f32 f307, f301, f305;
mul.f32 f308, f278, 0f3F4F1BBD;
sub.f32 f309, f264, f308;
fma.rn.f32 f310, f280, 0f3E9E377A, f309;
mul.f32 f311, f302, 0f3F167918;
mul.f32 f312, f304, 0f3F737871;
sub.f32 f313, f311, f312;
add.f32 f314, f313, f310;
sub.f32 f315, f310, f313;
add.f32 f316, f257, f263;
add.f32 f317, f255, f316;
add.f32 f318, f259, f261;
add.f32 f319, f318, f317;
add.f32 f320, f267, f273;
add.f32 f321, f265, f320;
add.f32 f322, f269, f271;
add.f32 f323, f322, f321;
fma.rn.f32 f324, f316, 0f3E9E377A, f255;
mul.f32 f325, f318, 0f3F4F1BBD;
sub.f32 f326, f324, f325;
sub.f32 f327, f267, f273;
mul.f32 f328, f327, 0f3F737871;
sub.f32 f329, f269, f271;
fma.rn.f32 f330, f329, 0f3F167918, f328;
sub.f32 f331, f326, f330;
add.f32 f332, f330, f326;
mul.f32 f333, f316, 0f3F4F1BBD;
sub.f32 f334, f255, f333;
fma.rn.f32 f335, f318, 0f3E9E377A, f334;
mul.f32 f336, f327, 0f3F167918;
mul.f32 f337, f329, 0f3F737871;
sub.f32 f338, f336, f337;
sub.f32 f339, f335, f338;
add.f32 f340, f338, f335;
fma.rn.f32 f341, f320, 0f3E9E377A, f265;
mul.f32 f342, f322, 0f3F4F1BBD;
sub.f32 f343, f341, f342;
sub.f32 f344, f257, f263;
mul.f32 f345, f344, 0f3F737871;
sub.f32 f346, f259, f261;
fma.rn.f32 f347, f346, 0f3F167918, f345;
add.f32 f348, f347, f343;
sub.f32 f349, f343, f347;
mul.f32 f350, f320, 0f3F4F1BBD;
sub.f32 f351, f265, f350;
fma.rn.f32 f352, f322, 0f3E9E377A, f351;
mul.f32 f353, f344, 0f3F167918;
mul.f32 f354, f346, 0f3F737871;
sub.f32 f355, f353, f354;
add.f32 f356, f355, f352;
sub.f32 f357, f352, f355;
mul.f32 f358, f331, 0f3F4F1BBD;
mul.f32 f359, f348, 0f3F167918;
sub.f32 f360, f358, f359;
mul.f32 f361, f348, 0f3F4F1BBD;
fma.rn.f32 f362, f331, 0f3F167918, f361;
mul.f32 f363, f339, 0f3E9E377A;
mul.f32 f364, f356, 0f3F737871;
sub.f32 f365, f363, f364;
mul.f32 f366, f356, 0f3E9E377A;
fma.rn.f32 f367, f339, 0f3F737871, f366;
mul.f32 f368, f340, 0fBE9E377A;
mul.f32 f369, f357, 0f3F737871;
sub.f32 f370, f368, f369;
mul.f32 f371, f357, 0fBE9E377A;
fma.rn.f32 f372, f340, 0f3F737871, f371;
mul.f32 f373, f332, 0fBF4F1BBD;
mul.f32 f374, f349, 0f3F167918;
sub.f32 f375, f373, f374;
mul.f32 f376, f349, 0fBF4F1BBD;
fma.rn.f32 f377, f332, 0f3F167918, f376;
add.f32 f378, f277, f319;
add.f32 f379, f281, f323;
sub.f32 f380, f277, f319;
sub.f32 f381, f281, f323;
add.f32 f382, f289, f360;
add.f32 f383, f306, f362;
sub.f32 f384, f289, f360;
sub.f32 f385, f306, f362;
add.f32 f386, f297, f365;
add.f32 f387, f314, f367;
sub.f32 f388, f297, f365;
sub.f32 f389, f314, f367;
add.f32 f390, f298, f370;
add.f32 f391, f315, f372;
sub.f32 f392, f298, f370;
sub.f32 f393, f315, f372;
add.f32 f394, f290, f375;
add.f32 f395, f307, f377;
sub.f32 f396, f290, f375;
sub.f32 f397, f307, f377;
mul.wide.u32 rd7, r7, -858993459;
shr.u64 rd8, rd7, 35;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 10;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 8;
mov.u64 rd10, %22;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f398, f399}, [rd11];
mul.f32 f402, f383, f399;
fma.rn.f32 f403, f398, f382, f402;
mul.f32 f404, f382, f399;
mul.f32 f405, f398, f383;
sub.f32 f406, f405, f404;
mul.f32 f407, f398, f398;
mul.f32 f408, f399, f399;
sub.f32 f409, f407, f408;
mul.f32 f410, f399, f398;
fma.rn.f32 f411, f399, f398, f410;
mul.f32 f412, f387, f411;
fma.rn.f32 f413, f409, f386, f412;
mul.f32 f414, f386, f411;
mul.f32 f415, f409, f387;
sub.f32 f416, f415, f414;
mul.f32 f417, f398, f409;
mul.f32 f418, f399, f411;
sub.f32 f419, f417, f418;
mul.f32 f420, f398, f411;
fma.rn.f32 f421, f399, f409, f420;
mul.f32 f422, f391, f421;
fma.rn.f32 f423, f419, f390, f422;
mul.f32 f424, f390, f421;
mul.f32 f425, f419, f391;
sub.f32 f426, f425, f424;
mul.f32 f427, f398, f419;
mul.f32 f428, f399, f421;
sub.f32 f429, f427, f428;
mul.f32 f430, f398, f421;
fma.rn.f32 f431, f399, f419, f430;
mul.f32 f432, f395, f431;
fma.rn.f32 f433, f429, f394, f432;
mul.f32 f434, f394, f431;
mul.f32 f435, f429, f395;
sub.f32 f436, f435, f434;
mul.f32 f437, f398, f429;
mul.f32 f438, f399, f431;
sub.f32 f439, f437, f438;
mul.f32 f440, f398, f431;
fma.rn.f32 f441, f399, f429, f440;
mul.f32 f442, f381, f441;
fma.rn.f32 f443, f439, f380, f442;
mul.f32 f444, f380, f441;
mul.f32 f445, f439, f381;
sub.f32 f446, f445, f444;
mul.f32 f447, f398, f439;
mul.f32 f448, f399, f441;
sub.f32 f449, f447, f448;
mul.f32 f450, f398, f441;
fma.rn.f32 f451, f399, f439, f450;
mul.f32 f452, f385, f451;
fma.rn.f32 f453, f449, f384, f452;
mul.f32 f454, f384, f451;
mul.f32 f455, f449, f385;
sub.f32 f456, f455, f454;
mul.f32 f457, f398, f449;
mul.f32 f458, f399, f451;
sub.f32 f459, f457, f458;
mul.f32 f460, f398, f451;
fma.rn.f32 f461, f399, f449, f460;
mul.f32 f462, f389, f461;
fma.rn.f32 f463, f459, f388, f462;
mul.f32 f464, f388, f461;
mul.f32 f465, f459, f389;
sub.f32 f466, f465, f464;
mul.f32 f467, f398, f459;
mul.f32 f468, f399, f461;
sub.f32 f469, f467, f468;
mul.f32 f470, f398, f461;
fma.rn.f32 f471, f399, f459, f470;
mul.f32 f472, f393, f471;
fma.rn.f32 f473, f469, f392, f472;
mul.f32 f474, f392, f471;
mul.f32 f475, f469, f393;
sub.f32 f476, f475, f474;
mul.f32 f477, f398, f469;
mul.f32 f478, f399, f471;
sub.f32 f479, f477, f478;
mul.f32 f480, f398, f471;
fma.rn.f32 f481, f399, f469, f480;
mul.f32 f482, f397, f481;
fma.rn.f32 f483, f479, f396, f482;
mul.f32 f484, f396, f481;
mul.f32 f485, f479, f397;
sub.f32 f486, f485, f484;
shl.b32 r14, r13, 2;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 400, r15;
st.shared.f32 [r16], f378;
st.shared.f32 [r16+40], f403;
st.shared.f32 [r16+80], f413;
st.shared.f32 [r16+120], f423;
st.shared.f32 [r16+160], f433;
st.shared.f32 [r16+200], f443;
st.shared.f32 [r16+240], f453;
st.shared.f32 [r16+280], f463;
st.shared.f32 [r16+320], f473;
st.shared.f32 [r16+360], f483;
barrier.sync 0;
ld.shared.f32 f487, [r10];
ld.shared.f32 f488, [r10+4000];
ld.shared.f32 f489, [r10+8000];
ld.shared.f32 f490, [r10+12000];
ld.shared.f32 f491, [r10+16000];
ld.shared.f32 f492, [r10+20000];
ld.shared.f32 f493, [r10+24000];
ld.shared.f32 f494, [r10+28000];
ld.shared.f32 f495, [r10+32000];
ld.shared.f32 f496, [r10+36000];
barrier.sync 0;
st.shared.f32 [r16], f379;
st.shared.f32 [r16+40], f406;
st.shared.f32 [r16+80], f416;
st.shared.f32 [r16+120], f426;
st.shared.f32 [r16+160], f436;
st.shared.f32 [r16+200], f446;
st.shared.f32 [r16+240], f456;
st.shared.f32 [r16+280], f466;
st.shared.f32 [r16+320], f476;
st.shared.f32 [r16+360], f486;
barrier.sync 0;
ld.shared.f32 f497, [r10];
ld.shared.f32 f498, [r10+4000];
ld.shared.f32 f499, [r10+8000];
ld.shared.f32 f500, [r10+12000];
ld.shared.f32 f501, [r10+16000];
ld.shared.f32 f502, [r10+20000];
ld.shared.f32 f503, [r10+24000];
ld.shared.f32 f504, [r10+28000];
ld.shared.f32 f505, [r10+32000];
ld.shared.f32 f506, [r10+36000];
add.f32 f507, f489, f495;
add.f32 f508, f487, f507;
add.f32 f509, f491, f493;
add.f32 f510, f509, f508;
add.f32 f511, f499, f505;
add.f32 f512, f497, f511;
add.f32 f513, f501, f503;
add.f32 f514, f513, f512;
fma.rn.f32 f515, f507, 0f3E9E377A, f487;
mul.f32 f516, f509, 0f3F4F1BBD;
sub.f32 f517, f515, f516;
sub.f32 f518, f499, f505;
mul.f32 f519, f518, 0f3F737871;
sub.f32 f520, f501, f503;
fma.rn.f32 f521, f520, 0f3F167918, f519;
sub.f32 f522, f517, f521;
add.f32 f523, f521, f517;
mul.f32 f524, f507, 0f3F4F1BBD;
sub.f32 f525, f487, f524;
fma.rn.f32 f526, f509, 0f3E9E377A, f525;
mul.f32 f527, f518, 0f3F167918;
mul.f32 f528, f520, 0f3F737871;
sub.f32 f529, f527, f528;
sub.f32 f530, f526, f529;
add.f32 f531, f529, f526;
fma.rn.f32 f532, f511, 0f3E9E377A, f497;
mul.f32 f533, f513, 0f3F4F1BBD;
sub.f32 f534, f532, f533;
sub.f32 f535, f489, f495;
mul.f32 f536, f535, 0f3F737871;
sub.f32 f537, f491, f493;
fma.rn.f32 f538, f537, 0f3F167918, f536;
add.f32 f539, f538, f534;
sub.f32 f540, f534, f538;
mul.f32 f541, f511, 0f3F4F1BBD;
sub.f32 f542, f497, f541;
fma.rn.f32 f543, f513, 0f3E9E377A, f542;
mul.f32 f544, f535, 0f3F167918;
mul.f32 f545, f537, 0f3F737871;
sub.f32 f546, f544, f545;
add.f32 f547, f546, f543;
sub.f32 f548, f543, f546;
add.f32 f549, f490, f496;
add.f32 f550, f488, f549;
add.f32 f551, f492, f494;
add.f32 f552, f551, f550;
add.f32 f553, f500, f506;
add.f32 f554, f498, f553;
add.f32 f555, f502, f504;
add.f32 f556, f555, f554;
fma.rn.f32 f557, f549, 0f3E9E377A, f488;
mul.f32 f558, f551, 0f3F4F1BBD;
sub.f32 f559, f557, f558;
sub.f32 f560, f500, f506;
mul.f32 f561, f560, 0f3F737871;
sub.f32 f562, f502, f504;
fma.rn.f32 f563, f562, 0f3F167918, f561;
sub.f32 f564, f559, f563;
add.f32 f565, f563, f559;
mul.f32 f566, f549, 0f3F4F1BBD;
sub.f32 f567, f488, f566;
fma.rn.f32 f568, f551, 0f3E9E377A, f567;
mul.f32 f569, f560, 0f3F167918;
mul.f32 f570, f562, 0f3F737871;
sub.f32 f571, f569, f570;
sub.f32 f572, f568, f571;
add.f32 f573, f571, f568;
fma.rn.f32 f574, f553, 0f3E9E377A, f498;
mul.f32 f575, f555, 0f3F4F1BBD;
sub.f32 f576, f574, f575;
sub.f32 f577, f490, f496;
mul.f32 f578, f577, 0f3F737871;
sub.f32 f579, f492, f494;
fma.rn.f32 f580, f579, 0f3F167918, f578;
add.f32 f581, f580, f576;
sub.f32 f582, f576, f580;
mul.f32 f583, f553, 0f3F4F1BBD;
sub.f32 f584, f498, f583;
fma.rn.f32 f585, f555, 0f3E9E377A, f584;
mul.f32 f586, f577, 0f3F167918;
mul.f32 f587, f579, 0f3F737871;
sub.f32 f588, f586, f587;
add.f32 f589, f588, f585;
sub.f32 f590, f585, f588;
mul.f32 f591, f564, 0f3F4F1BBD;
mul.f32 f592, f581, 0f3F167918;
sub.f32 f593, f591, f592;
mul.f32 f594, f581, 0f3F4F1BBD;
fma.rn.f32 f595, f564, 0f3F167918, f594;
mul.f32 f596, f572, 0f3E9E377A;
mul.f32 f597, f589, 0f3F737871;
sub.f32 f598, f596, f597;
mul.f32 f599, f589, 0f3E9E377A;
fma.rn.f32 f600, f572, 0f3F737871, f599;
mul.f32 f601, f573, 0fBE9E377A;
mul.f32 f602, f590, 0f3F737871;
sub.f32 f603, f601, f602;
mul.f32 f604, f590, 0fBE9E377A;
fma.rn.f32 f605, f573, 0f3F737871, f604;
mul.f32 f606, f565, 0fBF4F1BBD;
mul.f32 f607, f582, 0f3F167918;
sub.f32 f608, f606, f607;
mul.f32 f609, f582, 0fBF4F1BBD;
fma.rn.f32 f610, f565, 0f3F167918, f609;
add.f32 f611, f510, f552;
add.f32 f612, f514, f556;
sub.f32 f613, f510, f552;
sub.f32 f614, f514, f556;
add.f32 f615, f522, f593;
add.f32 f616, f539, f595;
sub.f32 f617, f522, f593;
sub.f32 f618, f539, f595;
add.f32 f619, f530, f598;
add.f32 f620, f547, f600;
sub.f32 f621, f530, f598;
sub.f32 f622, f547, f600;
add.f32 f623, f531, f603;
add.f32 f624, f548, f605;
sub.f32 f625, f531, f603;
sub.f32 f626, f548, f605;
add.f32 f627, f523, f608;
add.f32 f628, f540, f610;
sub.f32 f629, f523, f608;
sub.f32 f630, f540, f610;
mul.wide.u32 rd12, r7, 1374389535;
shr.u64 rd13, rd12, 37;
cvt.u32.u64 r17, rd13;
mul.lo.s32 r18, r17, 100;
sub.s32 r19, r7, r18;
mul.wide.u32 rd14, r17, 8;
mov.u64 rd15, %23;
add.s64 rd16, rd15, rd14;
ld.global.v2.f32 {f631, f632}, [rd16];
mul.f32 f635, f616, f632;
fma.rn.f32 f636, f631, f615, f635;
mul.f32 f637, f615, f632;
mul.f32 f638, f631, f616;
sub.f32 f639, f638, f637;
mul.f32 f640, f631, f631;
mul.f32 f641, f632, f632;
sub.f32 f642, f640, f641;
mul.f32 f643, f632, f631;
fma.rn.f32 f644, f632, f631, f643;
mul.f32 f645, f620, f644;
fma.rn.f32 f646, f642, f619, f645;
mul.f32 f647, f619, f644;
mul.f32 f648, f642, f620;
sub.f32 f649, f648, f647;
mul.f32 f650, f631, f642;
mul.f32 f651, f632, f644;
sub.f32 f652, f650, f651;
mul.f32 f653, f631, f644;
fma.rn.f32 f654, f632, f642, f653;
mul.f32 f655, f624, f654;
fma.rn.f32 f656, f652, f623, f655;
mul.f32 f657, f623, f654;
mul.f32 f658, f652, f624;
sub.f32 f659, f658, f657;
mul.f32 f660, f631, f652;
mul.f32 f661, f632, f654;
sub.f32 f662, f660, f661;
mul.f32 f663, f631, f654;
fma.rn.f32 f664, f632, f652, f663;
mul.f32 f665, f628, f664;
fma.rn.f32 f666, f662, f627, f665;
mul.f32 f667, f627, f664;
mul.f32 f668, f662, f628;
sub.f32 f669, f668, f667;
mul.f32 f670, f631, f662;
mul.f32 f671, f632, f664;
sub.f32 f672, f670, f671;
mul.f32 f673, f631, f664;
fma.rn.f32 f674, f632, f662, f673;
mul.f32 f675, f614, f674;
fma.rn.f32 f676, f672, f613, f675;
mul.f32 f677, f613, f674;
mul.f32 f678, f672, f614;
sub.f32 f679, f678, f677;
mul.f32 f680, f631, f672;
mul.f32 f681, f632, f674;
sub.f32 f682, f680, f681;
mul.f32 f683, f631, f674;
fma.rn.f32 f684, f632, f672, f683;
mul.f32 f685, f618, f684;
fma.rn.f32 f686, f682, f617, f685;
mul.f32 f687, f617, f684;
mul.f32 f688, f682, f618;
sub.f32 f689, f688, f687;
mul.f32 f690, f631, f682;
mul.f32 f691, f632, f684;
sub.f32 f692, f690, f691;
mul.f32 f693, f631, f684;
fma.rn.f32 f694, f632, f682, f693;
mul.f32 f695, f622, f694;
fma.rn.f32 f696, f692, f621, f695;
mul.f32 f697, f621, f694;
mul.f32 f698, f692, f622;
sub.f32 f699, f698, f697;
mul.f32 f700, f631, f692;
mul.f32 f701, f632, f694;
sub.f32 f702, f700, f701;
mul.f32 f703, f631, f694;
fma.rn.f32 f704, f632, f692, f703;
mul.f32 f705, f626, f704;
fma.rn.f32 f706, f702, f625, f705;
mul.f32 f707, f625, f704;
mul.f32 f708, f702, f626;
sub.f32 f709, f708, f707;
mul.f32 f710, f631, f702;
mul.f32 f711, f632, f704;
sub.f32 f712, f710, f711;
mul.f32 f713, f631, f704;
fma.rn.f32 f714, f632, f702, f713;
mul.f32 f715, f630, f714;
fma.rn.f32 f716, f712, f629, f715;
mul.f32 f717, f629, f714;
mul.f32 f718, f712, f630;
sub.f32 f719, f718, f717;
shl.b32 r20, r19, 2;
add.s32 r21, r8, r20;
barrier.sync 0;
mad.lo.s32 r22, r17, 4000, r21;
st.shared.f32 [r22], f611;
st.shared.f32 [r22+400], f636;
st.shared.f32 [r22+800], f646;
st.shared.f32 [r22+1200], f656;
st.shared.f32 [r22+1600], f666;
st.shared.f32 [r22+2000], f676;
st.shared.f32 [r22+2400], f686;
st.shared.f32 [r22+2800], f696;
st.shared.f32 [r22+3200], f706;
st.shared.f32 [r22+3600], f716;
barrier.sync 0;
ld.shared.f32 f720, [r10];
ld.shared.f32 f721, [r10+4000];
ld.shared.f32 f722, [r10+8000];
ld.shared.f32 f723, [r10+12000];
ld.shared.f32 f724, [r10+16000];
ld.shared.f32 f725, [r10+20000];
ld.shared.f32 f726, [r10+24000];
ld.shared.f32 f727, [r10+28000];
ld.shared.f32 f728, [r10+32000];
ld.shared.f32 f729, [r10+36000];
barrier.sync 0;
st.shared.f32 [r22], f612;
st.shared.f32 [r22+400], f639;
st.shared.f32 [r22+800], f649;
st.shared.f32 [r22+1200], f659;
st.shared.f32 [r22+1600], f669;
st.shared.f32 [r22+2000], f679;
st.shared.f32 [r22+2400], f689;
st.shared.f32 [r22+2800], f699;
st.shared.f32 [r22+3200], f709;
st.shared.f32 [r22+3600], f719;
barrier.sync 0;
ld.shared.f32 f730, [r10];
ld.shared.f32 f731, [r10+4000];
ld.shared.f32 f732, [r10+8000];
ld.shared.f32 f733, [r10+12000];
ld.shared.f32 f734, [r10+16000];
ld.shared.f32 f735, [r10+20000];
ld.shared.f32 f736, [r10+24000];
ld.shared.f32 f737, [r10+28000];
ld.shared.f32 f738, [r10+32000];
ld.shared.f32 f739, [r10+36000];
add.f32 f740, f722, f728;
add.f32 f741, f720, f740;
add.f32 f742, f724, f726;
add.f32 f743, f742, f741;
add.f32 f744, f732, f738;
add.f32 f745, f730, f744;
add.f32 f746, f734, f736;
add.f32 f747, f746, f745;
fma.rn.f32 f748, f740, 0f3E9E377A, f720;
mul.f32 f749, f742, 0f3F4F1BBD;
sub.f32 f750, f748, f749;
sub.f32 f751, f732, f738;
mul.f32 f752, f751, 0f3F737871;
sub.f32 f753, f734, f736;
fma.rn.f32 f754, f753, 0f3F167918, f752;
sub.f32 f755, f750, f754;
add.f32 f756, f754, f750;
mul.f32 f757, f740, 0f3F4F1BBD;
sub.f32 f758, f720, f757;
fma.rn.f32 f759, f742, 0f3E9E377A, f758;
mul.f32 f760, f751, 0f3F167918;
mul.f32 f761, f753, 0f3F737871;
sub.f32 f762, f760, f761;
sub.f32 f763, f759, f762;
add.f32 f764, f762, f759;
fma.rn.f32 f765, f744, 0f3E9E377A, f730;
mul.f32 f766, f746, 0f3F4F1BBD;
sub.f32 f767, f765, f766;
sub.f32 f768, f722, f728;
mul.f32 f769, f768, 0f3F737871;
sub.f32 f770, f724, f726;
fma.rn.f32 f771, f770, 0f3F167918, f769;
add.f32 f772, f771, f767;
sub.f32 f773, f767, f771;
mul.f32 f774, f744, 0f3F4F1BBD;
sub.f32 f775, f730, f774;
fma.rn.f32 f776, f746, 0f3E9E377A, f775;
mul.f32 f777, f768, 0f3F167918;
mul.f32 f778, f770, 0f3F737871;
sub.f32 f779, f777, f778;
add.f32 f780, f779, f776;
sub.f32 f781, f776, f779;
add.f32 f782, f723, f729;
add.f32 f783, f721, f782;
add.f32 f784, f725, f727;
add.f32 f785, f784, f783;
add.f32 f786, f733, f739;
add.f32 f787, f731, f786;
add.f32 f788, f735, f737;
add.f32 f789, f788, f787;
fma.rn.f32 f790, f782, 0f3E9E377A, f721;
mul.f32 f791, f784, 0f3F4F1BBD;
sub.f32 f792, f790, f791;
sub.f32 f793, f733, f739;
mul.f32 f794, f793, 0f3F737871;
sub.f32 f795, f735, f737;
fma.rn.f32 f796, f795, 0f3F167918, f794;
sub.f32 f797, f792, f796;
add.f32 f798, f796, f792;
mul.f32 f799, f782, 0f3F4F1BBD;
sub.f32 f800, f721, f799;
fma.rn.f32 f801, f784, 0f3E9E377A, f800;
mul.f32 f802, f793, 0f3F167918;
mul.f32 f803, f795, 0f3F737871;
sub.f32 f804, f802, f803;
sub.f32 f805, f801, f804;
add.f32 f806, f804, f801;
fma.rn.f32 f807, f786, 0f3E9E377A, f731;
mul.f32 f808, f788, 0f3F4F1BBD;
sub.f32 f809, f807, f808;
sub.f32 f810, f723, f729;
mul.f32 f811, f810, 0f3F737871;
sub.f32 f812, f725, f727;
fma.rn.f32 f813, f812, 0f3F167918, f811;
add.f32 f814, f813, f809;
sub.f32 f815, f809, f813;
mul.f32 f816, f786, 0f3F4F1BBD;
sub.f32 f817, f731, f816;
fma.rn.f32 f818, f788, 0f3E9E377A, f817;
mul.f32 f819, f810, 0f3F167918;
mul.f32 f820, f812, 0f3F737871;
sub.f32 f821, f819, f820;
add.f32 f822, f821, f818;
sub.f32 f823, f818, f821;
mul.f32 f824, f797, 0f3F4F1BBD;
mul.f32 f825, f814, 0f3F167918;
sub.f32 f826, f824, f825;
mul.f32 f827, f814, 0f3F4F1BBD;
fma.rn.f32 f828, f797, 0f3F167918, f827;
mul.f32 f829, f805, 0f3E9E377A;
mul.f32 f830, f822, 0f3F737871;
sub.f32 f831, f829, f830;
mul.f32 f832, f822, 0f3E9E377A;
fma.rn.f32 f833, f805, 0f3F737871, f832;
mul.f32 f834, f806, 0fBE9E377A;
mul.f32 f835, f823, 0f3F737871;
sub.f32 f836, f834, f835;
mul.f32 f837, f823, 0fBE9E377A;
fma.rn.f32 f838, f806, 0f3F737871, f837;
mul.f32 f839, f798, 0fBF4F1BBD;
mul.f32 f840, f815, 0f3F167918;
sub.f32 f841, f839, f840;
mul.f32 f842, f815, 0fBF4F1BBD;
fma.rn.f32 f843, f798, 0f3F167918, f842;
add.f32 %0, f743, f785;
add.f32 %1, f747, f789;
add.f32 %3, f772, f828;
add.f32 %2, f755, f826;
add.f32 %5, f780, f833;
add.f32 %4, f763, f831;
add.f32 %7, f781, f838;
add.f32 %6, f764, f836;
add.f32 %9, f773, f843;
add.f32 %8, f756, f841;
sub.f32 %10, f743, f785;
sub.f32 %11, f747, f789;
sub.f32 %13, f772, f828;
sub.f32 %12, f755, f826;
sub.f32 %15, f780, f833;
sub.f32 %14, f763, f831;
sub.f32 %17, f781, f838;
sub.f32 %16, f764, f836;
sub.f32 %19, f773, f843;
sub.f32 %18, f756, f841;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_10000), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y));
};


#endif
