
#include "fht.h"
static inline void helper_float_1(float *buf);
/*
 * Two-point butterfly, performed in place.
 *
 * buf must point at (at least) two floats.  On return buf[0] holds the sum
 * of the two original values and buf[1] holds their difference
 * (original buf[0] minus original buf[1]).
 */
static inline void helper_float_1(float *buf) {
  const float first = buf[0];
  const float second = buf[1];

  buf[0] = first + second;
  buf[1] = first - second;
}
static inline void helper_float_2(float *buf);

/* Building blocks for the asm template.  Each macro expands to adjacent
 * string literals, so the assembled instruction stream is spelled out
 * instruction by instruction below. */

/* Unaligned load of four floats from asm operand `slot` into xmm<reg>. */
#define FHT_SSE_LOAD(slot, reg) "movups (%" #slot "), %%xmm" #reg "\n"
/* Unaligned store of xmm<reg> to the four floats at asm operand `slot`. */
#define FHT_SSE_STORE(reg, slot) "movups %%xmm" #reg ", (%" #slot ")\n"
/* Stride-1 butterflies inside xmm<r>: (a, b, c, d) -> (a+b, a-b, c+d, c-d).
 * xmm8 gets the even lanes duplicated, xmm<r> the odd lanes duplicated,
 * xmm9 = 0 - odd lanes, and addsubps combines them.  Scratch: xmm8, xmm9. */
#define FHT_SSE_PAIR(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"
/* Stride-2 butterflies inside xmm<r>: (a, b, c, d) -> (a+c, b+d, a-c, b-d).
 * xmm8 = (a, b, a, b), xmm10 = (c, d, 0, 0), xmm9 = (0, 0, c, d); the result
 * is xmm10 + xmm8 - xmm9.  Scratch: xmm8, xmm9, xmm10, xmm11. */
#define FHT_SSE_QUAD(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"
/* Clobber list: every xmm register plus memory (the asm stores through its
 * input pointers). */
#define FHT_SSE_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * In-place 4-point butterfly network on buf[0..3]: a stride-1 pass followed
 * by a stride-2 pass, both carried out inside a single SSE register.
 * buf does not need to be 16-byte aligned (movups is used for load/store).
 */
static inline void helper_float_2(float *buf) {
  __asm__ volatile (
    FHT_SSE_LOAD(0, 0)
    FHT_SSE_PAIR(0)
    FHT_SSE_QUAD(0)
    FHT_SSE_STORE(0, 0)
    :: "r"(buf)
    : FHT_SSE_CLOBBERS
  );
}

#undef FHT_SSE_CLOBBERS
#undef FHT_SSE_QUAD
#undef FHT_SSE_PAIR
#undef FHT_SSE_STORE
#undef FHT_SSE_LOAD
static inline void helper_float_3(float *buf);

/* Building blocks for the asm template.  Each macro expands to adjacent
 * string literals, so the assembled instruction stream is spelled out
 * instruction by instruction below. */

/* Unaligned load of four floats from asm operand `slot` into xmm<reg>. */
#define FHT_SSE_LOAD(slot, reg) "movups (%" #slot "), %%xmm" #reg "\n"
/* Unaligned store of xmm<reg> to the four floats at asm operand `slot`. */
#define FHT_SSE_STORE(reg, slot) "movups %%xmm" #reg ", (%" #slot ")\n"
/* Stride-1 butterflies inside xmm<r>: (a, b, c, d) -> (a+b, a-b, c+d, c-d).
 * Scratch: xmm8, xmm9. */
#define FHT_SSE_PAIR(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"
/* Stride-2 butterflies inside xmm<r>: (a, b, c, d) -> (a+c, b+d, a-c, b-d).
 * Scratch: xmm8, xmm9, xmm10, xmm11. */
#define FHT_SSE_QUAD(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"
/* Whole-register butterfly: xmm<sum> = xmm<a> + xmm<b>,
 * xmm<diff> = xmm<a> - xmm<b>.  xmm<a> and xmm<b> are left untouched. */
#define FHT_SSE_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"
/* Clobber list: every xmm register plus memory (the asm stores through its
 * input pointers). */
#define FHT_SSE_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * In-place 8-point butterfly network on buf[0..7].
 *
 * The two 4-float halves are loaded into xmm0/xmm1, each gets the stride-1
 * and stride-2 passes inside its register, and a final stride-4 pass
 * combines the two registers.  buf does not need to be 16-byte aligned.
 */
static inline void helper_float_3(float *buf) {
  __asm__ volatile (
    FHT_SSE_LOAD(0, 0)
    FHT_SSE_LOAD(1, 1)
    FHT_SSE_PAIR(0)
    FHT_SSE_PAIR(1)
    FHT_SSE_QUAD(0)
    FHT_SSE_QUAD(1)
    FHT_SSE_BFLY(0, 1, 8, 9)
    FHT_SSE_STORE(8, 0)
    FHT_SSE_STORE(9, 1)
    :: "r"(buf), "r"(buf + 4)
    : FHT_SSE_CLOBBERS
  );
}

#undef FHT_SSE_CLOBBERS
#undef FHT_SSE_BFLY
#undef FHT_SSE_QUAD
#undef FHT_SSE_PAIR
#undef FHT_SSE_STORE
#undef FHT_SSE_LOAD
static inline void helper_float_4(float *buf);

/* Building blocks for the asm template.  Each macro expands to adjacent
 * string literals, so the assembled instruction stream is spelled out
 * instruction by instruction below. */

/* Unaligned load of four floats from asm operand `slot` into xmm<reg>. */
#define FHT_SSE_LOAD(slot, reg) "movups (%" #slot "), %%xmm" #reg "\n"
/* Unaligned store of xmm<reg> to the four floats at asm operand `slot`. */
#define FHT_SSE_STORE(reg, slot) "movups %%xmm" #reg ", (%" #slot ")\n"
/* Stride-1 butterflies inside xmm<r>: (a, b, c, d) -> (a+b, a-b, c+d, c-d).
 * Scratch: xmm8, xmm9. */
#define FHT_SSE_PAIR(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"
/* Stride-2 butterflies inside xmm<r>: (a, b, c, d) -> (a+c, b+d, a-c, b-d).
 * Scratch: xmm8, xmm9, xmm10, xmm11. */
#define FHT_SSE_QUAD(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"
/* Whole-register butterfly: xmm<sum> = xmm<a> + xmm<b>,
 * xmm<diff> = xmm<a> - xmm<b>.  xmm<a> and xmm<b> are left untouched. */
#define FHT_SSE_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"
/* Clobber list: every xmm register plus memory (the asm stores through its
 * input pointers). */
#define FHT_SSE_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * In-place 16-point butterfly network on buf[0..15].
 *
 * Four registers (xmm0..xmm3) hold the four consecutive 4-float groups.
 * Strides 1 and 2 are handled inside each register; strides 4 and 8 are
 * handled by whole-register butterflies that ping-pong between
 * xmm0..xmm3 and xmm8..xmm11.  buf does not need to be 16-byte aligned.
 */
static inline void helper_float_4(float *buf) {
  __asm__ volatile (
    FHT_SSE_LOAD(0, 0)
    FHT_SSE_LOAD(1, 1)
    FHT_SSE_LOAD(2, 2)
    FHT_SSE_LOAD(3, 3)
    /* stride 1 */
    FHT_SSE_PAIR(0)
    FHT_SSE_PAIR(1)
    FHT_SSE_PAIR(2)
    FHT_SSE_PAIR(3)
    /* stride 2 */
    FHT_SSE_QUAD(0)
    FHT_SSE_QUAD(1)
    FHT_SSE_QUAD(2)
    FHT_SSE_QUAD(3)
    /* stride 4 */
    FHT_SSE_BFLY(0, 1, 8, 9)
    FHT_SSE_BFLY(2, 3, 10, 11)
    /* stride 8 */
    FHT_SSE_BFLY(8, 10, 0, 2)
    FHT_SSE_BFLY(9, 11, 1, 3)
    FHT_SSE_STORE(0, 0)
    FHT_SSE_STORE(1, 1)
    FHT_SSE_STORE(2, 2)
    FHT_SSE_STORE(3, 3)
    :: "r"(buf), "r"(buf + 4), "r"(buf + 8), "r"(buf + 12)
    : FHT_SSE_CLOBBERS
  );
}

#undef FHT_SSE_CLOBBERS
#undef FHT_SSE_BFLY
#undef FHT_SSE_QUAD
#undef FHT_SSE_PAIR
#undef FHT_SSE_STORE
#undef FHT_SSE_LOAD
static inline void helper_float_5(float *buf);

/* Building blocks for the asm template.  Each macro expands to adjacent
 * string literals, so the assembled instruction stream is spelled out
 * instruction by instruction below. */

/* Unaligned load of four floats from asm operand `slot` into xmm<reg>. */
#define FHT_SSE_LOAD(slot, reg) "movups (%" #slot "), %%xmm" #reg "\n"
/* Unaligned store of xmm<reg> to the four floats at asm operand `slot`. */
#define FHT_SSE_STORE(reg, slot) "movups %%xmm" #reg ", (%" #slot ")\n"
/* Stride-1 butterflies inside xmm<r>: (a, b, c, d) -> (a+b, a-b, c+d, c-d).
 * Scratch: xmm8, xmm9. */
#define FHT_SSE_PAIR(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"
/* Stride-2 butterflies inside xmm<r>: (a, b, c, d) -> (a+c, b+d, a-c, b-d).
 * Scratch: xmm8, xmm9, xmm10, xmm11. */
#define FHT_SSE_QUAD(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"
/* Whole-register butterfly: xmm<sum> = xmm<a> + xmm<b>,
 * xmm<diff> = xmm<a> - xmm<b>.  xmm<a> and xmm<b> are left untouched. */
#define FHT_SSE_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"
/* Clobber list: every xmm register plus memory (the asm stores through its
 * input pointers). */
#define FHT_SSE_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * In-place 32-point butterfly network on buf[0..31].
 *
 * Eight registers (xmm0..xmm7) hold the eight consecutive 4-float groups.
 * Strides 1 and 2 are handled inside each register; strides 4, 8 and 16
 * are whole-register butterflies that ping-pong between xmm0..xmm7 and
 * xmm8..xmm15, so the final results are stored from xmm8..xmm15.
 * buf does not need to be 16-byte aligned.
 */
static inline void helper_float_5(float *buf) {
  __asm__ volatile (
    FHT_SSE_LOAD(0, 0)
    FHT_SSE_LOAD(1, 1)
    FHT_SSE_LOAD(2, 2)
    FHT_SSE_LOAD(3, 3)
    FHT_SSE_LOAD(4, 4)
    FHT_SSE_LOAD(5, 5)
    FHT_SSE_LOAD(6, 6)
    FHT_SSE_LOAD(7, 7)
    /* stride 1 */
    FHT_SSE_PAIR(0)
    FHT_SSE_PAIR(1)
    FHT_SSE_PAIR(2)
    FHT_SSE_PAIR(3)
    FHT_SSE_PAIR(4)
    FHT_SSE_PAIR(5)
    FHT_SSE_PAIR(6)
    FHT_SSE_PAIR(7)
    /* stride 2 */
    FHT_SSE_QUAD(0)
    FHT_SSE_QUAD(1)
    FHT_SSE_QUAD(2)
    FHT_SSE_QUAD(3)
    FHT_SSE_QUAD(4)
    FHT_SSE_QUAD(5)
    FHT_SSE_QUAD(6)
    FHT_SSE_QUAD(7)
    /* stride 4 */
    FHT_SSE_BFLY(0, 1, 8, 9)
    FHT_SSE_BFLY(2, 3, 10, 11)
    FHT_SSE_BFLY(4, 5, 12, 13)
    FHT_SSE_BFLY(6, 7, 14, 15)
    /* stride 8 */
    FHT_SSE_BFLY(8, 10, 0, 2)
    FHT_SSE_BFLY(9, 11, 1, 3)
    FHT_SSE_BFLY(12, 14, 4, 6)
    FHT_SSE_BFLY(13, 15, 5, 7)
    /* stride 16 */
    FHT_SSE_BFLY(0, 4, 8, 12)
    FHT_SSE_BFLY(1, 5, 9, 13)
    FHT_SSE_BFLY(2, 6, 10, 14)
    FHT_SSE_BFLY(3, 7, 11, 15)
    FHT_SSE_STORE(8, 0)
    FHT_SSE_STORE(9, 1)
    FHT_SSE_STORE(10, 2)
    FHT_SSE_STORE(11, 3)
    FHT_SSE_STORE(12, 4)
    FHT_SSE_STORE(13, 5)
    FHT_SSE_STORE(14, 6)
    FHT_SSE_STORE(15, 7)
    :: "r"(buf), "r"(buf + 4), "r"(buf + 8), "r"(buf + 12),
       "r"(buf + 16), "r"(buf + 20), "r"(buf + 24), "r"(buf + 28)
    : FHT_SSE_CLOBBERS
  );
}

#undef FHT_SSE_CLOBBERS
#undef FHT_SSE_BFLY
#undef FHT_SSE_QUAD
#undef FHT_SSE_PAIR
#undef FHT_SSE_STORE
#undef FHT_SSE_LOAD
static inline void helper_float_6(float *buf);

/* Building blocks for the asm templates.  Each macro expands to adjacent
 * string literals, so the assembled instruction stream is spelled out
 * instruction by instruction below. */

/* Unaligned load of four floats from asm operand `slot` into xmm<reg>. */
#define FHT_SSE_LOAD(slot, reg) "movups (%" #slot "), %%xmm" #reg "\n"
/* Unaligned store of xmm<reg> to the four floats at asm operand `slot`. */
#define FHT_SSE_STORE(reg, slot) "movups %%xmm" #reg ", (%" #slot ")\n"
/* Stride-1 butterflies inside xmm<r>: (a, b, c, d) -> (a+b, a-b, c+d, c-d).
 * Scratch: xmm8, xmm9. */
#define FHT_SSE_PAIR(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"
/* Stride-2 butterflies inside xmm<r>: (a, b, c, d) -> (a+c, b+d, a-c, b-d).
 * Scratch: xmm8, xmm9, xmm10, xmm11. */
#define FHT_SSE_QUAD(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"
/* Whole-register butterfly: xmm<sum> = xmm<a> + xmm<b>,
 * xmm<diff> = xmm<a> - xmm<b>.  xmm<a> and xmm<b> are left untouched. */
#define FHT_SSE_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"
/* Clobber list: every xmm register plus memory (the asm stores through its
 * input pointers). */
#define FHT_SSE_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * In-place 64-point butterfly network on buf[0..63], done in two passes.
 *
 * Pass 1 runs the full 32-point network (strides 1 through 16) on each of
 * the two 32-float halves.  Pass 2 applies the stride-32 butterflies that
 * combine the halves, four floats at a time.
 * buf does not need to be 16-byte aligned.
 */
static inline void helper_float_6(float *buf) {
  /* Pass 1: strides 1..16 within buf[0..31] and within buf[32..63]. */
  for (float *blk = buf; blk < buf + 64; blk += 32) {
    __asm__ volatile (
      FHT_SSE_LOAD(0, 0)
      FHT_SSE_LOAD(1, 1)
      FHT_SSE_LOAD(2, 2)
      FHT_SSE_LOAD(3, 3)
      FHT_SSE_LOAD(4, 4)
      FHT_SSE_LOAD(5, 5)
      FHT_SSE_LOAD(6, 6)
      FHT_SSE_LOAD(7, 7)
      /* stride 1 */
      FHT_SSE_PAIR(0)
      FHT_SSE_PAIR(1)
      FHT_SSE_PAIR(2)
      FHT_SSE_PAIR(3)
      FHT_SSE_PAIR(4)
      FHT_SSE_PAIR(5)
      FHT_SSE_PAIR(6)
      FHT_SSE_PAIR(7)
      /* stride 2 */
      FHT_SSE_QUAD(0)
      FHT_SSE_QUAD(1)
      FHT_SSE_QUAD(2)
      FHT_SSE_QUAD(3)
      FHT_SSE_QUAD(4)
      FHT_SSE_QUAD(5)
      FHT_SSE_QUAD(6)
      FHT_SSE_QUAD(7)
      /* stride 4 */
      FHT_SSE_BFLY(0, 1, 8, 9)
      FHT_SSE_BFLY(2, 3, 10, 11)
      FHT_SSE_BFLY(4, 5, 12, 13)
      FHT_SSE_BFLY(6, 7, 14, 15)
      /* stride 8 */
      FHT_SSE_BFLY(8, 10, 0, 2)
      FHT_SSE_BFLY(9, 11, 1, 3)
      FHT_SSE_BFLY(12, 14, 4, 6)
      FHT_SSE_BFLY(13, 15, 5, 7)
      /* stride 16 */
      FHT_SSE_BFLY(0, 4, 8, 12)
      FHT_SSE_BFLY(1, 5, 9, 13)
      FHT_SSE_BFLY(2, 6, 10, 14)
      FHT_SSE_BFLY(3, 7, 11, 15)
      FHT_SSE_STORE(8, 0)
      FHT_SSE_STORE(9, 1)
      FHT_SSE_STORE(10, 2)
      FHT_SSE_STORE(11, 3)
      FHT_SSE_STORE(12, 4)
      FHT_SSE_STORE(13, 5)
      FHT_SSE_STORE(14, 6)
      FHT_SSE_STORE(15, 7)
      :: "r"(blk), "r"(blk + 4), "r"(blk + 8), "r"(blk + 12),
         "r"(blk + 16), "r"(blk + 20), "r"(blk + 24), "r"(blk + 28)
      : FHT_SSE_CLOBBERS
    );
  }

  /* Pass 2: stride-32 butterflies between the two halves. */
  for (int off = 0; off < 32; off += 4) {
    __asm__ volatile (
      FHT_SSE_LOAD(0, 0)
      FHT_SSE_LOAD(1, 1)
      FHT_SSE_BFLY(0, 1, 8, 9)
      FHT_SSE_STORE(8, 0)
      FHT_SSE_STORE(9, 1)
      :: "r"(buf + off), "r"(buf + off + 32)
      : FHT_SSE_CLOBBERS
    );
  }
}

#undef FHT_SSE_CLOBBERS
#undef FHT_SSE_BFLY
#undef FHT_SSE_QUAD
#undef FHT_SSE_PAIR
#undef FHT_SSE_STORE
#undef FHT_SSE_LOAD
void helper_float_7_recursive(float *buf, int depth);

/* Building blocks for the asm templates.  Each macro expands to adjacent
 * string literals, so the assembled instruction stream is spelled out
 * instruction by instruction below. */

/* Unaligned load of four floats from asm operand `slot` into xmm<reg>. */
#define FHT_SSE_LOAD(slot, reg) "movups (%" #slot "), %%xmm" #reg "\n"
/* Unaligned store of xmm<reg> to the four floats at asm operand `slot`. */
#define FHT_SSE_STORE(reg, slot) "movups %%xmm" #reg ", (%" #slot ")\n"
/* Stride-1 butterflies inside xmm<r>: (a, b, c, d) -> (a+b, a-b, c+d, c-d).
 * Scratch: xmm8, xmm9. */
#define FHT_SSE_PAIR(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"
/* Stride-2 butterflies inside xmm<r>: (a, b, c, d) -> (a+c, b+d, a-c, b-d).
 * Scratch: xmm8, xmm9, xmm10, xmm11. */
#define FHT_SSE_QUAD(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"
/* Whole-register butterfly: xmm<sum> = xmm<a> + xmm<b>,
 * xmm<diff> = xmm<a> - xmm<b>.  xmm<a> and xmm<b> are left untouched. */
#define FHT_SSE_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"
/* Clobber list: every xmm register plus memory (the asm stores through its
 * input pointers). */
#define FHT_SSE_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * In-place 128-point butterfly network on buf[0..127].
 *
 * depth selects the level to run; only depth == 7 is implemented here and
 * any other value returns without touching buf.
 *
 * Pass 1 runs the 32-point network (strides 1 through 16) on each of the
 * four 32-float quarters.  Pass 2 applies the stride-32 and stride-64
 * butterflies across the quarters, four floats at a time.
 * buf does not need to be 16-byte aligned.
 */
void helper_float_7_recursive(float *buf, int depth) {
  if (depth != 7) {
    return;
  }

  /* Pass 1: strides 1..16 inside each quarter. */
  for (float *blk = buf; blk < buf + 128; blk += 32) {
    __asm__ volatile (
      FHT_SSE_LOAD(0, 0)
      FHT_SSE_LOAD(1, 1)
      FHT_SSE_LOAD(2, 2)
      FHT_SSE_LOAD(3, 3)
      FHT_SSE_LOAD(4, 4)
      FHT_SSE_LOAD(5, 5)
      FHT_SSE_LOAD(6, 6)
      FHT_SSE_LOAD(7, 7)
      /* stride 1 */
      FHT_SSE_PAIR(0)
      FHT_SSE_PAIR(1)
      FHT_SSE_PAIR(2)
      FHT_SSE_PAIR(3)
      FHT_SSE_PAIR(4)
      FHT_SSE_PAIR(5)
      FHT_SSE_PAIR(6)
      FHT_SSE_PAIR(7)
      /* stride 2 */
      FHT_SSE_QUAD(0)
      FHT_SSE_QUAD(1)
      FHT_SSE_QUAD(2)
      FHT_SSE_QUAD(3)
      FHT_SSE_QUAD(4)
      FHT_SSE_QUAD(5)
      FHT_SSE_QUAD(6)
      FHT_SSE_QUAD(7)
      /* stride 4 */
      FHT_SSE_BFLY(0, 1, 8, 9)
      FHT_SSE_BFLY(2, 3, 10, 11)
      FHT_SSE_BFLY(4, 5, 12, 13)
      FHT_SSE_BFLY(6, 7, 14, 15)
      /* stride 8 */
      FHT_SSE_BFLY(8, 10, 0, 2)
      FHT_SSE_BFLY(9, 11, 1, 3)
      FHT_SSE_BFLY(12, 14, 4, 6)
      FHT_SSE_BFLY(13, 15, 5, 7)
      /* stride 16 */
      FHT_SSE_BFLY(0, 4, 8, 12)
      FHT_SSE_BFLY(1, 5, 9, 13)
      FHT_SSE_BFLY(2, 6, 10, 14)
      FHT_SSE_BFLY(3, 7, 11, 15)
      FHT_SSE_STORE(8, 0)
      FHT_SSE_STORE(9, 1)
      FHT_SSE_STORE(10, 2)
      FHT_SSE_STORE(11, 3)
      FHT_SSE_STORE(12, 4)
      FHT_SSE_STORE(13, 5)
      FHT_SSE_STORE(14, 6)
      FHT_SSE_STORE(15, 7)
      :: "r"(blk), "r"(blk + 4), "r"(blk + 8), "r"(blk + 12),
         "r"(blk + 16), "r"(blk + 20), "r"(blk + 24), "r"(blk + 28)
      : FHT_SSE_CLOBBERS
    );
  }

  /* Pass 2: strides 32 and 64 across the four quarters. */
  for (int off = 0; off < 32; off += 4) {
    __asm__ volatile (
      FHT_SSE_LOAD(0, 0)
      FHT_SSE_LOAD(1, 1)
      FHT_SSE_LOAD(2, 2)
      FHT_SSE_LOAD(3, 3)
      /* stride 32 */
      FHT_SSE_BFLY(0, 1, 8, 9)
      FHT_SSE_BFLY(2, 3, 10, 11)
      /* stride 64 */
      FHT_SSE_BFLY(8, 10, 0, 2)
      FHT_SSE_BFLY(9, 11, 1, 3)
      FHT_SSE_STORE(0, 0)
      FHT_SSE_STORE(1, 1)
      FHT_SSE_STORE(2, 2)
      FHT_SSE_STORE(3, 3)
      :: "r"(buf + off), "r"(buf + off + 32),
         "r"(buf + off + 64), "r"(buf + off + 96)
      : FHT_SSE_CLOBBERS
    );
  }
}

#undef FHT_SSE_CLOBBERS
#undef FHT_SSE_BFLY
#undef FHT_SSE_QUAD
#undef FHT_SSE_PAIR
#undef FHT_SSE_STORE
#undef FHT_SSE_LOAD
/*
 * helper_float_7: externally visible entry point that hands `buf` to
 * helper_float_7_recursive together with the constant 7.
 *
 * buf: float array that is processed in place by the callee.
 * NOTE(review): the constant is presumably the log2 of the element count
 * (2^7 = 128 floats) -- confirm against helper_float_7_recursive.
 */
void helper_float_7(float *buf);
void helper_float_7(float *buf) {
  enum { TOP_LEVEL = 7 }; /* value forwarded as the second argument */
  helper_float_7_recursive(buf, TOP_LEVEL);
}
/*
 * helper_float_8: in-place transform of the 256 floats starting at buf.
 *
 * Every stage replaces a pair (u, v) by (u + v, u - v); nothing is scaled.
 *   Pass 1 works on each 32-float block separately: pair distances 1 and 2
 *          are resolved inside a 4-float register with shuffles, distances
 *          4, 8 and 16 across the eight loaded registers.
 *   Pass 2 combines the eight blocks at distances 32, 64 and 128.
 *
 * The asm touches xmm8-xmm15 and uses addsubps, so it needs an x86-64 target
 * with SSE3. All loads/stores are movups, so buf needs no special alignment.
 */
static inline void helper_float_8(float *buf);

/* Load operand %op into xmm<reg> / store xmm<reg> to operand %op. */
#define HF8_LD(op, reg) "movups (%" #op "), %%xmm" #reg "\n"
#define HF8_ST(reg, op) "movups %%xmm" #reg ", (%" #op ")\n"

/* xmm<r> = (a0,a1,a2,a3) -> (a0+a1, a0-a1, a2+a3, a2-a3); scratch: xmm8, xmm9. */
#define HF8_PAIR_STAGE(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/* xmm<r> = (b0,b1,b2,b3) -> (b0+b2, b1+b3, b0-b2, b1-b3); scratch: xmm8-xmm11. */
#define HF8_QUAD_STAGE(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* xmm<sum> = xmm<a> + xmm<b>, xmm<diff> = xmm<a> - xmm<b>. */
#define HF8_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/* Apply a one-register macro to xmm0..xmm7 in order. */
#define HF8_EACH8(M) M(0) M(1) M(2) M(3) M(4) M(5) M(6) M(7)

#define HF8_LOAD8 \
  HF8_LD(0, 0) HF8_LD(1, 1) HF8_LD(2, 2) HF8_LD(3, 3) \
  HF8_LD(4, 4) HF8_LD(5, 5) HF8_LD(6, 6) HF8_LD(7, 7)

/* Three butterfly layers over xmm0..xmm7; results end up in xmm8..xmm15. */
#define HF8_RADIX8 \
  HF8_BFLY(0, 1, 8, 9)   HF8_BFLY(2, 3, 10, 11) \
  HF8_BFLY(4, 5, 12, 13) HF8_BFLY(6, 7, 14, 15) \
  HF8_BFLY(8, 10, 0, 2)  HF8_BFLY(9, 11, 1, 3) \
  HF8_BFLY(12, 14, 4, 6) HF8_BFLY(13, 15, 5, 7) \
  HF8_BFLY(0, 4, 8, 12)  HF8_BFLY(1, 5, 9, 13) \
  HF8_BFLY(2, 6, 10, 14) HF8_BFLY(3, 7, 11, 15)

#define HF8_STORE8 \
  HF8_ST(8, 0)  HF8_ST(9, 1)  HF8_ST(10, 2) HF8_ST(11, 3) \
  HF8_ST(12, 4) HF8_ST(13, 5) HF8_ST(14, 6) HF8_ST(15, 7)

#define HF8_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

static inline void helper_float_8(float *buf) {
  /* Pass 1: one asm statement per 32-float block (distances 1..16). */
  for (int base = 0; base < 256; base += 32) {
    float *p = buf + base;
    __asm__ volatile (
      HF8_LOAD8
      HF8_EACH8(HF8_PAIR_STAGE)
      HF8_EACH8(HF8_QUAD_STAGE)
      HF8_RADIX8
      HF8_STORE8
      : /* no output operands; writes are covered by the "memory" clobber */
      : "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
        "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
      : HF8_CLOBBERS
    );
  }
  /* Pass 2: four columns at a time, combine the eight blocks (distances 32, 64, 128). */
  for (int col = 0; col < 32; col += 4) {
    float *p = buf + col;
    __asm__ volatile (
      HF8_LOAD8
      HF8_RADIX8
      HF8_STORE8
      : /* no output operands */
      : "r"(p), "r"(p + 32), "r"(p + 64), "r"(p + 96),
        "r"(p + 128), "r"(p + 160), "r"(p + 192), "r"(p + 224)
      : HF8_CLOBBERS
    );
  }
}

#undef HF8_CLOBBERS
#undef HF8_STORE8
#undef HF8_RADIX8
#undef HF8_LOAD8
#undef HF8_EACH8
#undef HF8_BFLY
#undef HF8_QUAD_STAGE
#undef HF8_PAIR_STAGE
#undef HF8_ST
#undef HF8_LD
/*
 * helper_float_9: in-place transform of the 512 floats starting at buf.
 *
 * Every stage replaces a pair (u, v) by (u + v, u - v); nothing is scaled.
 *   Pass 1 works on each 32-float block separately (pair distances 1, 2 via
 *          in-register shuffles; 4, 8, 16 across the eight loaded registers).
 *   Pass 2 works on each 256-float half separately (distances 32, 64, 128).
 *   Pass 3 combines the two halves (distance 256).
 *
 * The asm touches xmm8-xmm15 and uses addsubps, so it needs an x86-64 target
 * with SSE3. All loads/stores are movups, so buf needs no special alignment.
 */
static inline void helper_float_9(float *buf);

/* Load operand %op into xmm<reg> / store xmm<reg> to operand %op. */
#define HF9_LD(op, reg) "movups (%" #op "), %%xmm" #reg "\n"
#define HF9_ST(reg, op) "movups %%xmm" #reg ", (%" #op ")\n"

/* xmm<r> = (a0,a1,a2,a3) -> (a0+a1, a0-a1, a2+a3, a2-a3); scratch: xmm8, xmm9. */
#define HF9_PAIR_STAGE(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/* xmm<r> = (b0,b1,b2,b3) -> (b0+b2, b1+b3, b0-b2, b1-b3); scratch: xmm8-xmm11. */
#define HF9_QUAD_STAGE(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* xmm<sum> = xmm<a> + xmm<b>, xmm<diff> = xmm<a> - xmm<b>. */
#define HF9_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/* Apply a one-register macro to xmm0..xmm7 in order. */
#define HF9_EACH8(M) M(0) M(1) M(2) M(3) M(4) M(5) M(6) M(7)

#define HF9_LOAD8 \
  HF9_LD(0, 0) HF9_LD(1, 1) HF9_LD(2, 2) HF9_LD(3, 3) \
  HF9_LD(4, 4) HF9_LD(5, 5) HF9_LD(6, 6) HF9_LD(7, 7)

/* Three butterfly layers over xmm0..xmm7; results end up in xmm8..xmm15. */
#define HF9_RADIX8 \
  HF9_BFLY(0, 1, 8, 9)   HF9_BFLY(2, 3, 10, 11) \
  HF9_BFLY(4, 5, 12, 13) HF9_BFLY(6, 7, 14, 15) \
  HF9_BFLY(8, 10, 0, 2)  HF9_BFLY(9, 11, 1, 3) \
  HF9_BFLY(12, 14, 4, 6) HF9_BFLY(13, 15, 5, 7) \
  HF9_BFLY(0, 4, 8, 12)  HF9_BFLY(1, 5, 9, 13) \
  HF9_BFLY(2, 6, 10, 14) HF9_BFLY(3, 7, 11, 15)

#define HF9_STORE8 \
  HF9_ST(8, 0)  HF9_ST(9, 1)  HF9_ST(10, 2) HF9_ST(11, 3) \
  HF9_ST(12, 4) HF9_ST(13, 5) HF9_ST(14, 6) HF9_ST(15, 7)

#define HF9_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

static inline void helper_float_9(float *buf) {
  /* Pass 1: one asm statement per 32-float block (distances 1..16). */
  for (int base = 0; base < 512; base += 32) {
    float *p = buf + base;
    __asm__ volatile (
      HF9_LOAD8
      HF9_EACH8(HF9_PAIR_STAGE)
      HF9_EACH8(HF9_QUAD_STAGE)
      HF9_RADIX8
      HF9_STORE8
      : /* no output operands; writes are covered by the "memory" clobber */
      : "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
        "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
      : HF9_CLOBBERS
    );
  }
  /* Pass 2: inside each 256-float half, distances 32, 64 and 128. */
  for (int half = 0; half < 512; half += 256) {
    for (int col = 0; col < 32; col += 4) {
      float *p = buf + half + col;
      __asm__ volatile (
        HF9_LOAD8
        HF9_RADIX8
        HF9_STORE8
        : /* no output operands */
        : "r"(p), "r"(p + 32), "r"(p + 64), "r"(p + 96),
          "r"(p + 128), "r"(p + 160), "r"(p + 192), "r"(p + 224)
        : HF9_CLOBBERS
      );
    }
  }
  /* Pass 3: single butterfly between the two halves (distance 256). */
  for (int col = 0; col < 256; col += 4) {
    float *p = buf + col;
    __asm__ volatile (
      HF9_LD(0, 0) HF9_LD(1, 1)
      HF9_BFLY(0, 1, 8, 9)
      HF9_ST(8, 0) HF9_ST(9, 1)
      : /* no output operands */
      : "r"(p), "r"(p + 256)
      : HF9_CLOBBERS
    );
  }
}

#undef HF9_CLOBBERS
#undef HF9_STORE8
#undef HF9_RADIX8
#undef HF9_LOAD8
#undef HF9_EACH8
#undef HF9_BFLY
#undef HF9_QUAD_STAGE
#undef HF9_PAIR_STAGE
#undef HF9_ST
#undef HF9_LD
/*
 * helper_float_10: in-place transform of the 1024 floats starting at buf.
 *
 * Every stage replaces a pair (u, v) by (u + v, u - v); nothing is scaled.
 *   Pass 1 works on each 32-float block separately (pair distances 1, 2 via
 *          in-register shuffles; 4, 8, 16 across the eight loaded registers).
 *   Pass 2 works on each 256-float quarter separately (distances 32, 64, 128).
 *   Pass 3 combines the four quarters (distances 256 and 512).
 *
 * The asm touches xmm8-xmm15 and uses addsubps, so it needs an x86-64 target
 * with SSE3. All loads/stores are movups, so buf needs no special alignment.
 */
static inline void helper_float_10(float *buf);

/* Load operand %op into xmm<reg> / store xmm<reg> to operand %op. */
#define HF10_LD(op, reg) "movups (%" #op "), %%xmm" #reg "\n"
#define HF10_ST(reg, op) "movups %%xmm" #reg ", (%" #op ")\n"

/* xmm<r> = (a0,a1,a2,a3) -> (a0+a1, a0-a1, a2+a3, a2-a3); scratch: xmm8, xmm9. */
#define HF10_PAIR_STAGE(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/* xmm<r> = (b0,b1,b2,b3) -> (b0+b2, b1+b3, b0-b2, b1-b3); scratch: xmm8-xmm11. */
#define HF10_QUAD_STAGE(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* xmm<sum> = xmm<a> + xmm<b>, xmm<diff> = xmm<a> - xmm<b>. */
#define HF10_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/* Apply a one-register macro to xmm0..xmm7 in order. */
#define HF10_EACH8(M) M(0) M(1) M(2) M(3) M(4) M(5) M(6) M(7)

#define HF10_LOAD8 \
  HF10_LD(0, 0) HF10_LD(1, 1) HF10_LD(2, 2) HF10_LD(3, 3) \
  HF10_LD(4, 4) HF10_LD(5, 5) HF10_LD(6, 6) HF10_LD(7, 7)

/* Three butterfly layers over xmm0..xmm7; results end up in xmm8..xmm15. */
#define HF10_RADIX8 \
  HF10_BFLY(0, 1, 8, 9)   HF10_BFLY(2, 3, 10, 11) \
  HF10_BFLY(4, 5, 12, 13) HF10_BFLY(6, 7, 14, 15) \
  HF10_BFLY(8, 10, 0, 2)  HF10_BFLY(9, 11, 1, 3) \
  HF10_BFLY(12, 14, 4, 6) HF10_BFLY(13, 15, 5, 7) \
  HF10_BFLY(0, 4, 8, 12)  HF10_BFLY(1, 5, 9, 13) \
  HF10_BFLY(2, 6, 10, 14) HF10_BFLY(3, 7, 11, 15)

/* Two butterfly layers over xmm0..xmm3; results end up back in xmm0..xmm3. */
#define HF10_RADIX4 \
  HF10_BFLY(0, 1, 8, 9)  HF10_BFLY(2, 3, 10, 11) \
  HF10_BFLY(8, 10, 0, 2) HF10_BFLY(9, 11, 1, 3)

#define HF10_STORE8 \
  HF10_ST(8, 0)  HF10_ST(9, 1)  HF10_ST(10, 2) HF10_ST(11, 3) \
  HF10_ST(12, 4) HF10_ST(13, 5) HF10_ST(14, 6) HF10_ST(15, 7)

#define HF10_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

static inline void helper_float_10(float *buf) {
  /* Pass 1: one asm statement per 32-float block (distances 1..16). */
  for (int base = 0; base < 1024; base += 32) {
    float *p = buf + base;
    __asm__ volatile (
      HF10_LOAD8
      HF10_EACH8(HF10_PAIR_STAGE)
      HF10_EACH8(HF10_QUAD_STAGE)
      HF10_RADIX8
      HF10_STORE8
      : /* no output operands; writes are covered by the "memory" clobber */
      : "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
        "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
      : HF10_CLOBBERS
    );
  }
  /* Pass 2: inside each 256-float quarter, distances 32, 64 and 128. */
  for (int quarter = 0; quarter < 1024; quarter += 256) {
    for (int col = 0; col < 32; col += 4) {
      float *p = buf + quarter + col;
      __asm__ volatile (
        HF10_LOAD8
        HF10_RADIX8
        HF10_STORE8
        : /* no output operands */
        : "r"(p), "r"(p + 32), "r"(p + 64), "r"(p + 96),
          "r"(p + 128), "r"(p + 160), "r"(p + 192), "r"(p + 224)
        : HF10_CLOBBERS
      );
    }
  }
  /* Pass 3: combine the four quarters (distances 256 and 512). */
  for (int col = 0; col < 256; col += 4) {
    float *p = buf + col;
    __asm__ volatile (
      HF10_LD(0, 0) HF10_LD(1, 1) HF10_LD(2, 2) HF10_LD(3, 3)
      HF10_RADIX4
      HF10_ST(0, 0) HF10_ST(1, 1) HF10_ST(2, 2) HF10_ST(3, 3)
      : /* no output operands */
      : "r"(p), "r"(p + 256), "r"(p + 512), "r"(p + 768)
      : HF10_CLOBBERS
    );
  }
}

#undef HF10_CLOBBERS
#undef HF10_STORE8
#undef HF10_RADIX4
#undef HF10_RADIX8
#undef HF10_LOAD8
#undef HF10_EACH8
#undef HF10_BFLY
#undef HF10_QUAD_STAGE
#undef HF10_PAIR_STAGE
#undef HF10_ST
#undef HF10_LD
/*
 * helper_float_11 - in-place sum/difference butterfly network over the
 * 2048 floats starting at buf.
 *
 * buf: buffer that is read and overwritten. The highest access is the
 *      4-float vector at buf + 252 + 1792, so at least 2048 floats must be
 *      addressable. All loads/stores use movups, so no particular alignment
 *      is needed.
 *
 * The work is done in three passes; each pass combines elements with
 * (x + y, x - y) butterflies at increasing strides:
 *   pass 1: strides 1, 2, 4, 8, 16   (inside each 32-float chunk)
 *   pass 2: strides 32, 64, 128      (inside each 256-float chunk)
 *   pass 3: strides 256, 512, 1024   (across the whole buffer)
 *
 * The asm is x86-64 only: it uses xmm8-xmm15 and the SSE3 instruction
 * addsubps. Every asm statement lists all xmm registers and "memory" as
 * clobbers, since the buffer is modified through register-held pointers.
 *
 * NOTE(review): 11 butterfly levels over 2^11 elements looks like an
 * unnormalized Walsh-Hadamard transform (the file includes fht.h) --
 * confirm against the header's documentation.
 */
static inline void helper_float_11(float *buf);
static inline void helper_float_11(float *buf) {
  /* Pass 1: one 32-float chunk per j iteration; the k loop runs exactly once. */
  for (int j = 0; j < 2048; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* Load the chunk as eight 4-float vectors into xmm0-xmm7. */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /*
         * Stride 1, repeated for xmm0..xmm7 (xmm8/xmm9 are scratch):
         * [a0,a1,a2,a3] -> [a0+a1, a0-a1, a2+a3, a2-a3].
         * xmm8 = [a0,a0,a2,a2], xmm9 = -[a1,a1,a3,a3]; addsubps subtracts in
         * even lanes and adds in odd lanes, giving the sums/differences.
         */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        /*
         * Stride 2, repeated for xmm0..xmm7 (xmm8-xmm11 are scratch):
         * [b0,b1,b2,b3] -> [b0+b2, b1+b3, b0-b2, b1-b3].
         * xmm8 = [b0,b1,b0,b1], xmm10 = [b2,b3,0,0], xmm9 = [0,0,b2,b3];
         * the result is xmm10 + xmm8 - xmm9.
         */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        /*
         * Strides 4, 8, 16: three whole-register add/sub levels.
         * Level 1 pairs (0,1),(2,3),(4,5),(6,7) into xmm8-xmm15,
         * level 2 pairs (8,10),(9,11),(12,14),(13,15) back into xmm0-xmm7,
         * level 3 pairs (0,4),(1,5),(2,6),(3,7) into xmm8-xmm15.
         */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* Results live in xmm8-xmm15; write them back over the inputs. */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /*
   * Pass 2: inside each 256-float chunk, combine the eight vectors that sit
   * 32 floats apart (strides 32, 64, 128); k walks the 32-float row in
   * 4-float steps.
   */
  for (int j = 0; j < 2048; j += 256) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* Level 1: register pairs (0,1),(2,3),(4,5),(6,7) -> xmm8-xmm15. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        /* Level 2: pairs (8,10),(9,11),(12,14),(13,15) -> xmm0-xmm7. */
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        /* Level 3: pairs (0,4),(1,5),(2,6),(3,7) -> xmm8-xmm15. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /*
   * Pass 3: the j loop runs once; the same three-level kernel now combines
   * vectors 256 floats apart (strides 256, 512, 1024) across all 2048 floats.
   */
  for (int j = 0; j < 2048; j += 2048) {
    for (int k = 0; k < 256; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* Level 1: register pairs (0,1),(2,3),(4,5),(6,7) -> xmm8-xmm15. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        /* Level 2: pairs (8,10),(9,11),(12,14),(13,15) -> xmm0-xmm7. */
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        /* Level 3: pairs (0,4),(1,5),(2,6),(3,7) -> xmm8-xmm15. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
/*
 * helper_float_12_recursive - recursive in-place sum/difference butterfly
 * network on the floats starting at buf.
 *
 * buf:   start of the block to process; it is read and overwritten.
 * depth: selects which stage runs. Only 7, 10 and 12 are handled; any other
 *        value matches no branch and the function returns without touching
 *        buf.
 *
 *   depth 7  : base case on 128 floats. Strides 1..16 inside each 32-float
 *              chunk, then strides 32 and 64.
 *   depth 10 : eight depth-7 calls on consecutive 128-float sub-blocks, then
 *              strides 128, 256 and 512 over the 1024 floats.
 *   depth 12 : four depth-10 calls on consecutive 1024-float sub-blocks, then
 *              strides 1024 and 2048 over the 4096 floats.
 *
 * The asm is x86-64 only (xmm8-xmm15, SSE3 addsubps) and uses unaligned
 * movups loads/stores throughout.
 *
 * NOTE(review): the 7/10/12 levels applied to 2^7/2^10/2^12 elements look
 * like an unnormalized Walsh-Hadamard transform (the file includes fht.h) --
 * confirm against the header's documentation.
 */
void helper_float_12_recursive(float *buf, int depth);
void helper_float_12_recursive(float *buf, int depth) {
  if (depth == 7) {
    /* Base case, step 1: one 32-float chunk per j; the k loop runs once. */
    for (int j = 0; j < 128; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          /* Load the chunk as eight 4-float vectors into xmm0-xmm7. */
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /*
           * Stride 1, repeated for xmm0..xmm7 (xmm8/xmm9 are scratch):
           * [a0,a1,a2,a3] -> [a0+a1, a0-a1, a2+a3, a2-a3].
           * xmm8 = [a0,a0,a2,a2], xmm9 = -[a1,a1,a3,a3]; addsubps subtracts
           * in even lanes and adds in odd lanes.
           */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /*
           * Stride 2, repeated for xmm0..xmm7 (xmm8-xmm11 are scratch):
           * [b0,b1,b2,b3] -> [b0+b2, b1+b3, b0-b2, b1-b3].
           * xmm8 = [b0,b1,b0,b1], xmm10 = [b2,b3,0,0], xmm9 = [0,0,b2,b3];
           * the result is xmm10 + xmm8 - xmm9.
           */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /*
           * Strides 4, 8, 16: three whole-register add/sub levels.
           * Level 1 pairs (0,1),(2,3),(4,5),(6,7) into xmm8-xmm15,
           * level 2 pairs (8,10),(9,11),(12,14),(13,15) back into xmm0-xmm7,
           * level 3 pairs (0,4),(1,5),(2,6),(3,7) into xmm8-xmm15.
           */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          /* Results live in xmm8-xmm15; write them back over the inputs. */
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /*
     * Base case, step 2: the j loop runs once; four vectors 32 floats apart
     * are combined with two add/sub levels (strides 32 and 64).
     */
    for (int j = 0; j < 128; j += 128) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          /* Level 1: register pairs (0,1),(2,3) -> xmm8-xmm11. */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          /* Level 2: pairs (8,10),(9,11) -> xmm0-xmm3, which are stored. */
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 10) {
    /* Process each of the eight 128-float sub-blocks with the base case. */
    helper_float_12_recursive(buf + 0, 7);
    helper_float_12_recursive(buf + 128, 7);
    helper_float_12_recursive(buf + 256, 7);
    helper_float_12_recursive(buf + 384, 7);
    helper_float_12_recursive(buf + 512, 7);
    helper_float_12_recursive(buf + 640, 7);
    helper_float_12_recursive(buf + 768, 7);
    helper_float_12_recursive(buf + 896, 7);
    /*
     * Then combine the sub-blocks: eight vectors 128 floats apart, three
     * add/sub levels (strides 128, 256, 512). The j loop runs once.
     */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 128; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* Level 1: register pairs (0,1),(2,3),(4,5),(6,7) -> xmm8-xmm15. */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          /* Level 2: pairs (8,10),(9,11),(12,14),(13,15) -> xmm0-xmm7. */
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          /* Level 3: pairs (0,4),(1,5),(2,6),(3,7) -> xmm8-xmm15. */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 12) {
    /* Process each of the four 1024-float sub-blocks at depth 10. */
    helper_float_12_recursive(buf + 0, 10);
    helper_float_12_recursive(buf + 1024, 10);
    helper_float_12_recursive(buf + 2048, 10);
    helper_float_12_recursive(buf + 3072, 10);
    /*
     * Then combine the sub-blocks: four vectors 1024 floats apart, two
     * add/sub levels (strides 1024 and 2048). The j loop runs once.
     */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          /* Level 1: register pairs (0,1),(2,3) -> xmm8-xmm11. */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          /* Level 2: pairs (8,10),(9,11) -> xmm0-xmm3, which are stored. */
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * helper_float_12 - entry point that runs the recursive kernel on buf at its
 * top level (depth 12). At that depth helper_float_12_recursive reads and
 * overwrites buf[0..4095], so buf must hold at least 4096 floats.
 */
void helper_float_12(float *buf);
void helper_float_12(float *buf) {
  /* Top-level depth understood by helper_float_12_recursive. */
  enum { TOP_DEPTH = 12 };

  helper_float_12_recursive(buf, TOP_DEPTH);
}
/*
 * Recursive worker behind helper_float_13(): applies in-place, unnormalized
 * sum/difference butterflies (the pattern of a fast Walsh-Hadamard transform)
 * to an array of floats.
 *
 * buf   - start of the data. depth == 11 reads and writes buf[0..2047];
 *         depth == 13 reads and writes buf[0..8191].
 * depth - log2 of the span to process. Only 11 and 13 are handled; any other
 *         value falls through both branches and leaves buf untouched.
 *
 * Every load and store uses movups, so the asm itself imposes no alignment
 * requirement on buf. The asm uses addsubps (an SSE3 instruction) and
 * xmm8-xmm15 (available only in 64-bit mode). Each asm statement is volatile,
 * lists all sixteen xmm registers as clobbered, and carries a "memory"
 * clobber because it rewrites the buffer through pointer inputs only.
 */
void helper_float_13_recursive(float *buf, int depth);
void helper_float_13_recursive(float *buf, int depth) {
  if (depth == 11) {
    /*
     * Pass 1: each iteration loads 32 consecutive floats into xmm0-xmm7 and
     * performs five butterfly levels (distances 1, 2, 4, 8, 16) entirely in
     * registers. The k loop runs exactly once (k == 0).
     */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /*
           * Distance-1 butterflies inside each register:
           * [a, b, c, d] -> [a+b, a-b, c+d, c-d].
           * xmm8 gets the even lanes duplicated, xmm9 the negated odd lanes;
           * addsubps subtracts in even lanes and adds in odd lanes.
           */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /*
           * Distance-2 butterflies inside each register:
           * [a, b, c, d] -> [a+c, b+d, a-c, b-d].
           * Built as ([c, d, 0, 0] + [a, b, a, b]) - [0, 0, c, d].
           */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /*
           * Three butterfly levels across registers (partners 1, 2, then 4
           * registers apart, i.e. distances 4, 8 and 16 floats). The final
           * values land in xmm8-xmm15 and are stored back in operand order.
           */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /*
     * Pass 2: three more butterfly levels (distances 32, 64, 128) inside each
     * 256-float group; the eight operands are 32 floats apart and k walks the
     * 32-float stride four floats at a time.
     */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /*
     * Pass 3: last three levels of the 2048-float block (distances 256, 512,
     * 1024); operands are 256 floats apart. The j loop runs exactly once.
     */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 13) {
    /* Transform each 2048-float quarter of the 8192-float buffer on its own. */
    helper_float_13_recursive(buf + 0, 11);
    helper_float_13_recursive(buf + 2048, 11);
    helper_float_13_recursive(buf + 4096, 11);
    helper_float_13_recursive(buf + 6144, 11);
    /*
     * Merge the quarters with the two remaining butterfly levels (distances
     * 2048 and 4096): four operands 2048 floats apart, four floats per step.
     * The j loop runs exactly once.
     */
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Entry point for the 8192-float (2^13) in-place transform: hands the whole
 * buffer to the recursive worker at its top level.
 */
void helper_float_13(float *buf);
void helper_float_13(float *buf) {
  enum { top_level_depth = 13 };
  helper_float_13_recursive(buf, top_level_depth);
}
/*
 * Recursive worker behind helper_float_14(): applies in-place, unnormalized
 * sum/difference butterflies (the pattern of a fast Walsh-Hadamard transform)
 * to an array of floats.
 *
 * buf   - start of the data. depth == 11 reads and writes buf[0..2047];
 *         depth == 14 reads and writes buf[0..16383].
 * depth - log2 of the span to process. Only 11 and 14 are handled; any other
 *         value falls through both branches and leaves buf untouched.
 *
 * Every load and store uses movups, so the asm itself imposes no alignment
 * requirement on buf. The asm uses addsubps (an SSE3 instruction) and
 * xmm8-xmm15 (available only in 64-bit mode). Each asm statement is volatile,
 * lists all sixteen xmm registers as clobbered, and carries a "memory"
 * clobber because it rewrites the buffer through pointer inputs only.
 */
void helper_float_14_recursive(float *buf, int depth);
void helper_float_14_recursive(float *buf, int depth) {
  if (depth == 11) {
    /*
     * Pass 1: each iteration loads 32 consecutive floats into xmm0-xmm7 and
     * performs five butterfly levels (distances 1, 2, 4, 8, 16) entirely in
     * registers. The k loop runs exactly once (k == 0).
     */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /*
           * Distance-1 butterflies inside each register:
           * [a, b, c, d] -> [a+b, a-b, c+d, c-d].
           * xmm8 gets the even lanes duplicated, xmm9 the negated odd lanes;
           * addsubps subtracts in even lanes and adds in odd lanes.
           */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /*
           * Distance-2 butterflies inside each register:
           * [a, b, c, d] -> [a+c, b+d, a-c, b-d].
           * Built as ([c, d, 0, 0] + [a, b, a, b]) - [0, 0, c, d].
           */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /*
           * Three butterfly levels across registers (partners 1, 2, then 4
           * registers apart, i.e. distances 4, 8 and 16 floats). The final
           * values land in xmm8-xmm15 and are stored back in operand order.
           */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /*
     * Pass 2: three more butterfly levels (distances 32, 64, 128) inside each
     * 256-float group; the eight operands are 32 floats apart and k walks the
     * 32-float stride four floats at a time.
     */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /*
     * Pass 3: last three levels of the 2048-float block (distances 256, 512,
     * 1024); operands are 256 floats apart. The j loop runs exactly once.
     */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform each 2048-float eighth of the 16384-float buffer on its own. */
    helper_float_14_recursive(buf + 0, 11);
    helper_float_14_recursive(buf + 2048, 11);
    helper_float_14_recursive(buf + 4096, 11);
    helper_float_14_recursive(buf + 6144, 11);
    helper_float_14_recursive(buf + 8192, 11);
    helper_float_14_recursive(buf + 10240, 11);
    helper_float_14_recursive(buf + 12288, 11);
    helper_float_14_recursive(buf + 14336, 11);
    /*
     * Merge the eighths with the three remaining butterfly levels (distances
     * 2048, 4096 and 8192): eight operands 2048 floats apart, four floats per
     * step. The j loop runs exactly once.
     */
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Public entry point: forwards buf to helper_float_14_recursive, starting
 * the recursion at depth 14. The data is modified in place.
 */
void helper_float_14(float *buf);
void helper_float_14(float *buf) {
  const int top_depth = 14;
  helper_float_14_recursive(buf, top_depth);
}
/*
 * In-place butterfly network over the floats at buf, written as SSE inline
 * assembly (presumably one size of the fast Hadamard transform declared in
 * fht.h -- confirm against the header).
 *
 * buf:   start of the data. Every memory access uses movups, so the pointer
 *        does not have to be 16-byte aligned.
 * depth: selects the work to do.
 *          13 - transforms buf[0..8191] with four passes of add/subtract
 *               butterflies (pair distances 1..16, 32..128, 256..1024 and
 *               2048..4096).
 *          15 - runs the depth-13 case on each 8192-float quarter of
 *               buf[0..32767], then combines the quarters (pair distances
 *               8192 and 16384).
 *        Any other value falls through every branch and leaves buf untouched.
 *
 * The assembly uses xmm8-xmm15 (x86-64 only) and addsubps (SSE3).
 */
void helper_float_15_recursive(float *buf, int depth);
void helper_float_15_recursive(float *buf, int depth) {
  if (depth == 13) {
    /* Pass 1: each iteration handles 32 consecutive floats held in eight
       registers (the k loop runs exactly once). */
    for (int j = 0; j < 8192; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* Distance 1: each register [a,b,c,d] becomes [a+b, a-b, c+d, c-d].
             xmm8 holds [a,a,c,c], xmm9 holds 0 - [b,b,d,d]; addsubps then
             subtracts in even lanes and adds in odd lanes. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* Distance 2: each register [a,b,c,d] becomes [a+c, b+d, a-c, b-d].
             xmm8 = [a,b,a,b], xmm10 = [c,d,0,0], xmm9 = [0,0,c,d];
             result is xmm10 + xmm8 - xmm9. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* Distances 4, 8, 16: three rounds of sums and differences across
             the eight registers (xmm0-7 -> xmm8-15 -> xmm0-7 -> xmm8-15). */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          /* Store the results back to the addresses they were loaded from. */
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: within each 256-float block, combine eight vectors spaced
       32 floats apart (pair distances 32, 64, 128). */
    for (int j = 0; j < 8192; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: within each 2048-float block, combine eight vectors spaced
       256 floats apart (pair distances 256, 512, 1024). */
    for (int j = 0; j < 8192; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4: combine the four 2048-float quarters of the 8192-float block
       (pair distances 2048 and 4096); the j loop runs exactly once. */
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* Run the depth-13 case on each 8192-float quarter first. */
    helper_float_15_recursive(buf + 0, 13);
    helper_float_15_recursive(buf + 8192, 13);
    helper_float_15_recursive(buf + 16384, 13);
    helper_float_15_recursive(buf + 24576, 13);
    /* Then combine the four quarters element-wise (pair distances 8192 and
       16384); the j loop runs exactly once. */
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Public entry point: forwards buf to helper_float_15_recursive, starting
 * the recursion at depth 15. The data is modified in place.
 */
void helper_float_15(float *buf);
void helper_float_15(float *buf) {
  const int top_depth = 15;
  helper_float_15_recursive(buf, top_depth);
}
/*
 * In-place butterfly network over the floats at buf, written as SSE inline
 * assembly (presumably one size of the fast Hadamard transform declared in
 * fht.h -- confirm against the header).
 *
 * buf:   start of the data. Every memory access uses movups, so the pointer
 *        does not have to be 16-byte aligned.
 * depth: selects the work to do.
 *          11 - transforms buf[0..2047] with three passes of add/subtract
 *               butterflies (pair distances 1..16, 32..128 and 256..1024).
 *          14 - runs the depth-11 case on each 2048-float eighth of
 *               buf[0..16383], then combines the eighths (pair distances
 *               2048, 4096 and 8192).
 *          16 - runs the depth-14 case on each 16384-float quarter of
 *               buf[0..65535], then combines the quarters (pair distances
 *               16384 and 32768).
 *        Any other value falls through every branch and leaves buf untouched.
 *
 * The assembly uses xmm8-xmm15 (x86-64 only) and addsubps (SSE3).
 */
void helper_float_16_recursive(float *buf, int depth);
void helper_float_16_recursive(float *buf, int depth) {
  if (depth == 11) {
    /* Pass 1: each iteration handles 32 consecutive floats held in eight
       registers (the k loop runs exactly once). */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* Distance 1: each register [a,b,c,d] becomes [a+b, a-b, c+d, c-d].
             xmm8 holds [a,a,c,c], xmm9 holds 0 - [b,b,d,d]; addsubps then
             subtracts in even lanes and adds in odd lanes. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* Distance 2: each register [a,b,c,d] becomes [a+c, b+d, a-c, b-d].
             xmm8 = [a,b,a,b], xmm10 = [c,d,0,0], xmm9 = [0,0,c,d];
             result is xmm10 + xmm8 - xmm9. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* Distances 4, 8, 16: three rounds of sums and differences across
             the eight registers (xmm0-7 -> xmm8-15 -> xmm0-7 -> xmm8-15). */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          /* Store the results back to the addresses they were loaded from. */
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: within each 256-float block, combine eight vectors spaced
       32 floats apart (pair distances 32, 64, 128). */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: combine eight vectors spaced 256 floats apart across the whole
       2048-float block (pair distances 256, 512, 1024); the j loop runs
       exactly once. */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Run the depth-11 case on each 2048-float eighth first. */
    helper_float_16_recursive(buf + 0, 11);
    helper_float_16_recursive(buf + 2048, 11);
    helper_float_16_recursive(buf + 4096, 11);
    helper_float_16_recursive(buf + 6144, 11);
    helper_float_16_recursive(buf + 8192, 11);
    helper_float_16_recursive(buf + 10240, 11);
    helper_float_16_recursive(buf + 12288, 11);
    helper_float_16_recursive(buf + 14336, 11);
    /* Then combine the eight eighths element-wise (pair distances 2048,
       4096, 8192); the j loop runs exactly once. */
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Run the depth-14 case on each 16384-float quarter first. */
    helper_float_16_recursive(buf + 0, 14);
    helper_float_16_recursive(buf + 16384, 14);
    helper_float_16_recursive(buf + 32768, 14);
    helper_float_16_recursive(buf + 49152, 14);
    /* Then combine the four quarters element-wise (pair distances 16384 and
       32768); the j loop runs exactly once. */
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * helper_float_16 - fixed-size entry point for the 16-level kernel.
 *
 * Runs helper_float_16_recursive at its top level (depth 16). That level
 * reads and writes buf[0] .. buf[65535] in place, so buf must point to at
 * least 65536 floats. The loads and stores are unaligned (movups), so no
 * particular alignment of buf is needed by that level.
 * NOTE(review): presumably the 2^16-point fast Hadamard transform declared
 * in fht.h -- confirm against the header.
 */
void helper_float_16(float *buf);
void helper_float_16(float *buf) {
  enum { TOP_LEVEL_DEPTH = 16 };

  helper_float_16_recursive(buf, TOP_LEVEL_DEPTH);
}
void helper_float_17_recursive(float *buf, int depth);

/*
 * Assembly text fragments (AT&T syntax; "%%" is the extended-asm escape for
 * a literal '%'). They are pasted together by string-literal concatenation
 * inside the asm statements below and are #undef'd once those are defined.
 */

/* Load eight 4-float vectors from operands %0..%7 into xmm0..xmm7. */
#define HF17_LOAD8 \
  "movups (%0), %%xmm0\n" \
  "movups (%1), %%xmm1\n" \
  "movups (%2), %%xmm2\n" \
  "movups (%3), %%xmm3\n" \
  "movups (%4), %%xmm4\n" \
  "movups (%5), %%xmm5\n" \
  "movups (%6), %%xmm6\n" \
  "movups (%7), %%xmm7\n"

/* Store xmm8..xmm15 back to operands %0..%7. */
#define HF17_STORE8 \
  "movups %%xmm8, (%0)\n" \
  "movups %%xmm9, (%1)\n" \
  "movups %%xmm10, (%2)\n" \
  "movups %%xmm11, (%3)\n" \
  "movups %%xmm12, (%4)\n" \
  "movups %%xmm13, (%5)\n" \
  "movups %%xmm14, (%6)\n" \
  "movups %%xmm15, (%7)\n"

/*
 * Butterfly on adjacent lanes of xmm<r>:
 *   [a0, a1, a2, a3] -> [a0+a1, a0-a1, a2+a3, a2-a3]
 * Uses xmm8 and xmm9 as scratch.
 */
#define HF17_PAIRS(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Butterfly on the two halves of xmm<r>:
 *   [b0, b1, b2, b3] -> [b0+b2, b1+b3, b0-b2, b1-b3]
 * Uses xmm8..xmm11 as scratch.
 */
#define HF17_HALVES(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* xmm<s> = xmm<a> + xmm<b>;  xmm<d> = xmm<a> - xmm<b>  (lane-wise). */
#define HF17_SUMDIFF(a, b, s, d) \
  "movaps %%xmm" #a ", %%xmm" #s "\n" \
  "movaps %%xmm" #a ", %%xmm" #d "\n" \
  "addps %%xmm" #b ", %%xmm" #s "\n" \
  "subps %%xmm" #b ", %%xmm" #d "\n"

/*
 * Three butterfly levels across whole registers: inputs in xmm0..xmm7,
 * results in xmm8..xmm15 (intermediate values ping-pong between the two
 * register halves).
 */
#define HF17_BUTTERFLY8 \
  HF17_SUMDIFF(0, 1, 8, 9) \
  HF17_SUMDIFF(2, 3, 10, 11) \
  HF17_SUMDIFF(4, 5, 12, 13) \
  HF17_SUMDIFF(6, 7, 14, 15) \
  HF17_SUMDIFF(8, 10, 0, 2) \
  HF17_SUMDIFF(9, 11, 1, 3) \
  HF17_SUMDIFF(12, 14, 4, 6) \
  HF17_SUMDIFF(13, 15, 5, 7) \
  HF17_SUMDIFF(0, 4, 8, 12) \
  HF17_SUMDIFF(1, 5, 9, 13) \
  HF17_SUMDIFF(2, 6, 10, 14) \
  HF17_SUMDIFF(3, 7, 11, 15)

/* Clobber list used by every asm statement in this block. */
#define HF17_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * Five butterfly levels on the 32 consecutive floats at p, in place:
 * two levels inside each 4-float vector, then three levels across the
 * eight vectors.
 */
static inline void helper_float_17_wht32(float *p) {
  __asm__ volatile (
    HF17_LOAD8
    HF17_PAIRS(0)
    HF17_PAIRS(1)
    HF17_PAIRS(2)
    HF17_PAIRS(3)
    HF17_PAIRS(4)
    HF17_PAIRS(5)
    HF17_PAIRS(6)
    HF17_PAIRS(7)
    HF17_HALVES(0)
    HF17_HALVES(1)
    HF17_HALVES(2)
    HF17_HALVES(3)
    HF17_HALVES(4)
    HF17_HALVES(5)
    HF17_HALVES(6)
    HF17_HALVES(7)
    HF17_BUTTERFLY8
    HF17_STORE8
    :: "r"(p + 0), "r"(p + 4), "r"(p + 8), "r"(p + 12),
       "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
    : HF17_CLOBBERS
  );
}

/*
 * Three butterfly levels, in place, on eight 4-float vectors located at
 * p, p + stride, ..., p + 7 * stride.
 */
static inline void helper_float_17_radix8(float *p, int stride) {
  __asm__ volatile (
    HF17_LOAD8
    HF17_BUTTERFLY8
    HF17_STORE8
    :: "r"(p + 0 * stride), "r"(p + 1 * stride), "r"(p + 2 * stride),
       "r"(p + 3 * stride), "r"(p + 4 * stride), "r"(p + 5 * stride),
       "r"(p + 6 * stride), "r"(p + 7 * stride)
    : HF17_CLOBBERS
  );
}

#undef HF17_CLOBBERS
#undef HF17_BUTTERFLY8
#undef HF17_SUMDIFF
#undef HF17_HALVES
#undef HF17_PAIRS
#undef HF17_STORE8
#undef HF17_LOAD8

/*
 * Apply the strided 8-vector butterfly to every group of 8 * stride floats
 * in buf[0 .. len - 1], walking each group four floats at a time.
 */
static inline void helper_float_17_radix8_pass(float *buf, int len, int stride) {
  const int group = 8 * stride;

  for (int start = 0; start < len; start += group) {
    for (int off = 0; off < stride; off += 4) {
      helper_float_17_radix8(buf + start + off, stride);
    }
  }
}

/*
 * helper_float_17_recursive - in-place, unnormalised butterfly transform of
 * the first 2^depth floats of buf (output index k receives
 * sum_i (-1)^popcount(i & k) * buf[i]).
 *
 * buf:   at least 2^depth floats; accessed with unaligned loads/stores.
 * depth: 11, 14 or 17. Any other value returns without touching buf.
 *
 * depth 11 is the leaf: a 32-float kernel over each run of 32 floats,
 * followed by strided passes at stride 32 and stride 256. depth 14 and 17
 * each transform eight consecutive sub-blocks of 2^(depth - 3) floats and
 * then combine them with one strided pass at that sub-block length.
 */
void helper_float_17_recursive(float *buf, int depth) {
  switch (depth) {
  case 11:
    for (int start = 0; start < 2048; start += 32) {
      helper_float_17_wht32(buf + start);
    }
    helper_float_17_radix8_pass(buf, 2048, 32);
    helper_float_17_radix8_pass(buf, 2048, 256);
    return;

  case 14:
  case 17: {
    const int sub_depth = depth - 3;
    const int sub_len = 1 << sub_depth;

    for (int part = 0; part < 8; ++part) {
      helper_float_17_recursive(buf + part * sub_len, sub_depth);
    }
    helper_float_17_radix8_pass(buf, 8 * sub_len, sub_len);
    return;
  }

  default:
    return;
  }
}
/*
 * helper_float_17 - fixed-size entry point for the 17-level kernel.
 *
 * Runs helper_float_17_recursive at its top level (depth 17), which
 * transforms buf[0] .. buf[131071] in place; buf must therefore point to at
 * least 131072 floats. The kernel uses unaligned loads and stores, so buf
 * needs no special alignment.
 */
void helper_float_17(float *buf);
void helper_float_17(float *buf) {
  enum { TOP_LEVEL_DEPTH = 17 };

  helper_float_17_recursive(buf, TOP_LEVEL_DEPTH);
}
void helper_float_18_recursive(float *buf, int depth);
void helper_float_18_recursive(float *buf, int depth) {
  if (depth == 13) {
    for (int j = 0; j < 8192; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    helper_float_18_recursive(buf + 0, 13);
    helper_float_18_recursive(buf + 8192, 13);
    helper_float_18_recursive(buf + 16384, 13);
    helper_float_18_recursive(buf + 24576, 13);
    helper_float_18_recursive(buf + 32768, 13);
    helper_float_18_recursive(buf + 40960, 13);
    helper_float_18_recursive(buf + 49152, 13);
    helper_float_18_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_float_18_recursive(buf + 0, 16);
    helper_float_18_recursive(buf + 65536, 16);
    helper_float_18_recursive(buf + 131072, 16);
    helper_float_18_recursive(buf + 196608, 16);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 65536; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Entry point for the 18-level float transform: hands the buffer to the
 * recursive driver at its top level.  At depth 18 that driver reads and
 * writes buf[0 .. 262143] in place, so buf must hold at least 2^18 floats.
 */
void helper_float_18(float *buf);
void helper_float_18(float *buf) {
  const int top_level = 18;
  helper_float_18_recursive(buf, top_level);
}
/*
 * helper_float_19_recursive -- in-place, unnormalized butterfly transform
 * (each level replaces a pair (a, b) that is `stride` floats apart by
 * (a + b, a - b)) on 2^depth floats, written with x86-64 SSE3 inline asm.
 *
 * The work is split into three kinds of passes, each implemented by a private
 * helper below.  All loads and stores use movups, so buf needs no particular
 * alignment.
 */
void helper_float_19_recursive(float *buf, int depth);

/* ---- assembly text fragments (all #undef'd again before the entry point) ---- */

/* Load operands %0..%3 / %0..%7 into xmm0..xmm3 / xmm0..xmm7. */
#define HF19_LOAD4 \
  "movups (%0), %%xmm0\n" \
  "movups (%1), %%xmm1\n" \
  "movups (%2), %%xmm2\n" \
  "movups (%3), %%xmm3\n"
#define HF19_LOAD8 \
  HF19_LOAD4 \
  "movups (%4), %%xmm4\n" \
  "movups (%5), %%xmm5\n" \
  "movups (%6), %%xmm6\n" \
  "movups (%7), %%xmm7\n"

/* Store xmm0..xmm3 back to operands %0..%3. */
#define HF19_STORE4 \
  "movups %%xmm0, (%0)\n" \
  "movups %%xmm1, (%1)\n" \
  "movups %%xmm2, (%2)\n" \
  "movups %%xmm3, (%3)\n"

/* Store xmm8..xmm15 back to operands %0..%7. */
#define HF19_STORE8 \
  "movups %%xmm8, (%0)\n" \
  "movups %%xmm9, (%1)\n" \
  "movups %%xmm10, (%2)\n" \
  "movups %%xmm11, (%3)\n" \
  "movups %%xmm12, (%4)\n" \
  "movups %%xmm13, (%5)\n" \
  "movups %%xmm14, (%6)\n" \
  "movups %%xmm15, (%7)\n"

/*
 * Butterfly between neighbouring lanes of xmm<r>:
 *   [a, b, c, d] -> [a+b, a-b, c+d, c-d]
 * shufps $160 duplicates the even lanes ([a,a,c,c]), shufps $245 the odd
 * lanes ([b,b,d,d]); the odd copy is negated so that addsubps (subtract in
 * even lanes, add in odd lanes) yields sum/difference.  Scratch: xmm8, xmm9.
 */
#define HF19_LANE_STRIDE1(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Butterfly between the two halves of xmm<r>:
 *   [a, b, c, d] -> [a+c, b+d, a-c, b-d]
 * Built as ([c,d,0,0] + [a,b,a,b]) - [0,0,c,d].  Scratch: xmm8..xmm11.
 */
#define HF19_LANE_STRIDE2(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* Whole-register butterfly: xmm<s> = xmm<a> + xmm<b>, xmm<d> = xmm<a> - xmm<b>. */
#define HF19_BFLY(a, b, s, d) \
  "movaps %%xmm" #a ", %%xmm" #s "\n" \
  "movaps %%xmm" #a ", %%xmm" #d "\n" \
  "addps %%xmm" #b ", %%xmm" #s "\n" \
  "subps %%xmm" #b ", %%xmm" #d "\n"

/* Two butterfly levels over xmm0..xmm3; results end up in xmm0..xmm3. */
#define HF19_RADIX4_CORE \
  HF19_BFLY(0, 1, 8, 9) \
  HF19_BFLY(2, 3, 10, 11) \
  HF19_BFLY(8, 10, 0, 2) \
  HF19_BFLY(9, 11, 1, 3)

/* Three butterfly levels over xmm0..xmm7; results end up in xmm8..xmm15. */
#define HF19_RADIX8_CORE \
  HF19_BFLY(0, 1, 8, 9) \
  HF19_BFLY(2, 3, 10, 11) \
  HF19_BFLY(4, 5, 12, 13) \
  HF19_BFLY(6, 7, 14, 15) \
  HF19_BFLY(8, 10, 0, 2) \
  HF19_BFLY(9, 11, 1, 3) \
  HF19_BFLY(12, 14, 4, 6) \
  HF19_BFLY(13, 15, 5, 7) \
  HF19_BFLY(0, 4, 8, 12) \
  HF19_BFLY(1, 5, 9, 13) \
  HF19_BFLY(2, 6, 10, 14) \
  HF19_BFLY(3, 7, 11, 15)

/* Every asm block declares all sixteen xmm registers and memory as clobbered. */
#define HF19_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", \
  "memory"

/*
 * Applies the first five butterfly levels (strides 1, 2, 4, 8, 16) to each
 * consecutive group of 32 floats in buf[0 .. count).  Strides 1 and 2 are
 * done inside each register, strides 4/8/16 across the eight registers.
 */
static inline void helper_float_19_base32_pass(float *buf, int count) {
  for (int base = 0; base < count; base += 32) {
    float *p = buf + base;
    __asm__ volatile (
      HF19_LOAD8
      HF19_LANE_STRIDE1(0)
      HF19_LANE_STRIDE1(1)
      HF19_LANE_STRIDE1(2)
      HF19_LANE_STRIDE1(3)
      HF19_LANE_STRIDE1(4)
      HF19_LANE_STRIDE1(5)
      HF19_LANE_STRIDE1(6)
      HF19_LANE_STRIDE1(7)
      HF19_LANE_STRIDE2(0)
      HF19_LANE_STRIDE2(1)
      HF19_LANE_STRIDE2(2)
      HF19_LANE_STRIDE2(3)
      HF19_LANE_STRIDE2(4)
      HF19_LANE_STRIDE2(5)
      HF19_LANE_STRIDE2(6)
      HF19_LANE_STRIDE2(7)
      HF19_RADIX8_CORE
      HF19_STORE8
      :: "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
         "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
      : HF19_CLOBBERS
    );
  }
}

/*
 * Applies three butterfly levels (strides stride, 2*stride, 4*stride) to
 * buf[0 .. count), four floats at a time.  stride is a multiple of 4 and
 * count a multiple of 8*stride at every call site in this file.
 */
static inline void helper_float_19_radix8_pass(float *buf, int count, int stride) {
  const int group = 8 * stride;
  for (int base = 0; base < count; base += group) {
    for (int off = 0; off < stride; off += 4) {
      float *p = buf + base + off;
      __asm__ volatile (
        HF19_LOAD8
        HF19_RADIX8_CORE
        HF19_STORE8
        :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
           "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride), "r"(p + 7 * stride)
        : HF19_CLOBBERS
      );
    }
  }
}

/*
 * Applies two butterfly levels (strides stride and 2*stride) to
 * buf[0 .. count), four floats at a time.
 */
static inline void helper_float_19_radix4_pass(float *buf, int count, int stride) {
  const int group = 4 * stride;
  for (int base = 0; base < count; base += group) {
    for (int off = 0; off < stride; off += 4) {
      float *p = buf + base + off;
      __asm__ volatile (
        HF19_LOAD4
        HF19_RADIX4_CORE
        HF19_STORE4
        :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
        : HF19_CLOBBERS
      );
    }
  }
}

#undef HF19_CLOBBERS
#undef HF19_RADIX8_CORE
#undef HF19_RADIX4_CORE
#undef HF19_BFLY
#undef HF19_LANE_STRIDE2
#undef HF19_LANE_STRIDE1
#undef HF19_STORE8
#undef HF19_STORE4
#undef HF19_LOAD8
#undef HF19_LOAD4

/*
 * Transforms buf[0 .. 2^depth) in place.
 *
 *   depth == 13: 5 + 3 + 3 + 2 butterfly levels, done iteratively on 8192 floats.
 *   depth == 16: eight depth-13 sub-transforms, then one 3-level combine pass.
 *   depth == 19: eight depth-16 sub-transforms, then one 3-level combine pass.
 *
 * Any other depth is silently ignored and leaves buf untouched.
 */
void helper_float_19_recursive(float *buf, int depth) {
  const int n13 = 8192;    /* 2^13 */
  const int n16 = 65536;   /* 2^16 */
  const int n19 = 524288;  /* 2^19 */

  if (depth == 13) {
    helper_float_19_base32_pass(buf, n13);
    helper_float_19_radix8_pass(buf, n13, 32);
    helper_float_19_radix8_pass(buf, n13, 256);
    helper_float_19_radix4_pass(buf, n13, 2048);
    return;
  }
  if (depth == 16) {
    for (int part = 0; part < 8; ++part) {
      helper_float_19_recursive(buf + part * n13, 13);
    }
    helper_float_19_radix8_pass(buf, n16, n13);
    return;
  }
  if (depth == 19) {
    for (int part = 0; part < 8; ++part) {
      helper_float_19_recursive(buf + part * n16, 16);
    }
    helper_float_19_radix8_pass(buf, n19, n16);
    return;
  }
}
/*
 * Entry point for the 19-level float transform: hands the buffer to the
 * recursive driver at its top level.  At depth 19 that driver reads and
 * writes buf[0 .. 524287] in place, so buf must hold at least 2^19 floats.
 */
void helper_float_19(float *buf);
void helper_float_19(float *buf) {
  const int top_level = 19;
  helper_float_19_recursive(buf, top_level);
}
/*
 * helper_float_20_recursive and its private kernels.
 *
 * The routine applies an in-place network of add/subtract butterflies
 * (the unnormalised fast Hadamard transform pattern) to the first
 * (1 << depth) floats of `buf`. Every radix-8 pass runs the same SSE
 * instruction sequence, so the assembler text is assembled from the
 * fragments below instead of being pasted once per pass.
 *
 * Requirements visible from the code: x86-64 (xmm8..xmm15 are used) with
 * SSE3 (addsubps). All loads and stores use movups, so `buf` does not need
 * to be 16-byte aligned.
 *
 * The HF20_* macros only build string literals for the asm statements and
 * are #undef'd at the end of this block.
 */

/* Load eight 4-float vectors from operands %0..%7 into xmm0..xmm7. */
#define HF20_LOAD8          \
  "movups (%0), %%xmm0\n"   \
  "movups (%1), %%xmm1\n"   \
  "movups (%2), %%xmm2\n"   \
  "movups (%3), %%xmm3\n"   \
  "movups (%4), %%xmm4\n"   \
  "movups (%5), %%xmm5\n"   \
  "movups (%6), %%xmm6\n"   \
  "movups (%7), %%xmm7\n"

/* Store xmm8..xmm15 back to operands %0..%7. */
#define HF20_STORE8         \
  "movups %%xmm8, (%0)\n"   \
  "movups %%xmm9, (%1)\n"   \
  "movups %%xmm10, (%2)\n"  \
  "movups %%xmm11, (%3)\n"  \
  "movups %%xmm12, (%4)\n"  \
  "movups %%xmm13, (%5)\n"  \
  "movups %%xmm14, (%6)\n"  \
  "movups %%xmm15, (%7)\n"

/* Packed butterfly between two registers: sum = a + b, dif = a - b. */
#define HF20_BFLY(a, b, sum, dif)          \
  "movaps %%xmm" #a ", %%xmm" #sum "\n"    \
  "movaps %%xmm" #a ", %%xmm" #dif "\n"    \
  "addps %%xmm" #b ", %%xmm" #sum "\n"     \
  "subps %%xmm" #b ", %%xmm" #dif "\n"

/*
 * Three butterfly stages across eight registers: inputs in xmm0..xmm7,
 * results in xmm8..xmm15 (xmm0..xmm7 are overwritten as scratch).
 */
#define HF20_RADIX8       \
  HF20_BFLY(0, 1, 8, 9)   \
  HF20_BFLY(2, 3, 10, 11) \
  HF20_BFLY(4, 5, 12, 13) \
  HF20_BFLY(6, 7, 14, 15) \
  HF20_BFLY(8, 10, 0, 2)  \
  HF20_BFLY(9, 11, 1, 3)  \
  HF20_BFLY(12, 14, 4, 6) \
  HF20_BFLY(13, 15, 5, 7) \
  HF20_BFLY(0, 4, 8, 12)  \
  HF20_BFLY(1, 5, 9, 13)  \
  HF20_BFLY(2, 6, 10, 14) \
  HF20_BFLY(3, 7, 11, 15)

/*
 * Butterfly between adjacent lanes of one register:
 * [a, b, c, d] -> [a + b, a - b, c + d, c - d]. Scratch: xmm8, xmm9.
 */
#define HF20_LANE1(r)                         \
  "movaps %%xmm" #r ", %%xmm8\n"              \
  "shufps $160, %%xmm8, %%xmm8\n"             \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n"   \
  "xorps %%xmm9, %%xmm9\n"                    \
  "subps %%xmm" #r ", %%xmm9\n"               \
  "addsubps %%xmm9, %%xmm8\n"                 \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Butterfly between the low and high halves of one register:
 * [a, b, c, d] -> [a + c, b + d, a - c, b - d]. Scratch: xmm8..xmm11.
 */
#define HF20_LANE2(r)                  \
  "movaps %%xmm" #r ", %%xmm8\n"       \
  "shufps $68, %%xmm8, %%xmm8\n"       \
  "xorps %%xmm9, %%xmm9\n"             \
  "movaps %%xmm" #r ", %%xmm10\n"      \
  "shufps $14, %%xmm9, %%xmm10\n"      \
  "movaps %%xmm" #r ", %%xmm11\n"      \
  "shufps $224, %%xmm11, %%xmm9\n"     \
  "addps %%xmm8, %%xmm10\n"            \
  "subps %%xmm9, %%xmm10\n"            \
  "movaps %%xmm10, %%xmm" #r "\n"

/* Every asm statement below clobbers all sixteen xmm registers and memory. */
#define HF20_CLOBBERS                                                        \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",    \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14",        \
  "%xmm15", "memory"

/*
 * Transforms 32 consecutive floats starting at `chunk` entirely in
 * registers: two in-lane stages per vector followed by three stages across
 * the eight vectors.
 */
static inline void helper_float_20_block32(float *chunk) {
  __asm__ volatile (
    HF20_LOAD8
    HF20_LANE1(0) HF20_LANE1(1) HF20_LANE1(2) HF20_LANE1(3)
    HF20_LANE1(4) HF20_LANE1(5) HF20_LANE1(6) HF20_LANE1(7)
    HF20_LANE2(0) HF20_LANE2(1) HF20_LANE2(2) HF20_LANE2(3)
    HF20_LANE2(4) HF20_LANE2(5) HF20_LANE2(6) HF20_LANE2(7)
    HF20_RADIX8
    HF20_STORE8
    :: "r"(chunk), "r"(chunk + 4), "r"(chunk + 8), "r"(chunk + 12),
       "r"(chunk + 16), "r"(chunk + 20), "r"(chunk + 24), "r"(chunk + 28)
    : HF20_CLOBBERS
  );
}

/*
 * Combines eight 4-float vectors located at base, base + stride, ...,
 * base + 7 * stride with three butterfly stages and writes the results
 * back to the same eight locations.
 */
static inline void helper_float_20_radix8_pass(float *base, int stride) {
  __asm__ volatile (
    HF20_LOAD8
    HF20_RADIX8
    HF20_STORE8
    :: "r"(base), "r"(base + stride), "r"(base + 2 * stride),
       "r"(base + 3 * stride), "r"(base + 4 * stride), "r"(base + 5 * stride),
       "r"(base + 6 * stride), "r"(base + 7 * stride)
    : HF20_CLOBBERS
  );
}

/*
 * In-place transform of the first (1 << depth) floats of `buf`.
 *
 * buf   - data to transform; must hold at least (1 << depth) floats.
 * depth - log2 of the transform length. Only 8, 11, 14, 17 and 20 are
 *         handled; for any other value the function returns without
 *         touching `buf`.
 *
 * depth == 8 is the base case: each 32-float chunk is transformed in
 * registers, then one radix-8 pass with stride 32 joins the eight chunks.
 * Each larger depth transforms its eight sub-blocks of (1 << (depth - 3))
 * floats recursively and joins them with one radix-8 pass whose stride is
 * the sub-block length.
 */
void helper_float_20_recursive(float *buf, int depth);
void helper_float_20_recursive(float *buf, int depth) {
  if (depth == 8) {
    for (int chunk = 0; chunk < 256; chunk += 32) {
      helper_float_20_block32(buf + chunk);
    }
    for (int col = 0; col < 32; col += 4) {
      helper_float_20_radix8_pass(buf + col, 32);
    }
    return;
  }
  if (depth == 11 || depth == 14 || depth == 17 || depth == 20) {
    const int sub_depth = depth - 3;
    const int sub_len = 1 << sub_depth;
    for (int part = 0; part < 8; ++part) {
      helper_float_20_recursive(buf + part * sub_len, sub_depth);
    }
    for (int col = 0; col < sub_len; col += 4) {
      helper_float_20_radix8_pass(buf + col, sub_len);
    }
    return;
  }
  /* Any other depth: deliberately a no-op. */
}

#undef HF20_CLOBBERS
#undef HF20_LANE2
#undef HF20_LANE1
#undef HF20_RADIX8
#undef HF20_BFLY
#undef HF20_STORE8
#undef HF20_LOAD8
/*
 * Entry point for the depth-20 float transform.
 *
 * Forwards `buf` unchanged to helper_float_20_recursive, starting the
 * recursion at depth 20. The transform is performed in place; at that depth
 * the recursive routine reads and writes indices 0 .. 1048575, so `buf` must
 * hold at least 1048576 (1 << 20) floats.
 */
void helper_float_20(float *buf);
void helper_float_20(float *buf) {
  const int top_depth = 20;
  helper_float_20_recursive(buf, top_depth);
}
void helper_float_21_recursive(float *buf, int depth);
void helper_float_21_recursive(float *buf, int depth) {
  if (depth == 13) {
    for (int j = 0; j < 8192; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    helper_float_21_recursive(buf + 0, 13);
    helper_float_21_recursive(buf + 8192, 13);
    helper_float_21_recursive(buf + 16384, 13);
    helper_float_21_recursive(buf + 24576, 13);
    helper_float_21_recursive(buf + 32768, 13);
    helper_float_21_recursive(buf + 40960, 13);
    helper_float_21_recursive(buf + 49152, 13);
    helper_float_21_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    helper_float_21_recursive(buf + 0, 16);
    helper_float_21_recursive(buf + 65536, 16);
    helper_float_21_recursive(buf + 131072, 16);
    helper_float_21_recursive(buf + 196608, 16);
    helper_float_21_recursive(buf + 262144, 16);
    helper_float_21_recursive(buf + 327680, 16);
    helper_float_21_recursive(buf + 393216, 16);
    helper_float_21_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_float_21_recursive(buf + 0, 19);
    helper_float_21_recursive(buf + 524288, 19);
    helper_float_21_recursive(buf + 1048576, 19);
    helper_float_21_recursive(buf + 1572864, 19);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 524288; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Top-level entry for the log2-size-21 float helper.
 *
 * Starts helper_float_21_recursive at depth 21. At that depth the recursive
 * routine works on four consecutive sub-blocks of 524288 floats and then
 * combines them, loading from and storing back to the same addresses, so
 * `buf` is modified in place and must reference at least 2097152 (2^21)
 * floats.
 *
 * NOTE(review): presumably this is an in-place fast Hadamard transform (the
 * file includes "fht.h" and the kernels are add/sub butterflies) -- confirm
 * against the header.
 */
void helper_float_21(float *buf);
void helper_float_21(float *buf) {
  /* Depth the recursion is started at; matches the "21" in the name. */
  enum { START_DEPTH = 21 };

  helper_float_21_recursive(buf, START_DEPTH);
}
void helper_float_22_recursive(float *buf, int depth);
void helper_float_22_recursive(float *buf, int depth) {
  if (depth == 11) {
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    helper_float_22_recursive(buf + 0, 11);
    helper_float_22_recursive(buf + 2048, 11);
    helper_float_22_recursive(buf + 4096, 11);
    helper_float_22_recursive(buf + 6144, 11);
    helper_float_22_recursive(buf + 8192, 11);
    helper_float_22_recursive(buf + 10240, 11);
    helper_float_22_recursive(buf + 12288, 11);
    helper_float_22_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    helper_float_22_recursive(buf + 0, 14);
    helper_float_22_recursive(buf + 16384, 14);
    helper_float_22_recursive(buf + 32768, 14);
    helper_float_22_recursive(buf + 49152, 14);
    helper_float_22_recursive(buf + 65536, 14);
    helper_float_22_recursive(buf + 81920, 14);
    helper_float_22_recursive(buf + 98304, 14);
    helper_float_22_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_float_22_recursive(buf + 0, 17);
    helper_float_22_recursive(buf + 131072, 17);
    helper_float_22_recursive(buf + 262144, 17);
    helper_float_22_recursive(buf + 393216, 17);
    helper_float_22_recursive(buf + 524288, 17);
    helper_float_22_recursive(buf + 655360, 17);
    helper_float_22_recursive(buf + 786432, 17);
    helper_float_22_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 22) {
    helper_float_22_recursive(buf + 0, 20);
    helper_float_22_recursive(buf + 1048576, 20);
    helper_float_22_recursive(buf + 2097152, 20);
    helper_float_22_recursive(buf + 3145728, 20);
    for (int j = 0; j < 4194304; j += 4194304) {
      for (int k = 0; k < 1048576; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Top-level entry point of the 2^22-element float kernel: starts
 * helper_float_22_recursive at its outermost level (depth 22).
 *
 * buf is transformed in place.  The depth-22 level reads and writes
 * buf[0] .. buf[4194303], so buf must provide at least 4194304 floats.
 */
void helper_float_22(float *buf);
void helper_float_22(float *buf) {
  enum { top_depth = 22 };
  helper_float_22_recursive(buf, top_depth);
}
/*
 * Assembly fragments shared by the helper_float_23 kernels.  Arguments are
 * SSE register numbers (or asm operand numbers where noted).  Every macro
 * expands to adjacent string literals that are concatenated into an
 * extended-asm template, hence the doubled "%%".  All of them are #undef'd
 * once helper_float_23_recursive has been defined.
 */

/* Unaligned load of four floats from asm operand `op` into xmm<reg>. */
#define HF23_LOAD(op, reg) "movups (%" #op "), %%xmm" #reg "\n"

/* Unaligned store of xmm<reg> to the address held in asm operand `op`. */
#define HF23_STORE(reg, op) "movups %%xmm" #reg ", (%" #op ")\n"

/* Lane-wise butterfly: sum = a + b, diff = a - b; a and b are preserved. */
#define HF23_BFLY(a, b, sum, diff)          \
  "movaps %%xmm" #a ", %%xmm" #sum "\n"     \
  "movaps %%xmm" #a ", %%xmm" #diff "\n"    \
  "addps %%xmm" #b ", %%xmm" #sum "\n"      \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/*
 * Butterfly on adjacent lanes inside one register:
 * [a0, a1, a2, a3] -> [a0 + a1, a0 - a1, a2 + a3, a2 - a3].
 * Uses xmm8 and xmm9 as scratch.
 */
#define HF23_PAIRS(r)                       \
  "movaps %%xmm" #r ", %%xmm8\n"            \
  "shufps $160, %%xmm8, %%xmm8\n"           \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n"                  \
  "subps %%xmm" #r ", %%xmm9\n"             \
  "addsubps %%xmm9, %%xmm8\n"               \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Butterfly between the low and high halves of one register:
 * [b0, b1, b2, b3] -> [b0 + b2, b1 + b3, b0 - b2, b1 - b3].
 * Uses xmm8 .. xmm11 as scratch.
 */
#define HF23_QUADS(r)                       \
  "movaps %%xmm" #r ", %%xmm8\n"            \
  "shufps $68, %%xmm8, %%xmm8\n"            \
  "xorps %%xmm9, %%xmm9\n"                  \
  "movaps %%xmm" #r ", %%xmm10\n"           \
  "shufps $14, %%xmm9, %%xmm10\n"           \
  "movaps %%xmm" #r ", %%xmm11\n"           \
  "shufps $224, %%xmm11, %%xmm9\n"          \
  "addps %%xmm8, %%xmm10\n"                 \
  "subps %%xmm9, %%xmm10\n"                 \
  "movaps %%xmm10, %%xmm" #r "\n"

/*
 * Three butterfly levels across eight registers: inputs in xmm0 .. xmm7,
 * results in xmm8 .. xmm15 (xmm0 .. xmm7 are overwritten on the way).
 */
#define HF23_RADIX8                                       \
  HF23_BFLY(0, 1, 8, 9)     HF23_BFLY(2, 3, 10, 11)       \
  HF23_BFLY(4, 5, 12, 13)   HF23_BFLY(6, 7, 14, 15)       \
  HF23_BFLY(8, 10, 0, 2)    HF23_BFLY(9, 11, 1, 3)        \
  HF23_BFLY(12, 14, 4, 6)   HF23_BFLY(13, 15, 5, 7)       \
  HF23_BFLY(0, 4, 8, 12)    HF23_BFLY(1, 5, 9, 13)        \
  HF23_BFLY(2, 6, 10, 14)   HF23_BFLY(3, 7, 11, 15)

/* Clobber list for kernels that use every SSE register. */
#define HF23_ALL_XMM                                                    \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",        \
  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13",    \
  "%xmm14", "%xmm15"

/*
 * In-place 32-point unnormalized Walsh-Hadamard transform of blk[0 .. 31].
 * The two lowest levels are done inside each register, the remaining three
 * across the eight registers.  Loads and stores are unaligned (movups).
 */
static inline void helper_float_23_wht32(float *blk) {
  __asm__ volatile (
    HF23_LOAD(0, 0) HF23_LOAD(1, 1) HF23_LOAD(2, 2) HF23_LOAD(3, 3)
    HF23_LOAD(4, 4) HF23_LOAD(5, 5) HF23_LOAD(6, 6) HF23_LOAD(7, 7)
    HF23_PAIRS(0) HF23_PAIRS(1) HF23_PAIRS(2) HF23_PAIRS(3)
    HF23_PAIRS(4) HF23_PAIRS(5) HF23_PAIRS(6) HF23_PAIRS(7)
    HF23_QUADS(0) HF23_QUADS(1) HF23_QUADS(2) HF23_QUADS(3)
    HF23_QUADS(4) HF23_QUADS(5) HF23_QUADS(6) HF23_QUADS(7)
    HF23_RADIX8
    HF23_STORE(8, 0) HF23_STORE(9, 1) HF23_STORE(10, 2) HF23_STORE(11, 3)
    HF23_STORE(12, 4) HF23_STORE(13, 5) HF23_STORE(14, 6) HF23_STORE(15, 7)
    :: "r"(blk + 0), "r"(blk + 4), "r"(blk + 8), "r"(blk + 12),
       "r"(blk + 16), "r"(blk + 20), "r"(blk + 24), "r"(blk + 28)
    : HF23_ALL_XMM, "memory"
  );
}

/*
 * One butterfly level between the two halves of buf[0 .. 2*stride - 1]:
 * buf[k] and buf[k + stride] become their sum and difference.
 * stride must be a multiple of 4.
 */
static inline void helper_float_23_radix2_pass(float *buf, int stride) {
  for (int k = 0; k < stride; k += 4) {
    float *p = buf + k;
    __asm__ volatile (
      HF23_LOAD(0, 0) HF23_LOAD(1, 1)
      HF23_BFLY(0, 1, 8, 9)
      HF23_STORE(8, 0) HF23_STORE(9, 1)
      :: "r"(p), "r"(p + stride)
      : "%xmm0", "%xmm1", "%xmm8", "%xmm9", "memory"
    );
  }
}

/*
 * Two butterfly levels combining four consecutive sub-blocks of `stride`
 * floats each (buf[0 .. 4*stride - 1]).  stride must be a multiple of 4.
 */
static inline void helper_float_23_radix4_pass(float *buf, int stride) {
  for (int k = 0; k < stride; k += 4) {
    float *p = buf + k;
    __asm__ volatile (
      HF23_LOAD(0, 0) HF23_LOAD(1, 1) HF23_LOAD(2, 2) HF23_LOAD(3, 3)
      HF23_BFLY(0, 1, 8, 9)  HF23_BFLY(2, 3, 10, 11)
      HF23_BFLY(8, 10, 0, 2) HF23_BFLY(9, 11, 1, 3)
      HF23_STORE(0, 0) HF23_STORE(1, 1) HF23_STORE(2, 2) HF23_STORE(3, 3)
      :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
      : "%xmm0", "%xmm1", "%xmm2", "%xmm3",
        "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory"
    );
  }
}

/*
 * Three butterfly levels combining eight consecutive sub-blocks of `stride`
 * floats each (buf[0 .. 8*stride - 1]).  stride must be a multiple of 4.
 */
static inline void helper_float_23_radix8_pass(float *buf, int stride) {
  for (int k = 0; k < stride; k += 4) {
    float *p = buf + k;
    __asm__ volatile (
      HF23_LOAD(0, 0) HF23_LOAD(1, 1) HF23_LOAD(2, 2) HF23_LOAD(3, 3)
      HF23_LOAD(4, 4) HF23_LOAD(5, 5) HF23_LOAD(6, 6) HF23_LOAD(7, 7)
      HF23_RADIX8
      HF23_STORE(8, 0) HF23_STORE(9, 1) HF23_STORE(10, 2) HF23_STORE(11, 3)
      HF23_STORE(12, 4) HF23_STORE(13, 5) HF23_STORE(14, 6) HF23_STORE(15, 7)
      :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
         "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
         "r"(p + 7 * stride)
      : HF23_ALL_XMM, "memory"
    );
  }
}

/*
 * Recursive step of the 2^23-element kernel: in-place unnormalized
 * Walsh-Hadamard transform of buf[0 .. 2^depth - 1].
 *
 * buf   - at least 2^depth floats; no alignment is required because all
 *         accesses use movups.
 * depth - log2 of the length.  Only 6, 9, 12, 15, 18, 21 and 23 are part of
 *         this plan; any other value leaves buf untouched.
 *
 * Plan: depth 6 is the leaf (two 32-point kernels plus one radix-2 level);
 * depths 9 .. 21 transform eight sub-blocks of 2^(depth-3) floats and merge
 * them with a radix-8 pass; depth 23 transforms four depth-21 sub-blocks and
 * merges them with a radix-4 pass.
 *
 * Requires x86-64 with SSE3 (xmm8 .. xmm15 and addsubps are used).
 */
void helper_float_23_recursive(float *buf, int depth);
void helper_float_23_recursive(float *buf, int depth) {
  if (depth == 6) {
    helper_float_23_wht32(buf);
    helper_float_23_wht32(buf + 32);
    helper_float_23_radix2_pass(buf, 32);
    return;
  }
  if (depth == 9 || depth == 12 || depth == 15 || depth == 18 || depth == 21) {
    const int stride = 1 << (depth - 3);
    for (int part = 0; part < 8; ++part) {
      helper_float_23_recursive(buf + part * stride, depth - 3);
    }
    helper_float_23_radix8_pass(buf, stride);
    return;
  }
  if (depth == 23) {
    const int stride = 1 << 21;
    for (int part = 0; part < 4; ++part) {
      helper_float_23_recursive(buf + part * stride, 21);
    }
    helper_float_23_radix4_pass(buf, stride);
    return;
  }
  /* Any other depth is not part of this plan: deliberately a no-op. */
}

#undef HF23_ALL_XMM
#undef HF23_RADIX8
#undef HF23_QUADS
#undef HF23_PAIRS
#undef HF23_BFLY
#undef HF23_STORE
#undef HF23_LOAD
void helper_float_23(float *buf);
/*
 * Top-level entry for the depth-23 transform.
 *
 * Forwards `buf` unchanged to helper_float_23_recursive with the starting
 * depth 23; that branch of the recursive routine works in place on
 * buf[0 .. 8388608).
 */
void helper_float_23(float *buf) {
  enum { TOP_DEPTH = 23 };
  helper_float_23_recursive(buf, TOP_DEPTH);
}
void helper_float_24_recursive(float *buf, int depth);

/*
 * Assembly text fragments shared by the kernels below.  Each macro expands to
 * string literals that the compiler concatenates, so the assembled
 * instruction sequences are the same as the fully spelled-out originals.
 * They are #undef'd again after helper_float_24_recursive.
 */

/* Unaligned load of four floats from operand %op into xmm<r>. */
#define HF24_LOAD(op, r) "movups (%" #op "), %%xmm" #r "\n"

/* Unaligned store of xmm<r> to operand %op. */
#define HF24_STORE(r, op) "movups %%xmm" #r ", (%" #op ")\n"

/* Vector butterfly: xmm<s> = xmm<a> + xmm<b>, xmm<d> = xmm<a> - xmm<b>. */
#define HF24_BFLY(a, b, s, d) \
  "movaps %%xmm" #a ", %%xmm" #s "\n" \
  "movaps %%xmm" #a ", %%xmm" #d "\n" \
  "addps %%xmm" #b ", %%xmm" #s "\n" \
  "subps %%xmm" #b ", %%xmm" #d "\n"

/*
 * In-register butterfly on adjacent lanes of xmm<r>:
 * [a0, a1, a2, a3] -> [a0+a1, a0-a1, a2+a3, a2-a3].
 * Uses xmm8 and xmm9 as scratch.
 */
#define HF24_LANE_PAIRS(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * In-register butterfly between the low and high halves of xmm<r>:
 * [b0, b1, b2, b3] -> [b0+b2, b1+b3, b0-b2, b1-b3].
 * Uses xmm8..xmm11 as scratch.
 */
#define HF24_LANE_HALVES(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* Load operands %0..%7 into xmm0..xmm7. */
#define HF24_LOAD8 \
  HF24_LOAD(0, 0) HF24_LOAD(1, 1) HF24_LOAD(2, 2) HF24_LOAD(3, 3) \
  HF24_LOAD(4, 4) HF24_LOAD(5, 5) HF24_LOAD(6, 6) HF24_LOAD(7, 7)

/*
 * Three butterfly levels across xmm0..xmm7 (an 8-point transform per lane).
 * Input in xmm0..xmm7, result in xmm8..xmm15.
 */
#define HF24_RADIX8 \
  HF24_BFLY(0, 1, 8, 9) HF24_BFLY(2, 3, 10, 11) \
  HF24_BFLY(4, 5, 12, 13) HF24_BFLY(6, 7, 14, 15) \
  HF24_BFLY(8, 10, 0, 2) HF24_BFLY(9, 11, 1, 3) \
  HF24_BFLY(12, 14, 4, 6) HF24_BFLY(13, 15, 5, 7) \
  HF24_BFLY(0, 4, 8, 12) HF24_BFLY(1, 5, 9, 13) \
  HF24_BFLY(2, 6, 10, 14) HF24_BFLY(3, 7, 11, 15)

/* Store xmm8..xmm15 back to operands %0..%7. */
#define HF24_STORE8 \
  HF24_STORE(8, 0) HF24_STORE(9, 1) HF24_STORE(10, 2) HF24_STORE(11, 3) \
  HF24_STORE(12, 4) HF24_STORE(13, 5) HF24_STORE(14, 6) HF24_STORE(15, 7)

/* Clobber list used by every kernel: all sixteen xmm registers plus memory
 * (the asm writes through its pointer operands). */
#define HF24_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/*
 * First five butterfly levels, done on consecutive 32-float blocks of
 * buf[0 .. total): two levels inside each 4-float vector, then three levels
 * across the eight vectors of the block.
 */
static inline void helper_float_24_base32_pass(float *buf, int total) {
  for (int base = 0; base < total; base += 32) {
    float *blk = buf + base;
    __asm__ volatile (
      HF24_LOAD8
      HF24_LANE_PAIRS(0) HF24_LANE_PAIRS(1) HF24_LANE_PAIRS(2) HF24_LANE_PAIRS(3)
      HF24_LANE_PAIRS(4) HF24_LANE_PAIRS(5) HF24_LANE_PAIRS(6) HF24_LANE_PAIRS(7)
      HF24_LANE_HALVES(0) HF24_LANE_HALVES(1) HF24_LANE_HALVES(2) HF24_LANE_HALVES(3)
      HF24_LANE_HALVES(4) HF24_LANE_HALVES(5) HF24_LANE_HALVES(6) HF24_LANE_HALVES(7)
      HF24_RADIX8
      HF24_STORE8
      :: "r"(blk), "r"(blk + 4), "r"(blk + 8), "r"(blk + 12),
         "r"(blk + 16), "r"(blk + 20), "r"(blk + 24), "r"(blk + 28)
      : HF24_CLOBBERS
    );
  }
}

/*
 * Three butterfly levels over buf[0 .. total), combining the eight elements
 * that sit `stride` floats apart within each group of 8 * stride floats.
 * Four neighbouring columns are processed per asm statement, so `stride`
 * must be a multiple of 4.
 */
static inline void helper_float_24_radix8_pass(float *buf, int total, int stride) {
  for (int group = 0; group < total; group += 8 * stride) {
    for (int col = 0; col < stride; col += 4) {
      float *p = buf + group + col;
      __asm__ volatile (
        HF24_LOAD8
        HF24_RADIX8
        HF24_STORE8
        :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
           "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
           "r"(p + 7 * stride)
        : HF24_CLOBBERS
      );
    }
  }
}

/*
 * In-place sum/difference butterfly network on `buf` (an unnormalised fast
 * Hadamard transform in natural order), covering 2^depth floats.
 *
 * buf   - data transformed in place; only unaligned loads/stores are used,
 *         so no particular alignment is required.
 * depth - log2 of the number of floats to transform.  Only 15, 18, 21 and 24
 *         are handled; any other value returns without touching `buf`.
 *
 * depth 15 is the iterative base case: 5 levels in 32-float blocks, three
 * radix-8 passes (strides 32, 256, 2048) and one final radix-2 pass at
 * stride 16384.  Each larger depth transforms its eight sub-blocks of
 * 2^(depth-3) floats recursively and then merges them with a single radix-8
 * pass.
 *
 * The kernels use xmm8..xmm15 and addsubps, i.e. they need x86-64 with SSE3.
 */
void helper_float_24_recursive(float *buf, int depth) {
  if (depth == 15) {
    helper_float_24_base32_pass(buf, 32768);
    helper_float_24_radix8_pass(buf, 32768, 32);
    helper_float_24_radix8_pass(buf, 32768, 256);
    helper_float_24_radix8_pass(buf, 32768, 2048);
    /* 15 = 5 + 3 * 3 + 1: the last level is a plain radix-2 butterfly
     * between the two 16384-float halves. */
    for (int col = 0; col < 16384; col += 4) {
      __asm__ volatile (
        HF24_LOAD(0, 0)
        HF24_LOAD(1, 1)
        HF24_BFLY(0, 1, 8, 9)
        HF24_STORE(8, 0)
        HF24_STORE(9, 1)
        :: "r"(buf + col), "r"(buf + col + 16384)
        : HF24_CLOBBERS
      );
    }
    return;
  }
  if (depth == 18 || depth == 21 || depth == 24) {
    /* Eight sub-transforms of 2^(depth-3) floats, then one merging pass. */
    const int child_len = 1 << (depth - 3);
    for (int part = 0; part < 8; ++part) {
      helper_float_24_recursive(buf + part * child_len, depth - 3);
    }
    helper_float_24_radix8_pass(buf, 8 * child_len, child_len);
    return;
  }
}

#undef HF24_CLOBBERS
#undef HF24_STORE8
#undef HF24_RADIX8
#undef HF24_LOAD8
#undef HF24_LANE_HALVES
#undef HF24_LANE_PAIRS
#undef HF24_BFLY
#undef HF24_STORE
#undef HF24_LOAD
void helper_float_24(float *buf);
/*
 * Top-level entry for the depth-24 transform.
 *
 * Forwards `buf` unchanged to helper_float_24_recursive with the starting
 * depth 24; that branch of the recursive routine works in place on
 * buf[0 .. 16777216).
 */
void helper_float_24(float *buf) {
  enum { TOP_DEPTH = 24 };
  helper_float_24_recursive(buf, TOP_DEPTH);
}
void helper_float_25_recursive(float *buf, int depth);

/*
 * String-pasting building blocks for the inline-asm templates below.
 * Each macro expands to adjacent string literals, so every asm statement
 * receives one concatenated template.  All templates use AT&T syntax and
 * address xmm8..xmm15, which exist only in 64-bit mode; "addsubps" is an
 * SSE3 instruction.  The macros are #undef'd at the end of this block.
 */

/* Load eight 4-float vectors from operands %0..%7 into xmm0..xmm7
 * (movups: no alignment requirement on the pointers). */
#define HELPER_FLOAT_25_LOAD8 \
  "movups (%0), %%xmm0\n" \
  "movups (%1), %%xmm1\n" \
  "movups (%2), %%xmm2\n" \
  "movups (%3), %%xmm3\n" \
  "movups (%4), %%xmm4\n" \
  "movups (%5), %%xmm5\n" \
  "movups (%6), %%xmm6\n" \
  "movups (%7), %%xmm7\n"

/* Store xmm8..xmm15 back through operands %0..%7. */
#define HELPER_FLOAT_25_STORE8 \
  "movups %%xmm8, (%0)\n" \
  "movups %%xmm9, (%1)\n" \
  "movups %%xmm10, (%2)\n" \
  "movups %%xmm11, (%3)\n" \
  "movups %%xmm12, (%4)\n" \
  "movups %%xmm13, (%5)\n" \
  "movups %%xmm14, (%6)\n" \
  "movups %%xmm15, (%7)\n"

/* Butterfly inside one register r = [a0,a1,a2,a3]:
 * r <- [a0+a1, a0-a1, a2+a3, a2-a3].  Uses xmm8 and xmm9 as scratch. */
#define HELPER_FLOAT_25_PAIRS(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/* Butterfly inside one register r = [b0,b1,b2,b3]:
 * r <- [b0+b2, b1+b3, b0-b2, b1-b3].  Uses xmm8..xmm11 as scratch. */
#define HELPER_FLOAT_25_QUADS(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* Element-wise butterfly between two registers:
 * xmm<sum> <- xmm<a> + xmm<b>,  xmm<diff> <- xmm<a> - xmm<b>. */
#define HELPER_FLOAT_25_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/* Three butterfly stages across xmm0..xmm7; results land in xmm8..xmm15. */
#define HELPER_FLOAT_25_RADIX8 \
  HELPER_FLOAT_25_BFLY(0, 1, 8, 9) \
  HELPER_FLOAT_25_BFLY(2, 3, 10, 11) \
  HELPER_FLOAT_25_BFLY(4, 5, 12, 13) \
  HELPER_FLOAT_25_BFLY(6, 7, 14, 15) \
  HELPER_FLOAT_25_BFLY(8, 10, 0, 2) \
  HELPER_FLOAT_25_BFLY(9, 11, 1, 3) \
  HELPER_FLOAT_25_BFLY(12, 14, 4, 6) \
  HELPER_FLOAT_25_BFLY(13, 15, 5, 7) \
  HELPER_FLOAT_25_BFLY(0, 4, 8, 12) \
  HELPER_FLOAT_25_BFLY(1, 5, 9, 13) \
  HELPER_FLOAT_25_BFLY(2, 6, 10, 14) \
  HELPER_FLOAT_25_BFLY(3, 7, 11, 15)

/* Two butterfly stages across xmm0..xmm3; results land back in xmm0..xmm3
 * (xmm8..xmm11 hold the intermediate sums/differences). */
#define HELPER_FLOAT_25_RADIX4 \
  HELPER_FLOAT_25_BFLY(0, 1, 8, 9) \
  HELPER_FLOAT_25_BFLY(2, 3, 10, 11) \
  HELPER_FLOAT_25_BFLY(8, 10, 0, 2) \
  HELPER_FLOAT_25_BFLY(9, 11, 1, 3)

/* Clobber list shared by every asm statement: all sixteen xmm registers
 * plus "memory", because the stores go through input pointer operands. */
#define HELPER_FLOAT_25_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/* Transforms 32 consecutive floats at blk in place: two in-register
 * butterfly stages per vector followed by a radix-8 step across the
 * eight vectors (five butterfly levels in total). */
static inline void helper_float_25_wht32(float *blk) {
  __asm__ volatile (
    HELPER_FLOAT_25_LOAD8
    HELPER_FLOAT_25_PAIRS(0)
    HELPER_FLOAT_25_PAIRS(1)
    HELPER_FLOAT_25_PAIRS(2)
    HELPER_FLOAT_25_PAIRS(3)
    HELPER_FLOAT_25_PAIRS(4)
    HELPER_FLOAT_25_PAIRS(5)
    HELPER_FLOAT_25_PAIRS(6)
    HELPER_FLOAT_25_PAIRS(7)
    HELPER_FLOAT_25_QUADS(0)
    HELPER_FLOAT_25_QUADS(1)
    HELPER_FLOAT_25_QUADS(2)
    HELPER_FLOAT_25_QUADS(3)
    HELPER_FLOAT_25_QUADS(4)
    HELPER_FLOAT_25_QUADS(5)
    HELPER_FLOAT_25_QUADS(6)
    HELPER_FLOAT_25_QUADS(7)
    HELPER_FLOAT_25_RADIX8
    HELPER_FLOAT_25_STORE8
    :: "r"(blk + 0), "r"(blk + 4), "r"(blk + 8), "r"(blk + 12),
       "r"(blk + 16), "r"(blk + 20), "r"(blk + 24), "r"(blk + 28)
    : HELPER_FLOAT_25_CLOBBERS
  );
}

/* Combines eight adjacent sub-blocks of `stride` floats each (8 * stride
 * floats starting at buf) with a radix-8 butterfly, four floats at a time.
 * stride is a multiple of 4 at every call site. */
static inline void helper_float_25_radix8_pass(float *buf, int stride) {
  for (int k = 0; k < stride; k += 4) {
    float *p = buf + k;
    __asm__ volatile (
      HELPER_FLOAT_25_LOAD8
      HELPER_FLOAT_25_RADIX8
      HELPER_FLOAT_25_STORE8
      :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
         "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
         "r"(p + 7 * stride)
      : HELPER_FLOAT_25_CLOBBERS
    );
  }
}

/* Combines four adjacent sub-blocks of `stride` floats each (4 * stride
 * floats starting at buf) with a radix-4 butterfly, four floats at a time. */
static inline void helper_float_25_radix4_pass(float *buf, int stride) {
  for (int k = 0; k < stride; k += 4) {
    float *p = buf + k;
    __asm__ volatile (
      "movups (%0), %%xmm0\n"
      "movups (%1), %%xmm1\n"
      "movups (%2), %%xmm2\n"
      "movups (%3), %%xmm3\n"
      HELPER_FLOAT_25_RADIX4
      "movups %%xmm0, (%0)\n"
      "movups %%xmm1, (%1)\n"
      "movups %%xmm2, (%2)\n"
      "movups %%xmm3, (%3)\n"
      :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
      : HELPER_FLOAT_25_CLOBBERS
    );
  }
}

/*
 * In-place, unnormalized Hadamard butterfly network over the first
 * 2^depth floats of buf.
 *
 * buf:   data to transform; read and written in place.  Only unaligned
 *        loads/stores (movups) are used.
 * depth: log2 of the number of floats processed.  Handled values:
 *          8              - base case: eight 32-float kernels, then one
 *                           radix-8 pass with stride 32;
 *          11,14,17,20,23 - recurse into eight sub-blocks at depth - 3,
 *                           then one radix-8 pass across them;
 *          25             - recurse into four sub-blocks at depth 23,
 *                           then one radix-4 pass across them.
 *        Any other value returns without touching buf.
 */
void helper_float_25_recursive(float *buf, int depth) {
  switch (depth) {
  case 8:
    for (int j = 0; j < 256; j += 32) {
      helper_float_25_wht32(buf + j);
    }
    helper_float_25_radix8_pass(buf, 32);
    return;
  case 11:
  case 14:
  case 17:
  case 20:
  case 23: {
    /* Each of the eight sub-blocks spans 2^(depth-3) floats. */
    const int stride = 1 << (depth - 3);
    for (int i = 0; i < 8; ++i) {
      helper_float_25_recursive(buf + i * stride, depth - 3);
    }
    helper_float_25_radix8_pass(buf, stride);
    return;
  }
  case 25: {
    /* 25 is not 8 + 3n, so the top level is a radix-4 step over four
     * sub-blocks of 2^23 floats. */
    const int stride = 1 << 23;
    for (int i = 0; i < 4; ++i) {
      helper_float_25_recursive(buf + i * stride, 23);
    }
    helper_float_25_radix4_pass(buf, stride);
    return;
  }
  default:
    return;
  }
}

#undef HELPER_FLOAT_25_CLOBBERS
#undef HELPER_FLOAT_25_RADIX4
#undef HELPER_FLOAT_25_RADIX8
#undef HELPER_FLOAT_25_BFLY
#undef HELPER_FLOAT_25_QUADS
#undef HELPER_FLOAT_25_PAIRS
#undef HELPER_FLOAT_25_STORE8
#undef HELPER_FLOAT_25_LOAD8
/*
 * Entry point for the depth-25 transform: hands buf to
 * helper_float_25_recursive, starting at the top recursion level (25).
 * The transform is performed in place; at that level the recursive routine
 * reads and writes buf[0] through buf[33554431], so buf must hold at least
 * 2^25 floats.
 */
void helper_float_25(float *buf);
void helper_float_25(float *buf) {
  const int top_level = 25;
  helper_float_25_recursive(buf, top_level);
}
/*
 * helper_float_26_recursive - recursive in-place sum/difference (unnormalized
 * Walsh-Hadamard style) butterfly transform on a sub-array of 2^depth floats.
 *
 * buf:   first element of the sub-array; 2^depth floats are read and
 *        overwritten in place. Only unaligned loads/stores (movups) are used,
 *        so no particular alignment is required.
 * depth: log2 of the sub-array length. Supported values are 5, 8, 11, 14, 17,
 *        20, 23 and 26. Any other value returns without touching buf.
 *
 * depth == 5 is the base case: 32 floats are transformed entirely in
 * registers (five butterfly levels). Every larger supported depth first
 * transforms its eight sub-arrays of 2^(depth-3) floats, then merges them
 * with three more butterfly levels (an 8-way pass across the sub-arrays).
 *
 * The kernels are x86-64 inline assembly using xmm0-xmm15 and the SSE3
 * instruction addsubps.
 */
void helper_float_26_recursive(float *buf, int depth);

/*
 * Assembly text fragments. Each macro expands to the exact instruction
 * sequence of the original unrolled code, so the emitted asm is unchanged.
 */

/* Butterfly inside one register, distance 1:
 * [a0 a1 a2 a3] -> [a0+a1, a0-a1, a2+a3, a2-a3]. Scratch: xmm8, xmm9. */
#define HF26_BFLY1(r)                          \
  "movaps %%xmm" #r ", %%xmm8\n"               \
  "shufps $160, %%xmm8, %%xmm8\n"              \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n"    \
  "xorps %%xmm9, %%xmm9\n"                     \
  "subps %%xmm" #r ", %%xmm9\n"                \
  "addsubps %%xmm9, %%xmm8\n"                  \
  "movaps %%xmm8, %%xmm" #r "\n"

/* Butterfly inside one register, distance 2:
 * [b0 b1 b2 b3] -> [b0+b2, b1+b3, b0-b2, b1-b3]. Scratch: xmm8-xmm11. */
#define HF26_BFLY2(r)                          \
  "movaps %%xmm" #r ", %%xmm8\n"               \
  "shufps $68, %%xmm8, %%xmm8\n"               \
  "xorps %%xmm9, %%xmm9\n"                     \
  "movaps %%xmm" #r ", %%xmm10\n"              \
  "shufps $14, %%xmm9, %%xmm10\n"              \
  "movaps %%xmm" #r ", %%xmm11\n"              \
  "shufps $224, %%xmm11, %%xmm9\n"             \
  "addps %%xmm8, %%xmm10\n"                    \
  "subps %%xmm9, %%xmm10\n"                    \
  "movaps %%xmm10, %%xmm" #r "\n"

/* Whole-register butterfly: sum = a + b, diff = a - b. */
#define HF26_ADDSUB(a, b, sum, diff)           \
  "movaps %%xmm" #a ", %%xmm" #sum "\n"        \
  "movaps %%xmm" #a ", %%xmm" #diff "\n"       \
  "addps %%xmm" #b ", %%xmm" #sum "\n"         \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/* Load the eight operands %0..%7 into xmm0..xmm7. */
#define HF26_LOAD_ALL                          \
  "movups (%0), %%xmm0\n"                      \
  "movups (%1), %%xmm1\n"                      \
  "movups (%2), %%xmm2\n"                      \
  "movups (%3), %%xmm3\n"                      \
  "movups (%4), %%xmm4\n"                      \
  "movups (%5), %%xmm5\n"                      \
  "movups (%6), %%xmm6\n"                      \
  "movups (%7), %%xmm7\n"

/* Three butterfly levels across xmm0..xmm7; results end up in xmm8..xmm15. */
#define HF26_RADIX8                            \
  HF26_ADDSUB(0, 1, 8, 9)                      \
  HF26_ADDSUB(2, 3, 10, 11)                    \
  HF26_ADDSUB(4, 5, 12, 13)                    \
  HF26_ADDSUB(6, 7, 14, 15)                    \
  HF26_ADDSUB(8, 10, 0, 2)                     \
  HF26_ADDSUB(9, 11, 1, 3)                     \
  HF26_ADDSUB(12, 14, 4, 6)                    \
  HF26_ADDSUB(13, 15, 5, 7)                    \
  HF26_ADDSUB(0, 4, 8, 12)                     \
  HF26_ADDSUB(1, 5, 9, 13)                     \
  HF26_ADDSUB(2, 6, 10, 14)                    \
  HF26_ADDSUB(3, 7, 11, 15)

/* Write xmm8..xmm15 back to the eight operands %0..%7. */
#define HF26_STORE_ALL                         \
  "movups %%xmm8, (%0)\n"                      \
  "movups %%xmm9, (%1)\n"                      \
  "movups %%xmm10, (%2)\n"                     \
  "movups %%xmm11, (%3)\n"                     \
  "movups %%xmm12, (%4)\n"                     \
  "movups %%xmm13, (%5)\n"                     \
  "movups %%xmm14, (%6)\n"                     \
  "movups %%xmm15, (%7)\n"

/* Every xmm register is declared clobbered; "memory" because the asm writes
 * through its pointer operands without declaring outputs. */
#define HF26_CLOBBERS                                                        \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",    \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14",        \
  "%xmm15", "memory"

/* Base case: full five-level transform of the 32 floats at buf. */
static inline void helper_float_26_base32(float *buf) {
  __asm__ volatile (
    HF26_LOAD_ALL
    HF26_BFLY1(0) HF26_BFLY1(1) HF26_BFLY1(2) HF26_BFLY1(3)
    HF26_BFLY1(4) HF26_BFLY1(5) HF26_BFLY1(6) HF26_BFLY1(7)
    HF26_BFLY2(0) HF26_BFLY2(1) HF26_BFLY2(2) HF26_BFLY2(3)
    HF26_BFLY2(4) HF26_BFLY2(5) HF26_BFLY2(6) HF26_BFLY2(7)
    HF26_RADIX8
    HF26_STORE_ALL
    :: "r"(buf + 0), "r"(buf + 4), "r"(buf + 8), "r"(buf + 12),
       "r"(buf + 16), "r"(buf + 20), "r"(buf + 24), "r"(buf + 28)
    : HF26_CLOBBERS
  );
}

/* Combine step: 8-way butterfly on the four-float groups located at
 * p, p + stride, ..., p + 7 * stride. */
static inline void helper_float_26_radix8(float *p, int stride) {
  __asm__ volatile (
    HF26_LOAD_ALL
    HF26_RADIX8
    HF26_STORE_ALL
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
       "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
       "r"(p + 7 * stride)
    : HF26_CLOBBERS
  );
}

void helper_float_26_recursive(float *buf, int depth) {
  if (depth == 5) {
    helper_float_26_base32(buf);
    return;
  }

  /* Only depths 8, 11, ..., 26 have a combine pass. Anything else is left
   * untouched, exactly as the unrolled version did. */
  if (depth < 8 || depth > 26 || (depth - 5) % 3 != 0) {
    return;
  }

  /* Length of each of the eight sub-arrays. At most 2^23, so 7 * stride
   * stays well inside the range of int. */
  const int stride = 1 << (depth - 3);

  /* Transform the eight sub-arrays first ... */
  for (int part = 0; part < 8; ++part) {
    helper_float_26_recursive(buf + part * stride, depth - 3);
  }

  /* ... then merge them, four floats per sub-array at a time. */
  for (int k = 0; k < stride; k += 4) {
    helper_float_26_radix8(buf + k, stride);
  }
}

#undef HF26_CLOBBERS
#undef HF26_STORE_ALL
#undef HF26_RADIX8
#undef HF26_LOAD_ALL
#undef HF26_ADDSUB
#undef HF26_BFLY2
#undef HF26_BFLY1
/*
 * helper_float_26 - top-level entry for the depth-26 transform.
 *
 * Hands buf to helper_float_26_recursive with the starting depth of 26.
 * The depth-26 pass of that routine reads and writes buf[0 .. 67108863]
 * in place, so buf must hold at least 2^26 floats.
 */
void helper_float_26(float *buf);
void helper_float_26(float *buf) {
  enum { top_depth = 26 };  /* depth at which the recursion is started */

  helper_float_26_recursive(buf, top_depth);
}
void helper_float_27_recursive(float *buf, int depth);

/*
 * Runs five sum/difference butterfly levels (element strides 1, 2, 4, 8, 16)
 * on the 32 consecutive floats starting at `tile`, in place.
 *
 * The eight 4-float groups are loaded into xmm0..xmm7. Strides 1 and 2 are
 * resolved inside each register with shuffles (addsubps needs SSE3); strides
 * 4, 8 and 16 are resolved across the eight registers. xmm8..xmm15 require
 * an x86-64 target.
 */
static inline void hf27_tile32(float *tile) {
  __asm__ volatile (
    "movups (%0), %%xmm0\n"
    "movups (%1), %%xmm1\n"
    "movups (%2), %%xmm2\n"
    "movups (%3), %%xmm3\n"
    "movups (%4), %%xmm4\n"
    "movups (%5), %%xmm5\n"
    "movups (%6), %%xmm6\n"
    "movups (%7), %%xmm7\n"
    /* stride 1: [a,b,c,d] -> [a+b, a-b, c+d, c-d], one register at a time */
    "movaps %%xmm0, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm0, %%xmm0\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm0, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm0\n"
    "movaps %%xmm1, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm1, %%xmm1\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm1, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm1\n"
    "movaps %%xmm2, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm2, %%xmm2\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm2, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm2\n"
    "movaps %%xmm3, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm3, %%xmm3\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm3, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm3\n"
    "movaps %%xmm4, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm4, %%xmm4\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm4, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm4\n"
    "movaps %%xmm5, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm5, %%xmm5\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm5, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm5\n"
    "movaps %%xmm6, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm6, %%xmm6\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm6, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm6\n"
    "movaps %%xmm7, %%xmm8\n"
    "shufps $160, %%xmm8, %%xmm8\n"
    "shufps $245, %%xmm7, %%xmm7\n"
    "xorps %%xmm9, %%xmm9\n"
    "subps %%xmm7, %%xmm9\n"
    "addsubps %%xmm9, %%xmm8\n"
    "movaps %%xmm8, %%xmm7\n"
    /* stride 2: [a,b,c,d] -> [a+c, b+d, a-c, b-d], one register at a time */
    "movaps %%xmm0, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm0, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm0, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm0\n"
    "movaps %%xmm1, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm1, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm1, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm1\n"
    "movaps %%xmm2, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm2, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm2, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm2\n"
    "movaps %%xmm3, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm3, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm3, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm3\n"
    "movaps %%xmm4, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm4, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm4, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm4\n"
    "movaps %%xmm5, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm5, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm5, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm5\n"
    "movaps %%xmm6, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm6, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm6, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm6\n"
    "movaps %%xmm7, %%xmm8\n"
    "shufps $68, %%xmm8, %%xmm8\n"
    "xorps %%xmm9, %%xmm9\n"
    "movaps %%xmm7, %%xmm10\n"
    "shufps $14, %%xmm9, %%xmm10\n"
    "movaps %%xmm7, %%xmm11\n"
    "shufps $224, %%xmm11, %%xmm9\n"
    "addps %%xmm8, %%xmm10\n"
    "subps %%xmm9, %%xmm10\n"
    "movaps %%xmm10, %%xmm7\n"
    /* stride 4: neighbouring registers (0,1) (2,3) (4,5) (6,7) */
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm9\n"
    "addps %%xmm1, %%xmm8\n"
    "subps %%xmm1, %%xmm9\n"
    "movaps %%xmm2, %%xmm10\n"
    "movaps %%xmm2, %%xmm11\n"
    "addps %%xmm3, %%xmm10\n"
    "subps %%xmm3, %%xmm11\n"
    "movaps %%xmm4, %%xmm12\n"
    "movaps %%xmm4, %%xmm13\n"
    "addps %%xmm5, %%xmm12\n"
    "subps %%xmm5, %%xmm13\n"
    "movaps %%xmm6, %%xmm14\n"
    "movaps %%xmm6, %%xmm15\n"
    "addps %%xmm7, %%xmm14\n"
    "subps %%xmm7, %%xmm15\n"
    /* stride 8: registers two apart */
    "movaps %%xmm8, %%xmm0\n"
    "movaps %%xmm8, %%xmm2\n"
    "addps %%xmm10, %%xmm0\n"
    "subps %%xmm10, %%xmm2\n"
    "movaps %%xmm9, %%xmm1\n"
    "movaps %%xmm9, %%xmm3\n"
    "addps %%xmm11, %%xmm1\n"
    "subps %%xmm11, %%xmm3\n"
    "movaps %%xmm12, %%xmm4\n"
    "movaps %%xmm12, %%xmm6\n"
    "addps %%xmm14, %%xmm4\n"
    "subps %%xmm14, %%xmm6\n"
    "movaps %%xmm13, %%xmm5\n"
    "movaps %%xmm13, %%xmm7\n"
    "addps %%xmm15, %%xmm5\n"
    "subps %%xmm15, %%xmm7\n"
    /* stride 16: registers four apart */
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm12\n"
    "addps %%xmm4, %%xmm8\n"
    "subps %%xmm4, %%xmm12\n"
    "movaps %%xmm1, %%xmm9\n"
    "movaps %%xmm1, %%xmm13\n"
    "addps %%xmm5, %%xmm9\n"
    "subps %%xmm5, %%xmm13\n"
    "movaps %%xmm2, %%xmm10\n"
    "movaps %%xmm2, %%xmm14\n"
    "addps %%xmm6, %%xmm10\n"
    "subps %%xmm6, %%xmm14\n"
    "movaps %%xmm3, %%xmm11\n"
    "movaps %%xmm3, %%xmm15\n"
    "addps %%xmm7, %%xmm11\n"
    "subps %%xmm7, %%xmm15\n"
    "movups %%xmm8, (%0)\n"
    "movups %%xmm9, (%1)\n"
    "movups %%xmm10, (%2)\n"
    "movups %%xmm11, (%3)\n"
    "movups %%xmm12, (%4)\n"
    "movups %%xmm13, (%5)\n"
    "movups %%xmm14, (%6)\n"
    "movups %%xmm15, (%7)\n"
    :: "r"(tile), "r"(tile + 4), "r"(tile + 8), "r"(tile + 12), "r"(tile + 16), "r"(tile + 20), "r"(tile + 24), "r"(tile + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Three butterfly levels across eight 4-float vectors located at
 * lane, lane + stride, ..., lane + 7 * stride (offsets in floats).
 * The results are written back to the same eight locations.
 */
static inline void hf27_radix8(float *lane, int stride) {
  __asm__ volatile (
    "movups (%0), %%xmm0\n"
    "movups (%1), %%xmm1\n"
    "movups (%2), %%xmm2\n"
    "movups (%3), %%xmm3\n"
    "movups (%4), %%xmm4\n"
    "movups (%5), %%xmm5\n"
    "movups (%6), %%xmm6\n"
    "movups (%7), %%xmm7\n"
    /* level 1: operands one stride apart */
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm9\n"
    "addps %%xmm1, %%xmm8\n"
    "subps %%xmm1, %%xmm9\n"
    "movaps %%xmm2, %%xmm10\n"
    "movaps %%xmm2, %%xmm11\n"
    "addps %%xmm3, %%xmm10\n"
    "subps %%xmm3, %%xmm11\n"
    "movaps %%xmm4, %%xmm12\n"
    "movaps %%xmm4, %%xmm13\n"
    "addps %%xmm5, %%xmm12\n"
    "subps %%xmm5, %%xmm13\n"
    "movaps %%xmm6, %%xmm14\n"
    "movaps %%xmm6, %%xmm15\n"
    "addps %%xmm7, %%xmm14\n"
    "subps %%xmm7, %%xmm15\n"
    /* level 2: operands two strides apart */
    "movaps %%xmm8, %%xmm0\n"
    "movaps %%xmm8, %%xmm2\n"
    "addps %%xmm10, %%xmm0\n"
    "subps %%xmm10, %%xmm2\n"
    "movaps %%xmm9, %%xmm1\n"
    "movaps %%xmm9, %%xmm3\n"
    "addps %%xmm11, %%xmm1\n"
    "subps %%xmm11, %%xmm3\n"
    "movaps %%xmm12, %%xmm4\n"
    "movaps %%xmm12, %%xmm6\n"
    "addps %%xmm14, %%xmm4\n"
    "subps %%xmm14, %%xmm6\n"
    "movaps %%xmm13, %%xmm5\n"
    "movaps %%xmm13, %%xmm7\n"
    "addps %%xmm15, %%xmm5\n"
    "subps %%xmm15, %%xmm7\n"
    /* level 3: operands four strides apart */
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm12\n"
    "addps %%xmm4, %%xmm8\n"
    "subps %%xmm4, %%xmm12\n"
    "movaps %%xmm1, %%xmm9\n"
    "movaps %%xmm1, %%xmm13\n"
    "addps %%xmm5, %%xmm9\n"
    "subps %%xmm5, %%xmm13\n"
    "movaps %%xmm2, %%xmm10\n"
    "movaps %%xmm2, %%xmm14\n"
    "addps %%xmm6, %%xmm10\n"
    "subps %%xmm6, %%xmm14\n"
    "movaps %%xmm3, %%xmm11\n"
    "movaps %%xmm3, %%xmm15\n"
    "addps %%xmm7, %%xmm11\n"
    "subps %%xmm7, %%xmm15\n"
    "movups %%xmm8, (%0)\n"
    "movups %%xmm9, (%1)\n"
    "movups %%xmm10, (%2)\n"
    "movups %%xmm11, (%3)\n"
    "movups %%xmm12, (%4)\n"
    "movups %%xmm13, (%5)\n"
    "movups %%xmm14, (%6)\n"
    "movups %%xmm15, (%7)\n"
    :: "r"(lane), "r"(lane + stride), "r"(lane + 2 * stride), "r"(lane + 3 * stride), "r"(lane + 4 * stride), "r"(lane + 5 * stride), "r"(lane + 6 * stride), "r"(lane + 7 * stride) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * One butterfly level between the 4-float vectors at `lane` and
 * `lane + stride`: the first receives the sum, the second the difference.
 */
static inline void hf27_radix2(float *lane, int stride) {
  __asm__ volatile (
    "movups (%0), %%xmm0\n"
    "movups (%1), %%xmm1\n"
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm9\n"
    "addps %%xmm1, %%xmm8\n"
    "subps %%xmm1, %%xmm9\n"
    "movups %%xmm8, (%0)\n"
    "movups %%xmm9, (%1)\n"
    :: "r"(lane), "r"(lane + stride) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Sweeps hf27_radix8() over the first `span` floats of `buf`: the buffer is
 * walked in groups of 8 * stride floats, and within each group every
 * 4-float lane below `stride` is processed.
 */
static inline void hf27_radix8_pass(float *buf, int span, int stride) {
  const int group = 8 * stride;
  for (int base = 0; base < span; base += group) {
    for (int lane = 0; lane < stride; lane += 4) {
      hf27_radix8(buf + base + lane, stride);
    }
  }
}

/*
 * In-place butterfly (sum/difference) transform of the first 2^depth floats
 * of `buf`. No scaling is applied to the output.
 *
 * depth: 12 is the base case, handled iteratively on 4096 floats.
 *        15, 18, 21, 24 and 27 split the range into eight equal parts,
 *        recurse on each with depth - 3, then combine the parts with one
 *        radix-8 pass. Any other value returns without touching `buf`.
 *
 * All loads and stores are unaligned (movups), so `buf` has no alignment
 * requirement. Needs an x86-64 target with SSE3.
 */
void helper_float_27_recursive(float *buf, int depth) {
  switch (depth) {
  case 12:
    /* levels 1-5: independent 32-float tiles */
    for (int tile = 0; tile < 4096; tile += 32) {
      hf27_tile32(buf + tile);
    }
    /* levels 6-8 and 9-11: two radix-8 sweeps */
    hf27_radix8_pass(buf, 4096, 32);
    hf27_radix8_pass(buf, 4096, 256);
    /* level 12: lower half against upper half */
    for (int lane = 0; lane < 2048; lane += 4) {
      hf27_radix2(buf + lane, 2048);
    }
    return;

  case 15:
  case 18:
  case 21:
  case 24:
  case 27: {
    const int sub_depth = depth - 3;
    const int sub_len = 1 << sub_depth;
    for (int part = 0; part < 8; ++part) {
      helper_float_27_recursive(buf + part * sub_len, sub_depth);
    }
    /* final three levels: combine the eight transformed parts */
    hf27_radix8_pass(buf, 8 * sub_len, sub_len);
    return;
  }

  default:
    /* unsupported depth: leave the buffer as it is */
    return;
  }
}
/*
 * Entry point for the depth-27 transform: hands `buf` to
 * helper_float_27_recursive() at the top-level depth. At that depth the
 * recursive helper reads and overwrites 2^27 (134217728) floats in place.
 */
void helper_float_27(float *buf);
void helper_float_27(float *buf) {
  enum { TOP_DEPTH = 27 };
  helper_float_27_recursive(buf, TOP_DEPTH);
}
/*
 * Assembly building blocks for helper_float_28_recursive.  Each macro expands
 * to a run of adjacent string literals, so the final asm templates are the
 * same instruction sequences as if they had been written out longhand.
 * Register numbers are passed as plain integer literals and stringified.
 */

/* Unaligned load of the vector at operand `op` into xmm<r>. */
#define HF28_LOAD(op, r) "movups (%" #op "), %%xmm" #r "\n"

/* Unaligned store of xmm<r> to the address in operand `op`. */
#define HF28_STORE(r, op) "movups %%xmm" #r ", (%" #op ")\n"

/*
 * Butterfly inside one vector, lanes paired (0,1) and (2,3):
 * [a, b, c, d] -> [a + b, a - b, c + d, c - d].  Scratch: xmm8, xmm9.
 */
#define HF28_LANE_PAIRS(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Butterfly inside one vector, lanes paired (0,2) and (1,3):
 * [a, b, c, d] -> [a + c, b + d, a - c, b - d].  Scratch: xmm8..xmm11.
 */
#define HF28_LANE_HALVES(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/* Whole-vector butterfly: xmm<sum> = xmm<a> + xmm<b>, xmm<dif> = xmm<a> - xmm<b>. */
#define HF28_BFLY(a, b, sum, dif) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #dif "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #dif "\n"

/* Fetch operands %0..%7 into xmm0..xmm7. */
#define HF28_LOAD8 \
  HF28_LOAD(0, 0) HF28_LOAD(1, 1) HF28_LOAD(2, 2) HF28_LOAD(3, 3) \
  HF28_LOAD(4, 4) HF28_LOAD(5, 5) HF28_LOAD(6, 6) HF28_LOAD(7, 7)

/* Three butterfly levels across xmm0..xmm7; results end up in xmm8..xmm15. */
#define HF28_CROSS8 \
  HF28_BFLY(0, 1, 8, 9) HF28_BFLY(2, 3, 10, 11) \
  HF28_BFLY(4, 5, 12, 13) HF28_BFLY(6, 7, 14, 15) \
  HF28_BFLY(8, 10, 0, 2) HF28_BFLY(9, 11, 1, 3) \
  HF28_BFLY(12, 14, 4, 6) HF28_BFLY(13, 15, 5, 7) \
  HF28_BFLY(0, 4, 8, 12) HF28_BFLY(1, 5, 9, 13) \
  HF28_BFLY(2, 6, 10, 14) HF28_BFLY(3, 7, 11, 15)

/* Write xmm8..xmm15 back through operands %0..%7. */
#define HF28_STORE8 \
  HF28_STORE(8, 0) HF28_STORE(9, 1) HF28_STORE(10, 2) HF28_STORE(11, 3) \
  HF28_STORE(12, 4) HF28_STORE(13, 5) HF28_STORE(14, 6) HF28_STORE(15, 7)

/* Every kernel declares all sixteen xmm registers plus memory as clobbered. */
#define HF28_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", \
  "memory"

/*
 * Leaf pass: walks `total` floats in chunks of 32 (eight 4-float vectors).
 * Each chunk gets two butterfly levels inside every vector followed by three
 * levels across the eight vectors, i.e. five levels in a single asm block.
 */
static inline void helper_float_28_leaf32(float *buf, int total) {
  for (int base = 0; base < total; base += 32) {
    float *p = buf + base;
    __asm__ volatile (
      HF28_LOAD8
      HF28_LANE_PAIRS(0) HF28_LANE_PAIRS(1) HF28_LANE_PAIRS(2) HF28_LANE_PAIRS(3)
      HF28_LANE_PAIRS(4) HF28_LANE_PAIRS(5) HF28_LANE_PAIRS(6) HF28_LANE_PAIRS(7)
      HF28_LANE_HALVES(0) HF28_LANE_HALVES(1) HF28_LANE_HALVES(2) HF28_LANE_HALVES(3)
      HF28_LANE_HALVES(4) HF28_LANE_HALVES(5) HF28_LANE_HALVES(6) HF28_LANE_HALVES(7)
      HF28_CROSS8
      HF28_STORE8
      :: "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
         "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
      : HF28_CLOBBERS
    );
  }
}

/*
 * Three butterfly levels over `total` floats, combining eight vectors that
 * lie `stride` floats apart.  The buffer is processed in groups of
 * 8 * stride floats; within a group the vector offset advances by 4 floats.
 */
static inline void helper_float_28_cross8_pass(float *buf, int total, int stride) {
  const int group = 8 * stride;
  for (int base = 0; base < total; base += group) {
    for (int off = 0; off < stride; off += 4) {
      float *p = buf + base + off;
      __asm__ volatile (
        HF28_LOAD8
        HF28_CROSS8
        HF28_STORE8
        :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
           "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
           "r"(p + 7 * stride)
        : HF28_CLOBBERS
      );
    }
  }
}

/*
 * Two butterfly levels over `total` floats, combining four vectors that lie
 * `stride` floats apart (groups of 4 * stride floats).  Results are produced
 * in xmm0..xmm3 and stored back to the locations they were loaded from.
 */
static inline void helper_float_28_cross4_pass(float *buf, int total, int stride) {
  const int group = 4 * stride;
  for (int base = 0; base < total; base += group) {
    for (int off = 0; off < stride; off += 4) {
      float *p = buf + base + off;
      __asm__ volatile (
        HF28_LOAD(0, 0) HF28_LOAD(1, 1) HF28_LOAD(2, 2) HF28_LOAD(3, 3)
        HF28_BFLY(0, 1, 8, 9) HF28_BFLY(2, 3, 10, 11)
        HF28_BFLY(8, 10, 0, 2) HF28_BFLY(9, 11, 1, 3)
        HF28_STORE(0, 0) HF28_STORE(1, 1) HF28_STORE(2, 2) HF28_STORE(3, 3)
        :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
        : HF28_CLOBBERS
      );
    }
  }
}

/*
 * Recursive driver behind helper_float_28: an in-place network of add/sub
 * butterflies (an unnormalized Walsh-Hadamard transform) over the first
 * 2^depth floats of `buf`.
 *
 * depth == 16            base case on 65536 floats: the 32-float leaf pass
 *                        (5 levels), three 8-way passes at strides 32, 256
 *                        and 2048 (3 levels each), then one 4-way pass at
 *                        stride 16384 (2 levels).
 * depth == 19/22/25/28   transform each of the eight contiguous sub-blocks of
 *                        2^(depth-3) floats at depth - 3, then merge them
 *                        with one 8-way pass at that stride.
 * any other depth        nothing happens; `buf` is left untouched.
 *
 * All memory accesses use movups, so no particular alignment of `buf` is
 * required by the instructions themselves.
 */
void helper_float_28_recursive(float *buf, int depth);
void helper_float_28_recursive(float *buf, int depth) {
  if (depth == 16) {
    helper_float_28_leaf32(buf, 65536);
    helper_float_28_cross8_pass(buf, 65536, 32);
    helper_float_28_cross8_pass(buf, 65536, 256);
    helper_float_28_cross8_pass(buf, 65536, 2048);
    helper_float_28_cross4_pass(buf, 65536, 16384);
    return;
  }
  if (depth == 19 || depth == 22 || depth == 25 || depth == 28) {
    /* Each of the eight sub-blocks spans 2^(depth-3) floats. */
    const int stride = 1 << (depth - 3);
    for (int part = 0; part < 8; ++part) {
      helper_float_28_recursive(buf + part * stride, depth - 3);
    }
    helper_float_28_cross8_pass(buf, 8 * stride, stride);
  }
}

#undef HF28_CLOBBERS
#undef HF28_STORE8
#undef HF28_CROSS8
#undef HF28_LOAD8
#undef HF28_BFLY
#undef HF28_LANE_HALVES
#undef HF28_LANE_PAIRS
#undef HF28_STORE
#undef HF28_LOAD
/*
 * Entry point for the depth-28 transform: forwards `buf` to
 * helper_float_28_recursive at its top level (28).  At that depth the
 * recursive driver reads and writes the first 2^28 floats of `buf` in place.
 */
void helper_float_28(float *buf);
void helper_float_28(float *buf) {
  enum { TOP_DEPTH = 28 };
  helper_float_28_recursive(buf, TOP_DEPTH);
}
/*
 * helper_float_29_recursive and the private SSE kernels it is built from.
 *
 * Every kernel performs in-place butterflies (u, v) -> (u + v, u - v) on
 * 4-float vectors using unaligned loads/stores (movups), so buf needs no
 * particular alignment.  Applying butterflies at strides 1, 2, 4, ...,
 * 2^(depth-1) yields the unnormalised Hadamard transform of 2^depth floats.
 *
 * The asm text is assembled from string-literal macros so that each
 * instruction sequence exists exactly once; the macros are #undef'd at the
 * end of this block.
 */

/* Registers and memory touched by the kernels below. */
#define HF29_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/* Load the eight vectors addressed by operands %0..%7 into xmm0..xmm7. */
#define HF29_LOAD8 \
  "movups (%0), %%xmm0\n" \
  "movups (%1), %%xmm1\n" \
  "movups (%2), %%xmm2\n" \
  "movups (%3), %%xmm3\n" \
  "movups (%4), %%xmm4\n" \
  "movups (%5), %%xmm5\n" \
  "movups (%6), %%xmm6\n" \
  "movups (%7), %%xmm7\n"

/*
 * Stride-1 butterfly inside xmm<r>:
 * [a, b, c, d] -> [a + b, a - b, c + d, c - d].
 * Uses xmm8 and xmm9 as scratch.
 */
#define HF29_LANE_STRIDE1(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Stride-2 butterfly inside xmm<r>:
 * [p, q, r, s] -> [p + r, q + s, p - r, q - s].
 * Uses xmm8..xmm11 as scratch.
 */
#define HF29_LANE_STRIDE2(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/*
 * Three butterfly stages across xmm0..xmm7 (register distance 1, 2, 4).
 * Results end up in xmm8..xmm15.
 */
#define HF29_BUTTERFLY8 \
  "movaps %%xmm0, %%xmm8\n" \
  "movaps %%xmm0, %%xmm9\n" \
  "addps %%xmm1, %%xmm8\n" \
  "subps %%xmm1, %%xmm9\n" \
  "movaps %%xmm2, %%xmm10\n" \
  "movaps %%xmm2, %%xmm11\n" \
  "addps %%xmm3, %%xmm10\n" \
  "subps %%xmm3, %%xmm11\n" \
  "movaps %%xmm4, %%xmm12\n" \
  "movaps %%xmm4, %%xmm13\n" \
  "addps %%xmm5, %%xmm12\n" \
  "subps %%xmm5, %%xmm13\n" \
  "movaps %%xmm6, %%xmm14\n" \
  "movaps %%xmm6, %%xmm15\n" \
  "addps %%xmm7, %%xmm14\n" \
  "subps %%xmm7, %%xmm15\n" \
  "movaps %%xmm8, %%xmm0\n" \
  "movaps %%xmm8, %%xmm2\n" \
  "addps %%xmm10, %%xmm0\n" \
  "subps %%xmm10, %%xmm2\n" \
  "movaps %%xmm9, %%xmm1\n" \
  "movaps %%xmm9, %%xmm3\n" \
  "addps %%xmm11, %%xmm1\n" \
  "subps %%xmm11, %%xmm3\n" \
  "movaps %%xmm12, %%xmm4\n" \
  "movaps %%xmm12, %%xmm6\n" \
  "addps %%xmm14, %%xmm4\n" \
  "subps %%xmm14, %%xmm6\n" \
  "movaps %%xmm13, %%xmm5\n" \
  "movaps %%xmm13, %%xmm7\n" \
  "addps %%xmm15, %%xmm5\n" \
  "subps %%xmm15, %%xmm7\n" \
  "movaps %%xmm0, %%xmm8\n" \
  "movaps %%xmm0, %%xmm12\n" \
  "addps %%xmm4, %%xmm8\n" \
  "subps %%xmm4, %%xmm12\n" \
  "movaps %%xmm1, %%xmm9\n" \
  "movaps %%xmm1, %%xmm13\n" \
  "addps %%xmm5, %%xmm9\n" \
  "subps %%xmm5, %%xmm13\n" \
  "movaps %%xmm2, %%xmm10\n" \
  "movaps %%xmm2, %%xmm14\n" \
  "addps %%xmm6, %%xmm10\n" \
  "subps %%xmm6, %%xmm14\n" \
  "movaps %%xmm3, %%xmm11\n" \
  "movaps %%xmm3, %%xmm15\n" \
  "addps %%xmm7, %%xmm11\n" \
  "subps %%xmm7, %%xmm15\n"

/* Store xmm8..xmm15 back through operands %0..%7. */
#define HF29_STORE8 \
  "movups %%xmm8, (%0)\n" \
  "movups %%xmm9, (%1)\n" \
  "movups %%xmm10, (%2)\n" \
  "movups %%xmm11, (%3)\n" \
  "movups %%xmm12, (%4)\n" \
  "movups %%xmm13, (%5)\n" \
  "movups %%xmm14, (%6)\n" \
  "movups %%xmm15, (%7)\n"

/*
 * Transforms the 32 consecutive floats at p in place: butterfly strides
 * 1 and 2 inside each register, then strides 4, 8 and 16 across registers.
 */
static inline void helper_float_29_block32(float *p) {
  __asm__ volatile (
    HF29_LOAD8
    HF29_LANE_STRIDE1(0)
    HF29_LANE_STRIDE1(1)
    HF29_LANE_STRIDE1(2)
    HF29_LANE_STRIDE1(3)
    HF29_LANE_STRIDE1(4)
    HF29_LANE_STRIDE1(5)
    HF29_LANE_STRIDE1(6)
    HF29_LANE_STRIDE1(7)
    HF29_LANE_STRIDE2(0)
    HF29_LANE_STRIDE2(1)
    HF29_LANE_STRIDE2(2)
    HF29_LANE_STRIDE2(3)
    HF29_LANE_STRIDE2(4)
    HF29_LANE_STRIDE2(5)
    HF29_LANE_STRIDE2(6)
    HF29_LANE_STRIDE2(7)
    HF29_BUTTERFLY8
    HF29_STORE8
    :: "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
       "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
    : HF29_CLOBBERS
  );
}

/*
 * Radix-8 step: combines the eight 4-float vectors at p, p + stride, ...,
 * p + 7 * stride with butterflies at strides stride, 2 * stride and
 * 4 * stride.  stride is measured in floats.
 */
static inline void helper_float_29_radix8(float *p, int stride) {
  __asm__ volatile (
    HF29_LOAD8
    HF29_BUTTERFLY8
    HF29_STORE8
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
       "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
       "r"(p + 7 * stride)
    : HF29_CLOBBERS
  );
}

/*
 * Radix-4 step: combines the four 4-float vectors at p, p + stride,
 * p + 2 * stride and p + 3 * stride (butterfly strides stride and
 * 2 * stride).  stride is measured in floats.
 */
static inline void helper_float_29_radix4(float *p, int stride) {
  __asm__ volatile (
    "movups (%0), %%xmm0\n"
    "movups (%1), %%xmm1\n"
    "movups (%2), %%xmm2\n"
    "movups (%3), %%xmm3\n"
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm9\n"
    "addps %%xmm1, %%xmm8\n"
    "subps %%xmm1, %%xmm9\n"
    "movaps %%xmm2, %%xmm10\n"
    "movaps %%xmm2, %%xmm11\n"
    "addps %%xmm3, %%xmm10\n"
    "subps %%xmm3, %%xmm11\n"
    "movaps %%xmm8, %%xmm0\n"
    "movaps %%xmm8, %%xmm2\n"
    "addps %%xmm10, %%xmm0\n"
    "subps %%xmm10, %%xmm2\n"
    "movaps %%xmm9, %%xmm1\n"
    "movaps %%xmm9, %%xmm3\n"
    "addps %%xmm11, %%xmm1\n"
    "subps %%xmm11, %%xmm3\n"
    "movups %%xmm0, (%0)\n"
    "movups %%xmm1, (%1)\n"
    "movups %%xmm2, (%2)\n"
    "movups %%xmm3, (%3)\n"
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
    : HF29_CLOBBERS
  );
}

/* Radix-2 step on two 4-float vectors: (lo, hi) -> (lo + hi, lo - hi). */
static inline void helper_float_29_radix2(float *lo, float *hi) {
  __asm__ volatile (
    "movups (%0), %%xmm0\n"
    "movups (%1), %%xmm1\n"
    "movaps %%xmm0, %%xmm8\n"
    "movaps %%xmm0, %%xmm9\n"
    "addps %%xmm1, %%xmm8\n"
    "subps %%xmm1, %%xmm9\n"
    "movups %%xmm8, (%0)\n"
    "movups %%xmm9, (%1)\n"
    :: "r"(lo), "r"(hi)
    : HF29_CLOBBERS
  );
}

/*
 * In-place transform of the 2^depth floats starting at buf.
 *
 * buf:   array holding at least 2^depth floats; overwritten with the result.
 * depth: log2 of the sub-problem size.  Only 12, 15, 18, 21, 24, 27 and 29
 *        are handled; for any other value the function returns without
 *        touching buf.
 *
 * depth 12 is the iterative base case (4096 floats).  Each larger depth
 * first transforms its equally sized sub-blocks recursively, in ascending
 * address order, and then merges them with one radix-8 pass (radix-4 for
 * depth 29, which only has four sub-blocks).
 */
void helper_float_29_recursive(float *buf, int depth);
void helper_float_29_recursive(float *buf, int depth) {
  switch (depth) {
  case 12:
    /* Butterfly strides 1..16: independent 32-float blocks. */
    for (int base = 0; base < 4096; base += 32) {
      helper_float_29_block32(buf + base);
    }
    /* Butterfly strides 32, 64, 128. */
    for (int base = 0; base < 4096; base += 256) {
      for (int off = 0; off < 32; off += 4) {
        helper_float_29_radix8(buf + base + off, 32);
      }
    }
    /* Butterfly strides 256, 512, 1024. */
    for (int base = 0; base < 4096; base += 2048) {
      for (int off = 0; off < 256; off += 4) {
        helper_float_29_radix8(buf + base + off, 256);
      }
    }
    /* Butterfly stride 2048. */
    for (int off = 0; off < 2048; off += 4) {
      helper_float_29_radix2(buf + off, buf + off + 2048);
    }
    return;

  case 15:
  case 18:
  case 21:
  case 24:
  case 27: {
    /* Eight sub-blocks of 2^(depth - 3) floats each, then one merge pass. */
    const int sub_depth = depth - 3;
    const int span = 1 << sub_depth;
    for (int part = 0; part < 8; ++part) {
      helper_float_29_recursive(buf + part * span, sub_depth);
    }
    for (int off = 0; off < span; off += 4) {
      helper_float_29_radix8(buf + off, span);
    }
    return;
  }

  case 29: {
    /* Four sub-blocks of 2^27 floats each, merged by a radix-4 pass. */
    const int span = 134217728;
    for (int part = 0; part < 4; ++part) {
      helper_float_29_recursive(buf + part * span, 27);
    }
    for (int off = 0; off < span; off += 4) {
      helper_float_29_radix4(buf + off, span);
    }
    return;
  }

  default:
    /* Unsupported depth: nothing to do. */
    return;
  }
}

#undef HF29_STORE8
#undef HF29_BUTTERFLY8
#undef HF29_LANE_STRIDE2
#undef HF29_LANE_STRIDE1
#undef HF29_LOAD8
#undef HF29_CLOBBERS
/*
 * Top-level driver for the 2^29-element case: starts the recursive routine at
 * its outermost level (depth 29).  `buf` is transformed in place by the
 * add/subtract butterflies performed in helper_float_29_recursive.
 */
void helper_float_29(float *buf);
void helper_float_29(float *buf) {
  const int top_depth = 29;  /* outermost level handled by the recursion */

  helper_float_29_recursive(buf, top_depth);
}
/*
 * Building blocks for helper_float_30_recursive.
 *
 * Each kernel below is one SSE inline-assembly statement whose instruction
 * text is assembled from the string macros that follow.  The macros are
 * removed again (#undef) right after the function so that they stay private
 * to this part of the file.
 */

/* Everything a kernel may overwrite: all sixteen XMM registers and memory. */
#define HF30_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/* Unaligned load of four floats from asm operand `op` into xmm<r>. */
#define HF30_LOAD(op, r) "movups (%" #op "), %%xmm" #r "\n"

/* Unaligned store of the four floats in xmm<r> to asm operand `op`. */
#define HF30_STORE(r, op) "movups %%xmm" #r ", (%" #op ")\n"

/* Operands 0..7 -> xmm0..xmm7. */
#define HF30_LOAD8 \
  HF30_LOAD(0, 0) HF30_LOAD(1, 1) HF30_LOAD(2, 2) HF30_LOAD(3, 3) \
  HF30_LOAD(4, 4) HF30_LOAD(5, 5) HF30_LOAD(6, 6) HF30_LOAD(7, 7)

/* xmm8..xmm15 -> operands 0..7. */
#define HF30_STORE8 \
  HF30_STORE(8, 0) HF30_STORE(9, 1) HF30_STORE(10, 2) HF30_STORE(11, 3) \
  HF30_STORE(12, 4) HF30_STORE(13, 5) HF30_STORE(14, 6) HF30_STORE(15, 7)

/* Lane-wise butterfly: xmm<sum> = xmm<a> + xmm<b>, xmm<diff> = xmm<a> - xmm<b>. */
#define HF30_BFLY(a, b, sum, diff) \
  "movaps %%xmm" #a ", %%xmm" #sum "\n" \
  "movaps %%xmm" #a ", %%xmm" #diff "\n" \
  "addps %%xmm" #b ", %%xmm" #sum "\n" \
  "subps %%xmm" #b ", %%xmm" #diff "\n"

/*
 * Three butterfly levels across xmm0..xmm7 (partners 1, 2 and 4 registers
 * apart); the eight results end up in xmm8..xmm15.
 */
#define HF30_RADIX8 \
  HF30_BFLY(0, 1, 8, 9) \
  HF30_BFLY(2, 3, 10, 11) \
  HF30_BFLY(4, 5, 12, 13) \
  HF30_BFLY(6, 7, 14, 15) \
  HF30_BFLY(8, 10, 0, 2) \
  HF30_BFLY(9, 11, 1, 3) \
  HF30_BFLY(12, 14, 4, 6) \
  HF30_BFLY(13, 15, 5, 7) \
  HF30_BFLY(0, 4, 8, 12) \
  HF30_BFLY(1, 5, 9, 13) \
  HF30_BFLY(2, 6, 10, 14) \
  HF30_BFLY(3, 7, 11, 15)

/*
 * Butterfly between neighbouring lanes of xmm<r>:
 *   [a0, a1, a2, a3] -> [a0+a1, a0-a1, a2+a3, a2-a3]
 * Uses xmm8 and xmm9 as scratch.
 */
#define HF30_LANES_1(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $160, %%xmm8, %%xmm8\n" \
  "shufps $245, %%xmm" #r ", %%xmm" #r "\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "subps %%xmm" #r ", %%xmm9\n" \
  "addsubps %%xmm9, %%xmm8\n" \
  "movaps %%xmm8, %%xmm" #r "\n"

/*
 * Butterfly between the two halves of xmm<r>:
 *   [a0, a1, a2, a3] -> [a0+a2, a1+a3, a0-a2, a1-a3]
 * Uses xmm8..xmm11 as scratch.
 */
#define HF30_LANES_2(r) \
  "movaps %%xmm" #r ", %%xmm8\n" \
  "shufps $68, %%xmm8, %%xmm8\n" \
  "xorps %%xmm9, %%xmm9\n" \
  "movaps %%xmm" #r ", %%xmm10\n" \
  "shufps $14, %%xmm9, %%xmm10\n" \
  "movaps %%xmm" #r ", %%xmm11\n" \
  "shufps $224, %%xmm11, %%xmm9\n" \
  "addps %%xmm8, %%xmm10\n" \
  "subps %%xmm9, %%xmm10\n" \
  "movaps %%xmm10, %%xmm" #r "\n"

/*
 * Fully transforms 32 consecutive floats starting at `p`: two in-register
 * levels per vector followed by three levels across the eight vectors.
 */
static inline void helper_float_30_block32(float *p) {
  __asm__ volatile (
    HF30_LOAD8
    HF30_LANES_1(0) HF30_LANES_1(1) HF30_LANES_1(2) HF30_LANES_1(3)
    HF30_LANES_1(4) HF30_LANES_1(5) HF30_LANES_1(6) HF30_LANES_1(7)
    HF30_LANES_2(0) HF30_LANES_2(1) HF30_LANES_2(2) HF30_LANES_2(3)
    HF30_LANES_2(4) HF30_LANES_2(5) HF30_LANES_2(6) HF30_LANES_2(7)
    HF30_RADIX8
    HF30_STORE8
    :: "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12),
       "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
    : HF30_CLOBBERS
  );
}

/*
 * Three butterfly levels over the eight 4-float vectors located at
 * p, p + stride, ..., p + 7 * stride (results written back in place).
 */
static inline void helper_float_30_radix8(float *p, int stride) {
  __asm__ volatile (
    HF30_LOAD8
    HF30_RADIX8
    HF30_STORE8
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
       "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
       "r"(p + 7 * stride)
    : HF30_CLOBBERS
  );
}

/* One butterfly level between the 4-float vectors at `lo` and `hi`. */
static inline void helper_float_30_radix2(float *lo, float *hi) {
  __asm__ volatile (
    HF30_LOAD(0, 0)
    HF30_LOAD(1, 1)
    HF30_BFLY(0, 1, 8, 9)
    HF30_STORE(8, 0)
    HF30_STORE(9, 1)
    :: "r"(lo), "r"(hi)
    : HF30_CLOBBERS
  );
}

/*
 * Runs helper_float_30_radix8 over buf[0 .. total): the buffer is cut into
 * groups of 8 * stride floats and, inside each group, every 4-float column
 * of the first `stride` floats is combined with its seven partners.
 */
static inline void helper_float_30_radix8_pass(float *buf, int total, int stride) {
  const int group = 8 * stride;

  for (int base = 0; base < total; base += group) {
    for (int col = 0; col < stride; col += 4) {
      helper_float_30_radix8(buf + base + col, stride);
    }
  }
}

/*
 * In-place, unnormalized add/subtract butterfly (Hadamard) transform of the
 * 2^depth floats starting at `buf`.
 *
 * Supported depths are 12, 15, 18, 21, 24, 27 and 30; any other value leaves
 * the buffer untouched.
 *
 *   depth == 12  leaf case, 4096 floats: 32-float blocks (5 levels), then
 *                radix-8 passes at strides 32 and 256 (3 levels each), then
 *                a final radix-2 level at stride 2048.
 *   otherwise    transform the eight sub-blocks of 2^(depth-3) floats
 *                recursively, then merge them with one radix-8 pass.
 */
void helper_float_30_recursive(float *buf, int depth);
void helper_float_30_recursive(float *buf, int depth) {
  if (depth == 12) {
    const int leaf_len = 4096;
    const int half_len = 2048;

    for (int blk = 0; blk < leaf_len; blk += 32) {
      helper_float_30_block32(buf + blk);
    }
    helper_float_30_radix8_pass(buf, leaf_len, 32);
    helper_float_30_radix8_pass(buf, leaf_len, 256);
    for (int col = 0; col < half_len; col += 4) {
      helper_float_30_radix2(buf + col, buf + col + half_len);
    }
    return;
  }

  switch (depth) {
    case 15:
    case 18:
    case 21:
    case 24:
    case 27:
    case 30:
      break;
    default:
      return;  /* unsupported depth: nothing to do */
  }

  {
    const int child_depth = depth - 3;
    const int child_len = 1 << child_depth;  /* at most 2^27, fits in int */

    for (int part = 0; part < 8; ++part) {
      helper_float_30_recursive(buf + part * child_len, child_depth);
    }
    helper_float_30_radix8_pass(buf, 8 * child_len, child_len);
  }
}

#undef HF30_LANES_2
#undef HF30_LANES_1
#undef HF30_RADIX8
#undef HF30_BFLY
#undef HF30_STORE8
#undef HF30_LOAD8
#undef HF30_STORE
#undef HF30_LOAD
#undef HF30_CLOBBERS
void helper_float_30(float *buf);
/*
 * Entry point used by fht_float() for log_n == 30.
 * Simply starts the recursive kernel at its top level (depth 30); all the
 * work on `buf` happens inside helper_float_30_recursive().
 */
void helper_float_30(float *buf) {
  const int top_depth = 30;
  helper_float_30_recursive(buf, top_depth);
}
/*
 * Run the float kernel that matches `log_n` on `buf`.
 *
 * buf:   data handed unchanged to the selected helper_float_<log_n>() kernel
 *        (presumably 2^log_n floats, transformed in place -- confirm in fht.h).
 * log_n: kernel selector; 0 is accepted and does nothing.
 *
 * Returns 0 when log_n is in 0..30, and 1 (without touching buf) otherwise.
 */
int fht_float(float *buf, int log_n) {
  switch (log_n) {
  case 0:  break; /* nothing to do, still reported as success */
  case 1:  helper_float_1(buf);  break;
  case 2:  helper_float_2(buf);  break;
  case 3:  helper_float_3(buf);  break;
  case 4:  helper_float_4(buf);  break;
  case 5:  helper_float_5(buf);  break;
  case 6:  helper_float_6(buf);  break;
  case 7:  helper_float_7(buf);  break;
  case 8:  helper_float_8(buf);  break;
  case 9:  helper_float_9(buf);  break;
  case 10: helper_float_10(buf); break;
  case 11: helper_float_11(buf); break;
  case 12: helper_float_12(buf); break;
  case 13: helper_float_13(buf); break;
  case 14: helper_float_14(buf); break;
  case 15: helper_float_15(buf); break;
  case 16: helper_float_16(buf); break;
  case 17: helper_float_17(buf); break;
  case 18: helper_float_18(buf); break;
  case 19: helper_float_19(buf); break;
  case 20: helper_float_20(buf); break;
  case 21: helper_float_21(buf); break;
  case 22: helper_float_22(buf); break;
  case 23: helper_float_23(buf); break;
  case 24: helper_float_24(buf); break;
  case 25: helper_float_25(buf); break;
  case 26: helper_float_26(buf); break;
  case 27: helper_float_27(buf); break;
  case 28: helper_float_28(buf); break;
  case 29: helper_float_29(buf); break;
  case 30: helper_float_30(buf); break;
  default:
    return 1; /* unsupported size */
  }
  return 0;
}
static inline void helper_double_1(double *buf);
/*
 * In-place 2-point butterfly on buf[0..1]: (a, b) -> (a + b, a - b).
 * Implemented with haddpd/hsubpd (SSE3) and blendpd (SSE4.1) via inline asm;
 * the load and store are unaligned (movupd).
 */
static inline void helper_double_1(double *buf) {
  double *pair = buf; /* only offset 0 is ever processed */
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"         /* xmm0 = {a, b} */
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"       /* xmm8 = {a + b, a + b} */
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"       /* xmm9 = {a - b, a - b} */
    "blendpd $1, %%xmm8, %%xmm9\n"  /* low lane from xmm8: xmm9 = {a + b, a - b} */
    "movapd %%xmm9, %%xmm0\n"
    "movupd %%xmm0, (%0)\n"         /* write the pair back */
    :: "r"(pair) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}
void helper_double_2_recursive(double *buf, int depth);
/*
 * 4-point in-place kernel on buf[0..3].
 *
 * Only depth == 2 does anything; every other depth returns without touching
 * buf.  The kernel applies an add/subtract butterfly inside each pair
 * (stride 1) and then between the two pairs (stride 2).
 */
void helper_double_2_recursive(double *buf, int depth) {
  if (depth != 2) {
    return;
  }
  double *lo = buf;     /* elements 0..1 */
  double *hi = buf + 2; /* elements 2..3 */
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"         /* xmm0 = {a, b} */
    "movupd (%1), %%xmm1\n"         /* xmm1 = {c, d} */
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm0\n"       /* xmm0 = {a + b, a - b} */
    "movapd %%xmm1, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm1, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm1\n"       /* xmm1 = {c + d, c - d} */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"        /* sums of the two pairs */
    "subpd %%xmm1, %%xmm9\n"        /* differences of the two pairs */
    "movupd %%xmm8, (%0)\n"
    "movupd %%xmm9, (%1)\n"
    :: "r"(lo), "r"(hi) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}
void helper_double_2(double *buf);
/*
 * Wrapper that runs helper_double_2_recursive() at its top level (depth 2)
 * on `buf`.
 */
void helper_double_2(double *buf) {
  const int top_depth = 2;
  helper_double_2_recursive(buf, top_depth);
}
void helper_double_3_recursive(double *buf, int depth);

/* Building blocks for the asm template; each expands to the literal
 * instruction text, so the assembled code is the same as a hand-unrolled
 * listing. */
#define HD3_LD(n) "movupd (%" #n "), %%xmm" #n "\n"
#define HD3_ST(r, n) "movupd %%xmm" #r ", (%" #n ")\n"
/* Butterfly inside one register: {a, b} -> {a + b, a - b} (uses xmm8/xmm9). */
#define HD3_LANE(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"
/* Butterfly between registers: s = a + b, d = a - b. */
#define HD3_BFLY(a, b, s, d) \
  "movapd %%xmm" #a ", %%xmm" #s "\n" \
  "movapd %%xmm" #a ", %%xmm" #d "\n" \
  "addpd %%xmm" #b ", %%xmm" #s "\n" \
  "subpd %%xmm" #b ", %%xmm" #d "\n"
#define HD3_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * 8-point in-place kernel on buf[0..7].
 *
 * Only depth == 3 does anything; any other depth leaves buf untouched.
 * The kernel applies add/subtract butterflies at strides 1, 2 and 4.
 */
void helper_double_3_recursive(double *buf, int depth) {
  if (depth != 3) {
    return;
  }
  __asm__ volatile (
    HD3_LD(0) HD3_LD(1) HD3_LD(2) HD3_LD(3)
    /* stride 1: within each register */
    HD3_LANE(0) HD3_LANE(1) HD3_LANE(2) HD3_LANE(3)
    /* stride 2: neighbouring registers */
    HD3_BFLY(0, 1, 8, 9) HD3_BFLY(2, 3, 10, 11)
    /* stride 4: register pairs */
    HD3_BFLY(8, 10, 0, 2) HD3_BFLY(9, 11, 1, 3)
    HD3_ST(0, 0) HD3_ST(1, 1) HD3_ST(2, 2) HD3_ST(3, 3)
    :: "r"(buf), "r"(buf + 2), "r"(buf + 4), "r"(buf + 6) : HD3_CLOBBERS
  );
}

#undef HD3_LD
#undef HD3_ST
#undef HD3_LANE
#undef HD3_BFLY
#undef HD3_CLOBBERS
void helper_double_3(double *buf);
/*
 * Wrapper that runs helper_double_3_recursive() at its top level (depth 3)
 * on `buf`.
 */
void helper_double_3(double *buf) {
  const int top_depth = 3;
  helper_double_3_recursive(buf, top_depth);
}
static inline void helper_double_4(double *buf);

/* Building blocks for the asm template; each expands to the literal
 * instruction text. */
#define HD4_LD(n) "movupd (%" #n "), %%xmm" #n "\n"
#define HD4_ST(r, n) "movupd %%xmm" #r ", (%" #n ")\n"
/* Butterfly inside one register: {a, b} -> {a + b, a - b} (uses xmm8/xmm9). */
#define HD4_LANE(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"
/* Butterfly between registers: s = a + b, d = a - b. */
#define HD4_BFLY(a, b, s, d) \
  "movapd %%xmm" #a ", %%xmm" #s "\n" \
  "movapd %%xmm" #a ", %%xmm" #d "\n" \
  "addpd %%xmm" #b ", %%xmm" #s "\n" \
  "subpd %%xmm" #b ", %%xmm" #d "\n"
#define HD4_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * 16-point in-place kernel on buf[0..15].
 *
 * All sixteen doubles are loaded into xmm0..xmm7 (two per register), put
 * through add/subtract butterflies at strides 1, 2, 4 and 8, and written
 * back in one go.
 */
static inline void helper_double_4(double *buf) {
  __asm__ volatile (
    HD4_LD(0) HD4_LD(1) HD4_LD(2) HD4_LD(3)
    HD4_LD(4) HD4_LD(5) HD4_LD(6) HD4_LD(7)
    /* stride 1: within each register */
    HD4_LANE(0) HD4_LANE(1) HD4_LANE(2) HD4_LANE(3)
    HD4_LANE(4) HD4_LANE(5) HD4_LANE(6) HD4_LANE(7)
    /* stride 2 */
    HD4_BFLY(0, 1, 8, 9) HD4_BFLY(2, 3, 10, 11)
    HD4_BFLY(4, 5, 12, 13) HD4_BFLY(6, 7, 14, 15)
    /* stride 4 */
    HD4_BFLY(8, 10, 0, 2) HD4_BFLY(9, 11, 1, 3)
    HD4_BFLY(12, 14, 4, 6) HD4_BFLY(13, 15, 5, 7)
    /* stride 8 */
    HD4_BFLY(0, 4, 8, 12) HD4_BFLY(1, 5, 9, 13)
    HD4_BFLY(2, 6, 10, 14) HD4_BFLY(3, 7, 11, 15)
    HD4_ST(8, 0) HD4_ST(9, 1) HD4_ST(10, 2) HD4_ST(11, 3)
    HD4_ST(12, 4) HD4_ST(13, 5) HD4_ST(14, 6) HD4_ST(15, 7)
    :: "r"(buf), "r"(buf + 2), "r"(buf + 4), "r"(buf + 6), "r"(buf + 8), "r"(buf + 10), "r"(buf + 12), "r"(buf + 14) : HD4_CLOBBERS
  );
}

#undef HD4_LD
#undef HD4_ST
#undef HD4_LANE
#undef HD4_BFLY
#undef HD4_CLOBBERS
void helper_double_5_recursive(double *buf, int depth);

/* Building blocks for the asm templates; each expands to the literal
 * instruction text. */
#define HD5_LD(n) "movupd (%" #n "), %%xmm" #n "\n"
#define HD5_ST(r, n) "movupd %%xmm" #r ", (%" #n ")\n"
/* Butterfly inside one register: {a, b} -> {a + b, a - b} (uses xmm8/xmm9). */
#define HD5_LANE(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"
/* Butterfly between registers: s = a + b, d = a - b. */
#define HD5_BFLY(a, b, s, d) \
  "movapd %%xmm" #a ", %%xmm" #s "\n" \
  "movapd %%xmm" #a ", %%xmm" #d "\n" \
  "addpd %%xmm" #b ", %%xmm" #s "\n" \
  "subpd %%xmm" #b ", %%xmm" #d "\n"
#define HD5_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * Recursive in-place kernel for 32 doubles.
 *
 *   depth == 2: 4-point kernel on buf[0..3] (butterflies at strides 1, 2).
 *   depth == 5: runs the depth-2 kernel on each of the eight 4-element
 *               sub-blocks of buf[0..31], then combines them with
 *               butterflies at strides 4, 8 and 16.
 *   otherwise:  no effect.
 */
void helper_double_5_recursive(double *buf, int depth) {
  switch (depth) {
  case 2:
    __asm__ volatile (
      HD5_LD(0) HD5_LD(1)
      HD5_LANE(0) HD5_LANE(1)
      HD5_BFLY(0, 1, 8, 9)
      HD5_ST(8, 0) HD5_ST(9, 1)
      :: "r"(buf), "r"(buf + 2) : HD5_CLOBBERS
    );
    return;
  case 5:
    for (int blk = 0; blk < 32; blk += 4) {
      helper_double_5_recursive(buf + blk, 2);
    }
    /* Each iteration handles two adjacent columns of the 8 x 4 layout. */
    for (int col = 0; col < 4; col += 2) {
      double *p = buf + col;
      __asm__ volatile (
        HD5_LD(0) HD5_LD(1) HD5_LD(2) HD5_LD(3)
        HD5_LD(4) HD5_LD(5) HD5_LD(6) HD5_LD(7)
        /* stride 4 */
        HD5_BFLY(0, 1, 8, 9) HD5_BFLY(2, 3, 10, 11)
        HD5_BFLY(4, 5, 12, 13) HD5_BFLY(6, 7, 14, 15)
        /* stride 8 */
        HD5_BFLY(8, 10, 0, 2) HD5_BFLY(9, 11, 1, 3)
        HD5_BFLY(12, 14, 4, 6) HD5_BFLY(13, 15, 5, 7)
        /* stride 16 */
        HD5_BFLY(0, 4, 8, 12) HD5_BFLY(1, 5, 9, 13)
        HD5_BFLY(2, 6, 10, 14) HD5_BFLY(3, 7, 11, 15)
        HD5_ST(8, 0) HD5_ST(9, 1) HD5_ST(10, 2) HD5_ST(11, 3)
        HD5_ST(12, 4) HD5_ST(13, 5) HD5_ST(14, 6) HD5_ST(15, 7)
        :: "r"(p), "r"(p + 4), "r"(p + 8), "r"(p + 12), "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28) : HD5_CLOBBERS
      );
    }
    return;
  default:
    return;
  }
}

#undef HD5_LD
#undef HD5_ST
#undef HD5_LANE
#undef HD5_BFLY
#undef HD5_CLOBBERS
void helper_double_5(double *buf);
/*
 * Wrapper that runs helper_double_5_recursive() at its top level (depth 5)
 * on `buf`.
 */
void helper_double_5(double *buf) {
  const int top_depth = 5;
  helper_double_5_recursive(buf, top_depth);
}
static inline void helper_double_6(double *buf);

/* Building blocks for the asm templates; each expands to the literal
 * instruction text. */
#define HD6_LD(n) "movupd (%" #n "), %%xmm" #n "\n"
#define HD6_ST(r, n) "movupd %%xmm" #r ", (%" #n ")\n"
/* Butterfly inside one register: {a, b} -> {a + b, a - b} (uses xmm8/xmm9). */
#define HD6_LANE(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"
/* Butterfly between registers: s = a + b, d = a - b. */
#define HD6_BFLY(a, b, s, d) \
  "movapd %%xmm" #a ", %%xmm" #s "\n" \
  "movapd %%xmm" #a ", %%xmm" #d "\n" \
  "addpd %%xmm" #b ", %%xmm" #s "\n" \
  "subpd %%xmm" #b ", %%xmm" #d "\n"
#define HD6_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * 64-point in-place kernel on buf[0..63], done in two passes:
 *   1. each 16-element chunk gets butterflies at strides 1, 2, 4 and 8;
 *   2. the four chunks are combined with butterflies at strides 16 and 32.
 */
static inline void helper_double_6(double *buf) {
  /* Pass 1: independent 16-point kernels. */
  for (int chunk = 0; chunk < 64; chunk += 16) {
    double *p = buf + chunk;
    __asm__ volatile (
      HD6_LD(0) HD6_LD(1) HD6_LD(2) HD6_LD(3)
      HD6_LD(4) HD6_LD(5) HD6_LD(6) HD6_LD(7)
      HD6_LANE(0) HD6_LANE(1) HD6_LANE(2) HD6_LANE(3)
      HD6_LANE(4) HD6_LANE(5) HD6_LANE(6) HD6_LANE(7)
      HD6_BFLY(0, 1, 8, 9) HD6_BFLY(2, 3, 10, 11)
      HD6_BFLY(4, 5, 12, 13) HD6_BFLY(6, 7, 14, 15)
      HD6_BFLY(8, 10, 0, 2) HD6_BFLY(9, 11, 1, 3)
      HD6_BFLY(12, 14, 4, 6) HD6_BFLY(13, 15, 5, 7)
      HD6_BFLY(0, 4, 8, 12) HD6_BFLY(1, 5, 9, 13)
      HD6_BFLY(2, 6, 10, 14) HD6_BFLY(3, 7, 11, 15)
      HD6_ST(8, 0) HD6_ST(9, 1) HD6_ST(10, 2) HD6_ST(11, 3)
      HD6_ST(12, 4) HD6_ST(13, 5) HD6_ST(14, 6) HD6_ST(15, 7)
      :: "r"(p), "r"(p + 2), "r"(p + 4), "r"(p + 6), "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14) : HD6_CLOBBERS
    );
  }
  /* Pass 2: combine the four chunks, two columns at a time. */
  for (int col = 0; col < 16; col += 2) {
    double *p = buf + col;
    __asm__ volatile (
      HD6_LD(0) HD6_LD(1) HD6_LD(2) HD6_LD(3)
      /* stride 16 */
      HD6_BFLY(0, 1, 8, 9) HD6_BFLY(2, 3, 10, 11)
      /* stride 32 */
      HD6_BFLY(8, 10, 0, 2) HD6_BFLY(9, 11, 1, 3)
      HD6_ST(0, 0) HD6_ST(1, 1) HD6_ST(2, 2) HD6_ST(3, 3)
      :: "r"(p), "r"(p + 16), "r"(p + 32), "r"(p + 48) : HD6_CLOBBERS
    );
  }
}

#undef HD6_LD
#undef HD6_ST
#undef HD6_LANE
#undef HD6_BFLY
#undef HD6_CLOBBERS
static inline void helper_double_7(double *buf);

/* Building blocks for the asm templates; each expands to the literal
 * instruction text. */
#define HD7_LD(n) "movupd (%" #n "), %%xmm" #n "\n"
#define HD7_ST(r, n) "movupd %%xmm" #r ", (%" #n ")\n"
/* Butterfly inside one register: {a, b} -> {a + b, a - b} (uses xmm8/xmm9). */
#define HD7_LANE(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"
/* Butterfly between registers: s = a + b, d = a - b. */
#define HD7_BFLY(a, b, s, d) \
  "movapd %%xmm" #a ", %%xmm" #s "\n" \
  "movapd %%xmm" #a ", %%xmm" #d "\n" \
  "addpd %%xmm" #b ", %%xmm" #s "\n" \
  "subpd %%xmm" #b ", %%xmm" #d "\n"
/* Load eight registers, three rounds of register-to-register butterflies,
 * results land in xmm8..xmm15 and are stored back through operands 0..7. */
#define HD7_LOAD8 \
  HD7_LD(0) HD7_LD(1) HD7_LD(2) HD7_LD(3) \
  HD7_LD(4) HD7_LD(5) HD7_LD(6) HD7_LD(7)
#define HD7_CROSS8 \
  HD7_BFLY(0, 1, 8, 9) HD7_BFLY(2, 3, 10, 11) \
  HD7_BFLY(4, 5, 12, 13) HD7_BFLY(6, 7, 14, 15) \
  HD7_BFLY(8, 10, 0, 2) HD7_BFLY(9, 11, 1, 3) \
  HD7_BFLY(12, 14, 4, 6) HD7_BFLY(13, 15, 5, 7) \
  HD7_BFLY(0, 4, 8, 12) HD7_BFLY(1, 5, 9, 13) \
  HD7_BFLY(2, 6, 10, 14) HD7_BFLY(3, 7, 11, 15)
#define HD7_STORE8 \
  HD7_ST(8, 0) HD7_ST(9, 1) HD7_ST(10, 2) HD7_ST(11, 3) \
  HD7_ST(12, 4) HD7_ST(13, 5) HD7_ST(14, 6) HD7_ST(15, 7)
#define HD7_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * 128-point in-place kernel on buf[0..127], done in two passes:
 *   1. each 16-element chunk gets butterflies at strides 1, 2, 4 and 8;
 *   2. the eight chunks are combined with butterflies at strides 16, 32
 *      and 64.
 */
static inline void helper_double_7(double *buf) {
  /* Pass 1: independent 16-point kernels. */
  for (int chunk = 0; chunk < 128; chunk += 16) {
    double *p = buf + chunk;
    __asm__ volatile (
      HD7_LOAD8
      HD7_LANE(0) HD7_LANE(1) HD7_LANE(2) HD7_LANE(3)
      HD7_LANE(4) HD7_LANE(5) HD7_LANE(6) HD7_LANE(7)
      HD7_CROSS8
      HD7_STORE8
      :: "r"(p), "r"(p + 2), "r"(p + 4), "r"(p + 6), "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14) : HD7_CLOBBERS
    );
  }
  /* Pass 2: combine the eight chunks, two columns at a time. */
  for (int col = 0; col < 16; col += 2) {
    double *p = buf + col;
    __asm__ volatile (
      HD7_LOAD8
      HD7_CROSS8
      HD7_STORE8
      :: "r"(p), "r"(p + 16), "r"(p + 32), "r"(p + 48), "r"(p + 64), "r"(p + 80), "r"(p + 96), "r"(p + 112) : HD7_CLOBBERS
    );
  }
}

#undef HD7_LD
#undef HD7_ST
#undef HD7_LANE
#undef HD7_BFLY
#undef HD7_LOAD8
#undef HD7_CROSS8
#undef HD7_STORE8
#undef HD7_CLOBBERS
void helper_double_8_recursive(double *buf, int depth);

/* Building blocks for the asm templates; each expands to the literal
 * instruction text. */
#define HD8_LD(n) "movupd (%" #n "), %%xmm" #n "\n"
#define HD8_ST(r, n) "movupd %%xmm" #r ", (%" #n ")\n"
/* Butterfly inside one register: {a, b} -> {a + b, a - b} (uses xmm8/xmm9). */
#define HD8_LANE(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"
/* Butterfly between registers: s = a + b, d = a - b. */
#define HD8_BFLY(a, b, s, d) \
  "movapd %%xmm" #a ", %%xmm" #s "\n" \
  "movapd %%xmm" #a ", %%xmm" #d "\n" \
  "addpd %%xmm" #b ", %%xmm" #s "\n" \
  "subpd %%xmm" #b ", %%xmm" #d "\n"
#define HD8_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * Combine eight consecutive sub-blocks of `stride` doubles starting at
 * `base`: for every pair of columns, apply butterflies at strides
 * `stride`, 2 * `stride` and 4 * `stride`.
 */
static inline void helper_double_8_cross_pass(double *base, int stride) {
  for (int col = 0; col < stride; col += 2) {
    double *p = base + col;
    __asm__ volatile (
      HD8_LD(0) HD8_LD(1) HD8_LD(2) HD8_LD(3)
      HD8_LD(4) HD8_LD(5) HD8_LD(6) HD8_LD(7)
      HD8_BFLY(0, 1, 8, 9) HD8_BFLY(2, 3, 10, 11)
      HD8_BFLY(4, 5, 12, 13) HD8_BFLY(6, 7, 14, 15)
      HD8_BFLY(8, 10, 0, 2) HD8_BFLY(9, 11, 1, 3)
      HD8_BFLY(12, 14, 4, 6) HD8_BFLY(13, 15, 5, 7)
      HD8_BFLY(0, 4, 8, 12) HD8_BFLY(1, 5, 9, 13)
      HD8_BFLY(2, 6, 10, 14) HD8_BFLY(3, 7, 11, 15)
      HD8_ST(8, 0) HD8_ST(9, 1) HD8_ST(10, 2) HD8_ST(11, 3)
      HD8_ST(12, 4) HD8_ST(13, 5) HD8_ST(14, 6) HD8_ST(15, 7)
      :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride), "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride), "r"(p + 7 * stride) : HD8_CLOBBERS
    );
  }
}

/*
 * Recursive in-place kernel for 256 doubles.
 *
 *   depth == 2: 4-point kernel on buf[0..3] (butterflies at strides 1, 2).
 *   depth == 5: depth-2 kernel on each of the eight 4-element sub-blocks of
 *               buf[0..31], then a combining pass with stride 4.
 *   depth == 8: depth-5 kernel on each of the eight 32-element sub-blocks of
 *               buf[0..255], then a combining pass with stride 32.
 *   otherwise:  no effect.
 */
void helper_double_8_recursive(double *buf, int depth) {
  switch (depth) {
  case 2:
    __asm__ volatile (
      HD8_LD(0) HD8_LD(1)
      HD8_LANE(0) HD8_LANE(1)
      HD8_BFLY(0, 1, 8, 9)
      HD8_ST(8, 0) HD8_ST(9, 1)
      :: "r"(buf), "r"(buf + 2) : HD8_CLOBBERS
    );
    return;
  case 5:
    for (int blk = 0; blk < 32; blk += 4) {
      helper_double_8_recursive(buf + blk, 2);
    }
    helper_double_8_cross_pass(buf, 4);
    return;
  case 8:
    for (int blk = 0; blk < 256; blk += 32) {
      helper_double_8_recursive(buf + blk, 5);
    }
    helper_double_8_cross_pass(buf, 32);
    return;
  default:
    return;
  }
}

#undef HD8_LD
#undef HD8_ST
#undef HD8_LANE
#undef HD8_BFLY
#undef HD8_CLOBBERS
/*
 * helper_double_8: entry point that forwards `buf` to
 * helper_double_8_recursive with the top-level depth of 8.
 * NOTE(review): presumably buf must hold 2^8 = 256 doubles; confirm against
 * helper_double_8_recursive.
 */
void helper_double_8(double *buf);
void helper_double_8(double *buf) {
  enum { TOP_DEPTH = 8 };
  helper_double_8_recursive(buf, TOP_DEPTH);
}
/*
 * helper_double_9_recursive: in-place, unnormalized Hadamard butterfly passes
 * over `buf`, written with packed-double SSE instructions.
 *
 *   depth == 6 : transforms the 64 doubles buf[0..63].
 *   depth == 9 : runs depth 6 on each of the eight 64-double slices, then
 *                merges them, so buf[0..511] is transformed.
 *   otherwise  : returns without touching buf.
 *
 * All loads/stores are movupd, so buf needs no particular alignment.  Every
 * asm statement declares xmm0-xmm15 and memory as clobbered.
 *
 * The HD9_* macros only assemble the asm template text; they are private to
 * this function and are undefined again right after it.
 */
#define HD9_LOAD(slot, reg)  "movupd (%" #slot "), %%xmm" #reg "\n"
#define HD9_STORE(reg, slot) "movupd %%xmm" #reg ", (%" #slot ")\n"
/* (a, b) held in one register becomes (a + b, a - b); scratch: xmm8, xmm9. */
#define HD9_PAIR(reg) \
  "movapd %%xmm" #reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #reg "\n"
/* sum = a + b, diff = a - b (element-wise); a and b are left unchanged. */
#define HD9_BFLY(a, b, sum, diff) \
  "movapd %%xmm" #a ", %%xmm" #sum "\n" \
  "movapd %%xmm" #a ", %%xmm" #diff "\n" \
  "addpd %%xmm" #b ", %%xmm" #sum "\n" \
  "subpd %%xmm" #b ", %%xmm" #diff "\n"
#define HD9_LOAD8 \
  HD9_LOAD(0, 0) HD9_LOAD(1, 1) HD9_LOAD(2, 2) HD9_LOAD(3, 3) \
  HD9_LOAD(4, 4) HD9_LOAD(5, 5) HD9_LOAD(6, 6) HD9_LOAD(7, 7)
/* Three butterfly levels across xmm0-xmm7; results end up in xmm8-xmm15. */
#define HD9_RADIX8 \
  HD9_BFLY(0, 1, 8, 9) HD9_BFLY(2, 3, 10, 11) \
  HD9_BFLY(4, 5, 12, 13) HD9_BFLY(6, 7, 14, 15) \
  HD9_BFLY(8, 10, 0, 2) HD9_BFLY(9, 11, 1, 3) \
  HD9_BFLY(12, 14, 4, 6) HD9_BFLY(13, 15, 5, 7) \
  HD9_BFLY(0, 4, 8, 12) HD9_BFLY(1, 5, 9, 13) \
  HD9_BFLY(2, 6, 10, 14) HD9_BFLY(3, 7, 11, 15)
#define HD9_STORE8 \
  HD9_STORE(8, 0) HD9_STORE(9, 1) HD9_STORE(10, 2) HD9_STORE(11, 3) \
  HD9_STORE(12, 4) HD9_STORE(13, 5) HD9_STORE(14, 6) HD9_STORE(15, 7)
#define HD9_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
void helper_double_9_recursive(double *buf, int depth);
void helper_double_9_recursive(double *buf, int depth) {
  switch (depth) {
  case 6:
    /* Levels 1-4: each group of 16 consecutive doubles is handled in one go. */
    for (int base = 0; base < 64; base += 16) {
      double *p = buf + base;
      __asm__ volatile (
        HD9_LOAD8
        HD9_PAIR(0) HD9_PAIR(1) HD9_PAIR(2) HD9_PAIR(3)
        HD9_PAIR(4) HD9_PAIR(5) HD9_PAIR(6) HD9_PAIR(7)
        HD9_RADIX8
        HD9_STORE8
        :: "r"(p), "r"(p + 2), "r"(p + 4), "r"(p + 6), "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14) : HD9_CLOBBERS
      );
    }
    /* Levels 5-6: combine the four 16-double groups (strides 16 and 32). */
    for (int off = 0; off < 16; off += 2) {
      double *p = buf + off;
      __asm__ volatile (
        HD9_LOAD(0, 0) HD9_LOAD(1, 1) HD9_LOAD(2, 2) HD9_LOAD(3, 3)
        HD9_BFLY(0, 1, 8, 9) HD9_BFLY(2, 3, 10, 11)
        HD9_BFLY(8, 10, 0, 2) HD9_BFLY(9, 11, 1, 3)
        HD9_STORE(0, 0) HD9_STORE(1, 1) HD9_STORE(2, 2) HD9_STORE(3, 3)
        :: "r"(p), "r"(p + 16), "r"(p + 32), "r"(p + 48) : HD9_CLOBBERS
      );
    }
    break;
  case 9:
    /* Transform the eight 64-double slices independently ... */
    for (int slice = 0; slice < 8; ++slice) {
      helper_double_9_recursive(buf + 64 * slice, 6);
    }
    /* ... then levels 7-9 merge them (strides 64, 128 and 256). */
    for (int off = 0; off < 64; off += 2) {
      double *p = buf + off;
      __asm__ volatile (
        HD9_LOAD8
        HD9_RADIX8
        HD9_STORE8
        :: "r"(p), "r"(p + 64), "r"(p + 128), "r"(p + 192), "r"(p + 256), "r"(p + 320), "r"(p + 384), "r"(p + 448) : HD9_CLOBBERS
      );
    }
    break;
  default:
    /* Unsupported depth: nothing to do. */
    break;
  }
}
#undef HD9_CLOBBERS
#undef HD9_STORE8
#undef HD9_RADIX8
#undef HD9_LOAD8
#undef HD9_BFLY
#undef HD9_PAIR
#undef HD9_STORE
#undef HD9_LOAD
/*
 * helper_double_9: entry point for the 512-element (2^9) in-place transform.
 * Forwards `buf` to helper_double_9_recursive with the top-level depth of 9,
 * which reads and writes buf[0..511].
 */
void helper_double_9(double *buf);
void helper_double_9(double *buf) {
  enum { TOP_DEPTH = 9 };
  helper_double_9_recursive(buf, TOP_DEPTH);
}
/*
 * helper_double_10_recursive: in-place, unnormalized Hadamard butterfly
 * passes over the 1024 doubles buf[0..1023], written with packed-double SSE
 * instructions.
 *
 * Only depth == 10 is handled; for any other depth the function returns
 * without touching buf.  Despite the name, this variant never calls itself:
 * the ten levels are done as three flat sweeps (4 + 3 + 3 levels).
 *
 * All loads/stores are movupd, so buf needs no particular alignment.  Every
 * asm statement declares xmm0-xmm15 and memory as clobbered.
 *
 * The HD10_* macros only assemble the asm template text; they are private to
 * this function and are undefined again right after it.
 */
#define HD10_LOAD(slot, reg)  "movupd (%" #slot "), %%xmm" #reg "\n"
#define HD10_STORE(reg, slot) "movupd %%xmm" #reg ", (%" #slot ")\n"
/* (a, b) held in one register becomes (a + b, a - b); scratch: xmm8, xmm9. */
#define HD10_PAIR(reg) \
  "movapd %%xmm" #reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #reg "\n"
/* sum = a + b, diff = a - b (element-wise); a and b are left unchanged. */
#define HD10_BFLY(a, b, sum, diff) \
  "movapd %%xmm" #a ", %%xmm" #sum "\n" \
  "movapd %%xmm" #a ", %%xmm" #diff "\n" \
  "addpd %%xmm" #b ", %%xmm" #sum "\n" \
  "subpd %%xmm" #b ", %%xmm" #diff "\n"
#define HD10_LOAD8 \
  HD10_LOAD(0, 0) HD10_LOAD(1, 1) HD10_LOAD(2, 2) HD10_LOAD(3, 3) \
  HD10_LOAD(4, 4) HD10_LOAD(5, 5) HD10_LOAD(6, 6) HD10_LOAD(7, 7)
/* Three butterfly levels across xmm0-xmm7; results end up in xmm8-xmm15. */
#define HD10_RADIX8 \
  HD10_BFLY(0, 1, 8, 9) HD10_BFLY(2, 3, 10, 11) \
  HD10_BFLY(4, 5, 12, 13) HD10_BFLY(6, 7, 14, 15) \
  HD10_BFLY(8, 10, 0, 2) HD10_BFLY(9, 11, 1, 3) \
  HD10_BFLY(12, 14, 4, 6) HD10_BFLY(13, 15, 5, 7) \
  HD10_BFLY(0, 4, 8, 12) HD10_BFLY(1, 5, 9, 13) \
  HD10_BFLY(2, 6, 10, 14) HD10_BFLY(3, 7, 11, 15)
#define HD10_STORE8 \
  HD10_STORE(8, 0) HD10_STORE(9, 1) HD10_STORE(10, 2) HD10_STORE(11, 3) \
  HD10_STORE(12, 4) HD10_STORE(13, 5) HD10_STORE(14, 6) HD10_STORE(15, 7)
#define HD10_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
void helper_double_10_recursive(double *buf, int depth);
void helper_double_10_recursive(double *buf, int depth) {
  if (depth != 10) {
    return;
  }

  /* Levels 1-4: each group of 16 consecutive doubles is handled in one go. */
  for (int base = 0; base < 1024; base += 16) {
    double *p = buf + base;
    __asm__ volatile (
      HD10_LOAD8
      HD10_PAIR(0) HD10_PAIR(1) HD10_PAIR(2) HD10_PAIR(3)
      HD10_PAIR(4) HD10_PAIR(5) HD10_PAIR(6) HD10_PAIR(7)
      HD10_RADIX8
      HD10_STORE8
      :: "r"(p), "r"(p + 2), "r"(p + 4), "r"(p + 6), "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14) : HD10_CLOBBERS
    );
  }

  /* Levels 5-7: within each 128-double block, strides 16, 32 and 64. */
  for (int block = 0; block < 1024; block += 128) {
    for (int off = 0; off < 16; off += 2) {
      double *p = buf + block + off;
      __asm__ volatile (
        HD10_LOAD8
        HD10_RADIX8
        HD10_STORE8
        :: "r"(p), "r"(p + 16), "r"(p + 32), "r"(p + 48), "r"(p + 64), "r"(p + 80), "r"(p + 96), "r"(p + 112) : HD10_CLOBBERS
      );
    }
  }

  /* Levels 8-10: across the whole buffer, strides 128, 256 and 512. */
  for (int off = 0; off < 128; off += 2) {
    double *p = buf + off;
    __asm__ volatile (
      HD10_LOAD8
      HD10_RADIX8
      HD10_STORE8
      :: "r"(p), "r"(p + 128), "r"(p + 256), "r"(p + 384), "r"(p + 512), "r"(p + 640), "r"(p + 768), "r"(p + 896) : HD10_CLOBBERS
    );
  }
}
#undef HD10_CLOBBERS
#undef HD10_STORE8
#undef HD10_RADIX8
#undef HD10_LOAD8
#undef HD10_BFLY
#undef HD10_PAIR
#undef HD10_STORE
#undef HD10_LOAD
/*
 * helper_double_10: entry point for the 1024-element (2^10) in-place
 * transform.  Forwards `buf` to helper_double_10_recursive with depth 10,
 * which reads and writes buf[0..1023].
 */
void helper_double_10(double *buf);
void helper_double_10(double *buf) {
  enum { TOP_DEPTH = 10 };
  helper_double_10_recursive(buf, TOP_DEPTH);
}
/*
 * helper_double_11_recursive: in-place, unnormalized Hadamard butterfly
 * passes over the first 2^depth doubles of `buf`, written with packed-double
 * SSE instructions.
 *
 *   depth == 2         : transforms the 4 doubles buf[0..3] directly.
 *   depth == 5, 8, 11  : splits the 2^depth doubles into eight equal
 *                        sub-blocks, transforms each with depth - 3, then
 *                        applies three more butterfly levels across them.
 *   otherwise          : returns without touching buf.
 *
 * All loads/stores are movupd, so buf needs no particular alignment.  Every
 * asm statement declares xmm0-xmm15 and memory as clobbered.
 *
 * The HD11_* macros only assemble the asm template text; they are private to
 * this function and are undefined again right after it.
 */
#define HD11_LOAD(slot, reg)  "movupd (%" #slot "), %%xmm" #reg "\n"
#define HD11_STORE(reg, slot) "movupd %%xmm" #reg ", (%" #slot ")\n"
/* (a, b) held in one register becomes (a + b, a - b); scratch: xmm8, xmm9. */
#define HD11_PAIR(reg) \
  "movapd %%xmm" #reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #reg "\n"
/* sum = a + b, diff = a - b (element-wise); a and b are left unchanged. */
#define HD11_BFLY(a, b, sum, diff) \
  "movapd %%xmm" #a ", %%xmm" #sum "\n" \
  "movapd %%xmm" #a ", %%xmm" #diff "\n" \
  "addpd %%xmm" #b ", %%xmm" #sum "\n" \
  "subpd %%xmm" #b ", %%xmm" #diff "\n"
#define HD11_LOAD8 \
  HD11_LOAD(0, 0) HD11_LOAD(1, 1) HD11_LOAD(2, 2) HD11_LOAD(3, 3) \
  HD11_LOAD(4, 4) HD11_LOAD(5, 5) HD11_LOAD(6, 6) HD11_LOAD(7, 7)
/* Three butterfly levels across xmm0-xmm7; results end up in xmm8-xmm15. */
#define HD11_RADIX8 \
  HD11_BFLY(0, 1, 8, 9) HD11_BFLY(2, 3, 10, 11) \
  HD11_BFLY(4, 5, 12, 13) HD11_BFLY(6, 7, 14, 15) \
  HD11_BFLY(8, 10, 0, 2) HD11_BFLY(9, 11, 1, 3) \
  HD11_BFLY(12, 14, 4, 6) HD11_BFLY(13, 15, 5, 7) \
  HD11_BFLY(0, 4, 8, 12) HD11_BFLY(1, 5, 9, 13) \
  HD11_BFLY(2, 6, 10, 14) HD11_BFLY(3, 7, 11, 15)
#define HD11_STORE8 \
  HD11_STORE(8, 0) HD11_STORE(9, 1) HD11_STORE(10, 2) HD11_STORE(11, 3) \
  HD11_STORE(12, 4) HD11_STORE(13, 5) HD11_STORE(14, 6) HD11_STORE(15, 7)
#define HD11_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
void helper_double_11_recursive(double *buf, int depth);
void helper_double_11_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: both levels of a 4-point transform in two registers. */
    __asm__ volatile (
      HD11_LOAD(0, 0) HD11_LOAD(1, 1)
      HD11_PAIR(0) HD11_PAIR(1)
      HD11_BFLY(0, 1, 8, 9)
      HD11_STORE(8, 0) HD11_STORE(9, 1)
      :: "r"(buf), "r"(buf + 2) : HD11_CLOBBERS
    );
    return;
  }

  if (depth == 5 || depth == 8 || depth == 11) {
    const int sub_depth = depth - 3;
    const int span = 1 << sub_depth; /* doubles per sub-block: 4, 32 or 256 */

    /* Transform the eight sub-blocks independently, lowest address first. */
    for (int part = 0; part < 8; ++part) {
      helper_double_11_recursive(buf + part * span, sub_depth);
    }

    /* Top three levels: butterflies at strides span, 2*span and 4*span. */
    for (int off = 0; off < span; off += 2) {
      double *p = buf + off;
      __asm__ volatile (
        HD11_LOAD8
        HD11_RADIX8
        HD11_STORE8
        :: "r"(p), "r"(p + span), "r"(p + 2 * span), "r"(p + 3 * span), "r"(p + 4 * span), "r"(p + 5 * span), "r"(p + 6 * span), "r"(p + 7 * span) : HD11_CLOBBERS
      );
    }
    return;
  }

  /* Any other depth is not part of this plan: leave buf untouched. */
}
#undef HD11_CLOBBERS
#undef HD11_STORE8
#undef HD11_RADIX8
#undef HD11_LOAD8
#undef HD11_BFLY
#undef HD11_PAIR
#undef HD11_STORE
#undef HD11_LOAD
/*
 * helper_double_11: entry point for the 2048-element (2^11) in-place
 * transform.  Forwards `buf` to helper_double_11_recursive with depth 11,
 * which reads and writes buf[0..2047].
 */
void helper_double_11(double *buf);
void helper_double_11(double *buf) {
  enum { TOP_DEPTH = 11 };
  helper_double_11_recursive(buf, TOP_DEPTH);
}
/*
 * Assembly-text fragments used only by helper_double_12_recursive.  Every
 * macro expands to adjacent string literals that the compiler concatenates
 * into one asm template; all of them are #undef'd right after the function.
 */
/* Unaligned 16-byte load of operand %slot into xmm<slot>. */
#define HD12R_LOAD(slot) "movupd (%" #slot "), %%xmm" #slot "\n"
/* Unaligned 16-byte store of xmm<reg> to operand %slot. */
#define HD12R_STORE(reg, slot) "movupd %%xmm" #reg ", (%" #slot ")\n"
/* xmm<sum> = xmm<a> + xmm<b>, xmm<diff> = xmm<a> - xmm<b>; a and b are kept. */
#define HD12R_SUM_DIFF(a, b, sum, diff) \
  "movapd %%xmm" #a ", %%xmm" #sum "\n" \
  "movapd %%xmm" #a ", %%xmm" #diff "\n" \
  "addpd %%xmm" #b ", %%xmm" #sum "\n" \
  "subpd %%xmm" #b ", %%xmm" #diff "\n"
/* Inside one register: [lo, hi] -> [lo + hi, lo - hi]; uses xmm8/xmm9 as scratch. */
#define HD12R_WITHIN(reg) \
  "movapd %%xmm" #reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #reg "\n"
#define HD12R_LOAD4 HD12R_LOAD(0) HD12R_LOAD(1) HD12R_LOAD(2) HD12R_LOAD(3)
#define HD12R_LOAD8 HD12R_LOAD4 HD12R_LOAD(4) HD12R_LOAD(5) HD12R_LOAD(6) HD12R_LOAD(7)
/* Two butterfly levels over xmm0..xmm3; results end up in xmm0..xmm3. */
#define HD12R_RADIX4 \
  HD12R_SUM_DIFF(0, 1, 8, 9) HD12R_SUM_DIFF(2, 3, 10, 11) \
  HD12R_SUM_DIFF(8, 10, 0, 2) HD12R_SUM_DIFF(9, 11, 1, 3)
/* Three butterfly levels over xmm0..xmm7; results end up in xmm8..xmm15. */
#define HD12R_RADIX8 \
  HD12R_SUM_DIFF(0, 1, 8, 9) HD12R_SUM_DIFF(2, 3, 10, 11) \
  HD12R_SUM_DIFF(4, 5, 12, 13) HD12R_SUM_DIFF(6, 7, 14, 15) \
  HD12R_SUM_DIFF(8, 10, 0, 2) HD12R_SUM_DIFF(9, 11, 1, 3) \
  HD12R_SUM_DIFF(12, 14, 4, 6) HD12R_SUM_DIFF(13, 15, 5, 7) \
  HD12R_SUM_DIFF(0, 4, 8, 12) HD12R_SUM_DIFF(1, 5, 9, 13) \
  HD12R_SUM_DIFF(2, 6, 10, 14) HD12R_SUM_DIFF(3, 7, 11, 15)
#define HD12R_STORE4 \
  HD12R_STORE(0, 0) HD12R_STORE(1, 1) HD12R_STORE(2, 2) HD12R_STORE(3, 3)
#define HD12R_STORE8 \
  HD12R_STORE(8, 0) HD12R_STORE(9, 1) HD12R_STORE(10, 2) HD12R_STORE(11, 3) \
  HD12R_STORE(12, 4) HD12R_STORE(13, 5) HD12R_STORE(14, 6) HD12R_STORE(15, 7)
/* Input operand lists: 4 or 8 pointers starting at p, s doubles apart. */
#define HD12R_PTRS4(p, s) \
  "r"((p)), "r"((p) + (s)), "r"((p) + 2 * (s)), "r"((p) + 3 * (s))
#define HD12R_PTRS8(p, s) \
  HD12R_PTRS4(p, s), \
  "r"((p) + 4 * (s)), "r"((p) + 5 * (s)), "r"((p) + 6 * (s)), "r"((p) + 7 * (s))
/* Every asm statement below declares the full xmm file plus memory as clobbered. */
#define HD12R_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", \
  "memory"

/*
 * In-place, unnormalized sum/difference butterfly transform on doubles
 * (each stage replaces a pair (u, v) that is `stride` apart by (u + v, u - v)).
 *
 *   depth == 10: transforms buf[0..1023]  (strides 1 through 512).
 *   depth == 12: transforms buf[0..4095]: the four 1024-element quarters are
 *                processed with depth 10, then combined at strides 1024 and 2048.
 *   any other depth: returns without touching buf.
 *
 * The kernels are inline x86-64 SSE assembly (haddpd/hsubpd/blendpd and
 * xmm8..xmm15 are used); all memory accesses are unaligned (movupd).
 */
void helper_double_12_recursive(double *buf, int depth);
void helper_double_12_recursive(double *buf, int depth) {
  switch (depth) {
  case 10:
    /* Strides 1, 2, 4, 8: one asm statement per group of 16 doubles.  The
       stride-1 stage happens inside each register, then a radix-8 butterfly
       combines the eight registers (which sit 2 doubles apart). */
    for (int base = 0; base < 1024; base += 16) {
      double *p = buf + base;
      __asm__ volatile (
        HD12R_LOAD8
        HD12R_WITHIN(0) HD12R_WITHIN(1) HD12R_WITHIN(2) HD12R_WITHIN(3)
        HD12R_WITHIN(4) HD12R_WITHIN(5) HD12R_WITHIN(6) HD12R_WITHIN(7)
        HD12R_RADIX8
        HD12R_STORE8
        :: HD12R_PTRS8(p, 2) : HD12R_CLOBBERS
      );
    }
    /* Two radix-8 passes: strides 16/32/64, then strides 128/256/512. */
    for (int stride = 16; stride <= 128; stride *= 8) {
      const int span = 8 * stride;
      for (int base = 0; base < 1024; base += span) {
        for (int lane = 0; lane < stride; lane += 2) {
          double *p = buf + base + lane;
          __asm__ volatile (
            HD12R_LOAD8
            HD12R_RADIX8
            HD12R_STORE8
            :: HD12R_PTRS8(p, stride) : HD12R_CLOBBERS
          );
        }
      }
    }
    break;
  case 12:
    /* Transform each 1024-element quarter on its own first... */
    for (int quarter = 0; quarter < 4096; quarter += 1024) {
      helper_double_12_recursive(buf + quarter, 10);
    }
    /* ...then merge the quarters with one radix-4 pass (strides 1024, 2048). */
    for (int lane = 0; lane < 1024; lane += 2) {
      double *p = buf + lane;
      __asm__ volatile (
        HD12R_LOAD4
        HD12R_RADIX4
        HD12R_STORE4
        :: HD12R_PTRS4(p, 1024) : HD12R_CLOBBERS
      );
    }
    break;
  default:
    /* Unsupported depth: nothing to do. */
    break;
  }
}
#undef HD12R_LOAD
#undef HD12R_STORE
#undef HD12R_SUM_DIFF
#undef HD12R_WITHIN
#undef HD12R_LOAD4
#undef HD12R_LOAD8
#undef HD12R_RADIX4
#undef HD12R_RADIX8
#undef HD12R_STORE4
#undef HD12R_STORE8
#undef HD12R_PTRS4
#undef HD12R_PTRS8
#undef HD12R_CLOBBERS
/*
 * Externally visible wrapper: runs helper_double_12_recursive on buf with a
 * starting depth of 12, which transforms buf[0..4095] in place.
 */
void helper_double_12(double *buf);
void helper_double_12(double *buf) {
  enum { START_DEPTH = 12 };
  helper_double_12_recursive(buf, START_DEPTH);
}
/*
 * Assembly-text fragments used only by helper_double_13.  Every macro
 * expands to adjacent string literals that the compiler concatenates into
 * one asm template; all of them are #undef'd right after the function.
 */
/* Unaligned 16-byte load of operand %slot into xmm<slot>. */
#define HD13_LOAD(slot) "movupd (%" #slot "), %%xmm" #slot "\n"
/* Unaligned 16-byte store of xmm<reg> to operand %slot. */
#define HD13_STORE(reg, slot) "movupd %%xmm" #reg ", (%" #slot ")\n"
/* xmm<sum> = xmm<a> + xmm<b>, xmm<diff> = xmm<a> - xmm<b>; a and b are kept. */
#define HD13_SUM_DIFF(a, b, sum, diff) \
  "movapd %%xmm" #a ", %%xmm" #sum "\n" \
  "movapd %%xmm" #a ", %%xmm" #diff "\n" \
  "addpd %%xmm" #b ", %%xmm" #sum "\n" \
  "subpd %%xmm" #b ", %%xmm" #diff "\n"
/* Inside one register: [lo, hi] -> [lo + hi, lo - hi]; uses xmm8/xmm9 as scratch. */
#define HD13_WITHIN(reg) \
  "movapd %%xmm" #reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #reg "\n"
#define HD13_LOAD8 \
  HD13_LOAD(0) HD13_LOAD(1) HD13_LOAD(2) HD13_LOAD(3) \
  HD13_LOAD(4) HD13_LOAD(5) HD13_LOAD(6) HD13_LOAD(7)
/* Three butterfly levels over xmm0..xmm7; results end up in xmm8..xmm15. */
#define HD13_RADIX8 \
  HD13_SUM_DIFF(0, 1, 8, 9) HD13_SUM_DIFF(2, 3, 10, 11) \
  HD13_SUM_DIFF(4, 5, 12, 13) HD13_SUM_DIFF(6, 7, 14, 15) \
  HD13_SUM_DIFF(8, 10, 0, 2) HD13_SUM_DIFF(9, 11, 1, 3) \
  HD13_SUM_DIFF(12, 14, 4, 6) HD13_SUM_DIFF(13, 15, 5, 7) \
  HD13_SUM_DIFF(0, 4, 8, 12) HD13_SUM_DIFF(1, 5, 9, 13) \
  HD13_SUM_DIFF(2, 6, 10, 14) HD13_SUM_DIFF(3, 7, 11, 15)
#define HD13_STORE8 \
  HD13_STORE(8, 0) HD13_STORE(9, 1) HD13_STORE(10, 2) HD13_STORE(11, 3) \
  HD13_STORE(12, 4) HD13_STORE(13, 5) HD13_STORE(14, 6) HD13_STORE(15, 7)
/* Input operand list: 8 pointers starting at p, s doubles apart. */
#define HD13_PTRS8(p, s) \
  "r"((p)), "r"((p) + (s)), "r"((p) + 2 * (s)), "r"((p) + 3 * (s)), \
  "r"((p) + 4 * (s)), "r"((p) + 5 * (s)), "r"((p) + 6 * (s)), "r"((p) + 7 * (s))
/* Every asm statement below declares the full xmm file plus memory as clobbered. */
#define HD13_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", \
  "memory"

/*
 * In-place, unnormalized sum/difference butterfly transform of buf[0..8191]
 * (each stage replaces a pair (u, v) that is `stride` apart by (u + v, u - v)),
 * done as four sweeps over the whole buffer covering strides 1 through 4096.
 *
 * The kernels are inline x86-64 SSE assembly (haddpd/hsubpd/blendpd and
 * xmm8..xmm15 are used); all memory accesses are unaligned (movupd).
 */
static inline void helper_double_13(double *buf);
static inline void helper_double_13(double *buf) {
  /* Sweep 1 -- strides 1, 2, 4, 8: the stride-1 stage happens inside each
     register, then a radix-8 butterfly combines eight registers that sit
     2 doubles apart.  One asm statement handles 16 consecutive doubles. */
  for (int base = 0; base < 8192; base += 16) {
    double *p = buf + base;
    __asm__ volatile (
      HD13_LOAD8
      HD13_WITHIN(0) HD13_WITHIN(1) HD13_WITHIN(2) HD13_WITHIN(3)
      HD13_WITHIN(4) HD13_WITHIN(5) HD13_WITHIN(6) HD13_WITHIN(7)
      HD13_RADIX8
      HD13_STORE8
      :: HD13_PTRS8(p, 2) : HD13_CLOBBERS
    );
  }
  /* Sweeps 2-4 -- radix-8 butterflies whose smallest stride is 16, then 128,
     then 1024 (so strides 16..64, 128..512 and 1024..4096 respectively). */
  for (int stride = 16; stride <= 1024; stride *= 8) {
    const int span = 8 * stride;
    for (int base = 0; base < 8192; base += span) {
      for (int lane = 0; lane < stride; lane += 2) {
        double *p = buf + base + lane;
        __asm__ volatile (
          HD13_LOAD8
          HD13_RADIX8
          HD13_STORE8
          :: HD13_PTRS8(p, stride) : HD13_CLOBBERS
        );
      }
    }
  }
}
#undef HD13_LOAD
#undef HD13_STORE
#undef HD13_SUM_DIFF
#undef HD13_WITHIN
#undef HD13_LOAD8
#undef HD13_RADIX8
#undef HD13_STORE8
#undef HD13_PTRS8
#undef HD13_CLOBBERS
void helper_double_14_recursive(double *buf, int depth);
void helper_double_14_recursive(double *buf, int depth) {
  if (depth == 9) {
    for (int j = 0; j < 512; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 512; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 12) {
    helper_double_14_recursive(buf + 0, 9);
    helper_double_14_recursive(buf + 512, 9);
    helper_double_14_recursive(buf + 1024, 9);
    helper_double_14_recursive(buf + 1536, 9);
    helper_double_14_recursive(buf + 2048, 9);
    helper_double_14_recursive(buf + 2560, 9);
    helper_double_14_recursive(buf + 3072, 9);
    helper_double_14_recursive(buf + 3584, 9);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    helper_double_14_recursive(buf + 0, 12);
    helper_double_14_recursive(buf + 4096, 12);
    helper_double_14_recursive(buf + 8192, 12);
    helper_double_14_recursive(buf + 12288, 12);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Entry point for the 2^14-element case: runs helper_double_14_recursive on
 * buf starting at depth 14.
 *
 * buf is transformed in place and nothing is returned.  The depth-14 branch of
 * helper_double_14_recursive reads and writes buf[0] through buf[16383], so
 * buf must point at no fewer than 16384 doubles.
 */
void helper_double_14(double *buf);
void helper_double_14(double *buf) {
  helper_double_14_recursive(buf, 14);
}
/*
 * String fragments used to assemble the inline-asm templates below.  Each
 * fragment expands to one or more "instruction\n" string literals that the
 * compiler concatenates into a single template.
 */

/* Load the two doubles at asm operand `slot` into the xmm register of the same number. */
#define HD15_LOAD(slot) "movupd (%" #slot "), %%xmm" #slot "\n"

/* Store xmm register `reg` to the two doubles at asm operand `slot`. */
#define HD15_STORE(reg, slot) "movupd %%xmm" #reg ", (%" #slot ")\n"

/* Packed butterfly: sum = a + b, diff = a - b (a and b are left untouched). */
#define HD15_BFLY(a, b, sum, diff) \
  "movapd %%xmm" #a ", %%xmm" #sum "\n" \
  "movapd %%xmm" #a ", %%xmm" #diff "\n" \
  "addpd %%xmm" #b ", %%xmm" #sum "\n" \
  "subpd %%xmm" #b ", %%xmm" #diff "\n"

/*
 * Butterfly between the two lanes of one register: (x, y) becomes
 * (x + y, x - y).  Uses xmm8 and xmm9 as scratch.
 */
#define HD15_LANES(r) \
  "movapd %%xmm" #r ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%xmm" #r ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%xmm" #r "\n"

#define HD15_LOAD8 \
  HD15_LOAD(0) HD15_LOAD(1) HD15_LOAD(2) HD15_LOAD(3) \
  HD15_LOAD(4) HD15_LOAD(5) HD15_LOAD(6) HD15_LOAD(7)

/*
 * Three butterfly stages over xmm0..xmm7 (register distance 1, then 2,
 * then 4); the results end up in xmm8..xmm15.
 */
#define HD15_RADIX8 \
  HD15_BFLY(0, 1, 8, 9) HD15_BFLY(2, 3, 10, 11) \
  HD15_BFLY(4, 5, 12, 13) HD15_BFLY(6, 7, 14, 15) \
  HD15_BFLY(8, 10, 0, 2) HD15_BFLY(9, 11, 1, 3) \
  HD15_BFLY(12, 14, 4, 6) HD15_BFLY(13, 15, 5, 7) \
  HD15_BFLY(0, 4, 8, 12) HD15_BFLY(1, 5, 9, 13) \
  HD15_BFLY(2, 6, 10, 14) HD15_BFLY(3, 7, 11, 15)

#define HD15_STORE8 \
  HD15_STORE(8, 0) HD15_STORE(9, 1) HD15_STORE(10, 2) HD15_STORE(11, 3) \
  HD15_STORE(12, 4) HD15_STORE(13, 5) HD15_STORE(14, 6) HD15_STORE(15, 7)

/* Every asm statement declares all sixteen xmm registers and memory as clobbered. */
#define HD15_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", \
  "memory"

/*
 * Transforms the 16 consecutive doubles starting at p in place: first a
 * butterfly inside every adjacent pair, then the three-stage network across
 * the eight pairs.
 */
static inline void hd15_first16(double *p) {
  __asm__ volatile (
    HD15_LOAD8
    HD15_LANES(0) HD15_LANES(1) HD15_LANES(2) HD15_LANES(3)
    HD15_LANES(4) HD15_LANES(5) HD15_LANES(6) HD15_LANES(7)
    HD15_RADIX8
    HD15_STORE8
    :: "r"(p), "r"(p + 2), "r"(p + 4), "r"(p + 6),
       "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14)
    : HD15_CLOBBERS
  );
}

/*
 * Applies the three-stage network in place to the eight two-double groups at
 * p, p + stride, ..., p + 7 * stride.
 */
static inline void hd15_radix8(double *p, int stride) {
  __asm__ volatile (
    HD15_LOAD8
    HD15_RADIX8
    HD15_STORE8
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
       "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride), "r"(p + 7 * stride)
    : HD15_CLOBBERS
  );
}

/*
 * Applies two butterfly stages in place to the four two-double groups at
 * p, p + stride, p + 2 * stride and p + 3 * stride.
 */
static inline void hd15_radix4(double *p, int stride) {
  __asm__ volatile (
    HD15_LOAD(0) HD15_LOAD(1) HD15_LOAD(2) HD15_LOAD(3)
    HD15_BFLY(0, 1, 8, 9) HD15_BFLY(2, 3, 10, 11)
    HD15_BFLY(8, 10, 0, 2) HD15_BFLY(9, 11, 1, 3)
    HD15_STORE(0, 0) HD15_STORE(1, 1) HD15_STORE(2, 2) HD15_STORE(3, 3)
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
    : HD15_CLOBBERS
  );
}

/*
 * In-place sum/difference (butterfly) transform of the first 2^depth doubles
 * of buf.
 *
 * depth selects how much work is done:
 *   10 - transforms buf[0..1023] directly in three passes;
 *   13 - transforms eight 1024-element blocks at depth 10, then combines them
 *        across a stride of 1024 (buf[0..8191]);
 *   15 - transforms four 8192-element blocks at depth 13, then combines them
 *        across a stride of 8192 (buf[0..32767]).
 * Any other depth leaves buf untouched.
 *
 * All memory accesses use unaligned moves (movupd).  The asm relies on
 * xmm8..xmm15, which exist only in 64-bit mode, and on haddpd/hsubpd (SSE3)
 * and blendpd (SSE4.1).
 */
void helper_double_15_recursive(double *buf, int depth);
void helper_double_15_recursive(double *buf, int depth) {
  switch (depth) {
  case 10:
    /* Pass 1: butterfly distances 1, 2, 4, 8 inside each 16-element block. */
    for (int base = 0; base < 1024; base += 16) {
      hd15_first16(buf + base);
    }
    /* Pass 2: distances 16, 32, 64 inside each 128-element block. */
    for (int base = 0; base < 1024; base += 128) {
      for (int off = 0; off < 16; off += 2) {
        hd15_radix8(buf + base + off, 16);
      }
    }
    /* Pass 3: distances 128, 256, 512 across the whole 1024-element block. */
    for (int off = 0; off < 128; off += 2) {
      hd15_radix8(buf + off, 128);
    }
    return;

  case 13:
    for (int part = 0; part < 8; ++part) {
      helper_double_15_recursive(buf + part * 1024, 10);
    }
    /* Combine the eight sub-blocks: distances 1024, 2048, 4096. */
    for (int off = 0; off < 1024; off += 2) {
      hd15_radix8(buf + off, 1024);
    }
    return;

  case 15:
    for (int part = 0; part < 4; ++part) {
      helper_double_15_recursive(buf + part * 8192, 13);
    }
    /* Combine the four sub-blocks: distances 8192, 16384. */
    for (int off = 0; off < 8192; off += 2) {
      hd15_radix4(buf + off, 8192);
    }
    return;

  default:
    /* Depths other than 10, 13 and 15 are not handled here. */
    return;
  }
}

#undef HD15_CLOBBERS
#undef HD15_STORE8
#undef HD15_RADIX8
#undef HD15_LOAD8
#undef HD15_LANES
#undef HD15_BFLY
#undef HD15_STORE
#undef HD15_LOAD
/*
 * Entry point for the 2^15-element case: runs helper_double_15_recursive on
 * buf starting at depth 15.
 *
 * buf is transformed in place and nothing is returned.  The depth-15 branch of
 * helper_double_15_recursive reads and writes buf[0] through buf[32767], so
 * buf must point at no fewer than 32768 doubles.
 */
void helper_double_15(double *buf);
void helper_double_15(double *buf) {
  helper_double_15_recursive(buf, 15);
}
void helper_double_16_recursive(double *buf, int depth);
void helper_double_16_recursive(double *buf, int depth) {
  if (depth == 2) {
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    helper_double_16_recursive(buf + 0, 2);
    helper_double_16_recursive(buf + 4, 2);
    helper_double_16_recursive(buf + 8, 2);
    helper_double_16_recursive(buf + 12, 2);
    helper_double_16_recursive(buf + 16, 2);
    helper_double_16_recursive(buf + 20, 2);
    helper_double_16_recursive(buf + 24, 2);
    helper_double_16_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    helper_double_16_recursive(buf + 0, 5);
    helper_double_16_recursive(buf + 32, 5);
    helper_double_16_recursive(buf + 64, 5);
    helper_double_16_recursive(buf + 96, 5);
    helper_double_16_recursive(buf + 128, 5);
    helper_double_16_recursive(buf + 160, 5);
    helper_double_16_recursive(buf + 192, 5);
    helper_double_16_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    helper_double_16_recursive(buf + 0, 8);
    helper_double_16_recursive(buf + 256, 8);
    helper_double_16_recursive(buf + 512, 8);
    helper_double_16_recursive(buf + 768, 8);
    helper_double_16_recursive(buf + 1024, 8);
    helper_double_16_recursive(buf + 1280, 8);
    helper_double_16_recursive(buf + 1536, 8);
    helper_double_16_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    helper_double_16_recursive(buf + 0, 11);
    helper_double_16_recursive(buf + 2048, 11);
    helper_double_16_recursive(buf + 4096, 11);
    helper_double_16_recursive(buf + 6144, 11);
    helper_double_16_recursive(buf + 8192, 11);
    helper_double_16_recursive(buf + 10240, 11);
    helper_double_16_recursive(buf + 12288, 11);
    helper_double_16_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    helper_double_16_recursive(buf + 0, 14);
    helper_double_16_recursive(buf + 16384, 14);
    helper_double_16_recursive(buf + 32768, 14);
    helper_double_16_recursive(buf + 49152, 14);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * helper_double_16 - entry point for the depth-16 case.
 *
 * Forwards buf unchanged to helper_double_16_recursive and asks it to start
 * at depth 16.  The depth-16 branch of that routine reads and overwrites
 * buf[0 .. 65535], so buf must hold at least 65536 doubles.
 */
void helper_double_16(double *buf);
void helper_double_16(double *buf) {
  const int top_level = 16;

  helper_double_16_recursive(buf, top_level);
}
/*
 * helper_double_17_recursive - in-place sum/difference ("butterfly") passes
 * over an array of doubles, written as SSE inline assembly.
 *
 * A butterfly level at distance d replaces every pair (x[i], x[i + d]) by
 * (x[i] + x[i + d], x[i] - x[i + d]).  Each recognised depth applies the
 * levels d = 1, 2, 4, ..., 2^(depth-1), in that order, to
 * buf[0 .. 2^depth - 1], without scaling the result:
 *
 *   depth == 12: four passes over 4096 doubles (4 + 3 + 3 + 2 levels).
 *   depth == 15: eight depth-12 calls on consecutive 4096-double blocks,
 *                then three more levels across the blocks (32768 doubles).
 *   depth == 17: four depth-15 calls on consecutive 32768-double blocks,
 *                then two more levels across the blocks (131072 doubles).
 *
 * Any other depth matches no branch and returns without touching buf.
 *
 * buf:   array that is read and overwritten; the caller must supply at least
 *        2^depth doubles.  All loads and stores use movupd (unaligned move),
 *        so no particular alignment is required.
 * depth: log2 of the number of doubles to process; must be 12, 15 or 17.
 *
 * NOTE(review): this pass structure is that of an unnormalised fast
 * Walsh-Hadamard transform, which matches the "fht.h" include - confirm
 * against the header's documentation.
 *
 * The asm uses xmm8-xmm15 (available only in 64-bit mode) plus haddpd/hsubpd
 * (SSE3) and blendpd (SSE4.1).  The asm statements have no output operands;
 * the "memory" clobber is what tells the compiler that buf is modified, and
 * all sixteen xmm registers are declared clobbered.
 */
void helper_double_17_recursive(double *buf, int depth);
void helper_double_17_recursive(double *buf, int depth) {
  /* Base case: all twelve levels for one 4096-double block, in four passes. */
  if (depth == 12) {
    /* Pass 1 - levels at distances 1, 2, 4 and 8 inside each group of 16
       consecutive doubles.  The k loop executes exactly once (k == 0). */
    for (int j = 0; j < 4096; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load the 16 doubles of this group, two per register. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Distance 1, inside a register holding [a, b]:
             haddpd gives [a + b, a + b] in xmm8, hsubpd gives [a - b, a - b]
             in xmm9, and blendpd $1 copies lane 0 of xmm8 into xmm9, leaving
             [a + b, a - b].  The same six instructions are repeated for
             xmm0 through xmm7 in turn. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Distance 2: pair registers (0,1), (2,3), (4,5), (6,7); the sum
             and difference of each pair land in xmm8..xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Distance 4: pair (8,10), (9,11), (12,14), (13,15); results go
             back into xmm0..xmm7. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Distance 8: pair (0,4), (1,5), (2,6), (3,7); final values end
             up in xmm8..xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Write the results back to the addresses they were loaded from. */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2 - levels at distances 16, 32 and 64 inside each group of 128
       doubles.  Each asm statement handles two adjacent columns (k, k + 1)
       of eight rows spaced 16 doubles apart. */
    for (int j = 0; j < 4096; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Distance 16: registers (0,1), (2,3), (4,5), (6,7) -> xmm8..15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Distance 32: registers (8,10), (9,11), (12,14), (13,15) -> xmm0..7. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Distance 64: registers (0,4), (1,5), (2,6), (3,7) -> xmm8..15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3 - levels at distances 128, 256 and 512 inside each group of
       1024 doubles; same register schedule as pass 2, rows 128 apart. */
    for (int j = 0; j < 4096; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Distance 128. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Distance 256. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Distance 512. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4 - the last two levels (distances 1024 and 2048), using four
       rows spaced 1024 doubles apart.  The j loop executes exactly once. */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Distance 1024: registers (0,1) and (2,3) -> xmm8..xmm11. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Distance 2048: registers (8,10) and (9,11) -> xmm0..xmm3. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 32768 doubles: finish each of the eight 4096-double blocks first, then
     combine them. */
  if (depth == 15) {
    helper_double_17_recursive(buf + 0, 12);
    helper_double_17_recursive(buf + 4096, 12);
    helper_double_17_recursive(buf + 8192, 12);
    helper_double_17_recursive(buf + 12288, 12);
    helper_double_17_recursive(buf + 16384, 12);
    helper_double_17_recursive(buf + 20480, 12);
    helper_double_17_recursive(buf + 24576, 12);
    helper_double_17_recursive(buf + 28672, 12);
    /* Levels at distances 4096, 8192 and 16384 across the eight blocks.
       The j loop executes exactly once. */
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Distance 4096. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Distance 8192. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Distance 16384. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 131072 doubles: finish each of the four 32768-double blocks first, then
     combine them. */
  if (depth == 17) {
    helper_double_17_recursive(buf + 0, 15);
    helper_double_17_recursive(buf + 32768, 15);
    helper_double_17_recursive(buf + 65536, 15);
    helper_double_17_recursive(buf + 98304, 15);
    /* Levels at distances 32768 and 65536 across the four blocks.
       The j loop executes exactly once. */
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 32768; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Distance 32768: registers (0,1) and (2,3) -> xmm8..xmm11. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Distance 65536: registers (8,10) and (9,11) -> xmm0..xmm3. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * helper_double_17 - entry point for the depth-17 case.
 *
 * Forwards buf unchanged to helper_double_17_recursive and asks it to start
 * at depth 17.  The depth-17 branch of that routine reads and overwrites
 * buf[0 .. 131071], so buf must hold at least 131072 doubles.
 */
void helper_double_17(double *buf);
void helper_double_17(double *buf) {
  const int top_level = 17;

  helper_double_17_recursive(buf, top_level);
}
void helper_double_18_recursive(double *buf, int depth);
void helper_double_18_recursive(double *buf, int depth) {
  if (depth == 12) {
    for (int j = 0; j < 4096; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    helper_double_18_recursive(buf + 0, 12);
    helper_double_18_recursive(buf + 4096, 12);
    helper_double_18_recursive(buf + 8192, 12);
    helper_double_18_recursive(buf + 12288, 12);
    helper_double_18_recursive(buf + 16384, 12);
    helper_double_18_recursive(buf + 20480, 12);
    helper_double_18_recursive(buf + 24576, 12);
    helper_double_18_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_double_18_recursive(buf + 0, 15);
    helper_double_18_recursive(buf + 32768, 15);
    helper_double_18_recursive(buf + 65536, 15);
    helper_double_18_recursive(buf + 98304, 15);
    helper_double_18_recursive(buf + 131072, 15);
    helper_double_18_recursive(buf + 163840, 15);
    helper_double_18_recursive(buf + 196608, 15);
    helper_double_18_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Top-level entry for the 2^18-element double kernel: runs
 * helper_double_18_recursive() on `buf` at its outermost depth (18).
 * That depth reads and writes buf[0 .. 262143] in place.
 */
void helper_double_18(double *buf);
void helper_double_18(double *buf) {
  const int top_depth = 18;
  helper_double_18_recursive(buf, top_depth);
}
/*
 * 4-point base case: in-place unnormalised Hadamard transform of buf[0..3].
 *
 * Each 2-double vector (a, b) is first turned into (a + b, a - b) with
 * haddpd/hsubpd/blendpd, then the two vectors are combined with one
 * add/sub pair.  Uses SSE3 (haddpd, hsubpd) and SSE4.1 (blendpd); loads and
 * stores are unaligned (movupd), so `buf` needs no particular alignment.
 */
static inline void helper_double_19_base4(double *buf) {
  double *lo = buf;
  double *hi = buf + 2;
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    /* xmm0 = (a + b, a - b) */
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm0\n"
    /* xmm1 = (c + d, c - d) */
    "movapd %%xmm1, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm1, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm1\n"
    /* combine the two halves */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movupd %%xmm8, (%0)\n"
    "movupd %%xmm9, (%1)\n"
    :: "r"(lo), "r"(hi) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Radix-8 combining pass over 8 * stride doubles starting at `buf`.
 *
 * The buffer is viewed as eight rows of `stride` doubles.  For every pair of
 * adjacent columns (k, k + 1) the eight row values are pushed through three
 * add/sub butterfly stages and written back in place.  `stride` must be even
 * because columns are consumed two at a time.
 */
static inline void helper_double_19_radix8_pass(double *buf, int stride) {
  for (int k = 0; k < stride; k += 2) {
    double *r0 = buf + k;
    double *r1 = r0 + stride;
    double *r2 = r1 + stride;
    double *r3 = r2 + stride;
    double *r4 = r3 + stride;
    double *r5 = r4 + stride;
    double *r6 = r5 + stride;
    double *r7 = r6 + stride;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      "movupd (%4), %%xmm4\n"
      "movupd (%5), %%xmm5\n"
      "movupd (%6), %%xmm6\n"
      "movupd (%7), %%xmm7\n"
      /* stage 1: rows (0,1) (2,3) (4,5) (6,7) -> xmm8..xmm15 */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      "movapd %%xmm4, %%xmm12\n"
      "movapd %%xmm4, %%xmm13\n"
      "addpd %%xmm5, %%xmm12\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm6, %%xmm14\n"
      "movapd %%xmm6, %%xmm15\n"
      "addpd %%xmm7, %%xmm14\n"
      "subpd %%xmm7, %%xmm15\n"
      /* stage 2: distance-2 butterflies -> xmm0..xmm7 */
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movapd %%xmm12, %%xmm4\n"
      "movapd %%xmm12, %%xmm6\n"
      "addpd %%xmm14, %%xmm4\n"
      "subpd %%xmm14, %%xmm6\n"
      "movapd %%xmm13, %%xmm5\n"
      "movapd %%xmm13, %%xmm7\n"
      "addpd %%xmm15, %%xmm5\n"
      "subpd %%xmm15, %%xmm7\n"
      /* stage 3: distance-4 butterflies -> xmm8..xmm15 */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm12\n"
      "addpd %%xmm4, %%xmm8\n"
      "subpd %%xmm4, %%xmm12\n"
      "movapd %%xmm1, %%xmm9\n"
      "movapd %%xmm1, %%xmm13\n"
      "addpd %%xmm5, %%xmm9\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm14\n"
      "addpd %%xmm6, %%xmm10\n"
      "subpd %%xmm6, %%xmm14\n"
      "movapd %%xmm3, %%xmm11\n"
      "movapd %%xmm3, %%xmm15\n"
      "addpd %%xmm7, %%xmm11\n"
      "subpd %%xmm7, %%xmm15\n"
      "movupd %%xmm8, (%0)\n"
      "movupd %%xmm9, (%1)\n"
      "movupd %%xmm10, (%2)\n"
      "movupd %%xmm11, (%3)\n"
      "movupd %%xmm12, (%4)\n"
      "movupd %%xmm13, (%5)\n"
      "movupd %%xmm14, (%6)\n"
      "movupd %%xmm15, (%7)\n"
      :: "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}

/*
 * Radix-4 combining pass over 4 * stride doubles starting at `buf`.
 *
 * Same layout idea as the radix-8 pass, but with four rows and two add/sub
 * butterfly stages.  `stride` must be even.
 */
static inline void helper_double_19_radix4_pass(double *buf, int stride) {
  for (int k = 0; k < stride; k += 2) {
    double *r0 = buf + k;
    double *r1 = r0 + stride;
    double *r2 = r1 + stride;
    double *r3 = r2 + stride;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      /* stage 1: rows (0,1) (2,3) -> xmm8..xmm11 */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      /* stage 2: distance-2 butterflies -> xmm0..xmm3 */
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movupd %%xmm0, (%0)\n"
      "movupd %%xmm1, (%1)\n"
      "movupd %%xmm2, (%2)\n"
      "movupd %%xmm3, (%3)\n"
      :: "r"(r0), "r"(r1), "r"(r2), "r"(r3) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}

/*
 * Recursive in-place unnormalised Hadamard transform of 2^depth doubles.
 *
 * buf   - pointer to at least 2^depth doubles; overwritten with the result.
 * depth - log2 of the block length.  Only the depths on the recursion path
 *         from 19 are implemented: 2, 5, 8, 11, 14, 17 and 19.  Any other
 *         value leaves `buf` untouched.
 *
 * Depth 2 is the 4-point base case.  Depths 5..17 transform eight
 * sub-blocks of depth - 3 and merge them with one radix-8 pass; depth 19
 * transforms four sub-blocks of depth 17 and merges them with a radix-4
 * pass.
 */
void helper_double_19_recursive(double *buf, int depth);
void helper_double_19_recursive(double *buf, int depth) {
  switch (depth) {
  case 2:
    helper_double_19_base4(buf);
    return;
  case 5:
  case 8:
  case 11:
  case 14:
  case 17: {
    const int sub_depth = depth - 3;
    const int stride = 1 << sub_depth;  /* length of one sub-block */
    for (int part = 0; part < 8; ++part) {
      helper_double_19_recursive(buf + part * stride, sub_depth);
    }
    helper_double_19_radix8_pass(buf, stride);
    return;
  }
  case 19: {
    const int sub_depth = 17;
    const int stride = 1 << sub_depth;  /* 131072 doubles per quarter */
    for (int part = 0; part < 4; ++part) {
      helper_double_19_recursive(buf + part * stride, sub_depth);
    }
    helper_double_19_radix4_pass(buf, stride);
    return;
  }
  default:
    /* Unsupported depth: intentionally a no-op. */
    return;
  }
}
/*
 * Top-level entry for the 2^19-element double kernel: runs
 * helper_double_19_recursive() on `buf` at its outermost depth (19).
 * That depth reads and writes buf[0 .. 524287] in place.
 */
void helper_double_19(double *buf);
void helper_double_19(double *buf) {
  const int top_depth = 19;
  helper_double_19_recursive(buf, top_depth);
}
void helper_double_20_recursive(double *buf, int depth);
void helper_double_20_recursive(double *buf, int depth) {
  if (depth == 12) {
    for (int j = 0; j < 4096; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    helper_double_20_recursive(buf + 0, 12);
    helper_double_20_recursive(buf + 4096, 12);
    helper_double_20_recursive(buf + 8192, 12);
    helper_double_20_recursive(buf + 12288, 12);
    helper_double_20_recursive(buf + 16384, 12);
    helper_double_20_recursive(buf + 20480, 12);
    helper_double_20_recursive(buf + 24576, 12);
    helper_double_20_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_double_20_recursive(buf + 0, 15);
    helper_double_20_recursive(buf + 32768, 15);
    helper_double_20_recursive(buf + 65536, 15);
    helper_double_20_recursive(buf + 98304, 15);
    helper_double_20_recursive(buf + 131072, 15);
    helper_double_20_recursive(buf + 163840, 15);
    helper_double_20_recursive(buf + 196608, 15);
    helper_double_20_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_double_20_recursive(buf + 0, 18);
    helper_double_20_recursive(buf + 262144, 18);
    helper_double_20_recursive(buf + 524288, 18);
    helper_double_20_recursive(buf + 786432, 18);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 262144; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Top-level entry for the depth-20 case: hands buf to
 * helper_double_20_recursive starting at depth 20.
 *
 * buf is updated in place. The depth-20 branch of the recursive routine
 * addresses elements buf[0] .. buf[1048575], so the caller must supply at
 * least 1048576 doubles.
 */
void helper_double_20(double *buf);
void helper_double_20(double *buf) {
  const int top_depth = 20;

  helper_double_20_recursive(buf, top_depth);
}
/*
 * In-place, unnormalised sum/difference butterfly transform
 * (Walsh-Hadamard style, natural ordering) over 2^depth doubles.
 *
 * buf   - data to transform; elements buf[0] .. buf[2^depth - 1] are read
 *         and overwritten. Loads and stores are unaligned (movupd), so no
 *         particular alignment is needed.
 * depth - log2 of the block size. Only 13, 16, 19 and 21 are handled; any
 *         other value returns without touching buf.
 *
 * Depth 13 is the base case and runs four passes over 8192 doubles. Depths
 * 16 and 19 transform eight sub-blocks of depth - 3 and join them with one
 * 8-way butterfly pass. Depth 21 transforms four sub-blocks of depth 19 and
 * joins them with one 4-way pass.
 *
 * The kernels are x86-64 inline assembly: they use xmm8-xmm15, the SSE3
 * haddpd/hsubpd instructions and the SSE4.1 blendpd instruction.
 */
void helper_double_21_recursive(double *buf, int depth);

/* Assembly text fragments; each expands to adjacent string literals. */
#define HD21_LOAD(reg, op) "movupd (%" #op "), %%xmm" #reg "\n"
#define HD21_STORE(reg, op) "movupd %%xmm" #reg ", (%" #op ")\n"

/* xmm<s> = xmm<a> + xmm<b>, xmm<d> = xmm<a> - xmm<b>; a and b are kept. */
#define HD21_SUMDIFF(a, b, s, d)           \
  "movapd %%xmm" #a ", %%xmm" #s "\n"      \
  "movapd %%xmm" #a ", %%xmm" #d "\n"      \
  "addpd %%xmm" #b ", %%xmm" #s "\n"       \
  "subpd %%xmm" #b ", %%xmm" #d "\n"

/* (x0, x1) -> (x0 + x1, x0 - x1) within xmm<reg>; xmm8/xmm9 are scratch. */
#define HD21_PAIR(reg)                     \
  "movapd %%xmm" #reg ", %%xmm8\n"         \
  "haddpd %%xmm8, %%xmm8\n"                \
  "movapd %%xmm" #reg ", %%xmm9\n"         \
  "hsubpd %%xmm9, %%xmm9\n"                \
  "blendpd $1, %%xmm8, %%xmm9\n"           \
  "movapd %%xmm9, %%xmm" #reg "\n"

#define HD21_LOAD8                                              \
  HD21_LOAD(0, 0) HD21_LOAD(1, 1) HD21_LOAD(2, 2) HD21_LOAD(3, 3) \
  HD21_LOAD(4, 4) HD21_LOAD(5, 5) HD21_LOAD(6, 6) HD21_LOAD(7, 7)

/* Three butterfly levels over xmm0-xmm7; results end up in xmm8-xmm15. */
#define HD21_RADIX8                                          \
  HD21_SUMDIFF(0, 1, 8, 9)    HD21_SUMDIFF(2, 3, 10, 11)     \
  HD21_SUMDIFF(4, 5, 12, 13)  HD21_SUMDIFF(6, 7, 14, 15)     \
  HD21_SUMDIFF(8, 10, 0, 2)   HD21_SUMDIFF(9, 11, 1, 3)      \
  HD21_SUMDIFF(12, 14, 4, 6)  HD21_SUMDIFF(13, 15, 5, 7)     \
  HD21_SUMDIFF(0, 4, 8, 12)   HD21_SUMDIFF(1, 5, 9, 13)      \
  HD21_SUMDIFF(2, 6, 10, 14)  HD21_SUMDIFF(3, 7, 11, 15)

#define HD21_STORE8                                                   \
  HD21_STORE(8, 0) HD21_STORE(9, 1) HD21_STORE(10, 2) HD21_STORE(11, 3) \
  HD21_STORE(12, 4) HD21_STORE(13, 5) HD21_STORE(14, 6) HD21_STORE(15, 7)

#define HD21_CLOBBERS                                                     \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14",     \
  "%xmm15", "memory"

/*
 * Transforms the 16 consecutive doubles at p: first the butterfly inside
 * each adjacent pair, then three levels across the eight pairs.
 */
static inline void helper_double_21_first16(double *p) {
  __asm__ volatile (
    HD21_LOAD8
    HD21_PAIR(0) HD21_PAIR(1) HD21_PAIR(2) HD21_PAIR(3)
    HD21_PAIR(4) HD21_PAIR(5) HD21_PAIR(6) HD21_PAIR(7)
    HD21_RADIX8
    HD21_STORE8
    :: "r"(p + 0), "r"(p + 2), "r"(p + 4), "r"(p + 6),
       "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14)
    : HD21_CLOBBERS
  );
}

/*
 * 8-way butterfly across the two-double lanes located at
 * p, p + stride, ..., p + 7 * stride.
 */
static inline void helper_double_21_radix8(double *p, int stride) {
  __asm__ volatile (
    HD21_LOAD8
    HD21_RADIX8
    HD21_STORE8
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
       "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
       "r"(p + 7 * stride)
    : HD21_CLOBBERS
  );
}

/*
 * 4-way butterfly across the two-double lanes located at
 * p, p + stride, p + 2 * stride and p + 3 * stride.
 */
static inline void helper_double_21_radix4(double *p, int stride) {
  __asm__ volatile (
    HD21_LOAD(0, 0) HD21_LOAD(1, 1) HD21_LOAD(2, 2) HD21_LOAD(3, 3)
    HD21_SUMDIFF(0, 1, 8, 9)   HD21_SUMDIFF(2, 3, 10, 11)
    HD21_SUMDIFF(8, 10, 0, 2)  HD21_SUMDIFF(9, 11, 1, 3)
    HD21_STORE(0, 0) HD21_STORE(1, 1) HD21_STORE(2, 2) HD21_STORE(3, 3)
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
    : HD21_CLOBBERS
  );
}

void helper_double_21_recursive(double *buf, int depth) {
  switch (depth) {
  case 13:
    /* Pass 1: independent 16-element groups. */
    for (int base = 0; base < 8192; base += 16) {
      helper_double_21_first16(buf + base);
    }
    /* Passes 2-4: 8-way joins at lane distance 16, 128 and 1024. */
    for (int stride = 16; stride <= 1024; stride *= 8) {
      const int block = 8 * stride;
      for (int base = 0; base < 8192; base += block) {
        for (int lane = 0; lane < stride; lane += 2) {
          helper_double_21_radix8(buf + base + lane, stride);
        }
      }
    }
    break;

  case 16:
  case 19: {
    /* Eight sub-blocks three levels down, then one 8-way join. */
    const int child = depth - 3;
    const int span = 1 << child;
    for (int part = 0; part < 8; ++part) {
      helper_double_21_recursive(buf + part * span, child);
    }
    for (int lane = 0; lane < span; lane += 2) {
      helper_double_21_radix8(buf + lane, span);
    }
    break;
  }

  case 21: {
    /* Four sub-blocks of depth 19, then one 4-way join. */
    const int span = 524288;
    for (int part = 0; part < 4; ++part) {
      helper_double_21_recursive(buf + part * span, 19);
    }
    for (int lane = 0; lane < span; lane += 2) {
      helper_double_21_radix4(buf + lane, span);
    }
    break;
  }

  default:
    /* Unsupported depth: leave buf untouched. */
    break;
  }
}

#undef HD21_CLOBBERS
#undef HD21_STORE8
#undef HD21_RADIX8
#undef HD21_LOAD8
#undef HD21_PAIR
#undef HD21_SUMDIFF
#undef HD21_STORE
#undef HD21_LOAD
/*
 * Entry point for the 2^21-element double-precision kernel: runs
 * helper_double_21_recursive on buf starting at its top level (depth 21).
 * The transform is done in place; the top level touches elements
 * buf[0] .. buf[2097151], so buf must hold at least 2097152 doubles.
 */
void helper_double_21(double *buf);
void helper_double_21(double *buf) {
  enum { TOP_DEPTH = 21 };
  helper_double_21_recursive(buf, TOP_DEPTH);
}
/*
 * helper_double_22_recursive - in-place, unnormalized Walsh-Hadamard
 * transform (natural/Hadamard ordering) of the 2^depth doubles at buf.
 *
 * buf:   first element of the sub-array to transform; it is overwritten.
 * depth: log2 of the sub-array length.  Only the levels this kernel is
 *        built from are handled: 2, 5, 8, 11, 14, 17, 20 and 22.  Any
 *        other value returns immediately without touching buf.
 *
 * Structure: depth 2 is a 4-point base case.  Each of the levels
 * 5..20 first transforms its eight equal sub-arrays (depth - 3) and then
 * combines them with one radix-8 butterfly pass; the top level 22
 * transforms four depth-20 sub-arrays and combines them with a radix-4
 * pass.
 *
 * The inline assembly is x86-64 only (it uses xmm8-xmm15) and needs
 * haddpd/hsubpd and blendpd support.  All loads/stores are unaligned
 * (movupd), so buf has no alignment requirement.
 */
void helper_double_22_recursive(double *buf, int depth);

/* Registers (and memory) every asm statement below declares as clobbered. */
#define HD22_ASM_CLOBBERS                                                   \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",   \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14",       \
  "%xmm15", "memory"

/* 4-point base case on quad[0..3]: butterflies inside each pair of
 * doubles first, then a butterfly between the two pairs. */
static inline void hd22_base4(double *quad) {
  double *lower = quad;
  double *upper = quad + 2;
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    /* xmm0 <- (a0 + a1, a0 - a1) */
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm0\n"
    /* xmm1 <- (a2 + a3, a2 - a3) */
    "movapd %%xmm1, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm1, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm1\n"
    /* sum and difference of the two pairs */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movupd %%xmm8, (%0)\n"
    "movupd %%xmm9, (%1)\n"
    :: "r"(lower), "r"(upper)
    : HD22_ASM_CLOBBERS
  );
}

/* Radix-8 combine: for every even offset below `stride`, runs three
 * add/sub butterfly stages over the eight rows base + m*stride (m = 0..7),
 * two doubles per row at a time, and stores the results back in place. */
static inline void hd22_radix8_pass(double *base, int stride) {
  for (int off = 0; off < stride; off += 2) {
    double *row0 = base + off;
    double *row1 = row0 + stride;
    double *row2 = row1 + stride;
    double *row3 = row2 + stride;
    double *row4 = row3 + stride;
    double *row5 = row4 + stride;
    double *row6 = row5 + stride;
    double *row7 = row6 + stride;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      "movupd (%4), %%xmm4\n"
      "movupd (%5), %%xmm5\n"
      "movupd (%6), %%xmm6\n"
      "movupd (%7), %%xmm7\n"
      /* stage 1: neighbouring rows (0,1) (2,3) (4,5) (6,7) */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      "movapd %%xmm4, %%xmm12\n"
      "movapd %%xmm4, %%xmm13\n"
      "addpd %%xmm5, %%xmm12\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm6, %%xmm14\n"
      "movapd %%xmm6, %%xmm15\n"
      "addpd %%xmm7, %%xmm14\n"
      "subpd %%xmm7, %%xmm15\n"
      /* stage 2: rows two apart */
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movapd %%xmm12, %%xmm4\n"
      "movapd %%xmm12, %%xmm6\n"
      "addpd %%xmm14, %%xmm4\n"
      "subpd %%xmm14, %%xmm6\n"
      "movapd %%xmm13, %%xmm5\n"
      "movapd %%xmm13, %%xmm7\n"
      "addpd %%xmm15, %%xmm5\n"
      "subpd %%xmm15, %%xmm7\n"
      /* stage 3: rows four apart */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm12\n"
      "addpd %%xmm4, %%xmm8\n"
      "subpd %%xmm4, %%xmm12\n"
      "movapd %%xmm1, %%xmm9\n"
      "movapd %%xmm1, %%xmm13\n"
      "addpd %%xmm5, %%xmm9\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm14\n"
      "addpd %%xmm6, %%xmm10\n"
      "subpd %%xmm6, %%xmm14\n"
      "movapd %%xmm3, %%xmm11\n"
      "movapd %%xmm3, %%xmm15\n"
      "addpd %%xmm7, %%xmm11\n"
      "subpd %%xmm7, %%xmm15\n"
      "movupd %%xmm8, (%0)\n"
      "movupd %%xmm9, (%1)\n"
      "movupd %%xmm10, (%2)\n"
      "movupd %%xmm11, (%3)\n"
      "movupd %%xmm12, (%4)\n"
      "movupd %%xmm13, (%5)\n"
      "movupd %%xmm14, (%6)\n"
      "movupd %%xmm15, (%7)\n"
      :: "r"(row0), "r"(row1), "r"(row2), "r"(row3),
         "r"(row4), "r"(row5), "r"(row6), "r"(row7)
      : HD22_ASM_CLOBBERS
    );
  }
}

/* Radix-4 combine: same idea as the radix-8 pass, but with two butterfly
 * stages over the four rows base + m*stride (m = 0..3). */
static inline void hd22_radix4_pass(double *base, int stride) {
  for (int off = 0; off < stride; off += 2) {
    double *row0 = base + off;
    double *row1 = row0 + stride;
    double *row2 = row1 + stride;
    double *row3 = row2 + stride;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      /* stage 1: neighbouring rows (0,1) (2,3) */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      /* stage 2: rows two apart */
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movupd %%xmm0, (%0)\n"
      "movupd %%xmm1, (%1)\n"
      "movupd %%xmm2, (%2)\n"
      "movupd %%xmm3, (%3)\n"
      :: "r"(row0), "r"(row1), "r"(row2), "r"(row3)
      : HD22_ASM_CLOBBERS
    );
  }
}

void helper_double_22_recursive(double *buf, int depth) {
  switch (depth) {
  case 2:
    hd22_base4(buf);
    return;

  case 5:
  case 8:
  case 11:
  case 14:
  case 17:
  case 20: {
    /* Eight sub-arrays of 2^(depth-3) doubles each, then one radix-8 pass. */
    const int sub_depth = depth - 3;
    const int stride = 1 << sub_depth;
    for (int part = 0; part < 8; ++part) {
      helper_double_22_recursive(buf + part * stride, sub_depth);
    }
    hd22_radix8_pass(buf, stride);
    return;
  }

  case 22: {
    /* Top level: four sub-arrays of 2^20 doubles each, then one radix-4 pass. */
    const int sub_depth = 20;
    const int stride = 1 << sub_depth;
    for (int part = 0; part < 4; ++part) {
      helper_double_22_recursive(buf + part * stride, sub_depth);
    }
    hd22_radix4_pass(buf, stride);
    return;
  }

  default:
    /* Levels that are not part of this kernel's decomposition are ignored. */
    return;
  }
}

#undef HD22_ASM_CLOBBERS
/*
 * Entry point for the 2^22-element double-precision kernel: runs
 * helper_double_22_recursive on buf starting at its top level (depth 22).
 * The transform is done in place; the top level touches elements
 * buf[0] .. buf[4194303], so buf must hold at least 4194304 doubles.
 */
void helper_double_22(double *buf);
void helper_double_22(double *buf) {
  enum { TOP_DEPTH = 22 };
  helper_double_22_recursive(buf, TOP_DEPTH);
}
void helper_double_23_recursive(double *buf, int depth);
void helper_double_23_recursive(double *buf, int depth) {
  if (depth == 2) {
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    helper_double_23_recursive(buf + 0, 2);
    helper_double_23_recursive(buf + 4, 2);
    helper_double_23_recursive(buf + 8, 2);
    helper_double_23_recursive(buf + 12, 2);
    helper_double_23_recursive(buf + 16, 2);
    helper_double_23_recursive(buf + 20, 2);
    helper_double_23_recursive(buf + 24, 2);
    helper_double_23_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    helper_double_23_recursive(buf + 0, 5);
    helper_double_23_recursive(buf + 32, 5);
    helper_double_23_recursive(buf + 64, 5);
    helper_double_23_recursive(buf + 96, 5);
    helper_double_23_recursive(buf + 128, 5);
    helper_double_23_recursive(buf + 160, 5);
    helper_double_23_recursive(buf + 192, 5);
    helper_double_23_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    helper_double_23_recursive(buf + 0, 8);
    helper_double_23_recursive(buf + 256, 8);
    helper_double_23_recursive(buf + 512, 8);
    helper_double_23_recursive(buf + 768, 8);
    helper_double_23_recursive(buf + 1024, 8);
    helper_double_23_recursive(buf + 1280, 8);
    helper_double_23_recursive(buf + 1536, 8);
    helper_double_23_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    helper_double_23_recursive(buf + 0, 11);
    helper_double_23_recursive(buf + 2048, 11);
    helper_double_23_recursive(buf + 4096, 11);
    helper_double_23_recursive(buf + 6144, 11);
    helper_double_23_recursive(buf + 8192, 11);
    helper_double_23_recursive(buf + 10240, 11);
    helper_double_23_recursive(buf + 12288, 11);
    helper_double_23_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    helper_double_23_recursive(buf + 0, 14);
    helper_double_23_recursive(buf + 16384, 14);
    helper_double_23_recursive(buf + 32768, 14);
    helper_double_23_recursive(buf + 49152, 14);
    helper_double_23_recursive(buf + 65536, 14);
    helper_double_23_recursive(buf + 81920, 14);
    helper_double_23_recursive(buf + 98304, 14);
    helper_double_23_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_double_23_recursive(buf + 0, 17);
    helper_double_23_recursive(buf + 131072, 17);
    helper_double_23_recursive(buf + 262144, 17);
    helper_double_23_recursive(buf + 393216, 17);
    helper_double_23_recursive(buf + 524288, 17);
    helper_double_23_recursive(buf + 655360, 17);
    helper_double_23_recursive(buf + 786432, 17);
    helper_double_23_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    helper_double_23_recursive(buf + 0, 20);
    helper_double_23_recursive(buf + 1048576, 20);
    helper_double_23_recursive(buf + 2097152, 20);
    helper_double_23_recursive(buf + 3145728, 20);
    helper_double_23_recursive(buf + 4194304, 20);
    helper_double_23_recursive(buf + 5242880, 20);
    helper_double_23_recursive(buf + 6291456, 20);
    helper_double_23_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Top-level driver for the depth-23 double-precision kernel.
 *
 * Hands the whole buffer to helper_double_23_recursive() at its outermost
 * recursion level (depth 23). That level works on eight consecutive
 * sub-blocks of 1048576 doubles each and then combines them with
 * add/subtract butterflies, reading and writing indices 0 .. 8388607.
 *
 * buf: array of at least 8388608 (2^23) doubles; it is overwritten in place
 *      with the result.
 *
 * NOTE(review): presumably this is the 2^23-point fast Hadamard transform
 * exposed through fht.h -- confirm against the header.
 */
void helper_double_23(double *buf);
void helper_double_23(double *buf) {
  /* Outermost recursion level handled by the recursive helper. */
  enum { top_level_depth = 23 };

  helper_double_23_recursive(buf, top_level_depth);
}
void helper_double_24_recursive(double *buf, int depth);
void helper_double_24_recursive(double *buf, int depth) {
  if (depth == 13) {
    for (int j = 0; j < 8192; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    helper_double_24_recursive(buf + 0, 13);
    helper_double_24_recursive(buf + 8192, 13);
    helper_double_24_recursive(buf + 16384, 13);
    helper_double_24_recursive(buf + 24576, 13);
    helper_double_24_recursive(buf + 32768, 13);
    helper_double_24_recursive(buf + 40960, 13);
    helper_double_24_recursive(buf + 49152, 13);
    helper_double_24_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    helper_double_24_recursive(buf + 0, 16);
    helper_double_24_recursive(buf + 65536, 16);
    helper_double_24_recursive(buf + 131072, 16);
    helper_double_24_recursive(buf + 196608, 16);
    helper_double_24_recursive(buf + 262144, 16);
    helper_double_24_recursive(buf + 327680, 16);
    helper_double_24_recursive(buf + 393216, 16);
    helper_double_24_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 22) {
    helper_double_24_recursive(buf + 0, 19);
    helper_double_24_recursive(buf + 524288, 19);
    helper_double_24_recursive(buf + 1048576, 19);
    helper_double_24_recursive(buf + 1572864, 19);
    helper_double_24_recursive(buf + 2097152, 19);
    helper_double_24_recursive(buf + 2621440, 19);
    helper_double_24_recursive(buf + 3145728, 19);
    helper_double_24_recursive(buf + 3670016, 19);
    for (int j = 0; j < 4194304; j += 4194304) {
      for (int k = 0; k < 524288; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 24) {
    helper_double_24_recursive(buf + 0, 22);
    helper_double_24_recursive(buf + 4194304, 22);
    helper_double_24_recursive(buf + 8388608, 22);
    helper_double_24_recursive(buf + 12582912, 22);
    for (int j = 0; j < 16777216; j += 16777216) {
      for (int k = 0; k < 4194304; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Top-level entry for the 2^24-element double kernel: hands buf to
 * helper_double_24_recursive at depth 24.
 *
 * The depth-24 step reads and writes buf[0 .. 16777215] in place, so buf must
 * hold at least 16777216 doubles.  Nothing is returned.
 * NOTE(review): presumably the fast Hadamard transform declared in "fht.h" -
 * confirm against the header.
 */
void helper_double_24(double *buf);
void helper_double_24(double *buf) {
  helper_double_24_recursive(buf, 24);
}
void helper_double_25_recursive(double *buf, int depth);

/*
 * Asm text fragments (AT&T syntax) shared by the two kernels below.  They are
 * plain string literals that get concatenated into one asm template, and are
 * #undef'd once the kernels have been defined.
 */

/* Load one 2-double vector from each of the eight pointer operands. */
#define HELPER_DOUBLE_25_LOAD8 \
  "movupd (%0), %%xmm0\n" \
  "movupd (%1), %%xmm1\n" \
  "movupd (%2), %%xmm2\n" \
  "movupd (%3), %%xmm3\n" \
  "movupd (%4), %%xmm4\n" \
  "movupd (%5), %%xmm5\n" \
  "movupd (%6), %%xmm6\n" \
  "movupd (%7), %%xmm7\n"

/* reg = [lo + hi, lo - hi] for the two lanes of reg; xmm8/xmm9 are scratch. */
#define HELPER_DOUBLE_25_PAIR(reg) \
  "movapd %%" reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%" reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%" reg "\n"

/* sum = a + b, diff = a - b (element-wise); a and b are left untouched. */
#define HELPER_DOUBLE_25_SUMDIFF(a, b, sum, diff) \
  "movapd %%" a ", %%" sum "\n" \
  "movapd %%" a ", %%" diff "\n" \
  "addpd %%" b ", %%" sum "\n" \
  "subpd %%" b ", %%" diff "\n"

/* Three butterfly stages across xmm0..xmm7; results end up in xmm8..xmm15. */
#define HELPER_DOUBLE_25_BUTTERFLY8 \
  HELPER_DOUBLE_25_SUMDIFF("xmm0", "xmm1", "xmm8", "xmm9") \
  HELPER_DOUBLE_25_SUMDIFF("xmm2", "xmm3", "xmm10", "xmm11") \
  HELPER_DOUBLE_25_SUMDIFF("xmm4", "xmm5", "xmm12", "xmm13") \
  HELPER_DOUBLE_25_SUMDIFF("xmm6", "xmm7", "xmm14", "xmm15") \
  HELPER_DOUBLE_25_SUMDIFF("xmm8", "xmm10", "xmm0", "xmm2") \
  HELPER_DOUBLE_25_SUMDIFF("xmm9", "xmm11", "xmm1", "xmm3") \
  HELPER_DOUBLE_25_SUMDIFF("xmm12", "xmm14", "xmm4", "xmm6") \
  HELPER_DOUBLE_25_SUMDIFF("xmm13", "xmm15", "xmm5", "xmm7") \
  HELPER_DOUBLE_25_SUMDIFF("xmm0", "xmm4", "xmm8", "xmm12") \
  HELPER_DOUBLE_25_SUMDIFF("xmm1", "xmm5", "xmm9", "xmm13") \
  HELPER_DOUBLE_25_SUMDIFF("xmm2", "xmm6", "xmm10", "xmm14") \
  HELPER_DOUBLE_25_SUMDIFF("xmm3", "xmm7", "xmm11", "xmm15")

/* Write xmm8..xmm15 back through the eight pointer operands, in order. */
#define HELPER_DOUBLE_25_STORE8 \
  "movupd %%xmm8, (%0)\n" \
  "movupd %%xmm9, (%1)\n" \
  "movupd %%xmm10, (%2)\n" \
  "movupd %%xmm11, (%3)\n" \
  "movupd %%xmm12, (%4)\n" \
  "movupd %%xmm13, (%5)\n" \
  "movupd %%xmm14, (%6)\n" \
  "movupd %%xmm15, (%7)\n"

/* Every xmm register is used as scratch and memory is written behind the compiler's back. */
#define HELPER_DOUBLE_25_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"

/*
 * Transforms the 16 consecutive doubles block16[0..15] in place: a
 * sum/difference butterfly inside every adjacent pair (stride 1), followed by
 * the three-stage butterfly across the eight pairs (strides 2, 4 and 8).
 */
static inline void helper_double_25_block16(double *block16) {
  __asm__ volatile (
    HELPER_DOUBLE_25_LOAD8
    HELPER_DOUBLE_25_PAIR("xmm0")
    HELPER_DOUBLE_25_PAIR("xmm1")
    HELPER_DOUBLE_25_PAIR("xmm2")
    HELPER_DOUBLE_25_PAIR("xmm3")
    HELPER_DOUBLE_25_PAIR("xmm4")
    HELPER_DOUBLE_25_PAIR("xmm5")
    HELPER_DOUBLE_25_PAIR("xmm6")
    HELPER_DOUBLE_25_PAIR("xmm7")
    HELPER_DOUBLE_25_BUTTERFLY8
    HELPER_DOUBLE_25_STORE8
    :: "r"(block16), "r"(block16 + 2), "r"(block16 + 4), "r"(block16 + 6),
       "r"(block16 + 8), "r"(block16 + 10), "r"(block16 + 12), "r"(block16 + 14)
    : HELPER_DOUBLE_25_CLOBBERS
  );
}

/*
 * Three-stage sum/difference butterfly, in place, over the eight 2-double
 * vectors located at first, first + stride, ..., first + 7 * stride.
 */
static inline void helper_double_25_radix8(double *first, int stride) {
  __asm__ volatile (
    HELPER_DOUBLE_25_LOAD8
    HELPER_DOUBLE_25_BUTTERFLY8
    HELPER_DOUBLE_25_STORE8
    :: "r"(first), "r"(first + stride), "r"(first + 2 * stride), "r"(first + 3 * stride),
       "r"(first + 4 * stride), "r"(first + 5 * stride), "r"(first + 6 * stride), "r"(first + 7 * stride)
    : HELPER_DOUBLE_25_CLOBBERS
  );
}

#undef HELPER_DOUBLE_25_CLOBBERS
#undef HELPER_DOUBLE_25_STORE8
#undef HELPER_DOUBLE_25_BUTTERFLY8
#undef HELPER_DOUBLE_25_SUMDIFF
#undef HELPER_DOUBLE_25_PAIR
#undef HELPER_DOUBLE_25_LOAD8

/*
 * In-place, unnormalized sum/difference butterfly (Walsh-Hadamard style)
 * transform of buf[0 .. 2^depth - 1].
 *
 * buf:   the doubles to transform; unaligned loads/stores are used, so no
 *        particular alignment is required.
 * depth: log2 of the element count.  Only 10, 13, 16, 19, 22 and 25 are
 *        handled; any other value returns without touching buf.
 *
 * Depth 10 is the base case (strides 1..8, then 16..64, then 128..512).
 * Every higher level transforms its eight sub-blocks of 2^(depth-3) doubles
 * recursively and then combines them with one radix-8 pass.
 *
 * The kernels use xmm8..xmm15 (x86-64 only), haddpd/hsubpd (SSE3) and
 * blendpd (SSE4.1).
 */
void helper_double_25_recursive(double *buf, int depth) {
  switch (depth) {
  case 10:
    /* Strides 1, 2, 4, 8: one fused kernel per 16-element block. */
    for (int start = 0; start < 1024; start += 16) {
      helper_double_25_block16(buf + start);
    }
    /* Strides 16, 32, 64 inside each 128-element group. */
    for (int group = 0; group < 1024; group += 128) {
      for (int lane = 0; lane < 16; lane += 2) {
        helper_double_25_radix8(buf + group + lane, 16);
      }
    }
    /* Strides 128, 256, 512 across the whole 1024-element block. */
    for (int lane = 0; lane < 128; lane += 2) {
      helper_double_25_radix8(buf + lane, 128);
    }
    return;
  case 13:
  case 16:
  case 19:
  case 22:
  case 25: {
    const int sub_depth = depth - 3;
    const int sub_len = 1 << sub_depth;
    /* Transform the eight sub-blocks first ... */
    for (int part = 0; part < 8; ++part) {
      helper_double_25_recursive(buf + part * sub_len, sub_depth);
    }
    /* ... then merge them with the three remaining butterfly stages. */
    for (int lane = 0; lane < sub_len; lane += 2) {
      helper_double_25_radix8(buf + lane, sub_len);
    }
    return;
  }
  default:
    /* Unsupported depth: leave buf untouched. */
    return;
  }
}
/*
 * Public entry point for the depth-25 transform.
 *
 * Forwards `buf` to helper_double_25_recursive with the depth fixed at 25.
 * The recursive helper stores its results back to the addresses it loaded
 * from, so the transform is performed in place.
 *
 * NOTE(review): buf presumably has to hold 2^25 doubles -- confirm against
 * the loop bounds in helper_double_25_recursive.
 */
void helper_double_25(double *buf);
void helper_double_25(double *buf) {
  helper_double_25_recursive(buf, 25);
}
void helper_double_26_recursive(double *buf, int depth);

/*
 * Base case: in-place 32-point butterfly transform of buf[0..31].
 *
 * Butterfly levels 1-4 are done on two independent 16-element halves, each
 * held entirely in xmm0-xmm7 (two doubles per register).  Level 5 then
 * combines the two halves with a plain add/sub at distance 16.
 */
static inline void helper_double_26_base32(double *buf) {
  for (int half = 0; half < 32; half += 16) {
    double *p = buf + half;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      "movupd (%4), %%xmm4\n"
      "movupd (%5), %%xmm5\n"
      "movupd (%6), %%xmm6\n"
      "movupd (%7), %%xmm7\n"
      /* Level 1 (inside each register): haddpd yields [a+b, a+b], hsubpd
       * yields [a-b, a-b]; blendpd $1 takes lane 0 from the sum, giving
       * [a+b, a-b]. */
      "movapd %%xmm0, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm0\n"
      "movapd %%xmm1, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm1, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm2, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm2, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm2\n"
      "movapd %%xmm3, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm3, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm3\n"
      "movapd %%xmm4, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm4, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm4\n"
      "movapd %%xmm5, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm5, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm5\n"
      "movapd %%xmm6, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm6, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm6\n"
      "movapd %%xmm7, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm7, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm7\n"
      /* Level 2: neighbouring registers (element distance 2). */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      "movapd %%xmm4, %%xmm12\n"
      "movapd %%xmm4, %%xmm13\n"
      "addpd %%xmm5, %%xmm12\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm6, %%xmm14\n"
      "movapd %%xmm6, %%xmm15\n"
      "addpd %%xmm7, %%xmm14\n"
      "subpd %%xmm7, %%xmm15\n"
      /* Level 3: element distance 4. */
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movapd %%xmm12, %%xmm4\n"
      "movapd %%xmm12, %%xmm6\n"
      "addpd %%xmm14, %%xmm4\n"
      "subpd %%xmm14, %%xmm6\n"
      "movapd %%xmm13, %%xmm5\n"
      "movapd %%xmm13, %%xmm7\n"
      "addpd %%xmm15, %%xmm5\n"
      "subpd %%xmm15, %%xmm7\n"
      /* Level 4: element distance 8. */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm12\n"
      "addpd %%xmm4, %%xmm8\n"
      "subpd %%xmm4, %%xmm12\n"
      "movapd %%xmm1, %%xmm9\n"
      "movapd %%xmm1, %%xmm13\n"
      "addpd %%xmm5, %%xmm9\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm14\n"
      "addpd %%xmm6, %%xmm10\n"
      "subpd %%xmm6, %%xmm14\n"
      "movapd %%xmm3, %%xmm11\n"
      "movapd %%xmm3, %%xmm15\n"
      "addpd %%xmm7, %%xmm11\n"
      "subpd %%xmm7, %%xmm15\n"
      "movupd %%xmm8, (%0)\n"
      "movupd %%xmm9, (%1)\n"
      "movupd %%xmm10, (%2)\n"
      "movupd %%xmm11, (%3)\n"
      "movupd %%xmm12, (%4)\n"
      "movupd %%xmm13, (%5)\n"
      "movupd %%xmm14, (%6)\n"
      "movupd %%xmm15, (%7)\n"
      /* The pointers are inputs only; the stores are covered by the
       * "memory" clobber. */
      :: "r"(p + 0), "r"(p + 2), "r"(p + 4), "r"(p + 6), "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
  /* Level 5: combine the two 16-element halves, two doubles at a time. */
  for (int k = 0; k < 16; k += 2) {
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movupd %%xmm8, (%0)\n"
      "movupd %%xmm9, (%1)\n"
      :: "r"(buf + k + 0), "r"(buf + k + 16) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}

/*
 * One radix-8 combining pass over buf[0 .. 8*stride - 1].
 *
 * Treats the buffer as eight consecutive sub-blocks of `stride` doubles and
 * applies three butterfly levels (distances stride, 2*stride, 4*stride)
 * across them, in place, two doubles per iteration.  `stride` must be even;
 * every caller in this file passes a power of two >= 32.
 */
static inline void helper_double_26_radix8(double *buf, int stride) {
  for (int k = 0; k < stride; k += 2) {
    double *p = buf + k;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      "movupd (%4), %%xmm4\n"
      "movupd (%5), %%xmm5\n"
      "movupd (%6), %%xmm6\n"
      "movupd (%7), %%xmm7\n"
      /* Distance stride. */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      "movapd %%xmm4, %%xmm12\n"
      "movapd %%xmm4, %%xmm13\n"
      "addpd %%xmm5, %%xmm12\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm6, %%xmm14\n"
      "movapd %%xmm6, %%xmm15\n"
      "addpd %%xmm7, %%xmm14\n"
      "subpd %%xmm7, %%xmm15\n"
      /* Distance 2*stride. */
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movapd %%xmm12, %%xmm4\n"
      "movapd %%xmm12, %%xmm6\n"
      "addpd %%xmm14, %%xmm4\n"
      "subpd %%xmm14, %%xmm6\n"
      "movapd %%xmm13, %%xmm5\n"
      "movapd %%xmm13, %%xmm7\n"
      "addpd %%xmm15, %%xmm5\n"
      "subpd %%xmm15, %%xmm7\n"
      /* Distance 4*stride. */
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm12\n"
      "addpd %%xmm4, %%xmm8\n"
      "subpd %%xmm4, %%xmm12\n"
      "movapd %%xmm1, %%xmm9\n"
      "movapd %%xmm1, %%xmm13\n"
      "addpd %%xmm5, %%xmm9\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm14\n"
      "addpd %%xmm6, %%xmm10\n"
      "subpd %%xmm6, %%xmm14\n"
      "movapd %%xmm3, %%xmm11\n"
      "movapd %%xmm3, %%xmm15\n"
      "addpd %%xmm7, %%xmm11\n"
      "subpd %%xmm7, %%xmm15\n"
      "movupd %%xmm8, (%0)\n"
      "movupd %%xmm9, (%1)\n"
      "movupd %%xmm10, (%2)\n"
      "movupd %%xmm11, (%3)\n"
      "movupd %%xmm12, (%4)\n"
      "movupd %%xmm13, (%5)\n"
      "movupd %%xmm14, (%6)\n"
      "movupd %%xmm15, (%7)\n"
      :: "r"(p + 0), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride), "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride), "r"(p + 7 * stride) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}

/*
 * In-place butterfly (Hadamard-style) transform of the first 2^depth doubles
 * of `buf`, built recursively from a 32-point base case and radix-8 passes.
 *
 * buf   - buffer of at least 2^depth doubles; no alignment is required
 *         (all loads/stores are unaligned moves).
 * depth - log2 of the transform length.  Supported values are 5 and
 *         8, 11, 14, 17, 20, 23, 26 (i.e. 5 + 3m).  Any other value leaves
 *         `buf` untouched and returns silently.
 *
 * Requires an x86-64 CPU with SSE3 (haddpd/hsubpd) and SSE4.1 (blendpd).
 */
void helper_double_26_recursive(double *buf, int depth) {
  if (depth == 5) {
    helper_double_26_base32(buf);
    return;
  }
  /* Only depths reachable from 26 in steps of three are handled. */
  if (depth < 8 || depth > 26 || (depth - 5) % 3 != 0) {
    return;
  }
  /* Split into eight sub-blocks of 2^(depth-3) doubles, transform each one,
   * then merge them with a single radix-8 pass. */
  const int stride = 1 << (depth - 3);
  for (int block = 0; block < 8; ++block) {
    helper_double_26_recursive(buf + block * stride, depth - 3);
  }
  helper_double_26_radix8(buf, stride);
}
/*
 * Public entry point for the depth-26 transform.
 *
 * Forwards `buf` to helper_double_26_recursive with the depth fixed at 26.
 * The transform is performed in place.  The depth-26 stage reads and writes
 * elements up to index 58720256 + 8388607, so `buf` must provide
 * 67,108,864 (2^26) doubles.
 */
void helper_double_26(double *buf);
void helper_double_26(double *buf) {
  helper_double_26_recursive(buf, 26);
}
void helper_double_27_recursive(double *buf, int depth);

/*
 * Runs four butterfly levels (strides 1, 2, 4 and 8) in place on the 16
 * consecutive doubles p[0..15].
 *
 * Each of the eight 2-double vectors is first turned from [a, b] into
 * [a + b, a - b] (haddpd / hsubpd / blendpd), which is the stride-1 level.
 * The eight vectors are then combined pairwise three times, which gives the
 * stride-2, stride-4 and stride-8 levels.  Loads and stores use movupd, so
 * p does not have to be 16-byte aligned.
 */
static inline void helper_double_27_wht16(double *p) {
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    "movupd (%2), %%xmm2\n"
    "movupd (%3), %%xmm3\n"
    "movupd (%4), %%xmm4\n"
    "movupd (%5), %%xmm5\n"
    "movupd (%6), %%xmm6\n"
    "movupd (%7), %%xmm7\n"
    /* stride 1: [a, b] -> [a + b, a - b] inside every vector */
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm0\n"
    "movapd %%xmm1, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm1, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm2, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm2, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm2\n"
    "movapd %%xmm3, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm3, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm3\n"
    "movapd %%xmm4, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm4, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm4\n"
    "movapd %%xmm5, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm5, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm5\n"
    "movapd %%xmm6, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm6, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm6\n"
    "movapd %%xmm7, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm7, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm7\n"
    /* stride 2: neighbouring vectors (0,1) (2,3) (4,5) (6,7) */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm4, %%xmm12\n"
    "movapd %%xmm4, %%xmm13\n"
    "addpd %%xmm5, %%xmm12\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm6, %%xmm14\n"
    "movapd %%xmm6, %%xmm15\n"
    "addpd %%xmm7, %%xmm14\n"
    "subpd %%xmm7, %%xmm15\n"
    /* stride 4: vectors two apart */
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movapd %%xmm12, %%xmm4\n"
    "movapd %%xmm12, %%xmm6\n"
    "addpd %%xmm14, %%xmm4\n"
    "subpd %%xmm14, %%xmm6\n"
    "movapd %%xmm13, %%xmm5\n"
    "movapd %%xmm13, %%xmm7\n"
    "addpd %%xmm15, %%xmm5\n"
    "subpd %%xmm15, %%xmm7\n"
    /* stride 8: vectors four apart */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm12\n"
    "addpd %%xmm4, %%xmm8\n"
    "subpd %%xmm4, %%xmm12\n"
    "movapd %%xmm1, %%xmm9\n"
    "movapd %%xmm1, %%xmm13\n"
    "addpd %%xmm5, %%xmm9\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm14\n"
    "addpd %%xmm6, %%xmm10\n"
    "subpd %%xmm6, %%xmm14\n"
    "movapd %%xmm3, %%xmm11\n"
    "movapd %%xmm3, %%xmm15\n"
    "addpd %%xmm7, %%xmm11\n"
    "subpd %%xmm7, %%xmm15\n"
    "movupd %%xmm8, (%0)\n"
    "movupd %%xmm9, (%1)\n"
    "movupd %%xmm10, (%2)\n"
    "movupd %%xmm11, (%3)\n"
    "movupd %%xmm12, (%4)\n"
    "movupd %%xmm13, (%5)\n"
    "movupd %%xmm14, (%6)\n"
    "movupd %%xmm15, (%7)\n"
    :: "r"(p + 0), "r"(p + 2), "r"(p + 4), "r"(p + 6), "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Two butterfly levels (strides `stride` and 2 * `stride`) applied in place
 * to the four 2-double vectors at p, p + stride, p + 2 * stride and
 * p + 3 * stride.
 */
static inline void helper_double_27_radix4(double *p, int stride) {
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    "movupd (%2), %%xmm2\n"
    "movupd (%3), %%xmm3\n"
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movupd %%xmm0, (%0)\n"
    "movupd %%xmm1, (%1)\n"
    "movupd %%xmm2, (%2)\n"
    "movupd %%xmm3, (%3)\n"
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Three butterfly levels (strides `stride`, 2 * `stride` and 4 * `stride`)
 * applied in place to the eight 2-double vectors at p + i * stride,
 * i = 0..7.
 */
static inline void helper_double_27_radix8(double *p, int stride) {
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    "movupd (%2), %%xmm2\n"
    "movupd (%3), %%xmm3\n"
    "movupd (%4), %%xmm4\n"
    "movupd (%5), %%xmm5\n"
    "movupd (%6), %%xmm6\n"
    "movupd (%7), %%xmm7\n"
    /* level 1: vectors (0,1) (2,3) (4,5) (6,7) */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm4, %%xmm12\n"
    "movapd %%xmm4, %%xmm13\n"
    "addpd %%xmm5, %%xmm12\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm6, %%xmm14\n"
    "movapd %%xmm6, %%xmm15\n"
    "addpd %%xmm7, %%xmm14\n"
    "subpd %%xmm7, %%xmm15\n"
    /* level 2: vectors two apart */
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movapd %%xmm12, %%xmm4\n"
    "movapd %%xmm12, %%xmm6\n"
    "addpd %%xmm14, %%xmm4\n"
    "subpd %%xmm14, %%xmm6\n"
    "movapd %%xmm13, %%xmm5\n"
    "movapd %%xmm13, %%xmm7\n"
    "addpd %%xmm15, %%xmm5\n"
    "subpd %%xmm15, %%xmm7\n"
    /* level 3: vectors four apart */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm12\n"
    "addpd %%xmm4, %%xmm8\n"
    "subpd %%xmm4, %%xmm12\n"
    "movapd %%xmm1, %%xmm9\n"
    "movapd %%xmm1, %%xmm13\n"
    "addpd %%xmm5, %%xmm9\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm14\n"
    "addpd %%xmm6, %%xmm10\n"
    "subpd %%xmm6, %%xmm14\n"
    "movapd %%xmm3, %%xmm11\n"
    "movapd %%xmm3, %%xmm15\n"
    "addpd %%xmm7, %%xmm11\n"
    "subpd %%xmm7, %%xmm15\n"
    "movupd %%xmm8, (%0)\n"
    "movupd %%xmm9, (%1)\n"
    "movupd %%xmm10, (%2)\n"
    "movupd %%xmm11, (%3)\n"
    "movupd %%xmm12, (%4)\n"
    "movupd %%xmm13, (%5)\n"
    "movupd %%xmm14, (%6)\n"
    "movupd %%xmm15, (%7)\n"
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride), "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride), "r"(p + 7 * stride) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * In-place add/subtract butterfly transform of the first 2^depth doubles of
 * buf: every power-of-two stride from 1 up to 2^(depth - 1) is applied once,
 * without any scaling (an unnormalized, Hadamard-ordered Walsh-Hadamard
 * transform).
 *
 * buf    data to transform; must hold at least 2^depth doubles.  No
 *        particular alignment is needed (all accesses are unaligned moves).
 * depth  log2 of the length.  Handled values are 6, 9, 12, ..., 27; any
 *        other value leaves buf untouched.
 *
 * depth == 6 is the base case (64 doubles).  Every larger depth transforms
 * eight sub-blocks of 2^(depth - 3) doubles recursively and then merges them
 * with three more butterfly levels.
 */
void helper_double_27_recursive(double *buf, int depth) {
  if (depth == 6) {
    /* strides 1..8 inside each 16-double group */
    for (int j = 0; j < 64; j += 16) {
      helper_double_27_wht16(buf + j);
    }
    /* strides 16 and 32 across the four groups */
    for (int k = 0; k < 16; k += 2) {
      helper_double_27_radix4(buf + k, 16);
    }
    return;
  }

  if (depth < 9 || depth > 27 || depth % 3 != 0) {
    return; /* not a depth this routine implements */
  }

  /*
   * Sub-block length.  The largest element offset formed below is
   * 7 * 2^24 + 2^24 - 2 (depth 27), which fits comfortably in an int.
   */
  const int stride = 1 << (depth - 3);

  for (int block = 0; block < 8; ++block) {
    helper_double_27_recursive(buf + block * stride, depth - 3);
  }
  for (int k = 0; k < stride; k += 2) {
    helper_double_27_radix8(buf + k, stride);
  }
}
/*
 * Top-level entry for the depth-27 transform on doubles: hands buf to
 * helper_double_27_recursive, starting at recursion depth 27.  At that depth
 * the recursive routine works in place on 2^27 (134217728) doubles, so buf
 * must be at least that long.
 */
void helper_double_27(double *buf);
void helper_double_27(double *buf) {
  const int top_depth = 27;

  helper_double_27_recursive(buf, top_depth);
}
/*
 * Building blocks for helper_double_28_recursive.
 *
 * Every kernel works on "lanes" of two adjacent doubles held in one XMM
 * register and performs sum/difference butterflies between lanes, writing the
 * results back to the addresses the inputs were loaded from. The clobber lists
 * deliberately name all sixteen XMM registers plus "memory", exactly as the
 * fully unrolled code did.
 */

/*
 * Transforms one run of 16 consecutive doubles starting at `run`.
 * Each of the eight lanes first gets the butterfly between its own two
 * elements ([a, b] -> [a + b, a - b]); the lanes are then combined by the same
 * three butterfly stages used in helper_double_28_radix8.
 */
static inline void helper_double_28_leaf16(double *run) {
  double *const p0 = run;
  double *const p1 = run + 2;
  double *const p2 = run + 4;
  double *const p3 = run + 6;
  double *const p4 = run + 8;
  double *const p5 = run + 10;
  double *const p6 = run + 12;
  double *const p7 = run + 14;
  __asm__ volatile (
    /* load the eight lanes */
    "movupd (%[p0]), %%xmm0\n"
    "movupd (%[p1]), %%xmm1\n"
    "movupd (%[p2]), %%xmm2\n"
    "movupd (%[p3]), %%xmm3\n"
    "movupd (%[p4]), %%xmm4\n"
    "movupd (%[p5]), %%xmm5\n"
    "movupd (%[p6]), %%xmm6\n"
    "movupd (%[p7]), %%xmm7\n"
    /* lane 0: [a, b] -> [a + b, a - b] */
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm0\n"
    /* lane 1 */
    "movapd %%xmm1, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm1, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm1\n"
    /* lane 2 */
    "movapd %%xmm2, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm2, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm2\n"
    /* lane 3 */
    "movapd %%xmm3, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm3, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm3\n"
    /* lane 4 */
    "movapd %%xmm4, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm4, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm4\n"
    /* lane 5 */
    "movapd %%xmm5, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm5, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm5\n"
    /* lane 6 */
    "movapd %%xmm6, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm6, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm6\n"
    /* lane 7 */
    "movapd %%xmm7, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm7, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm7\n"
    /* stage 1: neighbouring lanes (0,1) (2,3) (4,5) (6,7) */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm4, %%xmm12\n"
    "movapd %%xmm4, %%xmm13\n"
    "addpd %%xmm5, %%xmm12\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm6, %%xmm14\n"
    "movapd %%xmm6, %%xmm15\n"
    "addpd %%xmm7, %%xmm14\n"
    "subpd %%xmm7, %%xmm15\n"
    /* stage 2: lanes two apart */
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movapd %%xmm12, %%xmm4\n"
    "movapd %%xmm12, %%xmm6\n"
    "addpd %%xmm14, %%xmm4\n"
    "subpd %%xmm14, %%xmm6\n"
    "movapd %%xmm13, %%xmm5\n"
    "movapd %%xmm13, %%xmm7\n"
    "addpd %%xmm15, %%xmm5\n"
    "subpd %%xmm15, %%xmm7\n"
    /* stage 3: lanes four apart */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm12\n"
    "addpd %%xmm4, %%xmm8\n"
    "subpd %%xmm4, %%xmm12\n"
    "movapd %%xmm1, %%xmm9\n"
    "movapd %%xmm1, %%xmm13\n"
    "addpd %%xmm5, %%xmm9\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm14\n"
    "addpd %%xmm6, %%xmm10\n"
    "subpd %%xmm6, %%xmm14\n"
    "movapd %%xmm3, %%xmm11\n"
    "movapd %%xmm3, %%xmm15\n"
    "addpd %%xmm7, %%xmm11\n"
    "subpd %%xmm7, %%xmm15\n"
    /* write the lanes back in place */
    "movupd %%xmm8, (%[p0])\n"
    "movupd %%xmm9, (%[p1])\n"
    "movupd %%xmm10, (%[p2])\n"
    "movupd %%xmm11, (%[p3])\n"
    "movupd %%xmm12, (%[p4])\n"
    "movupd %%xmm13, (%[p5])\n"
    "movupd %%xmm14, (%[p6])\n"
    "movupd %%xmm15, (%[p7])\n"
    :: [p0] "r"(p0), [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), [p7] "r"(p7) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Three butterfly stages across eight lanes located at
 * lane, lane + stride, ..., lane + 7 * stride (stride counted in doubles).
 */
static inline void helper_double_28_radix8(double *lane, int stride) {
  double *const p0 = lane;
  double *const p1 = p0 + stride;
  double *const p2 = p1 + stride;
  double *const p3 = p2 + stride;
  double *const p4 = p3 + stride;
  double *const p5 = p4 + stride;
  double *const p6 = p5 + stride;
  double *const p7 = p6 + stride;
  __asm__ volatile (
    /* load the eight lanes */
    "movupd (%[p0]), %%xmm0\n"
    "movupd (%[p1]), %%xmm1\n"
    "movupd (%[p2]), %%xmm2\n"
    "movupd (%[p3]), %%xmm3\n"
    "movupd (%[p4]), %%xmm4\n"
    "movupd (%[p5]), %%xmm5\n"
    "movupd (%[p6]), %%xmm6\n"
    "movupd (%[p7]), %%xmm7\n"
    /* stage 1: lanes one stride apart */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm4, %%xmm12\n"
    "movapd %%xmm4, %%xmm13\n"
    "addpd %%xmm5, %%xmm12\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm6, %%xmm14\n"
    "movapd %%xmm6, %%xmm15\n"
    "addpd %%xmm7, %%xmm14\n"
    "subpd %%xmm7, %%xmm15\n"
    /* stage 2: lanes two strides apart */
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movapd %%xmm12, %%xmm4\n"
    "movapd %%xmm12, %%xmm6\n"
    "addpd %%xmm14, %%xmm4\n"
    "subpd %%xmm14, %%xmm6\n"
    "movapd %%xmm13, %%xmm5\n"
    "movapd %%xmm13, %%xmm7\n"
    "addpd %%xmm15, %%xmm5\n"
    "subpd %%xmm15, %%xmm7\n"
    /* stage 3: lanes four strides apart */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm12\n"
    "addpd %%xmm4, %%xmm8\n"
    "subpd %%xmm4, %%xmm12\n"
    "movapd %%xmm1, %%xmm9\n"
    "movapd %%xmm1, %%xmm13\n"
    "addpd %%xmm5, %%xmm9\n"
    "subpd %%xmm5, %%xmm13\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm14\n"
    "addpd %%xmm6, %%xmm10\n"
    "subpd %%xmm6, %%xmm14\n"
    "movapd %%xmm3, %%xmm11\n"
    "movapd %%xmm3, %%xmm15\n"
    "addpd %%xmm7, %%xmm11\n"
    "subpd %%xmm7, %%xmm15\n"
    /* write the lanes back in place */
    "movupd %%xmm8, (%[p0])\n"
    "movupd %%xmm9, (%[p1])\n"
    "movupd %%xmm10, (%[p2])\n"
    "movupd %%xmm11, (%[p3])\n"
    "movupd %%xmm12, (%[p4])\n"
    "movupd %%xmm13, (%[p5])\n"
    "movupd %%xmm14, (%[p6])\n"
    "movupd %%xmm15, (%[p7])\n"
    :: [p0] "r"(p0), [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), [p7] "r"(p7) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/* Two butterfly stages across four lanes spaced `stride` doubles apart. */
static inline void helper_double_28_radix4(double *lane, int stride) {
  double *const p0 = lane;
  double *const p1 = p0 + stride;
  double *const p2 = p1 + stride;
  double *const p3 = p2 + stride;
  __asm__ volatile (
    "movupd (%[p0]), %%xmm0\n"
    "movupd (%[p1]), %%xmm1\n"
    "movupd (%[p2]), %%xmm2\n"
    "movupd (%[p3]), %%xmm3\n"
    /* stage 1: lanes one stride apart */
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    /* stage 2: lanes two strides apart */
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movupd %%xmm0, (%[p0])\n"
    "movupd %%xmm1, (%[p1])\n"
    "movupd %%xmm2, (%[p2])\n"
    "movupd %%xmm3, (%[p3])\n"
    :: [p0] "r"(p0), [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/* A single butterfly between the lane at `lane` and the one `stride` doubles later. */
static inline void helper_double_28_radix2(double *lane, int stride) {
  double *const p0 = lane;
  double *const p1 = p0 + stride;
  __asm__ volatile (
    "movupd (%[p0]), %%xmm0\n"
    "movupd (%[p1]), %%xmm1\n"
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movupd %%xmm8, (%[p0])\n"
    "movupd %%xmm9, (%[p1])\n"
    :: [p0] "r"(p0), [p1] "r"(p1) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Applies helper_double_28_radix8 to every lane of buf[0 .. span): the buffer
 * is walked in groups of 8 * stride doubles, and inside each group every
 * 2-double lane of the first `stride` doubles is used as a starting lane.
 */
static inline void helper_double_28_radix8_pass(double *buf, int span, int stride) {
  const int group = 8 * stride;
  for (int base = 0; base < span; base += group) {
    for (int off = 0; off < stride; off += 2) {
      helper_double_28_radix8(buf + base + off, stride);
    }
  }
}

/*
 * In-place sum/difference butterfly network over the first 2^depth doubles
 * of buf, split recursively so the leaf works on 16384 doubles at a time.
 *
 *   depth 14          leaf: transforms buf[0 .. 16384) completely.
 *   depth 17/20/23/26 transforms the eight sub-blocks of 2^(depth-3) doubles
 *                     one after another, then merges them with one radix-8
 *                     pass whose stride is the sub-block length.
 *   depth 28          transforms the four quarters at depth 26, then merges
 *                     them with a radix-4 pass.
 *
 * Any other depth leaves buf untouched. No scaling is applied to the result.
 */
void helper_double_28_recursive(double *buf, int depth);
void helper_double_28_recursive(double *buf, int depth) {
  switch (depth) {
  case 14:
    /* strides 1, 2, 4, 8 */
    for (int base = 0; base < 16384; base += 16) {
      helper_double_28_leaf16(buf + base);
    }
    /* strides 16..64, 128..512, 1024..4096 */
    helper_double_28_radix8_pass(buf, 16384, 16);
    helper_double_28_radix8_pass(buf, 16384, 128);
    helper_double_28_radix8_pass(buf, 16384, 1024);
    /* stride 8192 */
    for (int off = 0; off < 8192; off += 2) {
      helper_double_28_radix2(buf + off, 8192);
    }
    return;

  case 17:
  case 20:
  case 23:
  case 26: {
    const int child_depth = depth - 3;
    const int child_len = 1 << child_depth;
    for (int part = 0; part < 8; ++part) {
      helper_double_28_recursive(buf + part * child_len, child_depth);
    }
    helper_double_28_radix8_pass(buf, 8 * child_len, child_len);
    return;
  }

  case 28: {
    const int quarter = 67108864;
    for (int part = 0; part < 4; ++part) {
      helper_double_28_recursive(buf + part * quarter, 26);
    }
    for (int off = 0; off < quarter; off += 2) {
      helper_double_28_radix4(buf + off, quarter);
    }
    return;
  }

  default:
    return;
  }
}
/*
 * Non-static entry point for the depth-28 case: forwards buf to
 * helper_double_28_recursive with depth fixed at 28. Returns nothing; the
 * result is written back into buf.
 *
 * buf must provide 268435456 (2^28) doubles: the depth-28 branch of
 * helper_double_28_recursive loads from and stores to that whole range.
 * The accesses use unaligned moves (movupd), so no particular alignment of
 * buf is required by the instructions themselves.
 */
void helper_double_28(double *buf);
void helper_double_28(double *buf) {
  helper_double_28_recursive(buf, 28);
}
void helper_double_29_recursive(double *buf, int depth);

/*
 * Shared inline-assembly fragments for helper_double_29_recursive.
 *
 * Every kernel below works on 2-double SSE vectors. The instruction
 * sequences require x86-64 (xmm8..xmm15), SSE3 (haddpd/hsubpd) and SSE4.1
 * (blendpd). All memory accesses use movupd, so no alignment is required.
 * The fragments are #undef'd again right after the function.
 */

/* Registers and memory declared as clobbered by every asm statement. */
#define HELPER_DOUBLE_29_CLOBBERS \
  "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
  "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", \
  "%xmm15", "memory"

/* Load the vectors addressed by operands %0..%7 into xmm0..xmm7. */
#define HELPER_DOUBLE_29_LOAD8 \
  "movupd (%0), %%xmm0\n" \
  "movupd (%1), %%xmm1\n" \
  "movupd (%2), %%xmm2\n" \
  "movupd (%3), %%xmm3\n" \
  "movupd (%4), %%xmm4\n" \
  "movupd (%5), %%xmm5\n" \
  "movupd (%6), %%xmm6\n" \
  "movupd (%7), %%xmm7\n"

/*
 * In-register butterfly on the two lanes of `reg`:
 * (lo, hi) -> (lo + hi, lo - hi). Uses xmm8 and xmm9 as scratch.
 */
#define HELPER_DOUBLE_29_PAIR(reg) \
  "movapd %%" reg ", %%xmm8\n" \
  "haddpd %%xmm8, %%xmm8\n" \
  "movapd %%" reg ", %%xmm9\n" \
  "hsubpd %%xmm9, %%xmm9\n" \
  "blendpd $1, %%xmm8, %%xmm9\n" \
  "movapd %%xmm9, %%" reg "\n"

/*
 * Three add/subtract stages across eight vectors.
 * Input: xmm0..xmm7. Output: xmm8..xmm15 (output i belongs at operand i).
 */
#define HELPER_DOUBLE_29_BUTTERFLY8 \
  "movapd %%xmm0, %%xmm8\n" \
  "movapd %%xmm0, %%xmm9\n" \
  "addpd %%xmm1, %%xmm8\n" \
  "subpd %%xmm1, %%xmm9\n" \
  "movapd %%xmm2, %%xmm10\n" \
  "movapd %%xmm2, %%xmm11\n" \
  "addpd %%xmm3, %%xmm10\n" \
  "subpd %%xmm3, %%xmm11\n" \
  "movapd %%xmm4, %%xmm12\n" \
  "movapd %%xmm4, %%xmm13\n" \
  "addpd %%xmm5, %%xmm12\n" \
  "subpd %%xmm5, %%xmm13\n" \
  "movapd %%xmm6, %%xmm14\n" \
  "movapd %%xmm6, %%xmm15\n" \
  "addpd %%xmm7, %%xmm14\n" \
  "subpd %%xmm7, %%xmm15\n" \
  "movapd %%xmm8, %%xmm0\n" \
  "movapd %%xmm8, %%xmm2\n" \
  "addpd %%xmm10, %%xmm0\n" \
  "subpd %%xmm10, %%xmm2\n" \
  "movapd %%xmm9, %%xmm1\n" \
  "movapd %%xmm9, %%xmm3\n" \
  "addpd %%xmm11, %%xmm1\n" \
  "subpd %%xmm11, %%xmm3\n" \
  "movapd %%xmm12, %%xmm4\n" \
  "movapd %%xmm12, %%xmm6\n" \
  "addpd %%xmm14, %%xmm4\n" \
  "subpd %%xmm14, %%xmm6\n" \
  "movapd %%xmm13, %%xmm5\n" \
  "movapd %%xmm13, %%xmm7\n" \
  "addpd %%xmm15, %%xmm5\n" \
  "subpd %%xmm15, %%xmm7\n" \
  "movapd %%xmm0, %%xmm8\n" \
  "movapd %%xmm0, %%xmm12\n" \
  "addpd %%xmm4, %%xmm8\n" \
  "subpd %%xmm4, %%xmm12\n" \
  "movapd %%xmm1, %%xmm9\n" \
  "movapd %%xmm1, %%xmm13\n" \
  "addpd %%xmm5, %%xmm9\n" \
  "subpd %%xmm5, %%xmm13\n" \
  "movapd %%xmm2, %%xmm10\n" \
  "movapd %%xmm2, %%xmm14\n" \
  "addpd %%xmm6, %%xmm10\n" \
  "subpd %%xmm6, %%xmm14\n" \
  "movapd %%xmm3, %%xmm11\n" \
  "movapd %%xmm3, %%xmm15\n" \
  "addpd %%xmm7, %%xmm11\n" \
  "subpd %%xmm7, %%xmm15\n"

/* Store xmm8..xmm15 to the addresses in operands %0..%7. */
#define HELPER_DOUBLE_29_STORE8 \
  "movupd %%xmm8, (%0)\n" \
  "movupd %%xmm9, (%1)\n" \
  "movupd %%xmm10, (%2)\n" \
  "movupd %%xmm11, (%3)\n" \
  "movupd %%xmm12, (%4)\n" \
  "movupd %%xmm13, (%5)\n" \
  "movupd %%xmm14, (%6)\n" \
  "movupd %%xmm15, (%7)\n"

/*
 * First pass over one group of 16 consecutive doubles at p: an in-register
 * butterfly inside each 2-double vector, followed by the three-stage
 * butterfly across the eight vectors. Reads and writes p[0..15].
 */
static inline void helper_double_29_first_pass(double *p) {
  __asm__ volatile (
    HELPER_DOUBLE_29_LOAD8
    HELPER_DOUBLE_29_PAIR("xmm0")
    HELPER_DOUBLE_29_PAIR("xmm1")
    HELPER_DOUBLE_29_PAIR("xmm2")
    HELPER_DOUBLE_29_PAIR("xmm3")
    HELPER_DOUBLE_29_PAIR("xmm4")
    HELPER_DOUBLE_29_PAIR("xmm5")
    HELPER_DOUBLE_29_PAIR("xmm6")
    HELPER_DOUBLE_29_PAIR("xmm7")
    HELPER_DOUBLE_29_BUTTERFLY8
    HELPER_DOUBLE_29_STORE8
    :: "r"(p), "r"(p + 2), "r"(p + 4), "r"(p + 6),
       "r"(p + 8), "r"(p + 10), "r"(p + 12), "r"(p + 14)
    : HELPER_DOUBLE_29_CLOBBERS
  );
}

/*
 * Three-stage butterfly across the eight 2-double vectors located at
 * p + i * stride for i = 0..7 (stride counted in doubles). Results are
 * written back to the same eight locations.
 */
static inline void helper_double_29_radix8(double *p, long stride) {
  __asm__ volatile (
    HELPER_DOUBLE_29_LOAD8
    HELPER_DOUBLE_29_BUTTERFLY8
    HELPER_DOUBLE_29_STORE8
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride),
       "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride),
       "r"(p + 7 * stride)
    : HELPER_DOUBLE_29_CLOBBERS
  );
}

/*
 * Two-stage butterfly across the four 2-double vectors located at
 * p + i * stride for i = 0..3 (stride counted in doubles). Results are
 * written back to the same four locations.
 */
static inline void helper_double_29_radix4(double *p, long stride) {
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    "movupd (%2), %%xmm2\n"
    "movupd (%3), %%xmm3\n"
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movupd %%xmm0, (%0)\n"
    "movupd %%xmm1, (%1)\n"
    "movupd %%xmm2, (%2)\n"
    "movupd %%xmm3, (%3)\n"
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride)
    : HELPER_DOUBLE_29_CLOBBERS
  );
}

/*
 * Applies `depth` levels of in-place add/subtract butterflies (unscaled)
 * to the first 2^depth doubles of buf.
 *
 * buf:   array of at least 2^depth doubles; overwritten with the result.
 * depth: one of 9, 12, 15, 18, 21, 24, 27 or 29. Any other value leaves
 *        buf untouched and returns silently.
 *
 * Structure:
 *   - depth 9 is the base case, done in three passes over 512 doubles:
 *     4 levels (groups of 16), 3 levels (stride 16), 2 levels (stride 128).
 *   - depths 12..27 transform eight sub-blocks of 2^(depth-3) doubles
 *     recursively and then combine them with one 8-input pass.
 *   - depth 29 transforms four depth-27 sub-blocks and combines them with
 *     one 4-input pass.
 */
void helper_double_29_recursive(double *buf, int depth) {
  switch (depth) {
  case 9:
    for (long group = 0; group < 512; group += 16) {
      helper_double_29_first_pass(buf + group);
    }
    for (long block = 0; block < 512; block += 128) {
      for (long k = 0; k < 16; k += 2) {
        helper_double_29_radix8(buf + block + k, 16);
      }
    }
    for (long k = 0; k < 128; k += 2) {
      helper_double_29_radix4(buf + k, 128);
    }
    return;

  case 12:
  case 15:
  case 18:
  case 21:
  case 24:
  case 27: {
    /* Size of each of the eight sub-blocks, in doubles. */
    const long stride = 1L << (depth - 3);
    for (int part = 0; part < 8; ++part) {
      helper_double_29_recursive(buf + part * stride, depth - 3);
    }
    /* k advances by 2 because every vector covers two doubles. */
    for (long k = 0; k < stride; k += 2) {
      helper_double_29_radix8(buf + k, stride);
    }
    return;
  }

  case 29: {
    /* Only two levels remain above depth 27, hence a 4-input pass. */
    const long stride = 1L << 27;
    for (int part = 0; part < 4; ++part) {
      helper_double_29_recursive(buf + part * stride, 27);
    }
    for (long k = 0; k < stride; k += 2) {
      helper_double_29_radix4(buf + k, stride);
    }
    return;
  }

  default:
    /* Unsupported depth: deliberately a no-op. */
    return;
  }
}

#undef HELPER_DOUBLE_29_STORE8
#undef HELPER_DOUBLE_29_BUTTERFLY8
#undef HELPER_DOUBLE_29_PAIR
#undef HELPER_DOUBLE_29_LOAD8
#undef HELPER_DOUBLE_29_CLOBBERS
void helper_double_29(double *buf);
/*
 * Top-level entry for the 29-level kernel: runs the depth-29 case of
 * helper_double_29_recursive on buf, in place.
 *
 * The final pass of that case reads and writes buf[0] .. buf[2^29 - 1], so
 * buf must provide at least 536870912 doubles. No alignment is required
 * by that pass (it uses unaligned loads and stores).
 */
void helper_double_29(double *buf) {
  enum { HELPER_DOUBLE_29_TOP_DEPTH = 29 };
  helper_double_29_recursive(buf, HELPER_DOUBLE_29_TOP_DEPTH);
}
/*
 * Leaf kernel: in-place 8-point sum/difference transform of buf[0..7].
 *
 * Each of the four 2-double vectors is first turned into (a + b, a - b)
 * with haddpd/hsubpd/blendpd, then the four vectors are combined with two
 * rounds of add/sub butterflies and written back to where they came from.
 */
static inline void helper_double_30_leaf8(double *buf) {
  __asm__ volatile (
    "movupd (%0), %%xmm0\n"
    "movupd (%1), %%xmm1\n"
    "movupd (%2), %%xmm2\n"
    "movupd (%3), %%xmm3\n"
    "movapd %%xmm0, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm0\n"
    "movapd %%xmm1, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm1, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm2, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm2, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm2\n"
    "movapd %%xmm3, %%xmm8\n"
    "haddpd %%xmm8, %%xmm8\n"
    "movapd %%xmm3, %%xmm9\n"
    "hsubpd %%xmm9, %%xmm9\n"
    "blendpd $1, %%xmm8, %%xmm9\n"
    "movapd %%xmm9, %%xmm3\n"
    "movapd %%xmm0, %%xmm8\n"
    "movapd %%xmm0, %%xmm9\n"
    "addpd %%xmm1, %%xmm8\n"
    "subpd %%xmm1, %%xmm9\n"
    "movapd %%xmm2, %%xmm10\n"
    "movapd %%xmm2, %%xmm11\n"
    "addpd %%xmm3, %%xmm10\n"
    "subpd %%xmm3, %%xmm11\n"
    "movapd %%xmm8, %%xmm0\n"
    "movapd %%xmm8, %%xmm2\n"
    "addpd %%xmm10, %%xmm0\n"
    "subpd %%xmm10, %%xmm2\n"
    "movapd %%xmm9, %%xmm1\n"
    "movapd %%xmm9, %%xmm3\n"
    "addpd %%xmm11, %%xmm1\n"
    "subpd %%xmm11, %%xmm3\n"
    "movupd %%xmm0, (%0)\n"
    "movupd %%xmm1, (%1)\n"
    "movupd %%xmm2, (%2)\n"
    "movupd %%xmm3, (%3)\n"
    :: "r"(buf), "r"(buf + 2), "r"(buf + 4), "r"(buf + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
  );
}

/*
 * Combine pass: three rounds of add/sub butterflies across eight
 * sub-blocks of `stride` doubles each, starting at buf.
 *
 * For every even offset below stride, one 2-double vector is loaded from
 * each sub-block, the eight vectors are mixed, and the results are stored
 * back to the same eight addresses.
 */
static inline void helper_double_30_combine8(double *buf, int stride) {
  for (int off = 0; off < stride; off += 2) {
    double *lane = buf + off;
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movupd (%1), %%xmm1\n"
      "movupd (%2), %%xmm2\n"
      "movupd (%3), %%xmm3\n"
      "movupd (%4), %%xmm4\n"
      "movupd (%5), %%xmm5\n"
      "movupd (%6), %%xmm6\n"
      "movupd (%7), %%xmm7\n"
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "addpd %%xmm1, %%xmm8\n"
      "subpd %%xmm1, %%xmm9\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm11\n"
      "addpd %%xmm3, %%xmm10\n"
      "subpd %%xmm3, %%xmm11\n"
      "movapd %%xmm4, %%xmm12\n"
      "movapd %%xmm4, %%xmm13\n"
      "addpd %%xmm5, %%xmm12\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm6, %%xmm14\n"
      "movapd %%xmm6, %%xmm15\n"
      "addpd %%xmm7, %%xmm14\n"
      "subpd %%xmm7, %%xmm15\n"
      "movapd %%xmm8, %%xmm0\n"
      "movapd %%xmm8, %%xmm2\n"
      "addpd %%xmm10, %%xmm0\n"
      "subpd %%xmm10, %%xmm2\n"
      "movapd %%xmm9, %%xmm1\n"
      "movapd %%xmm9, %%xmm3\n"
      "addpd %%xmm11, %%xmm1\n"
      "subpd %%xmm11, %%xmm3\n"
      "movapd %%xmm12, %%xmm4\n"
      "movapd %%xmm12, %%xmm6\n"
      "addpd %%xmm14, %%xmm4\n"
      "subpd %%xmm14, %%xmm6\n"
      "movapd %%xmm13, %%xmm5\n"
      "movapd %%xmm13, %%xmm7\n"
      "addpd %%xmm15, %%xmm5\n"
      "subpd %%xmm15, %%xmm7\n"
      "movapd %%xmm0, %%xmm8\n"
      "movapd %%xmm0, %%xmm12\n"
      "addpd %%xmm4, %%xmm8\n"
      "subpd %%xmm4, %%xmm12\n"
      "movapd %%xmm1, %%xmm9\n"
      "movapd %%xmm1, %%xmm13\n"
      "addpd %%xmm5, %%xmm9\n"
      "subpd %%xmm5, %%xmm13\n"
      "movapd %%xmm2, %%xmm10\n"
      "movapd %%xmm2, %%xmm14\n"
      "addpd %%xmm6, %%xmm10\n"
      "subpd %%xmm6, %%xmm14\n"
      "movapd %%xmm3, %%xmm11\n"
      "movapd %%xmm3, %%xmm15\n"
      "addpd %%xmm7, %%xmm11\n"
      "subpd %%xmm7, %%xmm15\n"
      "movupd %%xmm8, (%0)\n"
      "movupd %%xmm9, (%1)\n"
      "movupd %%xmm10, (%2)\n"
      "movupd %%xmm11, (%3)\n"
      "movupd %%xmm12, (%4)\n"
      "movupd %%xmm13, (%5)\n"
      "movupd %%xmm14, (%6)\n"
      "movupd %%xmm15, (%7)\n"
      :: "r"(lane), "r"(lane + stride), "r"(lane + 2 * stride), "r"(lane + 3 * stride), "r"(lane + 4 * stride), "r"(lane + 5 * stride), "r"(lane + 6 * stride), "r"(lane + 7 * stride) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}

/*
 * Recursive in-place transform over 2^depth doubles starting at buf.
 *
 * Handled depths are 3, 6, 9, ..., 30:
 *   - depth 3 runs the 8-point leaf kernel on buf[0..7];
 *   - any larger handled depth first transforms its eight sub-blocks of
 *     2^(depth - 3) doubles (in ascending address order) and then runs the
 *     combine pass across them.
 * Every other depth value returns immediately without touching buf.
 */
void helper_double_30_recursive(double *buf, int depth);
void helper_double_30_recursive(double *buf, int depth) {
  if (depth == 3) {
    helper_double_30_leaf8(buf);
    return;
  }
  /* Only multiples of three in [6, 30] have a combine stage. */
  if (depth < 6 || depth > 30 || depth % 3 != 0) {
    return;
  }
  /* Sub-block length; at most 1 << 27, so 7 * stride still fits in int. */
  const int stride = 1 << (depth - 3);
  for (int part = 0; part < 8; ++part) {
    helper_double_30_recursive(buf + part * stride, depth - 3);
  }
  helper_double_30_combine8(buf, stride);
}
/*
 * Top-level driver for the depth-30 double transform.
 *
 * Forwards buf unchanged to helper_double_30_recursive, starting the
 * recursion at depth 30.  The work is done in place; nothing is returned.
 */
void helper_double_30(double *buf);
void helper_double_30(double *buf) {
  enum { TOP_DEPTH = 30 };
  helper_double_30_recursive(buf, TOP_DEPTH);
}
/*
 * Dispatch a double-precision transform to the kernel specialised for
 * the requested size.
 *
 * buf   - data handed unchanged to the selected helper_double_<log_n>
 *         kernel (presumably 2^log_n doubles transformed in place;
 *         confirm against fht.h).
 * log_n - size selector; values 0 through 30 are supported.
 *
 * Returns 0 when log_n is in [0, 30] (log_n == 0 touches nothing),
 * and 1 for any other value, in which case buf is not accessed.
 */
int fht_double(double *buf, int log_n) {
  switch (log_n) {
    case 0:
      /* Nothing to do: no kernel is invoked for this size. */
      break;
    case 1:
      helper_double_1(buf);
      break;
    case 2:
      helper_double_2(buf);
      break;
    case 3:
      helper_double_3(buf);
      break;
    case 4:
      helper_double_4(buf);
      break;
    case 5:
      helper_double_5(buf);
      break;
    case 6:
      helper_double_6(buf);
      break;
    case 7:
      helper_double_7(buf);
      break;
    case 8:
      helper_double_8(buf);
      break;
    case 9:
      helper_double_9(buf);
      break;
    case 10:
      helper_double_10(buf);
      break;
    case 11:
      helper_double_11(buf);
      break;
    case 12:
      helper_double_12(buf);
      break;
    case 13:
      helper_double_13(buf);
      break;
    case 14:
      helper_double_14(buf);
      break;
    case 15:
      helper_double_15(buf);
      break;
    case 16:
      helper_double_16(buf);
      break;
    case 17:
      helper_double_17(buf);
      break;
    case 18:
      helper_double_18(buf);
      break;
    case 19:
      helper_double_19(buf);
      break;
    case 20:
      helper_double_20(buf);
      break;
    case 21:
      helper_double_21(buf);
      break;
    case 22:
      helper_double_22(buf);
      break;
    case 23:
      helper_double_23(buf);
      break;
    case 24:
      helper_double_24(buf);
      break;
    case 25:
      helper_double_25(buf);
      break;
    case 26:
      helper_double_26(buf);
      break;
    case 27:
      helper_double_27(buf);
      break;
    case 28:
      helper_double_28(buf);
      break;
    case 29:
      helper_double_29(buf);
      break;
    case 30:
      helper_double_30(buf);
      break;
    default:
      /* Unsupported size: report failure without touching buf. */
      return 1;
  }
  return 0;
}
