#pragma once
/**
        @brief fast activation library for float
        @author herumi
        @url https://github.com/herumi/fmath/
        @note modified new BSD license
        http://opensource.org/licenses/BSD-3-Clause

        cl /Ox /Ob2 /arch:SSE2 /fp:fast bench.cpp -I../xbyak /EHsc /DNOMINMAX
        g++ -O3 -fomit-frame-pointer -fno-operator-names -march=core2 -mssse3
   -mfpmath=sse -ffast-math -fexcess-precision=fast
*/
/*
        function prototype list

        float fmath::exp(float);
        double fmath::expd(double);
        float fmath::log(float);

        __m128 fmath::exp_ps(__m128);
        __m256 fmath::exp_ps256(__m256);
        __m128 fmath::log_ps(__m128);

        void fmath::expd_v(double *, size_t n);

        if FMATH_USE_XBYAK is defined then the Xbyak versions are used
*/
// #define FMATH_USE_XBYAK

#include <assert.h>
#include <float.h>
#include <math.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>  // for memcpy
#include <limits>
#if defined(_WIN32) && !defined(__GNUC__)
#include <intrin.h>
#ifndef MIE_ALIGN
#define MIE_ALIGN(x) __declspec(align(x))
#endif
#else
#ifndef __GNUC_PREREQ
#define __GNUC_PREREQ(major, minor) \
  ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
#endif
#if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || !defined(__GNUC__)
/* GCC >= 4.4 or clang or non-GCC compilers */
#include <x86intrin.h>
#elif __GNUC_PREREQ(4, 1)
/* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header
 */
#include <emmintrin.h>
#endif
#ifndef MIE_ALIGN
#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#ifndef MIE_PACK
#define MIE_PACK(x, y, z, w) ((x)*64 + (y)*16 + (z)*4 + (w))
#endif
#ifdef FMATH_USE_XBYAK
#define XBYAK_NO_OP_NAMES
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
#endif

namespace fmath {

namespace local {

// Default template arguments for the lookup-table holders below.
// ExpVar<N>::tbl has 1 << N entries.
const size_t EXP_TABLE_SIZE = 10;
// ExpdVar<N>::tbl has 1 << N entries.
const size_t EXPD_TABLE_SIZE = 11;
// LogVar<N>::tbl has 1 << (N - 1) entries.
const size_t LOG_TABLE_SIZE = 12;

// 64-bit unsigned integer used in this file to hold the bit pattern of a double.
typedef unsigned long long uint64_t;

// Overlay of a float and an unsigned int, used to read or build the raw bit
// pattern of a float.
union fi {
  float f;         // value viewed as a float
  unsigned int i;  // same storage viewed as an unsigned integer
};

// Overlay of a double and a 64-bit unsigned integer, used to read or build the
// raw bit pattern of a double.
union di {
  double d;    // value viewed as a double
  uint64_t i;  // same storage viewed as a 64-bit unsigned integer
};

// Returns an unsigned int with its lowest x bits set.
// x is used directly as a shift count of a 32-bit value, so it must be < 32.
inline unsigned int mask(int x) {
  const unsigned int bit = 1U << x;
  return bit - 1;
}

// Returns a 64-bit value with its lowest x bits set.
// x is used directly as a shift count of a 64-bit value, so it must be < 64.
inline uint64_t mask64(int x) {
  const uint64_t bit = 1ULL << x;
  return bit - 1;
}

// Reinterprets an untyped read-only pointer as a pointer to const T.
template <class T>
inline const T* cast_to(const void* p) {
  const T* typed = static_cast<const T*>(p);
  return typed;
}

// Returns the element count of a built-in array, deduced from its type.
template <class T, size_t N>
size_t NumOfArray(const T (&)[N]) {
  const size_t count = N;
  return count;
}

/*
        exp(88.722839f) = inf ; 0x42b17218
        exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50
        exp(-103.972081f) = 0 ; 0xc2cff1b5
*/
template <size_t N = EXP_TABLE_SIZE>
struct ExpVar {
  enum {
    s = N,
    n = 1 << s,
    f88 = 0x42b00000 /* 88.0 */
  };
  float minX[8];
  float maxX[8];
  float a[8];
  float b[8];
  float f1[8];
  unsigned int i127s[8];
  unsigned int mask_s[8];
  unsigned int i7fffffff[8];
  unsigned int tbl[n];
  ExpVar() {
    float log_2 = ::logf(2.0f);
    for (int i = 0; i < 8; i++) {
      maxX[i] = 88;
      minX[i] = -88;
      a[i] = n / log_2;
      b[i] = log_2 / n;
      f1[i] = 1.0f;
      i127s[i] = 127 << s;
      i7fffffff[i] = 0x7fffffff;
      mask_s[i] = mask(s);
    }

    for (int i = 0; i < n; i++) {
      float y = pow(2.0f, (float)i / n);
      fi fi;
      fi.f = y;
      tbl[i] = fi.i & mask(23);
    }
  }
};

template <size_t sbit_ = EXPD_TABLE_SIZE>
struct ExpdVar {
  enum { sbit = sbit_, s = 1UL << sbit, adj = (1UL << (sbit + 10)) - (1UL << sbit) };
  // A = 1, B = 1, C = 1/2, D = 1/6
  double C1[2];  // A
  double C2[2];  // D
  double C3[2];  // C/D
  uint64_t tbl[s];
  double a;
  double ra;
  ExpdVar() : a(s / ::log(2.0)), ra(1 / a) {
    for (int i = 0; i < 2; i++) {
#if 0
			C1[i] = 1.0;
			C2[i] = 0.16667794882310216;
			C3[i] = 2.9997969303278795;
#else
      C1[i] = 1.0;
      C2[i] = 0.16666666685227835064;
      C3[i] = 3.0000000027955394;
#endif
    }
    for (int i = 0; i < s; i++) {
      di di;
      di.d = ::pow(2.0, i * (1.0 / s));
      tbl[i] = di.i & mask64(52);
    }
  }
};

// Constants and lookup table shared by the float log() implementations.
// m1..m5 are replicated 4 times so they can be loaded as 128-bit vectors; the
// trailing numbers are their byte offsets inside the struct.
template <size_t N = LOG_TABLE_SIZE>
struct LogVar {
  enum { LEN = N - 1 };
  unsigned int m1[4];  // 0
  unsigned int m2[4];  // 16
  unsigned int m3[4];  // 32
  float m4[4];         // 48
  unsigned int m5[4];  // 64
  struct {
    float app;  // log(x) at the table point x = 1 + k / 2^LEN
    float rev;  // multiplier applied to the remaining low mantissa bits
  } tbl[1 << LEN];
  float c_log2;  // log(2) / 2^23
  LogVar() : c_log2(::logf(2.0f) / (1 << 23)) {
    const size_t count = 1U << LEN;
    const double step = 1 / double(1 << LEN);
    const double eps = 1 / double(1 << 24);
    for (size_t k = 0; k < count; ++k) {
      const double x = 1 + double(k) / count;
      const double lnx = ::log(x);
      tbl[k].app = (float)lnx;
      if (k + 1 == count) {
        // last entry: use 1 / x instead of a finite difference
        tbl[k].rev = (float)(1 / (x * (1 << 23)));
      } else {
        const double lnNext = ::log(x + step - eps);
        tbl[k].rev = (float)((lnNext - lnx) / ((step - eps) * (1 << 23)));
      }
    }
    for (int lane = 0; lane < 4; ++lane) {
      m1[lane] = mask(8) << 23;                // exponent field
      m2[lane] = mask(LEN) << (23 - LEN);      // top LEN mantissa bits (table index)
      m3[lane] = mask(23 - LEN);               // remaining low mantissa bits
      m4[lane] = c_log2;
      m5[lane] = 127U << 23;                   // exponent bias, in place
    }
  }
};

#ifdef FMATH_USE_XBYAK
// Run-time code generator (Xbyak) for the scalar and 4-lane float exp().
// The constructor emits both routines into this generator's code buffer and
// stores their entry points in exp_ and exp_ps_. The generated code addresses
// the members of the ExpVar instance passed to the constructor by absolute
// address plus offsetof(), so that instance must outlive this object.
struct ExpCode : public Xbyak::CodeGenerator {
  float (*exp_)(float);        // entry point of the generated scalar routine
  __m128 (*exp_ps_)(__m128);   // entry point of the generated packed routine
  // Generates both routines. On any exception a message is written to stderr
  // and the process is terminated with exit status 1.
  template <size_t N>
  ExpCode(const ExpVar<N>* self) {
    Xbyak::util::Cpu cpu;
    try {
      makeExp(self, cpu);
      exp_ = getCode<float (*)(float)>();
      // the packed routine starts at the next 16-byte boundary
      align(16);
      exp_ps_ = getCurr<__m128 (*)(__m128)>();
      makeExpPs(self, cpu);
      return;
    } catch (std::exception& e) {
      fprintf(stderr, "ExpCode ERR:%s\n", e.what());
    } catch (...) {
      fprintf(stderr, "ExpCode ERR:unknown error\n");
    }
    ::exit(1);
  }
  // Emits the scalar routine. The argument is taken from xmm0 (loaded from
  // [esp + 4] in 32-bit mode) and the result is left in xmm0 (additionally
  // pushed on the x87 stack in 32-bit mode).
  template <size_t N>
  void makeExp(const ExpVar<N>* self, const Xbyak::util::Cpu& /*cpu*/) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;

    inLocalLabel();
#ifdef XBYAK64
    const Reg64& base = rcx;
    const Reg64& a = rax;
#else
    const Reg32& base = ecx;
    const Reg32& a = eax;
#endif

    // base = absolute address of the ExpVar instance
    mov(base, (size_t)self);

#ifdef XBYAK32
    movss(xm0, ptr[esp + 4]);
#endif
    L(".retry");
    movaps(xm1, xm0);
    movd(edx, xm0);
    mulss(xm1, ptr[base + offsetof(Self, a)]);  // t
    // edx = bit pattern of |x|; compared against the bit pattern of 88.0f
    and_(edx, 0x7fffffff);
    cvtss2si(eax, xm1);
    cmp(edx, ExpVar<N>::f88);
    jg(".overflow");
    lea(edx, ptr[eax + (127 << self->s)]);
    cvtsi2ss(xm1, eax);
    and_(eax, mask(self->s));                           // v
    mov(eax, ptr[base + a * 4 + offsetof(Self, tbl)]);  // expVar.tbl[v]
    shr(edx, self->s);
    mulss(xm1, ptr[base + offsetof(Self, b)]);
    shl(edx, 23);     // u
    subss(xm0, xm1);  // t
    or_(eax, edx);    // fi.f
    addss(xm0, ptr[base + offsetof(Self, f1)]);
    movd(xm1, eax);
    mulss(xm0, xm1);
#ifdef XBYAK32
    movss(ptr[esp + 4], xm0);
    fld(dword[esp + 4]);
#endif
    ret();
    // out-of-range input: clamp to [minX, maxX] and start over
    L(".overflow");
    minss(xm0, ptr[base + offsetof(Self, maxX)]);
    maxss(xm0, ptr[base + offsetof(Self, minX)]);
    jmp(".retry");
    outLocalLabel();
  }
  // Emits the packed (4 x float) routine. The argument is taken from xmm0
  // (loaded from [rcx] on 64-bit Windows with non-Intel compilers) and the
  // result is left in xmm0. pinsrd is used for the table gather when the CPU
  // reports SSE4.1, otherwise a movd/movlhps/shufps sequence is emitted.
  template <size_t N>
  void makeExpPs(const ExpVar<N>* self, const Xbyak::util::Cpu& cpu) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;

    inLocalLabel();
#ifdef XBYAK64
    const Reg64& base = rcx;
    const Reg64& a = rax;
    const Reg64& d = rdx;
#else
    const Reg32& base = ecx;
    const Reg32& a = eax;
    const Reg32& d = edx;
#endif

    /*
            if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try
            minps, maxps are very slow then avoid them
    */
    const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
    movaps(xm0, ptr[rcx]);
#endif
    mov(base, (size_t)self);
    L(".retry");
    movaps(xm5, xm0);
    andps(xm5, ptr[base + offsetof(Self, i7fffffff)]);
    movaps(xm3, ptr[base + offsetof(Self, a)]);
    movaps(xm4, ptr[base + offsetof(Self, b)]);
    // integer compare of |x| bit patterns against the bit pattern of maxX
    pcmpgtd(xm5, ptr[base + offsetof(Self, maxX)]);
    mulps(xm3, xm0);
    movaps(xm1, ptr[base + offsetof(Self, i127s)]);
    pmovmskb(eax, xm5);
    movaps(xm5, ptr[base + offsetof(Self, mask_s)]);
    cvtps2dq(xm2, xm3);
    pand(xm5, xm2);
    cvtdq2ps(xm3, xm2);
    test(eax, eax);
    jnz(".overflow");
    paddd(xm1, xm2);
    // xm5 holds the four table indices; they are extracted one at a time
    movd(eax, xm5);
    mulps(xm4, xm3);
    pextrw(edx, xm5, 2);
    subps(xm0, xm4);
    movd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)]);
    addps(xm0, ptr[base + offsetof(Self, f1)]);
    pextrw(eax, xm5, 4);
    if (useSSE41) {
      pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 1);
    } else {
      movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
      movlhps(xm4, xm3);
    }
    pextrw(edx, xm5, 6);
    psrld(xm1, self->s);
    pslld(xm1, 23);
    if (useSSE41) {
      pinsrd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)], 2);
      pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 3);
    } else {
      movd(xm2, ptr[base + a * 4 + offsetof(Self, tbl)]);
      movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
      movlhps(xm2, xm3);
      shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
    }
    por(xm1, xm4);
    mulps(xm0, xm1);
    ret();
    // at least one lane out of range: clamp all lanes and start over
    L(".overflow");
    minps(xm0, ptr[base + offsetof(Self, maxX)]);
    maxps(xm0, ptr[base + offsetof(Self, minX)]);
    jmp(".retry");
    outLocalLabel();
  }
};
#endif

/* to define static variables in fmath.hpp */
// Holder of the shared constant tables. Making them static members of a class
// template lets the header define them without a separate source file.
template <size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE,
          size_t EXPD_N = EXPD_TABLE_SIZE>
struct C {
  static const ExpVar<EXP_N> expVar;     // tables for float exp
  static const LogVar<LOG_N> logVar;     // tables for float log
  static const ExpdVar<EXPD_N> expdVar;  // tables for double exp
#ifdef FMATH_USE_XBYAK
  // Returns the generated-code object, constructed from expVar on first call.
  static const ExpCode& CreateInstance() {
    static const ExpCode expCode(&expVar);
    return expCode;
  }
#endif
};

// Out-of-class definitions of the static tables declared in C.
// Each is requested with 32-byte alignment because its arrays are read with
// aligned 128-bit and 256-bit vector loads elsewhere in this file.
template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32)
const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32)
const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32)
const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;

}  // namespace local

/*
        Table-based approximation of e^x for float.
        The input is clamped to [-88, 88] before evaluation.
        When FMATH_USE_XBYAK is defined this C++ version is named expC and
        fmath::exp refers to the generated code instead.
        Without SSE2 the function falls back to the C library's expf().
*/
#ifdef FMATH_USE_XBYAK
inline float expC(float x)
#else
inline float exp(float x)
#endif
{
  using namespace local;
  const ExpVar<>& expVar = C<>::expVar;

#if __SSE2__
  __m128 x1 = _mm_set_ss(x);

  // f88 is the BIT PATTERN of 88.0f, so the comparison must use the raw bits
  // of |x| (as exp_ps and the Xbyak code do), not x converted to an integer
  // value. Otherwise large positive inputs are never clamped and the exponent
  // field built below overflows.
  const int limit = _mm_cvtsi128_si32(_mm_castps_si128(x1)) & 0x7fffffff;
  if (limit > ExpVar<>::f88) {
    x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
    x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
  }

  // r = round(x * n / log(2)); low s bits index the table, the rest is the
  // power-of-two exponent
  int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
  unsigned int v = r & mask(expVar.s);
  // remainder after removing r * log(2) / n
  float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
  int u = r >> expVar.s;
  // assemble 2^(r / n): biased exponent | table mantissa
  fi fi;
  fi.i = ((u + 127) << 23) | expVar.tbl[v];
  // first-order correction for the remainder
  return (1 + t) * fi.f;
#else
  (void)expVar;
  // <math.h> is what this header includes, so use the global C function
  return ::expf(x);
#endif
}

/*
        Table-based approximation of e^x for double.
        Returns 0 for x <= -708.39641853226408 and +infinity for
        x >= 709.78271289338397.
*/
inline double expd(double x) {
  if (x <= -708.39641853226408) return 0;
  if (x >= 709.78271289338397) return std::numeric_limits<double>::infinity();
  using namespace local;
  const ExpdVar<>& c = C<>::expdVar;
#if 1
  // d = x * a + 3 * 2^51; the low 32 bits of d's bit pattern are then read as
  // an integer (relies on the IEEE-754 double layout)
  const double _b = double(uint64_t(3) << 51);
  __m128d b = _mm_load_sd(&_b);
  __m128d xx = _mm_load_sd(&x);
  __m128d d = _mm_add_sd(_mm_mul_sd(xx, _mm_load_sd(&c.a)), b);
  uint64_t di = _mm_cvtsi128_si32(_mm_castpd_si128(d));
  // low sbit bits select the table mantissa
  uint64_t iax = c.tbl[di & mask(c.sbit)];
  // t = (d - b) / a - x
  __m128d _t = _mm_sub_sd(_mm_mul_sd(_mm_sub_sd(d, b), _mm_load_sd(&c.ra)), xx);
  // remaining bits, offset by adj, become the exponent field
  uint64_t u = ((di + c.adj) >> c.sbit) << 52;
  double t;
  _mm_store_sd(&t, _t);
  // polynomial in t: C1 - t + C2 * (C3 - t) * t^2
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
  double did;
  u |= iax;
  memcpy(&did, &u, sizeof(did));
  return y * did;
#else
  /*
          remark : -ffast-math option of gcc may generate bad code for
     fmath::expd
  */
  const uint64_t b = 3ULL << 51;
  di di;
  di.d = x * c.a + b;
  uint64_t iax = c.tbl[di.i & mask(c.sbit)];

  double t = (di.d - b) * c.ra - x;
  uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];

  di.i = u | iax;
  return y * di.d;
#endif
}

/*
        Table-based approximation of e^x for two packed doubles.
        Each lane is clamped to [-708.39641853226408, 709.78271289338397]
        before evaluation.
*/
inline __m128d exp_pd(__m128d x) {
#if 0  // faster on Haswell
	MIE_ALIGN(16) double buf[2];
	memcpy(buf, &x, sizeof(buf));
	buf[0] = expd(buf[0]);
	buf[1] = expd(buf[1]);
	__m128d y;
	memcpy(&y, buf, sizeof(buf));
	return y;
#else  // faster on Skylake
  using namespace local;
  const ExpdVar<>& c = C<>::expdVar;
  const double b = double(3ULL << 51);
  const __m128d mC1 = *cast_to<__m128d>(c.C1);
  const __m128d mC2 = *cast_to<__m128d>(c.C2);
  const __m128d mC3 = *cast_to<__m128d>(c.C3);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
  // NOTE(review): set1_epi32 also places adj in the upper dword of each 64-bit
  // lane. Only bits [sbit, sbit + 11] of the sum survive the shift pair below,
  // so this appears harmless for the default sbit — confirm if sbit changes.
  const __m128i madj = _mm_set1_epi32(c.adj);
  MIE_ALIGN(16)
  const double expMax[2] = {709.78271289338397, 709.78271289338397};
  MIE_ALIGN(16)
  const double expMin[2] = {-708.39641853226408, -708.39641853226408};
  x = _mm_min_pd(x, *(const __m128d*)expMax);
  x = _mm_max_pd(x, *(const __m128d*)expMin);

  // d = x * a + 3 * 2^51; the low bits of each lane's bit pattern are read as
  // an integer
  __m128d d = _mm_mul_pd(x, ma);
  d = _mm_add_pd(d, _mm_set1_pd(b));
  // table indices of lane 0 and lane 1
  int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
  int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
  __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
  __m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
  iax = _mm_unpacklo_epi64(iaxL, iax);

  // t = (d - b) / a - x
  __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
  // build the exponent field and merge the table mantissa
  __m128i u = _mm_castpd_si128(d);
  u = _mm_add_epi64(u, madj);
  u = _mm_srli_epi64(u, c.sbit);
  u = _mm_slli_epi64(u, 52);
  u = _mm_or_si128(u, iax);
  // polynomial in t: C1 - t + C2 * (C3 - t) * t^2
  __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
  y = _mm_mul_pd(y, mC2);
  y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
  y = _mm_mul_pd(y, _mm_castsi128_pd(u));
  return y;
#endif
}

/*
        px : pointer to array of double
        n : size of array
*/
/*
        In-place vector exp: replaces px[i] with an approximation of
        exp(px[i]) for i in [0, n).
        px does not need any particular alignment (unaligned loads/stores are
        used). Elements that do not fill a whole vector are processed with
        fmath::expd.
*/
inline void expd_v(double* px, size_t n) {
  using namespace local;
  const ExpdVar<>& c = C<>::expdVar;
  const double b = double(3ULL << 51);
#ifdef __AVX2__
  // r = leftover element count, n = count rounded down to a multiple of 4
  size_t r = n & 3;
  n &= ~size_t(3);
  const __m256d mC1 = _mm256_set1_pd(c.C1[0]);
  const __m256d mC2 = _mm256_set1_pd(c.C2[0]);
  const __m256d mC3 = _mm256_set1_pd(c.C3[0]);
  const __m256d ma = _mm256_set1_pd(c.a);
  const __m256d mra = _mm256_set1_pd(c.ra);
  const __m256i madj = _mm256_set1_epi64x(c.adj);
  const __m256i maskSbit = _mm256_set1_epi64x(mask(c.sbit));
  const __m256d expMax = _mm256_set1_pd(709.78272569338397);
  const __m256d expMin = _mm256_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 4) {
    // the caller's buffer carries no alignment guarantee, so load unaligned
    __m256d x = _mm256_loadu_pd(px);
    x = _mm256_min_pd(x, expMax);
    x = _mm256_max_pd(x, expMin);

    // d = x * a + 3 * 2^51; low bits of each lane index the table
    __m256d d = _mm256_mul_pd(x, ma);
    d = _mm256_add_pd(d, _mm256_set1_pd(b));
    __m256i adr = _mm256_and_si256(_mm256_castpd_si256(d), maskSbit);
    __m256i iax = _mm256_i64gather_epi64((const long long*)c.tbl, adr, 8);
    // t = (d - b) / a - x
    __m256d t = _mm256_sub_pd(_mm256_mul_pd(_mm256_sub_pd(d, _mm256_set1_pd(b)), mra), x);
    // build the exponent field and merge the table mantissa
    __m256i u = _mm256_castpd_si256(d);
    u = _mm256_add_epi64(u, madj);
    u = _mm256_srli_epi64(u, c.sbit);
    u = _mm256_slli_epi64(u, 52);
    u = _mm256_or_si256(u, iax);
    // polynomial in t: C1 - t + C2 * (C3 - t) * t^2
    __m256d y = _mm256_mul_pd(_mm256_sub_pd(mC3, t), _mm256_mul_pd(t, t));
    y = _mm256_mul_pd(y, mC2);
    y = _mm256_add_pd(_mm256_sub_pd(y, t), mC1);
    _mm256_storeu_pd(px, _mm256_mul_pd(y, _mm256_castsi256_pd(u)));
    px += 4;
  }
#else
  // r = leftover element count, n = count rounded down to a multiple of 2
  size_t r = n & 1;
  n &= ~size_t(1);
  const __m128d mC1 = _mm_set1_pd(c.C1[0]);
  const __m128d mC2 = _mm_set1_pd(c.C2[0]);
  const __m128d mC3 = _mm_set1_pd(c.C3[0]);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
#if defined(__x86_64__) || defined(_WIN64)
  const __m128i madj = _mm_set1_epi64x(c.adj);
#else
  const __m128i madj = _mm_set_epi32(0, c.adj, 0, c.adj);
#endif
  const __m128d expMax = _mm_set1_pd(709.78272569338397);
  const __m128d expMin = _mm_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 2) {
    // the caller's buffer carries no alignment guarantee, so load unaligned
    __m128d x = _mm_loadu_pd(px);
    x = _mm_min_pd(x, expMax);
    x = _mm_max_pd(x, expMin);

    // d = x * a + 3 * 2^51; low bits of each lane index the table
    __m128d d = _mm_mul_pd(x, ma);
    d = _mm_add_pd(d, _mm_set1_pd(b));
    int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
    int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);

    __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
    __m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
    iax = _mm_unpacklo_epi64(iaxL, iax);

    // t = (d - b) / a - x
    __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
    // build the exponent field and merge the table mantissa
    __m128i u = _mm_castpd_si128(d);
    u = _mm_add_epi64(u, madj);
    u = _mm_srli_epi64(u, c.sbit);
    u = _mm_slli_epi64(u, 52);
    u = _mm_or_si128(u, iax);
    // polynomial in t: C1 - t + C2 * (C3 - t) * t^2
    __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
    y = _mm_mul_pd(y, mC2);
    y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
    _mm_storeu_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
    px += 2;
  }
#endif
  // px now points just past the vectorized part; finish the remainder
  for (size_t i = 0; i < r; i++) {
    px[i] = expd(px[i]);
  }
}

/*
        Table-based approximation of e^x for four packed floats.
        If any lane is outside [-88, 88], all lanes are clamped to that range.
        When FMATH_USE_XBYAK is defined this C++ version is named exp_psC.
*/
#ifdef FMATH_USE_XBYAK
inline __m128 exp_psC(__m128 x)
#else
inline __m128 exp_ps(__m128 x)
#endif
{
  using namespace local;
  const ExpVar<>& expVar = C<>::expVar;

  // compare the bit patterns of |x| against the bit pattern of maxX as integers
  __m128i limit = _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
  int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
  if (over) {
    x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
    x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
  }

  // r = round(x * a); t = 1 + (x - r * b)
  __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
  __m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
  t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));

  // v4 = table indices (low s bits of r); u4 = biased exponent moved to bit 23
  __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
  __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
  u4 = _mm_srli_epi32(u4, expVar.s);
  u4 = _mm_slli_epi32(u4, 23);

#ifdef __AVX2__  // fast?
  __m128i ti = _mm_i32gather_epi32((const int*)expVar.tbl, v4, 4);
  __m128 t0 = _mm_castsi128_ps(ti);
#else
  // extract the low 16 bits of each 32-bit lane as the table index
  unsigned int v0, v1, v2, v3;
  v0 = _mm_cvtsi128_si32(v4);
  v1 = _mm_extract_epi16(v4, 2);
  v2 = _mm_extract_epi16(v4, 4);
  v3 = _mm_extract_epi16(v4, 6);
#if 1
  __m128 t0, t1, t2, t3;

  // broadcast each table entry, then interleave them into lanes 0..3
  t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
  t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
  t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
  t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));

  t1 = _mm_movelh_ps(t1, t3);
  t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
  t0 = _mm_movelh_ps(t0, t2);
  t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
  t0 = _mm_or_ps(t0, t1);
#else
  __m128i ti = _mm_castps_si128(_mm_load_ss((const float*)&expVar.tbl[v0]));
  ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
  ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
  ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
  __m128 t0 = _mm_castsi128_ps(ti);
#endif
#endif
  // exponent | table mantissa
  t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));

  t = _mm_mul_ps(t, t0);

  return t;
}
#ifdef __AVX2__
/*
        Table-based approximation of e^x for eight packed floats (AVX2 only).
        If any lane is outside [-88, 88], all lanes are clamped to that range.
*/
inline __m256 exp_ps256(__m256 x) {
  using namespace local;
  const ExpVar<>& expVar = C<>::expVar;

  // compare the bit patterns of |x| against the bit pattern of maxX as integers
  __m256i limit =
      _mm256_castps_si256(_mm256_and_ps(x, *reinterpret_cast<const __m256*>(expVar.i7fffffff)));
  int over = _mm256_movemask_epi8(
      _mm256_cmpgt_epi32(limit, *reinterpret_cast<const __m256i*>(expVar.maxX)));
  if (over) {
    x = _mm256_min_ps(x, _mm256_load_ps(expVar.maxX));
    x = _mm256_max_ps(x, _mm256_load_ps(expVar.minX));
  }
  // r = round(x * a); t = 1 + (x - r * b)
  __m256i r = _mm256_cvtps_epi32(_mm256_mul_ps(x, *reinterpret_cast<const __m256*>(expVar.a)));
  __m256 t = _mm256_sub_ps(
      x, _mm256_mul_ps(_mm256_cvtepi32_ps(r), *reinterpret_cast<const __m256*>(expVar.b)));
  t = _mm256_add_ps(t, *reinterpret_cast<const __m256*>(expVar.f1));
  // v8 = table indices (low s bits of r); u8 = biased exponent moved to bit 23
  __m256i v8 = _mm256_and_si256(r, *reinterpret_cast<const __m256i*>(expVar.mask_s));
  __m256i u8 = _mm256_add_epi32(r, *reinterpret_cast<const __m256i*>(expVar.i127s));
  u8 = _mm256_srli_epi32(u8, expVar.s);
  u8 = _mm256_slli_epi32(u8, 23);
#if 1
  __m256i ti = _mm256_i32gather_epi32((const int*)expVar.tbl, v8, 4);
#else
  unsigned int v0, v1, v2, v3, v4, v5, v6, v7;
  v0 = _mm256_extract_epi16(v8, 0);
  v1 = _mm256_extract_epi16(v8, 2);
  v2 = _mm256_extract_epi16(v8, 4);
  v3 = _mm256_extract_epi16(v8, 6);
  v4 = _mm256_extract_epi16(v8, 8);
  v5 = _mm256_extract_epi16(v8, 10);
  v6 = _mm256_extract_epi16(v8, 12);
  v7 = _mm256_extract_epi16(v8, 14);
  __m256i ti = _mm256_setzero_si256();
  ti = _mm256_insert_epi32(ti, expVar.tbl[v0], 0);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v1], 1);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v2], 2);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v3], 3);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v4], 4);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v5], 5);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v6], 6);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v7], 7);
#endif
  // exponent | table mantissa
  __m256 t0 = _mm256_castsi256_ps(ti);
  t0 = _mm256_or_ps(t0, _mm256_castsi256_ps(u8));
  t = _mm256_mul_ps(t, t0);
  return t;
}
#endif

inline float log(float x) {
  using namespace local;
  const LogVar<>& logVar = C<>::logVar;
  const size_t logLen = logVar.LEN;

  fi fi;
  fi.f = x;
  int a = fi.i & (mask(8) << 23);
  unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen));
  unsigned int b2 = fi.i & mask(23 - logLen);
  int idx = b1 >> (23 - logLen);
  float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app +
            float(b2) * logVar.tbl[idx].rev;
  return f;
}

/*
        Table-based approximation of the natural logarithm for four packed
        floats; the per-lane computation mirrors fmath::log.
*/
inline __m128 log_ps(__m128 x) {
  using namespace local;
  const LogVar<>& logVar = C<>::logVar;

  const __m128i bits = _mm_castps_si128(x);

  // table index: top LEN mantissa bits of each lane
  const __m128i idxBits = _mm_and_si128(bits, *cast_to<__m128i>(logVar.m2));
  const __m128i idx = _mm_srli_epi32(idxBits, (23 - logVar.LEN));

  // exponent field with the bias removed, converted to float
  const __m128i expBits = _mm_and_si128(bits, *cast_to<__m128i>(logVar.m1));
  __m128 a = _mm_cvtepi32_ps(_mm_sub_epi32(expBits, *cast_to<__m128i>(logVar.m5)));

  // remaining low mantissa bits, converted to float
  const __m128 lowBits = _mm_cvtepi32_ps(_mm_and_si128(bits, *cast_to<__m128i>(logVar.m3)));

  a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4));  // c_log2

  // Each index fits in 16 bits, so the low word of every 32-bit lane is enough.
  // (An equivalent variant shifts idx right by 4 bytes and uses
  // _mm_cvtsi128_si32 for each lane.)
  const unsigned int i0 = _mm_cvtsi128_si32(idx);
  const unsigned int i1 = _mm_extract_epi16(idx, 2);
  const unsigned int i2 = _mm_extract_epi16(idx, 4);
  const unsigned int i3 = _mm_extract_epi16(idx, 6);

  // load the four {app, rev} pairs, 8 bytes each
  const __m128i e0 = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
  const __m128i e1 = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
  const __m128i e2 = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
  const __m128i e3 = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
  const __m128 pairs01 = _mm_castsi128_ps(_mm_unpacklo_epi64(e0, e1));
  const __m128 pairs23 = _mm_castsi128_ps(_mm_unpacklo_epi64(e2, e3));

  // de-interleave into a vector of app values and a vector of rev values
  const __m128 app = _mm_shuffle_ps(pairs01, pairs23, MIE_PACK(2, 0, 2, 0));
  const __m128 rev = _mm_shuffle_ps(pairs01, pairs23, MIE_PACK(3, 1, 3, 1));

  const __m128 base = _mm_add_ps(a, app);
  const __m128 frac = _mm_mul_ps(lowBits, rev);
  return _mm_add_ps(base, frac);
}

#ifndef __CYGWIN__
// Not provided on Cygwin, which defines log2() in the global namespace.
// Base-2 logarithm computed as log(x) / log(2).
inline float log2(float x) {
  const float invLn2 = 1.442695f;  // 1 / log(2)
  return fmath::log(x) * invLn2;
}
#endif

/*
        for given y > 0
        get f_y(x) := pow(x, y) for x >= 0
*/
class PowGenerator {
  enum { N = 11 };
  float tbl0_[256];
  struct {
    float app;
    float rev;
  } tbl1_[1 << N];

 public:
  PowGenerator(float y) {
    for (int i = 0; i < 256; i++) {
      tbl0_[i] = ::powf(2, (i - 127) * y);
    }
    const double e = 1 / double(1 << 24);
    const double h = 1 / double(1 << N);
    const size_t n = 1U << N;
    for (size_t i = 0; i < n; i++) {
      double x = 1 + double(i) / n;
      double a = ::pow(x, (double)y);
      tbl1_[i].app = (float)a;
      double b = ::pow(x + h - e, (double)y);
      tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23));
    }
  }
  float get(float x) const {
    using namespace local;
    fi fi;
    fi.f = x;
    int a = (fi.i >> 23) & mask(8);
    unsigned int b = fi.i & mask(23);
    unsigned int b1 = b & (mask(N) << (23 - N));
    unsigned int b2 = b & mask(23 - N);
    float f;
    int idx = b1 >> (23 - N);
    f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev);
    return f;
  }
};

// for Xbyak version
#ifdef FMATH_USE_XBYAK
// With Xbyak enabled, fmath::exp and fmath::exp_ps are constant function
// pointers to the run-time generated routines held by the ExpCode instance.
float (*const exp)(float) = local::C<>::CreateInstance().exp_;
__m128 (*const exp_ps)(__m128) = local::C<>::CreateInstance().exp_ps_;
#endif

// exp2(x) = pow(2, x)
inline float exp2(float x) {
  // 2^x = e^(x * log(2))
  const float ln2 = 0.6931472f;
  return fmath::exp(x * ln2);
}

/*
        this function may be optimized in the future
*/
// Natural logarithm of two packed doubles, computed lane by lane with the
// C library's log().
inline __m128d log_pd(__m128d x) {
  double lanes[2];
  memcpy(lanes, &x, sizeof(lanes));
  for (int k = 0; k < 2; ++k) {
    lanes[k] = ::log(lanes[k]);
  }
  __m128d result;
  memcpy(&result, lanes, sizeof(result));
  return result;
}
// pow(x, y) for four packed floats, computed as exp(y * log(x)).
inline __m128 pow_ps(__m128 x, __m128 y) {
  const __m128 lnx = log_ps(x);
  return exp_ps(_mm_mul_ps(y, lnx));
}
// pow(x, y) for two packed doubles, computed as exp(y * log(x)).
inline __m128d pow_pd(__m128d x, __m128d y) {
  const __m128d lnx = log_pd(x);
  return exp_pd(_mm_mul_pd(y, lnx));
}

/*
        Element-wise sum of two float arrays: output[i] = arr1[i] + arr2[i].
        arr1, arr2 : input arrays (no alignment requirement)
        n1, n2, n3 : element counts of arr1, arr2 and output; all three must
                     be equal (checked with assert), n1 is the count used
        output     : destination array
*/
inline void add_ps_vec(const float* arr1, size_t n1, const float* arr2, size_t n2, float* output,
                       size_t n3) {
  assert(n1 == n2 && n2 == n3);
  (void)n2;  // only read by the assert; silence unused warnings under NDEBUG
  (void)n3;
  const size_t n = n1;
  size_t j = 0;
  // All accesses are indexed from the base pointers with j. Advancing the
  // pointers as well as j would make the scalar tail below start past the
  // data that the vector loop has not processed yet.
#ifdef __AVX2__
  const size_t packet_size = 8;
  for (; j + packet_size <= n; j += packet_size) {
    const __m256 p1 = _mm256_loadu_ps(arr1 + j);
    const __m256 p2 = _mm256_loadu_ps(arr2 + j);
    _mm256_storeu_ps(output + j, _mm256_add_ps(p1, p2));
  }
#elif defined(__SSE2__)
  const size_t packet_size = 4;
  for (; j + packet_size <= n; j += packet_size) {
    const __m128 p1 = _mm_loadu_ps(arr1 + j);
    const __m128 p2 = _mm_loadu_ps(arr2 + j);
    _mm_storeu_ps(output + j, _mm_add_ps(p1, p2));
  }
#endif
  // scalar tail (or the whole array when no vector path is compiled in)
  for (; j < n; ++j) {
    output[j] = arr1[j] + arr2[j];
  }
}

}  // namespace fmath