//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// An experimental Intel AVX2 optimized kernel (sign-based int8 operands with
// int8 accumulators).
// Check in kernel_default.h which kernel(s) are actually used by default.
// Others are mere experiments; they are still covered by tests
// in case they might be useful some day.
//

#ifndef GEMMLOWP_INTERNAL_KERNEL_AVX_SIGN_INT8_ACC_GARBAGE_H_
#define GEMMLOWP_INTERNAL_KERNEL_AVX_SIGN_INT8_ACC_GARBAGE_H_

#include "kernel.h"

#include <cstring>
#include <iostream>
#include <cassert>


namespace {
// Debug-only helper, currently disabled. When re-enabled it prints the first
// 32 bytes of `arr` as integers, preceded by a line naming the buffer with
// `name`. It is meant for dumping the contents of a 256-bit register that
// the kernel below has spilled to memory.
/*
void PrintContent(std::int8_t* arr, std::string name) {
    std::cout << "The content of " << name << " is\n";
    for (int i = 0; i < 32; i++) {
        std::cout << (int)*(arr+i) << " ";
    }
    std::cout << "\n";
}
*/
}


namespace gemmlowp {

#ifdef GEMMLOWP_AVX2_64
// Assembly building blocks for the kernel below. They are plain string
// literals that get pasted into the asm template, and are #undef'd right after
// the struct so they do not leak out of this header.
//
// GEMMLOWP_SIGN_INT8_COLUMN_STEP accumulates the 16 Lhs rows currently held in
// ymm0 (32 bytes: two depth levels per row) against one Rhs column:
//   1. vpshuflw + vpermq broadcast the selected 16-bit Rhs (depth0, depth1)
//      pair of ymm1 to all 16 word positions of ymm2.
//   2. vpsignb applies the sign of every Lhs byte to the matching Rhs byte
//      (negated where the Lhs byte is negative, zeroed where it is zero).
//   3. vpshufb (swap table) + vpaddb add the two depth levels of each row, so
//      both bytes of every pair hold that row's sum.
//   4. vpshufb (even-byte table) keeps one sum per row in the low 8 bytes of
//      each 128-bit lane and zeroes the high 8 bytes.
//   5. vpermq moves the 16 sums into one 128-bit half of the register and
//      leaves the other half zero; vpaddb adds the result to the accumulator.
// Parameters (all string literals):
//   RHS_SEL   - vpshuflw immediate: "0x00", "0x55", "0xaa" or "0xff" selects
//               Rhs column 0, 1, 2 or 3.
//   HALF_PERM - vpermq immediate: "0xd8" places the sums in the low 128 bits,
//               "0x85" places them in the high 128 bits.
//   ACC       - accumulator register name, e.g. "ymm4".
#define GEMMLOWP_SIGN_INT8_COLUMN_STEP(RHS_SEL, HALF_PERM, ACC) \
  "vpshuflw $" RHS_SEL ", %%ymm1, %%ymm2 \n\t"                  \
  "vpermq $0x00, %%ymm2, %%ymm2 \n\t"                           \
  "vpsignb %%ymm0, %%ymm2, %%ymm2 \n\t"                         \
  "vpshufb (%[pshuf_swp_adj_op]), %%ymm2, %%ymm3 \n\t"          \
  "vpaddb %%ymm2, %%ymm3, %%ymm2 \n\t"                          \
  "vpshufb (%[pshuf_odd_fh_op]), %%ymm2, %%ymm2 \n\t"           \
  "vpermq $" HALF_PERM ", %%ymm2, %%ymm2 \n\t"                  \
  "vpaddb %%ymm2, %%" ACC ", %%" ACC " \n\t"

// GEMMLOWP_SIGN_INT8_LHS_STEP loads 32 Lhs bytes from LHS_OFFSET(%[lhs_ptr])
// into ymm0 and runs the column step for the four Rhs columns, adding into
// ACC0..ACC3 (one accumulator per column).
#define GEMMLOWP_SIGN_INT8_LHS_STEP(LHS_OFFSET, HALF_PERM, ACC0, ACC1, ACC2, \
                                    ACC3)                                    \
  "vmovdqu " LHS_OFFSET "(%[lhs_ptr]), %%ymm0 \n\t"                          \
  GEMMLOWP_SIGN_INT8_COLUMN_STEP("0x00", HALF_PERM, ACC0)                    \
  GEMMLOWP_SIGN_INT8_COLUMN_STEP("0x55", HALF_PERM, ACC1)                    \
  GEMMLOWP_SIGN_INT8_COLUMN_STEP("0xaa", HALF_PERM, ACC2)                    \
  GEMMLOWP_SIGN_INT8_COLUMN_STEP("0xff", HALF_PERM, ACC3)

// Experimental AVX2 kernel: per depth cell it consumes 192 Lhs bytes (twelve
// 8x2 cells, i.e. 96 rows x 2 depth levels) and 8 Rhs bytes (one 4x2 cell,
// i.e. 4 columns x 2 depth levels) and keeps a 96x4 block of 8-bit
// accumulators in ymm4..ymm15.
struct AVX2_64_Kernel24x8Depth2_Int8Operands_Int8Inputs_Int8Acc_Garbage :
    KernelBase {
  typedef KernelFormat<KernelSideFormatInt8Inputs<CellFormat<8, 2, CellOrder::WidthMajor>, 12>,
                       KernelSideFormatInt8Inputs<CellFormat<4, 2, CellOrder::WidthMajor>, 1>>
      Format;

  const char *Name() const override { return "AVX, 4x24x8, depth 2 integer, int8 acc!!!!"; }

  // Runs the kernel over run_depth / Format::kDepth depth cells.
  //   dst_ptr        - destination block; written column by column.
  //   dst_row_stride - must be 1 (asserted).
  //   dst_col_stride - column stride of dst; the asm scales it by 4 to get a
  //                    byte stride.
  //   lhs_ptr        - packed Lhs; 0xc0 bytes are consumed per depth cell.
  //   rhs_ptr        - packed Rhs; 0x08 bytes are consumed per depth cell.
  //   start_depth    - when non-zero, the existing dst contents are added to
  //                    the accumulators before the store.
  //   run_depth      - number of depth levels to process.
  void Run(dstType *dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride,
           const std::uint8_t *lhs_ptr, const std::uint8_t *rhs_ptr, std::size_t start_depth,
           std::size_t run_depth) const override {
    ScopedProfilingLabel label("optimized kernel");
    assert(dst_row_stride == 1);
    std::int64_t run_depth_cells = run_depth / Format::kDepth;
    const std::int64_t dst_col_stride_q = dst_col_stride;

    // vpshufb table: within each 128-bit lane, gather the even-indexed bytes
    // into the low 8 bytes; an index of -1 (high bit set) zeroes the byte.
    static const std::int8_t kPshufEvenBytesToLowHalf[32] = {
        0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
        0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1};
    // vpshufb table: within each 128-bit lane, swap every pair of adjacent
    // bytes.
    static const std::int8_t kPshufSwapAdjacent[32] = {
        1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
        1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
    // The tables are only read by the asm, so they are passed as plain inputs.
    const std::int8_t *pshuf_odd_fh_op = kPshufEvenBytesToLowHalf;
    const std::int8_t *pshuf_swp_adj_op = kPshufSwapAdjacent;

    // Register usage inside the asm block:
    //   ymm0        32 Lhs bytes (16 rows x 2 depth levels), reloaded for each
    //               of the six Lhs steps of an iteration.
    //   ymm1        Rhs bytes; only the low four 16-bit words (one per column)
    //               are used.
    //   ymm2, ymm3  scratch.
    //   ymm4..ymm7   accumulators, rows  0..31, columns 0..3.
    //   ymm8..ymm11  accumulators, rows 32..63, columns 0..3.
    //   ymm12..ymm15 accumulators, rows 64..95, columns 0..3.
    //   r12         dst column stride * 4, r13 = 3 * r12, r14 = loop counter.
    // Each accumulator holds 32 one-byte sums: the low 128 bits are filled by
    // the "0xd8" Lhs steps, the high 128 bits by the "0x85" Lhs steps.
    //
    // NOTE(review): the loop body executes once before the counter is tested,
    // so a run_depth smaller than Format::kDepth still processes one cell.
    // NOTE(review): the byte accumulators are merged with dst through 32-bit
    // vpaddd and the column stride is scaled by 4 although 96 bytes are stored
    // per column - presumably accepted for this "Garbage" experiment; confirm
    // before relying on the output layout.
    // NOTE(review): the Rhs load reads 32 bytes while only 8 are consumed per
    // iteration - assumes the packed Rhs buffer is padded; confirm.
    asm volatile(
        // Set registers for destination
        "movq  %[dst_col_stride_q], %%r12\n\t"  // stride is r12
        "shlq $2, %%r12\n\t"                    // scale stride by 4
        "leaq (%%r12,%%r12,0x2), %%r13\n\t"     // r13 = 3 * r12

        // Set accumulators to zero.
        "vpxor %%ymm4, %%ymm4, %%ymm4 \n\t"
        "vpxor %%ymm5, %%ymm5, %%ymm5 \n\t"
        "vpxor %%ymm6, %%ymm6, %%ymm6 \n\t"
        "vpxor %%ymm7, %%ymm7, %%ymm7 \n\t"
        "vpxor %%ymm8, %%ymm8, %%ymm8 \n\t"
        "vpxor %%ymm9, %%ymm9, %%ymm9 \n\t"
        "vpxor %%ymm10, %%ymm10, %%ymm10\n\t"
        "vpxor %%ymm11, %%ymm11, %%ymm11\n\t"
        "vpxor %%ymm12, %%ymm12, %%ymm12\n\t"
        "vpxor %%ymm13, %%ymm13, %%ymm13\n\t"
        "vpxor %%ymm14, %%ymm14, %%ymm14\n\t"
        "vpxor %%ymm15, %%ymm15, %%ymm15\n\t"

        "movq  %[run_depth_cells], %%r14 \n\t"  // load cell depth r14

        // The loop which processes one depth cell at a time
        "outerLoop%=: \n\t"

        "vmovdqu (%[rhs_ptr]), %%ymm1 \n\t"  // Load rhs to ymm1

        // Rows 0..15 and 16..31
        GEMMLOWP_SIGN_INT8_LHS_STEP("0x00", "0xd8", "ymm4", "ymm5", "ymm6", "ymm7")
        GEMMLOWP_SIGN_INT8_LHS_STEP("0x20", "0x85", "ymm4", "ymm5", "ymm6", "ymm7")
        // Rows 32..47 and 48..63
        GEMMLOWP_SIGN_INT8_LHS_STEP("0x40", "0xd8", "ymm8", "ymm9", "ymm10", "ymm11")
        GEMMLOWP_SIGN_INT8_LHS_STEP("0x60", "0x85", "ymm8", "ymm9", "ymm10", "ymm11")
        // Rows 64..79 and 80..95
        GEMMLOWP_SIGN_INT8_LHS_STEP("0x80", "0xd8", "ymm12", "ymm13", "ymm14", "ymm15")
        GEMMLOWP_SIGN_INT8_LHS_STEP("0xa0", "0x85", "ymm12", "ymm13", "ymm14", "ymm15")

        // move forward the pointer to rhs and lhs
        "addq $0xc0, %[lhs_ptr] \n\t"
        "addq $0x08, %[rhs_ptr] \n\t"

        "subq $1, %%r14 \n\t"
        "ja outerLoop%= \n\t"
        // outerLoop ends here

        // Storing result: when start_depth is non-zero, first add the values
        // already present in dst.
        "test %[start_depth], %[start_depth] \n\t"
        "jz storeDst%= \n\t"

        "vpaddd 0x00(%[dst_ptr]), %%ymm4, %%ymm4 \n\t"    // rhs0
        "vpaddd 0x20(%[dst_ptr]), %%ymm8, %%ymm8 \n\t"    // rhs0
        "vpaddd 0x40(%[dst_ptr]), %%ymm12, %%ymm12 \n\t"  // rhs0

        "vpaddd 0x00(%[dst_ptr], %%r12, 1) , %%ymm5, %%ymm5   \n\t"  // rhs1
        "vpaddd 0x20(%[dst_ptr], %%r12, 1) , %%ymm9, %%ymm9   \n\t"  // rhs1
        "vpaddd 0x40(%[dst_ptr], %%r12, 1) , %%ymm13, %%ymm13 \n\t"  // rhs1

        "vpaddd 0x00(%[dst_ptr], %%r12, 2) , %%ymm6, %%ymm6   \n\t"  // rhs2
        "vpaddd 0x20(%[dst_ptr], %%r12, 2) , %%ymm10, %%ymm10 \n\t"  // rhs2
        "vpaddd 0x40(%[dst_ptr], %%r12, 2) , %%ymm14, %%ymm14 \n\t"  // rhs2

        "vpaddd 0x00(%[dst_ptr], %%r13, 1) , %%ymm7, %%ymm7   \n\t"  // rhs3
        "vpaddd 0x20(%[dst_ptr], %%r13, 1) , %%ymm11, %%ymm11 \n\t"  // rhs3
        "vpaddd 0x40(%[dst_ptr], %%r13, 1) , %%ymm15, %%ymm15 \n\t"  // rhs3

        "storeDst%=:\n\t"

        "vmovdqu %%ymm4, 0x00(%[dst_ptr])            \n\t"  // rhs0
        "vmovdqu %%ymm8, 0x20(%[dst_ptr])            \n\t"  // rhs0
        "vmovdqu %%ymm12, 0x40(%[dst_ptr])           \n\t"  // rhs0

        "vmovdqu %%ymm5, 0x00(%[dst_ptr], %%r12, 1)  \n\t"  // rhs1
        "vmovdqu %%ymm9, 0x20(%[dst_ptr], %%r12, 1)  \n\t"  // rhs1
        "vmovdqu %%ymm13, 0x40(%[dst_ptr], %%r12, 1) \n\t"  // rhs1

        "vmovdqu %%ymm6, 0x00(%[dst_ptr], %%r12, 2)  \n\t"  // rhs2
        "vmovdqu %%ymm10, 0x20(%[dst_ptr], %%r12, 2) \n\t"  // rhs2
        "vmovdqu %%ymm14, 0x40(%[dst_ptr], %%r12, 2) \n\t"  // rhs2

        "vmovdqu %%ymm7, 0x00(%[dst_ptr], %%r13, 1)  \n\t"  // rhs3
        "vmovdqu %%ymm11, 0x20(%[dst_ptr], %%r13, 1) \n\t"  // rhs3
        "vmovdqu %%ymm15, 0x40(%[dst_ptr], %%r13, 1) \n\t"  // rhs3

        :  // outputs (lhs_ptr and rhs_ptr are advanced by the loop)
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_ptr] "+r"(dst_ptr)
        :  // inputs
        [start_depth] "r"(start_depth), [dst_col_stride_q] "r"(dst_col_stride_q),
        [run_depth_cells] "r"(run_depth_cells),
        [pshuf_odd_fh_op] "r"(pshuf_odd_fh_op),
        [pshuf_swp_adj_op] "r"(pshuf_swp_adj_op)
        :  // clobbers
        "cc", "memory", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7",
        "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%r12",
        "%r13", "%r14");
  }
};

#undef GEMMLOWP_SIGN_INT8_LHS_STEP
#undef GEMMLOWP_SIGN_INT8_COLUMN_STEP


#endif

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_KERNEL_AVX_SIGN_INT8_ACC_GARBAGE_H_
