void solution(int8_t inp[4][30][30][128], int8_t weights[3][3][128][128], int32_t bias[1][128], int8_t output[4][28][28][128]) {
  config_st((128));
  config_ex(WEIGHT_STATIONARY, NO_ACTIVATION, 1, false, false);
  config_ld((128), 1.0f, 16, 1);
  config_ld(0, 1.0f, 0, 0);

  for (int_fast32_t bo = 0; bo < 1; bo++) {
    uint32_t i_s = 0;
    uint32_t w_s = 16 * 16 * 8 * 3 * 16 * sizeof(int8_t) / 16;
    uint32_t res = 1 << 31;
    for (int_fast32_t bi = 0; bi < 4; bi++) {
      for (int_fast32_t orow_o = 0; orow_o < 2; orow_o++) {
        for (int_fast32_t orow_i = 0; orow_i < 14; orow_i++) {
          mvin( &bias[0][0], res + (0)/16, 16, (16) );
          mvin( &bias[0][16], res + (256)/16, 16, (16) );
          mvin( &bias[0][32], res + ((2) * (256))/16, 16, (16) );
          mvin( &bias[0][48], res + ((3) * (256))/16, 16, (16) );
          mvin( &bias[0][64], res + ((4) * (256))/16, 16, (16) );
          mvin( &bias[0][80], res + ((5) * (256))/16, 16, (16) );
          mvin( &bias[0][96], res + ((6) * (256))/16, 16, (16) );
          mvin( &bias[0][112], res + ((7) * (256))/16, 16, (16) );
          for (int_fast32_t krow = 0; krow < 3; krow++) {
            if (bi == 0) {
              if (orow_o == 0) {
                if (orow_i == 0) {
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][0][64 * och_o_o], w_s + ((krow) * (49152) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][16][64 * och_o_o], w_s + ((krow) * (49152) + 2048 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][32][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][48][64 * och_o_o], w_s + ((krow) * (49152) + (3) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][64][64 * och_o_o], w_s + ((krow) * (49152) + (4) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][80][64 * och_o_o], w_s + ((krow) * (49152) + (5) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][96][64 * och_o_o], w_s + ((krow) * (49152) + (6) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][112][64 * och_o_o], w_s + ((krow) * (49152) + (7) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                }
              }
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (16), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][0][0], i_s + ((krow + orow_i) * (6144))/16, 16*(4), (16) );
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (16), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][0][64], i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, 16*(4), (16) );
            }
            preload(w_s + ((krow) * (49152))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            if (bi == 0) {
              if (orow_o == 0) {
                if (orow_i == 0) {
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][0][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][16][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + 2048 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][32][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][48][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][64][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][80][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][96][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][112][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                }
              }
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (16), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][1][0], i_s + ((krow + orow_i) * (6144) + 2048)/16, 16*(4), (16) );
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (16), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][1][64], i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, 16*(4), (16) );
            }
            preload(w_s + ((krow) * (49152) + 16384)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + 2048 + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            if (bi == 0) {
              if (orow_o == 0) {
                if (orow_i == 0) {
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][0][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][16][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][32][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][48][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][64][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][80][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][96][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][112][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                }
              }
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (16), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][2][0], i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, 16*(4), (16) );
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (16), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][2][64], i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, 16*(4), (16) );
            }
            preload(w_s + ((krow) * (49152) + (2) * (16384))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + 256)/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (2) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (3) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (4) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (5) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (6) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + 256)/16, res + (256)/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (2) * (256))/16, res + ((2) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (3) * (256))/16, res + ((3) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (4) * (256))/16, res + ((4) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (5) * (256))/16, res + ((5) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (6) * (256))/16, res + ((6) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (7) * (256))/16, res + ((7) * (256))/16 | 0x40000000, (16), (16), (16), (16));
  compute_preloaded(i_s + ((krow + orow_i) * (6144) + (2) * (2048) + (7) * (256))/16, ~((uint32_t)0), (16), (16), 16, 16);
          }
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][0], res + (0)/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][16], res + (256)/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][32], res + ((2) * (256))/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][48], res + ((3) * (256))/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][64], res + ((4) * (256))/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][80], res + ((5) * (256))/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][96], res + ((6) * (256))/16, (16), (16) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][0][112], res + ((7) * (256))/16, (16), (16) );
        }
      }
    }
  }
  for (int_fast32_t bo = 0; bo < 1; bo++) {
    uint32_t i_s = 0;
    uint32_t w_s = 16 * 12 * 8 * 3 * 16 * sizeof(int8_t) / 16;
    uint32_t res = 1 << 31;
    for (int_fast32_t bi = 0; bi < 4; bi++) {
      for (int_fast32_t orow_o = 0; orow_o < 2; orow_o++) {
        for (int_fast32_t orow_i = 0; orow_i < 14; orow_i++) {
          mvin( &bias[0][0], res + (0)/16, 16, (12) );
          mvin( &bias[0][16], res + (208)/16, 16, (12) );
          mvin( &bias[0][32], res + ((2) * (208))/16, 16, (12) );
          mvin( &bias[0][48], res + ((3) * (208))/16, 16, (12) );
          mvin( &bias[0][64], res + ((4) * (208))/16, 16, (12) );
          mvin( &bias[0][80], res + ((5) * (208))/16, 16, (12) );
          mvin( &bias[0][96], res + ((6) * (208))/16, 16, (12) );
          mvin( &bias[0][112], res + ((7) * (208))/16, 16, (12) );
          for (int_fast32_t krow = 0; krow < 3; krow++) {
            if (bi == 0) {
              if (orow_o == 0) {
                if (orow_i == 0) {
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][0][64 * och_o_o], w_s + ((krow) * (49152) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][16][64 * och_o_o], w_s + ((krow) * (49152) + 2048 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][32][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][48][64 * och_o_o], w_s + ((krow) * (49152) + (3) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][64][64 * och_o_o], w_s + ((krow) * (49152) + (4) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][80][64 * och_o_o], w_s + ((krow) * (49152) + (5) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][96][64 * och_o_o], w_s + ((krow) * (49152) + (6) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][0][112][64 * och_o_o], w_s + ((krow) * (49152) + (7) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                }
              }
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (12), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][16][0], i_s + ((krow + orow_i) * (4608))/16, 16*(4), (12) );
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (12), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][16][64], i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, 16*(4), (12) );
            }
            preload(w_s + ((krow) * (49152))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 2048 + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (3) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (4) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (5) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (6) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (7) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            if (bi == 0) {
              if (orow_o == 0) {
                if (orow_i == 0) {
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][0][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][16][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + 2048 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][32][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][48][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][64][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][80][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][96][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][1][112][64 * och_o_o], w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                }
              }
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (12), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][17][0], i_s + ((krow + orow_i) * (4608) + 1536)/16, 16*(4), (12) );
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (12), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][17][64], i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, 16*(4), (12) );
            }
            preload(w_s + ((krow) * (49152) + 16384)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + 2048 + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (2) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (3) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (4) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (5) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (6) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + 16384 + (7) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + 1536 + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            if (bi == 0) {
              if (orow_o == 0) {
                if (orow_i == 0) {
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][0][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][16][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][32][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][48][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][64][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][80][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][96][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                  for (int_fast32_t och_o_o = 0; och_o_o < 2; och_o_o++) {
                    mvin2( &weights[krow][2][112][64 * och_o_o], w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (4 * och_o_o) * (256))/16, 16*(4), (16) );
                  }
                }
              }
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (12), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][18][0], i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, 16*(4), (12) );
            }
            if (orow_i == 0 || krow == 2) {
              config_ld(128, 1.0f, (12), 2);
              mvin3( &inp[bi + 4 * bo][krow + orow_i + 14 * orow_o][18][64], i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, 16*(4), (12) );
            }
            preload(w_s + ((krow) * (49152) + (2) * (16384))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048)/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + 2048 + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + 192)/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (2) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (2) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (3) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (3) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (4) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (4) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (5) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (5) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (6) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (6) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048))/16, res + (0)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + 256)/16, res + (208)/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (2) * (256))/16, res + ((2) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (3) * (256))/16, res + ((3) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (4) * (256))/16, res + ((4) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (5) * (256))/16, res + ((5) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (6) * (256))/16, res + ((6) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
            preload(w_s + ((krow) * (49152) + (2) * (16384) + (7) * (2048) + (7) * (256))/16, res + ((7) * (208))/16 | 0x40000000, (16), (16), (16), (12));
  compute_preloaded(i_s + ((krow + orow_i) * (4608) + (2) * (1536) + (7) * (192))/16, ~((uint32_t)0), (16), (12), 16, 16);
          }
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][0], res + (0)/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][16], res + (208)/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][32], res + ((2) * (208))/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][48], res + ((3) * (208))/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][64], res + ((4) * (208))/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][80], res + ((5) * (208))/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][96], res + ((6) * (208))/16, (16), (12) );
          mvout( &output[bi + 4 * bo][orow_i + 14 * orow_o][16][112], res + ((7) * (208))/16, (16), (12) );
        }
      }
    }
  }
  fence();
}