/*
 * solution - issue the accelerator command stream for a 512x512 int8
 * matrix product over A, B and C.
 *
 * Loop structure (all trip counts are compile-time constants):
 *   - C is produced in 16x64 strips: row = 16*i + 128*io + 256*ioo,
 *     col = 64*ji + 256*jo, each strip being four 16x16 blocks (jj).
 *   - The reduction dimension is walked in steps of 64 (ko), each step
 *     split into four 16-wide slices (kk).
 *   - A 16x64 tile of A is moved in only when jo == 0 && ji == 0 and its
 *     on-chip address is reused by the remaining (jo, ji) iterations.
 *   - A 16x64 tile of B is moved in only when io == 0 && i == 0 and its
 *     on-chip address is reused by the remaining (io, i) iterations.
 *
 * On-chip addresses below are expressed in rows of 16 elements; they are
 * the original element offsets already divided by 16 (every term was an
 * exact multiple of 16, so the values are unchanged).
 *
 * NOTE(review): the meaning of the config_* arguments, of address bit 31
 * and of the 0x40000000 flag is not visible in this file. They presumably
 * follow the Gemmini convention (bit 31 = accumulator address space,
 * bit 30 = accumulate into the destination) -- confirm against the header
 * that defines mvin/preload.
 *
 * NOTE(review): the kk/jj loops were fully unrolled by hand in the previous
 * version. The command order is unchanged, but if host-side issue rate
 * matters, confirm the compiler still unrolls them.
 *
 * Parameters:
 *   A, B - 512x512 int8 inputs, read through mvin2 / mvin3.
 *   C    - 512x512 int8 output, written through mvout.
 */
void solution(int8_t A[512][512], int8_t B[512][512], int8_t C[512][512]) {
  config_st((512));
  config_ex(WEIGHT_STATIONARY, NO_ACTIVATION, 1, false, false);
  config_ld((512), 1.0f, 16, 2);
  config_ld((512), 1.0f, 16, 1);
  config_ld(0, 1.0f, 16, 0);

  /* A tiles occupy rows [0, 8192): 2 (io) * 8 (i) * 8 (ko) tiles of 64 rows. */
  const uint32_t a_base = 0;
  /* B tiles start right after the A region. */
  const uint32_t b_base = 2 * 8 * 8 * 64;
  /* Shift an unsigned 1: `1 << 31` on a 32-bit signed int is undefined. */
  const uint32_t acc_base = (uint32_t)1 << 31;
  /* OR-ed into the preload destination address (see review note above). */
  const uint32_t acc_flag = 0x40000000;

  for (uint32_t ioo = 0; ioo < 2; ioo++) {
    for (uint32_t jo = 0; jo < 2; jo++) {
      for (uint32_t io = 0; io < 2; io++) {
        for (uint32_t i = 0; i < 8; i++) {
          const uint32_t row = 16 * i + 128 * io + 256 * ioo;

          for (uint32_t ji = 0; ji < 4; ji++) {
            const uint32_t col = 64 * ji + 256 * jo;
            const uint32_t acc = acc_base + 64 * ji;

            /* Load from address 0 (load config id 0 was set up with a
             * first argument of 0) into the four result blocks. */
            for (uint32_t jj = 0; jj < 4; jj++) {
              mvin(0, acc + 16 * jj, 16, 16);
            }

            for (uint32_t ko = 0; ko < 8; ko++) {
              const uint32_t a_tile = a_base + 4096 * io + 512 * i + 64 * ko;
              const uint32_t b_tile = b_base + 2048 * ji + 256 * ko;

              /* First visit of this A tile: bring in 16 rows x 64 columns. */
              if (jo == 0 && ji == 0) {
                mvin2(&A[row][64 * ko], a_tile, 16 * 4, 16);
              }

              for (uint32_t kk = 0; kk < 4; kk++) {
                const uint32_t a_blk = a_tile + 16 * kk;
                const uint32_t b_rows = b_tile + 64 * kk;

                /* First visit of this B slice: 16 rows x 64 columns. */
                if (io == 0 && i == 0) {
                  mvin3(&B[16 * kk + 64 * ko][col], b_rows, 16 * 4, 16);
                }

                /* One preload/compute pair per 16x16 result block; the
                 * same A block is paired with four consecutive B blocks. */
                for (uint32_t jj = 0; jj < 4; jj++) {
                  preload(b_rows + 16 * jj, (acc + 16 * jj) | acc_flag, 16, 16, 16, 16);
                  compute_preloaded(a_blk, ~((uint32_t)0), 16, 16, 16, 16);
                }
              }
            }

            /* Store the four finished 16x16 blocks of this strip to C. */
            for (uint32_t jj = 0; jj < 4; jj++) {
              mvout(((uint64_t) &C[row][col + 16 * jj]), acc + 16 * jj, 16, 16);
            }
          }
        }
      }
    }
  }
  fence();
}
