#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
using namespace torch::indexing;

#include "../utilities.h"


/**
 * Placeholder for the backward counterpart of the "2d thread-per-tile,
 * batch-last" kernel. It is NOT implemented: every call aborts the running
 * kernel, and none of the arguments are read or written.
 *
 * Argument roles below are inferred from names and constness only
 * (NOTE(review): confirm against the forward implementation before filling
 * this in):
 *   - shared_parameters, x, output_grad : read-only inputs.
 *   - shared_parameters_grad, x_grad    : gradient buffers to be written.
 *   - my_tile_in, my_tile_out           : presumably the tile handled by the
 *                                         calling thread.
 *   - N_in, N_out, n_chunks, batch_size : presumably problem dimensions.
 *
 * @tparam tile_size  Compile-time tile extent (currently unused).
 *
 * Failure behaviour:
 *   - Assertions enabled: `assert(false)` fires; the failure is surfaced to
 *     the host at the next synchronizing CUDA call.
 *   - NDEBUG defined: `assert` expands to nothing, so `__trap()` is what
 *     stops the kernel. Without it the stub would return silently and leave
 *     the gradient buffers unwritten.
 */
template <int tile_size>
__device__ void dkan_full_kernel_2d_thread_per_tile_batch_last_backward(
    const float* __restrict__ shared_parameters,
    const float* __restrict__ x,
    const float* __restrict__ output_grad,
    float* __restrict__ shared_parameters_grad,
    float* __restrict__ x_grad,
    int my_tile_in,
    int my_tile_out,
    int N_in,
    int N_out,
    int n_chunks,
    int batch_size
) {
    // Debug builds: stop here with the usual device-side assertion failure.
    assert(false); // not implemented

    // Release builds (NDEBUG): the assert above is compiled out, so abort the
    // kernel explicitly rather than silently producing no gradients.
    __trap();
}