#include <stdio.h>
#include <cuda_runtime.h>

// Demonstration kernel for dynamically sized shared memory.
//
// Every thread stores its own threadIdx.x in the block's shared buffer, the
// block meets at a barrier, and each thread then copies its own slot out to
// global memory.
//
// Launch requirements (taken from how the kernel indexes its buffers):
//   - Indexing uses threadIdx.x only, so a 1-D block is expected; with more
//     than one block, every block writes the same elements of `out`.
//   - The dynamic shared memory size (third launch parameter) must be at
//     least blockDim.x * sizeof(int) bytes.
//   - `out` must be a device pointer with room for at least blockDim.x ints.
__global__ void sharedMemoryKernel(int *out) {
    // Sized at launch time by the third <<<...>>> argument.
    extern __shared__ int scratch[];

    const unsigned int slot = threadIdx.x;

    // Stage this thread's index in shared memory.
    scratch[slot] = static_cast<int>(slot);

    // Block-wide barrier: all shared-memory writes above complete before any
    // thread proceeds. Each thread reads back only the slot it wrote itself.
    __syncthreads();

    // Publish the staged value to global memory.
    const int staged = scratch[slot];
    out[slot] = staged;
}

// Host driver for sharedMemoryKernel.
//
// Allocates a device buffer of blockSize ints, launches the kernel as a single
// block of blockSize threads with blockSize * sizeof(int) bytes of dynamic
// shared memory, copies the result back to the host, and prints one line per
// element.
//
// Returns 0 on success and -1 if any CUDA call fails (including the final
// cudaFree). Each failure is reported on stderr with the CUDA error string.
// Once the device buffer has been allocated it is released on every path
// through a single cleanup step at the end of the function.
int main(void) {
    const int blockSize = 16;              // Number of threads per block.
    const int arraySize = blockSize;       // One block, so one output element per thread.
    // Byte count kept as size_t: that is what sizeof yields and what
    // cudaMalloc, cudaMemcpy and the launch configuration expect.
    const size_t memSize = arraySize * sizeof(int);

    int h_out[arraySize] = {0};
    int *d_out = NULL;
    int status = 0;                        // Becomes -1 as soon as any step fails.
    cudaError_t err;

    // Allocate device memory. Nothing to clean up yet if this fails.
    err = cudaMalloc((void **)&d_out, memSize);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return -1;
    }

    // Launch one block of blockSize threads, passing memSize bytes of
    // dynamic shared memory as the third launch parameter.
    sharedMemoryKernel<<<1, blockSize, memSize>>>(d_out);

    // A launch does not return a status itself; configuration errors are
    // picked up here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        status = -1;
    }

    // Wait for the GPU to finish so execution errors are reported separately
    // from copy errors.
    if (status == 0) {
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));
            status = -1;
        }
    }

    // Copy the output from device to host.
    if (status == 0) {
        err = cudaMemcpy(h_out, d_out, memSize, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
            status = -1;
        }
    }

    // Print the results only when every preceding step succeeded.
    if (status == 0) {
        for (int i = 0; i < arraySize; ++i) {
            printf("h_out[%d] = %d\n", i, h_out[i]);
        }
    }

    // Single cleanup point: release the device buffer on success and failure
    // paths alike, and do not let a failed free go unnoticed.
    err = cudaFree(d_out);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        status = -1;
    }

    return status;
}
