# distutils: language = c
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: initializedcheck=False

import numpy as np
cimport numpy as np
from libc.stdint cimport uint32_t
from cython.parallel import prange

# Make the compiler intrinsic ``__builtin_clz`` visible to Cython so it can be
# called from the cdef code below. It takes an unsigned int and is used here
# to count leading zero bits.
# NOTE(review): ``__builtin_clz`` is a GCC/Clang builtin rather than something
# declared by stdint.h; the header named here presumably only gives Cython a
# harmless file to include. Confirm that builds with other compilers (e.g.
# MSVC) are not required.
cdef extern from "stdint.h":
	unsigned int __builtin_clz(unsigned int x)

cdef inline int clz32(uint32_t x):
	"""Return the number of leading zero bits in the 32-bit value ``x``.

	Zero is answered directly with 32 and never reaches ``__builtin_clz``;
	every other value is delegated to the builtin.
	"""
	# Keep the zero case away from the builtin (GCC documents its result for
	# an input of 0 as undefined).
	return <int>__builtin_clz(x) if x != 0 else 32

def sample_cLUT_fast(np.uint32_t[:] cLUT not None, int E, int M, int K):
	"""Draw ``K`` random entries from a flattened lookup table.

	For each sample, a value ``x`` is drawn uniformly from ``[0, 2**E)`` and
	a column ``j`` uniformly from ``[0, 2**M)`` using ``np.random.randint``.
	The row is the bit length of ``x`` (``32 - clz32(x)``: 0 for ``x == 0``,
	at most ``E``), and the sample is ``cLUT[row * 2**M + j]``.

	Parameters
	----------
	cLUT : 1-D uint32 memoryview
		Flattened table; must hold at least ``(E + 1) * 2**M`` entries.
	E : int
		Number of random bits used for ``x``; must be in ``[0, 32]``.
	M : int
		Base-2 logarithm of the row width; must be in ``[0, 30]`` so that
		``2**M`` fits the int32 column indices.
	K : int
		Number of samples to draw; must be non-negative.

	Returns
	-------
	uint32 memoryview of length ``K`` holding the sampled table entries.

	Raises
	------
	ValueError
		If ``E``, ``M`` or ``K`` is out of range, or ``cLUT`` is too short
		for the requested ``E`` and ``M``.
	"""
	cdef:
		Py_ssize_t width
		Py_ssize_t idx
		Py_ssize_t i
		long long needed
		uint32_t[:] x
		int[:] j
		np.uint32_t[:] out

	if E < 0 or E > 32:
		raise ValueError("E must be in the range [0, 32]")
	if M < 0 or M > 30:
		raise ValueError("M must be in the range [0, 30]")
	if K < 0:
		raise ValueError("K must be non-negative")

	# Shifts instead of ``2**M``: the result type of ``**`` on C integers
	# depends on the Cython version / ``cpow`` directive, a shift does not.
	width = (<Py_ssize_t>1) << M

	# Bounds checking is disabled for this module, so a table that is too
	# short would be read out of bounds silently. Rows run from 0 to E.
	needed = (<long long>E + 1) * width
	if cLUT.shape[0] < needed:
		raise ValueError(
			"cLUT is too short: need at least (E + 1) * 2**M entries")

	# Draw x before j, matching the original order of random draws.
	x = np.random.randint(0, (<long long>1) << E, size=K, dtype=np.uint32)
	j = np.random.randint(0, width, size=K, dtype=np.int32)
	out = np.empty(K, dtype=np.uint32)

	for i in range(K):
		# Row = bit length of x[i]; column = j[i].
		idx = (32 - clz32(x[i])) * width + j[i]
		out[i] = cLUT[idx]

	return out
