Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
(int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))
# ignore peaks that are smaller than another nearby peak
cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
cleanup_heights(
(1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))
# add new spikes to 2nd counter
counter[0] = d_counter[1]
counter[0] = min(maxFR, counter[0])
d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
# d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
# update dWU here by adding back to subbed spikes
extract_snips = cp.RawKernel(code, 'extract_snips')
extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))
# QUESTION: why a copy here??
# if counter[0] > 0:
# d_WU1[...] = d_WU[...]
del (
d_ftype, d_kkmax, d_err, d_st, d_id, d_st1, d_x, d_kk, d_id1, d_counter,
d_Params, d_dfilt)
return d_WU, d_dout
d_wtw = cp.zeros((nt0, nt0, Nfilt), dtype=np.float64, order='F')
d_dWUb = cp.zeros((nt0, Nchan, Nfilt), dtype=np.float64, order='F')
tpS = (nt0, int(Nthreads // nt0))
tpK = (Nrank, int(Nthreads // Nrank))
blankdWU = cp.RawKernel(code, 'blankdWU')
blankdWU((Nfilt,), tpS, (d_Params, d_dWU, d_iC, d_iW, d_dWUb))
# compute dWU * dWU'
getwtw = cp.RawKernel(code, 'getwtw')
getwtw((Nfilt,), tpS, (d_Params, d_dWUb, d_wtw))
# get W by power svd iterations
getW = cp.RawKernel(code, 'getW')
getW((Nfilt,), (nt0,), (d_Params, d_wtw, d_W))
# compute U by W' * dWU
getU = cp.RawKernel(code, 'getU')
getU((Nfilt,), tpK, (d_Params, d_dWUb, d_W, d_U))
# normalize U, get S, get mu, renormalize W
reNormalize = cp.RawKernel(code, 'reNormalize')
reNormalize((Nfilt,), (nt0,), (d_Params, d_A, d_B, d_W, d_U, d_mu))
del d_wtw, d_Params, d_dWUb
return d_W, d_U, d_mu
Nfilt = int(Params[1])
nt0 = int(Params[9])
d_Params = cp.asarray(Params, dtype=np.float64, order='F')
d_W1 = cp.asarray(W1, dtype=np.float32, order='F')
d_W2 = cp.asarray(W2, dtype=np.float32, order='F')
d_UtU = cp.asarray(UtU, dtype=np.float32, order='F')
d_WtW = cp.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32, order='F')
grid = (1 + int(Nfilt // nblock), 1 + int(Nfilt // nblock))
block = (nblock, nblock)
crossFilter = cp.RawKernel(code, 'crossFilter')
crossFilter(grid, block, (d_Params, d_W1, d_W2, d_UtU, d_WtW))
del d_Params, d_W1, d_W2, d_UtU
return d_WtW
# sum each template across channels, square, take max
sumChannels = cp.RawKernel(code, 'sumChannels')
sumChannels((int(NT / Nthreads),), (Nthreads,), (d_Params, d_dfilt, d_dout, d_kkmax, d_iC))
# compute the best filter
bestFilter = cp.RawKernel(code, 'bestFilter')
bestFilter(
(int(NT / Nthreads),), (Nthreads,), (d_Params, d_dout, d_err, d_ftype, d_kkmax, d_kk))
# ignore peaks that are smaller than another nearby peak
cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
cleanup_spikes(
(int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))
# ignore peaks that are smaller than another nearby peak
cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
cleanup_heights(
(1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))
# add new spikes to 2nd counter
counter[0] = d_counter[1]
counter[0] = min(maxFR, counter[0])
d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
# d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
# update dWU here by adding back to subbed spikes
extract_snips = cp.RawKernel(code, 'extract_snips')
extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))
# QUESTION: why a copy here??
# if counter[0] > 0:
# Input GPU arrays.
d_Params = cp.asarray(Params, dtype=np.float64, order='F')
d_data = cp.asarray(dataRAW, dtype=np.float32, order='F')
d_W = cp.asarray(wPCA, dtype=np.float32, order='F')
d_iC = cp.asarray(iC, dtype=np.int32, order='F')
# New GPU arrays.
d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
d_counter = cp.zeros(1, dtype=np.int32, order='F')
# filter the data with the temporal templates
Conv1D = cp.RawKernel(code, 'Conv1D')
Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dout))
# get the max of the data
max1D = cp.RawKernel(code, 'max1D')
max1D((Nchan,), (Nthreads,), (d_Params, d_dout, d_dmax))
# take max across nearby channels
maxChannels = cp.RawKernel(code, 'maxChannels')
maxChannels(
(int(NT // Nthreads),), (Nthreads,),
(d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter))
# move d_x to the CPU
minSize = 1
minSize = min(maxFR, int(d_counter[0]))
# source code of CUDA kernel
with open(os.path.join(os.path.dirname(Path(__file__).resolve()),'bilateral.cu'), 'r') as f:
kernel_source_code = f.read()
kernel_source_code = Template(kernel_source_code)
kernel_source_code = kernel_source_code.substitute(
width=w, height=h, sigma_s=-0.5/(sigmaS**2), sigma_r=-0.5/(sigmaR**2),
sigma=sigma, snn=snn, half_kernel_size=half_kernel_size)
if fast:
kernel = cp.RawKernel(kernel_source_code, 'bilateral',
options=('--use_fast_math', ))
else:
kernel = cp.RawKernel(kernel_source_code, 'bilateral')
# create NumPy function
def bilateral_core(h_img, kernel):
    """Apply ``kernel`` to a host image on the GPU and return a host array.

    Args:
        h_img: Host-side image; it must be a 2-D image.
        kernel: Callable launched as ``kernel(grid, block, args)``.

    Returns:
        The output buffer copied back to the host with ``cp.asnumpy``.

    Note:
        ``w``, ``h`` and ``blksize`` are read from the enclosing scope.
    """
    d_img = cp.asarray(h_img)
    d_out = cp.empty_like(d_img)
    # Ceiling division so the grid spans the full w x h extent even when
    # the extent is not a multiple of the block size.
    grid = (
        (w + blksize[0] - 1) // blksize[0],
        (h + blksize[1] - 1) // blksize[1],
    )
    kernel(grid, blksize, (d_img, d_out))
    return cp.asnumpy(d_out)
# process
return mufnp.numpy_process(src, bilateral_core, kernel=kernel)
# get list of cmaxes for each combination of neuron and filter
computeCost = cp.RawKernel(code, 'computeCost')
computeCost(
(Nfilters,), (1024,), (d_Params, d_uproj, d_mu, d_W, d_iMatch, d_iC, d_call, d_cmax))
# loop through cmax to find best template
bestFilter = cp.RawKernel(code, 'bestFilter')
bestFilter((40,), (256,), (d_Params, d_iMatch, d_iC, d_call, d_cmax, d_id, d_x))
# average all spikes for same template -- ORIGINAL
average_snips = cp.RawKernel(code, 'average_snips')
average_snips(
(Nfilters,), (NrankPC, NchanNear), (d_Params, d_iC, d_call, d_id, d_uproj, d_cmax, d_dWU))
count_spikes = cp.RawKernel(code, 'count_spikes')
count_spikes((7,), (256,), (d_Params, d_id, d_nsp, d_x, d_V))
del d_Params, d_V
return d_dWU, d_id, d_x, d_nsp, d_cmax
# update 1st counter from 2nd counter
d_counter[1] = d_counter[0]
# compute PC features from residuals + subtractions
# TODO: design - let's not use numeric indexing into the Params array. It's much more difficult to read.
if Params[12] > 0:
computePCfeatures = cp.RawKernel(code, 'computePCfeatures')
computePCfeatures(
(Nfilt,), tpPC,
(d_Params, d_counter, d_draw, d_st, d_id, d_y,
d_W, d_U, d_mu, d_iW, d_iC, d_wPCA, d_featPC))
# update dWU here by adding back to subbed spikes.
# additional parameter d_idx = array of time-sorted indices
average_snips = cp.RawKernel(code, 'average_snips')
average_snips(
(Nfilt,), tpS,
(d_Params, d_st, d_id, d_x, d_y, d_counter, d_draw, d_W, d_U, d_dWU, d_nsp, d_mu, d_z))
if counter[0] < maxFR:
minSize = counter[0]
else:
minSize = maxFR
del d_counter, d_Params, d_ftype, d_err, d_eloss, d_z, d_dout, d_data
return (
d_st[:minSize], d_id[:minSize], d_y[:minSize], d_feat[..., :minSize],
d_dWU, d_draw, d_nsp, d_featPC[..., :minSize], d_x[:minSize])
def _call_nms_kernel(bbox, thresh):
    """Launch the ``nms_kernel`` CUDA kernel and post-process its mask.

    Args:
        bbox: Array of boxes; only ``bbox.shape[0]`` (the box count) is
            inspected here. It is converted to a contiguous float32 CuPy
            array before the launch.
        thresh: Threshold forwarded to the kernel as a float32 scalar.

    Returns:
        The ``(selection, n_selec)`` pair produced by ``_nms_gpu_post``.
    """
    threads_per_block = 64
    n_bbox = bbox.shape[0]
    # Number of 64-box column blocks needed to cover every box.
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)

    # One zero-initialised uint64 word per (box, column block) pair.
    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)

    grid_dim = (col_blocks, col_blocks, 1)
    block_dim = (threads_per_block, 1, 1)
    nms_kernel = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
    kernel_args = (cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev)
    nms_kernel(grid_dim, block_dim, args=kernel_args)

    # Copy the mask to the host, where the final selection is computed.
    host_mask = mask_dev.get()
    selection, n_selec = _nms_gpu_post(
        host_mask, n_bbox, threads_per_block, col_blocks)
    return selection, n_selec
d_dout = cp.zeros((NT, Nchan), dtype=np.float32, order='F')
d_dfilt = cp.zeros((Nrank, NT, Nchan), dtype=np.float32, order='F')
d_err = cp.zeros(NT, dtype=np.float32, order='F')
d_kkmax = cp.zeros((NT, Nchan), dtype=np.int32, order='F')
d_kk = cp.zeros(NT, dtype=np.int32, order='F')
d_ftype = cp.zeros(NT, dtype=np.int32, order='F')
d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
d_x = cp.zeros(maxFR, dtype=np.float32, order='F')
d_st1 = cp.zeros(maxFR, dtype=np.int32, order='F')
d_id1 = cp.zeros(maxFR, dtype=np.int32, order='F')
counter = np.zeros(2, dtype=np.int32, order='F')
# filter the data with the temporal templates
Conv1D = cp.RawKernel(code, 'Conv1D')
Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dfilt))
# sum each template across channels, square, take max
sumChannels = cp.RawKernel(code, 'sumChannels')
sumChannels((int(NT / Nthreads),), (Nthreads,), (d_Params, d_dfilt, d_dout, d_kkmax, d_iC))
# compute the best filter
bestFilter = cp.RawKernel(code, 'bestFilter')
bestFilter(
(int(NT / Nthreads),), (Nthreads,), (d_Params, d_dout, d_err, d_ftype, d_kkmax, d_kk))
# ignore peaks that are smaller than another nearby peak
cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
cleanup_spikes(
(int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))