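#==============================================================================
# GPU spike detection: template filtering and peak extraction
#==============================================================================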
import numpy as np
import cupy as cp

# `constants`, `Params`, `dataRAW`, `wPCA`, `iC`, `code` (the CUDA source
# string) and `Nthreads` are assumed to be provided by the surrounding module.
maxFR = constants.maxFR
NT, Nchan, NchanNear, nt0, nt0min, spkTh, NrankPC = Params
NT = int(NT)
Nchan = int(Nchan)
# Input GPU arrays.
d_Params = cp.asarray(Params, dtype=np.float64, order='F')
d_data = cp.asarray(dataRAW, dtype=np.float32, order='F')
d_W = cp.asarray(wPCA, dtype=np.float32, order='F')
d_iC = cp.asarray(iC, dtype=np.int32, order='F')
# New GPU arrays.
d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
d_counter = cp.zeros(1, dtype=np.int32, order='F')
# filter the data with the temporal templates
Conv1D = cp.RawKernel(code, 'Conv1D')
Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dout))
# get the max of the data
max1D = cp.RawKernel(code, 'max1D')
max1D((Nchan,), (Nthreads,), (d_Params, d_dout, d_dmax))
# take max across nearby channels
maxChannels = cp.RawKernel(code, 'maxChannels')
maxChannels(
    (int(NT // Nthreads),), (Nthreads,),
    (d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter))
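# A minimal sketch (assumption, not part of the original snippet): copy the
# detected spike times and channel ids back to the host, capped at the
# maxFR buffer size.
n_spikes = min(maxFR, int(d_counter[0]))
spike_times = cp.asnumpy(d_st[:n_spikes])
spike_channels = cp.asnumpy(d_id[:n_spikes])

#==============================================================================
# Batched parameter unpacking for NCCL communication
#==============================================================================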
def _batched_unpack_params(params_data, buffer, dtype, stream=None):
    n_params = params_data.n_params
    n_elems = params_data.n_elems
    params_dptr = params_data.dptr
    params_dtype = params_data.dtype
    params_size_csum = params_data.size_csum
    buf_dtype = _communication_utility._get_nccl_type_id(dtype)
    n_threads = 128
    # ceiling division: launch enough blocks to cover every element; the
    # kernel is expected to bounds-check indices >= n_elems
    n_blocks = (n_elems + n_threads - 1) // n_threads
    if stream is None:
        stream = cp.cuda.get_current_stream()
    with stream:
        _cupy_batched_unpack_params()(
            (n_blocks, ), (n_threads, ),
            (buffer.memory.ptr, buf_dtype, n_elems,
             params_dptr, params_dtype, params_size_csum, n_params))
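#==============================================================================
# Additive coupling layers: invertibility check
#==============================================================================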
# for k in range(size):
#     rev_conv_1x1 = layers[size - k - 1][1]
#     rev_x, _ = rev_conv_1x1(rev_x)
# error = cf.mean(abs(x - rev_x))
# print("lu_1x1:", error)
# additive coupling layer
params = glow.nn.additive_coupling.Parameters(
    channels_x=channels_x // 2, channels_h=128)
params.to_gpu()
params.conv_1(x[:, 0::2])  # dummy forward pass to trigger lazy weight initialization
params.conv_1.W.data = xp.random.uniform(
    -1.0, 1.0, size=params.conv_1.W.data.shape).astype("float32")
params.conv_2.W.data = xp.random.uniform(
    -1.0, 1.0, size=params.conv_2.W.data.shape).astype("float32")
params.conv_3.W.data = xp.random.uniform(
    -1.0, 1.0, size=params.conv_3.W.data.shape).astype("float32")
params.scale.data = xp.random.uniform(
    -1.0, 1.0, size=params.scale.data.shape).astype("float32")
nonlinear_mapping = glow.nn.additive_coupling.NonlinearMapping(params)
coupling_layer = glow.nn.additive_coupling.AdditiveCoupling(nn=nonlinear_mapping)
rev_coupling_layer = coupling_layer.reverse_copy()
y = x
for _ in range(size):
    y, _ = coupling_layer(y)
rev_x = y
for _ in range(size):
    rev_x, _ = rev_coupling_layer(rev_x)
error = cf.mean(abs(x - rev_x))
print("coupling:", error)
ELBO = log_px - kl_divergence
# https://arxiv.org/abs/1604.08772 Section.2
# https://www.reddit.com/r/MachineLearning/comments/56m5o2/discussion_calculation_of_bitsdims/
bits_per_pixel = -(ELBO / num_pixels_per_batch - np.log(256)) / np.log(2)
return ELBO, bits_per_pixel, negative_log_likelihood, kl_divergence
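# Sanity check for the formula above (assumption: pixels scaled to [0, 1]):
# a model no better than uniform noise has ELBO == 0, which gives
# -(0 - np.log(256)) / np.log(2) == 8.0 bits per pixel, matching 8-bit
# images; better models score below 8.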
#==============================================================================
# Training iterations
#==============================================================================
dataset_size = len(dataset_train)
random.seed(0)
np.random.seed(0)
cp.random.seed(0)
for epoch in range(meter_train.epoch, args.epochs):
    _print("Epoch {}/{}:".format(epoch + 1, args.epochs))
    meter_train.next_epoch()
    subset_indices = list(range(len(dataset_train.subset_filenames)))
    # round up so every subset is visited even when the subset count is
    # not a multiple of comm.size
    subset_size_per_gpu = len(subset_indices) // comm.size
    if len(subset_indices) % comm.size != 0:
        subset_size_per_gpu += 1
    for subset_loop in range(subset_size_per_gpu):
        random.shuffle(subset_indices)
        subset_index = subset_indices[comm.rank]
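#==============================================================================
# Spike detection: peak cleanup and spike waveform extraction
#==============================================================================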
# ignore peaks that are smaller than another nearby peak
cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
cleanup_spikes(
    (int(NT // Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))
# ignore peaks that are smaller than another nearby peak
cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
cleanup_heights(
    (1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))
# add new spikes to 2nd counter
counter[0] = d_counter[1]
counter[0] = min(maxFR, counter[0])
d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
# d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
# update dWU here by adding back to subbed spikes
extract_snips = cp.RawKernel(code, 'extract_snips')
extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))
# QUESTION: why a copy here??
# if counter[0] > 0:
# d_WU1[...] = d_WU[...]
del (d_ftype, d_kkmax, d_err, d_st, d_id, d_st1, d_x, d_kk, d_id1, d_counter,
     d_Params, d_dfilt)
return d_WU, d_dout
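#==============================================================================
# Template decomposition: W, U, mu via power SVD iterations
#==============================================================================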
d_wtw = cp.zeros((nt0, nt0, Nfilt), dtype=np.float64, order='F')
d_dWUb = cp.zeros((nt0, Nchan, Nfilt), dtype=np.float64, order='F')
tpS = (nt0, int(Nthreads // nt0))
tpK = (Nrank, int(Nthreads // Nrank))
blankdWU = cp.RawKernel(code, 'blankdWU')
blankdWU((Nfilt,), tpS, (d_Params, d_dWU, d_iC, d_iW, d_dWUb))
# compute dWU * dWU'
getwtw = cp.RawKernel(code, 'getwtw')
getwtw((Nfilt,), tpS, (d_Params, d_dWUb, d_wtw))
# get W by power svd iterations
getW = cp.RawKernel(code, 'getW')
getW((Nfilt,), (nt0,), (d_Params, d_wtw, d_W))
# compute U by W' * dWU
getU = cp.RawKernel(code, 'getU')
getU((Nfilt,), tpK, (d_Params, d_dWUb, d_W, d_U))
# normalize U, get S, get mu, renormalize W
reNormalize = cp.RawKernel(code, 'reNormalize')
reNormalize((Nfilt,), (nt0,), (d_Params, d_A, d_B, d_W, d_U, d_mu))
del d_wtw, d_Params, d_dWUb
return d_W, d_U, d_mu
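# A rough NumPy sketch (assumption) of the power iteration that 'getW'
# performs per template: repeated multiplication by wtw = dWU @ dWU.T
# converges to the leading temporal singular vector of dWU.
import numpy as np

def leading_singular_vector(dWU, n_iter=10):
    wtw = dWU @ dWU.T                 # (nt0, nt0), as built by 'getwtw'
    w = np.ones(wtw.shape[0], dtype=dWU.dtype)
    for _ in range(n_iter):
        w = wtw @ w
        w /= np.linalg.norm(w)        # renormalize every iteration
    return w                          # approximates the top singular vector

#==============================================================================
# Pairwise template cross-correlations (crossFilter -> WtW)
#==============================================================================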
Nfilt = int(Params[1])
nt0 = int(Params[9])
d_Params = cp.asarray(Params, dtype=np.float64, order='F')
d_W1 = cp.asarray(W1, dtype=np.float32, order='F')
d_W2 = cp.asarray(W2, dtype=np.float32, order='F')
d_UtU = cp.asarray(UtU, dtype=np.float32, order='F')
d_WtW = cp.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32, order='F')
grid = (1 + int(Nfilt // nblock), 1 + int(Nfilt // nblock))
block = (nblock, nblock)
crossFilter = cp.RawKernel(code, 'crossFilter')
crossFilter(grid, block, (d_Params, d_W1, d_W2, d_UtU, d_WtW))
del d_Params, d_W1, d_W2, d_UtU
return d_WtW
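# A simplified NumPy reference (assumption: single-rank templates; the real
# kernel also sums over PC ranks) for what 'crossFilter' computes: the
# temporal cross-correlation of every template pair at all 2*nt0-1 lags,
# scaled by the spatial overlap UtU[i, j].
import numpy as np

def cross_filter_ref(W1, W2, UtU):
    nt0, Nfilt = W1.shape
    WtW = np.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32)
    for i in range(Nfilt):
        for j in range(Nfilt):
            WtW[i, j] = UtU[i, j] * np.correlate(W1[:, i], W2[:, j], mode='full')
    return WtW

#==============================================================================
# Best-filter projections, peak cleanup, and host copy
#==============================================================================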
# sum each template across channels, square, take max
sumChannels = cp.RawKernel(code, 'sumChannels')
sumChannels((int(NT // Nthreads),), (Nthreads,), (d_Params, d_dfilt, d_dout, d_kkmax, d_iC))
# compute the best filter
bestFilter = cp.RawKernel(code, 'bestFilter')
bestFilter(
    (int(NT // Nthreads),), (Nthreads,), (d_Params, d_dout, d_err, d_ftype, d_kkmax, d_kk))
# ignore peaks that are smaller than another nearby peak
cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
cleanup_spikes(
    (int(NT // Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))
# move d_x to the CPU
minSize = min(maxFR, int(d_counter[0]))
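# A plausible completion of the comment above (assumption, not original code):
#     x_cpu = cp.asnumpy(d_x[:minSize])

#==============================================================================
# Test: array_split with uneven sections round-trips through concatenate
#==============================================================================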
@testing.numpy_cupy_array_equal()
def test_array_split1(self, xp):
    a = testing.shaped_arange((3, 11), xp)
    split = xp.array_split(a, 4, 1)
    return xp.concatenate(split, 1)
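# Note: splitting 11 columns into 4 sections gives widths 3, 3, 3, 2 (plain
# xp.split would raise here because 11 is not evenly divisible); concatenating
# along the same axis must reproduce the original array exactly.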