How to use cupy - 10 common examples

To help you get started, we’ve selected a few cupy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github rossant / pykilosort / pykilosort / cluster.py View on Github external
maxFR = constants.maxFR

    NT, Nchan, NchanNear, nt0, nt0min, spkTh, NrankPC = Params
    NT = int(NT)
    Nchan = int(Nchan)

    # Input GPU arrays.
    d_Params = cp.asarray(Params, dtype=np.float64, order='F')
    d_data = cp.asarray(dataRAW, dtype=np.float32, order='F')
    d_W = cp.asarray(wPCA, dtype=np.float32, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')

    # New GPU arrays.
    d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_counter = cp.zeros(1, dtype=np.int32, order='F')

    # filter the data with the temporal templates
    Conv1D = cp.RawKernel(code, 'Conv1D')
    Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dout))

    # get the max of the data
    max1D = cp.RawKernel(code, 'max1D')
    max1D((Nchan,), (Nthreads,), (d_Params, d_dout, d_dmax))

    # take max across nearby channels
    maxChannels = cp.RawKernel(code, 'maxChannels')
    maxChannels(
        (int(NT // Nthreads),), (Nthreads,),
        (d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter))
github chainer / chainer / chainermn / communicators / _memory_utility.py View on Github external
def _batched_unpack_params(params_data, buffer, dtype, stream=None):
    n_params = params_data.n_params
    n_elems = params_data.n_elems
    params_dptr = params_data.dptr
    params_dtype = params_data.dtype
    params_size_csum = params_data.size_csum
    buf_dtype = _communication_utility._get_nccl_type_id(dtype)
    n_threads = 128
    n_blocks = (n_elems + n_threads - 1) // n_threads
    if stream is None:
        stream = cp.cuda.get_current_stream()
    with stream:
        _cupy_batched_unpack_params()(
            (n_blocks, ), (n_threads, ),
            (buffer.memory.ptr, buf_dtype, n_elems,
             params_dptr, params_dtype, params_size_csum, n_params))
github musyoku / chainer-glow / run / debug / check_reverse.py View on Github external
# for k in range(size):
    #     rev_conv_1x1 = layers[size - k - 1][1]
    #     rev_x, _ = rev_conv_1x1(rev_x)
    # error = cf.mean(abs(x - rev_x))
    # print("lu_1x1:", error)

    # affine coupling layer
    params = glow.nn.additive_coupling.Parameters(
        channels_x=channels_x // 2, channels_h=128)
    params.to_gpu()
    params.conv_1(x[:, 0::2])
    params.conv_1.W.data = xp.random.uniform(
        -1.0, 1.0, size=params.conv_1.W.data.shape).astype("float32")
    params.conv_2.W.data = xp.random.uniform(
        -1.0, 1.0, size=params.conv_2.W.data.shape).astype("float32")
    params.conv_3.W.data = xp.random.uniform(
        -1.0, 1.0, size=params.conv_3.W.data.shape).astype("float32")
    params.scale.data = xp.random.uniform(
        -1.0, 1.0, size=params.scale.data.shape).astype("float32")
    nonlinear_mapping = glow.nn.additive_coupling.NonlinearMapping(params)
    coupling_layer = glow.nn.additive_coupling.AdditiveCoupling(
        nn=nonlinear_mapping)
    rev_coupling_layer = coupling_layer.reverse_copy()

    y = x
    for _ in range(size):
        y, _ = coupling_layer(y)
    rev_x = y
    for _ in range(size):
        rev_x, _ = rev_coupling_layer(rev_x)
    error = cf.mean(abs(x - rev_x))
    print("coupling:", error)
github musyoku / chainer-gqn / train_mn.py View on Github external
ELBO = log_px - kl_divergence

        # https://arxiv.org/abs/1604.08772 Section.2
        # https://www.reddit.com/r/MachineLearning/comments/56m5o2/discussion_calculation_of_bitsdims/
        bits_per_pixel = -(ELBO / num_pixels_per_batch - np.log(256)) / np.log(
            2)

        return ELBO, bits_per_pixel, negative_log_likelihood, kl_divergence

    #==============================================================================
    # Training iterations
    #==============================================================================
    dataset_size = len(dataset_train)
    random.seed(0)
    np.random.seed(0)
    cp.random.seed(0)

    for epoch in range(meter_train.epoch, args.epochs):
        _print("Epoch {}/{}:".format(
            epoch + 1,
            args.epochs,
        ))
        meter_train.next_epoch()

        subset_indices = list(range(len(dataset_train.subset_filenames)))
        subset_size_per_gpu = len(subset_indices) // comm.size
        if len(subset_indices) % comm.size != 0:
            subset_size_per_gpu += 1

        for subset_loop in range(subset_size_per_gpu):
            random.shuffle(subset_indices)
            subset_index = subset_indices[comm.rank]
github rossant / pykilosort / pykilosort / learn.py View on Github external
(int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))

    # ignore peaks that are smaller than another nearby peak
    cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
    cleanup_heights(
        (1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))

    # add new spikes to 2nd counter
    counter[0] = d_counter[1]
    counter[0] = min(maxFR, counter[0])

    d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
    # d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')

    # update dWU here by adding back to subbed spikes
    extract_snips = cp.RawKernel(code, 'extract_snips')
    extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))

    # QUESTION: why a copy here??
    # if counter[0] > 0:
    #     d_WU1[...] = d_WU[...]

    del (
        d_ftype, d_kkmax, d_err, d_st, d_id, d_st1, d_x, d_kk, d_id1, d_counter,
        d_Params, d_dfilt)
    return d_WU, d_dout
github rossant / pykilosort / pykilosort / learn.py View on Github external
d_wtw = cp.zeros((nt0, nt0, Nfilt), dtype=np.float64, order='F')
    d_dWUb = cp.zeros((nt0, Nchan, Nfilt), dtype=np.float64, order='F')

    tpS = (nt0, int(Nthreads // nt0))
    tpK = (Nrank, int(Nthreads // Nrank))

    blankdWU = cp.RawKernel(code, 'blankdWU')
    blankdWU((Nfilt,), tpS, (d_Params, d_dWU, d_iC, d_iW, d_dWUb))

    # compute dWU * dWU'
    getwtw = cp.RawKernel(code, 'getwtw')
    getwtw((Nfilt,), tpS, (d_Params, d_dWUb, d_wtw))

    # get W by power svd iterations
    getW = cp.RawKernel(code, 'getW')
    getW((Nfilt,), (nt0,), (d_Params, d_wtw, d_W))

    # compute U by W' * dWU
    getU = cp.RawKernel(code, 'getU')
    getU((Nfilt,), tpK, (d_Params, d_dWUb, d_W, d_U))

    # normalize U, get S, get mu, renormalize W
    reNormalize = cp.RawKernel(code, 'reNormalize')
    reNormalize((Nfilt,), (nt0,), (d_Params, d_A, d_B, d_W, d_U, d_mu))

    del d_wtw, d_Params, d_dWUb

    return d_W, d_U, d_mu
github rossant / pykilosort / pykilosort / learn.py View on Github external
Nfilt = int(Params[1])
    nt0 = int(Params[9])

    d_Params = cp.asarray(Params, dtype=np.float64, order='F')

    d_W1 = cp.asarray(W1, dtype=np.float32, order='F')
    d_W2 = cp.asarray(W2, dtype=np.float32, order='F')
    d_UtU = cp.asarray(UtU, dtype=np.float32, order='F')

    d_WtW = cp.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32, order='F')

    grid = (1 + int(Nfilt // nblock), 1 + int(Nfilt // nblock))
    block = (nblock, nblock)

    crossFilter = cp.RawKernel(code, 'crossFilter')
    crossFilter(grid, block, (d_Params, d_W1, d_W2, d_UtU, d_WtW))

    del d_Params, d_W1, d_W2, d_UtU

    return d_WtW
github rossant / pykilosort / pykilosort / learn.py View on Github external
# sum each template across channels, square, take max
    sumChannels = cp.RawKernel(code, 'sumChannels')
    sumChannels((int(NT / Nthreads),), (Nthreads,), (d_Params, d_dfilt, d_dout, d_kkmax, d_iC))

    # compute the best filter
    bestFilter = cp.RawKernel(code, 'bestFilter')
    bestFilter(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_dout, d_err, d_ftype, d_kkmax, d_kk))

    # ignore peaks that are smaller than another nearby peak
    cleanup_spikes = cp.RawKernel(code, 'cleanup_spikes')
    cleanup_spikes(
        (int(NT / Nthreads),), (Nthreads,), (d_Params, d_err, d_ftype, d_x, d_st, d_id, d_counter))

    # ignore peaks that are smaller than another nearby peak
    cleanup_heights = cp.RawKernel(code, 'cleanup_heights')
    cleanup_heights(
        (1 + int(maxFR // 32),), (32,), (d_Params, d_x, d_st, d_id, d_st1, d_id1, d_counter))

    # add new spikes to 2nd counter
    counter[0] = d_counter[1]
    counter[0] = min(maxFR, counter[0])

    d_WU = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')
    # d_WU1 = cp.zeros((nt0, Nchan, counter[0]), dtype=np.float32, order='F')

    # update dWU here by adding back to subbed spikes
    extract_snips = cp.RawKernel(code, 'extract_snips')
    extract_snips((Nchan,), tpS, (d_Params, d_st1, d_id1, d_counter, d_data, d_WU))

    # QUESTION: why a copy here??
    # if counter[0] > 0:
github rossant / pykilosort / pykilosort / cluster.py View on Github external
# Input GPU arrays.
    d_Params = cp.asarray(Params, dtype=np.float64, order='F')
    d_data = cp.asarray(dataRAW, dtype=np.float32, order='F')
    d_W = cp.asarray(wPCA, dtype=np.float32, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')

    # New GPU arrays.
    d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_counter = cp.zeros(1, dtype=np.int32, order='F')

    # filter the data with the temporal templates
    Conv1D = cp.RawKernel(code, 'Conv1D')
    Conv1D((Nchan,), (Nthreads,), (d_Params, d_data, d_W, d_dout))

    # get the max of the data
    max1D = cp.RawKernel(code, 'max1D')
    max1D((Nchan,), (Nthreads,), (d_Params, d_dout, d_dmax))

    # take max across nearby channels
    maxChannels = cp.RawKernel(code, 'maxChannels')
    maxChannels(
        (int(NT // Nthreads),), (Nthreads,),
        (d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter))

    # move d_x to the CPU
    minSize = 1
    minSize = min(maxFR, int(d_counter[0]))
github chainer / chainer / tests / cupy_tests / manipulation_tests / test_split.py View on Github external
    @testing.numpy_cupy_array_equal()
    def test_array_split1(self, xp):
        a = testing.shaped_arange((3, 11), xp)
        split = xp.array_split(a, 4, 1)
        return xp.concatenate(split, 1)