def test_pca_inverse():
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001   # make middle component relatively small
    X += [5, 4, 3]       # make a large mean
    dX = da.from_array(X, chunks=(n // 2, p))

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = dd.PCA(n_components=2, svd_solver="full").fit(dX)
    Y = pca.transform(dX)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)

    # same as above with whitening (approximate reconstruction)
    for solver in solver_list:
        pca = dd.PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(dX)
        Y = pca.transform(dX)
        Y_inverse = pca.inverse_transform(Y)
        assert_eq(dX, Y_inverse, atol=1e-3)
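# A quick standalone look at the chunking used above (illustrative only):
# chunks=(n // 2, p) splits the 50x3 input into two row blocks of 25 rows,
# so every block carries all three feature columns.
import numpy as np
import dask.array as da

X = np.random.RandomState(0).randn(50, 3)
dX = da.from_array(X, chunks=(25, 3))
print(dX.chunks)  # ((25, 25), (3,))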
    # Excerpt from an HDF5 reader method; the leading branch header below is
    # reconstructed from the `elif` that follows, not taken verbatim.
    if form == 'dataframe':
        dfParts = []
        chunkSize = min(self.CHUNK_SIZE, self.nEvents / self.ncores)
        nPartitions = int(self.nEvents // chunkSize) + 1

        # Determine the column names
        gNames = kwds.pop('groupnames', self.getGroupNames(wexpr='Stream'))
        colNames = self.name2alias(gNames)

        for p in range(nPartitions):  # Generate partitioned dataframe
            # Calculate the starting and ending index of every chunk of events
            eventIDStart = int(p * chunkSize)
            eventIDEnd = int(min(eventIDStart + chunkSize, self.nEvents))
            dfParts.append(d.delayed(self._assembleGroups)(gNames, amin=eventIDStart,
                                                           amax=eventIDEnd, **kwds))

        # Construct eda (event dask array) and edf (event dask dataframe)
        eda = da.from_array(np.concatenate(d.compute(*dfParts), axis=1).T,
                            chunks=self.CHUNK_SIZE)
        self.edf = ddf.from_dask_array(eda, columns=colNames)

        if ret == True:
            return self.edf

    # Delayed array for loading an HDF5 file of reasonable size (e.g. < 1GB)
    elif form == 'darray':
        gNames = kwds.pop('groupnames', self.getGroupNames(wexpr='Stream'))
        darray = d.delayed(self._assembleGroups)(gNames, amin=None, amax=None,
                                                 timeStamps=timeStamps, ret='array', **kwds)

        if ret == True:
            return darray
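# A minimal sketch of the partition pattern above, under assumed names:
# `load_chunk` stands in for self._assembleGroups and returns one
# (n_columns, n_events) block; the delayed results are concatenated,
# transposed, and wrapped as a dask dataframe, mirroring the eda/edf step.
import dask
import dask.array as da
import dask.dataframe as ddf
import numpy as np

def load_chunk(start, end):
    # one (2, n_events) block of fake event data
    return np.arange(start, end, dtype=float)[np.newaxis, :].repeat(2, axis=0)

parts = [dask.delayed(load_chunk)(s, s + 5) for s in range(0, 20, 5)]
eda = da.from_array(np.concatenate(dask.compute(*parts), axis=1).T, chunks=5)
edf = ddf.from_dask_array(eda, columns=['X', 'Y'])
print(edf.compute().shape)  # (20, 2)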
def _check_inputs(self, X, accept_sparse_negative=False, copy=False):
    kwargs = {}
    if SK_022:
        kwargs["copy"] = copy
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values
    if isinstance(X, np.ndarray):
        C = len(X) // min(multiprocessing.cpu_count(), 2)
        X = da.from_array(X, chunks=C)

    rng = check_random_state(self.random_state)
    # TODO: non-float dtypes?
    # TODO: sparse arrays?
    # TODO: mix of sparse, dense?
    sample = rng.uniform(size=(5, X.shape[1])).astype(X.dtype)
    super(QuantileTransformer, self)._check_inputs(
        sample, accept_sparse_negative=accept_sparse_negative, **kwargs
    )
    return X
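# Sketch of the chunk-size heuristic above: rows are split evenly across at
# most two blocks before handing the ndarray to dask (illustrative only).
import multiprocessing
import numpy as np
import dask.array as da

X = np.random.rand(1000, 4)
C = len(X) // min(multiprocessing.cpu_count(), 2)
dX = da.from_array(X, chunks=C)
print(dX.chunks[0])  # row blocks of C rows each, e.g. (500, 500)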
    Optimizes chunk size in different orientations to facilitate rapid
    screening of algorithm output

    Returns
    -------
    darray : Dask Array
    chunk_init : tuple (len 3), chunk size before ghosting. Used in select cases
    """

    # Compute chunk size and convert if not a Dask Array
    if not isinstance(darray, da.core.Array):
        chunk_size = util.compute_chunk_size(darray.shape,
                                             darray.dtype.itemsize,
                                             kernel=kernel,
                                             preview=preview)
        darray = da.from_array(darray, chunks=chunk_size)
        chunks_init = darray.chunks
    else:
        chunks_init = darray.chunks

    # Ghost Dask Array if operation specifies a kernel
    if kernel is not None:
        hw = tuple(np.array(kernel) // 2)
        darray = da.ghost.ghost(darray, depth=hw, boundary='reflect')

    return darray, chunks_init
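# da.ghost.ghost is the pre-0.18 spelling; in current Dask the same halo
# exchange lives at da.overlap.overlap. A minimal sketch of the ghosting step:
import numpy as np
import dask.array as da

x = da.from_array(np.arange(64).reshape(8, 8), chunks=(4, 4))
g = da.overlap.overlap(x, depth={0: 1, 1: 1}, boundary='reflect')
print(g.chunks)  # ((6, 6), (6, 6)): each 4x4 block gains a 1-cell halo per side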
def test_fit_shuffle_blocks():
    N = 10
    X = da.from_array(1 + np.arange(N).reshape(-1, 1), chunks=1)
    y = da.from_array(np.ones(N), chunks=1)
    classes = [0, 1]

    sgd = SGDClassifier(
        max_iter=5, random_state=0, fit_intercept=False, shuffle=False, tol=1e-3
    )

    sgd1 = fit(clone(sgd), X, y, random_state=0, classes=classes)
    sgd2 = fit(clone(sgd), X, y, random_state=42, classes=classes)
    assert len(sgd1.coef_) == len(sgd2.coef_) == 1
    assert not np.allclose(sgd1.coef_, sgd2.coef_)

    X, y = make_classification(random_state=0, chunks=20)
    sgd_a = fit(clone(sgd), X, y, random_state=0, classes=classes, shuffle_blocks=False)
    sgd_b = fit(
        clone(sgd), X, y, random_state=42, classes=classes, shuffle_blocks=False
    )
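# chunks=1 above yields one sample per block, so block order is sample order
# and shuffle_blocks directly controls whether samples are visited in a
# shuffled order. A quick look at the block structure (illustrative only):
import numpy as np
import dask.array as da

X = da.from_array(1 + np.arange(10).reshape(-1, 1), chunks=1)
print(X.numblocks)  # (10, 1): ten single-row blocks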
def huge_2d_array():
    array = np.vstack(1000 * [np.arange(0, 1000)])
    return da.from_array(array, chunks=(500, 500))
def read(self, filename):
    self.h5 = h5py.File(filename, 'r')
    missing = -9999.
    self.Lat = da.from_array(self.h5[u'DATA/Latitude'], chunks=CHUNK_SIZE) / 10000.
    self.Lon = da.from_array(self.h5[u'DATA/Longitude'], chunks=CHUNK_SIZE) / 10000.
    self.selected = (self.Lon > missing)
    self.file_content = {}
    for key in self.h5['DATA'].keys():
        self.file_content[key] = da.from_array(self.h5[u'DATA/' + key], chunks=CHUNK_SIZE)
    for key in self.h5[u'HEADER'].keys():
        self.file_content[key] = self.h5[u'HEADER/' + key][:]

    # Cloud Mask on pixel
    mask = 2**0 + 2**1 + 2**2
    lst = self.file_content[u'CloudMask'] & mask
    lst = lst / 2**0
    self.file_content[u"cma"] = lst

    # Cloud Mask confidence
    mask = 2**5 + 2**6
    lst = self.file_content[u'CloudMask'] & mask
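# Sketch of the bit-field decoding above: AND with a mask, then shift right by
# the field's lowest bit (the `/ 2**0` above is that shift for a field at bit 0).
import numpy as np

cloud_mask = np.array([0b01100101, 0b00100011], dtype=np.uint8)
cma = (cloud_mask & (2**0 + 2**1 + 2**2)) >> 0  # bits 0-2: cloud mask on pixel
conf = (cloud_mask & (2**5 + 2**6)) >> 5        # bits 5-6: confidence
print(cma, conf)  # [5 3] [3 1]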
vardata = {}
for k in varnames:
    vardata[k] = []

for i in iters:
    for f in fnames:
        try:
            data = _read_mds(f, i, force_dict=True, endian=endian)
            # this can screw up if the same variable appears in
            # multiple diagnostic files
            for k in data:
                if k in varnames:
                    mwrap = MemmapArrayWrapper(data[k])
                    # for some reason, da.from_array does not
                    # necessarily give a unique name;
                    # need to specify the array name explicitly
                    myda = da.from_array(mwrap, mwrap.shape,
                                         name='%s_%010d' % (k, i))
                    vardata[k].append(myda)
        except IOError:
            # couldn't find the variable, remove it from the list
            varnames.remove(k)

# final loop to create Variable objects
for k in varnames:
    try:
        dims, desc, units = _state_variables[k]
    except KeyError:
        try:
            dims, desc, units = _ptracers[k]
        except KeyError:
            dims, desc, units = diag_meta[k]
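# The name= argument used above matters because da.from_array derives graph
# keys from a token of its input, and tokens for custom wrapper objects are
# not guaranteed unique or stable. Passing an explicit name pins the keys:
import numpy as np
import dask.array as da

a = np.arange(4)
d1 = da.from_array(a, chunks=2, name='T_0000000001')
d2 = da.from_array(a, chunks=2, name='T_0000000002')
print(d1.name, d2.name)  # distinct graph keys over the same backing data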
"""
Creates a Dask array based on ``a``.
Parameters
----------
a : array-like
Object to convert to a Dask Array.
Returns
-------
a : Dask Array
"""
if not isinstance(a, dask.array.Array):
a = numpy.asarray(a)
a = dask.array.from_array(a, a.shape)
return a
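# Usage sketch for the helper above (function name assumed): array-likes become
# single-chunk dask arrays (chunks == a.shape); dask arrays pass through as-is.
import dask.array
import numpy

print(_asarray([1, 2, 3]).chunks)                     # ((3,),)
print(_asarray(dask.array.ones(4, chunks=2)).chunks)  # ((2, 2),) unchanged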
def create_xarray(arr):
    res = da.from_array(arr, chunks=(CHUNK_SIZE, CHUNK_SIZE))
    res = xr.DataArray(res, dims=['y', 'x'])
    return res
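# Self-contained sketch of the wrapper above, with CHUNK_SIZE pinned locally
# (the real value comes from the surrounding module):
import dask.array as da
import numpy as np
import xarray as xr

CHUNK_SIZE = 2048
arr = np.zeros((4096, 4096), dtype=np.float32)
data = xr.DataArray(da.from_array(arr, chunks=(CHUNK_SIZE, CHUNK_SIZE)),
                    dims=['y', 'x'])
print(data.chunks)  # ((2048, 2048), (2048, 2048))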