valid_modes = ("train", "test", "val")
msg = "Unknown value '{}' for argument split. " "Valid values are {{{}}}."
msg = msg.format(split, iterable_to_str(valid_modes))
verify_str_arg(split, "split", valid_modes, msg)
# Set the patch and stride for the patch extractor
_extract_patches_from = _extract_patches(patch_size, stride, self._complete_patches_only)
num_partitions = 5
indexes = self._data_array.shape[0]
num_elements = math.ceil(indexes / num_partitions)
train_indexes_list = []
test_indexes_list = []
val_indexes_list = []
for partition in partition_all(num_elements, range(indexes)):  # Partition files into N partitions
    train_indexes, val_indexes, test_indexes = _split_train_val_test(partition, val_ratio, test_ratio)
    train_indexes_list.extend(train_indexes)
    test_indexes_list.extend(test_indexes)
    val_indexes_list.extend(val_indexes)
if split == "train":
    indexes = train_indexes_list
elif split == "val":
    indexes = val_indexes_list
elif split == "test":
    indexes = test_indexes_list
# Extract patches
for index in indexes:
    img_array = self._data_array[index]
    mask_array = self._slice_mask_array[index]
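
# A minimal, standalone sketch (not part of the loader above) of how
# toolz.partition_all carves an index range into roughly equal partitions
# before each partition is split into train/val/test. All names below are
# illustrative only.
import math
from toolz import partition_all

num_items = 23
num_partitions = 5
chunk_size = math.ceil(num_items / num_partitions)   # 5
for partition in partition_all(chunk_size, range(num_items)):
    # partition_all yields tuples of at most chunk_size indexes;
    # only the last one may be shorter (here: 5, 5, 5, 5, 3).
    print(len(partition), partition)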

def into(dset, seq, chunksize=int(2**10), **kwargs):
    assert not isinstance(dset, type)
    for chunk in partition_all(chunksize, seq):
        into(dset, into(np.ndarray, chunk, dshape=discover(dset).measure), **kwargs)
    return dset
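
# Hedged sketch of the same chunked-append idea without odo's dispatch
# machinery: stream an iterator into an in-memory target one fixed-size
# block at a time, so the full sequence is never materialized at once.
# append_in_chunks is a made-up helper, not odo API.
from toolz import partition_all

def append_in_chunks(target, seq, chunksize=1024):
    for chunk in partition_all(chunksize, seq):   # chunk is a tuple of <= chunksize items
        target.extend(chunk)
    return target

rows = append_in_chunks([], iter(range(10000)), chunksize=1024)
print(len(rows))   # -> 10000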

def seq_to_bags(its: Iterable[Any],
                chunk_sz: int,
                name: str = 'data'):
    """Take a stream of data items and return a stream of dask.bag.Bag,
    each bag (except possibly the last) holding ``chunk_sz`` elements in a
    single partition.
    """
    for chunk in toolz.partition_all(chunk_sz, its):
        prefix = _randomize(name)
        dsk = {(prefix, 0): chunk}
        yield dask.bag.Bag(dsk, prefix, 1)
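
# Hedged alternative: dask.bag.from_sequence exposes the same
# "chunk per partition" behaviour through its partition_size argument,
# without building the task graph by hand.
import dask.bag as db

bag = db.from_sequence(range(100), partition_size=25)
print(bag.npartitions)   # -> 4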

dsk = {
    (a, 0, i): (
        apply,
        chunk,
        [(x._name, i) if isinstance(x, _Frame) else x for x in args],
        chunk_kwargs,
    )
    for i in range(args[0].npartitions)
}

# Combine
b = "{0}-combine-{1}".format(token or funcname(combine), token_key)
k = npartitions
depth = 0
while k > split_every:
    for part_i, inds in enumerate(partition_all(split_every, range(k))):
        conc = (list, [(a, depth, i) for i in inds])
        dsk[(b, depth + 1, part_i)] = (
            (apply, combine, [conc], combine_kwargs)
            if combine_kwargs
            else (combine, conc)
        )
    k = part_i + 1
    a = b
    depth += 1

# Aggregate
b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key)
conc = (list, [(a, depth, i) for i in range(k)])
if aggregate_kwargs:
    dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
else:
    dsk[(b, 0)] = (aggregate, conc)

if scalar and len(df.columns) != 2:
    raise ValueError("scalar only valid for 2 column dataframe")
token = tokenize(df, min_periods, scalar, split_every)
funcname = 'corr' if corr else 'cov'
a = '{0}-chunk-{1}'.format(funcname, df._name)
dsk = {(a, i): (cov_corr_chunk, f, corr)
for (i, f) in enumerate(df._keys())}
prefix = '{0}-combine-{1}-'.format(funcname, df._name)
k = df.npartitions
b = a
depth = 0
while k > split_every:
    b = prefix + str(depth)
    for part_i, inds in enumerate(partition_all(split_every, range(k))):
        dsk[(b, part_i)] = (cov_corr_combine, [(a, i) for i in inds], corr)
    k = part_i + 1
    a = b
    depth += 1
name = '{0}-{1}'.format(funcname, token)
dsk[(name, 0)] = (cov_corr_agg, [(a, i) for i in range(k)],
                  df.columns, min_periods, corr, scalar)
dsk.update(df.dask)
if scalar:
    return Scalar(dsk, name, 'f8')
meta = make_meta([(c, 'f8') for c in df.columns], index=df.columns)
return DataFrame(dsk, name, meta, (df.columns[0], df.columns[-1]))
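
# Hedged, pure-Python sketch of the tree-reduction pattern both graphs above
# encode: partial results are grouped into batches of split_every and combined
# repeatedly until a single result remains. tree_reduce is illustrative only.
from toolz import partition_all

def tree_reduce(values, combine, split_every=4):
    values = list(values)
    while len(values) > split_every:
        values = [combine(group) for group in partition_all(split_every, values)]
    return combine(values)

print(tree_reduce(range(100), sum, split_every=4))   # -> 4950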

def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)
    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
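
# Hedged, generic version of the double-chunking pattern above: partition the
# stream into work units, then group work units into batches of `cores` so each
# Pool.map call hands every worker one chunk. process_chunk stands in for the
# real filter function.
import multiprocessing
from toolz import partition_all

def process_chunk(chunk):
    return [item.upper() for item in chunk]

def parallel_map_chunks(stream, cores=4, chunk_size=10000):
    chunks = partition_all(chunk_size, stream)
    bigchunks = partition_all(cores, chunks)
    with multiprocessing.Pool(cores) as pool:
        for bigchunk in bigchunks:
            for result in pool.map(process_chunk, list(bigchunk)):
                yield from result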

def append_iterator_to_pymongo(coll, seq, columns=None, dshape=None, chunksize=1024, **kwargs):
    seq = iter(seq)
    item = next(seq)
    seq = concat([[item], seq])  # put the peeked item back onto the front of the stream
    if isinstance(item, (tuple, list)):
        if not columns and dshape:
            columns = dshape.measure.names
        if not columns:
            raise ValueError("Inputs must be dictionaries. "
                             "Or provide columns=[...] or dshape=DataShape(...) keyword")
        seq = (dict(zip(columns, item)) for item in seq)
    for block in partition_all(chunksize, seq):  # insert in fixed-size batches rather than one by one
        coll.insert(copy.deepcopy(block))
    return coll
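
# Hedged sketch of the "peek, then put it back" idiom used above: look at the
# first element to decide how to treat the stream, then rebuild the stream with
# toolz.concat (toolz.peek does the same thing in one call). peek_first is a
# made-up helper for illustration.
from toolz import concat, partition_all

def peek_first(seq):
    seq = iter(seq)
    first = next(seq)
    return first, concat([[first], seq])

first, stream = peek_first(x for x in range(10))
print(first)                                                 # -> 0
print([list(block) for block in partition_all(4, stream)])   # -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]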

def _lookup_most_recent_symbols(self, sids):
    symbols = {
        row.sid: {c: row[c] for c in symbol_columns}
        for row in concat(
            self.engine.execute(
                self._select_most_recent_symbols_chunk(sid_group),
            ).fetchall()
            # Query in groups no larger than SQLite's bound-parameter limit.
            for sid_group in partition_all(
                SQLITE_MAX_VARIABLE_NUMBER,
                sids,
            )
        )
    }
    if len(symbols) != len(sids):
        raise EquitiesNotFound(
            sids=set(sids) - set(symbols),
            plural=True,
        )
    return symbols
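
# Hedged, standalone sketch of the batching idea above: when a backend caps the
# number of bound parameters per statement (SQLite defaults to 999), split the
# id list into groups no larger than the cap and run one query per group.
# run_query is an assumed callable returning the rows for one group of ids.
from toolz import concat, partition_all

SQLITE_MAX_VARIABLE_NUMBER = 999

def fetch_in_batches(run_query, ids):
    return list(concat(
        run_query(group)
        for group in partition_all(SQLITE_MAX_VARIABLE_NUMBER, ids)
    ))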

def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)
    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)