# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# -- Sphinx configuration fragment (conf.py) ---------------------------------
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'xhistogram'
copyright = u'2016-2019, xhistogram developers'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The full version, including alpha/beta/rc tags.
# NOTE(review): relies on an `import xhistogram` earlier in conf.py that is
# not visible in this fragment — confirm the package is importable at doc
# build time.
release = xhistogram.__version__
# The short X.Y version.
# e.g. release '0.3.2' -> version '0.3'
version = '.'.join(release.split('.')[:2])
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
def _bincount_2d(bin_indices, weights, N, hist_shapes):
# a trick to apply bincount on an axis-by-axis basis
# https://stackoverflow.com/questions/40591754/vectorizing-numpy-bincount
# https://stackoverflow.com/questions/40588403/vectorized-searchsorted-numpy
M = bin_indices.shape[0]
if weights is not None:
weights = weights.ravel()
bin_indices_offset = (bin_indices + (N * np.arange(M)[:, None])).ravel()
bc_offset = bincount(bin_indices_offset, weights=weights,
minlength=N*M)
final_shape = (M,) + tuple(hist_shapes)
return bc_offset.reshape(final_shape)
def _bincount_loop(bin_indices, weights, N, hist_shapes, block_chunks):
    """Blockwise version of ``_bincount_2d``.

    Processes the rows of ``bin_indices`` in consecutive chunks of sizes
    given by ``block_chunks`` (a tuple summing to the number of rows),
    bincounting each chunk separately and concatenating the results.
    Returns an array of shape ``(M,) + tuple(hist_shapes)``.
    """
    total_rows = bin_indices.shape[0]
    assert sum(block_chunks) == total_rows
    # chunk boundaries: (0, c0, c0+c1, ..., total_rows)
    edges = np.cumsum((0,) + block_chunks)
    pieces = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        sub_weights = None if weights is None else weights[lo:hi]
        pieces.append(
            _bincount_2d(bin_indices[lo:hi], sub_weights, N, hist_shapes))
    stacked = concatenate(pieces)
    return stacked.reshape((total_rows,) + tuple(hist_shapes))
def reshape_input(a):
    """Collapse *a* to 2D: axis 0 = preserved dims, axis 1 = histogram dims.

    Reads ``do_full_array``, ``axis``, ``kept_axes_shape`` and ``reshape``
    from the enclosing scope.
    """
    if do_full_array:
        # histogram over everything: a single row holding the whole array
        return a.ravel()[None, :]
    # move the histogram axes to the end, preserving the order of the rest
    target = tuple(range(-len(axis), 0))
    moved = np.moveaxis(a, axis, target)
    split = moved.ndim - len(axis)
    kept_dims = moved.shape[:split]
    hist_dims = moved.shape[split:]
    assert kept_dims == kept_axes_shape
    # flatten each group of axes into one
    return reshape(moved, (np.prod(kept_dims), np.prod(hist_dims)))
# NOTE(review): fragment of an enclosing function body — its `def` line is
# outside this view and the original indentation appears to have been
# stripped. `all_args_reshaped`, `bins`, `density`, `block_size`,
# `do_full_array` and `kept_axes_shape` come from that enclosing scope.
# If weights were appended last to the reshaped args, pop them back off.
if weights is not None:
weights_reshaped = all_args_reshaped.pop()
else:
weights_reshaped = None
# Compute the vectorized 2D histogram over the reshaped inputs.
h = _histogram_2d_vectorized(*all_args_reshaped, bins=bins,
weights=weights_reshaped,
density=density, block_size=block_size)
if h.shape[0] == 1:
# A single row means the histogram covered the full flattened array.
assert do_full_array
h = h.squeeze()
else:
# Restore the kept (non-histogrammed) axes in front of the bin axes.
final_shape = kept_axes_shape + h.shape[1:]
h = reshape(h, final_shape)
return h
# NOTE(review): looks like the tail of the vectorized-histogram routine —
# its `def` line is outside this view; `a0`, `args`, `bins`, `weights`,
# `N_inputs` and `block_size` come from the enclosing scope.
nrows, ncols = a0.shape
nbins = [len(b) for b in bins]
# digitize can emit nbins+1 distinct values (0 .. nbins), so each input
# needs nbins+1 slots, including the two out-of-range slots.
hist_shapes = [nb+1 for nb in nbins]
# a marginally faster implementation would be to use searchsorted,
# like numpy histogram itself does
# https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
# for now we stick with `digitize` because it's easy to understand how it works
# the maximum possible value of of digitize is nbins
# for right=False:
# - 0 corresponds to a < b[0]
# - i corresponds to bins[i-1] <= a < b[i]
# - nbins corresponds to a a >= b[1]
each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
# product of the bins gives the joint distribution
if N_inputs > 1:
bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
else:
bin_indices = each_bin_indices[0]
# total number of unique bin indices
N = reduce(lambda x, y: x*y, hist_shapes)
bin_counts = _dispatch_bincount(bin_indices, weights, N, hist_shapes,
block_size=block_size)
# just throw out everything outside of the bins, as np.histogram does
# TODO: make this optional?
# Trim slot 0 (below range) and the last slot (above range) on every bin
# axis, keeping the leading "kept rows" axis intact.
slices = (slice(None),) + (N_inputs * (slice(1, -1),))
return bin_counts[slices]
# NOTE(review): this span duplicates the preceding block almost verbatim
# (from `hist_shapes = ...` through `return bin_counts[slices]`) — likely
# an accidental paste duplication; confirm against the upstream file
# before removing. As written, code after the first `return` above would
# be unreachable within the same function.
hist_shapes = [nb+1 for nb in nbins]
# a marginally faster implementation would be to use searchsorted,
# like numpy histogram itself does
# https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
# for now we stick with `digitize` because it's easy to understand how it works
# the maximum possible value of of digitize is nbins
# for right=False:
# - 0 corresponds to a < b[0]
# - i corresponds to bins[i-1] <= a < b[i]
# - nbins corresponds to a a >= b[1]
each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
# product of the bins gives the joint distribution
if N_inputs > 1:
bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
else:
bin_indices = each_bin_indices[0]
# total number of unique bin indices
N = reduce(lambda x, y: x*y, hist_shapes)
bin_counts = _dispatch_bincount(bin_indices, weights, N, hist_shapes,
block_size=block_size)
# just throw out everything outside of the bins, as np.histogram does
# TODO: make this optional?
slices = (slice(None),) + (N_inputs * (slice(1, -1),))
return bin_counts[slices]
# NOTE(review): the first line below is the dangling tail of a dict
# comprehension whose opening (`old_coords = {name: a0[name]`) is missing
# from this view — the fragment is not runnable as shown.
for name in dims_to_keep if name in a0.coords}
# Merge surviving input coords with the new bin-center coords; new coords
# win on name clashes because they are applied last.
all_coords = {}
all_coords.update(old_coords)
all_coords.update(new_coords)
# CF conventions tell us how to specify cell boundaries
# http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html#cell-boundaries
# However, they require introduction of an additional dimension.
# I don't like that.
edge_dims = [a.name + bin_edge_suffix for a in args[:N_args]]
# Coordinate tuples are (dims, values, attrs); attrs copied from each input.
edge_coords = {name: ((name,), bin_edge, a.attrs)
for name, bin_edge, a in zip(edge_dims, bins, args)}
# e.g. inputs named 'x' and 'y' produce output 'histogram_x_y'.
output_name = '_'.join(['histogram'] + [a.name for a in args[:N_args]])
da_out = xr.DataArray(h_data, dims=output_dims, coords=all_coords,
name=output_name)
return da_out
# NOTE(review): body of the xarray histogram wrapper — its `def` line is
# outside this view; `bins`, `args`, `weights`, `N_args`, `N_weights`,
# `dim`, `block_size` and the suffix constants come from that scope.
# TODO: replace this with a more robust function
# NOTE(review): asserts are stripped under `python -O`; real input
# validation should raise instead.
assert len(bins)==N_args
for bin in bins:
assert isinstance(bin, np.ndarray), 'all bins must be numpy arrays'
for a in args:
# TODO: make this a more robust check
assert a.name is not None, 'all arrays must have a name'
# we drop coords to simplify alignment
args = [da.reset_coords(drop=True) for da in args]
if N_weights:
args += [weights.reset_coords(drop=True)]
# explicitly broadcast so we understand what is going into apply_ufunc
# (apply_ufunc might be doing this by itself again)
args = list(xr.align(*args, join='exact'))
# what happens if we skip this?
#args = list(xr.broadcast(*args))
a0 = args[0]
a_dims = a0.dims
# roll our own broadcasting
# now manually expand the arrays
all_dims = [d for a in args for d in a.dims]
# de-duplicate while preserving first-seen order
all_dims_ordered = list(OrderedDict.fromkeys(all_dims))
args_expanded = []
for a in args:
expand_keys = [d for d in all_dims_ordered if d not in a.dims]
a_expanded = a.expand_dims({k: 1 for k in expand_keys})
# NOTE(review): lines appear to be missing here — `args_expanded` is
# never appended to and `args_transposed` (used below) is never defined
# in the visible lines; confirm against the upstream file.
args_data = [a.data for a in args_transposed]
if N_weights:
weights_data = args_data.pop()
else:
weights_data = None
if dim is not None:
dims_to_keep = [d for d in all_dims_ordered if d not in dim]
axis = [args_transposed[0].get_axis_num(d) for d in dim]
else:
dims_to_keep = []
axis = None
# NOTE(review): leftover debug output — should be removed or routed
# through logging.
print(args_data[0].shape)
h_data = _histogram(*args_data, weights=weights_data, bins=bins, axis=axis,
block_size=block_size)
# create output dims
new_dims = [a.name + bin_dim_suffix for a in args[:N_args]]
output_dims = dims_to_keep + new_dims
# create new coords
# bin centers are the midpoints of consecutive bin edges
bin_centers = [0.5*(bin[:-1] + bin[1:]) for bin in bins]
new_coords = {name: ((name,), bin_center, a.attrs)
for name, bin_center, a in zip(new_dims, bin_centers, args)}
old_coords = {name: a0[name]
for name in dims_to_keep if name in a0.coords}
all_coords = {}
all_coords.update(old_coords)
all_coords.update(new_coords)