How to use the dask.base.tokenize function in dask

To help you get started, we’ve selected a few dask examples based on popular ways dask.base.tokenize is used in public projects.
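Before looking at the project snippets below, here is a minimal sketch (not taken from any of those projects) of what dask.base.tokenize does: it deterministically hashes arbitrary Python objects, including lists, dicts and NumPy dtypes, into a hex digest string, so equal inputs always yield equal tokens.

import numpy as np
from dask.base import tokenize

t1 = tokenize([1, 2, 3], {"axis": 0}, np.dtype("float64"))
t2 = tokenize([1, 2, 3], {"axis": 0}, np.dtype("float64"))
t3 = tokenize([1, 2, 4], {"axis": 0}, np.dtype("float64"))

assert t1 == t2   # identical arguments -> identical token
assert t1 != t3   # changing any argument changes the token
print(t1)         # a hex digest string usable in task names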


dask / dask / dask / array / core.py (view on GitHub)
        # https://github.com/numpy/numpy/issues/6240
        # We don't inspect the values of 0d dask arrays, because these could
        # hold potentially very expensive calculations. Instead, we treat
        # them just like other arrays, and if necessary cast the result of op
        # to match.
        vals = [np.empty((1,) * max(1, a.ndim), dtype=a.dtype)
                if not is_scalar_for_elemwise(a) else a
                for a in args]
        try:
            dt = apply_infer_dtype(op, vals, {}, 'elemwise', suggest_dtype=False)
        except Exception:
            return NotImplemented
        need_enforce_dtype = any(not is_scalar_for_elemwise(a) and a.ndim == 0 for a in args)

    name = kwargs.get('name', None) or '%s-%s' % (funcname(op),
                                                  tokenize(op, dt, *args))

    atop_kwargs = dict(dtype=dt, name=name, token=funcname(op).strip('_'))
    if need_enforce_dtype:
        atop_kwargs['enforce_dtype'] = dt
        atop_kwargs['enforce_dtype_function'] = op
        op = _enforce_dtype
    result = atop(op, expr_inds,
                  *concat((a, tuple(range(a.ndim)[::-1])
                           if not is_scalar_for_elemwise(a)
                           else None) for a in args),
                  **atop_kwargs)

    return handle_out(out, result)
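In the snippet above, tokenize(op, dt, *args) feeds the operation, the inferred dtype and every argument (dask arrays included) into the token, which is then combined with funcname(op) to form a deterministic task name. A small sketch of that naming pattern, using assumed inputs rather than the real elemwise machinery:

import operator
import numpy as np
import dask.array as da
from dask.base import tokenize
from dask.utils import funcname

x = da.ones((4, 4), chunks=2)
dt = np.dtype("float64")

# Same operation on the same inputs -> same name, which is what lets
# dask deduplicate identical work across a graph.
name = "%s-%s" % (funcname(operator.add), tokenize(operator.add, dt, x, 1))
assert name == "%s-%s" % (funcname(operator.add), tokenize(operator.add, dt, x, 1))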
corteva / rioxarray / rioxarray / _io.py (view on GitHub)
        import dask

        if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
            msg = (
                "Automatic chunking requires dask.__version__ >= 0.18.0 . "
                "You currently have version %s" % dask.__version__
            )
            raise NotImplementedError(msg)
        block_shape = (1,) + riods.block_shapes[0]
        chunks = normalize_chunks(
            chunks=(1, "auto", "auto"),
            shape=(riods.count, riods.height, riods.width),
            dtype=riods.dtypes[0],
            previous_chunks=tuple((c,) for c in block_shape),
        )
    token = tokenize(filename, mtime, chunks)
    name_prefix = "open_rasterio-%s" % token
    return result.chunk(chunks, name_prefix=name_prefix, token=token)
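The token here covers the file path, its modification time and the chosen chunking, so the resulting dask keys change whenever the file on disk changes and stale results are never reused. A hedged sketch of that idea (the path and chunk sizes are made-up placeholders):

import os
from dask.base import tokenize

filename = "example.tif"    # hypothetical path
mtime = os.path.getmtime(filename) if os.path.exists(filename) else None
chunks = (1, 256, 256)

token = tokenize(filename, mtime, chunks)
name_prefix = "open_rasterio-%s" % token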
dask / dask / dask / dataframe / core.py (view on GitHub)
def elemwise(op, *args, **kwargs):
    """ Elementwise operation for dask.Dataframes """
    columns = kwargs.get('columns', None)
    name = kwargs.get('name', None)

    _name = 'elemwise-' + tokenize(op, kwargs, *args)

    dasks = [arg for arg in args if isinstance(arg, (_Frame, Scalar))]
    other = [(i, arg) for i, arg in enumerate(args)
             if not isinstance(arg, (_Frame, Scalar))]

    if other:
        op2 = partial_by_order(op, other)
    else:
        op2 = op

    dfs = [df for df in dasks if isinstance(df, _Frame)]
    divisions = dfs[0].divisions
    if not all(df.divisions == divisions for df in dfs):
        from .multi import align_partitions
        dasks, divisions, parts = align_partitions(*dasks)
    n = len(divisions) - 1
dask / dask / dask / array / wrap.py (view on GitHub)
    shape = shape.tolist()

    if not isinstance(shape, (tuple, list)):
        shape = (shape,)

    chunks = kwargs.pop("chunks", "auto")

    dtype = kwargs.pop("dtype", None)
    if dtype is None:
        dtype = func(shape, *args, **kwargs).dtype
    dtype = np.dtype(dtype)

    chunks = normalize_chunks(chunks, shape, dtype=dtype)
    name = kwargs.pop("name", None)

    name = name or funcname(func) + "-" + tokenize(
        func, shape, chunks, dtype, args, kwargs
    )

    return {
        "shape": shape,
        "dtype": dtype,
        "kwargs": kwargs,
        "chunks": chunks,
        "name": name,
    }
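The default name above folds everything that influences the created array (the creation function, shape, normalized chunks, dtype and any extra arguments) into the token. A brief sketch with assumed inputs:

import numpy as np
from dask.array import normalize_chunks
from dask.base import tokenize
from dask.utils import funcname

func, shape, dtype = np.ones, (100, 100), np.dtype("float64")
chunks = normalize_chunks("auto", shape, dtype=dtype)

# e.g. 'ones-<hex digest>'; changing the shape, chunks or dtype changes the suffix
name = funcname(func) + "-" + tokenize(func, shape, chunks, dtype, (), {})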
dask / dask / dask / dataframe / core.py (view on GitHub)
    args = [args]

    npartitions = set(arg.npartitions for arg in args
                      if isinstance(arg, _Frame))
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(token or (chunk, aggregate), meta, args,
                         chunk_kwargs, aggregate_kwargs, combine_kwargs,
                         split_every)

    # Chunk
    a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {(a, i): (chunk, key) for i, key in enumerate(args[0]._keys())}
    else:
        dsk = {(a, i): (apply, chunk, [(x._name, i) if isinstance(x, _Frame)
                                       else x for x in args], chunk_kwargs)
               for i in range(args[0].npartitions)}

    # Combine
    prefix = '{0}-combine-{1}-'.format(token or funcname(combine), token_key)
    k = npartitions
    b = a
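Here token_key covers the chunk and aggregate functions (or an explicit token), the meta, the arguments, every keyword dictionary and split_every, so two reductions only share graph keys when every piece of their configuration matches. A minimal sketch with stand-in functions and keyword arguments:

import numpy as np
from dask.base import tokenize

chunk_kwargs = {}
aggregate_kwargs = {"skipna": True}

token_key = tokenize((np.sum, np.sum), None, [], chunk_kwargs, aggregate_kwargs, None, 8)
# Same configuration -> same token; changing any keyword changes it.
assert token_key == tokenize((np.sum, np.sum), None, [], chunk_kwargs, aggregate_kwargs, None, 8)
assert token_key != tokenize((np.sum, np.sum), None, [], chunk_kwargs, {"skipna": False}, None, 8)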
dask / dask / dask / array / core.py (view on GitHub)
    This assumes that ``k`` is small.  All results will be returned in a single
    chunk.

    Examples
    --------

    >>> x = np.array([5, 1, 3, 6])
    >>> d = from_array(x, chunks=2)
    >>> d.topk(2).compute()
    array([6, 5])
    """
    if x.ndim != 1:
        raise ValueError("Topk only works on arrays of one dimension")

    token = tokenize(k, x)
    name = 'chunk.topk-' + token
    dsk = dict(((name, i), (chunk.topk, k, key))
               for i, key in enumerate(x._keys()))
    name2 = 'topk-' + token
    dsk[(name2, 0)] = (getitem, (np.sort, (np.concatenate, list(dsk))),
                       slice(-1, -k - 1, -1))
    chunks = ((k,),)

    return Array(sharedict.merge((name2, dsk), x.dask), name2, chunks, dtype=x.dtype)
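The topk snippet shows the common pattern of building a graph dict whose keys are (name, i) tuples sharing one tokenized prefix. A small sketch of that keying pattern (np.sort stands in for the real per-chunk topk task):

import numpy as np
import dask.array as da
from dask.base import tokenize

x = da.from_array(np.array([5, 1, 3, 6]), chunks=2)
k = 2

token = tokenize(k, x)
name = "chunk.topk-" + token        # shared, deterministic prefix
keys = x.__dask_keys__()            # [(x.name, 0), (x.name, 1)]
dsk = {(name, i): (np.sort, key) for i, key in enumerate(keys)}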
dask / dask / dask / array / linalg.py (view on GitHub)
    nr, nc = len(data.chunks[0]), len(data.chunks[1])
    cr_max, cc = max(data.chunks[0]), data.chunks[1][0]

    if not (data.ndim == 2 and nc == 1):  # is a matrix, with only one column of blocks
        raise ValueError(
            "Input must have the following properties:\n"
            "  1. Have two dimensions\n"
            "  2. Have only one column of blocks\n\n"
            "Note: This function (tsqr) supports QR decomposition in the case of\n"
            "tall-and-skinny matrices (single column chunk/block; see qr)"
            "Current shape: {},\nCurrent chunksize: {}".format(
                data.shape, data.chunksize
            )
        )

    token = "-" + tokenize(data, compute_svd)

    m, n = data.shape
    numblocks = (nr, 1)

    qq, rr = np.linalg.qr(np.ones(shape=(1, 1), dtype=data.dtype))

    layers = data.__dask_graph__().layers.copy()
    dependencies = data.__dask_graph__().dependencies.copy()

    # Block qr
    name_qr_st1 = "qr" + token
    dsk_qr_st1 = blockwise(
        _wrapped_qr,
        name_qr_st1,
        "ij",
        data.name,
dask / dask / dask / dataframe / core.py (view on GitHub)
        List of partitions to be used
    force : bool, default False
        Allows the expansion of the existing divisions.
        If False then the new divisions lower and upper bounds must be
        the same as the old divisions.

    Examples
    --------

    >>> df = df.repartition([0, 5, 10, 20])  # doctest: +SKIP

    Also works on Pandas objects

    >>> ddf = dd.repartition(df, [0, 5, 10, 20])  # doctest: +SKIP
    """
    token = tokenize(df, divisions)
    if isinstance(df, _Frame):
        tmp = 'repartition-split-' + token
        out = 'repartition-merge-' + token
        dsk = repartition_divisions(df.divisions, divisions,
                                    df._name, tmp, out, force=force)
        return new_dd_object(merge(df.dask, dsk), out,
                             df._meta, divisions)
    elif isinstance(df, (pd.Series, pd.DataFrame)):
        name = 'repartition-dataframe-' + token
        from .utils import shard_df_on_index
        dfs = shard_df_on_index(df, divisions[1:-1])
        dsk = dict(((name, i), df) for i, df in enumerate(dfs))
        return new_dd_object(dsk, name, df, divisions)
    raise ValueError('Data must be DataFrame or Series')
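tokenize also understands pandas objects, which is what lets repartition key on the frame itself together with the requested divisions. A short sketch with made-up data:

import pandas as pd
from dask.base import tokenize

df = pd.DataFrame({"a": [1, 2, 3]}, index=[0, 5, 10])
divisions = [0, 5, 10]

assert tokenize(df, divisions) == tokenize(df, divisions)
assert tokenize(df, divisions) != tokenize(df, [0, 10])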
dask / dask / dask / array / core.py (view on GitHub)
    if not all(x.shape == seq[0].shape for x in seq):
        raise ValueError("Stacked arrays must have the same shape. Got %s"
                         % [x.shape for x in seq])

    ind = list(range(ndim))
    uc_args = list(concat((x, ind) for x in seq))
    _, seq = unify_chunks(*uc_args)

    dt = reduce(np.promote_types, [a.dtype for a in seq])
    seq = [x.astype(dt) for x in seq]

    assert len(set(a.chunks for a in seq)) == 1  # same chunks
    chunks = (seq[0].chunks[:axis] + ((1,) * n,) + seq[0].chunks[axis:])

    names = [a.name for a in seq]
    name = 'stack-' + tokenize(names, axis)
    keys = list(product([name], *[range(len(bd)) for bd in chunks]))

    inputs = [(names[key[axis + 1]], ) + key[1:axis + 1] + key[axis + 2:]
              for key in keys]
    values = [(getitem, inp, (slice(None, None, None),) * axis +
              (None, ) + (slice(None, None, None), ) * (ndim - axis))
              for inp in inputs]

    dsk = dict(zip(keys, values))
    dsk2 = sharedict.merge((name, dsk), *[a.dask for a in seq])

    return Array(dsk2, name, chunks, dtype=dt)
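Rather than hashing the stacked arrays in full, stack tokenizes only the constituent arrays' names plus the axis; since each name already identifies its array's graph, this is cheap and still deterministic. A brief sketch with assumed inputs:

import dask.array as da
from dask.base import tokenize

seq = [da.ones((4, 4), chunks=2), da.zeros((4, 4), chunks=2)]
names = [a.name for a in seq]
name = "stack-" + tokenize(names, 0)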