How to use the datalad.log.log_progress function in datalad

To help you get started, we've selected a few datalad examples that show how log_progress is used in public projects.

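Across these snippets, log_progress is used in three phases: one call opens a progress bar (passing total, label, and unit), per-item calls advance it (update=1, increment=True), and a final call carrying only a message closes it. The first positional argument is the logger method the message is routed through, and the second is an identifier string that ties together all calls belonging to one bar. Below is a minimal sketch of that pattern; the logger name, the 'demo' identifier, and the items list are placeholders, not anything from datalad's codebase.

import logging
from datalad.log import log_progress

lgr = logging.getLogger('datalad.demo')  # placeholder logger name
items = ['one', 'two', 'three']          # placeholder work items

# phase 1: open the progress bar; 'demo' identifies it in later calls
log_progress(
    lgr.info,
    'demo',
    'Start processing %d items', len(items),
    total=len(items),
    label='Processing',
    unit=' Items',
)
for item in items:
    # phase 2: advance the bar by one item
    log_progress(
        lgr.info,
        'demo',
        'Processing %s', item,
        update=1,
        increment=True)
    # ... per-item work would happen here ...
# phase 3: a call without update/total closes the bar
log_progress(
    lgr.info,
    'demo',
    'Finished processing %d items', len(items))

Distinct identifiers keep concurrent bars apart, which is why each snippet below uses its own (for example 'extractorxmp', 'extractorannex', 'extractorimage').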

From datalad/metadata/search.py (datalad/datalad): per-dataset progress while building a search index.
# cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        log_progress(
            lgr.info,
            'autofieldidxbuild',
            'Start building search index',
            total=len(dsinfo),
            label='Building search index',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
From datalad/metadata/extractors/xmp.py (datalad/datalad): per-file progress during XMP metadata extraction.
def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        context = {}
        contentmeta = []
        log_progress(
            lgr.info,
            'extractorxmp',
            'Start XMP metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='XMP metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
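            # one tick per file: advance the bar opened above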
            log_progress(
                lgr.info,
                'extractorxmp',
                'Extract XMP metadata from %s', absfp,
                update=1,
                increment=True)
            info = file_to_dict(absfp)
            if not info:
                # got nothing, likely nothing there
                # TODO check if this is an XMP sidecar file, parse that, and assign metadata
                # to the base file
                continue
            # update vocabulary
            vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id} for ns in info}
            # TODO this is dirty and assumed that XMP is internally consistent with the
            # definitions across all files -- which it likely isn't
            context.update(vocab)
From datalad/plugin/addurls.py (datalad/datalad): progress while requesting file names for a batch of URLs.
"""Extend `rows` with values for special formatting fields.
    """
    file_fields = list(get_fmt_names(filename_format))
    if any(i.startswith("_url") for i in file_fields):
        for row, url in zip(rows, urls):
            row.update(get_url_parts(url))

    if any(i.startswith("_url_filename") for i in file_fields):
        if dry_run:  # Don't waste time making requests.
            dummy = get_file_parts("BASE.EXT", "_url_filename")
            for idx, row in enumerate(rows):
                row.update(
                    {k: v + str(idx) for k, v in dummy.items()})
        else:
            num_urls = len(urls)
            log_progress(lgr.info, "addurls_requestnames",
                         "Requesting file names for %d URLs", num_urls,
                         label="Requesting names", total=num_urls,
                         unit=" Files")
            for row, url in zip(rows, urls):
                # If we run into any issues here, we're just going to raise an
                # exception and then abort inside dlplugin.  It'd be good to
                # disentangle this from `extract` so that we could yield an
                # individual error, drop the row, and keep going.
                filename = get_url_filename(url)
                if filename:
                    row.update(get_file_parts(filename, "_url_filename"))
                else:
                    raise ValueError(
                        "{} does not contain a filename".format(url))
                log_progress(lgr.info, "addurls_requestnames",
                             "%s returned for %s", url, filename,
From datalad/metadata/extractors/annex.py (datalad/datalad): opening a progress bar and closing it early when extraction does not apply.
def _get_content_metadata(self):
        log_progress(
            lgr.info,
            'extractorannex',
            'Start annex metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='Annex metadata extraction',
            unit=' Files',
        )
        repo = self.ds.repo   # OPT: .repo could be relatively expensive
        if not isinstance(repo, AnnexRepo):
            log_progress(
                lgr.info,
                'extractorannex',
                'Finished annex metadata extraction from %s', self.ds
            )
            return
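Note the bail-out path: the 'Start' call has already opened a progress bar, so the extractor issues the 'Finished' message before returning, closing the bar rather than leaving it dangling.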
From datalad/metadata/metadata.py (datalad/datalad): per-extractor progress, with failures reported through lgr.error.
': %s' %
            (len(absent_extractors),
             single_or_plural(" is", "s are", len(absent_extractors)),
             ', '.join(absent_extractors)))

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
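Note that the first argument selects the log level: the failure branch above passes lgr.error instead of lgr.info, so the message about the failed extractor surfaces at error level.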
From datalad/plugin/addurls.py (datalad/datalad): per-URL progress while versioning URLs.
# operations.
            filename_abs = os.path.join(ds.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(ds.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = ds
                ds_filename = row["filename"]
            row.update({"filename_abs": filename_abs,
                        "ds": ds_current,
                        "ds_filename": ds_filename})

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioning %d URLs", num_urls,
                         label="Versioning URLs",
                         total=num_urls, unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s",
                                row["url"], exc_str(exc))
                log_progress(lgr.info, "addurls_versionurls",
                             "Versioned result for %s: %s", url, row["url"],
From datalad/metadata/extractors/image.py (datalad/datalad): per-file progress for image metadata extraction, from start to finish.

def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        contentmeta = []
        log_progress(
            lgr.info,
            'extractorimage',
            'Start image metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='image metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(
                lgr.info,
                'extractorimage',
                'Extract image metadata from %s', absfp,
                update=1,
                increment=True)
            try:
                img = Image.open(absfp)
            except Exception as e:
                lgr.debug("Image metadata extractor failed to load %s: %s",
                          absfp, exc_str(e))
                continue
            meta = {
                'type': 'dctype:Image',
            }

            # run all extractors
            meta.update({k: v(img) for k, v in self._extractors.items()})
            # filter useless fields (empty strings and NaNs)
            meta = {k: v for k, v in meta.items()
                    if not (hasattr(v, '__len__') and not len(v))}
            contentmeta.append((f, meta))

        log_progress(
            lgr.info,
            'extractorimage',
            'Finished image metadata extraction from %s', self.ds
        )
        return {
            '@context': vocabulary,
        }, \
            contentmeta