# cache size per process
limitmb=cfg.obtain('datalad.search.indexercachesize'),
# disable parallel indexing for now till #1927 is resolved
## number of processes for indexing
#procs=multiprocessing.cpu_count(),
## write separate index segments in each process for speed
## asks for writer.commit(optimize=True)
#multisegment=True,
)
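# For context: the options above configure a whoosh index writer. A minimal,
# self-contained sketch of how they map onto whoosh's writer API follows; the
# schema, field names, and values are made up for illustration and are not part
# of the surrounding datalad code.
import tempfile

from whoosh import index
from whoosh.fields import ID, TEXT, Schema

schema = Schema(path=ID(stored=True, unique=True), content=TEXT)
idx = index.create_in(tempfile.mkdtemp(), schema)
# limitmb caps the indexing buffer per process; procs/multisegment would enable
# parallel indexing with one segment written per process
writer = idx.writer(limitmb=256)
writer.add_document(path=u'docs/readme.txt', content=u'example text to index')
# optimize=True merges segments into one, which multisegment indexing calls for
writer.commit(optimize=True)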
# load metadata of the base dataset and what it knows about all its subdatasets
# (recursively)
old_idx_size = 0
old_ds_rpath = ''
idx_size = 0
log_progress(
lgr.info,
'autofieldidxbuild',
'Start building search index',
total=len(dsinfo),
label='Building search index',
unit=' Datasets',
)
for res in query_aggregated_metadata(
reporton=self.documenttype,
ds=self.ds,
aps=[dict(path=self.ds.path, type='dataset')],
# MIH: I cannot see a case when we would not want recursion (within
# the metadata)
recursive=True):
# this assumes that files are reported after each dataset report,
# and after a subsequent dataset report no files for the previous
# dataset will be reported again
def get_metadata(self, dataset, content):
if not content:
return {}, []
context = {}
contentmeta = []
log_progress(
lgr.info,
'extractorxmp',
'Start XMP metadata extraction from %s', self.ds,
total=len(self.paths),
label='XMP metadata extraction',
unit=' Files',
)
for f in self.paths:
absfp = opj(self.ds.path, f)
log_progress(
lgr.info,
'extractorxmp',
'Extract XMP metadata from %s', absfp,
update=1,
increment=True)
info = file_to_dict(absfp)
if not info:
# got nothing, likely nothing there
# TODO check if this is an XMP sidecar file, parse that, and assign metadata
# to the base file
continue
# update vocabulary
vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id} for ns in info}
# TODO this is dirty and assumes that XMP is internally consistent with the
# definitions across all files -- which it likely isn't
context.update(vocab)
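# To illustrate what the vocabulary comprehension above produces: file_to_dict()
# (from libxmp, as used above) maps each namespace URI to a list of
# (property, value, options) tuples, and the property prefix becomes the
# vocabulary key. The sample data and vocabulary id below are made up.
sample_vocabulary_id = 'http://example.com/xmp-vocabulary'  # assumed placeholder
sample_info = {
    'http://purl.org/dc/elements/1.1/': [('dc:title', 'A title', {})],
    'http://ns.adobe.com/xap/1.0/': [('xmp:CreatorTool', 'SomeTool', {})],
}
sample_vocab = {
    sample_info[ns][0][0].split(':')[0]: {'@id': ns, 'type': sample_vocabulary_id}
    for ns in sample_info}
# sample_vocab == {
#     'dc': {'@id': 'http://purl.org/dc/elements/1.1/',
#            'type': 'http://example.com/xmp-vocabulary'},
#     'xmp': {'@id': 'http://ns.adobe.com/xap/1.0/',
#             'type': 'http://example.com/xmp-vocabulary'}}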
"""Extend `rows` with values for special formatting fields.
"""
file_fields = list(get_fmt_names(filename_format))
if any(i.startswith("_url") for i in file_fields):
for row, url in zip(rows, urls):
row.update(get_url_parts(url))
if any(i.startswith("_url_filename") for i in file_fields):
if dry_run: # Don't waste time making requests.
dummy = get_file_parts("BASE.EXT", "_url_filename")
for idx, row in enumerate(rows):
row.update(
{k: v + str(idx) for k, v in dummy.items()})
else:
num_urls = len(urls)
log_progress(lgr.info, "addurls_requestnames",
"Requesting file names for %d URLs", num_urls,
label="Requesting names", total=num_urls,
unit=" Files")
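# get_url_filename() and get_file_parts(), used in the loop below, are datalad
# helpers. A rough, simplified equivalent for deriving a filename from a URL
# (an assumption for illustration, not the actual implementation) could be:
import posixpath
from urllib.parse import unquote, urlsplit

def guess_url_filename(url):
    """Return the last path component of a URL, or None if there is none."""
    name = posixpath.basename(unquote(urlsplit(url).path))
    return name or None

# guess_url_filename("http://example.com/data/file.tar.gz") -> "file.tar.gz"
# guess_url_filename("http://example.com/") -> None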
for row, url in zip(rows, urls):
# If we run into any issues here, we're just going to raise an
# exception and then abort inside dlplugin. It'd be good to
# disentangle this from `extract` so that we could yield an
# individual error, drop the row, and keep going.
filename = get_url_filename(url)
if filename:
row.update(get_file_parts(filename, "_url_filename"))
else:
raise ValueError(
"{} does not contain a filename".format(url))
    log_progress(lgr.info, "addurls_requestnames",
                 "%s returned for %s", url, filename,
                 update=1, increment=True)
log_progress(lgr.info, "addurls_requestnames", "Finished requesting file names")
def _get_content_metadata(self):
log_progress(
lgr.info,
'extractorannex',
'Start annex metadata extraction from %s', self.ds,
total=len(self.paths),
label='Annex metadata extraction',
unit=' Files',
)
repo = self.ds.repo # OPT: .repo could be relatively expensive
if not isinstance(repo, AnnexRepo):
log_progress(
lgr.info,
'extractorannex',
'Finished annex metadata extraction from %s', self.ds
)
return
if absent_extractors:
    raise ValueError(
        '%d enabled metadata extractor%s not available in this installation'
        ': %s' %
        (len(absent_extractors),
         single_or_plural(" is", "s are", len(absent_extractors)),
         ', '.join(absent_extractors)))
log_progress(
lgr.info,
'metadataextractors',
'Start metadata extraction from %s', ds,
total=len(types),
label='Metadata extraction',
unit=' extractors',
)
for mtype in types:
mtype_key = mtype
log_progress(
lgr.info,
'metadataextractors',
'Engage %s metadata extractor', mtype_key,
update=1,
increment=True)
try:
extractor_cls = extractors[mtype_key].load()
extractor = extractor_cls(
ds,
paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
except Exception as e:
log_progress(
lgr.error,
'metadataextractors',
'Failed %s metadata extraction from %s', mtype_key, ds,
)
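# `extractors` above maps extractor names to loadable plugin objects (the
# .load() call imports the extractor class). Assuming extractors are registered
# as package entry points -- the group name below is an assumption for
# illustration -- such a mapping could be built like this (Python 3.10+):
from importlib.metadata import entry_points

def discover_extractors(group='datalad.metadata.extractors'):
    """Map entry point names to entry points; call .load() on one to import it."""
    return {ep.name: ep for ep in entry_points(group=group)}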
# Add per-row fields (absolute filename, containing dataset, and
# dataset-relative filename) that subsequent operations need.
filename_abs = os.path.join(ds.path, row["filename"])
if row["subpath"]:
ds_current = Dataset(os.path.join(ds.path,
row["subpath"]))
ds_filename = os.path.relpath(filename_abs, ds_current.path)
else:
ds_current = ds
ds_filename = row["filename"]
row.update({"filename_abs": filename_abs,
"ds": ds_current,
"ds_filename": ds_filename})
if version_urls:
num_urls = len(rows)
log_progress(lgr.info, "addurls_versionurls",
"Versioning %d URLs", num_urls,
label="Versioning URLs",
total=num_urls, unit=" URLs")
for row in rows:
url = row["url"]
try:
row["url"] = get_versioned_url(url)
except (ValueError, NotImplementedError) as exc:
# We don't expect this to happen because get_versioned_url
# should return the original URL if it isn't an S3 bucket.
# It only raises exceptions if it doesn't know how to
# handle the scheme for what looks like an S3 bucket.
lgr.warning("error getting version of %s: %s",
row["url"], exc_str(exc))
    log_progress(lgr.info, "addurls_versionurls",
                 "Versioned result for %s: %s", url, row["url"],
                 update=1, increment=True)
log_progress(lgr.info, "addurls_versionurls", "Finished versioning URLs")
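# For context: "versioning" here means pinning an S3 URL to a concrete object
# version. get_versioned_url() queries S3 for the real version id; the sketch
# below only illustrates the shape of the result (the version id is made up).
def append_version_id(url, version_id):
    """Attach a versionId query parameter to an (assumed) S3 URL."""
    sep = '&' if '?' in url else '?'
    return '{}{}versionId={}'.format(url, sep, version_id)

# append_version_id("http://bucket.s3.amazonaws.com/key.dat", "3HL4kqtJ")
# -> "http://bucket.s3.amazonaws.com/key.dat?versionId=3HL4kqtJ"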
try:
    img = Image.open(absfp)
except Exception as e:
    lgr.debug("Image metadata extractor failed to load %s: %s",
              absfp, exc_str(e))
    continue
meta = {
'type': 'dctype:Image',
}
# run all extractors
meta.update({k: v(img) for k, v in self._extractors.items()})
# filter useless fields (empty strings and NaNs)
meta = {k: v for k, v in meta.items()
if not (hasattr(v, '__len__') and not len(v))}
contentmeta.append((f, meta))
log_progress(
lgr.info,
'extractorimage',
'Finished image metadata extraction from %s', self.ds
)
return {
'@context': vocabulary,
}, \
contentmeta
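# self._extractors above maps metadata field names to callables applied to a
# PIL/Pillow Image. The field names and callables below are illustrative
# assumptions, not datalad's actual mapping:
from PIL import Image

sample_image_extractors = {
    'format': lambda img: img.format,
    'color_mode': lambda img: img.mode,
    'dcterms:SizeOrDuration': lambda img: img.size,
    'spatial_resolution(dpi)': lambda img: img.info.get('dpi', ''),
}
# e.g.:
# img = Image.open('photo.jpg')
# meta = {k: v(img) for k, v in sample_image_extractors.items()}
# meta -> {'format': 'JPEG', 'color_mode': 'RGB', ...}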
def get_metadata(self, dataset, content):
if not content:
return {}, []
contentmeta = []
log_progress(
lgr.info,
'extractorimage',
'Start image metadata extraction from %s', self.ds,
total=len(self.paths),
label='image metadata extraction',
unit=' Files',
)
for f in self.paths:
absfp = opj(self.ds.path, f)
log_progress(
lgr.info,
'extractorimage',
'Extract image metadata from %s', absfp,
update=1,
increment=True)
try:
img = Image.open(absfp)
except Exception as e:
lgr.debug("Image metadata extractor failed to load %s: %s",
absfp, exc_str(e))
continue
meta = {
'type': 'dctype:Image',
}
# run all extractors