    Up to how many leading directories within a dataset could lead to a
    sub-dataset

    Yields
    ------
    str
      Path to the generated dataset(s)
    """
    # we apparently can't import api functionality within api
    from datalad.api import add
    # To simplify managing all the file paths etc
    if not isabs(path):
        path = abspath(path)
    # make it a git (or annex??) repository... ok -- let's pick one or the
    # other at random ;)
    RepoClass = GitRepo if random.randint(0, 1) else AnnexRepo
    lgr.info("Generating repo of class %s under %s", RepoClass, path)
    repo = RepoClass(path, create=True)
    # create some dummy file and add it to the beast
    fn = opj(path, "file%d.dat" % random.randint(1, 1000))
    with open(fn, 'w') as f:
        f.write(fn)
    repo.add(fn, git=True)
    repo.commit(msg="Added %s" % fn)

    yield path

    if levels:
        # make a dataset out of it, since we want to add sub-datasets
        ds_ = Dataset(path)
        # process the levels
        level, levels_ = levels[0], levels[1:]
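
# Hedged usage sketch (not part of the original excerpt): the generator above
# yields the path of each (sub)dataset as it is created. Assuming it were
# named `make_demo_hierarchy` -- a made-up name, since the function signature
# is not shown here -- a caller could consume it lazily:
#
#     for ds_path in make_demo_hierarchy('/tmp/demo', levels=[2, 2]):
#         print("generated dataset at", ds_path)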
# helper to commit changes reported in status
_datalad_msg = False
if not message:
    message = 'Recorded changes'
    _datalad_msg = True
# TODO: remove pathobj stringification when commit() can handle it
to_commit = [str(f.relative_to(self.pathobj))
             for f, props in iteritems(status)] \
    if partial_commit else None
if not partial_commit or to_commit:
    # we directly call GitRepo.commit() to avoid a whole slew of
    # direct-mode safeguards and workarounds in the AnnexRepo
    # implementation (which also runs an additional dry-run commit)
    GitRepo.commit(
        self,
        files=to_commit,
        msg=message,
        _datalad_msg=_datalad_msg,
        options=None,
        # do not raise on an empty commit; it could be that the `add` in
        # this save-cycle has already brought a 'modified' file back into
        # a clean state
        careless=True,
    )
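
# Hedged sketch (not part of the original excerpt): the same direct
# GitRepo.commit() call on a throwaway repository. Every keyword used here
# appears in the snippet above; the path is made up.
from datalad.support.gitrepo import GitRepo

demo = GitRepo('/tmp/demo-repo', create=True)
GitRepo.commit(
    demo,
    files=None,      # None -> commit whatever is staged
    msg='Recorded changes',
    careless=True,   # do not raise if there is nothing to commit
)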
def _initiate_dataset(self, path, name):
    lgr.info("Initiating dataset %s", name)
    if self.branch is not None:
        raise NotImplementedError("Disabled for now")
        # because all the 'create' magic is stuffed into the constructor ATM,
        # we first need to initiate a git repository
        git_repo = GitRepo(path, create=True)
        # since we are initiating, that branch shouldn't exist yet, thus --orphan
        git_repo.checkout(self.branch, options=["--orphan"])
        # TODO: RF whenever create becomes a dedicated factory/method
        # and/or branch becomes an option for the "creator"
    backend = self.backend or cfg.obtain('datalad.crawl.default_backend', default='MD5E')
    direct = cfg.obtain('datalad.crawl.init_direct', default=False)
    if direct:
        raise NotImplementedError("Disabled for now to init direct mode ones")
    ds = create(
        path=path,
        force=False,
        # no_annex=False,  # TODO: add as an arg
        # Passing save arg based on backend was that we need to save only if
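
# Hedged sketch (not part of the original excerpt): the cfg.obtain() pattern
# used above reads a configuration option and falls back to a default when it
# is not set. The option names are taken verbatim from the snippet.
from datalad import cfg

backend = cfg.obtain('datalad.crawl.default_backend', default='MD5E')
direct = cfg.obtain('datalad.crawl.init_direct', default=False)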
elif class_ == GitRepo:
    type_ = "git"
else:
    raise RuntimeError("Unknown class %s." % str(class_))

while not ismount(dir_):  # TODO: always correct termination?
    if exists(opj(dir_, '.git')):
        # found a git dir
        if class_ is None:
            # detect repo type:
            try:
                return AnnexRepo(dir_, create=False)
            except RuntimeError:
                pass
            try:
                return GitRepo(dir_, create=False)
            except InvalidGitRepositoryError:
                raise RuntimeError("No datalad repository found in %s" %
                                   abspath_)
        else:
            try:
                return class_(dir_, create=False)
            except (RuntimeError, InvalidGitRepositoryError):
                raise RuntimeError("No %s repository found in %s." %
                                   (type_, abspath_))
    else:
        dir_ = normpath(opj(dir_, ".."))

if class_ is not None:
    raise RuntimeError("No %s repository found in %s" % (type_, abspath_))
else:
    raise RuntimeError("No datalad repository found in %s" % abspath_)
def knows_annex(path):
    """Return whether there is information about an annex at the given path

    It is just a thin wrapper around the GitRepo.is_with_annex() classmethod
    that additionally checks that `path` exists first. This covers actually
    present annexes, but also uninitialized ones, and even the mere presence
    of a remote annex branch.
    """
    from os.path import exists
    if not exists(path):
        lgr.debug("No annex: test path {0} doesn't exist".format(path))
        return False
    from datalad.support.gitrepo import GitRepo
    return GitRepo(path, init=False, create=False).is_with_annex()
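
# Hedged usage sketch (not part of the original excerpt; the path is made up):
from datalad.support.annexrepo import AnnexRepo

path = '/tmp/maybe-annexed'
if knows_annex(path):
    # annex information is present, so go through the annex-aware class
    repo = AnnexRepo(path, create=False)
else:
    # plain git, or no repository at all
    pass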
def is_valid_repo(cls, path, allow_noninitialized=False):
    """Return True if the given path points to an annex repository
    """
    # Note: the default allow_noninitialized=False is important for
    # invalidating an instance via self._flyweight_invalid. If this is
    # changed, we also need to override _flyweight_invalid and explicitly
    # pass allow_noninitialized=False!
    initialized_annex = GitRepo.is_valid_repo(path) and \
        exists(opj(path, '.git', 'annex'))
    if allow_noninitialized:
        try:
            return initialized_annex \
                or GitRepo(path, create=False, init=False).is_with_annex()
        except (NoSuchPathError, InvalidGitRepositoryError):
            return False
    else:
        return initialized_annex
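
# Hedged usage sketch (not part of the original excerpt; the path is made up):
# the two-step check distinguishes an initialized annex from a repository that
# merely carries annex information (e.g. a remote git-annex branch).
path = '/tmp/candidate'
if AnnexRepo.is_valid_repo(path):
    print("initialized annex")
elif AnnexRepo.is_valid_repo(path, allow_noninitialized=True):
    print("annex present, but 'git annex init' has not been run yet")
else:
    print("no annex here")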
def _fixup_submodule_dotgit_setup(ds, relativepath):
    """Implementation of our current handling of .git in a subdataset

    Each subdataset/module has its own .git directory where a standalone
    repository would have it. No gitdir files, no symlinks.
    """
    # move the actual git dir out of the superrepo's .git/modules: remove the
    # .git reference and put a real .git directory in its place
    path = opj(ds.path, relativepath)
    subds_dotgit = opj(path, ".git")
    src_dotgit = GitRepo.get_git_dir(path)
    if src_dotgit == '.git':
        # this is what we want
        return
    # first remove any conflicting worktree setup done by git to find the
    # checkout at the mountpoint of the submodule; if we kept it, any git
    # command would fail after we move .git
    GitRepo(path, init=False).config.unset(
        'core.worktree', where='local')
    # what we have here is some kind of reference; remove it and replace
    # it by the target
    os.remove(subds_dotgit)
    # make absolute
    src_dotgit = opj(path, src_dotgit)
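
# Hedged sketch (not part of the original excerpt; paths are made up): what
# the get_git_dir() check above distinguishes. For an embedded repository it
# returns plain '.git'; for a submodule checkout it typically returns the
# gitdir pointer recorded in the .git file, pointing into the superproject's
# .git/modules/.
from datalad.support.gitrepo import GitRepo

print(GitRepo.get_git_dir('/tmp/standalone-repo'))  # -> '.git'
print(GitRepo.get_git_dir('/tmp/super/sub'))        # -> e.g. '../../.git/modules/sub'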
        # recursion_limit=recursion_limit,
        action='metadata',
        # uninstalled subdatasets could be queried via aggregated metadata
        # -> no 'error'
        unavailable_path_status='',
        nondataset_path_status='error',
        # we need to know when to look into aggregated data
        force_subds_discovery=True,
        force_parentds_discovery=True,
        return_type='generator',
        on_failure='ignore'):
    if ap.get('status', None):
        # this is done
        yield ap
        continue
    if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
        ap['process_content'] = True
    to_query = None
    if ap.get('state', None) == 'absent' or \
            ap.get('type', 'dataset') != 'dataset':
        # this is a lonely absent dataset/file, or content in a present
        # dataset -> query through the parent; there must be a parent,
        # otherwise this would be a non-dataset path and would have
        # errored during annotation
        to_query = ap['parentds']
    else:
        to_query = ap['path']
    if to_query:
        pcontent = content_by_ds.get(to_query, [])
        pcontent.append(ap)
        content_by_ds[to_query] = pcontent
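
# Illustration (not part of the original excerpt; paths made up) of the
# routing above: each annotated path ends up keyed by the dataset that will
# be queried for it -- present datasets by themselves, absent ones and plain
# file content by their parent dataset:
#
#     content_by_ds == {
#         '/ds':       [ap_for_ds, ap_for_absent_subds, ap_for_file],
#         '/ds/subds': [ap_for_subds],
#     }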
"""
if path is None:
path = realpath(curdir)
# TODO: commented out to ease developing for now
# self.repo = _call(AnnexRepo, path, **kwargs)
# TODO: backend -- should be fetched from the config I guess... or should we
# give that duty to the dataset initialization routine to change default backend?
# Well -- different annexifiers might have different ideas for the backend, but
# then those could be overriden via options
if exists(path):
if not exists(opj(path, '.git')):
if (len(listdir(path))) and (not allow_dirty):
raise RuntimeError("Directory %s is not empty." % path)
self.repo = (GitRepo if no_annex else AnnexRepo)(path, always_commit=False, **kwargs)
git_remotes = self.repo.get_remotes()
if special_remotes:
if no_annex: # isinstance(self.repo, GitRepo):
raise ValueError("Cannot have special remotes in a simple git repo")
# TODO: move under AnnexRepo with proper testing etc
repo_info_repos = [v for k, v in self.repo.repo_info().items()
if k.endswith(' repositories')]
annex_remotes = {r['description']: r for r in sum(repo_info_repos, [])}
for remote in special_remotes:
if remote not in git_remotes:
if remote in annex_remotes:
# Already known - needs only enabling
lgr.info("Enabling existing special remote %s" % remote)