Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if dataset == 'train':
self.data = batcher.train_samples
elif dataset == 'validation':
self.data = batcher.valid_samples
if mini_epochs != 1:
raise ValueError(
"'mini_epochs' must be equal to 1 for validation data.")
else:
raise ValueError("'dataset' should be 'train' or 'validation'.")
original_size = len(self.data)
self.n_batches = len(self.data) // self.batch_size
self.data = self.data[:self.n_batches*self.batch_size]
np.random.shuffle(self.data)
self.logger = medaka.common.get_named_logger(
'{}Batcher'.format(dataset.capitalize()))
self.logger.info(
'{} batches of {} samples ({}), from {} original.'.format(
self.n_batches, self.batch_size, len(self.data),
original_size))
def stitch(args):
"""Entry point for stitching program."""
index = medaka.datastore.DataIndex(args.inputs)
if args.regions is None:
args.regions = sorted(index.index)
# batch size is a simple empirical heuristic
regions = medaka.common.grouper(
(common.Region.from_string(r) for r in args.regions),
batch_size=max(1, len(args.regions) // (2 * args.jobs)))
with open(args.output, 'w') as fasta:
Executor = concurrent.futures.ProcessPoolExecutor
with Executor(max_workers=args.jobs) as executor:
worker = functools.partial(stitch_from_probs, args.inputs)
for contigs in executor.map(worker, regions):
for name, info, seq in contigs:
fasta.write('>{} {}\n{}\n'.format(name, info, seq))
def stitch(args):
"""Entry point for stitching program."""
index = medaka.datastore.DataIndex(args.inputs)
if args.regions is None:
args.regions = sorted(index.index)
# batch size is a simple empirical heuristic
regions = medaka.common.grouper(
(common.Region.from_string(r) for r in args.regions),
batch_size=max(1, len(args.regions) // (2 * args.jobs)))
with open(args.output, 'w') as fasta:
Executor = concurrent.futures.ProcessPoolExecutor
with Executor(max_workers=args.jobs) as executor:
worker = functools.partial(stitch_from_probs, args.inputs)
for contigs in executor.map(worker, regions):
for name, info, seq in contigs:
fasta.write('>{} {}\n{}\n'.format(name, info, seq))
test_sample, test_fname = self.samples[0]
with medaka.datastore.DataStore(test_fname) as ds:
self.feature_shape = ds.load_sample(test_sample).features.shape
self.logger.info("Sample features have shape {}".format(self.feature_shape))
if isinstance(self.validation, float):
np.random.seed(self.seed)
np.random.shuffle(self.samples)
n_sample_train = int((1 - self.validation) * len(self.samples))
self.train_samples = self.samples[:n_sample_train]
self.valid_samples = self.samples[n_sample_train:]
msg = 'Randomly selected {} ({:3.2%}) of features for validation (seed {})'
self.logger.info(msg.format(len(self.valid_samples), self.validation, self.seed))
else:
self.train_samples = self.samples
self.valid_samples = medaka.datastore.DataIndex(self.validation).samples.copy()
msg = 'Found {} validation samples equivalent to {:3.2%} of all the data'
fraction = len(self.valid_samples) / len(self.valid_samples) + len(self.train_samples)
self.logger.info(msg.format(len(self.valid_samples), fraction))
msg = 'Got {} samples in {} batches ({} labels) for {}'
self.logger.info(msg.format(len(self.train_samples),
len(self.train_samples) // batch_size,
len(self.train_samples) * self.feature_shape[0],
'training'))
self.logger.info(msg.format(len(self.valid_samples),
len(self.valid_samples) // batch_size,
len(self.valid_samples) * self.feature_shape[0],
'validation'))
def stitch(args):
"""Entry point for stitching program."""
index = medaka.datastore.DataIndex(args.inputs)
if args.regions is None:
args.regions = sorted(index.index)
# batch size is a simple empirical heuristic
regions = medaka.common.grouper(
(common.Region.from_string(r) for r in args.regions),
batch_size=max(1, len(args.regions) // (2 * args.jobs)))
with open(args.output, 'w') as fasta:
Executor = concurrent.futures.ProcessPoolExecutor
with Executor(max_workers=args.jobs) as executor:
worker = functools.partial(stitch_from_probs, args.inputs)
for contigs in executor.map(worker, regions):
for name, info, seq in contigs:
fasta.write('>{} {}\n{}\n'.format(name, info, seq))
self.logger.info(
"Sample features have shape {}".format(self.feature_shape))
if isinstance(self.validation, float):
np.random.seed(self.seed)
np.random.shuffle(self.samples)
n_sample_train = int((1 - self.validation) * len(self.samples))
self.train_samples = self.samples[:n_sample_train]
self.valid_samples = self.samples[n_sample_train:]
self.logger.info(
'Randomly selected {} ({:3.2%}) of features for '
'validation (seed {})'.format(
len(self.valid_samples), self.validation, self.seed))
else:
self.train_samples = self.samples
self.valid_samples = medaka.datastore.DataIndex(
self.validation).samples.copy()
msg = 'Found {} validation samples, to {:3.2%} of all the data'
fraction = len(self.valid_samples) / \
(len(self.valid_samples) + len(self.train_samples))
self.logger.info(msg.format(len(self.valid_samples), fraction))
msg = 'Got {} samples in {} batches ({} labels) for {}'
self.logger.info(msg.format(
len(self.train_samples),
len(self.train_samples) // batch_size,
len(self.train_samples) * self.feature_shape[0],
'training'))
self.logger.info(msg.format(
len(self.valid_samples),
len(self.valid_samples) // batch_size,
len(self.valid_samples) * self.feature_shape[0],
def rlebam(args):
"""Entry point for merging run length information for fast5s to bam."""
logger = medaka.common.get_named_logger('BAMDecor')
read_index = medaka.common.read_key_value_tsv(args.read_index)
logger.info("Found {} read in index\n".format(len(read_index)))
def _ingress():
for line in sys.stdin:
if line[0] == '@':
yield line.rstrip(), None, None, None
else:
read_id, flag, _ = line.split('\t', 2)
is_rev = bool(int(flag) & 16)
fname = read_index[read_id]
yield line.rstrip(), read_id, is_rev, fname
with concurrent.futures.ProcessPoolExecutor(
max_workers=args.workers) as executor:
for results in executor.map(
elif max_run != num_qstrat:
raise ValueError(
'num_qstrat in feature_encoder_args must agree '
'with max_run in feature_encoder_args')
# Create and serialise to file model ancilliaries
feature_encoder = feature_encoders[args.feature_encoder](
**args.feature_encoder_args)
ds.set_meta(feature_encoder, 'feature_encoder')
label_scheme = medaka.labels.label_schemes[args.label_scheme](
**args.label_scheme_args)
ds.set_meta(label_scheme, 'label_scheme')
model_function = functools.partial(
medaka.models.build_model,
feature_encoder.feature_vector_length,
len(label_scheme._decoding))
ds.set_meta(model_function, 'model_function')
# TODO: this parallelism would be better in
# `SampleGenerator.bams_to_training_samples` since training
# alignments are usually chunked.
with concurrent.futures.ProcessPoolExecutor(max_workers=args.threads) \
as executor:
# break up overly long chunks
MAX_SIZE = int(1e6)
regions = itertools.chain(*(r.split(MAX_SIZE) for r in regions))
futures = [executor.submit(
_samples_worker, args, reg,
feature_encoder, label_scheme) for reg in regions]
inter_op_parallelism_threads=args.threads)
))
# Split overly long regions to maximum size so as to not create
# massive feature matrices
MAX_REGION_SIZE = int(1e6) # 1Mb
regions = []
for region in args.regions:
if region.size > MAX_REGION_SIZE:
regs = region.split(MAX_REGION_SIZE, args.chunk_ovlp)
else:
regs = [region]
regions.extend(regs)
logger.info("Processing {} long region(s) with batching.".format(len(regions)))
model = medaka.models.load_model(args.model, time_steps=args.chunk_len)
# the returned regions are those where the pileup width is smaller than chunk_len
remainder_regions = run_prediction(
args.output, args.bam, regions, model, args.model, args.rle_ref, args.read_fraction,
args.chunk_len, args.chunk_ovlp,
batch_size=args.batch_size, save_features=args.save_features,
tag_name=args.tag_name, tag_value=args.tag_value, tag_keep_missing=args.tag_keep_missing
)
# short/remainder regions: just do things without chunking. We can do this
# here because we now have the size of all pileups (and know they are small).
# TODO: can we avoid calculating pileups twice whilst controlling memory?
if len(remainder_regions) > 0:
logger.info("Processing {} short region(s).".format(len(remainder_regions)))
model = medaka.models.load_model(args.model, time_steps=None)
for region in remainder_regions:
new_remainders = run_prediction(
for region in args.regions:
if region.size > MAX_REGION_SIZE:
# chunk_ovlp is mostly used in overlapping pileups (which generally
# end up being expanded compared to the draft coordinate system)
regs = region.split(
MAX_REGION_SIZE, overlap=args.chunk_ovlp, fixed_size=False)
else:
regs = [region]
regions.extend(regs)
logger.info("Processing {} long region(s) with batching.".format(
len(regions)))
logger.info("Using model: {}.".format(args.model))
model = medaka.models.load_model(args.model, time_steps=args.chunk_len,
allow_cudnn=args.allow_cudnn)
# the returned regions are those where the pileup width is smaller than
# chunk_len
remainder_regions = run_prediction(
args.output, args.bam, regions, model, feature_encoder,
args.chunk_len, args.chunk_ovlp,
batch_size=args.batch_size, save_features=args.save_features
)
# short/remainder regions: just do things without chunking. We can do this
# here because we now have the size of all pileups (and know they are
# small).
# TODO: can we avoid calculating pileups twice whilst controlling memory?
if len(remainder_regions) > 0:
logger.info("Processing {} short region(s).".format(