def load_sample(self, key):
    """Load `medaka.common.Sample` object from file.

    :param key: str, sample name.
    :returns: `medaka.common.Sample` object.
    """
    s = dict()
    for field in medaka.common.Sample._fields:
        pth = '{}/{}/{}'.format(self._sample_path_, key, field)
        if pth in self.fh:
            s[field] = self.fh[pth][()]
            # handle loading of bytestrings
            if isinstance(s[field], np.ndarray) and \
                    isinstance(s[field][0], bytes):
                s[field] = np.char.decode(s[field])
        else:
            s[field] = None
    return medaka.common.Sample(**s)
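
For context, a minimal usage sketch; this assumes the method lives on medaka's DataStore context manager wrapping an open HDF5 file handle (`self.fh`), and the file name and sample key below are hypothetical:

import medaka.datastore

# hypothetical file and key, for illustration only
with medaka.datastore.DataStore('training_features.hdf', 'r') as ds:
    sample = ds.load_sample('sample_0')  # a medaka.common.Sample namedtuple
    print(sample.ref_name, sample.features.shape)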
def amend(self, **kwargs):
    """Create new `Sample` with some attributes changed."""
    d = self._asdict()
    for k, v in kwargs.items():
        if k not in self._fields:
            raise KeyError('Invalid key for Sample: {}'.format(k))
        d[k] = v
    return Sample(**d)
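
The same pattern works on any namedtuple; a self-contained sketch using a two-field stand-in for Sample:

from collections import namedtuple

Point = namedtuple('Point', ('x', 'y'))

def amend(nt, **kwargs):
    """Return a copy of namedtuple `nt` with some fields replaced."""
    d = nt._asdict()
    for k, v in kwargs.items():
        if k not in nt._fields:
            raise KeyError('Invalid key: {}'.format(k))
        d[k] = v
    return type(nt)(**d)

print(amend(Point(1, 2), y=5))  # Point(x=1, y=5)

Note that namedtuple's built-in `_replace` does the same job; the explicit loop here mainly swaps its `ValueError` for a `KeyError` naming the offending field.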
if self.normalise == 'total':
    # normalise by total depth; clamp depth to 1 to avoid division by zero
    feature_array = counts / np.maximum(1, depth).reshape((-1, 1))
elif self.normalise == 'fwd_rev':
    # normalise forward and reverse counts separately, per read dtype
    feature_array = np.empty_like(counts, dtype=self.feature_dtype)
    for (dt, is_rev), inds in self.feature_indices.items():
        dt_depth = np.sum(counts[:, inds], axis=1)
        dt_depth[minor_inds] = dt_depth[major_ind_at_minor_inds]
        # clamp depth to 1 to avoid division by zero
        feature_array[:, inds] = \
            counts[:, inds] / np.maximum(1, dt_depth).reshape((-1, 1))
else:
    feature_array = counts
feature_array = feature_array.astype(self.feature_dtype)
sample = medaka.common.Sample(
    ref_name=region.ref_name, features=feature_array,
    labels=None, ref_seq=None,
    positions=positions, label_probs=None
)
self.logger.info('Processed {} (median depth {})'.format(
    sample.name, np.median(depth)))
return sample
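
To make the fwd/rev branch concrete, here is a self-contained toy version; the column grouping and counts are invented for illustration, and minor positions are not modelled:

import numpy as np

# toy pileup counts: columns 0-1 are forward A/C counts, 2-3 reverse
counts = np.array([[4, 0, 3, 1],
                   [2, 2, 0, 4]], dtype=float)
feature_indices = {('r9', False): [0, 1],   # forward-strand columns
                   ('r9', True): [2, 3]}    # reverse-strand columns

feature_array = np.empty_like(counts)
for (dt, is_rev), inds in feature_indices.items():
    dt_depth = np.sum(counts[:, inds], axis=1)
    # each strand's counts become fractions of that strand's depth
    feature_array[:, inds] = \
        counts[:, inds] / np.maximum(1, dt_depth).reshape((-1, 1))
print(feature_array)
# [[1.   0.   0.75 0.25]
#  [0.5  0.5  0.   1.  ]]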
# Relationship.forward_abutted guarantees all samples have the
# same ref_name
non_concat_fields = {'ref_name'}

def concat_attr(attr):
    vals = [getattr(s, attr) for s in samples]
    if attr not in non_concat_fields:
        all_none = all(v is None for v in vals)
        c = np.concatenate(vals) if not all_none else None
    else:
        assert len(set(vals)) == 1
        c = vals[0]
    return c

return Sample(**{attr: concat_attr(attr) for attr in Sample._fields})
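
A self-contained sketch of the same field-wise concatenation on a stand-in namedtuple (the data are invented):

import numpy as np
from collections import namedtuple

Chunk = namedtuple('Chunk', ('ref_name', 'features', 'positions'))
samples = [Chunk('chr1', np.array([1, 2]), np.array([0, 1])),
           Chunk('chr1', np.array([3, 4]), np.array([2, 3]))]
non_concat_fields = {'ref_name'}

def concat_attr(attr):
    vals = [getattr(s, attr) for s in samples]
    if attr not in non_concat_fields:
        return np.concatenate(vals)
    assert len(set(vals)) == 1  # all chunks must share a ref_name
    return vals[0]

merged = Chunk(**{attr: concat_attr(attr) for attr in Chunk._fields})
print(merged)  # Chunk(ref_name='chr1', features=array([1, 2, 3, 4]), ...)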
:param sample_gen: stream of (`medaka.common.Sample`,
    bool is_last_in_contig).
:param ref_seq: str, reference sequence.

:yields: `medaka.common.Sample` objects.
"""
queue = []
for s, is_last_in_contig in sample_gen:
    if is_last_in_contig:
        # there are no further samples in this contig; all remaining
        # variants must be in this sample or already in the queue,
        # so process and empty the queue
        queue.append(s)
        yield medaka.common.Sample.from_samples(queue)
        queue = []
        continue
# find the last non-variant call, i.e. a major position which
# has no minor positions and is the same as the ref

# the call retaining gaps, cast to a character array
call_with_gaps = np.array(list(
    label_scheme.decode_consensus(s, with_gaps=True)), dtype='|U1')

def get_symbol(p):
    return ref_seq[p['major']] if p['minor'] == 0 else '*'

ref_seq_with_gaps = np.fromiter(
    (get_symbol(p) for p in s.positions),
    dtype='|U1',
    count=len(s.positions))
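
As a small illustration of the gap convention (the positions and reference here are invented): minor positions correspond to inserted pileup columns relative to the reference, so they draw a '*' rather than a reference base.

import numpy as np

ref_seq = 'ACGT'
# (major, minor): minor > 0 marks an insertion after a major position
positions = np.array([(0, 0), (1, 0), (1, 1), (2, 0)],
                     dtype=[('major', int), ('minor', int)])

def get_symbol(p):
    return ref_seq[p['major']] if p['minor'] == 0 else '*'

ref_seq_with_gaps = np.fromiter(
    (get_symbol(p) for p in positions), dtype='|U1', count=len(positions))
print(ref_seq_with_gaps)  # ['A' 'C' '*' 'G']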
# if self.log_min is 10, we can cope with depth up to 10**9
feature_array[i, :] = np.log10(feature_array[i, :],
                               out=feature_array[i, :])
feature_array[i, :] += self.log_min
feature_array[i, :] = np.nan_to_num(feature_array[i, :], copy=False)

if self.ref_mode == 'onehot':
    feature_array[i, self.encoding[
        ('ref', str(ref_base), int(ref_len))]] = 1
elif self.ref_mode == 'base_length':
    feature_array[i, self.encoding['ref_base']] = \
        self.ref_base_encoding[ref_base]
    feature_array[i, self.encoding['ref_length']] = ref_len
elif self.ref_mode == 'index':
    # index of the count to which the ref base would contribute
    # were it a read
    feature_array[i, self.encoding['ref_index']] = self.encoding[
        (False, min(ref_len, self.max_hp_len), ref_base)]

sample = Sample(ref_name=region.ref_name, features=feature_array,
                labels=None, ref_seq=ref_array,
                positions=positions, label_probs=None)
logging.info('Processed {} (median depth {})'.format(
    encode_sample_name(sample), np.median(depth_array)))
return sample
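
To see what the log-scaling above does at the extremes, a toy run with invented values: zero counts give log10(0) = -inf (with a RuntimeWarning), which nan_to_num then clamps to the most negative finite float.

import numpy as np

log_min = 10  # per the comment above: copes with depth up to 10**9
row = np.array([0.0, 1.0, 100.0, 1e9])
row = np.log10(row, out=row)          # [-inf, 0., 2., 9.]
row += log_min                        # [-inf, 10., 12., 19.]
row = np.nan_to_num(row, copy=False)  # -inf -> most negative finite float
print(row)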
mock_compr = self.max_hp_len > 1 and not self.is_compressed
truth_chunk = aln.get_positions_and_labels(
    ref_compr_rle=ref_rle_fq, mock_compr=mock_compr,
    is_compressed=self.is_compressed, rle_dtype=True)
sample = self.bam_to_sample(
    bam, Region(region.ref_name, aln.start, aln.end),
    ref_rle_fq, read_fraction=read_fraction)

# create labels according to positions in the pileup
pad = (encoding[_gap_], 1) if len(truth_chunk.reads[0].dtype) > 0 \
    else encoding[_gap_]
padder = itertools.repeat(pad)
position_to_label = defaultdict(
    padder.__next__,
    zip((tuple(p) for p in truth_chunk.positions),
        truth_chunk.reads[0]))
padded_labels = np.fromiter(
    (position_to_label[tuple(p)] for p in sample.positions),
    dtype=truth_chunk.reads[0].dtype, count=len(sample.positions))

sample = sample._asdict()
sample['labels'] = padded_labels
samples.append(Sample(**sample))
return tuple(samples)
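
The `defaultdict(padder.__next__, ...)` trick above makes every position missing from the truth labels fall back to the pad label; a minimal standalone illustration with toy labels:

import itertools
from collections import defaultdict

pad = '*'
padder = itertools.repeat(pad)
# labels exist only for positions 0 and 2
position_to_label = defaultdict(padder.__next__, {0: 'A', 2: 'G'})
print([position_to_label[p] for p in range(4)])  # ['A', '*', 'G', '*']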
>>> pos = np.array([(0, 0), (1, 0), (1, 1), (2, 0)],
...                dtype=[('major', int), ('minor', int)])
>>> feat = np.arange(len(pos))
>>> s = Sample('contig1', feat, None, None, pos, None)
>>> s.slice(2)  #doctest: +ELLIPSIS
Sample(...features=2, ..., positions=(1, 1), label_probs=None)
>>> s.slice(slice(1, 3))  #doctest: +ELLIPSIS
Sample(..., features=array([1, 2]),...)
"""
non_slice_fields = {'ref_name'}

def slice_attr(attr):
    a = getattr(self, attr)
    if attr not in non_slice_fields:
        a = a[key] if a is not None else None
    return a

return Sample(**{attr: slice_attr(attr) for attr in self._fields})
else:
    # s2 ends before s1 does
    if s2.last_pos <= s1.last_pos:
        logger.info('{} ends before {}, skipping.'.format(
            s2_name, s1_name))
        continue
    # s1 and s2 overlap by only one position,
    # or there is no overlap between s1 and s2
    elif s2.first_pos >= s1.last_pos:
        # trigger a break below
        end_1, start_2 = None, None
    else:
        try:
            end_1, start_2, heuristic = \
                common.Sample.overlap_indices(s1, s2)
            if heuristic:
                logger.debug(
                    "Used heuristic to stitch {} and {}.".format(
                        s1.name, s2.name))
                heuristic_use += 1
        except common.OverlapException as e:
            logger.info(
                "Unhandled overlap type whilst stitching chunks.")
            raise e

new_seq = label_scheme.decode_consensus(
    s1.slice(slice(start_1, end_1)))
seq_parts.append(new_seq)
if end_1 is None:
    break