import argparse
import concurrent.futures
import functools
import logging
import os
import numpy as np
import pysam
import medaka.common
import medaka.datastore
import medaka.labels
from medaka.common import Region
from medaka.datastore import DataStore
from medaka.vcf import VCFReader


def stitch(args):
    """Entry point for stitching program."""
    index = medaka.datastore.DataIndex(args.inputs)
    if args.regions is None:
        args.regions = sorted(index.index)
    # batch size is a simple empirical heuristic
    regions = medaka.common.grouper(
        (medaka.common.Region.from_string(r) for r in args.regions),
        batch_size=max(1, len(args.regions) // (2 * args.jobs)))
    with open(args.output, 'w') as fasta:
        Executor = concurrent.futures.ProcessPoolExecutor
        with Executor(max_workers=args.jobs) as executor:
            worker = functools.partial(stitch_from_probs, args.inputs)
            for contigs in executor.map(worker, regions):
                for name, info, seq in contigs:
                    fasta.write('>{} {}\n{}\n'.format(name, info, seq))
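

# For reference, a grouper-style batching helper in the spirit of the
# `medaka.common.grouper` call above. This is an illustrative sketch only,
# not medaka's implementation; it assumes the simple
# `grouper(iterable, batch_size=...)` interface used in `stitch`.
import itertools


def _grouper_sketch(iterable, batch_size=100):
    """Yield successive batches (tuples) of up to `batch_size` items."""
    it = iter(iterable)
    while True:
        batch = tuple(itertools.islice(it, batch_size))
        if not batch:
            return
        yield batch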


# Method of `medaka.datastore.DataIndex`; the header below is inferred from
# the parameter documentation that follows.
def yield_from_feature_files(self, regions=None, samples=None):
    """Yield `medaka.common.Sample` objects from the indexed feature files.

    :regions: list of `medaka.common.Region` s for which to yield samples.
    :samples: iterable of sample names to yield (in order in which they
        are supplied).

    :yields: `medaka.common.Sample` objects.
    """
    if samples is not None:
        # yield samples in the order they are asked for
        for sample, fname in samples:
            yield DataStore(fname).load_sample(sample)
    else:
        all_samples = self.index
        if regions is None:
            regions = [
                medaka.common.Region.from_string(x)
                for x in sorted(all_samples)]
        for reg in regions:
            if reg.ref_name not in self.index:
                continue
            for sample in self.index[reg.ref_name]:
                # samples can have major.minor coords, round to end excl.
                sam_reg = medaka.common.Region(
                    sample['ref_name'],
                    int(float(sample['start'])),
                    int(float(sample['end'])) + 1)
                if sam_reg.overlaps(reg):
                    with DataStore(sample['filename']) as store:
                        yield store.load_sample(sample['sample_key'])
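

# The filtering above hinges on `Region.overlaps` with end-exclusive
# coordinates (hence the `+ 1` when rounding major.minor ends). As an
# illustration only (not medaka's implementation), two half-open intervals
# overlap exactly when each starts before the other ends:
def _overlaps_sketch(a_start, a_end, b_start, b_end):
    """Return True if [a_start, a_end) and [b_start, b_end) intersect."""
    return a_start < b_end and b_start < a_end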


# Name and signature below are inferred from context (this is the SNP
# counterpart to `variants_from_hdf` further down).
def snps_from_hdf(args):
    """Entry point for SNP calling from HDF5 files.

    A `LabelScheme` read from file is used to decode SNPs. All
    `LabelScheme` s define a `decode_snps` public method. We do not need
    to use `join_samples` to look for variants overlapping sample slice
    boundaries because we only analyse a single locus at a time. This
    means that `LabelScheme` s that do not define the `decode_consensus`
    method (called within `join_samples`) can be used.
    """
    logger = medaka.common.get_named_logger('SNPs')
    index = medaka.datastore.DataIndex(args.inputs)
    if args.regions is None:
        args.regions = sorted(index.index)
    regions = [
        medaka.common.Region.from_string(r)
        for r in args.regions]
    # lookup LabelScheme stored in HDF5
    try:
        label_scheme = index.metadata['label_scheme']
    except KeyError:
        logger.debug(
            "Could not find `label_scheme` metadata in input file, "
            "assuming HaploidLabelScheme.")
        label_scheme = medaka.labels.HaploidLabelScheme()
    logger.debug("Label decoding is:\n{}".format(
        '\n'.join('{}: {}'.format(k, v)
                  for k, v in label_scheme._decoding.items())))
    meta_info = label_scheme.snp_metainfo
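

# Sketch of the per-locus flow the docstring above describes: samples are
# pulled region by region and decoded independently, so `join_samples` is
# never needed. Illustrative only; `_iter_region_samples` is a hypothetical
# helper, and the actual SNP decoding call (`label_scheme.decode_snps`) is
# not reproduced because its full signature is not shown in this fragment.
def _iter_region_samples(index, regions):
    """Yield (region, sample) pairs, one region at a time."""
    for reg in regions:
        for sample in index.yield_from_feature_files(regions=[reg]):
            yield reg, sample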


def main():
    """Entry point for testing/checking."""
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    np.set_printoptions(precision=4, linewidth=100)
    parser = argparse.ArgumentParser(
        'medaka', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam', help='alignment file.')
    parser.add_argument('region', help='alignment region to sample.')
    parser.add_argument('--print', action='store_true', help='print counts.')
    parser.add_argument(
        '--dtypes', nargs='+', help='perform multi-datatype tests.')
    parser.add_argument(
        '--norm', nargs='+',
        help='additional normalisation tests (total, fwd_rev).')
    args = parser.parse_args()
    region = medaka.common.Region.from_string(args.region)
    kwargs = {}

    def _print(samples):
        if args.print:
            for p, f in zip(samples.positions, samples.features):
                print('{}\t{}\t0\t{}\t{}'.format(
                    p[0], p[1],
                    '\t'.join(
                        '{:.3f}'.format(x) if x > 0.0 else '-' for x in f),
                    sum(f)))

    dtype_options = [('',)]
    if args.dtypes is not None:
        dtype_options.append(args.dtypes)
    norm_options = [None, ]
    if args.norm is not None:
        norm_options.extend(args.norm)
    for dtypes in dtype_options:
        kwargs['dtypes'] = dtypes
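

# The fragment stops before the nested normalisation loop. For illustration,
# the dtype/norm option matrix could be walked with itertools.product;
# `run_one` is a hypothetical stand-in for the feature-generation call and
# its keyword names are assumptions, not medaka's API.
import itertools


def _run_option_matrix(dtype_options, norm_options, run_one):
    """Call `run_one(dtypes=..., normalise=...)` for every combination."""
    for dtypes, norm in itertools.product(dtype_options, norm_options):
        run_one(dtypes=dtypes, normalise=norm)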


# Header inferred from the parameter documentation below.
def get_regions(bam, region_strs=None):
    """Create `Region` objects from a bam and region strings.

    :param bam: `.bam` file.
    :param region_strs: iterable of str in zero-based (samtools-like)
        region format e.g. ref:start-end or filepath containing a
        region string per line.

    :returns: list of `Region` objects.
    """
    with pysam.AlignmentFile(bam) as bam_fh:
        ref_lengths = dict(zip(bam_fh.references, bam_fh.lengths))
    if region_strs is not None:
        if os.path.isfile(region_strs[0]):
            with open(region_strs[0]) as fh:
                region_strs = [line.strip() for line in fh.readlines()]
        regions = []
        for r in (Region.from_string(x) for x in region_strs):
            start = r.start if r.start is not None else 0
            end = r.end if r.end is not None else ref_lengths[r.ref_name]
            regions.append(Region(r.ref_name, start, end))
    else:
        regions = [
            Region(ref_name, 0, end)
            for ref_name, end in ref_lengths.items()]
    return regions
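

# Illustration of the default branch above: with no region strings, one
# whole-contig Region per reference is produced. The contig names and
# lengths below are invented purely for the example.
example_ref_lengths = {'contig1': 10000, 'contig2': 2500}
whole_contig_regions = [
    Region(ref_name, 0, end)
    for ref_name, end in example_ref_lengths.items()]
# e.g. Region('contig1', 0, 10000) and Region('contig2', 0, 2500)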


def variants_from_hdf(args):
    """Entry point for variant calling from HDF5 files.

    A `LabelScheme` read from HDF must define both a `decode_variants`
    and `decode_consensus` method. The latter is used with `join_samples`
    to detect multi-locus variants spanning `Sample` slice boundaries.
    """
    logger = medaka.common.get_named_logger('Variants')
    index = medaka.datastore.DataIndex(args.inputs)
    if args.regions is None:
        args.regions = sorted(index.index)
    regions = [
        medaka.common.Region.from_string(r)
        for r in args.regions]
    # lookup LabelScheme stored in HDF5
    try:
        label_scheme = index.metadata['label_scheme']
    except KeyError:
        logger.debug(
            "Could not find `label_scheme` metadata in input file, "
            "assuming HaploidLabelScheme.")
        label_scheme = medaka.labels.HaploidLabelScheme()
    logger.debug("Label decoding is:\n{}".format(
        '\n'.join('{}: {}'.format(k, v)
                  for k, v in label_scheme._decoding.items())))
    if not hasattr(label_scheme, 'decode_variants'):
        # label schemes lacking `decode_variants` cannot be used for
        # variant calling (error path assumed; the original fragment is
        # truncated at this point)
        raise AttributeError(
            '{} does not support decoding of variants.'.format(label_scheme))


def get_homozygous_regions(args):
    """Entry point to find homozygous regions from an input diploid VCF."""
    vcf = VCFReader(args.vcf, cache=False)
    reg = medaka.common.Region.from_string(args.region)
    if reg.start is None or reg.end is None:
        raise ValueError('Region start and end must be specified')

    def get_hetero_pos(v):
        gt = v.to_dict()['GT']
        pos = list()
        if gt[0] != gt[-1]:  # heterozygous
            pos = [p for p in range(v.pos, v.pos + len(v.ref))]
        return pos
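
    # Example of get_hetero_pos with illustrative values: a record with
    # GT '0/1', pos 100 and ref 'AT' gives [100, 101]; a homozygous '1/1'
    # record gives [].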

    def get_homo_regions(ref_name, pos, min_len=1000):
        regions = []
        hetero_gaps = np.ediff1d(pos)
        sort_inds = np.argsort(hetero_gaps)[::-1]
        homo_len = 0
        for i in sort_inds: