Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
fast5_dir=None, summary=None, regions=None, threads=1):
"""Compress a bam into run length encoding (RLE).
:param bam_input: str, name of the bam file to be compressed
:param bam_output: str, name of the bam to be produced
:param ref_fname: str, reference filename, used to produce bam_input
:param fast5_dir: str, root directory to find the fast5 files
:param summary: str, filename of a summary name, with the columns
to link the read id to the fast5 file containing it. Must contain
columns 'read_id' and 'filename'
:param regions: list, genomic regions to be extracted
:param threads: int, number of workers to be used
:returns: None
"""
regions = medaka.common.get_regions(bam_input, regions)
ref_fasta = pysam.FastaFile(ref_fname)
# If fast_dir is passed, create an index
if fast5_dir:
with open(summary) as input_file:
# Summary files can be huge, avoid loading to memory
col_names = input_file.readline().replace('#', '').split()
col_filename = col_names.index('filename')
col_readid = col_names.index('read_id')
file_index = dict(
(line.split()[col_readid], line.split()[col_filename])
for line in input_file)
else:
file_index = None
def call_methylation(args):
"""Entry point for calling methylation from bam file."""
logger = medaka.common.get_named_logger('Mextract')
logger.info("Processing {}.".format(args.bam))
region = medaka.common.get_regions(args.bam, [args.region])[0]
ref = pysam.FastaFile(args.reference)
ref = ref.fetch(region.ref_name)
motifs = MOTIFS[args.meth]
additional_text = ''
if args.meth == 'all':
additional_text = (
" Note motifs are stranded, such that both 'C' and 'G' (and 'A' "
"and 'T') bases will be seen in the output for 5mC (6mA). "
"Output with a 'G' ('T' for 6mA) correspond to the reverse "
"strand. Users should post process the output to join/sum lines "
"belonging to the same genomic loci.")
elif args.meth == 'dcm':
additional_text = (
" The output is sorted by motif, users may wish to subsequently "
"sorted by position.")
def predict(args):
"""Inference program."""
logger_level = logging.getLogger(__package__).level
if logger_level > logging.DEBUG:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
from tensorflow.keras import backend as K
args.regions = medaka.common.get_regions(
args.bam, region_strs=args.regions)
logger = medaka.common.get_named_logger('Predict')
logger.info('Processing region(s): {}'.format(
' '.join(str(r) for r in args.regions)))
# create output and copy meta
with medaka.datastore.DataStore(args.model) as ds:
ds.copy_meta(args.output)
feature_encoder = ds.get_meta('feature_encoder')
feature_encoder.tag_name = args.tag_name
feature_encoder.tag_value = args.tag_value
feature_encoder.tag_keep_missing = args.tag_keep_missing
feature_encoder.read_group = args.RG
logger.info("Setting tensorflow threads to {}.".format(args.threads))
def predict(args):
"""Inference program."""
logger_level = logging.getLogger(__package__).level
if logger_level > logging.DEBUG:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
args.regions = medaka.common.get_regions(args.bam, region_strs=args.regions)
logger = medaka.common.get_named_logger('Predict')
logger.info('Processing region(s): {}'.format(' '.join(str(r) for r in args.regions)))
# write class names to output
with medaka.datastore.DataStore(args.model) as ds:
meta = ds.meta
with medaka.datastore.DataStore(args.output, 'w', verify_on_close=False) as ds:
ds.update_meta(meta)
logger.info("Setting tensorflow threads to {}.".format(args.threads))
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
K.set_session(tf.Session(
config=tf.ConfigProto(
intra_op_parallelism_threads=args.threads,
inter_op_parallelism_threads=args.threads)
))
def create_samples(args):
"""Entry point for creation of feature .hdfs (labelled or unlabelled)."""
logger = medaka.common.get_named_logger('Prepare')
if args.chunk_ovlp >= args.chunk_len:
raise ValueError(
'chunk_ovlp {} is not smaller than chunk_len {}'.format(
args.chunk_ovlp, args.chunk_len))
regions = medaka.common.get_regions(args.bam, args.regions)
reg_str = '\n'.join(['\t\t\t{}'.format(r) for r in regions])
logger.info('Got regions:\n{}'.format(reg_str))
if args.truth is None:
logger.warn(
'Running medaka features without a truth bam, '
'unlabelled data will be produced. Is this intended?')
time.sleep(3)
no_data = False
with medaka.datastore.DataStore(args.output, 'w') as ds:
# write feature options to file
logger.info("Writing meta data to file.")
num_qstrat = args.feature_encoder_args.get('num_qstrat')
max_run = args.label_scheme_args.get('max_run')
# If one of them is set, set the other to agree.