# rename user MSA file for compatibility with pplacer
if not user_msa_file.endswith('.fasta'):
    if marker_set_id == 'bac120':
        t = PATH_BAC120_USER_MSA.format(prefix=prefix)
    elif marker_set_id == 'ar122':
        t = PATH_AR122_USER_MSA.format(prefix=prefix)
    else:
        self.logger.error('There was an error determining the marker set.')
        raise GenomeMarkerSetUnknown
    shutil.copyfile(user_msa_file, t)
    user_msa_file = t
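# Hedged illustration (not from the source): the PATH_*_USER_MSA constants are
# str.format templates for per-domain file names; the template value shown
# below is an assumption, included only to make the copy-to-.fasta step concrete.
# e.g. PATH_BAC120_USER_MSA = '{prefix}.bac120.user_msa.fasta'   # assumed value
#      PATH_BAC120_USER_MSA.format(prefix='gtdbtk')
#      # -> 'gtdbtk.bac120.user_msa.fasta', an extension pplacer accepts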
# run pplacer to place bins in reference genome tree
num_genomes = sum([1 for _seq_id, _seq in read_seq(user_msa_file)])

# check if a scratch file is to be created
pplacer_mmap_file = None
if scratch_dir:
    self.logger.info('Using a scratch file for pplacer allocations. '
                     'This decreases memory usage and performance.')
    pplacer_mmap_file = os.path.join(scratch_dir, prefix + ".pplacer.scratch")
    make_sure_path_exists(scratch_dir)
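# Hedged aside (not part of the original snippet): pplacer accepts a
# --mmap-file option, and a scratch path like the one built above is
# typically forwarded to it so that pplacer's large internal allocations are
# backed by a memory-mapped file on disk instead of RAM. 'extra_pplacer_args'
# is a hypothetical name used only for illustration.
extra_pplacer_args = []
if pplacer_mmap_file:
    extra_pplacer_args += ['--mmap-file', pplacer_mmap_file]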
# get path to pplacer reference package
if marker_set_id == 'bac120':
    if levelopt is None:
        self.logger.info(f'Placing {num_genomes} bacterial genomes into reference tree '
                         f'with pplacer using {self.pplacer_cpus} cpus (be patient).')
        pplacer_ref_pkg = os.path.join(
# populate producer queue with data to process
seq_iter = read_seq(seq_file)
producer_queue = mp.Queue()
read_all_seqs = False
for _ in range(self.cpus):
    try:
        seq_data = next(seq_iter)
        producer_queue.put(seq_data)
    except StopIteration:
        read_all_seqs = True
        for _ in range(self.cpus):
            producer_queue.put(None)  # signal processes to terminate
        break

data_items = sum(1 for _ in read_seq(seq_file))

try:
    consumer_queue = mp.Queue()
    manager_proc = mp.Process(target=self.__process_manager,
                              args=(producer, producer_queue, consumer_queue))
    manager_proc.start()

    # process items produced by workers
    items_processed = 0
    consumer_data = None
    while True:
        if progress:
            status = progress(items_processed, data_items)
            sys.stdout.write('\r%s' % status)
            sys.stdout.flush()
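# ---------------------------------------------------------------------------
# Hedged, self-contained sketch of the producer/consumer pattern used above:
# the input queue is seeded with work items plus one None sentinel per worker,
# workers push results to an output queue, and the parent drains it. All names
# here (square_worker, run_parallel) are hypothetical and not part of GTDB-Tk.
# ---------------------------------------------------------------------------
import multiprocessing as mp


def square_worker(in_queue, out_queue):
    """Square items until a None sentinel is seen, then signal completion."""
    while True:
        item = in_queue.get()
        if item is None:           # sentinel: no more work for this worker
            out_queue.put(None)    # tell the parent this worker is done
            break
        out_queue.put(item * item)


def run_parallel(items, cpus=2):
    in_queue, out_queue = mp.Queue(), mp.Queue()
    for item in items:
        in_queue.put(item)
    for _ in range(cpus):
        in_queue.put(None)         # one sentinel per worker

    workers = [mp.Process(target=square_worker, args=(in_queue, out_queue))
               for _ in range(cpus)]
    for w in workers:
        w.start()

    results, finished = [], 0
    while finished < cpus:         # drain until every worker has signalled completion
        result = out_queue.get()
        if result is None:
            finished += 1
        else:
            results.append(result)

    for w in workers:
        w.join()
    return results


# Example: run_parallel(range(10)) returns the squares of 0..9 (order not guaranteed).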
    # given domain
    continue

percent_multihit_dict = self.parser_marker_summary_file(marker_summary_file, marker_set_id)
trans_table_file = os.path.join(align_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
trans_table_dict = self.parse_trans_table_file(trans_table_file)
msa_dict = read_fasta(user_msa_file)

if splittreeopt is True:
    # run pplacer to place bins in reference genome tree
    num_genomes = sum([1 for _seq_id, _seq in read_seq(user_msa_file)])

    summaryfout, debugfile, conflict_file, marker_dict = self._generate_summary_file(
        marker_set_id, prefix, out_dir, debugopt, splittreeopt)

    high_classify_tree = self.place_genomes(user_msa_file,
                                            marker_set_id,
                                            out_dir,
                                            prefix,
                                            scratch_dir,
                                            'high')

    tree = self._assign_mrca_red(high_classify_tree, marker_set_id, 'high')

    high_classification = self._get_high_pplacer_taxonomy(
        out_dir, marker_set_id, prefix, user_msa_file, tree)

    tree_mapping_dict = {}
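# Hedged sketch (not GTDB-Tk code): read_seq above yields (seq_id, seq) pairs,
# so summing 1 per pair counts the genomes in the user MSA. A minimal stand-in
# reader with the same shape, assuming well-formed FASTA headers; iter_fasta
# and count_fasta_records are hypothetical names.
def iter_fasta(path):
    """Yield (seq_id, sequence) tuples from a FASTA file."""
    seq_id, chunks = None, []
    with open(path) as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith('>'):
                if seq_id is not None:
                    yield seq_id, ''.join(chunks)
                seq_id, chunks = line[1:].split()[0], []
            elif line:
                chunks.append(line)
    if seq_id is not None:
        yield seq_id, ''.join(chunks)


def count_fasta_records(path):
    return sum(1 for _seq_id, _seq in iter_fasta(path))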
    Function to process data items.
consumer : function
    Function to consume processed data items.
seq_file : str
    Name of fasta/q file to read.
progress : function
    Function to report progress string.

Returns
-------
    Set by caller in the consumer function.
"""