make_sure_path_exists(out_root)
result = {}
if marker_set_id == 'bac120':
    out_pplacer = os.path.join(
        out_dir, PATH_BAC120_PPLACER_CLASS.format(prefix=prefix))
elif marker_set_id == 'ar122':
    out_pplacer = os.path.join(
        out_dir, PATH_AR122_PPLACER_CLASS.format(prefix=prefix))
else:
    self.logger.error('There was an error determining the marker set.')
    raise GenomeMarkerSetUnknown

# We get the pplacer taxonomy for comparison
with open(out_pplacer, 'w') as pplaceout:
    user_genome_ids = set(read_fasta(user_msa_file).keys())
    for leaf in tree.leaf_node_iter():
        if leaf.taxon.label in user_genome_ids:
            taxa = []
            cur_node = leaf
            while cur_node.parent_node:
                _support, taxon, _aux_info = parse_label(cur_node.label)
                if taxon:
                    for t in taxon.split(';')[::-1]:
                        taxa.append(t.strip())
                cur_node = cur_node.parent_node
            taxa_str = ';'.join(taxa[::-1])
            pplaceout.write('{}\t{}\n'.format(
                leaf.taxon.label, self.standardise_taxonomy(taxa_str, marker_set_id)))
            result[leaf.taxon.label] = self.standardise_taxonomy(
                taxa_str, marker_set_id)
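# The double reversal above is the subtle part: taxa are collected walking
# leaf -> root, and the taxa inside a single node label are listed
# parent-first, so each label is reversed on the way in and the whole list
# is reversed once at the end. A rough, self-contained sketch of that logic,
# assuming hypothetical 'support:taxon' labels as a stand-in for parse_label():
ancestor_labels = ['95.0:g__Bacillus', None, '100.0:d__Bacteria; p__Firmicutes']

taxa = []
for label in ancestor_labels:          # ordered leaf -> root
    if not label:
        continue                       # unlabelled internal node
    taxon = label.split(':')[1]        # crude stand-in for parse_label()
    for t in taxon.split(';')[::-1]:   # taxa within one label are parent-first
        taxa.append(t.strip())

print(';'.join(taxa[::-1]))            # d__Bacteria;p__Firmicutes;g__Bacillus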
concatenated_file
    The path to the MSA.
gtdb_taxonomy
    A dictionary mapping each accession to its 7-rank taxonomy.
taxa_filter
    A comma-separated list of taxa to include.
outgroup_taxon
    If using an outgroup (de novo workflow), ensure this taxon is retained.

Returns
-------
Dict[str, str]
    A mapping from genome id to MSA for the genomes retained by the filter.
"""
msa = read_fasta(concatenated_file)
msa_len = len(msa)
self.logger.info(f'Read concatenated alignment for {msa_len:,} GTDB genomes.')

if taxa_filter is not None:
    taxa_to_keep = set(taxa_filter.split(','))
    if outgroup_taxon is not None and outgroup_taxon not in taxa_to_keep:
        taxa_to_keep.add(outgroup_taxon)

    filtered_genomes = 0
    for genome_id, taxa in gtdb_taxonomy.items():
        common_taxa = taxa_to_keep.intersection(taxa)
        if len(common_taxa) == 0:
            if genome_id in msa:
                del msa[genome_id]
                filtered_genomes += 1
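# A minimal, self-contained sketch of the same filtering idea with
# hypothetical data: keep only genomes whose GTDB taxonomy shares at least
# one taxon with the requested filter.
toy_msa = {'GB_GCA_000000001.1': 'MKL-', 'RS_GCF_000000002.1': 'MKI-'}
toy_taxonomy = {
    'GB_GCA_000000001.1': ['d__Bacteria', 'p__Firmicutes', 'c__Bacilli',
                           'o__Bacillales', 'f__Bacillaceae', 'g__Bacillus', 's__'],
    'RS_GCF_000000002.1': ['d__Archaea', 'p__Halobacteriota', 'c__Halobacteria',
                           'o__Halobacteriales', 'f__Halobacteriaceae', 'g__', 's__'],
}
taxa_to_keep = set('p__Firmicutes,g__Bacillus'.split(','))

filtered = {gid: seq for gid, seq in toy_msa.items()
            if taxa_to_keep & set(toy_taxonomy[gid])}
print(list(filtered))  # only GB_GCA_000000001.1 survives the filter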
with open(os.path.join(self.pack_dir, 'metadata', 'metadata.txt')) as metafile:
    for line in metafile:
        if line.startswith('VERSION_DATA'):
            version = line.strip().split('=')[1]

# List genomes in fastani folder; the accession renaming is illustrated
# in the short example after this block
list_genomes = [os.path.basename(x) for x in glob.glob(os.path.join(
    self.pack_dir, 'fastani', 'database/*.gz'))]
list_genomes = [x.replace('_genomic.fna.gz', '').replace('GCA_', 'GB_GCA_').replace('GCF_', 'RS_GCF_')
                for x in list_genomes]

# Archaeal genome MSA is untrimmed
ar_msa_file = glob.glob(os.path.join(
    self.pack_dir, 'msa/*ar122.faa'))[0]
ar_msa = read_fasta(ar_msa_file)
first_seq = ar_msa.get(list(ar_msa.keys())[0])
if len(first_seq) != 32675:
    print('ERROR: len(first_seq) != 32675')

# Bacterial genome MSA is untrimmed
bac_msa_file = glob.glob(os.path.join(
    self.pack_dir, 'msa/*bac120.faa'))[0]
bac_msa = read_fasta(bac_msa_file)
first_seq = bac_msa.get(list(bac_msa.keys())[0])
if len(first_seq) != 41155:
    print('ERROR: len(first_seq) != 41155')

# Bacterial MASK is same length as the untrimmed bacterial genomes
bac_mask_file = glob.glob(os.path.join(
    self.pack_dir, 'masks/*bac120.mask'))[0]
bac_mask = ''
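# Quick illustration of the accession normalisation used above, with
# hypothetical file names: FastANI database files are renamed to
# GTDB-style accessions (GB_/RS_ prefixes).
names = ['GCA_000000001.1_genomic.fna.gz', 'GCF_000000002.1_genomic.fna.gz']
accessions = [n.replace('_genomic.fna.gz', '')
               .replace('GCA_', 'GB_GCA_')
               .replace('GCF_', 'RS_GCF_') for n in names]
print(accessions)  # ['GB_GCA_000000001.1', 'RS_GCF_000000002.1']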
def run(self, msa_file, marker_list):
    """Randomly select a subset of columns from the MSA of each marker."""

    # read multiple sequence alignment
    self.logger.info('Reading multiple sequence alignment.')
    msa = read_fasta(msa_file, False)
    self.logger.info('Read MSA for %d genomes.' % len(msa))

    filtered_seqs, pruned_seqs = self.trim(msa, marker_list)
    self.logger.info('Removed %d taxa with amino acids in <%.1f%% of columns in the filtered MSA.' % (
        len(pruned_seqs),
        self.min_perc_aa))

    # write out trimmed sequences
    with open(os.path.join(
            self.output_dir, "filtered_msa.faa"), 'w') as filter_file:
        for gid, seq in filtered_seqs.items():
            fasta_outstr = ">%s\n%s\n" % (gid, seq)
            filter_file.write(fasta_outstr)

    self.logger.info('Done.')
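# The trim() helper is not shown above. As a rough, hypothetical sketch of
# the pruning criterion the log message describes (drop genomes whose
# retained columns contain too few amino acids), one might write:
def prune_by_aa_fraction(msa, keep_columns, min_perc_aa=50.0):
    """Split genomes into kept/pruned by the percentage of non-gap
    characters in the retained alignment columns (illustrative only)."""
    filtered_seqs, pruned_seqs = {}, {}
    for gid, seq in msa.items():
        trimmed = ''.join(seq[i] for i in keep_columns)
        aa_perc = 100.0 * sum(c not in '-.' for c in trimmed) / max(len(trimmed), 1)
        (filtered_seqs if aa_perc >= min_perc_aa else pruned_seqs)[gid] = trimmed
    return filtered_seqs, pruned_seqs

# Toy usage: keep columns 0 and 2; the second genome is all gaps there.
print(prune_by_aa_fraction({'g1': 'MKL', 'g2': '-K-'}, [0, 2]))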
preserve_underscores=True)

for n in tree.leaf_node_iter():
    if n.taxon.label.startswith("U_"):
        subdict = uba_acc.get(n.taxon.label)
        if "gca" in subdict.keys():
            n.taxon.label = subdict.get("gca")
        else:
            n.taxon.label = subdict.get("uba")

tree.write_to_path(os.path.join(dirout, dom, 'pplacer', dom + "_r" + release + ".tree"),
                   schema='newick',
                   suppress_rooting=True,
                   unquoted_underscores=True)

trimmed_seqout = open(os.path.join(
    dirout, dom, 'pplacer', 'trimmed_msa_' + dom + '.faa'), 'w')
trimmed_fasta = read_fasta(seqfile)
for gid, seq in trimmed_fasta.items():
    if gid in genomes_to_retain:
        if gid.startswith("U_"):
            subdict = uba_acc.get(gid)
            if "gca" in subdict.keys():
                trimmed_seqout.write(
                    ">{0}\n{1}\n".format(subdict.get("gca"), seq))
            else:
                trimmed_seqout.write(
                    ">{0}\n{1}\n".format(subdict.get("uba"), seq))
        else:
            trimmed_seqout.write(">{0}\n{1}\n".format(gid, seq))
trimmed_seqout.close()

logoutf = open(os.path.join(dirout, dom, 'pplacer',
                            'fitting_' + dom + '.log'), 'w')
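# The uba_acc mapping is not shown above. A plausible toy shape for it
# (hypothetical accessions) and the relabelling rule used in both loops:
uba_acc = {
    'U_74684': {'gca': 'GB_GCA_000000001.1', 'uba': 'UBA10168'},
    'U_71557': {'uba': 'UBA9934'},  # no GenBank accession available
}

def public_label(gid):
    """Prefer the GCA accession when present, otherwise fall back to the UBA id."""
    subdict = uba_acc[gid]
    return subdict.get('gca', subdict['uba'])

print(public_label('U_74684'), public_label('U_71557'))  # GB_GCA_000000001.1 UBA9934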
# we run a FastANI comparison for each user genome against the
# selected genomes in the same genus
if len(fastani_verification) > 0:
    fastani = FastANI(cpus=self.cpus, force_single=True)
    self.logger.info(f'fastANI version: {fastani.version}')
    d_ani_compare, d_paths = self._get_fastani_genome_path(
        fastani_verification, genomes)
    all_fastani_dict = fastani.run(d_ani_compare, d_paths)

classified_user_genomes, unclassified_user_genomes = self._sort_fastani_results(
    fastani_verification, pplacer_taxonomy_dict, all_fastani_dict, msa_dict, percent_multihit_dict,
    trans_table_dict, bac_ar_diff, summaryfout)
self.logger.info('{0} genome(s) have been classified using FastANI and pplacer.'.format(
    len(classified_user_genomes)))

user_genome_ids = set(read_fasta(user_msa_file).keys())
# we remove ids already classified with FastANI
user_genome_ids = user_genome_ids.difference(
    set(classified_user_genomes))

for leaf in tree.leaf_node_iter():
    if leaf.taxon.label in user_genome_ids:
        # In some cases, pplacer can place two user genomes under the same
        # parent node, so we need to go up the tree to find a node with a
        # reference genome as a leaf.
        cur_node = leaf.parent_node
        list_subnode = [subnd.taxon.label.replace(
            "'", '') for subnd in cur_node.leaf_iter()]
        while len(set(list_subnode) & set(self.reference_ids)) < 1:
            cur_node = cur_node.parent_node
            list_subnode = [subnd.taxon.label.replace(
                "'", '') for subnd in cur_node.leaf_iter()]
"""
if maskid == 'bac' and mask_type == 'reference':
    mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
elif maskid == 'arc' and mask_type == 'reference':
    mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
elif mask_type == 'file':
    mask = maskid
else:
    self.logger.error('Command not understood.')
    raise GTDBTkException('Command not understood.')

with open(mask, 'r') as f:
    maskstr = f.readline()

with open(output_file, 'w') as outfwriter:
    dict_genomes = read_fasta(untrimmed_msa, False)
    for k, v in dict_genomes.items():
        aligned_seq = ''.join([v[i] for i in range(0, len(maskstr)) if maskstr[i] == '1'])
        fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
        outfwriter.write(fasta_outstr)
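# To make the mask semantics concrete, a toy example with hypothetical
# values: keep only the alignment columns flagged '1' in the mask string.
maskstr = '10110'  # keep columns 0, 2 and 3
aligned = {'genome_a': 'MK-LV', 'genome_b': 'MRALV'}
for gid, seq in aligned.items():
    trimmed = ''.join(seq[i] for i, keep in enumerate(maskstr) if keep == '1')
    print(gid, trimmed)  # genome_a M-L / genome_b MAL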
with open(bac_mask_file) as bmf:
    bac_mask = bmf.readline()
if len(bac_mask) != 41155:
    print('ERROR: len(bac_mask) != 41155')

# Archaeal MASK is same length as the untrimmed archaeal genomes
ar_mask_file = glob.glob(os.path.join(
    self.pack_dir, 'masks/*ar122.mask'))[0]
ar_mask = ''
with open(ar_mask_file) as amf:
    ar_mask = amf.readline()
if len(ar_mask) != 32675:
    print('ERROR: len(ar_mask) != 32675')

# Archaeal Pplacer MSA should have the same number of genomes as the
# Archaeal untrimmed MSA
ar_pplacer_msa_file = glob.glob(os.path.join(
    self.pack_dir, 'pplacer', 'gtdb_' + version + '_ar122.refpkg', 'ar122_msa_r95.faa'))[0]
ar_pplacer_msa = read_fasta(ar_pplacer_msa_file)
if len(ar_pplacer_msa) != len(ar_msa):
    print('ERROR: len(ar_pplacer_msa) != len(ar_msa)')
    print('len(ar_pplacer_msa): {}'.format(len(ar_pplacer_msa)))
    print('len(ar_msa): {}'.format(len(ar_msa)))
    print('difference genomes: {}'.format(
        list(set(ar_msa.keys()).difference(set(ar_pplacer_msa.keys())))))

first_seq = ar_pplacer_msa.get(list(ar_pplacer_msa.keys())[0])
# Archaeal Pplacer MSA should have the same length as the Archaeal mask
if len(first_seq) != len([a for a in ar_mask if a == '1']):
    print("ERROR: len(first_seq) != len([a for a in ar_mask if a == '1'])")
    print('len(first_seq): {}'.format(len(first_seq)))
    print("len([a for a in ar_mask if a == '1']): {}".format(len([a for a in ar_mask if a == '1'])))

# Bacterial Pplacer MSA should have the same number of genomes as the
# Bacterial untrimmed MSA
bac_pplacer_msa_file = os.path.join(
    self.pack_dir, 'pplacer', 'gtdb_' + version + '_bac120.refpkg', 'bac120_msa_r95.faa')
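# The consistency rule being checked above, in miniature (hypothetical
# values): the pplacer reference MSA is the untrimmed MSA with the mask
# applied, so its sequence length equals the number of '1's in the mask,
# and the mask itself is as long as the untrimmed alignment.
untrimmed_len = 10
mask = '1101001011'      # 6 columns kept
trimmed_seq = 'MKLIVA'   # a pplacer MSA sequence of length 6
assert len(mask) == untrimmed_len
assert len(trimmed_seq) == mask.count('1')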