Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Taxonomic classifications of SSU sequences for each genome.
"""
blast = Blast(self.cpus)
self.logger.info('Classifying SSU rRNA genes.')
classifications = defaultdict(dict)
for genome_id, seq_file in seq_files.items():
genome_dir = os.path.join(output_dir, genome_id)
# blast sequences against 16S database
blast_file = os.path.join(genome_dir, 'ssu.blastn.tsv')
blast.blastn(seq_file, ssu_db, blast_file, evalue=evalue_threshold, max_matches=1, output_fmt='custom')
# read taxonomy file
taxonomy = Taxonomy().read(ssu_taxonomy_file)
# write out classification file
classification_file = os.path.join(genome_dir, 'ssu.taxonomy.tsv')
fout = open(classification_file, 'w')
fout.write('query_id\tssu_taxonomy\tssu_length\tssu_blast_subject_id\tssu_blast_evalue\tssu_blast_bitscore\tssu_blast_align_len\tssu_blast_perc_identity\n')
processed_query_ids = set()
for line in open(blast_file):
line_split = [x.strip() for x in line.split('\t')]
query_id = line_split[0]
if query_id in processed_query_ids:
# A query may have multiple hits to different sections
# of a gene. Blast results are organized by e-value so
# only the first hit is considered. The subject gene
# is the same in all cases so the taxonomy string will
fout.write(rank + ': taxon')
fout.write('\t' + rank + ': % scaffolds')
fout.write('\t' + rank + ': % genes')
fout.write('\t' + rank + ': % coding bps')
fout.write('\t' + rank + ': avg. e-value')
fout.write('\t' + rank + ': avg. % identity')
fout.write('\t' + rank + ': avg. align. length (aa)')
fout.write('\n')
# sort profiles in descending order of abundance
profile, stats = self.profile()
sorted_profiles = {}
max_taxa = 0
for r in range(0, len(Taxonomy.rank_labels)):
sorted_profile = sorted(profile[r].items(), key=operator.itemgetter(1))
sorted_profile.reverse()
sorted_profiles[r] = sorted_profile
if len(sorted_profiles) > max_taxa:
max_taxa = len(sorted_profiles)
# write out table
total_seqs = len(self.genes_in_scaffold)
total_genes = sum(self.genes_in_scaffold.values())
total_coding_bases = sum(self.coding_bases.values())
for i in range(0, max_taxa):
for r in range(0, len(Taxonomy.rank_labels)):
if r != 0:
Scaffold are classified using a majority vote
over all genes with a valid hit. If less than 20% of
genes have a valid hit, the scaffold is considered unclassified.
Classification is performed from the highest (domain)
to lowest (species) rank. If a rank is taxonomically
inconsistent with a higher ranks classification, this
rank and all lower ranks are set to unclassified.
Returns
-------
dict : d[scaffold_id][rank] -> [taxon, HitInfo]
Classification of each scaffold along with summary statistics
of hits to the specified taxon.
"""
expected_parent = Taxonomy().taxonomic_consistency(self.taxonomy)
# classify each scaffold using a majority vote
seq_assignments = defaultdict(lambda: defaultdict(list))
for seq_id, rank_hits in self.hits.items():
parent_taxa = None
for rank in range(0, len(Taxonomy.rank_prefixes)):
taxa = max(rank_hits[rank], key=lambda x: len(rank_hits[rank][x]))
count = len(rank_hits[rank][taxa])
if (taxa != Taxonomy.rank_prefixes[rank]
and (count >= self.percent_to_classify * self.genes_in_scaffold[seq_id])
and (rank == 0 or expected_parent[taxa] == parent_taxa)):
seq_assignments[seq_id][rank] = [taxa, rank_hits[rank][taxa]]
parent_taxa = taxa
else:
# set to unclassified at all lower ranks
over all fragments originating from the sequence
with a valid hit. If less than 20% of fragments have
a valid hit, the sequence is considered unclassified.
Classification is performed from the highest (domain)
to lowest (species) rank. If a rank is taxonomically
inconsistent with a higher ranks classification, this
rank and all lower ranks are set to unclassified.
Returns
-------
dict : d[contig_id][rank] -> [taxa, HitInfo]
Classification of each sequence along with summary statistics
of hits to the specified taxa.
"""
expected_parent = Taxonomy().taxonomic_consistency(self.taxonomy)
# classify each sequence using a majority vote
seq_assignments = defaultdict(lambda: defaultdict(list))
for seq_id, rank_hits in self.hits.iteritems():
parent_taxa = None
for rank in xrange(0, len(self.rank_prefixes)):
taxa = max(rank_hits[rank], key=lambda x: len(rank_hits[rank][x]))
count = len(rank_hits[rank][taxa])
if count >= self.percent_to_classify * self.fragments_from_seq[seq_id]:
if rank == 0 or expected_parent[taxa] == parent_taxa:
seq_assignments[seq_id][rank] = [taxa, rank_hits[rank][taxa]]
parent_taxa = taxa
else:
# set to unclassified at all lower ranks
for r in xrange(rank, len(self.rank_prefixes)):
def root(self, options):
"""Root tree using outgroup."""
check_file_exists(options.input_tree)
gtdb_taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)
self.logger.info('Identifying genomes from the specified outgroup.')
outgroup = set()
for genome_id, taxa in gtdb_taxonomy.iteritems():
if options.outgroup_taxon in taxa:
outgroup.add(genome_id)
reroot = RerootTree()
reroot.root_with_outgroup(options.input_tree,
options.output_tree,
outgroup)
self.logger.info('Done.')
def write_genome_summary(self, output_file):
"""Summarize classification of each genome.
Parameters
----------
output_file : str
Output file.
"""
fout = open(output_file, 'w')
fout.write('Genome id\t# scaffolds\t# genes\tCoding bases')
for rank in Taxonomy.rank_labels:
fout.write('\t' + rank + ': taxon')
fout.write('\t' + rank + ': % of scaffolds')
fout.write('\t' + rank + ': % of genes')
fout.write('\t' + rank + ': % of coding bases')
fout.write('\t' + rank + ': avg. e-value')
fout.write('\t' + rank + ': avg. % identity')
fout.write('\t' + rank + ': avg. align. length (aa)')
fout.write('\n')
sorted_genome_ids = alphanumeric_sort(self.profiles.keys())
for genome_id in sorted_genome_ids:
self.profiles[genome_id].write_genome_summary(fout)
fout.close()
def __init__(self, genome_id, percent_to_classify, taxonomy):
"""Initialization.
Parameters
----------
genome_id : str
Unique identify of genome.
percent_to_classify : float
Minimum percentage of genes to assign scaffold to a taxon [0, 100].
taxonomy : d[ref_genome_id] -> [domain, phylum, ..., species]
Taxonomic assignment of each reference genome.
"""
self.percent_to_classify = percent_to_classify / 100.0
self.unclassified = Taxonomy.unclassified_rank
self.genome_id = genome_id
self.taxonomy = taxonomy
self.TaxaInfo = namedtuple('TaxaInfo', """evalue
perc_identity
aln_length
num_seqs
num_genes
num_basepairs""")
# track hits at each rank: dict[scaffold_id][rank][taxa] -> [HitInfo, ...]
self.HitInfo = namedtuple('HitInfo', """subject_genome_id
subject_gene_id
evalue
perc_identity
def __init__(self):
"""Initialize."""
check_dependencies(['comparem', 'diamond', 'makeblastdb'])
self.underclassified = 'underclassified'
self.rank_prefixes = Taxonomy.rank_prefixes
self.rank_index = Taxonomy.rank_index
self.rank_labels = Taxonomy.rank_labels
self.time_keeper = TimeKeeper()