Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
Node with highest F-measure for each taxon.
taxonomy : d[unique_id] -> [d__; ...; s__]
Taxonomic information for taxa in tree of interest.
out_table : str
Output table to write statistics for assigned labels.
"""
# get extent taxa
extant_taxa = Taxonomy().extant_taxa(taxonomy)
fout_table = open(out_table, 'w')
fout_table.write('Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
fout_table.write('\tRogue out\tRogue in\n')
for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
if len(fmeasure_for_taxa[taxon]) != 1:
self.logger.error('Multiple positions specified for taxon label.')
sys.exit()
num_genomes = len(extant_taxa[taxon])
stat_table = fmeasure_for_taxa[taxon][0]
fout_table.write('%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' % (
taxon,
num_genomes,
stat_table.fmeasure,
stat_table.precision,
stat_table.recall,
stat_table.taxa_in_lineage,
stat_table.num_leaves_with_taxa,
','.join(stat_table.rogue_out),
num_reassigned = 0
for gid, taxa in gtdbtk_taxonomy.items():
if gid in taxonomy:
num_reassigned += 1
taxonomy[gid] = taxa
self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')
if options.custom_taxonomy_file:
# add and overwrite taxonomy for genomes specified in the
# custom taxonomy file
check_file_exists(options.custom_taxonomy_file)
self.logger.info('Reading custom taxonomy file.')
custom_taxonomy = Taxonomy().read(options.custom_taxonomy_file)
num_reassigned = 0
for gid, taxa in custom_taxonomy.items():
if gid in taxonomy:
num_reassigned += 1
taxonomy[gid] = taxa
self.logger.info(f'Read custom taxonomy for {len(custom_taxonomy):,} genomes.')
self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')
if options.gtdbtk_classification_file and options.custom_taxonomy_file:
dup_genomes = set(gtdbtk_taxonomy).intersection(custom_taxonomy)
if len(dup_genomes) > 0:
self.logger.error('GTDB-Tk classification and custom taxonomy '
'files must not specify taxonomies for the '
'same genomes.')
self.logger.error('These files have {:,} genomes in common.'.format(len(dup_genomes)))
taxon_children = Taxonomy().taxon_children(taxonomy)
# get all named groups
taxa_for_dist_inference = set()
for taxon_id, taxa in taxonomy.items():
for taxon in taxa:
taxa_for_dist_inference.add(taxon)
# sanity check species names as these are a common problem
species = set()
for taxon_id, taxa in taxonomy.items():
if len(taxa) > Taxonomy.rank_index['s__']:
species_name = taxa[Taxonomy.rank_index['s__']]
valid, error_msg = True, None
if species_name != 's__':
valid, error_msg = Taxonomy().validate_species_name(
species_name, require_full=True, require_prefix=True)
if not valid:
print('[Warning] Species name {} for {} is invalid: {}'.format(
species_name, taxon_id, error_msg))
continue
species.add(species_name)
# restrict taxa to those with a sufficient number of named children
# Note: a taxonomic group with no children will not end up in the
# taxon_children data structure so care must be taken when applying
# this filtering criteria.
if min_children > 0:
valid_taxa = set()
for taxon, children_taxa in taxon_children.items():
if len(children_taxa) >= min_children:
def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
"""Read and merge taxonomy files."""
self.logger.info('Reading GTDB taxonomy for representative genomes.')
taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)
if options.gtdbtk_classification_file:
# add and overwrite taxonomy for genomes specified in the
# GTDB-Tk classification file
check_file_exists(options.gtdbtk_classification_file)
self.logger.info('Reading GTDB-Tk classification file.')
gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
del gtdbtk_taxonomy['user_genome']
num_reassigned = 0
for gid, taxa in gtdbtk_taxonomy.items():
if gid in taxonomy:
num_reassigned += 1
taxonomy[gid] = taxa
self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')
if options.custom_taxonomy_file:
# add and overwrite taxonomy for genomes specified in the
# custom taxonomy file
check_file_exists(options.custom_taxonomy_file)
self.logger.info('Reading custom taxonomy file.')
Parameters
----------
tree : Tree
Dendropy Tree.
taxonomy : d[extent_taxon_id] -> taxa list
Taxon labels for extant taxa.
Returns
-------
d[taxon] -> [(Node, F-measure, precision, recall_, ...]
Node(s) with highest F-measure for each taxon.
"""
# get named lineages/taxa at each taxonomic rank
taxa_at_rank = Taxonomy().named_lineages_at_rank(taxonomy)
# get extant taxa for each taxon label
extent_taxa_with_label = {}
for i, rank in enumerate(Taxonomy.rank_labels):
extent_taxa_with_label[i] = Taxonomy().extant_taxa_for_rank(rank, taxonomy)
# get parent taxon for each taxon:
taxon_parents = Taxonomy().parents(taxonomy)
# get number of leaves and taxon in each lineage
self.logger.info('Calculating taxa within each lineage.')
for node in tree.preorder_node_iter():
num_leaves = 0
taxa_count = defaultdict(lambda: defaultdict(int))
for leaf in node.leaf_iter():
num_leaves += 1
"""
# determine children taxa for each named group
taxon_children = Taxonomy().taxon_children(taxonomy)
# get all named groups
taxa_for_dist_inference = set()
for taxon_id, taxa in taxonomy.items():
for taxon in taxa:
taxa_for_dist_inference.add(taxon)
# sanity check species names as these are a common problem
species = set()
for taxon_id, taxa in taxonomy.items():
if len(taxa) > Taxonomy.rank_index['s__']:
species_name = taxa[Taxonomy.rank_index['s__']]
valid, error_msg = True, None
if species_name != 's__':
valid, error_msg = Taxonomy().validate_species_name(
species_name, require_full=True, require_prefix=True)
if not valid:
print('[Warning] Species name {} for {} is invalid: {}'.format(
species_name, taxon_id, error_msg))
continue
species.add(species_name)
# restrict taxa to those with a sufficient number of named children
# Note: a taxonomic group with no children will not end up in the
# taxon_children data structure so care must be taken when applying
# this filtering criteria.
if min_children > 0: