Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_add_ncbi_prefix(self):
    # (input name, expected result): 'GCF_' names gain 'RS_', 'GCA_' names
    # gain 'GB_', and any other name is expected back unchanged.
    cases = (
        ('GCF_123.1', 'RS_GCF_123.1'),
        ('GCA_456.1', 'GB_GCA_456.1'),
        ('genome_1', 'genome_1'),
    )
    for name, expected in cases:
        self.assertEqual(tools.add_ncbi_prefix(name), expected)
def test_add_ncbi_prefix(self):
    # Maps each input name to the value add_ncbi_prefix must return for it;
    # the last entry checks that an unrecognised name is left as-is.
    expected_by_name = {
        'GCF_123.1': 'RS_GCF_123.1',
        'GCA_456.1': 'GB_GCA_456.1',
        'genome_1': 'genome_1',
    }
    for name, expected in expected_by_name.items():
        self.assertEqual(tools.add_ncbi_prefix(name), expected)
def test_add_ncbi_prefix(self):
    gcf_name, gca_name, plain_name = 'GCF_123.1', 'GCA_456.1', 'genome_1'

    # A 'GCF_' name is expected to come back with an 'RS_' prefix.
    self.assertEqual(tools.add_ncbi_prefix(gcf_name), 'RS_GCF_123.1')
    # A 'GCA_' name is expected to come back with a 'GB_' prefix.
    self.assertEqual(tools.add_ncbi_prefix(gca_name), 'GB_GCA_456.1')
    # Any other name is expected to be returned unchanged.
    self.assertEqual(tools.add_ncbi_prefix(plain_name), plain_name)
def setUp(self):
    """Create the objects, paths and option values used by the tests.

    NOTE(review): ``self.options`` is populated below but is not assigned
    anywhere in this method; presumably it is created elsewhere (class
    attribute or base class) -- confirm, otherwise the first option
    assignment raises AttributeError.
    """
    self.classify = Classify()
    self.out_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
    self.prefix = 'gtdbtk'

    # Reference data used as test input (relative paths).
    self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
    self.aln_dir_ref = 'tests/data/align_dir_reference/align'
    self.user_msa_file = os.path.join(self.aln_dir_ref, 'gtdbtk.ar122.user_msa.fasta')

    # GTDB taxonomy as configured for the installation under test.
    self.taxonomy_file = Config.TAXONOMY_FILE
    self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

    # Option values, applied in the listed order.
    option_values = (
        ('min_perc_aa', 50),
        ('rnd_seed', 42),
        # classify options
        ('scratch_dir', None),
        ('keep_ref_red', None),
        ('pplacer_cpus', None),
        # infer options
        ('prot_model', 'WAG'),
        ('no_support', False),
        ('no_gamma', True),
    )
    for option_name, option_value in option_values:
        setattr(self.options, option_name, option_value)

    self.version = ' unittest'
    self.optionparser = OptionsParser(self.version)
    logger_setup(None, "gtdbtk.log", "GTDB-Tk", self.version, True)

    # Generic output goes to a fresh temporary directory.
    self.generic_out_path = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
def _get_ingroup_domain(self, ingroup_taxon) -> str:
    """Look up the domain of the GTDB lineage containing the ingroup taxon.

    Parameters
    ----------
    ingroup_taxon : str
        Taxon label searched for in every lineage of the GTDB taxonomy.

    Returns
    -------
    str
        The entry at ``Taxonomy.DOMAIN_IDX`` of the matching lineage.

    Raises
    ------
    GTDBTkExit
        If no lineage in the GTDB taxonomy contains the taxon.
    """
    # The taxonomy file is read on every call.
    lineages = Taxonomy().read(TAXONOMY_FILE).values()
    matching_domains = [lineage[Taxonomy.DOMAIN_IDX]
                        for lineage in lineages
                        if ingroup_taxon in lineage]

    # When several lineages contain the taxon, the last one read decides.
    domain = matching_domains[-1] if matching_domains else None
    if domain is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                         f'the GTDB taxonomy.')
    return domain
fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
Node with highest F-measure for each taxon.
taxonomy : d[unique_id] -> [d__; ...; s__]
Taxonomic information for taxa in tree of interest.
out_table : str
Output table to write statistics for assigned labels.
"""
# get extant taxa
extant_taxa = Taxonomy().extant_taxa(taxonomy)
fout_table = open(out_table, 'w')
fout_table.write('Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
fout_table.write('\tRogue out\tRogue in\n')
for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
if len(fmeasure_for_taxa[taxon]) != 1:
self.logger.error('Multiple positions specified for taxon label.')
sys.exit()
num_genomes = len(extant_taxa[taxon])
stat_table = fmeasure_for_taxa[taxon][0]
fout_table.write('%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' % (
taxon,
num_genomes,
stat_table.fmeasure,
stat_table.precision,
stat_table.recall,
stat_table.taxa_in_lineage,
stat_table.num_leaves_with_taxa,
','.join(stat_table.rogue_out),
def _get_fastani_genome_path(self, fastani_verification, genomes):
"""Generates a queue of comparisons to be made and the paths to
the corresponding genome id."""
dict_compare, dict_paths = dict(), dict()
for qry_node, qry_dict in fastani_verification.items():
user_label = qry_node.taxon.label
dict_paths[user_label] = genomes[user_label]
dict_compare[user_label] = set()
for node in qry_dict.get('potential_g'):
leafnode = node[0]
shortleaf = leafnode.taxon.label
if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
shortleaf = leafnode.taxon.label[3:]
ref_path = os.path.join(
Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
if not os.path.isfile(ref_path):
raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')
dict_compare[user_label].add(shortleaf)
dict_paths[shortleaf] = ref_path
return dict_compare, dict_paths
def parser_marker_summary_file(self, marker_summary_file, marker_set_id):
    """Collect rows whose multi-hit percentage reaches the configured threshold.

    Parameters
    ----------
    marker_summary_file : str
        Path to a tab-separated file. The first line is skipped as a
        header; for each remaining row the first column is used as the key
        and the third column is parsed as a number (presumably the count
        of markers with multiple hits -- confirm against the writer).
    marker_set_id : str
        Either "bac120" or "ar122"; selects the marker count used as the
        denominator of the percentage.

    Returns
    -------
    dict
        d[first column] -> percentage rounded to one decimal, only for rows
        at or above ``Config.DEFAULT_MULTIHIT_THRESHOLD``.

    Raises
    ------
    ValueError
        If ``marker_set_id`` is not a supported marker set. Previously this
        surfaced as an UnboundLocalError on the first data row.
    """
    # The denominator depends only on the marker set, so resolve it once
    # and reject unsupported identifiers before touching the file.
    if marker_set_id == "bac120":
        marker_count = Config.BAC_MARKER_COUNT
    elif marker_set_id == "ar122":
        marker_count = Config.AR_MARKER_COUNT
    else:
        raise ValueError(f'Unsupported marker set: {marker_set_id!r}')

    results = {}
    with open(marker_summary_file, 'r') as msf:
        msf.readline()  # skip the header line
        for line in msf:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            infos = stripped.split('\t')
            multi_hits_percent = (100 * float(infos[2])) / marker_count
            if multi_hits_percent >= Config.DEFAULT_MULTIHIT_THRESHOLD:
                results[infos[0]] = round(multi_hits_percent, 1)
    return results
the corresponding genome id."""
dict_compare, dict_paths = dict(), dict()
for qry_node, qry_dict in fastani_verification.items():
user_label = qry_node.taxon.label
dict_paths[user_label] = genomes[user_label]
dict_compare[user_label] = set()
for node in qry_dict.get('potential_g'):
leafnode = node[0]
shortleaf = leafnode.taxon.label
if leafnode.taxon.label.startswith('GB_') or leafnode.taxon.label.startswith('RS_'):
shortleaf = leafnode.taxon.label[3:]
ref_path = os.path.join(
Config.FASTANI_GENOMES, shortleaf + Config.FASTANI_GENOMES_EXT)
if not os.path.isfile(ref_path):
raise GTDBTkExit(f'Reference genome missing from FastANI database: {ref_path}')
dict_compare[user_label].add(shortleaf)
dict_paths[shortleaf] = ref_path
return dict_compare, dict_paths