def setUp(self):
    self.classify = Classify()
    self.out_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
    self.prefix = 'gtdbtk'
    self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
    self.aln_dir_ref = 'tests/data/align_dir_reference/align'
    self.user_msa_file = os.path.join(self.aln_dir_ref, 'gtdbtk.ar122.user_msa.fasta')
    self.taxonomy_file = Config.TAXONOMY_FILE
    self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
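setUp creates a temporary output directory with tempfile.mkdtemp; a minimal matching tearDown sketch (not shown in the excerpt, assuming the standard-library shutil module) would remove it:

import shutil

def tearDown(self):
    # Remove the temporary output directory created in setUp.
    shutil.rmtree(self.out_dir, ignore_errors=True)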
def _get_ingroup_domain(self, ingroup_taxon) -> str:
    """Get the domain of the ingroup taxon."""

    # read the GTDB taxonomy to establish the domain of the ingroup taxon
    gtdb_taxonomy = Taxonomy().read(TAXONOMY_FILE)
    ingroup_domain = None
    for taxa in gtdb_taxonomy.values():
        if ingroup_taxon in taxa:
            ingroup_domain = taxa[Taxonomy.DOMAIN_IDX]
            break

    if ingroup_domain is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                         f'the GTDB taxonomy.')

    return ingroup_domain
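The lookup above assumes Taxonomy().read returns a dict mapping genome IDs to a seven-rank list with the domain first (Taxonomy.DOMAIN_IDX); an illustration with made-up entries:

# Illustrative only: the genome ID and lineage are examples, not read from the data files.
gtdb_taxonomy = {
    'RS_GCF_000005845.2': ['d__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria',
                           'o__Enterobacterales', 'f__Enterobacteriaceae',
                           'g__Escherichia', 's__Escherichia coli'],
}
ingroup_taxon = 'g__Escherichia'
ingroup_domain = next(taxa[0] for taxa in gtdb_taxonomy.values() if ingroup_taxon in taxa)
assert ingroup_domain == 'd__Bacteria'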
def get_reference_ids():
    results = []
    with open(Config.TAXONOMY_FILE) as tf:
        for line in tf:
            raw_id = line.split('\t')[0]
            results.append(raw_id)
            if raw_id[0:4] in ['GCF_', 'GCA_']:
                results.append(add_ncbi_prefix(raw_id))
            elif raw_id[0:3] in ['RS_', 'GB_']:
                results.append(raw_id[3:])
    return results
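get_reference_ids relies on an add_ncbi_prefix helper that is not shown here; a sketch of the behaviour it appears to assume (RefSeq accessions gain an 'RS_' prefix, GenBank accessions a 'GB_' prefix):

def add_ncbi_prefix(raw_id):
    # Sketch of the assumed helper: mirror the GTDB source-prefix convention.
    if raw_id.startswith('GCF_'):
        return 'RS_' + raw_id
    if raw_id.startswith('GCA_'):
        return 'GB_' + raw_id
    return raw_id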
    Returns
    -------
    dendropy.Tree
        Tree decorated with relative evolutionary divergence (RED) values.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    self.logger.info('Reading taxonomy from file.')
    taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    # determine taxa to be used for inferring distribution
    trusted_taxa = None
    taxa_for_dist_inference = self._filter_taxa_for_dist_inference(tree,
                                                                   taxonomy,
                                                                   trusted_taxa,
                                                                   Config.RED_MIN_CHILDREN,
                                                                   Config.RED_MIN_SUPPORT)

    phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                 taxa_for_dist_inference,
                                                                 taxonomy)

    # set edge lengths to median value over all rootings
    tree.seed_node.rel_dist = 0.0
    for n in tree.preorder_node_iter(lambda x: x != tree.seed_node):
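        # The excerpt ends before the loop body. A plausible continuation,
        # assuming rel_node_dists maps id(node) to the RED values observed over
        # all rootings and that numpy's median is available as np_median
        # (a sketch, not necessarily the project's exact code):
        n.rel_dist = np_median(rel_node_dists[id(n)])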
def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
    """Read and merge taxonomy files."""

    self.logger.info('Reading GTDB taxonomy for representative genomes.')
    taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    if options.gtdbtk_classification_file:
        # add and overwrite taxonomy for genomes specified in the
        # GTDB-Tk classification file
        check_file_exists(options.gtdbtk_classification_file)

        self.logger.info('Reading GTDB-Tk classification file.')
        gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
        # drop the header row ('user_genome') picked up from the classification file
        del gtdbtk_taxonomy['user_genome']
        num_reassigned = 0
        for gid, taxa in gtdbtk_taxonomy.items():
            if gid in taxonomy:
                num_reassigned += 1
            taxonomy[gid] = taxa

        self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
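For reference, both files read above use the same two-column layout; a small parse sketch with an illustrative line (the accession and lineage are examples, not taken from the release files):

example_line = ('GB_GCA_000008085.1\t'
                'd__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o__Nanoarchaeales;'
                'f__Nanoarchaeaceae;g__Nanoarchaeum;s__Nanoarchaeum equitans\n')
gid, lineage = example_line.rstrip('\n').split('\t')
taxa = [rank.strip() for rank in lineage.split(';')]
assert len(taxa) == 7  # d__ through s__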
def __init__(self, cpus=1, pplacer_cpus=None):
    """Initialize."""
    check_dependencies(['pplacer', 'guppy', 'fastANI'])

    self.taxonomy_file = Config.TAXONOMY_FILE
    self.af_threshold = Config.AF_THRESHOLD
    self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    self.order_rank = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']

    self.logger = logging.getLogger('timestamp')
    self.cpus = max(cpus, 1)
    self.pplacer_cpus = max(pplacer_cpus if pplacer_cpus else cpus, 1)

    self.species_radius = self.parse_radius_file()
    self.reference_ids = get_reference_ids()

    # rank_of_interest determines the rank in the tree_mapping file used for
    # lower classification
    self.rank_of_interest = 'o__'
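order_rank lists the seven canonical rank prefixes in order; a hypothetical helper sketch showing one common use, padding a partial lineage out to all seven ranks:

def standardise_taxonomy(taxa_str, order_rank=('d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__')):
    # Hypothetical illustration, not the project's implementation.
    present = {t[:3]: t for t in taxa_str.split(';') if t}
    return ';'.join(present.get(prefix, prefix) for prefix in order_rank)

# standardise_taxonomy('d__Bacteria;p__Proteobacteria')
# -> 'd__Bacteria;p__Proteobacteria;c__;o__;f__;g__;s__'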
def check_install(self):
    """Check that all reference files exist.

    Returns
    -------
    bool
        True if the installation is complete, False otherwise.
    """
    ok = True
    ok = ok and self.checkfile(Config.TAXONOMY_FILE, 'Taxonomy')
    ok = ok and self.checkfile(Config.CONCAT_BAC120, 'concat_bac120')
    ok = ok and self.checkfile(Config.CONCAT_AR122, 'concat_ar122')
    ok = ok and self.checkfile(os.path.join(Config.MASK_DIR, Config.MASK_BAC120), 'mask_bac120')
    ok = ok and self.checkfile(os.path.join(Config.MASK_DIR, Config.MASK_AR122), 'mask_ar122')
    ok = ok and self.checkfile(Config.TIGRFAM_HMMS, 'tigrfam_hmms')
    ok = ok and self.checkfile(os.path.join(Config.PFAM_HMM_DIR, 'Pfam-A.hmm'), 'pfam_hmms')
    ok = ok and self.checkfolder(Config.FASTANI_GENOMES, 'fastani_genomes')
    ok = ok and self.checkfolder(os.path.join(Config.PPLACER_DIR, Config.PPLACER_BAC120_REF_PKG), 'pplacer_bac120')
    ok = ok and self.checkfolder(os.path.join(Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG), 'pplacer_ar122')

    if not ok:
        self.logger.error('One or more reference files are malformed.')

    return ok
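checkfile and checkfolder are not shown in this excerpt; a minimal sketch of what a checkfile along these lines might do, assuming os is imported (existence and non-empty check with logging; the real implementation may also verify checksums):

def checkfile(self, file_path, label):
    # Sketch only: report whether a reference file is present and non-empty.
    if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
        self.logger.info(f'Checked {label}: OK')
        return True
    self.logger.error(f'Checked {label}: missing or empty ({file_path})')
    return False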
"""Initialize."""
self.logger = logging.getLogger('timestamp')
self.warnings = logging.getLogger('warnings')
self.cpus = cpus
self.debug = debug
self.marker_gene_dir = None
self.genome_file_suffix = GENOME_FILE_SUFFIX
self.protein_file_suffix = PROTEIN_FILE_SUFFIX
self.nt_gene_file_suffix = NT_GENE_FILE_SUFFIX
self.gff_file_suffix = GFF_FILE_SUFFIX
self.checksum_suffix = CHECKSUM_SUFFIX
self.taxonomy_file = Config.TAXONOMY_FILE
self.pfam_hmm_dir = Config.PFAM_HMM_DIR
self.pfam_suffix = PFAM_SUFFIX
self.pfam_top_hit_suffix = PFAM_TOP_HIT_SUFFIX
self.tigrfam_hmms = Config.TIGRFAM_HMMS
self.tigrfam_suffix = TIGRFAM_SUFFIX
self.tigrfam_top_hit_suffix = TIGRFAM_TOP_HIT_SUFFIX
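A usage sketch (the helper name and layout are hypothetical) showing how the per-genome suffixes set above are typically joined with a genome ID to locate marker-gene outputs:

def marker_paths(self, gid, gene_dir):
    # Hypothetical helper: build expected per-genome file paths from the suffixes.
    return {
        'faa': os.path.join(gene_dir, gid + self.protein_file_suffix),
        'pfam_top_hits': os.path.join(gene_dir, gid + self.pfam_top_hit_suffix),
        'tigrfam_top_hits': os.path.join(gene_dir, gid + self.tigrfam_top_hit_suffix),
    }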
        # Pre-screen queries against the reference genomes using Mash.
        mash = Mash(self.cpus, dir_mash, prefix)
        self.logger.info(f'Using Mash version {mash.version()}')
        mash_results = mash.run(genomes, ref_genomes, max_d, mash_k, mash_v, mash_s)
        for qry_gid, ref_hits in mash_results.items():
            d_compare[qry_gid] = d_compare[qry_gid].union(set(ref_hits.keys()))

    # Compare against all reference genomes.
    else:
        for qry_gid in genomes:
            d_compare[qry_gid] = set(ref_genomes.keys())

    self.logger.info(f'Calculating ANI with FastANI v{FastANI._get_version()}.')
    fastani = FastANI(self.cpus, force_single=True)
    fastani_results = fastani.run(d_compare, d_paths)

    taxonomy = Taxonomy().read(TAXONOMY_FILE, canonical_ids=True)
    ANISummaryFile(out_dir, prefix, fastani_results, taxonomy)
    ANIClosestFile(out_dir,
                   prefix,
                   fastani_results,
                   genomes,
                   min_af,
                   taxonomy)
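A downstream sketch of how the FastANI results might be reduced to a closest reference per query, assuming fastani.run returns a nested dict of the form {query_id: {ref_id: {'ani': float, 'af': float}}} (the field names are assumptions):

closest = {}
for qry_gid, ref_hits in fastani_results.items():
    best = None
    for ref_gid, hit in ref_hits.items():
        if hit['af'] >= min_af and (best is None or hit['ani'] > best[1]['ani']):
            best = (ref_gid, hit)
    closest[qry_gid] = best  # None if no reference passed the alignment-fraction cut-off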