def split(self, options):
"""Split command"""
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
cluster = Cluster(1)
cluster.split(scaffold_stats,
options.criteria1,
options.criteria2,
options.genome_file,
options.output_dir)
self.logger.info('Partitioned sequences written to: ' + options.output_dir)
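# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): check_file_exists()
# and make_sure_path_exists() used above come from a shared utility module.
# The stand-ins below are hypothetical and only show the behaviour the
# calling code appears to rely on (abort on a missing input, create the
# output directory if needed).
import errno
import os
import sys


def check_file_exists_sketch(input_file):
    """Abort with an error message if the given file is missing (sketch)."""
    if not os.path.exists(input_file):
        print('Input file does not exist: %s' % input_file, file=sys.stderr)
        sys.exit(1)


def make_sure_path_exists_sketch(path):
    """Create the directory if it does not already exist (sketch)."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            print('Failed to create directory: %s' % path, file=sys.stderr)
            sys.exit(1)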
keep_rbhs : bool
Flag indicating if reciprocal best hits (RBHs) should be written to file.
output_dir : str
Directory to store AAI results.
"""
# read taxonomic identification of each genome
taxonomy = {}
if taxonomy_file:
for line in open(taxonomy_file):
genome_id, taxa_str = line.rstrip().split('\t')
taxonomy[genome_id] = taxa_str
# calculate AAI between query and target genomes
aai_output_dir = os.path.join(output_dir, 'aai')
make_sure_path_exists(aai_output_dir)
aai_calculator = AAICalculator(self.cpus)
aai_output_file, rbh_output_file = aai_calculator.run(query_gene_file,
target_gene_file,
sorted_hit_table,
evalue_threshold,
per_iden_threshold,
per_aln_len_threshold,
keep_rbhs,
aai_output_dir)
# determine matches to each query genome
aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
with open(aai_results_file) as f:
f.readline()
hits = defaultdict(list)
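# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): one plausible way to
# finish parsing the aai_summary.tsv table opened above. The column layout
# (query genome in the first column) is an assumption made only for this
# example.
import csv
from collections import defaultdict


def read_aai_summary_sketch(aai_results_file):
    """Group AAI summary rows by query genome (sketch; column order assumed)."""
    hits = defaultdict(list)
    with open(aai_results_file) as f:
        f.readline()  # skip the header line, as in the code above
        for row in csv.reader(f, delimiter='\t'):
            if row:
                hits[row[0]].append(row)
    return hits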
Concatenate hits within the specified number of base pairs.
output_dir : str
Output directory.
Returns
-------
dict : d[genome_id][seq_id] -> information about best hit
Information about best hits for each genome.
"""
self.logger.info('Identifying SSU rRNA genes.')
best_hits = {}
for genome_file in genome_files:
genome_id = remove_extension(genome_file)
genome_dir = os.path.join(output_dir, genome_id)
make_sure_path_exists(genome_dir)
# identify 16S reads from contigs/scaffolds
self._hmm_search(genome_file, evalue_threshold, genome_dir)
# read HMM hits
hits_per_domain = {}
for domain in ['archaea', 'bacteria', 'euk']:
seq_info = self._read_hits(os.path.join(genome_dir, 'ssu.hmm_' + domain + '.txt'), domain, evalue_threshold)
hits = {}
if len(seq_info) > 0:
for seq_id, seq_hits in seq_info.items():
for hit in seq_hits:
self._add_hit(hits, seq_id, hit, concatenate_threshold)
hits_per_domain[domain] = hits
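# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): once hits_per_domain
# has been filled as above, a natural follow-up is to keep the domain with
# the strongest signal. Scoring by the number of sequences hit is an
# assumption made only for this example.
def best_domain_sketch(hits_per_domain):
    """Return the domain whose HMM search hit the most sequences, or None (sketch)."""
    best_dom = None
    best_count = 0
    for domain, hits in hits_per_domain.items():
        if len(hits) > best_count:
            best_dom = domain
            best_count = len(hits)
    return best_dom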
def modify_bin(self, options):
"""Modify bin command"""
make_sure_path_exists(os.path.dirname(options.output_genome))
options.compatible_file = None #*** Removed with Python 3 port
if not (options.add or options.remove or options.outlier_file or options.compatible_file):
self.logger.warning('No modification to bin requested.\n')
sys.exit()
if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
sys.exit()
if options.outlier_file and options.compatible_file:
self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
sys.exit()
failed_to_add = []
failed_to_remove = []
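# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how the
# failed_to_add / failed_to_remove bookkeeping set up above might be used.
# Sequence I/O is reduced to plain dicts (seq_id -> sequence) for the sketch;
# the real command reads and writes FASTA files.
def apply_bin_modifications_sketch(bin_seqs, source_seqs, seqs_to_add, seqs_to_remove):
    """Add and remove sequences from a bin, recording ids that could not be processed (sketch)."""
    failed_to_add = []
    failed_to_remove = []

    for seq_id in seqs_to_add:
        if seq_id in source_seqs:
            bin_seqs[seq_id] = source_seqs[seq_id]
        else:
            failed_to_add.append(seq_id)

    for seq_id in seqs_to_remove:
        if seq_id in bin_seqs:
            del bin_seqs[seq_id]
        else:
            failed_to_remove.append(seq_id)

    return bin_seqs, failed_to_add, failed_to_remove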
per_iden_threshold : float
Percent identity threshold used to define a homologous gene.
per_aln_len_threshold : float
Alignment length threshold used to define a homologous gene.
output_dir : str
Directory to store AAI results.
"""
self.blast_dir = blast_dir
self.per_identity_threshold = per_iden_threshold
self.per_aln_len_threshold = per_aln_len_threshold
self.output_dir = output_dir
shared_genes_dir = os.path.join(output_dir, self.shared_genes)
make_sure_path_exists(shared_genes_dir)
self.shared_genes_dir = shared_genes_dir
# calculate length of genes and number of genes in each genome
self.logger.info('Calculating length of genes.')
self.gene_lengths = {}
genes_in_genomes = defaultdict(int)
for seq_id, seq in seq_io.read_fasta_seq(os.path.join(self.blast_dir, self.gene_file)):
if seq[-1] == '*':
self.gene_lengths[seq_id] = len(seq) - 1
else:
self.gene_lengths[seq_id] = len(seq)
genome_id = seq_id[0:seq_id.find('~')]
genes_in_genomes[genome_id] += 1
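# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the loop above relies
# on gene ids of the form '<genome_id>~<gene_id>' and drops a trailing '*'
# (stop codon) before measuring gene length. The helper below just isolates
# that convention so it is easier to see.
def gene_record_info_sketch(seq_id, seq):
    """Return (genome_id, gene_length) for a '<genome>~<gene>' protein record (sketch)."""
    genome_id = seq_id[0:seq_id.find('~')]
    gene_length = len(seq) - 1 if seq.endswith('*') else len(seq)
    return genome_id, gene_length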
# get byte offset of hits from each genome
diamond = Diamond(self.cpus)
diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)
diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
diamond.view(diamond_daa_out + '.daa', diamond_table_out)
self.logger.info('')
self.logger.info(' Creating taxonomic profile for each genome.')
self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)
self.logger.info('')
self.logger.info(' Writing taxonomic profile for each genome.')
report_dir = os.path.join(self.output_dir, 'bin_reports')
make_sure_path_exists(report_dir)
for genome_id, profile in self.profiles.items():
seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
profile.write_seq_summary(seq_summary_out)
genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
profile.write_genome_profile(genome_profile_out)
genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
self._write_genome_summary(genome_summary_out)
# create Krona plot
krona_profiles = defaultdict(lambda: defaultdict(int))
for genome_id, profile in self.profiles.items():
seq_assignments = profile.classify_seqs(taxonomy)
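# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): continuing the Krona
# bookkeeping started above. Treating seq_assignments as a mapping from
# sequence id to a list of taxon names is an assumption made only for this
# example.
from collections import defaultdict


def accumulate_krona_counts_sketch(assignments_per_genome):
    """Count classified sequences per taxonomy string for each genome (sketch)."""
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, seq_assignments in assignments_per_genome.items():
        for seq_id, taxa in seq_assignments.items():
            krona_profiles[genome_id][';'.join(taxa)] += 1
    return krona_profiles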
def infer(self, options):
"""Infer tree from MSA."""
check_file_exists(options.msa_file)
make_sure_path_exists(options.out_dir)
if options.cpus > 1:
check_dependencies(['FastTreeMP'])
else:
check_dependencies(['FastTree'])
self.logger.info('Inferring tree with FastTree using %s+GAMMA.' % options.prot_model)
fasttree = FastTree(multithreaded=(options.cpus > 1))
tree_unrooted_output = os.path.join(options.out_dir, options.prefix + '.unrooted.tree')
tree_log = os.path.join(options.out_dir, options.prefix + '.tree.log')
tree_output_log = os.path.join(options.out_dir, 'fasttree.log')
fasttree.run(options.msa_file,
'prot',
options.prot_model,
tree_unrooted_output,
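# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): roughly what a thin
# FastTree wrapper in the spirit of fasttree.run() boils down to. The flags
# used (-gamma, -log, -wag, -lg) are standard FastTree options, but the real
# wrapper's signature and behaviour may differ.
import subprocess


def run_fasttree_sketch(msa_file, prot_model, output_tree, output_log, multithreaded=True):
    """Infer a protein tree with FastTree, writing the tree to output_tree (sketch)."""
    binary = 'FastTreeMP' if multithreaded else 'FastTree'
    cmd = [binary, '-gamma', '-log', output_log]
    if prot_model.upper() == 'WAG':
        cmd.append('-wag')
    elif prot_model.upper() == 'LG':
        cmd.append('-lg')
    cmd.append(msa_file)  # FastTree's default JTT model is used when no model flag is given
    with open(output_tree, 'w') as out:
        subprocess.check_call(cmd, stdout=out)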
genome_stats = GenomeStats()
genome_stats = genome_stats.run(scaffold_stats)
# identify outliers
outliers = Outliers()
outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
outliers.identify(scaffold_stats, genome_stats,
options.gc_perc, options.td_perc,
options.cov_corr, options.cov_perc,
options.report_type, outlier_file)
self.logger.info('Outlier information written to: ' + outlier_file)
# create outlier plots
if options.create_plots:
plot_dir = os.path.join(options.output_dir, 'plots')
make_sure_path_exists(plot_dir)
outliers.plot(scaffold_stats,
genome_stats,
outliers.gc_dist,
outliers.td_dist,
options,
options.highlight_file,
options.links_file,
options.individual_plots,
plot_dir)
self.logger.info('Outlier plots written to: ' + plot_dir)
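# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the general idea
# behind a percentile-style criterion such as the gc_perc option passed to
# identify() above. The exact statistics used by the tool differ; this only
# shows how scaffolds whose GC deviation falls outside a chosen percentile
# of the genome-wide distribution would be flagged.
import numpy as np


def gc_outliers_sketch(scaffold_gc, genome_gc, gc_deviations, perc=98.0):
    """Return scaffold ids whose |GC - genome GC| exceeds the given percentile (sketch)."""
    cutoff = np.percentile([abs(d) for d in gc_deviations], perc)
    return [seq_id for seq_id, gc in scaffold_gc.items()
            if abs(gc - genome_gc) > cutoff]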
def identify(self, options):
"""Identify marker genes in genomes."""
try:
if options.genome_dir:
check_dir_exists(options.genome_dir)
if options.batchfile:
check_file_exists(options.batchfile)
make_sure_path_exists(options.out_dir)
markers = Markers(options.cpus)
markers.identify(options.genome_dir,
options.batchfile,
options.proteins,
options.out_dir,
options.prefix)
self.logger.info('Done.')
except Exception as e:
self.logger.info('GTDB-Tk has encountered an error.')
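# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source, whose handler is
# truncated above): a common way to finish this kind of CLI exception
# handler is to log the failure and exit with a non-zero status.
import sys


def handle_cli_error_sketch(logger, exception):
    """Report an unexpected error and terminate the command (sketch)."""
    logger.info('GTDB-Tk has encountered an error.')
    logger.error(str(exception))
    sys.exit(1)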