Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def split(self, options):
"""Split command"""
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
cluster = Cluster(1)
cluster.split(scaffold_stats,
options.criteria1,
options.criteria2,
options.genome_file,
options.output_dir)
self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def taxon_profile(self, options):
"""Call genes command"""
make_sure_path_exists(options.output_dir)
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.taxonomy_file)
check_file_exists(options.db_file)
gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
if not self._check_protein_seqs(gene_files):
self.logger.warning('All files must contain amino acid sequences.')
sys.exit()
# build gene profile
taxon_profile = TaxonProfile(options.cpus, options.output_dir)
taxon_profile.run(gene_files,
options.scaffold_stats_file,
options.db_file,
options.taxonomy_file,
options.per_to_classify,
options.evalue,
def manual(self, options):
"""Manual command"""
check_file_exists(options.cluster_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
genome_id = remove_extension(options.genome_file)
seqs = seq_io.read(options.genome_file)
fout = {}
with open(options.cluster_file) as f:
f.readline()
for line in f:
line_split = line.rstrip().split('\t')
scaffold_id = line_split[0]
cluster_id = int(line_split[1])
if cluster_id < 0:
def taxon_profile(self, options):
"""Call genes command"""
make_sure_path_exists(options.output_dir)
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.taxonomy_file)
check_file_exists(options.db_file)
gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
if not self._check_protein_seqs(gene_files):
self.logger.warning('All files must contain amino acid sequences.')
sys.exit()
# build gene profile
taxon_profile = TaxonProfile(options.cpus, options.output_dir)
taxon_profile.run(gene_files,
options.scaffold_stats_file,
options.db_file,
options.taxonomy_file,
options.per_to_classify,
options.evalue,
options.per_identity,
options.per_aln_len,
def split(self, options):
"""Split command"""
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
cluster = Cluster(1)
cluster.split(scaffold_stats,
options.criteria1,
options.criteria2,
options.genome_file,
options.output_dir)
self.logger.info('Partitioned sequences written to: ' + options.output_dir)
keep_rbhs : boolean
Flag indicating if RBH should be written to file.
output_dir : str
Directory to store AAI results.
"""
# read taxonomic identification of each genome
taxonomy = {}
if taxonomy_file:
for line in open(taxonomy_file):
genome_id, taxa_str = line.rstrip().split('\t')
taxonomy[genome_id] = taxa_str
# calculate AAI between query and target genomes
aai_output_dir = os.path.join(output_dir, 'aai')
make_sure_path_exists(aai_output_dir)
aai_calculator = AAICalculator(self.cpus)
aai_output_file, rbh_output_file = aai_calculator.run(query_gene_file,
target_gene_file,
sorted_hit_table,
evalue_threshold,
per_iden_threshold,
per_aln_len_threshold,
keep_rbhs,
aai_output_dir)
# determine matches to each query genomes
aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
with open(aai_results_file) as f:
f.readline()
hits = defaultdict(list)
Concatenate hits within the specified number of base pairs.
output_dir : str
Output directory.
Returns
-------
dict : d[genome_id][seq_id] -> information about best hit
Information about best hits for each genome.
"""
self.logger.info('Identifying SSU rRNA genes.')
best_hits = {}
for genome_file in genome_files:
genome_id = remove_extension(genome_file)
genome_dir = os.path.join(output_dir, genome_id)
make_sure_path_exists(genome_dir)
# identify 16S reads from contigs/scaffolds
self._hmm_search(genome_file, evalue_threshold, genome_dir)
# read HMM hits
hits_per_domain = {}
for domain in ['archaea', 'bacteria', 'euk']:
seq_info = self._read_hits(os.path.join(genome_dir, 'ssu' + '.hmm_' + domain + '.txt'), domain, evalue_threshold)
hits = {}
if len(seq_info) > 0:
for seq_id, seq_hits in seq_info.items():
for hit in seq_hits:
self._add_hit(hits, seq_id, hit, concatenate_threshold)
hits_per_domain[domain] = hits
def modify_bin(self, options):
"""Modify bin command"""
make_sure_path_exists(os.path.dirname(options.output_genome))
options.compatible_file = None #*** Removed with Python 3 port
if not (options.add or options.remove or options.outlier_file or options.compatible_file):
self.logger.warning('No modification to bin requested.\n')
sys.exit()
if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
sys.exit()
if options.outlier_file and options.compatible_file:
self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
sys.exit()
failed_to_add = []
failed_to_remove = []
continue
line_split = line.split('\t')
scaffold_id = line_split[0]
info = line_split[8]
if info != '': # this will be empty for non-protein coding genes
gene_id = info.split(';')[0].replace('ID=', '')
gene_number[scaffold_id] += 1
gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])
# write out gene file with modified identifiers
genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))
fout = open(os.path.join(output_dir, genome_id + '.faa'), 'w')
for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
annotation = annotation[annotation.find(' ') + 1:] # remove additional gene id from annotation
annotation += ' [IMG Gene ID: ' + gene_id + ']' # append IMG gene id for future reference
fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
fout.write(seq + '\n')
fout.close()
Parameters
----------
scaffold_file : str
Fasta file containing scaffolds to add.
genome_file : str
Fasta file of binned scaffolds.
compatible_file : str
File specifying compatible scaffolds.
min_len : int
Minimum length to add scaffold.
out_genome : str
Name of output genome.
"""
cur_bin_id = remove_extension(genome_file)
# determine statistics for each potentially compatible scaffold
scaffold_ids = set()
with open(compatible_file) as f:
headers = [x.strip() for x in f.readline().split('\t')]
scaffold_gc_index = headers.index('Scaffold GC')
genome_gc_index = headers.index('Median genome GC')
td_dist_index = headers.index('Scaffold TD')
scaffold_cov_index = headers.index('Scaffold coverage')
genome_cov_index = headers.index('Median genome coverage')
for line in f:
line_split = line.split('\t')
scaffold_id = line_split[0]
bin_id = line_split[1].strip()