Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Parameters
----------
scaffold_file : str
Fasta file containing scaffolds to add.
genome_file : str
Fasta file of binned scaffolds.
compatible_file : str
File specifying compatible scaffolds.
min_len : int
Minimum length to add scaffold.
out_genome : str
Name of output genome.
"""
cur_bin_id = remove_extension(genome_file)
# determine statistics for each potentially compatible scaffold
scaffold_ids = set()
with open(compatible_file) as f:
headers = [x.strip() for x in f.readline().split('\t')]
scaffold_gc_index = headers.index('Scaffold GC')
genome_gc_index = headers.index('Median genome GC')
td_dist_index = headers.index('Scaffold TD')
scaffold_cov_index = headers.index('Scaffold coverage')
genome_cov_index = headers.index('Median genome coverage')
for line in f:
line_split = line.split('\t')
scaffold_id = line_split[0]
bin_id = line_split[1].strip()
t = Taxonomy()
taxonomy = t.read(taxonomy_file)
if not t.validate(taxonomy,
check_prefixes=True,
check_ranks=True,
check_hierarchy=False,
check_species=False,
check_group_names=False,
check_duplicate_names=False,
report_errors=True):
self.logger.error('Invalid taxonomy file.')
sys.exit(-1)
# record length and number of genes in each scaffold
for aa_file in gene_files:
genome_id = remove_extension(aa_file)
self.profiles[genome_id] = Profile(genome_id, percent_to_classify, taxonomy)
for seq_id, seq in seq_io.read_seq(aa_file):
scaffold_id = seq_id[0:seq_id.rfind('_')]
self.profiles[genome_id].genes_in_scaffold[scaffold_id] += 1
self.profiles[genome_id].coding_bases[scaffold_id] += len(seq) * 3 # length in nucleotide space
# run diamond and create taxonomic profile for each genome
self.logger.info('Running DIAMOND blastp with {:,} processes (be patient!)'.format(self.cpus))
diamond = Diamond(self.cpus)
diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
if not os.path.exists(diamond_table_out):
diamond.blastp(gene_file,
db_file,
evalue,
E-value threshold for defining valid hits.
concatenate_threshold : int
Concatenate hits within the specified number of base pairs.
output_dir : str
Output directory.
Returns
-------
dict : d[genome_id][seq_id] -> information about best hit
Information about best hits for each genome.
"""
self.logger.info('Identifying SSU rRNA genes.')
best_hits = {}
for genome_file in genome_files:
genome_id = remove_extension(genome_file)
genome_dir = os.path.join(output_dir, genome_id)
make_sure_path_exists(genome_dir)
# identify 16S reads from contigs/scaffolds
self._hmm_search(genome_file, evalue_threshold, genome_dir)
# read HMM hits
hits_per_domain = {}
for domain in ['archaea', 'bacteria', 'euk']:
seq_info = self._read_hits(os.path.join(genome_dir, 'ssu' + '.hmm_' + domain + '.txt'), domain, evalue_threshold)
hits = {}
if len(seq_info) > 0:
for seq_id, seq_hits in seq_info.items():
for hit in seq_hits:
self._add_hit(hits, seq_id, hit, concatenate_threshold)
def manual(self, options):
"""Manual command"""
check_file_exists(options.cluster_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
genome_id = remove_extension(options.genome_file)
seqs = seq_io.read(options.genome_file)
fout = {}
with open(options.cluster_file) as f:
f.readline()
for line in f:
line_split = line.rstrip().split('\t')
scaffold_id = line_split[0]
cluster_id = int(line_split[1])
if cluster_id < 0:
# negative values indicate scaffolds that should
# not be placed in a cluster
continue
def filter_bins(self, options):
"""Filter bins command"""
make_sure_path_exists(options.output_dir)
genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
if not self._check_nuclotide_seqs(genome_files):
self.logger.warning('All files must contain nucleotide sequences.')
sys.exit()
outliers = Outliers()
for genome_file in genome_files:
gf = remove_extension(genome_file) + '.filtered.' + options.genome_ext
out_genome = os.path.join(options.output_dir, gf)
outliers.remove_outliers(genome_file, options.filter_file, out_genome, options.modified_only)
self.logger.info('Modified genome written to: ' + options.output_dir)
for seq_id, classification in seq_assignments.items():
taxa = []
for r in range(0, len(Taxonomy.rank_labels)):
taxa.append(classification[r][0])
krona_profiles[genome_id][';'.join(taxa)] += profile.genes_in_scaffold[seq_id]
krona = Krona()
krona_output_file = os.path.join(self.output_dir, 'gene_profiles.scaffolds.html')
krona.create(krona_profiles, krona_output_file)
# create Krona plot based on best hit of each gene
krona_profiles = defaultdict(lambda: defaultdict(int))
for aa_file in gene_files:
genome_id = remove_extension(aa_file)
profile = self.profiles[genome_id]
for gene_id, _seq in seq_io.read_seq(aa_file):
taxa_str = Taxonomy.unclassified_taxon
if gene_id in profile.gene_hits:
taxa_str, _hit_info = profile.gene_hits[gene_id]
krona_profiles[genome_id][taxa_str] += 1
krona_output_file = os.path.join(self.output_dir, 'gene_profiles.genes.html')
krona.create(krona_profiles, krona_output_file)
~
Parameters
----------
gene_dir : str
Directory with fasta files containing protein sequences.
output_dir : float
Directory to contain modified fasta files.
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for f in os.listdir(gene_dir):
gf = os.path.join(gene_dir, f)
genome_id = remove_extension(gf)
aa_file = os.path.join(output_dir, genome_id + '.faa')
fout = open(aa_file, 'w')
for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
if seq[-1] == '*':
seq = seq[0:-1]
fout.write(seq + '\n')
fout.close()
def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
"""Prefix all gene IDs with genome IDs: ~.
Parameters
----------
gene_files : list of str
Genes in fasta files to modify.
keep_headers : boolean
If True, indicates FASTA headers already have the format ~.
output_file : str
Name of FASTA file to contain modified genes.
"""
fout = open(output_file, 'w')
for gf in gene_files:
genome_id = remove_extension(gf)
if genome_id.endswith('_genes'):
genome_id = genome_id[0:genome_id.rfind('_genes')]
for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
if keep_headers:
fout.write('>' + seq_id + ' ' + annotation + '\n')
else:
fout.write('>' + genome_id + '~' + seq_id + ' ' + annotation + '\n')
fout.write(seq + '\n')
fout.close()
def _producer_blast(self, genome_pair):
"""Apply reciprocal blast to a pair of genomes.
Parameters
----------
genome_pair : list
Identifier of genomes to process.
"""
blast = Blast(cpus=self.producer_cpus)
aa_gene_fileA, aa_gene_fileB = genome_pair
genome_idA = remove_extension(aa_gene_fileA)
genome_idB = remove_extension(aa_gene_fileB)
dbA = os.path.join(self.output_dir, genome_idA + '.db')
dbB = os.path.join(self.output_dir, genome_idB + '.db')
output_fileAB = os.path.join(self.output_dir, genome_idA + '-' + genome_idB + '.blastp.tsv')
blast.blastp(aa_gene_fileA, dbB, output_fileAB, self.evalue)
output_fileBA = os.path.join(self.output_dir, genome_idB + '-' + genome_idA + '.blastp.tsv')
blast.blastp(aa_gene_fileB, dbA, output_fileBA, self.evalue)
return True