Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
diamond_output_dir = os.path.join(self.output_dir, 'diamond')
make_sure_path_exists(diamond_output_dir)
fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
fragment_out = open(fragment_file, 'w')
contig_id_to_genome_id = {}
for genome_file in genome_files:
genome_id = remove_extension(genome_file)
self.profiles[genome_id] = Profile(genome_id, taxonomy)
self._fragment_genomes(genome_file,
window_size,
step_size,
self.profiles[genome_id],
fragment_out)
for seq_id, _seq in seq_io.read_seq(genome_file):
contig_id_to_genome_id[seq_id] = genome_id
# run diamond
self.logger.info('')
self.logger.info(' Running diamond blastx with %d processes (be patient!)' % self.cpus)
diamond = Diamond(self.cpus)
diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)
diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
diamond.view(diamond_daa_out + '.daa', diamond_table_out)
self.logger.info('')
self.logger.info(' Creating taxonomic profile for each genome.')
self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)
Returns
-------
dict : d[seq_id] -> seq
Dictionary of unbinned sequences.
"""
check_file_exists(scaffold_file)
# get list of sequences in bins
self.logger.info('Reading binned scaffolds.')
binned_seq_ids = set()
total_binned_bases = 0
for genome_file in genome_files:
for seq_id, seq in seq_io.read_seq(genome_file):
binned_seq_ids.add(seq_id)
total_binned_bases += len(seq)
self.logger.info('Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))
# write all unbinned sequences
self.logger.info('Identifying unbinned scaffolds >= %d bp.' % min_seq_len)
unbinned_bases = 0
unbinned_seqs = {}
for seq_id, seq in seq_io.read_seq(scaffold_file):
if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
unbinned_seqs[seq_id] = seq
unbinned_bases += len(seq)
self.logger.info('Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))
Second set of genome files in fasta format.
seq_file : str
Scaffolds/contigs binned to create genomes.
output_file : str
Desire file to write results.
"""
# determine total number of sequences
self.logger.info('Reading sequences.')
seq_lens = {}
total_bases = 0
num_seqs_over_length = defaultdict(int)
total_bases_over_length = defaultdict(int)
lengths_to_check = [1000, 5000, 10000, 20000, 50000]
for seq_id, seq in seq_io.read_seq(seq_file):
seq_len = len(seq)
seq_lens[seq_id] = seq_len
total_bases += seq_len
for length in lengths_to_check:
if seq_len >= length:
num_seqs_over_length[length] += 1
total_bases_over_length[length] += seq_len
# determine sequences in each bin
genome_seqs1 = self._genome_seqs(genome_files1)
genome_seqs2 = self._genome_seqs(genome_files2)
# determine bin stats
genome_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = self._genome_stats(genome_seqs1, seq_lens)
genome_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = self._genome_stats(genome_seqs2, seq_lens)