tetra = Tetranucleotide(self.cpus)
signatures = tetra.read(tetra_file)
cov_profiles = None
if coverage_file:
    coverage = Coverage(self.cpus)
    cov_profiles, _ = coverage.read(coverage_file)
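# Inferred from how these are used below (not documented here): signatures is
# presumably a dict of d[scaffold_id] -> tetranucleotide signature, with one
# value per kmer in tetra.canonical_order(), and cov_profiles a nested dict of
# d[scaffold_id][bam_id] -> mean coverage.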
# determine bin assignment for each scaffold
self.logger.info('Determining scaffold statistics.')
scaffold_id_genome_id = {}
for gf in genome_files:
    genome_id = remove_extension(gf)
    for scaffold_id, _seq in seq_io.read_seq(gf):
        scaffold_id_genome_id[scaffold_id] = genome_id
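# The mapping now associates every binned scaffold with its source bin, e.g.
# (hypothetical ids) scaffold_id_genome_id = {'scaffold_10': 'bin_1', 'scaffold_42': 'bin_2'}.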
Parameters
----------
genome_files : iterable
    Genome files in fasta format.

Returns
-------
dict : d[genome_id] -> set(seq_id1, ..., seq_idN)
    Ids of sequences in each genome.
"""
genome_seqs = defaultdict(set)
for genome_file in genome_files:
    genome_id = remove_extension(genome_file)
    for seq_id, _seq in seq_io.read_seq(genome_file):
        genome_seqs[genome_id].add(seq_id)
return genome_seqs
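# Illustrative result (hypothetical file and scaffold ids), assuming
# remove_extension('bin_1.fna') returns 'bin_1':
#   genome_seqs = {'bin_1': {'scaffold_10', 'scaffold_42'},
#                  'bin_2': {'scaffold_7'}}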
Parameters
----------
genome_file : str
    Fasta file with genome sequences to fragment.
window_size : int
    Size of each fragment.
step_size : int
    Number of bases to move after each window.
profile : Profile
    Class for classifying fragments.
fout : stream
    Output stream to store all fragments.
"""
for seq_id, seq in seq_io.read_seq(genome_file):
    fragments = seq_tk.fragment(seq, window_size, step_size)
    for i, frag in enumerate(fragments):
        fout.write('>' + seq_id + '~' + str(i) + '\n')
        fout.write(frag + '\n')

    profile.fragments_from_seq[seq_id] = len(fragments)
    profile.seq_len[seq_id] = len(seq)
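# Fragment indexing sketch (assuming seq_tk.fragment slides a window of
# window_size forward by step_size): a 10,000 bp sequence with
# window_size=5000 and step_size=2500 would yield fragments starting at
# 0, 2500, 5000, ..., written as '>seq_id~0', '>seq_id~1', and so on; handling
# of the trailing partial window is left to seq_tk.fragment.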
taxa.append(classification[r][0])
krona_profiles[genome_id][';'.join(taxa)] += profile.genes_in_scaffold[seq_id]
krona = Krona()
krona_output_file = os.path.join(self.output_dir, 'gene_profiles.scaffolds.html')
krona.create(krona_profiles, krona_output_file)
# create Krona plot based on best hit of each gene
krona_profiles = defaultdict(lambda: defaultdict(int))
for aa_file in gene_files:
    genome_id = remove_extension(aa_file)
    profile = self.profiles[genome_id]

    for gene_id, _seq in seq_io.read_seq(aa_file):
        taxa_str = Taxonomy.unclassified_taxon
        if gene_id in profile.gene_hits:
            taxa_str, _hit_info = profile.gene_hits[gene_id]

        krona_profiles[genome_id][taxa_str] += 1
krona_output_file = os.path.join(self.output_dir, 'gene_profiles.genes.html')
krona.create(krona_profiles, krona_output_file)
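# The two Krona plots differ in weighting: the scaffold-level profile above
# adds profile.genes_in_scaffold[seq_id] for each classified scaffold, whereas
# this gene-level profile counts each gene once using its best hit (or
# Taxonomy.unclassified_taxon when it has no hit).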
# write out scaffold statistics
fout = open(output_file, 'w')
fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')
if cov_profiles:
    first_key = list(cov_profiles.keys())[0]
    bam_ids = sorted(cov_profiles[first_key].keys())
    for bam_id in bam_ids:
        fout.write('\t' + bam_id)

for kmer in tetra.canonical_order():
    fout.write('\t' + kmer)
fout.write('\n')
for scaffold_id, seq in seq_io.read_seq(scaffold_file):
    fout.write(scaffold_id)
    fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
    fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
    fout.write('\t%d' % len(seq))

    if cov_profiles:
        for bam_id in bam_ids:
            fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

    fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
    fout.write('\n')
fout.close()
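# Example row of the resulting tab-separated file (hypothetical values):
#   scaffold_10  bin_1  43.27  35210  12.41  8.77  0.0123 ...
# with one coverage column per BAM file and one column per kmer in
# tetra.canonical_order() (136 columns if reverse complements are collapsed,
# though the exact set is defined by Tetranucleotide).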
if gc < best_gc[0]:  # guard assumed here, mirroring the td and cov checks below
    best_gc = [gc, bin_id]
if td < best_td[0]:
    best_td = [td, bin_id]
if cov < best_cov[0]:
    best_cov = [cov, bin_id]

# check if scaffold is closest to a single bin
if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
    compatible_scaffolds.add(scaffold_id)
self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))
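# best_gc, best_td and best_cov are taken to be [distance, bin_id] pairs
# tracking the closest bin under each metric, so a scaffold is compatible only
# when all three metrics agree on its current bin.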
# add compatible sequences to genome
added_seqs = 0
genome_seqs = seq_io.read(genome_file)
for seq_id, seq in seq_io.read_seq(scaffold_file):
    if seq_id in compatible_scaffolds:
        if len(seq) >= min_len:
            genome_seqs[seq_id] = seq
            added_seqs += 1
self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))
# save modified bin
seq_io.write_fasta(genome_seqs, out_genome)
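# seq_io.read is assumed to return a dict of d[seq_id] -> sequence, so the
# compatible scaffolds are simply merged into the existing bin before it is
# rewritten to out_genome.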
binned_seq_ids = set()
total_binned_bases = 0
for genome_file in genome_files:
    for seq_id, seq in seq_io.read_seq(genome_file):
        binned_seq_ids.add(seq_id)
        total_binned_bases += len(seq)
self.logger.info('Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))
# identify all unbinned sequences meeting the length threshold
self.logger.info('Identifying unbinned scaffolds >= %d bp.' % min_seq_len)
unbinned_bases = 0
unbinned_seqs = {}
for seq_id, seq in seq_io.read_seq(scaffold_file):
    if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
        unbinned_seqs[seq_id] = seq
        unbinned_bases += len(seq)
self.logger.info('Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))
self.logger.info('Percentage of unbinned scaffolds: %.2f%%' % (len(unbinned_seqs) * 100.0 / (len(unbinned_seqs) + len(binned_seq_ids))))
self.logger.info('Percentage of unbinned bases: %.2f%%' % (unbinned_bases * 100.0 / (unbinned_bases + total_binned_bases)))
return unbinned_seqs
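# Worked example of the reported percentages (hypothetical counts): with 500
# unbinned and 1,500 binned scaffolds, the scaffold percentage is
# 500 * 100.0 / (500 + 1500) = 25.00%; the base percentage is computed the
# same way from unbinned_bases and total_binned_bases.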
self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
self.diamond_dir = os.path.join(self.output_dir, 'diamond')
make_sure_path_exists(self.diamond_dir)
hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)
self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)
# get list of genes with a top hit to the reference genomes of interest
hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)
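# _top_hits_to_reference presumably keeps only genes whose best DIAMOND hit is
# to the reference genomes of interest rather than to the competing reference
# set; only those hits are summarized per scaffold below.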
# get number of genes on each scaffold
num_genes_on_scaffold = defaultdict(int)
for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
    scaffold_id = seq_id[0:seq_id.rfind('_')]
    num_genes_on_scaffold[scaffold_id] += 1
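# Gene ids are assumed to follow the Prodigal-style convention
# '<scaffold_id>_<gene number>', so stripping everything after the last '_'
# recovers the scaffold id.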
# get hits to each scaffold
hits_to_scaffold = defaultdict(list)
for query_id, hit in hits_to_ref.items():
    gene_id = query_id[0:query_id.rfind('~')]
    scaffold_id = gene_id[0:gene_id.rfind('_')]
    hits_to_scaffold[scaffold_id].append(hit)
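# Query ids in the DIAMOND table appear to carry an extra '~' suffix, so the
# gene id is recovered first and then reduced to its scaffold id as above;
# hits_to_scaffold ends up as d[scaffold_id] -> list of hits to the reference
# genomes of interest.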
# report summary stats for each scaffold
reference_out = os.path.join(self.output_dir, 'references.tsv')
fout = open(reference_out, 'w')
fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')