def manual(self, options):
    """Manual command"""

    check_file_exists(options.cluster_file)
    check_file_exists(options.genome_file)
    make_sure_path_exists(options.output_dir)

    genome_id = remove_extension(options.genome_file)

    seqs = seq_io.read(options.genome_file)

    # write each scaffold to the FASTA file of its assigned cluster
    fout = {}
    with open(options.cluster_file) as f:
        f.readline()  # skip header line

        for line in f:
            line_split = line.rstrip().split('\t')

            scaffold_id = line_split[0]
            cluster_id = int(line_split[1])
            if cluster_id < 0:
                # negative values indicate scaffolds that should
                # not be placed in a cluster
                continue

            if cluster_id not in fout:
                fout[cluster_id] = open(os.path.join(options.output_dir, genome_id + '_c%d.fna' % cluster_id), 'w')

            # assumed continuation (the snippet was truncated here): write the
            # scaffold sequence to its cluster file
            fout[cluster_id].write('>%s\n%s\n' % (scaffold_id, seqs[scaffold_id]))

    for fh in fout.values():
        fh.close()
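# Hedged, standalone sketch (the helper name is hypothetical) of the tab-separated
# cluster file the loop above consumes: a header line followed by
# '<scaffold_id>\t<cluster_id>' rows, where a negative cluster id marks a scaffold
# that should stay unbinned.
def read_cluster_assignments(cluster_file):
    """Return {scaffold_id: cluster_id}, skipping unbinned (negative) entries."""
    assignments = {}
    with open(cluster_file) as f:
        f.readline()  # skip header
        for line in f:
            fields = line.rstrip().split('\t')
            scaffold_id, cluster_id = fields[0], int(fields[1])
            if cluster_id >= 0:
                assignments[scaffold_id] = cluster_id
    return assignments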
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Number of iterations to perform during clustering.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """
    # get GC and mean coverage for each scaffold in genome
    self.logger.info('Determining mean coverage and genomic signatures.')
    signatures = GenomicSignature(K)

    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append(np_mean(stats.coverage))
        else:
            genome_stats.append(())  # no coverage information used

        if K == 0:
            pass
        elif K == 4:
            # tetranucleotide signatures are precomputed by scaffold_stats
            signature_matrix.append(stats.signature)
        else:
            # compute the k-mer signature directly for other values of K
            sig = signatures.seq_signature(seq)
            total_kmers = sum(sig)
            for i in range(0, len(sig)):
                # assumed continuation (the snippet was truncated here):
                # normalize k-mer counts to frequencies
                sig[i] = float(sig[i]) / total_kmers
            signature_matrix.append(sig)
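# Hedged, self-contained illustration (not the GenomicSignature class itself) of
# what the seq_signature() call plus the normalization loop above compute: k-mer
# counts over a fixed A/C/G/T ordering, scaled to frequencies. The real
# implementation may differ, e.g. by collapsing reverse-complement k-mers.
from itertools import product

def kmer_signature(seq, k=4):
    """Return normalized k-mer frequencies for a single nucleotide sequence."""
    kmers = [''.join(p) for p in product('ACGT', repeat=k)]
    index = {kmer: i for i, kmer in enumerate(kmers)}
    counts = [0] * len(kmers)
    seq = seq.upper()
    for i in range(len(seq) - k + 1):
        j = index.get(seq[i:i + k])
        if j is not None:  # ignore k-mers containing ambiguous bases (e.g. N)
            counts[j] += 1
    total = sum(counts)
    return [float(c) / total for c in counts] if total else counts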
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        genome_dir = os.path.join(output_dir, genome_id)

        if len(best_hits[genome_id]) == 0:
            continue

        # write summary file and putative SSU rRNAs to file
        summary_file = os.path.join(genome_dir, 'ssu.hmm_summary.tsv')
        summary_out = open(summary_file, 'w')
        summary_out.write('Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n')

        ssu_seq_files[genome_id] = os.path.join(genome_dir, 'ssu.fna')
        seq_out = open(ssu_seq_files[genome_id], 'w')

        seqs = seq_io.read(genome_file)
        for seq_id in best_hits[genome_id]:
            orig_seq_id = seq_id
            if '-#' in seq_id:
                seq_id = seq_id[0:seq_id.rfind('-#')]

            seq_info = [orig_seq_id] + best_hits[genome_id][orig_seq_id]
            seq = seqs[seq_id]
            summary_out.write('\t'.join(seq_info) + '\n')

            seq_out.write('>' + seq_info[0] + '\n')
            seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] + '\n')

        summary_out.close()
        seq_out.close()
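# Hedged illustration (all values are made up) of the data shape the loop above
# assumes: best_hits maps each genome id to a dict of sequence id ->
# [HMM, i-Evalue, start hit, end hit, SSU gene length, reverse complement,
# sequence length], with the field order inferred from the summary header.
# The '-#' handling above suggests sequence ids may carry a '-#<n>' suffix when a
# scaffold has multiple hits; the suffix is stripped before indexing seqs.
best_hits_example = {
    'genome_A': {
        'scaffold_17': ['ssu_bacteria', '1.2e-45', '100', '1600', '1500', 'False', '25431'],
    }
}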
    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    criteria1 : str
        First criteria used for splitting genome.
    criteria2 : str
        Second criteria used for splitting genome.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """
    seqs = seq_io.read(genome_file)

    # calculate PCA if necessary
    if 'pc' in criteria1 or 'pc' in criteria2:
        self.logger.info('Performing PCA.')

        signatures = GenomicSignature(K)

        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]
            signature_matrix.append(stats.signature)

        pc, _variance = self.pca(signature_matrix)
        for i, seq_id in enumerate(seqs):
            scaffold_stats.stats[seq_id].pc1 = pc[i][0]
            scaffold_stats.stats[seq_id].pc2 = pc[i][1]
            scaffold_stats.stats[seq_id].pc3 = pc[i][2]

    # split bin
    genome_id = remove_extension(genome_file)
    fout1 = open(os.path.join(output_dir, genome_id + '_c1.fna'), 'w')
    fout2 = open(os.path.join(output_dir, genome_id + '_c2.fna'), 'w')
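# Hedged stand-in for self.pca() (the real method may differ): principal
# components of the signature matrix via numpy's SVD. It only illustrates the
# (components, explained-variance) pair consumed above, where the first three
# component scores per scaffold become pc1, pc2 and pc3.
import numpy as np

def pca_sketch(signature_matrix, n_components=3):
    X = np.asarray(signature_matrix, dtype=float)
    X -= X.mean(axis=0)                      # center each k-mer dimension
    U, S, _Vt = np.linalg.svd(X, full_matrices=False)
    pc = U[:, :n_components] * S[:n_components]   # per-scaffold component scores
    variance = (S ** 2) / max(len(X) - 1, 1)
    return pc, variance / variance.sum()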
    self.logger.info('Creating taxonomic profile for each genome.')
    self.taxonomic_profiles(diamond_table_out, taxonomy)

    # write out taxonomic profile
    self.logger.info('Writing taxonomic profile for each genome.')
    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    for aa_file in gene_files:
        genome_id = remove_extension(aa_file)
        profile = self.profiles[genome_id]

        scaffold_summary_out = os.path.join(report_dir, genome_id + '.scaffolds.tsv')
        profile.write_scaffold_summary(scaffold_stats, scaffold_summary_out)

        gene_summary_out = os.path.join(report_dir, genome_id + '.gene.tsv')
        profile.write_gene_summary(gene_summary_out, seq_io.read(aa_file))

        genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    # create summary report for all genomes
    genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
    self.write_genome_summary(genome_summary_out)

    # create Krona plot based on classification of scaffolds
    self.logger.info('Creating Krona plot for each genome.')
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.items():
        seq_assignments = profile.classify_seqs()
        for seq_id, classification in seq_assignments.items():
            taxa = []
            # (snippet truncated here: presumably the taxon assigned at each rank
            # is collected into taxa and the joined lineage tallied in krona_profiles)
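# Hedged sketch (the helper and file name are hypothetical) of turning one
# genome's inner profile, {lineage string: scaffold count}, into the plain
# count-plus-tab-separated-taxa text format accepted by Krona's text importer.
def write_krona_input(krona_profile, output_file):
    with open(output_file, 'w') as fout:
        for lineage, count in krona_profile.items():
            fout.write('%d\t%s\n' % (count, lineage.replace(';', '\t')))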
                if gc < best_gc[0]:
                    best_gc = [gc, bin_id]

                if td < best_td[0]:
                    best_td = [td, bin_id]

                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
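# Hedged sketch of the initialization the comparisons above rely on (not shown in
# the snippet): each tracker holds [best distance so far, bin id] and starts at
# infinity, so the first bin examined always becomes the provisional best.
best_gc = [float('inf'), None]
best_td = [float('inf'), None]
best_cov = [float('inf'), None]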
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_ids.append(scaffold_id)
            bin_ids[scaffold_id] = bin_id

    # a scaffold is compatible if it is assigned to exactly one bin
    # and that bin is the current bin
    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.items():
        if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
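# Equivalent (and faster for large inputs) variant of the uniqueness test above,
# swapping the per-iteration list.count() for a precomputed collections.Counter.
# Variable names mirror the snippet; cur_bin_id is assumed to be supplied by the caller.
from collections import Counter

def unique_compatible_scaffolds(scaffold_ids, bin_ids, cur_bin_id):
    counts = Counter(scaffold_ids)
    return {sid for sid, bid in bin_ids.items()
            if counts[sid] == 1 and bid == cur_bin_id}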
        scaffold_gc = float(line_split[scaffold_gc_index])
        genome_gc = float(line_split[genome_gc_index])
        gc_dist = abs(scaffold_gc - genome_gc)

        td_dist = float(line_split[td_dist_index])

        scaffold_cov = float(line_split[scaffold_cov_index])
        genome_cov = float(line_split[genome_cov_index])
        cov_dist = abs(scaffold_cov - genome_cov)

        if bin_id == cur_bin_id:
            scaffold_ids.add(scaffold_id)

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in scaffold_ids:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
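# Minimal hedged stand-in for the seq_io.write_fasta() call above (the real biolib
# helper may differ): write a {seq_id: sequence} dict to a FASTA file, wrapping
# sequence lines at a fixed width.
def write_fasta_sketch(seqs, output_file, wrap=80):
    with open(output_file, 'w') as fout:
        for seq_id, seq in seqs.items():
            fout.write('>%s\n' % seq_id)
            for i in range(0, len(seq), wrap):
                fout.write(seq[i:i + wrap] + '\n')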