Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
gene_files : list
Fasta files containing called genes in nucleotide space.
Returns
-------
dict of dict : d[genome_id][codon] -> count
Codon usage of each genome.
set
Set with all identified codons.
dict of dict : d[genome_id][codon] -> length
Mean length of genes for each stop codon.
"""
self.logger.info(' Calculating codon usage for each genome.')
# Hand the gene files to the producer through the Parallel helper,
# constructed with the configured CPU count; no consumer callback is given.
parallel = Parallel(self.cpus)
# NOTE(review): the slice gene_files[0:1] passes only the first gene file,
# although the log message above says "each genome" -- this looks like a
# debugging leftover; confirm whether the full gene_files list was intended.
parallel.run(self._producer, None, gene_files[0:1], self._progress)
Fasta files containing called genes in nucleotide space.
output_dir : str
Directory to store results.
"""
self.output_dir = output_dir
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
self.logger.info('Calculating codon usage for each genome.')
progress_func = self._progress
if self.logger.is_silent:
progress_func = None
parallel = Parallel(self.cpus)
parallel.run(self._producer, None, gene_files, progress_func)
# Build every unordered pair of genomes: each genome is paired with every
# genome that follows it in key order, so no pair is repeated and no genome
# is paired with itself. Each element is ((id_a, genes_a), (id_b, genes_b)).
genome_ids = list(genes_in_genomes.keys())
genome_infos = [(genome_id, genes_in_genomes[genome_id]) for genome_id in genome_ids]

genome_info_pairs = []
for idx, first_info in enumerate(genome_infos):
    for second_info in genome_infos[idx + 1:]:
        genome_info_pairs.append((first_info, second_info))
if len(genome_info_pairs) == 0:
self.logger.warning('No genome pairs identified.')
return
parallel = Parallel(self.cpus)
consumer_data = parallel.run(self._producer, self._consumer, genome_info_pairs, self._progress)
# write results for each genome pair
aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')

# The context manager guarantees the summary file is closed even when a row
# fails to format (each row must be a 7-tuple matching the template) or the
# write itself raises.
with open(aai_summay_file, 'w') as fout:
    fout.write('Genome Id A\tGenes in A\tGenome Id B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\n')
    for data in consumer_data:
        fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\n' % data)

self.logger.info('Summary of AAI between genomes: %s' % aai_summay_file)
target_genomes = list(target_genomes)
# For each query genome, record the genomes it is to be compared against:
# the explicit target list when one was supplied, otherwise every query
# genome that comes after it (so each query pair is listed only once).
for position, genome_idI in enumerate(query_genomes):
    if not target_genomes:
        genome_id_list = list(query_genomes[position + 1:])
    else:
        # the same target list object is shared by every query genome
        genome_id_list = target_genomes

    genome_id_lists.append((genome_idI, genome_id_list))
self.processed_paired = 0
parallel = Parallel(self.cpus)
progress_func = self._progress
if self.logger.is_silent:
progress_func = None
consumer_data = parallel.run(self._producer, self._consumer, genome_id_lists, progress_func)
# write results for each genome pair
self.logger.info('Summarizing AAI results.')
aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')

# The context manager guarantees the summary file is closed even when a row
# fails to format (each row must be an 8-tuple matching the template) or the
# write itself raises.
with open(aai_summay_file, 'w') as fout:
    fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')
    for data in consumer_data:
        fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)
output_dir : str
Directory to store blast results.
"""
self.evalue = evalue
self.output_dir = output_dir
# set CPUs per producer process: default to one, and when there are more
# CPUs than gene files split the CPUs between the producers
self.producer_cpus = 1
num_gene_files = len(aa_gene_files)
# Floor division keeps the CPU count an integer under both Python 2 and
# Python 3; the truthiness guard avoids a ZeroDivisionError when no gene
# files were supplied.
if num_gene_files and self.cpus > num_gene_files:
    self.producer_cpus = self.cpus // num_gene_files
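# create the blast databases (NOTE(review): this comment originally said "in serial", but the work is dispatched through Parallel(self.cpus) -- confirm which is intended)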
self.logger.info(' Creating blast databases:')
parallel = Parallel(self.cpus)
parallel.run(self._producer_db, None, aa_gene_files, self._progress)
# perform reciprocal blast between all genome pairs
self.logger.info('')
self.logger.info(' Identifying hits between all pairs of genomes:')
# Collect the pairs of gene files to hand to the blast producer.
genome_pairs = []
for i in xrange(0, len(aa_gene_files)):
# The inner range starts at i (not i + 1), so every file is also paired
# with itself in addition to each file that follows it.
for j in xrange(i, len(aa_gene_files)):
genome_pairs.append((aa_gene_files[i], aa_gene_files[j]))
parallel.run(self._producer_blast, None, genome_pairs, self._progress)
Returns
-------
dict of dict : d[genome_id][kmer] -> count
Kmer usage of each genome.
set
Set with all identified kmers.
"""
self.logger.info('Calculating kmer usage for each genome.')
progress_func = self._progress
if self.logger.is_silent:
progress_func = None
parallel = Parallel(self.cpus)
kmer_counts = parallel.run(self._producer, self._consumer, genome_files, progress_func)
return kmer_counts, self.signatures.canonical_order()
"""Calculate tetranucleotide signatures of sequences.
Parameters
----------
seq_file : str
Name of fasta/q file to read.
Returns
-------
dict : d[seq_id] -> tetranucleotide signature in canonical order
Count of each kmer.
"""
self.logger.info('Calculating tetranucleotide signature for each sequence:')
parallel = Parallel(self.cpus)
seq_signatures = parallel.run_seqs_file(self._producer, self._consumer, seq_file, self._progress)
return seq_signatures
-------
dict of dict : d[genome_id][codon] -> count
Codon usage of each genome.
set
Set with all identified codons.
dict of dict : d[genome_id][codon] -> length
Mean length of genes for each stop codon.
"""
self.logger.info('Calculating codon usage for each genome.')
progress_func = self._progress
if self.logger.is_silent:
progress_func = None
parallel = Parallel(self.cpus)
consumer_data = parallel.run(self._producer, self._consumer, gene_files, progress_func)
return consumer_data.genome_codon_usage, consumer_data.codon_set, consumer_data.mean_gene_length
Directory to store results.
"""
self.output_dir = output_dir
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
self.critical_value = critical_value
self.logger.info('Calculating dinucleotide usage for each genome.')
progress_func = self._progress
if self.logger.is_silent:
progress_func = None
parallel = Parallel(self.cpus)
parallel.run(self._producer, None, gene_files, progress_func)
Returns
-------
dict of dict : dict[genome_id][aa] -> count
Amino acid usage of each genome.
set
Set with all identified amino acids.
"""
self.logger.info('Calculating amino acid usage for each genome:')
progress_func = self._progress
if self.logger.is_silent:
progress_func = None
parallel = Parallel(self.cpus)
consumer_data = parallel.run(self._producer, self._consumer, gene_files, progress_func)
return consumer_data.genome_aa_usage, consumer_data.aa_set