Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# calculate Manhattan distance for each gene
manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage)
# report dinucleotide usage of each gene
codon_set_sorted = sorted(genome_codon_usage.keys())
gene_ids_sorted = sorted(manhattan_dist.items(), key=operator.itemgetter(1), reverse=True)
output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv')
fout = open(output_file, 'w')
fout.write('Gene Id\tGC\tLength (bp)\t# codons\tManhattan distance\tDeviations from mean')
for di in codon_set_sorted:
fout.write('\t' + di)
fout.write('\n')
genome_gc = seq_tk.gc(''.join(seqs.values()))
genome_sum_codon = sum(genome_codon_usage.values())
fout.write('%s\t%.2f\t%d\t%d' % ('', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_codon))
fout.write('\t%.1f\t%.1f' % (0, 0))
for di in codon_set_sorted:
fout.write('\t%.2f' % (genome_codon_usage.get(di, 0) * 100.0 / genome_sum_codon))
fout.write('\n')
for gene_id, dists in gene_ids_sorted:
codons = gene_codon_usage[gene_id]
sum_codons = sum(codons.values())
fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_codons))
fout.write('\t%.2f\t%.2f' % (dists[0], dists[1]))
for di in codon_set_sorted:
fout.write('\t%.2f' % (codons.get(di, 0) * 100.0 / sum_codons))
fout.write('\n')
fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')
if cov_profiles:
first_key = list(cov_profiles.keys())[0]
bam_ids = sorted(cov_profiles[first_key].keys())
for bam_id in bam_ids:
fout.write('\t' + bam_id)
for kmer in tetra.canonical_order():
fout.write('\t' + kmer)
fout.write('\n')
for scaffold_id, seq in seq_io.read_seq(scaffold_file):
fout.write(scaffold_id)
fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
fout.write('\t%d' % len(seq))
if cov_profiles:
for bam_id in bam_ids:
fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])
fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
fout.write('\n')
fout.close()
Sequences indexed by sequence id.
genome_id : str
Unique id of genome used to create output file.
"""
# calculate dinucleotide usage for each gene and the genome as a while
gene_di_usage = defaultdict(lambda: defaultdict(int))
gene_n1 = defaultdict(lambda: defaultdict(int))
gene_n3 = defaultdict(lambda: defaultdict(int))
genome_di_usage = defaultdict(int)
genome_n1 = defaultdict(int)
genome_n3 = defaultdict(int)
gc = {}
for gene_id, seq in seqs.items():
gc[gene_id] = seq_tk.gc(seq)
for i in range(2, len(seq) - 2, 3):
dinucleotide = seq[i:i + 2].upper()
if 'N' not in dinucleotide:
gene_di_usage[gene_id][dinucleotide] += 1
gene_n3[gene_id][dinucleotide[0]] += 1
gene_n1[gene_id][dinucleotide[1]] += 1
genome_di_usage[dinucleotide] += 1
genome_n3[dinucleotide[0]] += 1
genome_n1[dinucleotide[1]] += 1
# calculate Manhattan distance for each gene
manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage)
# identify deviant genes under Hotelling T-squared statistic
"""Calculate codon usage statistics for genome.
Parameters
----------
seqs : dict[seq_id] -> seq
Sequences indexed by sequence id.
genome_id : str
Unique id of genome used to create output file.
"""
# calculate codon usage for each gene and the genome as a while
gene_codon_usage = defaultdict(lambda: defaultdict(int))
genome_codon_usage = defaultdict(int)
gc = {}
for gene_id, seq in seqs.items():
gc[gene_id] = seq_tk.gc(seq)
for i in range(0, len(seq) - 3, 3):
codon = seq[i:i + 3].upper()
if 'N' not in codon:
gene_codon_usage[gene_id][codon] += 1
genome_codon_usage[codon] += 1
# calculate Manhattan distance for each gene
manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage)
# report dinucleotide usage of each gene
codon_set_sorted = sorted(genome_codon_usage.keys())
gene_ids_sorted = sorted(manhattan_dist.items(), key=operator.itemgetter(1), reverse=True)
output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv')
fout = open(output_file, 'w')
f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes)
deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction
# report dinucleotide usage of each gene
di_set_sorted = sorted(genome_di_usage.keys())
gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True)
output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv')
fout = open(output_file, 'w')
fout.write('Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean')
for di in di_set_sorted:
fout.write('\t' + di)
fout.write('\n')
genome_gc = seq_tk.gc(''.join(seqs.values()))
genome_sum_di = sum(genome_di_usage.values())
fout.write('%s\t%.2f\t%d\t%d' % ('', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_di))
fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0))
for di in di_set_sorted:
fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di))
fout.write('\n')
for gene_id, t2_stat in gene_ids_sorted:
dinucleotides = gene_di_usage[gene_id]
sum_di = sum(dinucleotides.values())
fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di))
fout.write('\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no'))
fout.write('\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1]))
for di in di_set_sorted:
fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di))