How to use the biolib.seq_tk.gc function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / CompareM / comparem / lgt_codon.py View on Github external
# calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage)

        # report dinucleotide usage of each gene
        codon_set_sorted = sorted(genome_codon_usage.keys())
        gene_ids_sorted = sorted(manhattan_dist.items(), key=operator.itemgetter(1), reverse=True)

        output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv')
        fout = open(output_file, 'w')

        fout.write('Gene Id\tGC\tLength (bp)\t# codons\tManhattan distance\tDeviations from mean')
        for di in codon_set_sorted:
            fout.write('\t' + di)
        fout.write('\n')

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_sum_codon = sum(genome_codon_usage.values())
        fout.write('%s\t%.2f\t%d\t%d' % ('', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_codon))
        fout.write('\t%.1f\t%.1f' % (0, 0))
        for di in codon_set_sorted:
            fout.write('\t%.2f' % (genome_codon_usage.get(di, 0) * 100.0 / genome_sum_codon))
        fout.write('\n')

        for gene_id, dists in gene_ids_sorted:
            codons = gene_codon_usage[gene_id]
            sum_codons = sum(codons.values())
            fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_codons))
            fout.write('\t%.2f\t%.2f' % (dists[0], dists[1]))

            for di in codon_set_sorted:
                fout.write('\t%.2f' % (codons.get(di, 0) * 100.0 / sum_codons))
            fout.write('\n')
github dparks1134 / RefineM / refinem / scaffold_stats.py View on Github external
fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            first_key = list(cov_profiles.keys())[0]
            bam_ids = sorted(cov_profiles[first_key].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for scaffold_id, seq in seq_io.read_seq(scaffold_file):
            fout.write(scaffold_id)
            fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
            fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
            fout.write('\t%d' % len(seq))

            if cov_profiles:
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

            fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
            fout.write('\n')

        fout.close()
github dparks1134 / CompareM / comparem / lgt_dinucleotide.py View on Github external
Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate dinucleotide usage for each gene and the genome as a while
        gene_di_usage = defaultdict(lambda: defaultdict(int))
        gene_n1 = defaultdict(lambda: defaultdict(int))
        gene_n3 = defaultdict(lambda: defaultdict(int))

        genome_di_usage = defaultdict(int)
        genome_n1 = defaultdict(int)
        genome_n3 = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            for i in range(2, len(seq) - 2, 3):
                dinucleotide = seq[i:i + 2].upper()
                if 'N' not in dinucleotide:
                    gene_di_usage[gene_id][dinucleotide] += 1
                    gene_n3[gene_id][dinucleotide[0]] += 1
                    gene_n1[gene_id][dinucleotide[1]] += 1

                    genome_di_usage[dinucleotide] += 1
                    genome_n3[dinucleotide[0]] += 1
                    genome_n1[dinucleotide[1]] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage)

        # identify deviant genes under Hotelling T-squared statistic
github dparks1134 / CompareM / comparem / lgt_codon.py View on Github external
"""Calculate codon usage statistics for genome.

        Parameters
        ----------
        seqs : dict[seq_id] -> seq
            Sequences indexed by sequence id.
        genome_id : str
            Unique id of genome used to create output file.
        """

        # calculate codon usage for each gene and the genome as a while
        gene_codon_usage = defaultdict(lambda: defaultdict(int))
        genome_codon_usage = defaultdict(int)
        gc = {}
        for gene_id, seq in seqs.items():
            gc[gene_id] = seq_tk.gc(seq)

            for i in range(0, len(seq) - 3, 3):
                codon = seq[i:i + 3].upper()
                if 'N' not in codon:
                    gene_codon_usage[gene_id][codon] += 1
                    genome_codon_usage[codon] += 1

        # calculate Manhattan distance for each gene
        manhattan_dist = self._manhattan(gene_codon_usage, genome_codon_usage)

        # report dinucleotide usage of each gene
        codon_set_sorted = sorted(genome_codon_usage.keys())
        gene_ids_sorted = sorted(manhattan_dist.items(), key=operator.itemgetter(1), reverse=True)

        output_file = os.path.join(self.output_dir, genome_id + '.codon_usage.tsv')
        fout = open(output_file, 'w')
github dparks1134 / CompareM / comparem / lgt_dinucleotide.py View on Github external
f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes)
        deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction

        # report dinucleotide usage of each gene
        di_set_sorted = sorted(genome_di_usage.keys())
        gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True)

        output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv')
        fout = open(output_file, 'w')

        fout.write('Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean')
        for di in di_set_sorted:
            fout.write('\t' + di)
        fout.write('\n')

        genome_gc = seq_tk.gc(''.join(seqs.values()))
        genome_sum_di = sum(genome_di_usage.values())
        fout.write('%s\t%.2f\t%d\t%d' % ('', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_di))
        fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0))
        for di in di_set_sorted:
            fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di))
        fout.write('\n')

        for gene_id, t2_stat in gene_ids_sorted:
            dinucleotides = gene_di_usage[gene_id]
            sum_di = sum(dinucleotides.values())
            fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di))
            fout.write('\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no'))
            fout.write('\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1]))

            for di in di_set_sorted:
                fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di))