How to use the biolib.seq_io.read_fasta function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / src / gtdbtk / TreeManager.py View on Github external
marker_dict_original = {marker.replace(".HMM", "").replace(".hmm", ""): os.path.join(marker_paths[db_marker], marker) for marker in bacterial_set[db_marker]}
        else:
            for db_marker in archaeal_set.keys().sort():
                marker_dict_original = {marker.replace(".HMM", "").replace(".hmm", ""): os.path.join(marker_paths[db_marker], marker) for marker in archaeal_set[db_marker]}

        gene_bac_list = []
        gene_arc_list = []
        for marker_db, marker_suffix in marker_dbs.iteritems():
            # get all gene sequences
            genome_path = str(path)
            tophit_path = genome_path.replace(ConfigMetadata.PROTEIN_FILE_SUFFIX, marker_suffix)

            # we load the list of all the genes detected in the genome
            protein_file = tophit_path.replace(
                marker_suffix, self.protein_file_suffix)
            all_genes_dict = read_fasta(protein_file, False)

            # we store the tophit file line by line and store the
            # information in a dictionary
            with open(tophit_path) as tp:
                # first line is header line
                tp.readline()
                gene_dict = {}
                for line_tp in tp:
                    linelist = line_tp.split("\t")
                    genename = linelist[0]
                    sublist = linelist[1]
                    if ";" in sublist:
                        diff_markers = sublist.split(";")
                    else:
                        diff_markers = [sublist]
github dparks1134 / CompareM / comparem / kmer_usage.py View on Github external
----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : d[kmer] -> count
            Occurrence of each kmer.
        """

        genome_id = ntpath.basename(genome_file)
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.counts(seqs)

        return (genome_id, kmer_usage)
github dparks1134 / CompareM / comparem / amino_acid_usage.py View on Github external
gene_file : str
            Fasta file containing amino acid sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : dict[aa] -> count
            Occurrence of each amino acid.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.faa', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        aa_usage = self.amino_acid_usage(seqs)

        return [genome_id, aa_usage]
github dparks1134 / CompareM / comparem / lgt_dinucleotide.py View on Github external
def _producer(self, gene_file):
        """Compute dinucleotide usage for the genes of a single genome.

        The genome identifier is derived from the file name of `gene_file`:
        any '.genes.fna' substring is removed and then the remaining file
        extension (if any) is dropped.

        Parameters
        ----------
        gene_file : str
            Fasta file containing the gene sequences of a genome.

        Returns
        -------
        bool
            Always True.
        """

        # ntpath.basename handles both '/' and '\\' path separators
        file_name = ntpath.basename(gene_file).replace('.genes.fna', '')
        genome_id, _extension = os.path.splitext(file_name)

        # results are handled by dinucleotide_usage; nothing is returned here
        self.dinucleotide_usage(seq_io.read_fasta(gene_file), genome_id)

        return True
github dparks1134 / CompareM / comparem / codon_usage.py View on Github external
Returns
        -------
        str
           Unique identifier of genome.
        dict : d[codon] -> count
            Occurrence of each codon.
        dict : d[codon] -> length
            Average length of genes for a given stop codon.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.fna', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        codon_usage, gene_length = self.codon_usage(seqs)

        return (genome_id, codon_usage, gene_length)
github Ecogenomics / GTDBTk / scripts / trim_msa_based_on_mask.py View on Github external
def run(self, msa, mask, outf):
        """Trim every sequence of an alignment to the columns kept by a mask.

        Parameters
        ----------
        msa : str
            Fasta file read with read_fasta; each entry is written back out
            under the same identifier after trimming.
        mask : str
            File whose first line is the mask string. Positions holding the
            character '1' are kept; every other position is dropped.
        outf : str
            Path of the fasta file to write.
        """

        dict_genomes = read_fasta(msa, False)
        with open(mask, 'r') as f:
            maskstr = f.readline()
        print(maskstr)
        print(len(maskstr))

        # The kept positions are identical for every sequence, so compute
        # them once instead of rescanning the mask per genome.
        keep_columns = [i for i, flag in enumerate(maskstr) if flag == '1']

        # The context manager guarantees the output is flushed and closed even
        # if a sequence is shorter than the mask and indexing raises.
        with open(outf, 'w') as outfwriter:
            # dict.items() works on both Python 2 and 3; dict.iteritems()
            # only exists on Python 2.
            for k, v in dict_genomes.items():
                aligned_seq = ''.join([v[i] for i in keep_columns])
                outfwriter.write(">%s\n%s\n" % (k, aligned_seq))