How to use the biolib.seq_io.read_fasta_seq function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / scripts / make_database.py View on Github external
continue

            line_split = line.split('\t')
            scaffold_id = line_split[0]
            info = line_split[8]
            if info != '':  # this will be empty for non-protein coding genes
                gene_id = info.split(';')[0].replace('ID=', '')

                gene_number[scaffold_id] += 1
                gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # write out gene file with modified identifiers
        genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))

        fout = open(os.path.join(output_dir, genome_id + '.faa'), 'w')
        for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):

            annotation = annotation[annotation.find(' ') + 1:]  # remove additional gene id from annotation
            annotation += ' [IMG Gene ID: ' + gene_id + ']'  # append IMG gene id for future reference

            fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
            fout.write(seq + '\n')
        fout.close()
github dparks1134 / CompareM / comparem / aai_calculator.py View on Github external
Directory to store AAI results.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)
                
            genome_id = seq_id[0:seq_id.find('~')]
            self.query_gene_count[genome_id] += 1
            query_genomes.add(genome_id)
            
        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
                if seq[-1] == '*':
                    self.gene_lengths[seq_id] = len(seq) - 1
                else:
github dparks1134 / CompareM / comparem / aai_calculator.py View on Github external
self.query_gene_count = defaultdict(int)
        query_genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)
                
            genome_id = seq_id[0:seq_id.find('~')]
            self.query_gene_count[genome_id] += 1
            query_genomes.add(genome_id)
            
        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
                if seq[-1] == '*':
                    self.gene_lengths[seq_id] = len(seq) - 1
                else:
                    self.gene_lengths[seq_id] = len(seq)
                    
                genome_id = seq_id[0:seq_id.find('~')]
                self.target_gene_count[genome_id] += 1
                target_genomes.add(genome_id)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
github dparks1134 / CompareM / comparem / similarity_search.py View on Github external
----------
        gene_files : list of str
            Genes in fasta files to modify.
        keep_headers : boolean
            If True, indicates FASTA headers already have the format ~.
        output_file : str
            Name of FASTA file to contain modified genes.
        """
        
        fout = open(output_file, 'w')
        for gf in gene_files:           
            genome_id = remove_extension(gf)
            if genome_id.endswith('_genes'):
                genome_id = genome_id[0:genome_id.rfind('_genes')]
                
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                if keep_headers:
                    fout.write('>' + seq_id  + ' ' + annotation + '\n')
                else:
                    fout.write('>' + genome_id + '~' + seq_id  + ' ' + annotation + '\n')
                fout.write(seq + '\n')
        fout.close()
github dparks1134 / CompareM / comparem / aai_calculator.py View on Github external
self.blast_dir = blast_dir

        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.output_dir = output_dir

        shared_genes_dir = os.path.join(output_dir, self.shared_genes)
        make_sure_path_exists(shared_genes_dir)
        self.shared_genes_dir = shared_genes_dir

        # calculate length of genes and number of genes each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths  = {}
        genes_in_genomes = defaultdict(int)
        for seq_id, seq in seq_io.read_fasta_seq(os.path.join(self.blast_dir, self.gene_file)):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)
                
            genome_id = seq_id[0:seq_id.find('~')]
            genes_in_genomes[genome_id] += 1

        # get byte offset of hits from each genome
        self.logger.info('Indexing blast hits.')
        self.blast_table = os.path.join(self.blast_dir, self.blast_table_file)
        self.offset_table = self._genome_offsets(self.blast_table)

        # calculate AAI between each pair of genomes in parallel
        self.logger.info('Calculating amino acid identity between all pairs of genomes:')
github dparks1134 / RefineM / scripts / make_database.py View on Github external
gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : float
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')
            fout = open(aa_file, 'w')
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                if seq[-1] == '*':
                    seq = seq[0:-1]
                fout.write(seq + '\n')
            fout.close()
github dparks1134 / RefineM / scripts / make_database.py View on Github external
genes_to_ignore : set
            Genes which should not be written to file.
        """

        genes_kept = 0
        for genome_id in genome_list:
            genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not os.path.exists(genome_gene_file):
                print '[WARNING] Missing gene file for genome %s.' % genome_gene_file
                continue

            if os.stat(genome_gene_file).st_size == 0:
                print '[WARNING] Gene file is empty for genome %s.' % genome_gene_file
                continue

            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                if gene_id in genes_to_ignore:
                    continue

                # IMG headers sometimes contain non-ascii characters which cause
                # problems with BLAST and DIAMOND so there are explicitly filtered out
                annotation = filter(lambda x: x in string.printable, annotation)

                # a few IMG genomes contain protein sequences which start with a hyphen
                if seq[0] == '-':
                    seq = seq[1:]

                gene_out.write('>' + gene_id + ' ' + annotation + '\n')
                gene_out.write(seq + '\n')
                genes_kept += 1

        return genes_kept