How to use the biolib.seq_io.read_seq function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / scaffold_stats.py View on Github external
tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        # determine bin assignment for each scaffold
        self.logger.info('Determining scaffold statistics.')

        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # write out scaffold statistics
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            first_key = list(cov_profiles.keys())[0]
            bam_ids = sorted(cov_profiles[first_key].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')
github dparks1134 / RefineM / refinem / bin_comparer.py View on Github external
Parameters
        ----------
        genome_files : iterable
            Genome files in fasta format.

        Returns
        -------
        dict: d[genome_id] -> set(seq_id1, ..., seq_idN)
            Ids of sequences in each genome.
        """

        genome_seqs = defaultdict(set)
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            for seq_id, _seq in seq_io.read_seq(genome_file):
                genome_seqs[genome_id].add(seq_id)

        return genome_seqs
github dparks1134 / RefineM / refinem / deprecated / taxonomic_profile.py View on Github external
Parameters
        ----------
        genome_file : str
            Fasta file with genome sequences to fragment.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        profile : Profile
            Class for classifying fragments.
        fout : stream
            Output stream to store all fragments.
        """

        for seq_id, seq in seq_io.read_seq(genome_file):
            fragments = seq_tk.fragment(seq, window_size, step_size)
            for i, frag in enumerate(fragments):
                fout.write('>' + seq_id + '~' + str(i) + '\n')
                fout.write(frag + '\n')

                profile.fragments_from_seq[seq_id] = len(fragments)
                profile.seq_len[seq_id] = len(seq)

github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(taxa)] += profile.genes_in_scaffold[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir, 'gene_profiles.scaffolds.html')
        krona.create(krona_profiles, krona_output_file)

        # create Krona plot based on best hit of each gene
        krona_profiles = defaultdict(lambda: defaultdict(int))

        for aa_file in gene_files:
            genome_id = remove_extension(aa_file)

            profile = self.profiles[genome_id]
            for gene_id, _seq in seq_io.read_seq(aa_file):

                taxa_str = Taxonomy.unclassified_taxon
                if gene_id in profile.gene_hits:
                    taxa_str, _hit_info = profile.gene_hits[gene_id]

                krona_profiles[genome_id][taxa_str] += 1

        krona_output_file = os.path.join(self.output_dir, 'gene_profiles.genes.html')
        krona.create(krona_profiles, krona_output_file)
github dparks1134 / RefineM / refinem / scaffold_stats.py View on Github external
# write out scaffold statistics
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            first_key = list(cov_profiles.keys())[0]
            bam_ids = sorted(cov_profiles[first_key].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for scaffold_id, seq in seq_io.read_seq(scaffold_file):
            fout.write(scaffold_id)
            fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
            fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
            fout.write('\t%d' % len(seq))

            if cov_profiles:
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

            fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
            fout.write('\n')

        fout.close()
github dparks1134 / RefineM / refinem / outliers.py View on Github external
best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)
                
        self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
github dparks1134 / RefineM / refinem / unbinned.py View on Github external
binned_seq_ids = set()
        total_binned_bases = 0
        for genome_file in genome_files:
            for seq_id, seq in seq_io.read_seq(genome_file):
                binned_seq_ids.add(seq_id)
                total_binned_bases += len(seq)

        self.logger.info('Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))

        # write all unbinned sequences
        self.logger.info('Identifying unbinned scaffolds >= %d bp.' % min_seq_len)

        unbinned_bases = 0
        unbinned_seqs = {}
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
                unbinned_seqs[seq_id] = seq
                unbinned_bases += len(seq)

        self.logger.info('Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))

        self.logger.info('Percentage of unbinned scaffolds: %.2f%%' % (len(unbinned_seqs) * 100.0 / (len(unbinned_seqs) + len(binned_seq_ids))))
        self.logger.info('Percentage of unbinned bases: %.2f%%' % (unbinned_bases * 100.0 / (unbinned_bases + total_binned_bases)))

        return unbinned_seqs
github dparks1134 / RefineM / refinem / reference.py View on Github external
self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')