How to use the biolib.seq_io.read function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / main.py View on Github external
def manual(self, options):
        """Manual command"""
        
        check_file_exists(options.cluster_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)
        
        genome_id = remove_extension(options.genome_file)

        seqs = seq_io.read(options.genome_file)
        fout = {}
        with open(options.cluster_file) as f:
            f.readline()
            
            for line in f:
                line_split = line.rstrip().split('\t')
                scaffold_id = line_split[0]
                cluster_id = int(line_split[1])
                
                if cluster_id < 0:
                    # negative values indicate scaffolds that should
                    # not be placed in a cluster
                    continue
                    
                if cluster_id not in fout:
                    fout[cluster_id] = open(os.path.join(options.output_dir, genome_id + '_c%d.fna' % cluster_id), 'w')
github dparks1134 / RefineM / refinem / cluster.py View on Github external
no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations: int
            iterations to perform during clustering
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
github dparks1134 / RefineM / refinem / ssu.py View on Github external
for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)
            
            if len(best_hits[genome_id]) == 0:
                continue

            # write summary file and putative SSU rRNAs to file
            summary_file = os.path.join(genome_dir, 'ssu.hmm_summary.tsv')
            summary_out = open(summary_file, 'w')
            summary_out.write('Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n')

            ssu_seq_files[genome_id] = os.path.join(genome_dir, 'ssu.fna')
            seq_out = open(ssu_seq_files[genome_id] , 'w')

            seqs = seq_io.read(genome_file)

            for seq_id in best_hits[genome_id]:
                orig_seq_id = seq_id
                if '-#' in seq_id:
                    seq_id = seq_id[0:seq_id.rfind('-#')]

                seq_info = [orig_seq_id] + best_hits[genome_id][orig_seq_id]
                seq = seqs[seq_id]
                summary_out.write('\t'.join(seq_info) + '\n')

                seq_out.write('>' + seq_info[0] + '\n')
                seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] + '\n')

            summary_out.close()
            seq_out.close()
github dparks1134 / RefineM / refinem / cluster.py View on Github external
criteria2 : str
           Second criteria used for splitting genome.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """
        
        seqs = seq_io.read(genome_file)
        
        # calculate PCA if necessary
        if 'pc' in criteria1 or 'pc' in criteria2:
            self.logger.info('Performing PCA.')
            signatures = GenomicSignature(K)
            signature_matrix = []
            seqs = seq_io.read(genome_file)
            for seq_id, seq in seqs.items():
                stats = scaffold_stats.stats[seq_id]

                signature_matrix.append(stats.signature)

            pc, _variance = self.pca(signature_matrix)
            for i, seq_id in enumerate(seqs):
                scaffold_stats.stats[seq_id].pc1 = pc[i][0]
                scaffold_stats.stats[seq_id].pc2 = pc[i][1]
                scaffold_stats.stats[seq_id].pc3 = pc[i][2]
                
        # split bin
        genome_id = remove_extension(genome_file)
        fout1 = open(os.path.join(output_dir, genome_id + '_c1.fna'), 'w')
        fout2 = open(os.path.join(output_dir, genome_id + '_c2.fna'), 'w')
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
self.logger.info('Creating taxonomic profile for each genome.')
        self.taxonomic_profiles(diamond_table_out, taxonomy)

        # write out taxonomic profile
        self.logger.info('Writing taxonomic profile for each genome.')
        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)
        for aa_file in gene_files:
            genome_id = remove_extension(aa_file)
            profile = self.profiles[genome_id]

            scaffold_summary_out = os.path.join(report_dir, genome_id + '.scaffolds.tsv')
            profile.write_scaffold_summary(scaffold_stats, scaffold_summary_out)

            gene_summary_out = os.path.join(report_dir, genome_id + '.gene.tsv')
            profile.write_gene_summary(gene_summary_out, seq_io.read(aa_file))

            genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        # create summary report for all genomes
        genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
        self.write_genome_summary(genome_summary_out)

        # create Krona plot based on classification of scaffolds
        self.logger.info('Creating Krona plot for each genome.')
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.items():
            seq_assignments = profile.classify_seqs()

            for seq_id, classification in seq_assignments.items():
                taxa = []
github dparks1134 / RefineM / refinem / outliers.py View on Github external
if gc < best_gc[0]:
                    best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)
                
        self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
github dparks1134 / RefineM / refinem / outliers.py View on Github external
scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_ids.append(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)
                
        self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
github dparks1134 / RefineM / refinem / cluster.py View on Github external
Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        criteria1 : str
            First criteria used for splitting genome.
        criteria2 : str
           Second criteria used for splitting genome.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """
        
        seqs = seq_io.read(genome_file)
        
        # calculate PCA if necessary
        if 'pc' in criteria1 or 'pc' in criteria2:
            self.logger.info('Performing PCA.')
            signatures = GenomicSignature(K)
            signature_matrix = []
            seqs = seq_io.read(genome_file)
            for seq_id, seq in seqs.items():
                stats = scaffold_stats.stats[seq_id]

                signature_matrix.append(stats.signature)

            pc, _variance = self.pca(signature_matrix)
            for i, seq_id in enumerate(seqs):
                scaffold_stats.stats[seq_id].pc1 = pc[i][0]
                scaffold_stats.stats[seq_id].pc2 = pc[i][1]
github dparks1134 / RefineM / refinem / outliers.py View on Github external
scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                if bin_id == cur_bin_id:
                    scaffold_ids.add(scaffold_id)

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in scaffold_ids:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)