Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
marker_dict_original = {marker.replace(".HMM", "").replace(".hmm", ""): os.path.join(marker_paths[db_marker], marker) for marker in bacterial_set[db_marker]}
else:
for db_marker in archaeal_set.keys().sort():
marker_dict_original = {marker.replace(".HMM", "").replace(".hmm", ""): os.path.join(marker_paths[db_marker], marker) for marker in archaeal_set[db_marker]}
gene_bac_list = []
gene_arc_list = []
for marker_db, marker_suffix in marker_dbs.iteritems():
# get all gene sequences
genome_path = str(path)
tophit_path = genome_path.replace(ConfigMetadata.PROTEIN_FILE_SUFFIX, marker_suffix)
# we load the list of all the genes detected in the genome
protein_file = tophit_path.replace(
marker_suffix, self.protein_file_suffix)
all_genes_dict = read_fasta(protein_file, False)
# we store the tophit file line by line and store the
# information in a dictionary
with open(tophit_path) as tp:
# first line is header line
tp.readline()
gene_dict = {}
for line_tp in tp:
linelist = line_tp.split("\t")
genename = linelist[0]
sublist = linelist[1]
if ";" in sublist:
diff_markers = sublist.split(";")
else:
diff_markers = [sublist]
----------
genome_file : str
Fasta file containing genomic sequences.
Returns
-------
str
Unique identifier of genome.
dict : d[kmer] -> count
Occurrence of each kmer.
"""
genome_id = ntpath.basename(genome_file)
genome_id = os.path.splitext(genome_id)[0]
seqs = seq_io.read_fasta(genome_file)
kmer_usage = self.signatures.counts(seqs)
return (genome_id, kmer_usage)
gene_file : str
Fasta file containing amino acid sequences.
Returns
-------
str
Unique identifier of genome.
dict : dict[aa] -> count
Occurrence of each amino acid.
"""
genome_id = ntpath.basename(gene_file)
genome_id = genome_id.replace('.genes.faa', '')
genome_id = os.path.splitext(genome_id)[0]
seqs = seq_io.read_fasta(gene_file)
aa_usage = self.amino_acid_usage(seqs)
return [genome_id, aa_usage]
def _producer(self, gene_file):
    """Compute dinucleotide usage for the genes of a single genome.

    The genome identifier is derived from the file name: the directory
    part is dropped, the '.genes.fna' marker is removed, and any
    remaining extension is stripped.

    Parameters
    ----------
    gene_file : str
        Fasta file containing gene sequences (presumably nucleotide
        sequences, given the '.genes.fna' naming -- confirm with caller).

    Returns
    -------
    bool
        Always True. The sequences and genome identifier are passed to
        self.dinucleotide_usage, whose return value is ignored here.
    """
    # ntpath is used so that both '/' and '\\' separated paths are handled.
    file_name = ntpath.basename(gene_file).replace('.genes.fna', '')
    genome_id, _ext = os.path.splitext(file_name)

    gene_seqs = seq_io.read_fasta(gene_file)
    self.dinucleotide_usage(gene_seqs, genome_id)
    return True
Returns
-------
str
Unique identifier of genome.
dict : d[codon] -> count
Occurrence of each codon.
dict : d[codon] -> length
Average length of genes for a given stop codon.
"""
genome_id = ntpath.basename(gene_file)
genome_id = genome_id.replace('.genes.fna', '')
genome_id = os.path.splitext(genome_id)[0]
seqs = seq_io.read_fasta(gene_file)
codon_usage, gene_length = self.codon_usage(seqs)
return (genome_id, codon_usage, gene_length)
def run(self, msa, mask, outf):
    """Write a masked copy of a multiple sequence alignment.

    Only the alignment columns whose position in the mask holds the
    character '1' are kept. Each sequence is written to the output file
    as a two-line fasta record (">id" followed by the trimmed sequence).

    Parameters
    ----------
    msa : str
        Fasta file with the aligned sequences; loaded with read_fasta
        (assumed to return a mapping of sequence id -> sequence string).
    mask : str
        File whose first line is the mask; every '1' marks a column to
        keep. Any other character (including the trailing newline)
        drops that column.
    outf : str
        Path of the fasta file to write.

    Raises
    ------
    IndexError
        If a sequence is shorter than the position of a kept column.
    """
    # Read both inputs before touching the output so that a failure
    # here does not leave behind an empty or truncated output file.
    dict_genomes = read_fasta(msa, False)
    with open(mask, 'r') as f:
        maskstr = f.readline()
    print(maskstr)
    print(len(maskstr))

    # The kept columns depend only on the mask, so compute them once
    # instead of re-scanning the mask for every sequence.
    kept_columns = [i for i, flag in enumerate(maskstr) if flag == '1']

    # The context manager guarantees the file is closed even if writing
    # fails. dict.items() works on both Python 2 and 3, unlike the
    # Python-2-only dict.iteritems() method.
    with open(outf, 'w') as outfwriter:
        for k, v in dict_genomes.items():
            aligned_seq = ''.join([v[i] for i in kept_columns])
            outfwriter.write(">%s\n%s\n" % (k, aligned_seq))