Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
continue
line_split = line.split('\t')
scaffold_id = line_split[0]
info = line_split[8]
if info != '': # this will be empty for non-protein coding genes
gene_id = info.split(';')[0].replace('ID=', '')
gene_number[scaffold_id] += 1
gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])
# write out gene file with modified identifiers
genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))
fout = open(os.path.join(output_dir, genome_id + '.faa'), 'w')
for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
annotation = annotation[annotation.find(' ') + 1:] # remove additional gene id from annotation
annotation += ' [IMG Gene ID: ' + gene_id + ']' # append IMG gene id for future reference
fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
fout.write(seq + '\n')
fout.close()
Directory to store AAI results.
"""
self.sorted_hit_table = sorted_hit_table
self.evalue_threshold = evalue_threshold
self.per_identity_threshold = per_iden_threshold
self.per_aln_len_threshold = per_aln_len_threshold
self.keep_rbhs = keep_rbhs
self.output_dir = output_dir
# calculate length of genes and number of genes in each genome
self.logger.info('Calculating length of genes.')
self.gene_lengths = {}
self.query_gene_count = defaultdict(int)
query_genomes = set()
for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
if seq[-1] == '*':
self.gene_lengths[seq_id] = len(seq) - 1
else:
self.gene_lengths[seq_id] = len(seq)
genome_id = seq_id[0:seq_id.find('~')]
self.query_gene_count[genome_id] += 1
query_genomes.add(genome_id)
self.target_gene_count = defaultdict(int)
target_genomes = set()
if target_gene_file:
for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
if seq[-1] == '*':
self.gene_lengths[seq_id] = len(seq) - 1
else:
self.query_gene_count = defaultdict(int)
query_genomes = set()
for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
if seq[-1] == '*':
self.gene_lengths[seq_id] = len(seq) - 1
else:
self.gene_lengths[seq_id] = len(seq)
genome_id = seq_id[0:seq_id.find('~')]
self.query_gene_count[genome_id] += 1
query_genomes.add(genome_id)
self.target_gene_count = defaultdict(int)
target_genomes = set()
if target_gene_file:
for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
if seq[-1] == '*':
self.gene_lengths[seq_id] = len(seq) - 1
else:
self.gene_lengths[seq_id] = len(seq)
genome_id = seq_id[0:seq_id.find('~')]
self.target_gene_count[genome_id] += 1
target_genomes.add(genome_id)
else:
self.target_gene_count = self.query_gene_count
# get byte offset of hits from each genome
self.logger.info('Indexing sorted hit table.')
self.offset_table = self._genome_offsets(self.sorted_hit_table)
# calculate AAI between each pair of genomes in parallel
----------
gene_files : list of str
Genes in fasta files to modify.
keep_headers : boolean
If True, indicates FASTA headers already have the format ~.
output_file : str
Name of FASTA file to contain modified genes.
"""
fout = open(output_file, 'w')
for gf in gene_files:
genome_id = remove_extension(gf)
if genome_id.endswith('_genes'):
genome_id = genome_id[0:genome_id.rfind('_genes')]
for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
if keep_headers:
fout.write('>' + seq_id + ' ' + annotation + '\n')
else:
fout.write('>' + genome_id + '~' + seq_id + ' ' + annotation + '\n')
fout.write(seq + '\n')
fout.close()
self.blast_dir = blast_dir
self.per_identity_threshold = per_iden_threshold
self.per_aln_len_threshold = per_aln_len_threshold
self.output_dir = output_dir
shared_genes_dir = os.path.join(output_dir, self.shared_genes)
make_sure_path_exists(shared_genes_dir)
self.shared_genes_dir = shared_genes_dir
# calculate length of genes and number of genes each genome
self.logger.info('Calculating length of genes.')
self.gene_lengths = {}
genes_in_genomes = defaultdict(int)
for seq_id, seq in seq_io.read_fasta_seq(os.path.join(self.blast_dir, self.gene_file)):
if seq[-1] == '*':
self.gene_lengths[seq_id] = len(seq) - 1
else:
self.gene_lengths[seq_id] = len(seq)
genome_id = seq_id[0:seq_id.find('~')]
genes_in_genomes[genome_id] += 1
# get byte offset of hits from each genome
self.logger.info('Indexing blast hits.')
self.blast_table = os.path.join(self.blast_dir, self.blast_table_file)
self.offset_table = self._genome_offsets(self.blast_table)
# calculate AAI between each pair of genomes in parallel
self.logger.info('Calculating amino acid identity between all pairs of genomes:')
gene_dir : str
Directory with fasta files containing protein sequences.
output_dir : float
Directory to contain modified fasta files.
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for f in os.listdir(gene_dir):
gf = os.path.join(gene_dir, f)
genome_id = remove_extension(gf)
aa_file = os.path.join(output_dir, genome_id + '.faa')
fout = open(aa_file, 'w')
for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
if seq[-1] == '*':
seq = seq[0:-1]
fout.write(seq + '\n')
fout.close()
genes_to_ignore : set
Genes which should not be written to file.
"""
genes_kept = 0
for genome_id in genome_list:
genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')
if not os.path.exists(genome_gene_file):
print '[WARNING] Missing gene file for genome %s.' % genome_gene_file
continue
if os.stat(genome_gene_file).st_size == 0:
print '[WARNING] Gene file is empty for genome %s.' % genome_gene_file
continue
for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
if gene_id in genes_to_ignore:
continue
# IMG headers sometimes contain non-ascii characters which cause
# problems with BLAST and DIAMOND so there are explicitly filtered out
annotation = filter(lambda x: x in string.printable, annotation)
# a few IMG genomes contain protein sequences which start with a hyphen
if seq[0] == '-':
seq = seq[1:]
gene_out.write('>' + gene_id + ' ' + annotation + '\n')
gene_out.write(seq + '\n')
genes_kept += 1
return genes_kept