Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _topHit(self, tigrfam_file):
"""Determine top hits to TIGRFAMs.
A gene is assigned to a single TIGRFAM
family. This will be the top hit among
all TIGRFAM HMMs and pass the threshold
for the HMM.
Parameters
----------
tigrfam_file : str
Name of file containing hits to TIGRFAM HMMs.
"""
assembly_dir, filename = os.path.split(tigrfam_file)
genome_id = filename.replace(self.tigrfam_suffix, '')
tophit_file = TopHitTigrFile(self.output_dir, genome_id)
# Populate the top-hit file.
with open(tigrfam_file, 'r') as fh_tigrfam:
for line in fh_tigrfam:
if line[0] == '#':
continue
line_split = line.split()
gene_id = line_split[0]
hmm_id = line_split[3]
evalue = float(line_split[4])
bitscore = float(line_split[5])
tophit_file.add_hit(gene_id, hmm_id, evalue, bitscore)
# Write the top-hit file to disk and calculate checksum.
tophit_file.write()
while True:
gene_file = queueIn.get(block=True, timeout=None)
if gene_file is None:
break
assembly_dir, filename = os.path.split(gene_file)
genome_id = filename.replace(self.protein_file_suffix, '')
genome_dir = os.path.join(self.output_dir, genome_id)
output_hit_file = os.path.join(genome_dir, filename.replace(self.protein_file_suffix,
self.tigrfam_suffix))
hmmsearch_out = os.path.join(genome_dir, filename.replace(self.protein_file_suffix,
'_tigrfam.out'))
# Check if this has already been processed.
out_files = (output_hit_file, hmmsearch_out, TopHitTigrFile.get_path(self.output_dir, genome_id))
if all([file_has_checksum(x) for x in out_files]):
self.warnings.info(f'Skipped TIGRFAM processing for: {genome_id}')
with n_skipped.get_lock():
n_skipped.value += 1
else:
cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (hmmsearch_out,
output_hit_file,
self.cpus_per_genome,
self.tigrfam_hmms,
gene_file)
os.system(cmd)
# calculate checksum
for out_file in [output_hit_file, hmmsearch_out]:
checksum = sha256(out_file)
def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
"""Report statistics for identified marker genes."""
# Summarise the copy number of each AR122 and BAC120 markers.
tln_summary_file = TlnTableSummaryFile(outdir, prefix)
ar122_copy_number_file = CopyNumberFileAR122(outdir, prefix)
bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)
# Process each genome.
for db_genome_id, info in sorted(gene_dict.items()):
cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
pfam_tophit_file.read()
tigr_tophit_file.read()
# Summarise each of the markers for this genome.
ar122_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
pfam_tophit_file, tigr_tophit_file)
bac120_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
pfam_tophit_file, tigr_tophit_file)
# Write the best translation table to disk for this genome.
tln_summary_file.add_genome(db_genome_id, info.get("best_translation_table"))
# Write each of the summary files to disk.
ar122_copy_number_file.write()
bac120_copy_number_file.write()
tln_summary_file.write()
def _run_multi_align(self, db_genome_id, path, marker_set_id):
"""
Returns the concatenated marker sequence for a specific genome
:param db_genome_id: Selected genome
:param path: Path to the genomic fasta file for the genome
:param marker_set_id: Unique ID of marker set to use for alignment
"""
cur_marker_dir = os.path.dirname(os.path.dirname(path))
pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
pfam_tophit_file.read()
tigr_tophit_file.read()
if marker_set_id == 'bac120':
copy_number_file = CopyNumberFileBAC120('/dev/null', None)
elif marker_set_id == 'ar122':
copy_number_file = CopyNumberFileAR122('/dev/null', None)
else:
raise GTDBTkException('Unknown marker set.')
copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file, tigr_tophit_file)
single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)
# gather information for all marker genes
marker_paths = {"PFAM": os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
"TIGRFAM": os.path.join(os.path.dirname(self.tigrfam_hmm_dir), 'individual_hmms')}