:param mode: str, analysis type (ANIb or ANIblastall)
:param logger: a logger for messages
Returns the following pandas dataframes in an ANIResults object;
query sequences are rows, subject sequences are columns:
- alignment_lengths - non-symmetrical: total length of alignment
- percentage_identity - non-symmetrical: ANIb (Goris) percentage identity
- alignment_coverage - non-symmetrical: coverage of query
- similarity_errors - non-symmetrical: count of similarity errors
May raise a ZeroDivisionError if one or more BLAST runs failed, or a
very distant sequence was included in the analysis.
"""
# Process directory to identify input files
blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab")
# Hold data in ANIResults object
    results = ANIResults(list(org_lengths), mode)
    # Fill diagonal NA values for alignment_length with org_lengths
    for org, length in org_lengths.items():
        results.alignment_lengths[org][org] = length
    # Process .blast_tab files, assuming the filename format holds:
    #     org1_vs_org2.blast_tab
for blastfile in blastfiles:
qname, sname = blastfile.stem.split("_vs_")
        # We may have BLAST files from other analyses in the same directory;
        # if so, log a warning and skip the file
        if qname not in org_lengths:
            if logger:
                logger.warning(
                    "Query name %s not in input sequence list, skipping %s",
                    qname,
                    blastfile,
                )
            continue
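
# --- Illustrative sketch (not part of pyani): the "<query>_vs_<subject>"
# filename convention assumed above means both sequence names can be
# recovered from the file stem alone (sequence names here are invented).
from pathlib import Path

example = Path("NC_002696_vs_NC_011916.blast_tab")
qname, sname = example.stem.split("_vs_")
assert (qname, sname) == ("NC_002696", "NC_011916")
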
def subcmd_index(args: Namespace) -> int:
"""Generate a file with the MD5 hash for each genome in an input directory.
    :param args: Namespace, received command-line arguments

    Identify the genome files in the input directory, and generate a single
    MD5 hash file for each, so that a genome file such as genome.fna gains a
    matching genome.md5. Genome files (FASTA) are identified from the file
    extension.
"""
logger = logging.getLogger(__name__)
# Get list of FASTA files in the input directory
logger.info("Scanning directory %s for FASTA files", args.indir)
fpaths = pyani_files.get_fasta_paths(args.indir)
logger.info("Found FASTA files:")
    logger.info("\n".join([f"\t{fpath}" for fpath in fpaths]))
# Lists of class/label information
classes = []
labels = []
# Create MD5 hash for each file, if needed
for fpath in fpaths:
hashfname = fpath.with_suffix(".md5")
if hashfname.is_file():
logger.info("%s already indexed (using existing hash)", fpath)
with open(hashfname, "r") as ifh:
datahash = ifh.readline().split()[0]
        else:
            # Write an .md5 hash file; the hash-creation helper shown here
            # follows pyani's download module (the exact helper may differ)
            datahash = download.create_hash(fpath)
            logger.info("Writing hash to %s", hashfname)
            with open(hashfname, "w") as ofh:
                ofh.write(f"{datahash}\t{fpath}\n")
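
# --- Illustrative sketch (not part of pyani): a self-contained way to
# compute the MD5 digest written to the .md5 file, using only hashlib.
import hashlib
from pathlib import Path

def file_md5(fpath: Path, blocksize: int = 65536) -> str:
    """Return the MD5 hex digest of the file contents at fpath."""
    md5 = hashlib.md5()
    with open(fpath, "rb") as ifh:
        for chunk in iter(lambda: ifh.read(blocksize), b""):
            md5.update(chunk)
    return md5.hexdigest()
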
:param org_lengths: dictionary of total sequence lengths, keyed by sequence
Returns the following pandas dataframes in an ANIResults object;
query sequences are rows, subject sequences are columns:
- alignment_lengths - symmetrical: total length of alignment
- percentage_identity - symmetrical: percentage identity of alignment
- alignment_coverage - non-symmetrical: coverage of query and subject
- similarity_errors - symmetrical: count of similarity errors
May raise a ZeroDivisionError if one or more NUCmer runs failed, or a
very distant sequence was included in the analysis.
"""
# Process directory to identify input files - as of v0.2.4 we use the
# .filter files that result from delta-filter (1:1 alignments)
deltafiles = pyani_files.get_input_files(delta_dir, ".filter")
# Hold data in ANIResults object
    results = ANIResults(list(org_lengths), "ANIm")
    # Fill diagonal NA values for alignment_length with org_lengths
    for org, length in org_lengths.items():
        results.alignment_lengths[org][org] = length
# Process .delta files assuming that the filename format holds:
# org1_vs_org2.delta
for deltafile in deltafiles:
qname, sname = deltafile.stem.split("_vs_")
        # We may have .delta files from other analyses in the same directory;
        # if so, log a warning and skip the .delta file
        if qname not in org_lengths:
            if logger:
                logger.warning(
                    "Query name %s not in input sequence list, skipping %s",
                    qname,
                    deltafile,
                )
            continue
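
# --- Illustrative sketch (not part of pyani): the square dataframe layout
# assumed by ANIResults, with the alignment-length diagonal pre-filled from
# genome lengths (organism names and lengths here are invented).
import pandas as pd

orgs = ["org1", "org2", "org3"]
example_lengths = {"org1": 4_000_000, "org2": 3_900_000, "org3": 4_100_000}
alignment_lengths = pd.DataFrame(index=orgs, columns=orgs, dtype=float)
for org, length in example_lengths.items():
    alignment_lengths.loc[org, org] = length
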
def get_fasta_files(dirname: Path = Path(".")) -> Iterable:
"""Return iterable of FASTA files in the passed directory.
:param dirname: str, path to input directory
"""
infiles = pyani_files.get_input_files(
dirname, ".fasta", ".fas", ".fa", ".fna", ".fsa_nt"
)
return infiles
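
# --- Illustrative sketch (not part of pyani): an equivalent pure-pathlib
# version of get_fasta_files, matching the same extension list, with no
# dependency on pyani_files.
from pathlib import Path

def find_fasta_files(dirname: Path = Path(".")) -> list:
    """Return sorted FASTA-like files directly under dirname."""
    exts = {".fasta", ".fas", ".fa", ".fna", ".fsa_nt"}
    return sorted(p for p in dirname.iterdir() if p.suffix in exts)
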
# Ensure argument validity and get method function/config
test_class_label_paths(args, logger)
test_scheduler(args, logger)
method_function, method_config = get_method(args, logger)
make_outdirs(args)
# Skip calculations (or not) depending on rerender option
if args.rerender:
logger.warning(
"--rerender option used. Producing graphics with no new recalculations"
)
else:
# Run ANI comparisons
logger.info("Identifying FASTA files in %s", args.indirname)
infiles = pyani_files.get_fasta_files(args.indirname)
logger.info("Input files:\n\t%s", "\n\t".join([str(_) for _ in infiles]))
# Are we subsampling? If so, make the selection here
if args.subsample:
infiles = subsample_input(args, logger, infiles)
logger.info(
"Sampled input files:\n\t%s", "\n\t".join([str(_) for _ in infiles])
)
# Get lengths of input sequences
logger.info("Processing input sequence lengths")
org_lengths = pyani_files.get_sequence_lengths(infiles)
    seqlens = os.linesep.join(
        ["\t%s: %d" % (k, v) for k, v in org_lengths.items()]
    )
logger.info("Sequence lengths:\n%s", seqlens)
name = args.name
# Add info for this analysis to the database
logger.info("Adding analysis information to database %s", args.dbpath)
    run_id = pyani_db.add_run(
        args.dbpath, "ANIm", args.cmdline, start_time, "started", name
    )
logger.info("Current analysis has ID %s in this database", run_id)
# Identify input files for comparison, and populate the database
logger.info("Identifying input genome/hash files:")
infiles = pyani_files.get_fasta_and_hash_paths(args.indir)
# Get hash string and sequence description for each FASTA/hash pair,
# and add info to the current database
for fastafile, hashfile in infiles:
# Get genome data
inhash, filecheck = pyani_files.read_hash_string(hashfile)
indesc = pyani_files.read_fasta_description(fastafile)
abspath = os.path.abspath(fastafile)
genome_len = pyani_tools.get_genome_length(abspath)
outstr = ["FASTA file:\t%s" % abspath,
"description:\t%s" % indesc,
"hash file:\t%s" % hashfile,
"MD5 hash:\t%s" % inhash,
"Total length:\t%d" % genome_len]
logger.info('\t' + '\n\t'.join(outstr))
# Attempt to add current genome/path combination to database
logger.info("Adding genome data to database...")
try:
            genome_id = pyani_db.add_genome(
                args.dbpath, inhash, abspath, genome_len, indesc
            )
        except sqlite3.IntegrityError:  # genome data already in database
            # Assumed recovery step: warn, then reuse the existing record
            # (the exact pyani_db lookup call may differ)
            logger.warning("Genome already in database with this hash and path")
            genome_id = pyani_db.get_genome(args.dbpath, inhash, abspath)[0][0]
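
# --- Illustrative sketch (not part of pyani): the "insert or reuse" pattern
# relied on above. A UNIQUE constraint makes a duplicate INSERT raise
# sqlite3.IntegrityError, which the caller catches to fall back to the
# existing row (table name and values here are invented).
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE genomes (hash TEXT UNIQUE, path TEXT)")
conn.execute("INSERT INTO genomes VALUES (?, ?)", ("abc123", "/tmp/g.fna"))
try:
    conn.execute("INSERT INTO genomes VALUES (?, ?)", ("abc123", "/tmp/g.fna"))
except sqlite3.IntegrityError:
    row_id = conn.execute(
        "SELECT rowid FROM genomes WHERE hash = ?", ("abc123",)
    ).fetchone()[0]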