def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Convenience function that loads a fasta and its associated gdx/flat file into the fileStore.
    Assumes that the paths are consistent with the requirements (i.e. $path.gdx and $path.flat)
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: Tuple of fileStore IDs for fasta, fasta_gdx, fasta_flat
    """
    fasta_file_id = FileID.forPath(toil.importFile('file://' + fasta_local_path), fasta_local_path)
    gdx_file_id = FileID.forPath(toil.importFile('file://' + fasta_local_path + '.gdx'), fasta_local_path + '.gdx')
    flat_file_id = FileID.forPath(toil.importFile('file://' + fasta_local_path + '.flat'), fasta_local_path + '.flat')
    return fasta_file_id, gdx_file_id, flat_file_id
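# A minimal, hedged usage sketch for write_fasta_to_filestore. The fasta path and
# jobstore location are placeholders; the .gdx/.flat companion files are assumed to
# already exist next to the fasta. Job.Runner.getDefaultOptions is the standard Toil
# way to build a toil_options Namespace outside of CAT's own argument parsing.
from toil.common import Toil
from toil.job import Job

toil_options = Job.Runner.getDefaultOptions('./example_jobstore')
with Toil(toil_options) as t:
    fasta_id, gdx_id, flat_id = write_fasta_to_filestore(t, '/data/genome.fa')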
def validate_import_bam(t, bam_path, fasta_sequences, genome):
    """Validates a BAM against the genome's sequences, then imports the BAM and its .bai index into the fileStore."""
    validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
    return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
            FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]
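# A hedged usage sketch for validate_import_bam. fasta_sequences is assumed here to
# be the set of (sequence name, length) pairs for the genome fasta, which is what
# the BAM header is checked against; the toy sequence set and paths below are
# hypothetical. The BAM must have a .bai index alongside it.
fasta_sequences = {('chr1', 248956422)}
with Toil(toil_options) as t:
    bam_id, bai_id = validate_import_bam(t, '/data/rnaseq.bam', fasta_sequences, 'hg38')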
def augustus_cgp(args, toil_options):
    """
    Main entry function for AugustusCGP toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.hints_db = FileID.forPath(t.importFile('file://' + args.hints_db), args.hints_db)
            if args.cgp_param is not None:
                input_file_ids.cgp_param = FileID.forPath(t.importFile('file://' + args.cgp_param), args.cgp_param)
            else:
                input_file_ids.cgp_param = None
            input_file_ids.gtf = FileID.forPath(t.importFile('file://' + args.gtf), args.gtf)
            input_file_ids.cgp_cfg = FileID.forPath(t.importFile('file://' + args.cgp_cfg), args.cgp_cfg)
            input_file_ids.fasta = {genome: FileID.forPath(t.importFile('file://' + fasta), fasta)
                                    for genome, fasta in args.fasta_files.iteritems()}
            du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='4G')
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='8G', disk=du)
            results, stdout_file_ids, param_file_id = t.start(job)
        else:
            results, stdout_file_ids, param_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.stdout_file)
        with open(args.stdout_file, 'w') as outf, tools.fileOps.TemporaryFilePath() as tmp:
            for (chrom, start, chunksize), stdout_file in stdout_file_ids.iteritems():
                outf.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize))
                t.exportFile(stdout_file, 'file://' + tmp)
                for l in open(tmp):
                    outf.write(l)
        for genome, (raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id) in results.iteritems():
            tools.fileOps.ensure_file_dir(args.augustus_cgp_raw_gtf[genome])
            t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_cgp_raw_gtf[genome])
            # the joined outputs are exported the same way; these per-genome output
            # path attributes are inferred from the raw GTF export pattern above
            t.exportFile(joined_gtf_file_id, 'file://' + args.augustus_cgp_gtf[genome])
            t.exportFile(joined_gp_file_id, 'file://' + args.augustus_cgp_gp[genome])
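# A hedged invocation sketch for augustus_cgp. The attribute names on this Namespace
# mirror exactly what the function reads above; every value is a placeholder, and
# toil_options is reused from the sketch following write_fasta_to_filestore.
cgp_args = argparse.Namespace(
    hal='/data/alignment.hal', query_sizes='/data/query.chrom.sizes',
    hints_db='/data/hints.db', cgp_param=None, gtf='/data/annotation.gtf',
    cgp_cfg='/data/cgp.cfg', fasta_files={'hg38': '/data/hg38.fa'},
    stdout_file='/out/cgp_stdout.txt',
    augustus_cgp_raw_gtf={'hg38': '/out/hg38.cgp.raw.gtf'},
    augustus_cgp_gtf={'hg38': '/out/hg38.cgp.gtf'},
    augustus_cgp_gp={'hg38': '/out/hg38.cgp.gp'})
augustus_cgp(cgp_args, toil_options)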
def chaining(args, toil_options):
    """Entry point for the chaining toil pipeline: builds chain files from the HAL alignment for each target genome."""
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.query_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.query_two_bit = FileID.forPath(t.importFile('file://' + args.query_two_bit),
                                                          args.query_two_bit)
            target_two_bit_file_ids = {genome: FileID.forPath(t.importFile('file://' + f), f)
                                       for genome, f in args.target_two_bits.iteritems()}
            input_file_ids.target_two_bits = target_two_bit_file_ids
            job = Job.wrapJobFn(setup, args, input_file_ids)
            chain_file_ids = t.start(job)
        else:
            chain_file_ids = t.restart()
        for chain_file, chain_file_id in chain_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(chain_file)
            t.exportFile(chain_file_id, 'file://' + chain_file)
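# A hedged invocation sketch for chaining, again with placeholder paths. The keys of
# the returned chain_file_ids dictionary are the on-disk chain paths, so the export
# loop above writes one chain file per target genome.
chain_args = argparse.Namespace(
    hal='/data/alignment.hal', query_sizes='/data/query.chrom.sizes',
    query_two_bit='/data/query.2bit',
    target_two_bits={'mm10': '/data/mm10.2bit'})
chaining(chain_args, toil_options)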
# Excerpt from the hints-database entry function. The enclosing 'with Toil(toil_options) as t:'
# block, the earlier import of RNA-seq BAMs into bam_file_ids, and the construction of
# fasta_sequences are omitted; the restart branch is completed here to match the pattern
# of the other entry functions, as its log message indicates.
if not t.options.restart:
    # load the IsoSeq data, if we have any
    iso_seq_file_ids = []
    if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
        for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
            validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
            iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))
    if hints_args.annotation is None:
        annotation_file_id = None
    else:
        annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation),
                                            hints_args.annotation)
    if hints_args.protein_fasta is None:
        protein_fasta_file_id = genome_fasta_file_id = None
    else:
        protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                               hints_args.protein_fasta)
        genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)
    input_file_ids = {'bams': bam_file_ids,
                      'iso_seq_bams': iso_seq_file_ids,
                      'annotation': annotation_file_id,
                      'protein_fasta': protein_fasta_file_id,
                      'genome_fasta': genome_fasta_file_id}
    if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
        logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))
    disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
    job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
    combined_hints = t.start(job)
else:
    logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
    combined_hints = t.restart()
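# The cfg lookups above imply a nested mapping of hint data type -> genome -> BAM
# paths. A hypothetical minimal shape (only the 'ISO_SEQ_BAM' key is evidenced by
# this excerpt; other hint sources would follow the same pattern):
example_cfg = {
    'ISO_SEQ_BAM': {'hg38': ['/data/isoseq/hg38.flnc.bam']},
}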
def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)
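# A hedged invocation sketch for augustus_pb with placeholder paths; attribute names
# mirror what the function reads above, toil_options is reused from the first sketch,
# and the genome fasta is assumed to have its .gdx/.flat companions (required by
# write_fasta_to_filestore).
pb_args = argparse.Namespace(
    genome_fasta='/data/hg38.fa', chrom_sizes='/data/hg38.chrom.sizes',
    pb_cfg='/data/pb.cfg', hints_gff='/data/isoseq_hints.gff',
    augustus_pb_raw_gtf='/out/hg38.pb.raw.gtf',
    augustus_pb_gtf='/out/hg38.pb.gtf',
    augustus_pb_gp='/out/hg38.pb.gp')
augustus_pb(pb_args, toil_options)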