def run(items, background=None):
"""Detect copy number variations from batched set of samples using cn.mops.
"""
if not background: background = []
names = [tz.get_in(["rgnames", "sample"], x) for x in items + background]
work_bams = [x["align_bam"] for x in items + background]
if len(items + background) < 2:
raise ValueError("cn.mops only works on batches with multiple samples")
data = items[0]
work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", names[0],
"cn_mops"))
parallel = {"type": "local", "cores": data["config"]["algorithm"].get("num_cores", 1),
"progs": ["delly"]}
with pysam.Samfile(work_bams[0], "rb") as pysam_work_bam:
chroms = [None] if _get_regional_bed_file(items[0]) else pysam_work_bam.references
out_files = run_multicore(_run_on_chrom, [(chrom, work_bams, names, work_dir, items)
for chrom in chroms],
data["config"], parallel)
out_file = _combine_out_files(out_files, work_dir, data)
out = []
    for data in items:
        # Assumed completion of the truncated loop: attach the combined cn.mops
        # calls to each sample's structural variant results and return the samples.
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "cn_mops", "vrn_file": out_file})
        out.append(data)
    return out
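
# Hedged usage sketch (not from the source): the per-sample dictionary shape run()
# assumes -- "rgnames"/"sample" for the sample name and "align_bam" for the aligned
# BAM. The sample names and paths below are invented placeholders.
import toolz as tz

example_items = [{"rgnames": {"sample": "tumor1"}, "align_bam": "/path/to/tumor1.bam",
                  "dirs": {"work": "/path/to/work"},
                  "config": {"algorithm": {"num_cores": 4}}},
                 {"rgnames": {"sample": "normal1"}, "align_bam": "/path/to/normal1.bam",
                  "dirs": {"work": "/path/to/work"},
                  "config": {"algorithm": {"num_cores": 4}}}]
names = [tz.get_in(["rgnames", "sample"], x) for x in example_items]
assert names == ["tumor1", "normal1"]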
def is_human(data, builds=None):
"""Check if human, optionally with build number, search by name or extra GL contigs.
"""
def has_build37_contigs(data):
for contig in ref.file_contigs(dd.get_ref_file(data)):
if contig.name.startswith("GL") or contig.name.find("_gl") >= 0:
if contig.name in naming.GMAP["hg19"] or contig.name in naming.GMAP["GRCh37"]:
return True
return False
if not builds and tz.get_in(["genome_resources", "aliases", "human"], data):
return True
if not builds or "37" in builds:
target_builds = ["hg19", "GRCh37"]
if any([dd.get_genome_build(data).startswith(b) for b in target_builds]):
return True
elif has_build37_contigs(data):
return True
if not builds or "38" in builds:
target_builds = ["hg38"]
if any([dd.get_genome_build(data).startswith(b) for b in target_builds]):
return True
return False
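
# Hedged illustration (invented sample records) of the two cheap paths through
# is_human(): the genome_resources "human" alias and the genome build prefix check.
# dd.get_genome_build() is assumed here to read a top-level "genome_build" key.
import toolz as tz

grch37_sample = {"genome_build": "GRCh37",
                 "genome_resources": {"aliases": {"human": True}}}
mouse_sample = {"genome_build": "mm10", "genome_resources": {"aliases": {}}}
assert tz.get_in(["genome_resources", "aliases", "human"], grch37_sample)
assert not tz.get_in(["genome_resources", "aliases", "human"], mouse_sample)
assert any(grch37_sample["genome_build"].startswith(b) for b in ["hg19", "GRCh37"])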
def _compare_dicts(self, orig, new, ns):
out = {}
for key, val in new.items():
nskey = ns + [key]
orig_val = tz.get_in([key], orig)
if isinstance(val, dict) and isinstance(orig_val, dict):
for nkey, nval in self._compare_dicts(orig_val or {}, val or {}, nskey).items():
out = self._merge(out, {nkey: nval})
elif val != orig_val:
out = tz.update_in(out, nskey, lambda x: copy.deepcopy(val))
return out
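
# Hedged illustration (invented configs, not from the source) of how tz.update_in
# builds the nested "changed values" structure _compare_dicts() returns: update_in
# creates the intermediate dictionaries along the namespace path when missing.
import copy
import toolz as tz

orig = {"algorithm": {"aligner": "bwa", "coverage_depth_max": 10000}}
new = {"algorithm": {"aligner": "bwa", "coverage_depth_max": 250}}
changed = tz.update_in({}, ["algorithm", "coverage_depth_max"],
                       lambda _: copy.deepcopy(new["algorithm"]["coverage_depth_max"]))
assert changed == {"algorithm": {"coverage_depth_max": 250}}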
def vc_output_record(samples):
"""Prepare output record from variant calling to feed into downstream analysis.
Prep work handles reformatting so we return generated dictionaries.
    For any shared keys that are calculated only once per batch, like the batch's
    variant calls, we assign the shared value to every sample.
"""
shared_keys = [["vrn_file"], ["validate", "summary"],
["validate", "tp"], ["validate", "fp"], ["validate", "fn"]]
raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples])
shared = {}
for key in shared_keys:
cur = list(set([x for x in [tz.get_in(key, d) for d in raw] if x]))
if len(cur) > 0:
assert len(cur) == 1, (key, cur)
shared[tuple(key)] = cur[0]
else:
shared[tuple(key)] = None
out = []
for d in raw:
for key, val in shared.items():
d = tz.update_in(d, key, lambda x: val)
out.append([d])
return out
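
# Hedged toy example (invented records) of the shared-key propagation performed by
# vc_output_record(): a batch-level value present on only one record is assigned to
# every sample via tz.update_in.
import toolz as tz

records = [{"description": "s1"}, {"description": "s2", "vrn_file": "batch.vcf.gz"}]
shared = [x for x in (tz.get_in(["vrn_file"], d) for d in records) if x][0]
records = [tz.update_in(d, ["vrn_file"], lambda _: shared) for d in records]
assert all(d["vrn_file"] == "batch.vcf.gz" for d in records)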
def _configured_genders(items):
return set([str(tz.get_in(["metadata", "sex"], data, "")).lower() for data in items])
def _get_stats_files(data, out_dir=None):
"""Retrieve stats files from pre-existing dictionary or filesystem.
"""
if not out_dir:
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
"qc", dd.get_sample_name(data), "samtools"))
stats_file = tz.get_in(["depth", "samtools", "stats"], data)
idxstats_file = tz.get_in(["depth", "samtools", "idxstats"], data)
if not stats_file:
stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
if not idxstats_file:
idxstats_file = os.path.join(out_dir, "%s-idxstats.txt" % dd.get_sample_name(data))
return stats_file, idxstats_file
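
# Hedged sketch of the fallback naming convention above, with literal values standing
# in for dd.get_work_dir()/dd.get_sample_name(); the paths and sample name are invented.
import os

work_dir, sample = "/path/to/work", "NA12878"
out_dir = os.path.join(work_dir, "qc", sample, "samtools")
stats_file = os.path.join(out_dir, "%s.txt" % sample)
idxstats_file = os.path.join(out_dir, "%s-idxstats.txt" % sample)
assert stats_file.endswith("samtools/NA12878.txt")
assert idxstats_file.endswith("samtools/NA12878-idxstats.txt")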
del cur_data["align_split"]
for x in mgroup[1:]:
cur_data["combine"][file_key]["extras"].append(x[file_key])
ready_merge.append([cur_data])
cur_hla = None
for d in mgroup:
hla_files = tz.get_in(["hla", "fastq"], d)
if hla_files:
if not cur_hla:
cur_hla = {"rgnames": {"sample": dd.get_sample_name(cur_data)},
"config": cur_data["config"], "dirs": cur_data["dirs"],
"hla": {"fastq": []}}
cur_hla["hla"]["fastq"].append(hla_files)
if cur_hla:
hla_merges.append([cur_hla])
if not tz.get_in(["config", "algorithm", "kraken"], data):
# kraken requires fasta filenames from data['files'] as input.
# We don't want to remove those files if kraken qc is required.
_save_fastq_space(samples)
merged = run_parallel("delayed_bam_merge", ready_merge)
hla_merge_raw = run_parallel("merge_split_alignments", hla_merges)
hla_merges = {}
for hla_merge in [x[0] for x in hla_merge_raw]:
hla_merges[dd.get_sample_name(hla_merge)] = tz.get_in(["hla", "fastq"], hla_merge)
# Add stable 'align_bam' target to use for retrieving raw alignment
out = []
for data in [x[0] for x in merged + ready]:
if data.get("work_bam"):
data["align_bam"] = data["work_bam"]
if dd.get_sample_name(data) in hla_merges:
data["hla"]["fastq"] = hla_merges[dd.get_sample_name(data)]
"m3.medium")
tree["Mappings"]["AWSNATAMI"]["us-east-1"]["AMI"] = "ami-184dc970"
resources = tree['Resources']
# We don't need the demo Lustre client instance.
for section in ('ClientInstanceProfile', 'ClientLaunchConfig',
'ClientNodes', 'ClientRole'):
resources.pop(section)
for item in resources['BasePolicy']['Properties']['Roles'][:]:
if item['Ref'] == 'ClientRole':
resources['BasePolicy']['Properties']['Roles'].remove(item)
        for section_name in ['MDS', 'MGS']:
section = resources['{}LaunchConfig'.format(section_name)]
cf_params = toolz.get_in(['Metadata', 'AWS::CloudFormation::Init',
'config', 'files', '/etc/loci.conf',
'content', 'Fn::Join'], section)[1]
index, _ = self._get_index(cf_params, 'OssCount:')
cf_params[index + 1] = oss_count
index, _ = self._get_index(cf_params, 'OstVolumeCount:')
cf_params[index + 1] = ost_vol_count
index, _ = self._get_index(cf_params, 'OstVolumeSize:')
cf_params[index + 1] = ost_vol_size
resources['OSSNodes']['Properties']['DesiredCapacity'] = oss_count
resources['OSSNodes']['Properties']['MaxSize'] = oss_count
resources['OSSNodes']['Properties']['MinSize'] = oss_count
resources['OssWaitCondition']['Properties']['Count'] = oss_count
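
# Hedged mock (not the real ICEL template) showing how toolz.get_in reaches the
# Fn::Join parameter list inside a LaunchConfig's CloudFormation metadata and how
# the value following a "Key:" entry is rewritten, mirroring the loop above.
import toolz

launch_config = {"Metadata": {"AWS::CloudFormation::Init": {"config": {"files": {
    "/etc/loci.conf": {"content": {"Fn::Join": ["", ["OssCount:", "2", "\n"]]}}}}}}
cf_params = toolz.get_in(["Metadata", "AWS::CloudFormation::Init", "config", "files",
                          "/etc/loci.conf", "content", "Fn::Join"], launch_config)[1]
index = cf_params.index("OssCount:")  # stand-in for self._get_index()
cf_params[index + 1] = "4"
assert cf_params == ["OssCount:", "4", "\n"]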
def validate_launch_config_servicenet(lc):
"""
Validate that if CLBs are provided, ServiceNet is also provided.
"""
clb = any([lb.get('type', 'CloudLoadBalancer') == 'CloudLoadBalancer'
for lb in get_in(('args', 'loadBalancers'), lc, default=())])
networks = get_in(('args', 'server', 'networks'), lc, default=None)
if (clb and
networks is not None and
{'uuid': '11111111-1111-1111-1111-111111111111'} not in networks):
raise ValidationError("ServiceNet network must be present if one or "
"more Cloud Load Balancers are configured.")
def _joint_calling(items):
"""Determine if this call feeds downstream into joint calls.
"""
jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), items[0])
if jointcaller:
        assert len(items) == 1, "Can only do GATK joint calling preparation with single samples"
assert tz.get_in(("metadata", "batch"), items[0]) is not None, \
"Joint calling requires batched samples, %s has no metadata batch." % dd.get_sample_name(items[0])
return jointcaller
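
# Hedged sketch (invented sample record) of the configuration shape _joint_calling()
# inspects: a jointcaller in the algorithm config plus a metadata batch assignment.
import toolz as tz

sample = {"config": {"algorithm": {"jointcaller": "gatk-haplotype-joint"}},
          "metadata": {"batch": "b1"},
          "description": "NA12878"}
assert tz.get_in(("config", "algorithm", "jointcaller"), sample) == "gatk-haplotype-joint"
assert tz.get_in(("metadata", "batch"), sample) is not None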