Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'The sample map should be tab separated with two columns. '
'The first column is the sample ID, and the second column '
'is the gVCF path.\n'
'WARNING: the sample names in the gVCFs will be overwritten',
required=True)
# Remaining command-line options for the gVCF combiner driver. `parser` is
# created before this point (presumably an argparse.ArgumentParser -- confirm).
parser.add_argument('--tmp-path', help='path to folder for temp output (can be a cloud bucket)',
                    default='/tmp')
parser.add_argument('--out-file', '-o', help='path to final combiner output', required=True)
# Adjacent string literals are concatenated verbatim, so the first piece must
# end with a space or the help text reads "gVCFs(must be ...".
parser.add_argument('--json', help='json to use for the import of the gVCFs '
                                   '(must be filesystem local)', required=True)
parser.add_argument('--header', help='external header, must be cloud based', required=False)
args = parser.parse_args()

hl.init(default_reference=DEFAULT_REF,
        log='/hail-joint-caller-' + time.strftime('%Y%m%d-%H%M') + '.log')

# Decode the import intervals from the local JSON file as an array of
# locus-keyed intervals.
# NOTE(review): 'GRCh38' is hard-coded here while hl.init uses DEFAULT_REF --
# confirm the two are meant to agree.
with open(args.json) as j:
    ty = hl.tarray(hl.tinterval(hl.tstruct(locus=hl.tlocus(reference_genome='GRCh38'))))
    intervals = ty._from_json(j.read())

# Each sample-map row is split on tabs. Blank lines (e.g. a trailing newline
# at the end of the file) are skipped so they do not yield a bogus [''] row.
with open(args.sample_map) as m:
    samples = [line.strip().split('\t') for line in m if line.strip()]

run_combiner(samples, intervals, args.out_file, args.tmp_path, args.header, overwrite=True)
elif isinstance(t, hl.tlocus):
contig_counts = append_agg(c, hl.agg.filter(hl.is_defined(expr), hl.agg.counter(expr.contig)))
d['contig counts'] = lambda results: format(index_with_path(results, contig_counts))
elif isinstance(t, (hl.tset, hl.tdict, hl.tarray)):
size = append_agg(c, hl.agg.stats(hl.len(expr)))
d['minimum size'] = lambda results: format(map_int(results[size]['min']))
d['maximum size'] = lambda results: format(map_int(results[size]['max']))
d['mean size'] = lambda results: format(results[size]['mean'])
to_print.append((context, d))
if isinstance(t, hl.ttuple):
for i in range(len(expr)):
recur_expr(expr[i], f'{context}[{i}]', path, c)
if isinstance(t, hl.tstruct):
for k, v in expr.items():
recur_expr(v, f'{context}[{repr(k)}]', path, c)
if isinstance(t, (hl.tset, hl.tarray)):
def explode_f(x):
c2 = Computations()
new_path = path + (c.n,)
recur_expr(x, f'{context}[]', new_path, c2)
return c2.result()
append_agg(c, hl.agg.explode(explode_f, expr))
if isinstance(t, hl.tdict):
def explode_f(x):
c2 = Computations()
new_path = path + (c.n,)
recur_expr(x[0], f'{context}[]', new_path, c2)
recur_expr(x[1], f'{context}[]', new_path, c2)
return c2.result()
append_agg(c, hl.agg.explode(explode_f, hl.array(expr)))
def __init__(self, path, intervals, filter_intervals):
    """Store the path and the optional interval restriction.

    :param path: stored unchanged on ``self.path``.
    :param intervals: ``None``, or a collection of intervals whose imputed
        Hail type is an array of ``tinterval``. Intervals whose point type
        is not a struct are re-wrapped so that each endpoint becomes a
        one-field struct.
    :param filter_intervals: stored unchanged on ``self.filter_intervals``.
    :raises TypeError: if the imputed type of ``intervals`` is not an array
        of intervals.

    Note: ``self._interval_type`` is only assigned when ``intervals`` is
    not ``None``.
    """
    if intervals is not None:
        t = hl.expr.impute_type(intervals)
        # Reject when EITHER condition fails. With `and`, an array of
        # non-intervals slipped through, and a non-array went on to read
        # `t.element_type` instead of raising the intended TypeError.
        if not isinstance(t, hl.tarray) or not isinstance(t.element_type, hl.tinterval):
            raise TypeError("'intervals' must be an array of tintervals")
        pt = t.element_type.point_type
        if isinstance(pt, hl.tstruct):
            self._interval_type = t
        else:
            # Non-struct points are wrapped in a single-field struct.
            # NOTE(review): if this method sits in a class body, Python's
            # private-name mangling rewrites the `__point` keyword; it is
            # kept exactly as written so the field name does not change.
            self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))
    self.path = path
    self.filter_intervals = filter_intervals
    if intervals is not None and t != self._interval_type:
        # The type was wrapped above, so wrap every endpoint value to match.
        self.intervals = [hl.Interval(hl.Struct(__point=i.start),
                                      hl.Struct(__point=i.end),
                                      i.includes_start,
                                      i.includes_end) for i in intervals]
    else:
        self.intervals = intervals
The ``includes_start`` and ``includes_end`` keys must be ``True``. The
``contig`` fields must be the same.
One difference between :func:`.import_gvcfs` and :func:`.import_vcf` is that
:func:`.import_gvcfs` only keys the resulting matrix tables by ``locus``
rather than ``locus, alleles``.
"""
rg = reference_genome.name if reference_genome else None
global _cached_importvcfs
if _cached_importvcfs is None:
_cached_importvcfs = Env.hail().io.vcf.ImportVCFs
if partitions is not None:
partitions, partitions_type = hl.utils._dumps_partitions(partitions, hl.tstruct(locus=hl.tlocus(rg), alleles=hl.tarray(hl.tstr)))
else:
partitions_type = None
vector_ref_s = _cached_importvcfs.pyApply(
wrap_to_list(path),
wrap_to_list(call_fields),
entry_float_type._parsable_string(),
rg,
contig_recoding,
array_elements_required,
skip_invalid_loci,
force_bgz,
force,
partitions, partitions_type._parsable_string(),
filter,
find_replace[0] if find_replace is not None else None,
def array_floating_point_divide(arg_type, ret_type):
    """Register the three array forms of the ``"/"`` operator.

    For operands of ``arg_type`` the registered signatures are
    scalar / array, array / scalar and array / array, each returning an
    array of ``ret_type``.
    """
    operand_signatures = (
        (arg_type, hl.tarray(arg_type)),
        (hl.tarray(arg_type), arg_type),
        (hl.tarray(arg_type), hl.tarray(arg_type)),
    )
    for operand_types in operand_signatures:
        register_function("/", operand_types, hl.tarray(ret_type))


# int32 operands divide to float32 results.
array_floating_point_divide(hl.tint32, hl.tfloat32)
(child_typ.row_key_type
._insert_fields(**{f: child_typ.row_type[f] for f in pass_through})
._concat(poisreg_type)),
child_typ.row_key)
elif name == 'Skat':
key_field = self.config['keyField']
key_type = child_typ.row_type[key_field]
skat_type = hl.dtype(f'struct{{id:{key_type},size:int32,q_stat:float64,p_value:float64,fault:int32}}')
self._type = hl.ttable(
hl.tstruct(),
skat_type,
['id'])
elif name == 'PCA':
self._type = hl.ttable(
hl.tstruct(eigenvalues=hl.tarray(hl.tfloat64),
scores=hl.tarray(child_typ.col_key_type._insert_field('scores', hl.tarray(hl.tfloat64)))),
child_typ.row_key_type._insert_field('loadings', dtype('array')),
child_typ.row_key)
elif name == 'IBD':
ibd_info_type = hl.tstruct(Z0=hl.tfloat64, Z1=hl.tfloat64, Z2=hl.tfloat64, PI_HAT=hl.tfloat64)
ibd_type = hl.tstruct(i=hl.tstr,
j=hl.tstr,
ibd=ibd_info_type,
ibs0=hl.tint64,
ibs1=hl.tint64,
ibs2=hl.tint64)
self._type = hl.ttable(
hl.tstruct(),
ibd_type,
['i', 'j'])
else:
assert name == 'LocalLDPrune', name
def _compute_type(self):
    """Compute this node's ``hl.tmatrix`` type and store it on ``self._type``.

    The result copies the child's global, column, row-key and entry types
    and adds one row field named ``self.root``. That field holds the
    table's value type, wrapped in an array when ``self.product`` is truthy.
    """
    child_typ = self.child.typ
    annotation_type = (
        hl.tarray(self.table.typ.value_type)
        if self.product
        else self.table.typ.value_type
    )
    annotated_row_type = child_typ.row_type._insert_field(self.root, annotation_type)
    self._type = hl.tmatrix(
        child_typ.global_type,
        child_typ.col_type,
        child_typ.col_key,
        annotated_row_type,
        child_typ.row_key,
        child_typ.entry_type,
    )
def __init__(self, path, intervals, filter_intervals):
    """Store the path and the optional interval restriction.

    :param path: stored unchanged on ``self.path``.
    :param intervals: ``None``, or a collection of intervals whose imputed
        Hail type is an array of ``tinterval``. Intervals whose point type
        is not a struct are re-wrapped so that each endpoint becomes a
        one-field struct.
    :param filter_intervals: stored unchanged on ``self.filter_intervals``.
    :raises TypeError: if the imputed type of ``intervals`` is not an array
        of intervals.

    Note: ``self._interval_type`` is only assigned when ``intervals`` is
    not ``None``.
    """
    if intervals is not None:
        t = hl.expr.impute_type(intervals)
        # Reject when EITHER condition fails. With `and`, an array of
        # non-intervals slipped through, and a non-array went on to read
        # `t.element_type` instead of raising the intended TypeError.
        if not isinstance(t, hl.tarray) or not isinstance(t.element_type, hl.tinterval):
            raise TypeError("'intervals' must be an array of tintervals")
        pt = t.element_type.point_type
        if isinstance(pt, hl.tstruct):
            self._interval_type = t
        else:
            # Non-struct points are wrapped in a single-field struct.
            # NOTE(review): if this method sits in a class body, Python's
            # private-name mangling rewrites the `__point` keyword; it is
            # kept exactly as written so the field name does not change.
            self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))
    self.path = path
    self.filter_intervals = filter_intervals
    if intervals is not None and t != self._interval_type:
        # The type was wrapped above, so wrap every endpoint value to match.
        self.intervals = [hl.Interval(hl.Struct(__point=i.start),
                                      hl.Struct(__point=i.end),
                                      i.includes_start,
                                      i.includes_end) for i in intervals]
    else:
        self.intervals = intervals
"""
Returns phased genotype calls in the non-PAR region of Y (requires both father and proband to be haploid to return phase)
:param CallExpression proband_call: Input proband genotype call
:param CallExpression father_call: Input father genotype call
:return: Array containing: phased proband call, phased father call, phased mother call
:rtype: ArrayExpression
"""
return hl.cond(
proband_call.is_haploid() & father_call.is_haploid() & (father_call[0] == proband_call[0]),
hl.array([
hl.call(proband_call[0], phased=True),
hl.call(father_call[0], phased=True),
hl.null(hl.tcall)
]),
hl.null(hl.tarray(hl.tcall))
)