Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
xpos2=x_position(ds.info.CHR2, ds.info.POS2),
# End 2
end2=ds.info.END2,
xend2=x_position(ds.info.CHR2, ds.info.END2),
# Other
length=ds.info.SVLEN,
type=ds.info.SVTYPE,
alts=ds.alleles[1:],
)
# MULTIALLELIC should not be used as a quality filter in the browser,
# so it is removed from every row's filter set.
non_quality_filters = hl.set(["MULTIALLELIC"])
ds = ds.annotate(filters=ds.filters.difference(non_quality_filters))

# Bundle the per-consequence gene lists into a single struct keyed by the
# lower-cased consequence name. INTERGENIC and NEAREST_TSS are excluded here.
consequence_gene_lists = {
    csq.lower(): ds.info[f"PROTEIN_CODING__{csq}"]
    for csq in PROTEIN_CODING_CONSEQUENCES
    if csq not in ("INTERGENIC", "NEAREST_TSS")
}
ds = ds.annotate(consequences=hl.struct(**consequence_gene_lists))

# The INTERGENIC value is copied to its own top-level field.
ds = ds.annotate(intergenic=ds.info["PROTEIN_CODING__INTERGENIC"])

# Collect the set of all genes for which a variant has a consequence:
# concatenate every consequence's list (a missing list counts as empty),
# then de-duplicate by converting to a set.
no_genes = hl.empty_array(hl.tstr)
all_genes = no_genes
for csq in ds.consequences.dtype.fields:
    genes_for_csq = hl.or_else(ds.consequences[csq.lower()], no_genes)
    all_genes = all_genes.extend(genes_for_csq)
ds = ds.annotate(genes=hl.set(all_genes))
# Group per-population values in a struct for each field
def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
    """
    Build a struct holding ``variant_type`` and ``n_alt_alleles``.

    Despite its name, ``alt_alleles`` is the full allele array: element 0 is
    used as the reference allele and the remaining elements as alternates.
    Alternates equal to ``'*'`` are ignored for both the classification and
    the count.

    ``variant_type`` is "snv"/"multi-snv" when every remaining alternate is a
    SNP relative to the reference, "indel"/"multi-indel" when every one is an
    indel, and "mixed" otherwise; the "multi-" prefix is used when more than
    one alternate remains.
    """
    reference = alt_alleles[0]
    candidates = hl.filter(lambda allele: allele != '*', alt_alleles[1:])
    n_candidates = hl.len(candidates)
    has_several = n_candidates > 1

    every_snp = hl.all(lambda allele: hl.is_snp(reference, allele), candidates)
    every_indel = hl.all(lambda allele: hl.is_indel(reference, allele), candidates)

    # Evaluated only when the SNP test fails: indel flavours, else "mixed".
    indel_or_mixed = hl.cond(
        every_indel,
        hl.cond(has_several, "multi-indel", "indel"),
        "mixed",
    )
    variant_type = hl.cond(
        every_snp,
        hl.cond(has_several, "multi-snv", "snv"),
        indel_or_mixed,
    )
    return hl.struct(variant_type=variant_type, n_alt_alleles=n_candidates)
mnv_in_genome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.genome)),
)
# Per-source count structs. Each one is kept only when the matching
# mnv_in_exome / mnv_in_genome field is true; otherwise the field is missing.
exome_counts = hl.struct(
    n_individuals=ds.n_indv_tnv_ex,
    ac=ds.AC_tnv_ex,
    ac_hom=ds.n_tnv_hom_ex,
)
genome_counts = hl.struct(
    n_individuals=ds.n_indv_tnv_gen,
    ac=ds.AC_tnv_gen,
    ac_hom=ds.n_tnv_hom_gen,
)

# transmute (unlike annotate) also drops the source fields referenced in
# the expressions below, so the raw *_tnv* columns do not survive.
ds = ds.transmute(
    n_individuals=ds.n_indv_tnv,
    ac=ds.AC_tnv,
    ac_hom=ds.n_tnv_hom,
    exome=hl.or_missing(ds.mnv_in_exome, exome_counts),
    genome=hl.or_missing(ds.mnv_in_genome, genome_counts),
)

# These per-SNP allele-count fields are discarded.
ds = ds.drop("AC_snp1", "AC_snp2", "AC_snp3")
ds = ds.transmute(
consequence=hl.struct(
category=hl.null(hl.tstr),
gene_id=ds.gene_id,
gene_name=ds.gene_name,
transcript_id=ds.transcript_id,
consequence=ds.tnv_cons,
lambda csq_values: hl.struct(
**{
field: hl.cond(csq_values[index] == "", hl.null(hl.tstr), csq_values[index])
for index, field in enumerate(VEP_FIELDS)
}
:meth:`.result`
Parameters
----------
named_exprs : varargs of :class:`.Expression`
Aggregation expressions.
Returns
-------
:class:`.GroupedMatrixTable`
"""
if self._row_keys is not None:
raise NotImplementedError("GroupedMatrixTable is already grouped by rows. Cannot aggregate over cols.")
assert self._col_keys is not None
base = self._col_fields if self._col_fields is not None else hl.struct()
for k, e in named_exprs.items():
analyze('GroupedMatrixTable.aggregate_cols', e, self._parent._global_indices, {self._parent._col_axis})
self._check_bindings('aggregate_cols', named_exprs, self._parent._col_indices)
return self._copy(col_fields = base.annotate(**named_exprs))
# Subset labels contain an _, so rebuild those after splitting them
if labels[0] == "non":
labels = ["_".join(labels[0:2])] + labels[2:]
if len(labels) == 2:
[subset, pop] = labels
faf_index_tree[subset][pop] = index
else:
assert len(labels) == 1
subset = labels[0]
faf_index_tree[subset]["total"] = index
# For every subset, attach faf95_adj / faf99_adj structs whose fields are the
# labels recorded in faf_index_tree[subset], each holding the value read from
# ds.faf at the recorded index.
ds = ds.annotate(
    **{
        subset: ds[subset].annotate(
            **{
                f"{level}_adj": hl.struct(
                    **{
                        pop: ds.faf[index][level]
                        for pop, index in faf_index_tree[subset].items()
                    }
                )
                for level in ("faf95", "faf99")
            }
        )
        for subset in subsets
    }
)

# The source fields are dropped once the per-subset structs are in place.
ds = ds.drop("freq", "popmax", "faf")
##############
# Histograms #
##############
# Extract overall age distribution
ds = ds.transmute(
gnomad_age_hist_het=ds.age_hist_het[g.age_index_dict["gnomad"]],
gnomad_age_hist_hom=ds.age_hist_hom[g.age_index_dict["gnomad"]],
.map(lambda m: hl.struct(m=m[0], f=f[0])),
hl.zip_with_index(father_v)
output_region(current_region._replace(stop=last_pos))
#
# Step 3: Convert regions to a Hail table.
#
# Column types for the import: every tissue column is a float, followed by
# the fixed region columns (listed last so they win on a name clash).
types = {
    **{t: hl.tfloat for t in tissue_fields},
    "gene_id": hl.tstr,
    "chrom": hl.tstr,
    "start": hl.tint,
    "stop": hl.tint,
    "mean": hl.tfloat,
}

# Empty cells are read as missing values.
ds = hl.import_table("regions.tsv", min_partitions=100, missing="", types=types)

# Fold the individual tissue columns into one `tissues` struct and keep only
# the region columns next to it.
tissues = hl.struct(**{t: ds[t] for t in tissue_fields})
ds = ds.select("gene_id", "chrom", "start", "stop", "mean", tissues=tissues)

ds.write(args.output_ht)
return hl.bind(lambda mr:
(hl.case()
.when(ds.locus == mr.locus,
hl.struct(
locus=ds.locus,
alleles=[mr.alleles[0], mr.alleles[1]],
a_index=i,
was_split=True))
.when(filter_changed_loci,
hl.null(hl.tstruct(locus=ds.locus.dtype, alleles=hl.tarray(hl.tstr),
a_index=hl.tint, was_split=hl.tbool)))
.or_error(
"Found non-left-aligned variant in sparse_split_multi\n"
+ "old locus: " + hl.str(ds.locus) + "\n"
+ "old ref : " + ds.alleles[0] + "\n"
+ "old alt : " + ds.alleles[i] + "\n"
+ "mr locus : " + hl.str(mr.locus) + "\n"
+ "mr ref : " + mr.alleles[0] + "\n"
+ "mr alt : " + mr.alleles[1]
)),
# Called with no arguments, so no global fields are selected.
ds = ds.select_globals()

population_dict_fields = [
    "pop_no_lofs",
    "pop_obs_het_lof",
    "pop_obs_hom_lof",
    "pop_defined",
    "pop_p",
]

populations = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]

# Convert dicts to structs for Elasticsearch export: each listed field is
# replaced by a struct with one entry per population, looked up by key.
population_structs = {
    field_name: hl.struct(**{pop: ds[field_name][pop] for pop in populations})
    for field_name in population_dict_fields
}
ds = ds.annotate(**population_structs)

# Convert interval to struct for Elasticsearch export.
interval = ds.interval
ds = ds.annotate(
    interval=hl.struct(
        chrom=interval.start.contig,
        start=interval.start.position,
        end=interval.end.position,
    )
)

# key_by() is called with no key fields here.
ds = ds.key_by()

# Rename gene -> gene_name and transcript -> transcript_id; transmute drops
# the original fields.
ds = ds.transmute(gene_name=ds.gene, transcript_id=ds.transcript)