Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
bound_exprs = {}
gq_dp_exprs = {}
def has_field_of_type(name, dtype):
return name in mt.entry and mt[name].dtype == dtype
if has_field_of_type('DP', hl.tint32):
gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')
if has_field_of_type('GQ', hl.tint32):
gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')
if not has_field_of_type('GT', hl.tcall):
raise ValueError(f"'variant_qc': expect an entry field 'GT' of type 'call'")
bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count()
bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)
result = hl.rbind(hl.struct(**bound_exprs),
lambda e1: hl.rbind(
hl.case().when(hl.len(mt.alleles) == 2,
hl.hardy_weinberg_test(e1.call_stats.homozygote_count[0],
e1.call_stats.AC[1] - 2 *
e1.call_stats.homozygote_count[1],
e1.call_stats.homozygote_count[1])
).or_missing(),
lambda hwe: hl.struct(**{
**gq_dp_exprs,
**e1.call_stats,
'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
),
),
exon=annotation.EXON,
gene_id=annotation.Gene,
gene_symbol=annotation.SYMBOL,
gene_symbol_source=annotation.SYMBOL_SOURCE,
hgnc_id=annotation.HGNC_ID,
hgvsc=annotation.HGVSc,
hgvsp=annotation.HGVSp,
lof=annotation.LoF,
lof_filter=annotation.LoF_filter,
lof_flags=annotation.LoF_flags,
lof_info=annotation.LoF_info,
# PolyPhen field contains "polyphen_prediction(polyphen_score)"
polyphen_prediction=hl.or_missing(
hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split("\\(")[0]
),
protein_id=annotation.ENSP,
# Protein_position may contain either "start-end" or, when start == end, "start"
protein_start=split_position_start(annotation.Protein_position),
protein_end=split_position_end(annotation.Protein_position),
# SIFT field contains "sift_prediction(sift_score)"
sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT), annotation.SIFT.split("\\(")[0]),
transcript_id=annotation.Feature,
)
lambda variant_id_parts: hl.struct(
variant_id=ds[f"{snp}_copy"],
chrom=variant_id_parts[0],
pos=hl.int(variant_id_parts[1]),
ref=variant_id_parts[2],
alt=variant_id_parts[3],
exome=hl.or_missing(
hl.is_defined(ds[f"AN_{snp}_ex"]),
hl.struct(
filters=ds[f"filter_{snp}_ex"],
ac=hl.int(ds[f"AC_{snp}_ex"]),
an=hl.int(ds[f"AN_{snp}_ex"]),
),
),
genome=hl.or_missing(
hl.is_defined(ds[f"AN_{snp}_gen"]),
hl.struct(
filters=ds[f"filter_{snp}_gen"],
ac=hl.int(ds[f"AC_{snp}_gen"]),
an=hl.int(ds[f"AN_{snp}_gen"]),
),
def _ac_an_parent_child_count(
proband_gt: hl.expr.CallExpression,
father_gt: hl.expr.CallExpression,
mother_gt: hl.expr.CallExpression,
) -> Dict[str, hl.expr.Int64Expression]:
"""
Helper method to get AC and AN for parents and children
"""
ac_parent_expr = hl.agg.sum(
father_gt.n_alt_alleles() + mother_gt.n_alt_alleles()
)
an_parent_expr = hl.agg.sum(
(hl.is_defined(father_gt) + hl.is_defined(mother_gt)) * 2
)
ac_child_expr = hl.agg.sum(proband_gt.n_alt_alleles())
an_child_expr = hl.agg.sum(hl.is_defined(proband_gt) * 2)
return {
f"ac_parents": ac_parent_expr,
f"an_parents": an_parent_expr,
f"ac_children": ac_child_expr,
f"an_children": an_child_expr,
}
hl.or_missing(father_call.is_haploid(), hl.array([father_call.one_hot_alleles(alleles)])),
call_to_one_hot_alleles_array(father_call, alleles)
)
mother_v = call_to_one_hot_alleles_array(mother_call, alleles)
combinations = hl.flatmap(
lambda f:
hl.zip_with_index(mother_v)
.filter(lambda m: m[1] + f[1] == proband_v)
.map(lambda m: hl.struct(m=m[0], f=f[0])),
hl.zip_with_index(father_v)
)
return (
hl.cond(
hl.is_defined(combinations) & (hl.len(combinations) == 1),
hl.array([
hl.call(father_call[combinations[0].f], mother_call[combinations[0].m], phased=True),
hl.cond(father_call.is_haploid(), hl.call(father_call[0], phased=True), phase_parent_call(father_call, combinations[0].f)),
phase_parent_call(mother_call, combinations[0].m)
]),
hl.null(hl.tarray(hl.tcall))
)
min_score=hl.agg.min(ht.score),
max_score=hl.agg.max(ht.score),
n=hl.agg.count(),
n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
n_1bp_indel=hl.agg.count_where(indel_length == 1),
n_mod3bp_indel=hl.agg.count_where((indel_length % 3) == 0),
n_singleton=hl.agg.count_where(ht.singleton),
fail_hard_filters=hl.agg.count_where(
(ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30)
),
n_pos_train=hl.agg.count_where(ht.positive_train_site),
n_neg_train=hl.agg.count_where(ht.negative_train_site),
n_clinvar=hl.agg.count_where(hl.is_defined(clinvar)),
n_de_novos_singleton_adj=hl.agg.filter(
ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
),
n_de_novo_singleton=hl.agg.filter(
ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
),
n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
n_trans_singletons=hl.agg.filter(
ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
),
n_untrans_singletons=hl.agg.filter(
(ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
hl.agg.sum(fam.n_untransmitted_raw),
),
n_train_trans_singletons=hl.agg.filter(
info.AS_SB_TABLE.filter(lambda x: hl.is_defined(x)).fold(
lambda i, j: i[:2] + j[:2], [0, 0]
logger.info(
f"Generating sibling variant sharing counts using {sib_ht.count()} pairs."
)
sib_ht = sib_ht.explode("sibs").key_by("sibs")[mt.s]
# Create sibling sharing counters
sib_stats = hl.struct(
**{
f"n_sib_shared_variants_{name}": hl.sum(
hl.agg.filter(
expr,
hl.agg.group_by(
sib_ht.sib_idx,
hl.or_missing(
hl.agg.sum(hl.is_defined(mt.GT)) == 2,
hl.agg.min(_get_alt_count(mt.locus, mt.GT, is_female)),
),
),
).values()
)
for name, expr in strata.items()
}
)
sib_stats = sib_stats.annotate(
**{
f"ac_sibs_{name}": hl.agg.filter(
expr & hl.is_defined(sib_ht.sib_idx), hl.agg.sum(mt.GT.n_alt_alleles())
)
for name, expr in strata.items()
}