Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def get_projectmax(mt: hl.MatrixTable, loc: hl.expr.StringExpression) -> hl.MatrixTable:
"""
First pass of projectmax (returns aggregated MT with project_max field)
:param MatrixTable mt: Input MT
:param StringExpression loc: Column expression location of project ID (e.g. mt.meta.pid)
:return: Frequency data with annotated project_max
:rtype: MatrixTable
"""
# TODO: add hom count
mt = mt.annotate_cols(project=loc)
agg_mt = mt.group_cols_by(mt.project).aggregate(ac=hl.agg.sum(mt.GT.n_alt_alleles()),
an=2 * hl.agg.count_where(hl.is_defined(mt.GT)),
hom=hl.agg.count_where(mt.GT.is_hom_var()))
agg_mt = agg_mt.annotate_entries(af=agg_mt.ac / agg_mt.an)
return agg_mt.annotate_rows(project_max=hl.agg.take(hl.struct(project=agg_mt.project, ac=agg_mt.ac,
af=agg_mt.af, an=agg_mt.an, hom=agg_mt.hom),
5, -agg_mt.af))
n_de_novo_singleton=hl.agg.filter(
ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
),
n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
n_trans_singletons=hl.agg.filter(
ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
),
n_untrans_singletons=hl.agg.filter(
(ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
hl.agg.sum(fam.n_untransmitted_raw),
),
n_train_trans_singletons=hl.agg.filter(
(ht.ac_raw == 2) & ht.positive_train_site, hl.agg.sum(fam.n_transmitted_raw)
),
n_omni=hl.agg.count_where(truth_data.omni),
n_mills=hl.agg.count_where(truth_data.mills),
n_hapmap=hl.agg.count_where(truth_data.hapmap),
n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
)
def many_aggs(mt):
aggs = [
hl.agg.count_where(mt.GT.is_hom_ref()),
hl.agg.count_where(mt.GT.is_het()),
hl.agg.count_where(mt.GT.is_hom_var()),
hl.agg.count_where(mt.GT.is_non_ref()),
hl.agg.count_where(mt.GT.n_alt_alleles() == 2),
hl.agg.count_where(mt.GT.phased),
hl.agg.count_where(mt.GT.is_haploid()),
hl.agg.count_where(mt.GT.is_diploid()),
hl.agg.count_where(mt.GT.ploidy == 2),
hl.agg.fraction(mt.AD[0] > 0),
hl.agg.fraction(mt.AD[0] < 0),
hl.agg.fraction(mt.AD.length() < 0),
hl.agg.fraction(mt.AD.length() > 0),
hl.agg.fraction(mt.PL[0] > 0),
hl.agg.fraction(mt.PL[0] < 0),
hl.agg.fraction(mt.PL.length() < 0),
hl.agg.fraction(mt.PL.length() > 0),
hl.agg.fraction(mt.GQ < 0),
hl.agg.fraction(mt.GQ > 0),
ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
),
n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
n_trans_singletons=hl.agg.filter(
ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
),
n_untrans_singletons=hl.agg.filter(
(ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
hl.agg.sum(fam.n_untransmitted_raw),
),
n_train_trans_singletons=hl.agg.filter(
(ht.ac_raw == 2) & ht.positive_train_site, hl.agg.sum(fam.n_transmitted_raw)
),
n_omni=hl.agg.count_where(truth_data.omni),
n_mills=hl.agg.count_where(truth_data.mills),
n_hapmap=hl.agg.count_where(truth_data.hapmap),
n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
)
def many_aggs(mt):
aggs = [
hl.agg.count_where(mt.GT.is_hom_ref()),
hl.agg.count_where(mt.GT.is_het()),
hl.agg.count_where(mt.GT.is_hom_var()),
hl.agg.count_where(mt.GT.is_non_ref()),
hl.agg.count_where(mt.GT.n_alt_alleles() == 2),
hl.agg.count_where(mt.GT.phased),
hl.agg.count_where(mt.GT.is_haploid()),
hl.agg.count_where(mt.GT.is_diploid()),
hl.agg.count_where(mt.GT.ploidy == 2),
hl.agg.fraction(mt.AD[0] > 0),
hl.agg.fraction(mt.AD[0] < 0),
hl.agg.fraction(mt.AD.length() < 0),
hl.agg.fraction(mt.AD.length() > 0),
hl.agg.fraction(mt.PL[0] > 0),
hl.agg.fraction(mt.PL[0] < 0),
hl.agg.fraction(mt.PL.length() < 0),
hl.agg.fraction(mt.PL.length() > 0),
hl.agg.fraction(mt.GQ < 0),
hl.agg.fraction(mt.GQ > 0),
hl.agg.fraction(mt.GQ % 2 == 0),
hl.agg.fraction(mt.GQ % 2 != 0),
hl.agg.fraction(mt.GQ / 5 < 10),
def many_aggs(mt):
aggs = [
hl.agg.count_where(mt.GT.is_hom_ref()),
hl.agg.count_where(mt.GT.is_het()),
hl.agg.count_where(mt.GT.is_hom_var()),
hl.agg.count_where(mt.GT.is_non_ref()),
hl.agg.count_where(mt.GT.n_alt_alleles() == 2),
hl.agg.count_where(mt.GT.phased),
hl.agg.count_where(mt.GT.is_haploid()),
hl.agg.count_where(mt.GT.is_diploid()),
hl.agg.count_where(mt.GT.ploidy == 2),
hl.agg.fraction(mt.AD[0] > 0),
hl.agg.fraction(mt.AD[0] < 0),
hl.agg.fraction(mt.AD.length() < 0),
hl.agg.fraction(mt.AD.length() > 0),
hl.agg.fraction(mt.PL[0] > 0),
hl.agg.fraction(mt.PL[0] < 0),
hl.agg.fraction(mt.PL.length() < 0),
hl.agg.fraction(mt.PL.length() > 0),
hl.agg.fraction(mt.GQ < 0),
hl.agg.fraction(mt.GQ > 0),
hl.agg.fraction(mt.GQ % 2 == 0),
hl.agg.fraction(mt.GQ % 2 != 0),
hl.agg.fraction(mt.GQ / 5 < 10),
hl.agg.fraction(mt.GQ / 5 <= 10),
hl.agg.fraction(mt.GQ / 5 > 10),
hl.agg.fraction(mt.GQ / 5 >= 10),
def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
"""
Checks for mismatches between reference allele and allele in reference fasta
:param ht: Table to be checked
:return: StructExpression containing counts for mismatches and count for all variants on negative strand
"""
mismatch = ht.aggregate(hl.struct(total_variants=hl.agg.count(),
total_mismatch=hl.agg.count_where(ht.reference_mismatch),
negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
negative_strand_mismatch=hl.agg.count_where(ht.new_locus.is_negative_strand & ht.reference_mismatch)
)
)
return mismatch
def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
"""
Checks for mismatches between reference allele and allele in reference fasta
:param ht: Table to be checked
:return: StructExpression containing counts for mismatches and count for all variants on negative strand
"""
mismatch = ht.aggregate(hl.struct(total_variants=hl.agg.count(),
total_mismatch=hl.agg.count_where(ht.reference_mismatch),
negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
negative_strand_mismatch=hl.agg.count_where(ht.new_locus.is_negative_strand & ht.reference_mismatch)
)
)
return mismatch
def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
"""
Checks for mismatches between reference allele and allele in reference fasta
:param ht: Table to be checked
:return: StructExpression containing counts for mismatches and count for all variants on negative strand
"""
mismatch = ht.aggregate(hl.struct(total_variants=hl.agg.count(),
total_mismatch=hl.agg.count_where(ht.reference_mismatch),
negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
negative_strand_mismatch=hl.agg.count_where(ht.new_locus.is_negative_strand & ht.reference_mismatch)
)
)
return mismatch
grch37_resources.reference_data.get_truth_ht()
if build == "GRCh37"
else grch38_resources.reference_data.get_truth_ht()
)[ht.key]
fam = fam_stats_ht[ht.key]
return dict(
min_score=hl.agg.min(ht.score),
max_score=hl.agg.max(ht.score),
n=hl.agg.count(),
n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
n_1bp_indel=hl.agg.count_where(indel_length == 1),
n_mod3bp_indel=hl.agg.count_where((indel_length % 3) == 0),
n_singleton=hl.agg.count_where(ht.singleton),
fail_hard_filters=hl.agg.count_where(
(ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30)
),
n_pos_train=hl.agg.count_where(ht.positive_train_site),
n_neg_train=hl.agg.count_where(ht.negative_train_site),
n_clinvar=hl.agg.count_where(hl.is_defined(clinvar)),
n_de_novos_singleton_adj=hl.agg.filter(
ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
),
n_de_novo_singleton=hl.agg.filter(
ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
),
n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
n_trans_singletons=hl.agg.filter(