Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
:param normal_ploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs for XX, XY karyotypes.
:param aneuploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs for aneuploidies.
:return: Tuple of ploidy cutoff tuples: ((x_ploidy_cutoffs), (y_ploidy_cutoffs))
"""
# Group sex chromosome ploidy table by f_stat cutoff and get mean/stdev for chrX/Y ploidies
# Samples with f_stat below the cutoff are binned as "xx"; all others as "xy".
# NOTE(review): hl.cond is a deprecated alias of hl.if_else in recent Hail releases — confirm Hail version before migrating.
sex_stats = ht.aggregate(
hl.agg.group_by(
hl.cond(ht.f_stat < f_stat_cutoff, "xx", "xy"),
hl.struct(
x=hl.agg.stats(ht.chrX_ploidy),
y=hl.agg.stats(ht.chrY_ploidy)
)
)
)
logger.info(f"XX stats: {sex_stats['xx']}")
logger.info(f"XY stats: {sex_stats['xy']}")
# Build cutoffs from mean +/- N*stdev of each group's ploidy distribution.
# Presumably:
#   X tuple: (upper cutoff for one X, (lower, upper) cutoffs for two X, lower cutoff for >2 X)
#   Y tuple: ((lower, upper) cutoffs for one Y, lower cutoff for >1 Y)
# TODO(review): confirm cutoff-tuple interpretation against the caller that consumes it.
cutoffs = (
(
sex_stats["xy"].x.mean + (normal_ploidy_cutoff * sex_stats["xy"].x.stdev),
(sex_stats["xx"].x.mean - (normal_ploidy_cutoff * sex_stats["xx"].x.stdev), sex_stats["xx"].x.mean + (normal_ploidy_cutoff * sex_stats["xx"].x.stdev)),
sex_stats["xx"].x.mean + (aneuploidy_cutoff * sex_stats["xx"].x.stdev)
),
(
(sex_stats["xx"].y.mean + (normal_ploidy_cutoff * sex_stats["xx"].y.stdev), sex_stats["xy"].y.mean + (normal_ploidy_cutoff * sex_stats["xy"].y.stdev)),
sex_stats["xy"].y.mean + (aneuploidy_cutoff * sex_stats["xy"].y.stdev)
)
)
logger.info(f"X ploidy cutoffs: {cutoffs[0]}")
logger.info(f"Y ploidy cutoffs: {cutoffs[1]}")
return cutoffs
# Accumulate optional row-level filter expressions; each threshold below is
# appended only when its parameter is provided. (The guard for the call-rate
# filter on the next line is presumably above this chunk — confirm in full file.)
filter_expr.append((mt.site_callrate > min_callrate))
if min_inbreeding_coeff_threshold is not None:
filter_expr.append((mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold))
if min_hardy_weinberg_threshold is not None:
filter_expr.append((mt.hwe.p_value > min_hardy_weinberg_threshold))
if snv_only:
# Keep only single-nucleotide variants (ref/alt allele pair is a SNP).
filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1]))
if bi_allelic_only:
filter_expr.append(bi_allelic_expr(mt))
if apply_hard_filters:
# GATK default site hard filters: QD >= 2, FS <= 60, MQ >= 30.
# Each is applied only if the corresponding annotation exists in the row schema;
# otherwise a warning is logged and that filter is skipped.
# NOTE(review): logger.warn is a deprecated alias of logger.warning.
if 'info' in mt.row_value:
if 'QD' in mt.info:
filter_expr.append((mt.info.QD >= 2))
else:
logger.warn("Could not apply QD hard filter, as `info.QD` not found in schema.")
if 'FS' in mt.info:
filter_expr.append((mt.info.FS <= 60))
else:
logger.warn("Could not apply FS hard filter, as `info.FS` not found in schema.")
if 'MQ' in mt.info:
filter_expr.append((mt.info.MQ >= 30))
else:
logger.warn("Could not apply MQ hard filter, as `info.MQ` not found in schema.")
else:
logger.warn("Could not apply hard filters as `info` not found in schema.")
# Keep only rows that pass ALL accumulated filters (logical AND across the list).
return mt.filter_rows(functools.reduce(operator.iand, filter_expr))
:param min_af: Minimum allele frequency to keep. Not applied if set to ``None``.
:param min_callrate: Minimum call rate to keep. Not applied if set to ``None``.
:param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``.
:param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``.
:param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30
:param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning)
:param filter_lcr: Filter LCR regions
:param filter_decoy: Filter decoy regions
:param filter_segdup: Filter segmental duplication regions
:param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD) are kept
:param high_conf_regions: If given, the data will be filtered to only include variants in those regions
:return: Filtered MT
"""
logger.info("Creating QC MatrixTable")
if ld_r2 is not None:
# NOTE(review): logger.warn is a deprecated alias of logger.warning.
logger.warn("The LD-prune step of this function requires non-preemptible workers only!")
# Drop low-confidence regions (LCR, decoy, segmental duplications, and
# optionally low-coverage exome regions); if high_conf_regions is given,
# restrict to those regions.
qc_mt = filter_low_conf_regions(
mt,
filter_lcr=filter_lcr,
filter_decoy=filter_decoy,
filter_segdup=filter_segdup,
filter_exome_low_coverage_regions=filter_exome_low_coverage_regions, high_conf_regions=high_conf_regions
)
if adj_only:
qc_mt = filter_to_adj(qc_mt) # TODO: Make sure that this works fine before call rate filtering
# Apply site-level QC thresholds (call continues beyond this chunk).
qc_mt = filter_rows_for_qc(
qc_mt,
min_af,
min_callrate,