+ hl.format("%.3f", element.sift_score)
+ ")",
"polyphen": element.polyphen_prediction
+ "("
+ hl.format("%.3f", element.polyphen_score)
+ ")",
"domains": hl.delimit(
element.domains.map(lambda d: d.db + ":" + d.name), "&"
),
}
)
elif feature_type == "MotifFeature":
fields["motif_score_change"] = hl.format("%.3f", element.motif_score_change)
return hl.delimit(
[hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields], "|"
)
csq = hl.empty_array(hl.tstr)
for feature_field, feature_type in [
("transcript_consequences", "Transcript"),
("regulatory_feature_consequences", "RegulatoryFeature"),
("motif_feature_consequences", "MotifFeature"),
("intergenic_consequences", "Intergenic"),
]:
csq = csq.extend(
hl.or_else(
vep_expr[feature_field].map(
lambda x: get_csq_from_struct(x, feature_type=feature_type)
),
hl.empty_array(hl.tstr),
)
)
return hl.or_missing(hl.len(csq) > 0, csq)
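
# A minimal, self-contained sketch of the delimit/or_else pattern used above to
# serialize one annotation struct into a pipe-delimited CSQ string. Field names
# and values here are hypothetical, not from the original pipeline.
import hail as hl

_example_csq_fields = ["gene_symbol", "consequence", "sift"]
element = hl.struct(gene_symbol="PCSK9", consequence="missense_variant")
fields = {"gene_symbol": element.gene_symbol, "consequence": element.consequence}
# Missing fields fall back to "" so every record keeps the same column count.
csq_string = hl.delimit(
    [hl.or_else(hl.str(fields.get(f, "")), "") for f in _example_csq_fields], "|"
)
print(hl.eval(csq_string))  # PCSK9|missense_variant|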

# (Separate fragment) Collapse per-transcript annotations into comma-delimited
# strings:
variants = variants.annotate(
    transcript_id=hl.delimit(variants.transcript_id, ","),
hgvsc=hl.delimit(variants.hgvsc.keys().map(lambda k: k + ":" + variants.hgvsc[k]), ","),
hgvsp=hl.delimit(variants.hgvsp.keys().map(lambda k: k + ":" + variants.hgvsp[k]), ","),
)
variants = variants.annotate(flags="PASS")
variants = variants.drop("v")
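
# Sketch of the keys().map pattern used for hgvsc/hgvsp above: flatten a dict
# expression into "key:value" strings (toy data; output key order is not
# guaranteed).
import hail as hl

hgvsc = hl.literal({"ENST00000001": "c.100A>G", "ENST00000002": "c.55C>T"})
flat = hl.delimit(hgvsc.keys().map(lambda k: k + ":" + hgvsc[k]), ",")
print(hl.eval(flat))  # e.g. ENST00000001:c.100A>G,ENST00000002:c.55C>T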
results = hl.read_table(args.results)
results = results.annotate(
analysis_group=results.analysis_group.lower().replace("[^a-z0-9]+", "_").replace("_+$", "")
)
results = results.drop("v")
# Add n_denovos to AC_case
results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) + hl.or_else(results.n_denovos, 0))
results = results.annotate(af_case=hl.cond(results.an_case == 0, 0, results.ac_case / results.an_case))
variants = variants.filter(hl.is_defined(results[variants.key]))
analysis_groups = results.aggregate(hl.agg.collect_as_set(results.analysis_group))
variants = variants.annotate(groups=hl.struct())
for group in analysis_groups:
group_results = results.filter(results.analysis_group == group).drop("analysis_group")
variants = variants.annotate(groups=variants.groups.annotate(**{group: group_results[variants.key]}))
# The latest (2019/04/15) SCHEMA dataset moved the source and in_analysis field from variant level to group level
# in_analysis is the same for all groups within a variant, but source is not
variants = variants.annotate(in_analysis=variants.groups.meta.in_analysis, source=variants.groups.meta.source)
variants = variants.key_by().drop("locus", "alleles")
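
# Hypothetical sketch of the keyed-join pattern above: results[variants.key]
# yields the matching results row (minus its key) as a struct, or missing when
# no row matches, and one such struct is nested per analysis group.
import hail as hl

variants = hl.utils.range_table(3)                       # keyed by idx
results = hl.utils.range_table(2).annotate(p_value=0.05)
variants = variants.annotate(groups=hl.struct(meta=results[variants.key]))
variants.show()  # idx 2 gets a missing `meta` struct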

# (Separate fragment) Parse HGNC previous symbols and synonyms into arrays:
hgnc = hgnc.annotate(
    previous_symbols=hl.cond(
        hgnc.previous_symbols == "",
hl.empty_array(hl.tstr),
hgnc.previous_symbols.split(",").map(lambda s: s.strip()),
),
synonyms=hl.cond(
hgnc.synonyms == "", hl.empty_array(hl.tstr), hgnc.synonyms.split(",").map(lambda s: s.strip())
),
)
genes = genes.annotate(**hgnc[genes.gene_id])
genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))
# If an HGNC gene symbol was not present, use the symbol from Gencode
for gencode_version in all_gencode_versions:
genes = genes.annotate(
symbol=hl.or_else(genes.symbol, genes.gencode[f"v{gencode_version}"].gene_symbol),
symbol_source=hl.cond(
hl.is_missing(genes.symbol) & hl.is_defined(genes.gencode[f"v{gencode_version}"].gene_symbol),
f"gencode (v{gencode_version})",
genes.symbol_source,
),
)
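
# Sketch of the symbol-fallback chain above: take the HGNC symbol when it is
# defined, otherwise fall back to a Gencode symbol, and record the source.
# Uses the older hl.cond/hl.null API seen in this file; names are hypothetical.
import hail as hl

t = hl.utils.range_table(2)
t = t.annotate(
    hgnc_symbol=hl.cond(t.idx == 0, "BRCA1", hl.null(hl.tstr)),
    gencode_symbol="FALLBACK",
)
t = t.annotate(
    symbol=hl.or_else(t.hgnc_symbol, t.gencode_symbol),
    symbol_source=hl.cond(hl.is_defined(t.hgnc_symbol), "hgnc", "gencode"),
)
t.show()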
# Collect all fields that can be used to search by gene name
genes = genes.annotate(
symbol_upper_case=genes.symbol.upper(),
search_terms=hl.empty_array(hl.tstr).append(genes.symbol).extend(genes.synonyms).extend(genes.previous_symbols),
)
for gencode_version in all_gencode_versions:
genes = genes.annotate(
search_terms=hl.rbind(
genes.gencode[f"v{gencode_version}"].gene_symbol,
lambda i: hl.rbind(
hl.int(hl.or_else(step1_indices[i], 0)),
ht.__cols[i].__m_step1,
ht.__entries[i],
lambda step1_idx, m_step1, entry: hl.rbind(
hl.map(
lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
hl.range(0, n_blocks + 1)),
lambda step1_separators: hl.rbind(
hl.set(step1_separators).contains(step1_idx),
hl.sum(
hl.map(
lambda s1: step1_idx >= s1,
step1_separators)) - 1,
lambda is_separator, step1_block: entry.annotate(
__step1_block=step1_block,
                    __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                          step1_block - 1,
                                          step1_block))),

# (Separate fragment) Rank VEP transcript consequences for sorting:
# protein-coding before non-coding, most severe before others, canonical
# before non-canonical (1 = highest priority, 8 = lowest).
lambda c: (
hl.bind(
lambda is_coding, is_most_severe, is_canonical: (
hl.cond(
is_coding,
hl.cond(is_most_severe, hl.cond(is_canonical, 1, 2), hl.cond(is_canonical, 3, 4)),
hl.cond(is_most_severe, hl.cond(is_canonical, 5, 6), hl.cond(is_canonical, 7, 8)),
)
),
hl.or_else(c.biotype, "") == "protein_coding",
hl.set(c.consequence_terms).contains(vep_root.most_severe_consequence),
hl.or_else(c.canonical, 0) == 1,
)
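
# hl.bind and hl.rbind, used heavily in the fragments above, both evaluate
# expressions once and pass the results to a lambda; rbind simply reverses the
# argument order (values first, lambda last). Toy illustration:
import hail as hl

x = hl.bind(lambda a, b: a + b, 1, 2)    # lambda first, then values
y = hl.rbind(1, 2, lambda a, b: a + b)   # values first, then lambda
print(hl.eval(x), hl.eval(y))            # 3 3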

# (Separate fragment, resembling Hail's mendel_errors implementation:
# roll per-trio Mendel error counts up to per-family tables.)
t2 = tm.annotate_cols(
errors=hl.agg.count(),
snp_errors=hl.agg.count_where(hl.is_snp(tm.alleles[0], tm.alleles[1])))
table2 = t2.key_cols_by().cols()
table2 = table2.select(pat_id=table2.father[ck_name],
mat_id=table2.mother[ck_name],
fam_id=table2.fam_id,
errors=table2.errors,
snp_errors=table2.snp_errors)
table2 = table2.group_by('pat_id', 'mat_id').aggregate(
fam_id=hl.agg.take(table2.fam_id, 1)[0],
children=hl.int32(hl.agg.count()),
errors=hl.agg.sum(table2.errors),
snp_errors=hl.agg.sum(table2.snp_errors))
table2 = table2.annotate(errors=hl.or_else(table2.errors, hl.int64(0)),
snp_errors=hl.or_else(table2.snp_errors, hl.int64(0)))
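
# Sketch of the column aggregations above: hl.agg.count tallies all records,
# hl.agg.count_where tallies those matching a predicate (a toy condition here
# stands in for hl.is_snp).
import hail as hl

t = hl.utils.range_table(10)
stats = t.aggregate(hl.struct(
    errors=hl.agg.count(),
    snp_errors=hl.agg.count_where(t.idx % 2 == 0),
))
print(stats)  # Struct(errors=10, snp_errors=5)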
# in implicated, idx 0 is dad, idx 1 is mom, idx 2 is child
implicated = hl.literal([
[0, 0, 0], # dummy
[1, 1, 1],
[1, 1, 1],
[1, 0, 1],
[0, 1, 1],
[0, 0, 1],
[1, 0, 1],
[0, 1, 1],
[0, 0, 1],
[0, 1, 1],
[0, 1, 1],
[1, 0, 1],
[1, 0, 1],
], dtype=hl.tarray(hl.tarray(hl.tint64)))
table3 = tm.annotate_cols(all_errors=hl.or_else(hl.agg.array_sum(implicated[tm.mendel_code]), [0, 0, 0]),
snp_errors=hl.or_else(
hl.agg.filter(hl.is_snp(tm.alleles[0], tm.alleles[1]),
hl.agg.array_sum(implicated[tm.mendel_code])),
[0, 0, 0])).key_cols_by().cols()
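
# hl.agg.array_sum adds arrays elementwise across records, and hl.or_else
# supplies [0, 0, 0] when a filtered aggregation sees no records at all.
# Minimal sketch with toy arrays:
import hail as hl

t = hl.utils.range_table(3)
t = t.annotate(counts=[t.idx, t.idx * 2, 1])
print(t.aggregate(hl.agg.array_sum(t.counts)))  # [3, 6, 3]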
table3 = table3.select(xs=[
hl.struct(**{ck_name: table3.father[ck_name],
'fam_id': table3.fam_id,
'errors': table3.all_errors[0],
'snp_errors': table3.snp_errors[0]}),
hl.struct(**{ck_name: table3.mother[ck_name],
'fam_id': table3.fam_id,
'errors': table3.all_errors[1],
'snp_errors': table3.snp_errors[1]}),
hl.struct(**{ck_name: table3.proband[ck_name],
             'fam_id': table3.fam_id,
             'errors': table3.all_errors[2],
             'snp_errors': table3.snp_errors[2]})])

# (Separate fragment) Keep the highest-priority transcript annotation per row,
# sorting by a csq_order ranking like the one above.
mt = mt.annotate_rows(**hl.sorted(mt.tx_annotation, key=lambda x: csq_order[
    (x.lof, hl.or_else(hl.is_missing(x.lof_flag), False), x.csq)])[0])
# Group gene lists for all consequences in a struct
ds = ds.annotate(
consequences=hl.struct(
**{
csq.lower(): ds.info[f"PROTEIN_CODING__{csq}"]
for csq in PROTEIN_CODING_CONSEQUENCES
if csq != "INTERGENIC" and csq != "NEAREST_TSS"
}
)
)
ds = ds.annotate(intergenic=ds.info.PROTEIN_CODING__INTERGENIC)
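
# Sketch of building a struct from a dict comprehension over INFO-style
# fields, as above; the INFO field names here are hypothetical.
import hail as hl

info = hl.struct(PROTEIN_CODING__LOF=["PCSK9"], PROTEIN_CODING__MISSENSE=["APOB"])
consequences = ["LOF", "MISSENSE"]
grouped = hl.struct(**{c.lower(): info[f"PROTEIN_CODING__{c}"] for c in consequences})
print(hl.eval(grouped.lof))  # ['PCSK9']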
# Collect set of all genes for which a variant has a consequence
all_genes = hl.empty_array(hl.tstr)
for csq in ds.consequences.dtype.fields:
all_genes = all_genes.extend(hl.or_else(ds.consequences[csq.lower()], hl.empty_array(hl.tstr)))
ds = ds.annotate(genes=hl.set(all_genes))
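
# Sketch of the collect-and-dedupe step above: optional gene lists are
# concatenated with an empty-array fallback, then hl.set drops duplicates.
import hail as hl

maybe_genes = hl.null(hl.tarray(hl.tstr))   # a consequence with no gene list
collected = hl.empty_array(hl.tstr)
collected = collected.extend(hl.or_else(maybe_genes, hl.empty_array(hl.tstr)))
collected = collected.extend(hl.literal(["PCSK9", "PCSK9"]))
print(hl.eval(hl.set(collected)))  # frozenset({'PCSK9'})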
# Group per-population values in a struct for each field
# ds = ds.annotate(**{field.lower(): expr_for_per_population_field(ds, field) for field in PER_POPULATION_FIELDS})
ds = ds.annotate(
freq=hl.struct(
**dict(
(
(
pop,
hl.struct(
**{field.lower(): ds.info[f"{pop.upper()}_{field}"] for field in PER_POPULATION_FIELDS}
),
)
for pop in POPULATIONS
),