import hail as hl
GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_4dc19bc1b.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_4dc19bc1b.mt/'
for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
print(f"Checking 'hl.vep' replicates on '{path}'")
expected = hl.read_matrix_table(path)
actual = hl.vep(expected.rows().select(), 'gs://hail-common/vep/vep/vep85-loftee-gcloud-testing.json', csq=csq)
actual._force_count()
# vep_result_agrees = actual._same(expected)
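# Hedged follow-up sketch: the commented-out line above hints at the stronger
# check of comparing the replicated result to the gold standard. `_same` is a
# private Hail API, so treat this continuation as illustrative only.
# vep_result_agrees = actual._same(expected)
# print('TEST PASSED' if vep_result_agrees else 'TEST FAILED')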
:param bool full_meta: Whether to add all metadata (warning: large)
:param str fam_version: Version of metadata (default to current)
:param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
:param str duplicate_mapping_root: Where to put the duplicate genome/exome sample ID mapping (default is None -- do not annotate)
:param bool release_samples: When set, filters the data to release samples only
:param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
:return: gnomAD hardcalls dataset with chosen annotations
:rtype: MatrixTable
"""
from gnomad_hail.utils import filter_to_adj
if raw and split:
    raise DataException('No split raw data. Use of hardcalls is recommended.')
if non_refs_only:
    mt = hl.read_matrix_table(get_gnomad_data_path(data_type, split=split, non_refs_only=non_refs_only, hail_version=hail_version))
else:
    mt = hl.read_matrix_table(get_gnomad_data_path(data_type, hardcalls=not raw, split=split, hail_version=hail_version))
if adj:
    mt = filter_to_adj(mt)
if meta_root:
    meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
    mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})
if duplicate_mapping_root:
    dup_ht = hl.import_table(genomes_exomes_duplicate_ids_tsv_path, impute=True,
                             key='exome_id' if data_type == "exomes" else 'genome_id')
    mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})
if fam_root:
    # Snippet is truncated here in the source; the original presumably joins a
    # pedigree table on sample ID, e.g. mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]}).
    pass
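# Hedged usage sketch for the loader documented above (the enclosing function
# signature is not shown in this snippet; the name and keyword arguments below
# follow the gnomad_hail convention and the docstring):
# mt = get_gnomad_data('exomes', adj=True, meta_root='meta', release_samples=True)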
def matrix_table_take_row(mt_path):
    """Read a matrix table and take 100 values of the info.AF row field."""
    mt = hl.read_matrix_table(mt_path)
    mt.info.AF.take(100)
def shuffle_key_rows_by_mt(mt_path):
    """Re-key rows by a computed struct, forcing a shuffle."""
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_rows(reversed_position_locus=hl.struct(
        contig=mt.locus.contig,
        position=-mt.locus.position))
    mt = mt.key_rows_by(mt.reversed_position_locus)
    mt._force_count_rows()
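# Hedged usage sketch: the two helpers above read like benchmark-style smoke
# tests over a matrix table; the path below is hypothetical.
# matrix_table_take_row('gs://my-bucket/example.mt')
# shuffle_key_rows_by_mt('gs://my-bucket/example.mt')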
tmp_path += f'{uuid.uuid4()}/'

ht = hl.read_matrix_table(out_paths[0]).rows()
intervals = calculate_new_intervals(ht, TARGET_RECORDS)

mts = [hl.read_matrix_table(path, _intervals=intervals) for path in out_paths]
combined_mts = [combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
i = 0
while len(combined_mts) > 1:
    tmp = tmp_path + f'{i}/'
    pad = len(str(len(combined_mts)))
    hl.experimental.write_matrix_tables(combined_mts, tmp, overwrite=True)
    paths = [tmp + str(n).zfill(pad) + '.mt' for n in range(len(combined_mts))]

    # Recompute intervals from the freshly written intermediates.
    ht = hl.read_matrix_table(paths[0]).rows()
    intervals = calculate_new_intervals(ht, TARGET_RECORDS)

    mts = [hl.read_matrix_table(path, _intervals=intervals) for path in paths]
    combined_mts = [combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
    i += 1
combined_mts[0].write(out_file, overwrite=overwrite)
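# The combiner loop above relies on a `chunks` helper that is not shown here.
# A minimal sketch of what it plausibly does (split a sequence into fixed-size
# batches; name and behavior assumed from the call sites above):
def chunks(seq, size):
    """Yield successive `size`-length slices of `seq`."""
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]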
def read_mt(mt_path: str):
    """Read MT from the given path."""
    logger.info(f"\n==> read in: {mt_path}")
    return hl.read_matrix_table(mt_path)
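# Example use (path hypothetical; `logger` is assumed to be configured by the
# surrounding module):
# mt = read_mt('gs://my-bucket/dataset.mt')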
def read_tx_annotation_tables(mt_path, gtex_tx_summary_path, mt_type="mt"):
    if mt_type == "mt":
        mt = hl.read_matrix_table(mt_path)
    elif mt_type == "ht":
        mt = hl.read_table(mt_path)
        mt = hl.MatrixTable.from_rows_table(mt)
    else:
        # Guard added: without it, `mt` would be unbound for other mt_type values.
        raise ValueError(f"mt_type must be 'mt' or 'ht', got {mt_type!r}")
    gtex = hl.read_table(gtex_tx_summary_path)
    return mt, gtex
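# Hedged usage sketch (paths hypothetical): the "ht" branch lifts a row table
# back into a MatrixTable so both input kinds pair with the GTEx summary table.
# mt, gtex = read_tx_annotation_tables('gs://my-bucket/variants.ht',
#                                      'gs://my-bucket/gtex_tx_summary.ht',
#                                      mt_type='ht')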
sample_names, paths = [list(x) for x in zip(*samples)]
sample_names = [[n] for n in sample_names]
assert len(paths) == len(samples)
out_paths = stage_one(paths, sample_names, tmp_path, intervals, header, out_file)
if not out_paths:
    return
tmp_path += f'{uuid.uuid4()}/'
mts = [hl.read_matrix_table(path) for path in out_paths]
combined_mts = [comb.combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
i = 0
while len(combined_mts) > 1:
    tmp = tmp_path + f'{i}/'
    pad = len(str(len(combined_mts)))
    hl.experimental.write_matrix_tables(combined_mts, tmp, overwrite=True)
    paths = [tmp + str(n).zfill(pad) + '.mt' for n in range(len(combined_mts))]
    mts = [hl.read_matrix_table(path) for path in paths]
    combined_mts = [comb.combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
    i += 1
combined_mts[0].write(out_file, overwrite=overwrite)
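# The while-loop above implements hierarchical merging: every pass shrinks the
# list of intermediates by a factor of MAX_COMBINE_NUMBER until one matrix
# table remains. A runnable sketch of the batching arithmetic (sizes assumed):
def passes_needed(n_inputs: int, batch_size: int = 100) -> int:
    """Count merge passes until a single output remains."""
    passes = 0
    while n_inputs > 1:
        n_inputs = -(-n_inputs // batch_size)  # ceil division: one MT per batch
        passes += 1
    return passes

# e.g. 10,000 intermediates with batches of 100 need 2 passes (100 -> 1).
assert passes_needed(10_000, 100) == 2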
else:
    field = Env.get_uid()
    mt = mt.select_entries(**{field: call_expr})
mt = mt.select_rows().select_cols()
mt = mt.distinct_by_row()
locally_pruned_table_path = new_temp_file()
(_local_ld_prune(require_biallelic(mt, 'ld_prune'), field, r2, bp_window_size, memory_per_core)
 .add_index()
 .write(locally_pruned_table_path, overwrite=True))
locally_pruned_table = hl.read_table(locally_pruned_table_path)
locally_pruned_ds_path = new_temp_file()
mt = mt.annotate_rows(info=locally_pruned_table[mt.row_key])
(mt.filter_rows(hl.is_defined(mt.info))
 .write(locally_pruned_ds_path, overwrite=True))
locally_pruned_ds = hl.read_matrix_table(locally_pruned_ds_path)
n_locally_pruned_variants = locally_pruned_ds.count_rows()
info(f'ld_prune: local pruning stage retained {n_locally_pruned_variants} variants')
standardized_mean_imputed_gt_expr = hl.or_else(
    (locally_pruned_ds[field].n_alt_alleles() - locally_pruned_ds.info.mean) * locally_pruned_ds.info.centered_length_rec,
    0.0)
std_gt_bm = BlockMatrix.from_entry_expr(standardized_mean_imputed_gt_expr, block_size=block_size)
r2_bm = (std_gt_bm @ std_gt_bm.T) ** 2
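# Why squaring gives r^2: each row of std_gt_bm is a mean-centered (and
# mean-imputed) genotype vector scaled to unit length (centered_length_rec is
# presumably 1 / ||g - mean||), so (std_gt_bm @ std_gt_bm.T)[i, j] is the
# Pearson correlation between variants i and j, and its square is the LD r^2.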
_, stops = hl.linalg.utils.locus_windows(locally_pruned_table.locus, bp_window_size)
entries = r2_bm.sparsify_row_intervals(range(stops.size), stops, blocks_only=True).entries()
entries = entries.filter((entries.entry >= r2) & (entries.i < entries.j))
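# Hedged sketch of the step that typically follows in LD pruning: treat
# above-threshold pairs as graph edges and keep a maximal independent set
# (continuation assumed, not shown in the snippet above):
# variants_to_remove = hl.maximal_independent_set(entries.i, entries.j, keep=False)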