# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def main():
    """CLI entry point: reformat the variants of a Hail table.

    Takes two positional arguments — the input table path and the output
    table path — reads the input, runs ``format_variants_table`` over it,
    prints its schema, and writes the result.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("input")
    arg_parser.add_argument("output")
    opts = arg_parser.parse_args()

    # Keep the Hail log out of the working directory.
    hl.init(log="/tmp/hail.log")

    table = hl.read_table(opts.input)
    table = format_variants_table(table)
    table.describe()
    table.write(opts.output)
def join_p100_p10(path_10, path_100):
    """Join the table at *path_100* against the one at *path_10* and force-count.

    Nothing is returned or written — the join result is only counted,
    which makes this usable as a join smoke test / benchmark.
    """
    small = hl.read_table(path_10)
    large = hl.read_table(path_100)
    large.join(small)._force_count()
###
# Create filtered keytable with autosomal, biallelic HM3 snps
# with MAF > .01, INFO > 0.9 and passing QC in UKBB GWAS analysis set
#
# Also provides both UKBB and HM3 formatting of variant
###
# NOTE(review): this top-level fragment relies on module-level names
# (gs_missing, BUCKET, HM3_qcpos_stem, force_all, verbose, GWAS_snps,
# GWAS_qc) that are not visible here — confirm they are defined earlier
# in the file. The conditional body continues further down the file.
if gs_missing(BUCKET + HM3_qcpos_stem + '.ht') or force_all:
    print("Creating Hail table of HM3 SNPs passing UKBB GWAS QC...")
    # get list of SNPs to be used in GWAS
    # filter here: autosomes, no indels, no MHC (filter early for efficiency)
    ukb_snps = hl.read_table(GWAS_snps).key_by('locus','alleles').repartition(500, shuffle=False)
    # ukb_snps = ukb_snps.annotate(sort_al=hl.sorted(ukb_snps.alleles))
    if verbose:
        print("\nCount 1: " + str(ukb_snps.count()) + '\n')
    # Keep only true SNPs (no indels), drop chrX, and drop chr6:25-34Mb
    # (the extended MHC region).
    ukb_snps = ukb_snps.filter(
        hl.is_snp(ukb_snps.alleles[0], ukb_snps.alleles[1]) &
        (~(ukb_snps.locus.contig == 'X')) &
        (~( (ukb_snps.locus.contig == '6') & (ukb_snps.locus.position > 25000000) & (ukb_snps.locus.position < 34000000)))
    )
    if verbose:
        print("\nCount 2: " + str(ukb_snps.count()) + '\n')
    # merge in, filter on MAF from the UKBB GWAS sample
    ukb_qc = hl.import_table(GWAS_qc)
    # Parse the "variant" string into a (locus, alleles) struct so the QC
    # table can be key-joined against the SNP table.
    ukb_qc = ukb_qc.annotate(vstruct = hl.parse_variant(ukb_qc.variant))
    ukb_qc = ukb_qc.annotate(locus = ukb_qc.vstruct.locus, alleles = ukb_qc.vstruct.alleles).key_by('locus','alleles')
def run(self):
    """Import, validate, annotate, and write the seqr variant MatrixTable.

    Pipeline (order matters — each step consumes the previous step's
    output): import VCF -> split multi-allelics -> optional validation ->
    optional sample remap/subset -> VEP -> reference/ClinVar/HGMD
    annotation -> genotype annotation -> write.
    """
    mt = self.import_vcf()
    # Split multi-allelic sites into biallelic rows before any annotation.
    mt = hl.split_multi_hts(mt)
    if self.validate:
        self.validate_mt(mt, self.genome_version, self.sample_type)
    if self.remap_path:
        mt = self.remap_sample_ids(mt, self.remap_path)
    if self.subset_path:
        mt = self.subset_samples_and_variants(mt, self.subset_path)
    mt = HailMatrixTableTask.run_vep(mt, self.genome_version, self.vep_runner)

    # Reference datasets used for variant-level annotation.
    ref_data = hl.read_table(self.reference_ht_path)
    clinvar = hl.read_table(self.clinvar_ht_path)
    hgmd = hl.read_table(self.hgmd_ht_path)

    mt = SeqrVariantsAndGenotypesSchema(mt, ref_data=ref_data, clinvar_data=clinvar, hgmd_data=hgmd).annotate_all(
        overwrite=True).select_annotated_mt()
    mt = SeqrGenotypesSchema(mt).annotate_all(overwrite=True).mt

    mt.describe()
    # NOTE(review): stage_locally=True writes through local temp files —
    # confirm this is required for the target (e.g. object-store) path.
    mt.write(self.output().path, stage_locally=True)
def write_temp_gcs(t: Union[hl.MatrixTable, hl.Table], gcs_path: str,
                   overwrite: bool = False, temp_path: str = '/tmp.h') -> None:
    """Write *t* to *temp_path*, read it back, and copy it to *gcs_path*.

    The temp location is always clobbered; *overwrite* applies only to
    the final destination. Works for both MatrixTables and Tables.
    """
    t.write(temp_path, overwrite=True)
    if isinstance(t, hl.MatrixTable):
        staged = hl.read_matrix_table(temp_path)
    else:
        staged = hl.read_table(temp_path)
    staged.write(gcs_path, overwrite=overwrite)
def export_table_to_elasticsearch(table_url, host, index_name, index_type, port=9200, num_shards=1, block_size=200):
    """Load the Hail table at *table_url* and export it to Elasticsearch.

    The target index is deleted before exporting and the table's globals
    are copied into the index metadata.
    """
    table = hl.read_table(table_url)
    client = ElasticsearchClient(host, port)
    client.export_table_to_elasticsearch(
        table,
        index_name=index_name,
        index_type_name=index_type,
        block_size=block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
def join_p1000_p10(path_10, path_1000):
    """Join the table at *path_1000* against the one at *path_10* and force-count.

    Companion to ``join_p100_p10``: the join result is only counted,
    nothing is returned or written.
    """
    small = hl.read_table(path_10)
    large = hl.read_table(path_1000)
    large.join(small)._force_count()
# NOTE(review): these statements continue the HM3 QC fragment earlier in
# the file — ukb_snps, ukb_qc, verbose, and GWAS_mfi come from there.
# The two annotate lines below duplicate the pair that ends the earlier
# fragment; confirm whether both copies are intended to run.
ukb_qc = ukb_qc.annotate(vstruct = hl.parse_variant(ukb_qc.variant))
ukb_qc = ukb_qc.annotate(locus = ukb_qc.vstruct.locus, alleles = ukb_qc.vstruct.alleles).key_by('locus','alleles')
# Join on (locus, alleles) to attach the GWAS-sample minor_AF.
ukb_qc2 = ukb_snps.join(ukb_qc.select(ukb_qc.minor_AF))
if verbose:
    print("\nCount 3: " + str(ukb_qc2.count()) + '\n')
# Keep common variants: 0.01 < MAF < 0.99. minor_AF came in via
# hl.import_table, hence the hl.float cast.
ukb_qc2 = ukb_qc2.filter(
    (hl.float(ukb_qc2.minor_AF) > 0.01) &
    (hl.float(ukb_qc2.minor_AF) < 0.99)
)
if verbose:
    print("\nCount 4: " + str(ukb_qc2.count()) + '\n')
# merge in rsid, info (from full UKB sample)
# and filter to info > 0.9
ukb_mfi = hl.read_table(GWAS_mfi).key_by('locus','alleles').repartition(500, shuffle=False)
ukb_qc3 = ukb_qc2.join(ukb_mfi.select('varid','variant','rsid','info'))
if verbose:
    print("\nCount 5: " + str(ukb_qc3.count()) + '\n')
ukb_qc3 = ukb_qc3.filter(
    (ukb_qc3.info > 0.9)
)
if verbose:
    print("\nCount 6: " + str(ukb_qc3.count()) + '\n')
# drop multi-allelic sites
# Count variants per locus; loci that appear exactly once are biallelic.
loc_count = (ukb_qc3.group_by(ukb_qc3.locus)
    .aggregate(nloc=hl.agg.count())
)
loc_count = loc_count.filter(loc_count.nloc==1)
if verbose:
def read_force_count_p1000(path):
    """Read the Hail table at *path* and force a full row count."""
    table = hl.read_table(path)
    table._force_count()