# (beginning of the enclosing function is truncated in this excerpt; the
# block below runs once per refGene line `ld`, with `refseq`, `chr`,
# `ref2exons`, `exons_key`, `data_folder` and `t0` defined in the omitted
# part)
# Pair up exon starts and ends; the ld[9] half of the zip is reconstructed
# here from the standard refGene column layout (exonStarts/exonEnds).
exons = list(zip([int(x) for x in ld[9].split(',') if x],
                 [int(x) for x in ld[10].split(',') if x]))
assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
ref2exons.setdefault(refseq, []).append({
    'transcript': refseq,
    'chr': chr,
    'strand': -1 if ld[3] == '-' else 1,
    'txstart': int(ld[4]),
    'txend': int(ld[5]),
    'cdsstart': int(ld[6]),
    'cdsend': int(ld[7]),
    'position': exons
})

# map RefSeq transcripts to Entrez gene IDs via refLink, then group the
# per-transcript exon records by gene
gene2exons = {}
reflink_file = os.path.join(data_folder, '../hgFixed/database/refLink.txt.gz')
refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
for refseq in sorted(ref2exons.keys()):
    geneid = refseq2gene.get(refseq, None)
    if geneid and geneid != '0':
        if geneid not in gene2exons:
            gene2exons[geneid] = {exons_key: ref2exons[refseq]}
        else:
            gene2exons[geneid][exons_key].extend(ref2exons[refseq])
load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
return gene2exons
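

# All of the loaders in this file lean on tab2dict from
# biothings.utils.dataload. Below is a minimal sketch of the behavior the
# snippets assume (column selection, key column, optional row filter) -- a
# simplified stand-in for reading purposes, not the library implementation.
import gzip

def tab2dict_sketch(datafile, cols, key, alwayslist=False, header=1,
                    includefn=None):
    # Read a (possibly gzipped) tab-delimited file, keep the columns in
    # `cols`, and key the result on cols[key]. `includefn` can veto rows;
    # `alwayslist` forces list values; `header` lines are skipped.
    opener = gzip.open if datafile.endswith('.gz') else open
    out = {}
    with opener(datafile, 'rt') as f:
        for _ in range(header):
            next(f, None)
        for line in f:
            ld = line.rstrip('\n').split('\t')
            if includefn and not includefn(ld):
                continue
            row = [ld[i] for i in cols]
            k = row.pop(key)
            v = row[0] if len(row) == 1 else tuple(row)
            if alwayslist:
                out.setdefault(k, []).append(v)
            elif k in out:
                prev = out[k]
                out[k] = (prev if isinstance(prev, list) else [prev]) + [v]
            else:
                out[k] = v
    return out
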
def load_broadinstitute_exac_any(one_file, key):
    logging.info("Loading file %s (%s)" % (one_file, key))
    data = tab2dict(one_file, (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16, 17, 18, 19, 20, 21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove the version suffix from the key so the dict can be searched
        # easily later
        exacs[transcript.split(".")[0]] = {
            "exac": {
                "transcript": transcript,  # but keep the version here
                "n_exons": int(tupleexac[0]),
                "cds_start": int(tupleexac[1]),
                "cds_end": int(tupleexac[2]),
                "bp": int(tupleexac[3]),
                key: {
                    "mu_syn": float(tupleexac[4]),
                    "mu_mis": float(tupleexac[5]),
                    "mu_lof": float(tupleexac[6]),
                    "n_syn": float(tupleexac[7]),
                    # ... (remaining constraint fields truncated in source)
                },
            },
        }
    return exacs
def load(self, aslist=False):
    '''
    Load the NCBI "homologene.data" file and add a "homologene" field
    to each gene doc.
    '''
    from biothings.utils.hub_db import get_src_dump
    homo_d = tab2dict(self.datafile, (2, 1), 0, header=0)
    entrez_doc = get_src_dump().find_one({"_id": "entrez"}) or {}
    entrez_dir = entrez_doc.get("download", {}).get("data_folder")
    assert entrez_dir, "Can't find Entrez data directory"
    DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
    assert os.path.exists(DATAFILE), \
        "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=lambda ld: ld[1] != '-')
    # remap retired gene IDs to their current IDs
    for id in list(homo_d.keys()):
        homo_d[retired2gene.get(id, id)] = homo_d[id]
    with open(self.datafile) as df:
        homologene_d = {}
        doc_li = []
        print()
        geneid_d = get_geneid_d(entrez_dir, self.species_li,
                                load_cache=False, save_cache=False,
                                only_for=homo_d)
        for line in df:
            ld = line.strip().split('\t')
            hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
            if (self.taxid_set is None or tax_id in self.taxid_set) \
                    and geneid in geneid_d:
                # keep selected species only, and ignore gene IDs that do
                # not match any existing gene doc
                pass  # (loop body truncated in this excerpt)
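

# A hedged sketch of the kind of grouping the omitted loop body implies,
# given the `homologene_d` accumulator above: collect (tax_id, geneid)
# pairs per HomoloGene cluster ID. Toy rows; not the original code.
rows = [(3, 9606, 34), (3, 10090, 11364), (5, 9606, 37)]
homologene_sketch = {}
for hm_id, tax_id, geneid in rows:
    homologene_sketch.setdefault(hm_id, []).append((tax_id, geneid))
print(homologene_sketch)
# {3: [(9606, 34), (10090, 11364)], 5: [(9606, 37)]}
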
"""gene_ensembl__xref_entrezgene__dm"""
CUSTOM_MAPPING_FILE = os.path.join(
self.data_folder, 'gene_ensembl__gene__extra.txt')
global extra_mapping_lock
try:
print("Trying to acquire extra mapping lock")
extra_mapping_lock.acquire()
print("Lock acquired")
if not os.path.exists(CUSTOM_MAPPING_FILE) or os.stat(CUSTOM_MAPPING_FILE).st_size == 0:
print("Missing extra mapping file, now generating")
from . import ensembl_ncbi_mapping
ensembl_ncbi_mapping.main(src_name, confirm=False)
finally:
print("Releasing lock")
extra_mapping_lock.release()
extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
datafile = os.path.join(
self.data_folder, 'gene_ensembl__xref_entrezgene__dm.txt')
# [(ensembl_gid, entrez_gid),...]
ensembl2entrez = tab2dict(
datafile, (1, 2), 0, includefn=_not_LRG, alwayslist=True)
# replace with our custom mapping
##adjusted = {}
for k in extra:
# if k in ensembl2entrez:
## adjusted[k] = {"ensembl2entrez":ensembl2entrez[k],"extra":extra[k]}
ensembl2entrez[k] = extra[k]
##import pickle
# pickle.dump(adjusted,open("/tmp/adjusted","wb"))
# back to list of tuples
ensembl2entrez_li = []
for ensembl_id, entrez_ids in ensembl2entrez.items():
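

# `_not_LRG` is referenced above but not defined in this excerpt. A minimal
# sketch of such a row filter, assuming it drops LRG_* entries; both the
# name suffix and the exact column it inspects are assumptions:
def _not_LRG_sketch(ld):
    # keep rows whose gene ID field does not look like an LRG_* record
    return not ld[1].startswith("LRG_")
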
def loaddata(data_folder):
    # GNF1H
    datafile = os.path.join(data_folder, 'gnf', 'GNF1H.ANNO7.LOAD_20130402.tab')
    gene2gnf1h = tab2dict(datafile, (0, 5), 1, header=0,
                          includefn=lambda ld: len(ld) > 5 and ld[5] != '')
    # GNF1m
    datafile = os.path.join(data_folder, 'gnf', 'gnf1m.NEW_ANNO6.LOAD_20130402.tab')
    gene2gnf1m = tab2dict(datafile, (0, 5), 1, header=0,
                          includefn=lambda ld: len(ld) > 5 and ld[5] != '')
    return {'GNF1H': gene2gnf1h,
            'GNF1M': gene2gnf1m}
def load(self, aslist=False):
    uni_d = tab2dict(self.datafile, (0, 1), 0, alwayslist=0)
    DATAFILE = os.path.join(self.data_folder, 'gene_history.gz')
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=lambda ld: ld[1] != '-')
    # remap retired gene IDs to their current IDs
    for id in list(uni_d.keys()):
        uni_d[retired2gene.get(id, id)] = uni_d[id]
    geneid_d = get_geneid_d(self.data_folder, self.species_li,
                            load_cache=False, save_cache=False,
                            only_for=uni_d)
    gene2unigene = tab2dict_iter(self.datafile, (0, 1), 0, alwayslist=0,
                                 includefn=lambda ld: int(ld[0]) in geneid_d)
    cnt = 0
    for doc in gene2unigene:
        yield self.format(doc)
        cnt += 1
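

# tab2dict_iter is the streaming counterpart used above. A minimal sketch
# of the contract assumed here -- yield one small per-key dict at a time
# instead of materializing the whole mapping; the exact doc shape and any
# grouping of consecutive rows are assumptions:
def tab2dict_iter_sketch(datafile, cols, key, alwayslist=False,
                         includefn=None):
    with open(datafile) as f:
        for line in f:
            ld = line.rstrip('\n').split('\t')
            if includefn and not includefn(ld):
                continue
            row = [ld[i] for i in cols]
            k = row.pop(key)
            yield {k: row[0] if len(row) == 1 else tuple(row)}
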
def load_pharmgkb(data_folder):
    datafile = os.path.join(data_folder, 'genes.zip')
    gene2pharmgkb = tab2dict((datafile, 'genes.tsv'), (0, 1), 1, header=1,
                             includefn=lambda ld: ld[1] != '')
    fn = lambda value: {'pharmgkb': value}
    gene2pharmgkb = value_convert(gene2pharmgkb, fn, traverse_list=False)
    return gene2pharmgkb
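

# value_convert also comes from biothings.utils.dataload. A simplified
# stand-in for the behavior relied on above (flat dicts only; the real
# helper handles more cases):
def value_convert_sketch(d, fn, traverse_list=True):
    # Apply fn to every value; with traverse_list=True, apply it to each
    # element of list values instead of the list as a whole.
    out = {}
    for k, v in d.items():
        if traverse_list and isinstance(v, list):
            out[k] = [fn(x) for x in v]
        else:
            out[k] = fn(v)
    return out

# With traverse_list=False (as above), a whole list value is wrapped once:
print(value_convert_sketch({'5243': ['PA267', 'ABCB1']},
                           lambda v: {'pharmgkb': v}, traverse_list=False))
# {'5243': {'pharmgkb': ['PA267', 'ABCB1']}}
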
def _load_ensembl_2taxid(self):
    """ensembl2taxid"""
    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__translation__main.txt')
    ensembl2taxid = dict_nodup(
        tab2dict(datafile, (0, 1), 1, includefn=_not_LRG))
    # need to convert taxid to an integer here
    ensembl2taxid = value_convert(ensembl2taxid, lambda x: int(x))
    return ensembl2taxid
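

# dict_nodup is another biothings.utils.dataload helper. A hedged sketch of
# the behavior assumed above -- dedupe list values and collapse singletons;
# the real helper may differ in details such as ordering:
def dict_nodup_sketch(d):
    out = {}
    for k, v in d.items():
        if isinstance(v, list):
            v = list(dict.fromkeys(v))      # dedupe, preserving order
            v = v[0] if len(v) == 1 else v  # collapse single-element lists
        out[k] = v
    return out

print(dict_nodup_sketch({'ENSG01': ['9606', '9606'],
                         'ENSG02': ['10090', '9606']}))
# {'ENSG01': '9606', 'ENSG02': ['10090', '9606']}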