Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _fn(x):
    """Build the taxid/symbol/name fields for one row.

    Args:
        x: indexable row. ``x[0]`` is converted with ``int``; ``x[1]`` and
            ``x[2]`` must be strings (they are stripped).

    Returns:
        dict: always contains ``'taxid'``; ``'symbol'`` and ``'name'`` are
        added only when the corresponding field is neither empty nor the
        literal two-character marker ``\\N``.

    Raises:
        ValueError: if ``x[0]`` cannot be converted to ``int``.
    """
    # Values treated as "no data": empty string and backslash-N
    # (presumably the NULL marker of the source dump -- confirm).
    missing = ('', '\\N')

    out = {'taxid': int(x[0])}

    symbol = x[1].strip()
    if symbol not in missing:
        out['symbol'] = symbol

    description = x[2].strip()
    if description not in missing:
        # SubStr is a project helper; presumably it returns the text that
        # precedes the ' [Source:' marker -- confirm against its definition.
        _name = SubStr(description, '', ' [Source:').strip()
        if _name:
            out['name'] = _name
    return out
# NOTE(review): these statements are the continuation of an enclosing generator
# (they use `self` and `yield`) whose header is outside this view; the original
# indentation is not preserved in this listing.
# Number of documents skipped so far because their id was all digits.
skip_count = 0
datafile = os.path.join(
self.data_folder, 'gene_ensembl__gene__main.txt')
# tab2dict_iter is a project helper; presumably (0, 1, 2, 7, 8) are column
# indexes and 1 selects the key column -- confirm against its definition.
for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
# Convert the raw values with _fn.
datadict = value_convert(datadict, _fn)
for id, doc in datadict.items():
if id.isdigit():
# All-digit ids are tolerated while fewer than ERR_THRESHOLD have been seen;
# after that the load is aborted with ValueError.
if skip_count < ERR_THRESHOLD:
skip_count += 1
else:
raise ValueError('Too many ensembl ids are entirely numeric')
self.logger.warning(
"Document Skipped: All-digit id {}".format(id))
continue
# Store the id inside the document before yielding it.
doc['_id'] = id
yield doc
def restructure_dict(dictionary):
    """Wrap a raw ChEBI record into a document keyed by its ChEBI ID.

    The record's ``'ChEBI ID'`` value becomes ``'_id'`` and the record itself,
    after ``clean_up``, is stored under ``'chebi'``. The document is then
    passed through ``dict_sweep``, ``unlist`` and ``value_convert``.

    Args:
        dictionary: mapping that must contain the key ``'ChEBI ID'``.

    Returns:
        The restructured document as returned by ``value_convert``.

    Raises:
        KeyError: if ``'ChEBI ID'`` is missing from ``dictionary``.
    """
    # Values handed to dict_sweep as "empty" markers.
    empty_markers = [None, ".", "-", "", "NA", "none", " ",
                     "Not Available", "unknown", "null", "None"]
    # Keys that value_convert is told to skip.
    untouched_keys = [
        "beilstein_registry_numbers",
        "pubchem_database_links",
        "pubmed_citation_links",
        "sabio_rk_database_links",
        "gmelin_registry_numbers",
        "molbase_database_links",
    ]

    document = {
        '_id': dictionary['ChEBI ID'],
        'chebi': clean_up(dictionary),
    }
    swept = dict_sweep(document, vals=empty_markers)
    return value_convert(unlist(swept), skipped_keys=untouched_keys)
# NOTE(review): fragment -- the `if` this `elif` pairs with and the enclosing
# function header are outside this view.
# Case where value['atc-code'] is a single mapping (dict or OrderedDict).
elif isinstance(value['atc-code'], dict) or isinstance(value['atc-code'], OrderedDict):
restr_atccode_dict(value['atc-code'])
# Attach the collected sections to the d1 record.
d1['atc_codes'] = atccode_list
d1['targets'] = targets_list
d1['carriers'] = carriers_list
d1['enzymes'] = enzymes_list
d1['transporters'] = transporters_list
d1['predicted_properties'] = pred_properties_dict
d1['products'] = products_list
restr_dict['drugbank'] = d1
# Post-processing through project helpers, in this order: unlist, dict_sweep
# (with the listed "empty" markers), boolean_convert (with four extra keys),
# then value_convert (skipping the listed identifier keys).
restr_dict = unlist(restr_dict)
restr_dict = dict_sweep(restr_dict,vals=[None,".", "-", "", "NA", "none", " ", "Not Available", "unknown","null","None"])
restr_dict = boolean_convert(restr_dict,added_keys=["mddr_like_rule","bioavailability","ghose_filter","rule_of_five"])
restr_dict = value_convert(restr_dict,skipped_keys=["dpd","chemspider","chebi","pubchem_compound","pubchem_substance","bindingdb"])
return restr_dict
# NOTE(review): fragment of a larger function; its header and the
# initialisation of restr_dict and _flag are outside this view.
_flag=1
# Walk the 'molecule_structures' entries and copy selected ones into
# restr_dict['chembl'] under shorter key names.
for x,y in iter(dictionary['molecule_structures'].items()):
if x == 'standard_inchi_key':
restr_dict['chembl'].update(dictionary)
restr_dict['chembl'].update({'inchi_key':y})
if x == 'canonical_smiles':
restr_dict['chembl']['smiles'] = y
if x == 'standard_inchi':
restr_dict['chembl']['inchi'] = y
# NOTE(review): _flag is set to 1 above, so whether this branch can run depends
# on control flow that is not visible here -- confirm against the full function.
if _flag == 0:
restr_dict['chembl'] = dictionary
# Remove the 'molecule_structures' entry from the 'chembl' sub-document.
del restr_dict['chembl']['molecule_structures']
# Post-processing through project helpers: unlist, dict_sweep (with the listed
# "empty" markers), value_convert (skipping two keys), then boolean_convert.
restr_dict = unlist(restr_dict)
restr_dict = dict_sweep(restr_dict, vals=[None,".", "-", "", "NA", "None","none", " ", "Not Available", "unknown","null"])
restr_dict = value_convert(restr_dict, skipped_keys=["chebi_par_id","first_approval"])
restr_dict = boolean_convert(restr_dict, added_keys=["topical","oral","parenteral",
"dosed_ingredient","polymer_flag","therapeutic_flag","med_chem_friendly","ro3_pass"])
return restr_dict
def load_ensembl2prosite(self):
    """Yield documents that carry a ``'prosite'`` field.

    Reads ``gene_ensembl__prot_profile__dm.txt`` from ``self.data_folder``
    through ``tab2dict_iter``, wraps each value as ``{'prosite': value}`` and
    yields whatever ``map_id`` produces for ``self.ensembl2entrez``.
    """
    def _as_prosite(value):
        # traverse_list=False is passed below, so presumably a list value is
        # wrapped as a whole rather than element by element -- confirm.
        return {'prosite': value}

    # Prosite
    profile_path = os.path.join(
        self.data_folder, 'gene_ensembl__prot_profile__dm.txt')
    for chunk in tab2dict_iter(profile_path, (1, 4), 0):
        wrapped = value_convert(
            dict_nodup(chunk), _as_prosite, traverse_list=False)
        for doc in map_id(wrapped, self.ensembl2entrez):
            yield doc
len(set(ensembl2x) | set(ensembl2entrez)))
# NOTE(review): the line above is the tail of a statement that begins outside
# this view; it computes the size of the union of the two id sets.
# Count of ids present in both ensembl2x and ensembl2entrez.
print('# of ensembl IDs match entrez Gene IDs: %d' %
len(set(ensembl2x) & set(ensembl2entrez)))
# Count of ids present in ensembl2x but absent from ensembl2entrez.
print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
len(set(ensembl2x) - set(ensembl2entrez)))
# all genes with matched entrez
def _fn(eid, taxid=None):
    """Return a shallow copy of the ``ensembl2x`` entry for ``eid``.

    A copy is needed because several entrezgene ids can match the same
    ensembl gene, for example ENSMUSG00000027104 --> (11909, 100047997);
    handing out the same object would cause issues in that case.

    Args:
        eid: key looked up in ``ensembl2x``; a missing key gives ``{}``.
        taxid: accepted but not used.
    """
    return copy.copy(ensembl2x.get(eid, {}))
# NOTE(review): continuation of a function whose header is outside this view.
# Convert the values of entrez2ensembl with _fn.
data = value_convert(entrez2ensembl, _fn)
# add those that have no matched entrez geneid, using the ensembl id as the key
for eid in (set(ensembl2x) - set(ensembl2entrez)):
_g = ensembl2x[eid]
#_g.update(self.ensembl_main.get(eid, {}))
data[eid] = _g
# Normalise every entry: a single dict goes through dict_nodup, anything else
# is merged with dict_attrmerge.
for id in data:
if isinstance(data[id], dict):
_doc = dict_nodup(data[id], sort=True)
else:
# if one entrez gene matches multiple ensembl genes
_doc = dict_attrmerge(data[id], removedup=True, sort=True)
data[id] = _doc
return data
# NOTE(review): tail of a function whose beginning is outside this view.
# Return a one-entry dict mapping keyname to the first element of uniq.
res = {keyname : uniq[0]}
return res
def normalize_pdb(value):
    """Return ``normalize(value, "pdb")``, i.e. normalize under the key name "pdb"."""
    keyname = "pdb"
    return normalize(value, keyname)
def normalize_pir(value):
    """Return ``normalize(value, "pir")``, i.e. normalize under the key name "pir"."""
    keyname = "pir"
    return normalize(value, keyname)
# PDB
# Convert every value of gene2pdb with normalize_pdb, then dump the result to
# 'gene2pdb.pyobj' under data_folder. traverse_list=False presumably hands a
# list value to the converter as a whole -- confirm against value_convert.
gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
dump(gene2pdb,pdb_dumpfile)
# PIR
# Same steps for gene2pir, using normalize_pir and 'gene2pir.pyobj'.
gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
dump(gene2pir,pir_dumpfile)
def transform(xli2):
    """Turn a list of rows into a list of per-gene documents.

    The rows are de-duplicated with ``list_nondup``, grouped with
    ``list2dict`` and converted with ``_dict_convert``; each resulting
    ``(gid, uniprot)`` pair becomes one document ``{"_id": gid, **uniprot}``.

    Args:
        xli2: rows accepted by ``list_nondup`` / ``list2dict``
            (project helpers; exact row layout is defined there).

    Returns:
        list[dict]: one document per key of the grouped mapping; an empty
        list when the mapping is empty.
    """
    gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)

    # The previous version peeked at the first item here with
    # ``list(gene2uniprot.items())[0]``; the result was never used and the
    # indexing raised IndexError whenever the mapping was empty.
    docs = []
    for gid, uniprot in gene2uniprot.items():
        doc = {"_id": gid}
        doc.update(uniprot)
        docs.append(doc)
    return docs
def load_ensembl2pos(self):
    """Yield documents that carry a ``'genomic_pos'`` field.

    Streams ``gene_ensembl__gene__main.txt`` from ``self.data_folder`` through
    ``tab2dict_iter``, converts every row into a position dict, wraps it under
    ``'genomic_pos'`` and yields whatever ``map_id`` produces for
    ``self.ensembl2entrez``.

    The earlier version first loaded the whole file with ``tab2dict`` into a
    local ``ensembl2pos`` mapping that was never read afterwards; that
    redundant full pass has been removed, so the file is now only streamed.

    Raises:
        ValueError: if a start, end or strand field cannot be converted
            to ``int``.
    """
    def _to_pos(x):
        # x holds the selected columns after the key; start, end and strand
        # are converted to int.
        return {
            'ensemblgene': x[0],
            'chr': x[3],
            'start': int(x[1]),
            'end': int(x[2]),
            'strand': int(x[4]),
        }

    def _wrap_pos(pos):
        # '__aslistofdict__' is passed through as-is; presumably a marker
        # consumed downstream by the merging code -- confirm.
        return {'genomic_pos': pos, '__aslistofdict__': 'genomic_pos'}

    datafile = os.path.join(
        self.data_folder, 'gene_ensembl__gene__main.txt')
    # Twice 1 because first is the dict key, the second because we need gene id within genomic_pos
    for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG):
        datadict = dict_nodup(datadict)
        datadict = value_convert(datadict, _to_pos)
        datadict = value_convert(datadict, _wrap_pos, traverse_list=False)
        for doc in map_id(datadict, self.ensembl2entrez):
            yield doc