import itertools, glob, os
from .dbsnp_json_parser import load_data_file
import biothings.hub.dataload.uploader as uploader
from hub.dataload.uploader import SnpeffPostUpdateUploader
SRC_META = {
"url" : "https://www.ncbi.nlm.nih.gov/projects/SNP/",
"license_url" : "https://www.ncbi.nlm.nih.gov/home/about/policies/",
"license_url_short": "http://bit.ly/2AqoLOc"
}
class DBSNPBaseUploader(uploader.IgnoreDuplicatedSourceUploader,
uploader.ParallelizedSourceUploader,
SnpeffPostUpdateUploader):
    def jobs(self):
        # one upload job per chromosome-specific dump file; each 1-tuple is the
        # argument list ParallelizedSourceUploader passes to load_data()
        files = glob.glob(os.path.join(self.data_folder,"refsnp-chr*.json.bz2"))
        return [(f,) for f in files]
def load_data(self,input_file):
self.logger.info("Load data from '%s'",input_file)
return load_data_file(input_file,self.__class__.__metadata__["assembly"])
def post_update_data(self, *args, **kwargs):
super(DBSNPBaseUploader,self).post_update_data(*args,**kwargs)
self.logger.info("Indexing 'rsid'")
        # background=True, or index creation would lock the whole database...
self.collection.create_index("dbsnp.rsid",background=True)
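
# load_data() above reads self.__class__.__metadata__["assembly"], so concrete
# subclasses are expected to supply it. A minimal hedged sketch of such a
# subclass (class and source names are assumptions, not from this excerpt):
class DBSNPHg19Uploader(DBSNPBaseUploader):
    name = "dbsnp_hg19"        # hypothetical source name
    main_source = "dbsnp"
    __metadata__ = {
        "assembly": "hg19",    # consumed by DBSNPBaseUploader.load_data()
        "src_meta": SRC_META,
    }
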
import glob, os
import biothings.hub.dataload.uploader as uploader
from .exac_parser import load_data   # parser import assumed for this excerpt

class ExACBaseUploader(uploader.BaseSourceUploader):   # hypothetical wrapper class
    def load_data(self, data_folder):
        content = glob.glob(os.path.join(data_folder,"ExAC.r*.vcf"))
        if len(content) != 1:
            raise uploader.ResourceError("Expecting one single vcf file, got: %s" % repr(content))
        input_file = content.pop()
        self.logger.info("Load data from file '%s'", input_file)
        # the module-level load_data() is the ExAC parser, not this method
        return load_data(self.__class__.name, input_file)

import biothings.hub.dataload.uploader as uploader
from hub.dataload.uploader import SnpeffPostUpdateUploader
class WellderlyFactoryUploader(uploader.DummySourceUploader,SnpeffPostUpdateUploader):
"""Data originally coming from: http://www.stsiweb.org/wellderly"""
name = "wellderly"
__metadata__ = {
"mapper" : 'observed',
"assembly" : "hg19",
"src_meta" : {
"url" : "https://genomics.scripps.edu/browser/",
"license_url" : "https://genomics.scripps.edu/browser/page-help.html",
"license_url_short": "http://bit.ly/2VE6gj7"
}
}
#split_collections = ["wellderly_cg1","wellderly_cg10","wellderly_cg11",
# "wellderly_cg12","wellderly_cg13","wellderly_cg14",
# "wellderly_cg15","wellderly_cg16","wellderly_cg17",
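
# A hedged sketch of how the "Factory" in the class name could be realized:
# one uploader subclass per split collection, following the "wellderly_cg*"
# naming pattern in the commented list above. The loop and attribute values
# are assumptions, not code from this excerpt.
for chrom in [str(i) for i in range(1, 23)] + ["x", "y"]:
    classname = "WellderlyCg%sUploader" % chrom.capitalize()
    globals()[classname] = type(classname, (WellderlyFactoryUploader,), {
        "name": "wellderly_cg%s" % chrom,      # per-chromosome collection name
        "main_source": "wellderly",            # groups all splits under one source
    })
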
import os.path
from .umls_parser import load_data
import biothings.hub.dataload.uploader as uploader
class UMLSUploader(uploader.BaseSourceUploader):
name = "umls"
def load_data(self, data_folder):
umls_docs = load_data(data_folder)
return umls_docs
@classmethod
def get_mapping(klass):
mapping = {
"umls": {
"properties": {
"cui": {
"type": "keyword",
"normalizer" : "keyword_lowercase_normalizer",
                        'copy_to': ['all'],
                    }
                }
            }
        }
        return mapping
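
# The "keyword_lowercase_normalizer" referenced above must exist in the target
# Elasticsearch index settings. A minimal sketch of such a custom normalizer,
# using standard Elasticsearch syntax (the hub's actual definition is assumed,
# not shown in this excerpt):
KEYWORD_LOWERCASE_SETTINGS = {
    "settings": {
        "analysis": {
            "normalizer": {
                "keyword_lowercase_normalizer": {
                    "type": "custom",
                    "char_filter": [],
                    "filter": ["lowercase"],
                }
            }
        }
    }
}
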
'''
Populate MICROBE gene entries with genomic position data.
Currently updates the 120 microbial taxids that are NCBI Reference Sequences.
Run the get_ref_microbe_taxids() function to generate an updated file for
TAXIDS_FILE when necessary.
'''
import os.path
from biothings.utils.common import (dump, loadobj, get_timestamp)
from biothings.utils.dataload import tab2list
import biothings.hub.dataload.uploader as uploader
from biothings.utils.hub_db import get_src_dump
class EntrezGenomicPosUploader(uploader.MergerSourceUploader):
name = "entrez_genomic_pos"
main_source = "entrez"
def load_data(self, data_folder):
"""
Loads gene data from NCBI's refseq2gene.gz file.
Parses it based on genomic position data and refseq status provided by the
list of taxids from get_ref_microbe_taxids() as lookup table
:return:
"""
refsrc = get_src_dump().find_one({"_id":"ref_microbe_taxids"})
assert refsrc, "ref_microbe_taxids dump not found"
taxids_file = os.path.join(refsrc["download"]["data_folder"], "ref_microbe_taxids.pyobj")
        datafile = os.path.join(data_folder, 'gene2refseq.gz')
        # load the pickled taxid list dumped by get_ref_microbe_taxids()
        taxids = set(loadobj(taxids_file))
        # assumed continuation: parse gene2refseq.gz and keep genomic position
        # data only for genes whose taxid is in the reference microbial set
from .parser import load_cpdb
import biothings.hub.dataload.uploader as uploader
class CPDBUploader(uploader.BaseSourceUploader):
name = "cpdb"
PATHWAYS = ['biocarta','humancyc','kegg','mousecyc',
'netpath','pharmgkb','pid',
'smpdb','wikipathways','yeastcyc']
def load_data(self, data_folder):
return load_cpdb(data_folder, self.__class__.PATHWAYS)
@classmethod
def get_mapping(klass):
mapping = {
"pathway": {
"dynamic": False,
"properties": {
}
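
# A hedged sketch of one per-pathway sub-mapping, assuming each source listed
# in PATHWAYS is indexed as an object with an id and a name (the actual
# properties are not part of this excerpt):
EXAMPLE_PATHWAY_MAPPING = {
    "kegg": {
        "properties": {
            "id": {
                "type": "keyword",
                "normalizer": "keyword_lowercase_normalizer",
                "copy_to": ["all"],
            },
            "name": {
                "type": "text",
                "copy_to": ["all"],
            },
        }
    }
}
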
import glob, os, zipfile
import biothings.hub.dataload.uploader as uploader
from .grasp_parser import load_data   # parser import assumed for this excerpt

class GraspUploader(uploader.BaseSourceUploader):   # hypothetical wrapper class
    def load_data(self, data_folder):
        # there's one zip there, let's get the zipped filename
        zgrasp = glob.glob(os.path.join(data_folder,"*.zip"))
        if len(zgrasp) != 1:
            raise uploader.ResourceError("Expecting one zip only, got: %s" % repr(zgrasp))
        zgrasp = zgrasp.pop()
        zf = zipfile.ZipFile(zgrasp)
        content = [e.filename for e in zf.filelist]
        if len(content) != 1:
            raise uploader.ResourceError("Expecting only one file in the archive, got: %s" % content)
        input_file = content.pop()
        # the archive member is ignored: load the pre-sorted file, presumably
        # produced by an earlier step
        input_file = os.path.join(data_folder,"sorted")
        self.logger.info("Load data from file '%s'", input_file)
        res = load_data(input_file)
        return res
from .parser import GeneSummaryParser
import biothings.hub.dataload.uploader as uploader
class EntrezGeneSummaryUploader(uploader.MergerSourceUploader):
name = "entrez_genesummary"
main_source = "refseq"
def load_data(self, data_folder):
gene2summary = GeneSummaryParser(data_folder).load()
return gene2summary
@classmethod
def get_mapping(klass):
mapping = {
"summary": {
"type": "text",
"boost": 0.5, # downgrade summary field.
'copy_to': ['all'],
            },
        }
        return mapping