# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"step" : "rebuild",
"description" : ""}
self.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs,tgt))
for src in srcs:
# src can be a full name (eg. clinvar.clinvar_hg38) but id_feeder knows only name (clinvar_hg38)
if "." in src:
src = src.split(".")[1]
self.logger.info("Rebuilding cache for source '%s'" % src)
col = mongo.get_src_db()[src]
pinfo["source"] = src
job = yield from self.managers["job_manager"].defer_to_thread(pinfo, partial(rebuild,col))
yield from job
self.logger.info("Done rebuilding cache for source '%s'" % src)
if tgt:
self.logger.info("Rebuilding cache for target '%s'" % tgt)
col = mongo.get_target_db()[tgt]
pinfo["source"] = tgt
job = self.managers["job_manager"].defer_to_thread(pinfo, partial(rebuild,col))
yield from job
def test():
    """Build and return a (mongo source, ES target) backend pair for sync testing.

    The source is a hard-coded merged "genedoc" collection from the target db;
    the target is an ES backend wrapping a fresh indexer.
    """
    tgt_db = get_target_db()
    source_backend = backend.GeneDocMongoDBBackend(
        tgt_db['genedoc_mygene_allspecies_20130402_uiu7bkyi'])
    es_indexer = ESIndexer()
    target_backend = backend.GeneDocESBackend(es_indexer)
    return source_backend, target_backend
def chrom_worker(col_name, ids):
    """Batch worker: normalize the "chrom" field for a batch of documents.

    Fetches docs by _id from target collection `col_name`, derives a chromosome
    value for each via get_chrom(), and queues a "$set" of "chrom" through an
    unordered bulk operation. Docs with no chromosome info are collected in
    `missing`, docs whose sources disagree in `disagreed`.
    """
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    # NOTE(review): initialize_unordered_bulk_op() is deprecated (removed in
    # pymongo 4, replaced by Collection.bulk_write) -- confirm pinned pymongo
    bob = col.initialize_unordered_bulk_op()
    disagreed = []   # _ids where sources disagree on the chromosome
    missing = []     # _ids with no chromosome information at all
    root_keys = {}   # NOTE(review): never populated in the visible code
    at_least_one = False  # NOTE(review): never updated in the visible code
    for doc in cur:
        # presumably returns {"chrom": <value or None>, "agreed": <bool>},
        # inferred from the keys accessed below -- confirm against get_chrom()
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] is False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom" : chrom}})
    # NOTE(review): the bulk op is never executed and missing/disagreed are
    # never returned in the visible span -- looks truncated, verify upstream
def get_stats(self,sources,job_manager):
    """Compute merge statistics, enriched with mygene-specific gene counts.

    Extends the base builder stats with "total_genes" and "total_entrez_genes"
    computed from the merged target collection.
    """
    self.stats = super(MyGeneDataBuilder,self).get_stats(sources,job_manager)
    # enrich with some specific mygene counts, specially regarding ensembl vs. entrez
    tgt = mongo.get_target_db()[self.target_name]
    # NOTE(review): Collection.count()/Cursor.count() were removed in pymongo 4
    # (use count_documents()) -- confirm the pinned pymongo version
    self.stats["total_genes"] = tgt.count()
    # entrez genes are digits only (also, don't count entrez_gene collection,
    # because tgt can be a subset, we have to work with the merged collection)
    self.logger.debug("Counting 'total_entrez_genes'")
    # docs carrying an "entrezgene" key are counted as entrez genes
    entrez_cnt = tgt.find({"entrezgene":{"$exists":1}},{"_id":1}).count()
    self.stats["total_entrez_genes"] = entrez_cnt
    # ensembl genes aount are taken from :
    # 1. "ensembl" field, but it can a list => use aggregation.
    #    Note: "ensembl.0" means first element of the list, so it implicitely
    #    select doc with a list. Finally, filtering with {$type:"array"} doesn't work because
    #    mongo filters this on the most inner field (that's weird, but it is what is it...)
    # 2. when document is root doc coming from ensembl_gene collection without a "ensembl" key ("orphan")
    #    Note: we can't create a sparce or conditional index to help querying "ensembl"
    #    because data is too long for an index key, and "hashed" mode doesn't work because list aren't supported
    #    Queries are gonna use colscan strategy...
    self.logger.debug("Counting 'total_ensembl_genes'")
    # NOTE(review): the ensembl count described above is not computed in the
    # visible span and self.stats is not returned here -- likely truncated
def post_merge(self, source_names, batch_size, job_manager):
    """Post-merge hook: create mongo indexes on the merged target collection.

    Indexes "taxid" and "entrezgene" on the collection named by
    self.target_name.
    """
    merged = mongo.get_target_db()[self.target_name]
    # background=true or it'll lock the whole database...
    for field in ("taxid", "entrezgene"):
        self.logger.info("Indexing '%s'" % field)
        merged.create_index(field, background=True)
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two "genedoc" sources (mongo collections and/or
    ES indices) in order to diff them.

    Builds a list of (name, doc_count, backend_kind) candidates from both the
    mongo target db and ES, then prompts the user for two picks.
    """
    src_li = []
    target_db = get_target_db()
    # candidate mongo "genedoc" collections, with their doc counts
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])
    # candidate ES "genedoc" indices, with their doc counts
    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))
    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    # BUGFIX: a bare "print" is a no-op expression in Python 3 (this file uses
    # "yield from", so it is py3); call print() to actually emit the blank line
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")
    sync_li = []
def apply_changes(self, changes, verify=True, noconfirm=False):
    """Apply a diff/changes object from a mongo source to the ES backend.

    :param changes: dict carrying at least "source" (mongo collection name)
                    and "timestamp" keys
    :param verify: when True, run pre_verify_changes() before applying
    :param noconfirm: when True, skip the interactive confirmation prompt
    :return: -1 when the user aborts at the confirmation prompt
    """
    if verify:
        self.pre_verify_changes(changes)
    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1
    step = self.step  # chunk size used for mget/insert batches
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']
    def _add_docs(ids):
        # Push docs (looked up by _id in the mongo source) to ES in chunks of
        # `step`, stamping each doc with the changes' timestamp.
        i = 0  # cumulative doc count across chunks
        for _ids in iter_n(ids, step):
            t1 = time.time()  # per-chunk timer, reported by timesofar() below
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))
def diff_two(col_1, col_2, use_parallel=True):
    """Diff two genedoc mongo collections from the target db and return
    the result of diff_collections()."""
    db = get_target_db()
    backend_one = GeneDocMongoDBBackend(db[col_1])
    backend_two = GeneDocMongoDBBackend(db[col_2])
    return diff_collections(backend_one, backend_two, use_parallel=use_parallel)