sys.exit()
logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
setup_logfile(logfile)

# mark the start of the download
src_dump = get_src_dump()
doc = {'_id': 'entrez',
       'timestamp': TIMESTAMP,
       'data_folder': DATA_FOLDER,
       'logfile': logfile,
       'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
download(DATA_FOLDER, no_confirm=no_confirm)
t_download = timesofar(t0)
t1 = time.time()
# mark the start of parsing
src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
parse_gbff(DATA_FOLDER)
t_parsing = timesofar(t1)
t_total = timesofar(t0)
# mark the whole run as finished successfully
_updates = {
    'status': 'success',
    'time': {
        'download': t_download,
        'parsing': t_parsing,
        'total': t_total
    },
    'pending_to_upload': True   # a flag to trigger data uploading
}
src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def redo_parse_gbff(path):
    '''Call this function manually to re-run the parsing step and update src_dump.
    Use it when main() broke during the parsing step and parsing needs to be
    restarted after the fix.
    '''
    src_dump = get_src_dump()
    t0 = time.time()
    t_download = timesofar(t0)   # download is skipped here, so this records ~0s
    t1 = time.time()
    # mark the start of parsing
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)
    # mark the run as finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
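# Typical manual use, per the docstring above (a sketch, assuming the same
# DATA_FOLDER constant used by main()): run from an interactive session after
# fixing whatever broke the parser, e.g.
#   >>> redo_parse_gbff(DATA_FOLDER)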
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)))
        _ids = id_list[i:i + step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        # report progress through the ids processed so far, not the chunk start
        print('Done.[%.1f%%,%s]' % (min(i + step, n) * 100. / n, timesofar(t1)))
    print("=" * 20)
    print('Finished.[total time: %s]' % timesofar(t0))
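# A minimal usage sketch (hypothetical names): b1 and b2 are assumed to be two
# doc-store backends exposing mget_from_ids(), e.g. two builds of the same
# collection, with the paired docs compared to collect changed ids:
#   changed = [d1['_id'] for d1, d2 in two_docs_iterator(b1, b2, id_list)
#              if d1 != d2]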
logging.info("done. [{}]".format(timesofar(t00)))

if changes['update']:
    logging.info("Updating {} existing docs...".format(len(changes['update'])))
    t00 = time.time()
    i = 0
    t1 = time.time()
    for _diff in changes['update']:
        target.update_diff(_diff, extra={'_timestamp': _timestamp})
        i += 1
        if i > 1 and i % step == 0:
            logging.info('\t{}\t{}'.format(i, timesofar(t1)))
            t1 = time.time()
    logging.info("done. [{}]".format(timesofar(t00)))

logging.info("\n")
logging.info("Finished. %s" % timesofar(t0))
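# For orientation (an inference from this fragment, not a documented contract):
# `changes` is expected to look roughly like
#   {'add': [...], 'delete': [...], 'update': [<diff docs>]}
# where each diff doc identifies a target _id plus the fields to patch, and
# target.update_diff() applies one such patch.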
_kwargs = kwargs.copy()
_kwargs.update(dict(size=step, index=index_name, doc_type=doc_type))
res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs)
t1 = time.time()
for doc in res:
    if verbose and cnt % step == 0:
        if cnt != 0:
            print('done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print('\t{}-{}...'.format(cnt + 1, min(cnt + step, n)), end='')
        t1 = time.time()
    yield doc
    cnt += 1
if verbose:
    print('done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
    print("Finished! [{}]".format(timesofar(t0)))
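# Hypothetical call site for the feeder above (assumptions: the enclosing
# generator is a doc_feeder-style helper, and `conn` is an
# elasticsearch.Elasticsearch client connected to the target cluster):
#   for doc in doc_feeder(conn, index_name='genedoc', doc_type='gene',
#                         step=10000, verbose=True):
#       process(doc)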
from ..ensembl.parser import EnsemblParser
from biothings.utils.hub_db import get_src_dump
ensembl_doc = get_src_dump().find_one({"_id": "ensembl"}) or {}
ensembl_dir = ensembl_doc.get("data_folder")
assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
ensembl_parser = EnsemblParser(ensembl_dir)
ensembl_parser._load_ensembl2entrez_li()
ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
for line in tabfile_feeder(os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")):
    _, ensid, transid, _ = line
    if transid in exacs:
        data = exacs.pop(transid)   # pop, so unmatched transcript ids leave no data behind
        for entrezid in ensembl2entrez.get(ensid, [ensid]):
            exacs[entrezid] = data
load_done('[%d, %s]' % (len(exacs), timesofar(t0)))
return exacs
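# Sketch of the mapping built above (list2dict semantics inferred from how it
# is used here: key the pair list by position 0, with alwayslist=True forcing
# list values), e.g.
#   list2dict([('ENSG000001', '101'), ('ENSG000001', '102')], 0, alwayslist=True)
#   -> {'ENSG000001': ['101', '102']}
# so a single Ensembl gene id can fan out to several Entrez ids.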
doc = {'_id': 'refseq',
       'timestamp': time.strftime('%Y%m%d'),
       'data_folder': DATA_FOLDER,
       'logfile': logfile,
       'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
try:
    download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
finally:
    sys.stdout.close()
# mark the download as finished successfully
_updates = {
    'status': 'success',
    'time': timesofar(t0),
    'pending_to_upload': True   # a flag to trigger data uploading
}
src_dump.update({'_id': 'refseq'}, {'$set': _updates})
            cnt_2 += 1   # cnt_2: total docs yielded
            if verbose:
                logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
        else:
            cnt_3 += 1   # cnt_3: rs skipped (no _id)
    else:
        # one rsid, one doc
        if doc['_id']:
            yield doc
            cnt_2 += 1
            if verbose:
                logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
        else:
            cnt_3 += 1
    cnt_1 += 1   # cnt_1: total rs processed
logging.info("Done. [{}]".format(timesofar(t0)))
logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))