How to use the biothings.utils.common.timesofar function in biothings

To help you get started, we've selected a few examples showing how timesofar is used in public biothings projects.
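All of the examples share the same pattern: capture a start time with time.time(), do the work, then pass that start time to timesofar(), which returns a compact elapsed-time string (such as '1.5s' or '3m3.42s') ready to interpolate into a log message. A minimal sketch, with a sleep standing in for real work:

import time
from biothings.utils.common import timesofar

t0 = time.time()
time.sleep(1.5)  # stand-in for a long-running task
print("Done. [%s]" % timesofar(t0))  # prints something like "Done. [1.5s]"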


From biothings/mygene.info: src/dataload/data_dump/dl_entrez.py
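In this dumper's main(), each phase is timed separately: timesofar(t0) after download() captures the download time, timesofar(t1) the parsing time, and timesofar(t0) again the total. All three strings are written back to the src_dump record.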
        sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': TIMESTAMP,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
From biothings/mygene.info: src/dataload/data_dump/dl_entrez.py
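redo_parse_gbff() re-runs only the parsing step, so timesofar(t0) is called right after t0 is set and t_download records an elapsed time of effectively zero. This keeps the record layout identical to the one main() writes.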
def redo_parse_gbff(path):
    '''Call this function manually to re-run the parsing step and update src_dump.
       Use it when main() fails during parsing and the parse needs to be re-run
       after the fix.
    '''
    # no download in this path; only the parsing step is re-run
    src_dump = get_src_dump()

    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
From biothings/myvariant.info: src/utils/diff.py
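two_docs_iterator() walks two backends in parallel batches of `step` ids, printing per-batch timings with timesofar(t1) (t1 is reset each iteration) and the total runtime with timesofar(t0) at the end.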
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)))
        _ids = id_list[i:i+step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print('Done.[%.1f%%,%s]' % (i*100./n, timesofar(t1)))
    print("="*20)
    print('Finished.[total time: %s]' % timesofar(t0))
From biothings/mygene.info: src/databuild/sync.py
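This sync routine logs an intermediate timing line every `step` applied diffs via timesofar(t1), then the elapsed time for the whole update pass (t00) and for the full run (t0).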
            logging.info("done. [{}]".format(timesofar(t00)))

        if changes['update']:
            logging.info("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            i = 0
            t1 = time.time()
            for _diff in changes['update']:
                target.update_diff(_diff, extra={'_timestamp': _timestamp})
                i += 1
                if i > 1 and i % step == 0:
                    logging.info('\t{}\t{}'.format(i, timesofar(t1)))
                    t1 = time.time()
            logging.info("done. [{}]".format(timesofar(t00)))
        logging.info("\n")
        logging.info("Finished. %s" % timesofar(t0))
From biothings/mygene.info: src/utils/es.py
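A scan/scroll iterator over an Elasticsearch index: every `step` documents it prints the batch time with timesofar(t1) plus a progress percentage, and timesofar(t0) reports the full scan time at the end.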
        _kwargs = kwargs.copy()
        _kwargs.update(dict(size=step, index=index_name, doc_type=doc_type))
        res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs)
        t1 = time.time()
        for doc in res:
            if verbose and cnt % step == 0:
                if cnt != 0:
                    print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
                print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='')
                t1 = time.time()
            yield doc
            cnt += 1
        if verbose:
            print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
            print("Finished! [{}]".format(timesofar(t0)))
From biothings/mygene.info: src/hub/dataload/sources/exac/parser.py
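The ExAC parser uses timesofar(t0) to stamp its completion message with both the number of records produced and the total parse time.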
    from ..ensembl.parser import EnsemblParser
    from biothings.utils.hub_db import get_src_dump
    ensembl_doc = get_src_dump().find_one({"_id":"ensembl"}) or {}
    ensembl_dir = ensembl_doc.get("data_folder")
    assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
    ensembl_parser = EnsemblParser(ensembl_dir)
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    for line in tabfile_feeder(os.path.join(ensembl_dir,"gene_ensembl__translation__main.txt")):
        _,ensid,transid,_ = line
        if transid in exacs:
            data = exacs.pop(transid) # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid,[ensid]):
                exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
From biothings/mygene.info: src/dataload/data_dump/dl_refseq.py
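The RefSeq dumper stores a single timesofar(t0) string covering the whole download in the src_dump record, rather than a per-phase breakdown.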
    doc = {'_id': 'refseq',
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
From biothings/myvariant.info: src/hub/dataload/sources/dbsnp/dbsnp_vcf_parser.py
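The dbSNP VCF parser logs the total iteration time with timesofar(t0) once parsing finishes, alongside its rs and document counters.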
                    cnt_2 += 1
                    if verbose:
                        logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        else:
            # one rsid, one doc
            if doc['_id']:
                yield doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        cnt_1 += 1
    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))