# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def main(no_confirm=True):
    """Dump the UCSC data source.

    Checks for files newer than the last recorded dump, downloads them,
    and tracks progress in the ``src_dump`` collection (status
    'downloading' -> 'success', with ``pending_to_upload`` set to
    trigger the upload step).

    :param no_confirm: when True, run without interactive confirmation.

    NOTE(review): recovered from a whitespace-mangled snippet.
    Indentation has been restored and the truncated ``_updates`` tail
    completed with the conventional BioThings "mark success" update —
    confirm against the original source file.
    """
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)
    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)
    # mark the download starts
    # TODO(review): ``timestamp`` and ``latest_lastmodified`` are free
    # names here — presumably module-level globals in the original file;
    # verify against the full source.
    doc = {'_id': 'ucsc',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': latest_lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    # NOTE(review): closing update call reconstructed from the pattern
    # visible at L52 (entrez fragment) — confirm against the original.
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
def main():
    """Dump the Entrez (NCBI gene) data source.

    Optionally wipes the previous dump folder, downloads the Entrez
    files, then parses the GenBank flat files, tracking progress in the
    ``src_dump`` collection ('downloading' -> 'parsing').

    NOTE(review): recovered from a whitespace-mangled snippet —
    indentation restored, logic unchanged.  The fragment ends after
    ``parse_gbff``; the original presumably marks success afterwards.
    Also note this shadows the other ``main`` defined above, an
    artifact of concatenating several dumper scripts into one file.
    """
    no_confirm = True  # set it to True for running this script automatically without intervention.
    if not ARCHIVE_DATA:
        rmdashfr(DATA_FOLDER)
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()
    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)
    # mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': TIMESTAMP,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    # mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
# --- FRAGMENT (review): body of an ExAC dumper main(). The enclosing
# ``def`` line is not visible, all indentation has been lost, and the
# trailing ``_updates`` dict is truncated — this span is not valid
# Python as-is. Code left byte-identical; comments only.
# Free names (``src_dump``, ``lastmodified``, ``DATAFILES_PATH``,
# ``DATA_FOLDER``, ``no_confirm``, ``ask``, ``setup_logfile``,
# ``timestamp``, ``download``, ``timesofar``) are presumably defined
# earlier in the original file — verify against the full source.
doc = src_dump.find_one({'_id': 'exac'})
# Skip the dump when the remote file is not newer than the last
# recorded dump AND the previously downloaded file still exists.
if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
path, filename = os.path.split(DATAFILES_PATH[0])
data_file = os.path.join(doc['data_folder'], filename)
if os.path.exists(data_file):
logging.info("No newer file found. Abort now.")
sys.exit(0)
if not os.path.exists(DATA_FOLDER):
os.makedirs(DATA_FOLDER)
else:
# Non-empty data folder: proceed only when unattended or confirmed.
if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
sys.exit(0)
logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
setup_logfile(logfile)
#mark the download starts
doc = {'_id': 'exac',
'timestamp': timestamp,
'data_folder': DATA_FOLDER,
'lastmodified': lastmodified,
'logfile': logfile,
'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
download(no_confirm)
#mark the download finished successfully
# NOTE(review): dict truncated here in the snippet — the original
# presumably closes it and writes it back via ``src_dump.update``.
_updates = {
'status': 'success',
'time': timesofar(t0),
'pending_to_upload': True # a flag to trigger data uploading
# --- FRAGMENT (review): body of a PharmGKB dumper main(). The enclosing
# ``def`` line and the preceding "is it newer?" check are not visible,
# indentation has been lost, and the trailing ``_updates`` dict is
# truncated — not valid Python as-is. Code left byte-identical.
# Abort when the previously downloaded genes.zip is still present
# (i.e. nothing newer to fetch).
data_file = os.path.join(doc['data_folder'], 'genes.zip')
if os.path.exists(data_file):
logging.info("No newer file found. Abort now.")
sys.exit(0)
# Optionally wipe the previous dump folder before re-downloading.
if not ARCHIVE_DATA:
rmdashfr(DATA_FOLDER)
if not os.path.exists(DATA_FOLDER):
os.makedirs(DATA_FOLDER)
else:
# Non-empty data folder: proceed only when unattended or confirmed.
if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
sys.exit(0)
logfile = os.path.join(DATA_FOLDER, 'pharmgkb_dump.log')
setup_logfile(logfile)
#mark the download starts
# NOTE(review): ``TIMESTAMP`` and ``lastmodified`` are free names —
# presumably module-level in the original file; verify.
doc = {'_id': 'pharmgkb',
'timestamp': TIMESTAMP,
'data_folder': DATA_FOLDER,
'lastmodified': lastmodified,
'logfile': logfile,
'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
download(no_confirm)
#mark the download finished successfully
# NOTE(review): dict truncated here in the snippet — the original
# presumably closes it and writes it back via ``src_dump.update``.
_updates = {
'status': 'success',
'time': timesofar(t0),
'pending_to_upload': True # a flag to trigger data uploading
# --- FRAGMENT (review): body of a UniProt dumper main(). The enclosing
# ``def`` line is not visible, indentation has been lost, and the
# trailing ``_updates`` dict is truncated — not valid Python as-is.
# Code left byte-identical; comments only.
# Abort when the previously downloaded file is still present
# (``filename`` is a free name, presumably derived earlier — verify).
data_file = os.path.join(doc['data_folder'], filename)
if os.path.exists(data_file):
logging.info("No newer file found. Abort now.")
sys.exit(0)
# Optionally wipe the previous dump folder before re-downloading.
if not ARCHIVE_DATA:
rmdashfr(DATA_FOLDER)
if not os.path.exists(DATA_FOLDER):
os.makedirs(DATA_FOLDER)
else:
# Non-empty data folder: proceed only when unattended or confirmed.
if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
sys.exit(0)
logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
setup_logfile(logfile)
#mark the download starts
doc = {'_id': 'uniprot',
'timestamp': TIMESTAMP,
'data_folder': DATA_FOLDER,
'lastmodified': lastmodified,
'logfile': logfile,
'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
download(no_confirm)
#mark the download finished successfully
# NOTE(review): dict truncated here in the snippet — the original
# presumably closes it and writes it back via ``src_dump.update``.
_updates = {
'status': 'success',
'time': timesofar(t0),
'pending_to_upload': True # a flag to trigger data uploading
# --- FRAGMENT (review): body of an Ensembl/BioMart dumper main(). The
# enclosing ``def`` line is not visible, indentation has been lost, and
# the ``try:`` block below has no visible except/finally — not valid
# Python as-is. Code left byte-identical; comments only.
doc = src_dump.find_one({'_id': 'ensembl'})
# Skip the dump when the recorded release is already at or beyond the
# target ``mart_version`` AND the main dump file still exists on disk.
if doc and 'release' in doc and mart_version <= doc['release']:
data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
if os.path.exists(data_file):
logging.info("No newer release found. Abort now.")
sys.exit(0)
# Each release gets its own sub-folder under ENSEMBL_FOLDER.
DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
if not os.path.exists(DATA_FOLDER):
os.makedirs(DATA_FOLDER)
else:
# Non-empty data folder: proceed only when unattended or confirmed.
if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
sys.exit(0)
logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
setup_logfile(logfile)
#mark the download starts
doc = {'_id': 'ensembl',
'release': mart_version,
'timestamp': time.strftime('%Y%m%d'),
'data_folder': DATA_FOLDER,
'logfile': logfile,
'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
# Fetch the BioMart dumps for all species of this release.
# NOTE(review): ``try`` has no handler in this snippet — the original
# presumably catches failures and marks the dump as failed; verify.
try:
BM = BioMart()
BM.species_li = get_all_species(mart_version)
BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
# --- FRAGMENT (review): this function is cut off at the end of the
# visible snippet and its indentation has been lost — not valid Python
# as-is. Code left byte-identical; comments only.
# Purpose (from visible code): find new temp source collections via
# GeneDocSyncer, compute per-collection changes, and dump them to
# ``changes_*.pyobj`` files, prompting between steps unless
# ``no_confirm`` is set.
def update_from_temp_collections(config,no_confirm=False,use_parallel=False):
t0 = time.time()
sc = GeneDocSyncer(config)
new_src_li = sc.get_new_source_list()
if not new_src_li:
logging.info("No new source collections need to update. Abort now.")
return
logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
logging.info("\n".join(['\t' + x for x in new_src_li]))
if no_confirm or ask('Continue?') == 'Y':
logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
logfile = os.path.join(LOG_FOLDER, logfile)
setup_logfile(logfile)
for src in new_src_li:
t0 = time.time()
logging.info("Current source collection: %s" % src)
ts = _get_timestamp(src, as_str=True)
logging.info("Calculating changes... ")
changes = sc.get_changes(src, use_parallel=use_parallel)
logging.info("Done")
get_changes_stats(changes)
if no_confirm or ask("Continue to save changes...") == 'Y':
# Dump filename encodes the timestamp; the mygene config gets the
# short name, everything else the "_allspecies" variant.
if config == 'genedoc_mygene':
dumpfile = 'changes_{}.pyobj'.format(ts)
else:
dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
dump(changes, dumpfile)
# NOTE(review): snippet truncated here — the original presumably
# uploads ``dumpfile_key`` somewhere; skip guessing the tail.
dumpfile_key = 'genedoc_changes/' + dumpfile