import glob
import logging
import os
import re
import sys
import time


def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        _, filename = os.path.split(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   # faster than wget, using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
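
# The os.system() call above builds a shell command by string interpolation,
# which is fragile if the URL or filename ever contains shell metacharacters.
# Below is a minimal sketch of a safer variant using subprocess with an
# argument list (no shell involved); the function name is illustrative, and
# url/filename are assumed to be the same values used in download() above:

import subprocess

def download_with_subprocess(url, filename):
    # Passing an argument list avoids shell interpretation entirely.
    result = subprocess.run(['wget', url, '-O', filename])
    return result.returncode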
refseq_release = get_refseq_release()
logging.info(refseq_release)

src_dump = get_src_dump()
doc = src_dump.find_one({'_id': 'refseq'})
if doc and 'release' in doc and refseq_release <= doc['release']:
    data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
    if os.path.exists(data_file):
        logging.info("No newer release found. Abort now.")
        sys.exit(0)

DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)
else:
    if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
            ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
        sys.exit(0)

logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
setup_logfile(logfile)

# mark the start of the download
doc = {'_id': 'refseq',
       'release': refseq_release,
       'timestamp': time.strftime('%Y%m%d'),
       'data_folder': DATA_FOLDER,
       'logfile': logfile,
       'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
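
# These snippets repeatedly call an ask() confirmation helper that is not
# shown above. A minimal sketch of what such a helper could look like (the
# prompt format and the 'Y'/'N' option set are assumptions, not the project's
# actual implementation):

def ask(prompt, options='YN'):
    """Keep prompting until the user answers with one of `options`."""
    options = options.upper()
    while True:
        answer = input('%s [%s] ' % (prompt, '/'.join(options))).strip().upper()
        if answer in options:
            return answer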
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']
    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print("ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE))
        if noconfirm or ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print("Aborted.")
    else:
        print("Error: target collection is not ready yet or failed to build.")
def delete_index_type(self, index_type, noconfirm=False):
    '''Delete all documents under a given index_type.'''
    index_name = self.ES_INDEX_NAME
    # Check if index_type exists
    m = self.conn.indices.get_mapping(index_name, index_type)
    if not m:
        print('Error: index type "%s" does not exist in index "%s".'
              % (index_type, index_name))
        return
    path = '/%s/%s' % (index_name, index_type)
    if noconfirm or ask('Confirm to delete all data under "%s":' % path) == 'Y':
        return self.conn.indices.delete_mapping(index=index_name,
                                                doc_type=index_type)
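
# Note: delete_mapping() is a pre-2.x Elasticsearch operation; removing a
# mapping type (and its documents) was dropped in later ES versions, where
# the usual replacement is a delete-by-query or a reindex into a fresh index.
# A hedged usage sketch for the method above (the indexer construction and
# the 'gene' doc type are assumptions based on the surrounding snippets):
#
#     es_idxer = ESIndexer(mapping=bdr.get_mapping())
#     es_idxer.delete_index_type('gene', noconfirm=True)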
# The following snippet uses `return`, so it is the body of a script entry
# point; it is wrapped in a main() function here (the wrapper name is assumed).
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_index = config + TARGET_ES_INDEX_SUFFIX
    # ES host depends on whether an SSH tunnel is in use
    with open_tunnel() as tunnel:
        if tunnel.ok:
            _es_host = 'localhost:' + str(es_local_tunnel_port)
        else:
            _es_host = ES_HOST
        esi = ESIndexer2(_es_index, es_host=_es_host)
        meta = esi.get_mapping_meta(changes)
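
# open_tunnel() above yields an object with an `ok` attribute indicating
# whether a tunnel to the ES server was established. A minimal, hypothetical
# sketch of that contract (the real implementation and its tunnel mechanics
# are not shown in these snippets):

from contextlib import contextmanager

@contextmanager
def open_tunnel_sketch(use_tunnel=False):
    class _Tunnel:
        ok = use_tunnel   # True when a tunnel is actually up
    # a real implementation would start/stop an SSH tunnel around the yield
    yield _Tunnel()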
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = 'genes.zip'
        url = GENES_URL
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        cmdline = 'wget "%s" -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   # faster than wget, using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
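
# As in the first download() above, interpolating url/filename into a shell
# command is the pattern a scanner flags as potential command injection. If
# the shell form is kept, quoting each interpolated value is a minimal
# mitigation; a sketch (the helper name is illustrative):

import shlex

def build_wget_cmdline(url, filename):
    # shlex.quote() escapes shell metacharacters in each interpolated value
    return 'wget %s -O %s' % (shlex.quote(url), shlex.quote(filename))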
logging.info("\n".join(['\t' + x for x in new_src_li]))
if no_confirm or ask('Continue?') == 'Y':
logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
logfile = os.path.join(LOG_FOLDER, logfile)
setup_logfile(logfile)
for src in new_src_li:
t0 = time.time()
logging.info("Current source collection: %s" % src)
ts = _get_timestamp(src, as_str=True)
logging.info("Calculating changes... ")
changes = sc.get_changes(src, use_parallel=use_parallel)
logging.info("Done")
get_changes_stats(changes)
if no_confirm or ask("Continue to save changes...") == 'Y':
if config == 'genedoc_mygene':
dumpfile = 'changes_{}.pyobj'.format(ts)
else:
dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
dump(changes, dumpfile)
dumpfile_key = 'genedoc_changes/' + dumpfile
logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
send_s3_file(dumpfile, dumpfile_key)
logging.info('Done.')
#os.remove(dumpfile)
if no_confirm or ask("Continue to apply changes...") == 'Y':
sc.apply_changes(changes)
sc.verify_changes(changes)
logging.info('=' * 20)
logging.info("Finished. %s" % timesofar(t0))
def clean_target_collection():
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config('mygene')
    try:
        target_collection = bdr.pick_target_collection(autoselect=False)
    except KeyboardInterrupt:
        print("Aborted.")
        return
    if ask('Delete collection "{}"?'.format(target_collection.name)) == 'Y':
        if ask("Double check! Are you sure?") == 'Y':
            target_collection.drop()
            print('Done, collection "{}" was dropped.'.format(target_collection.name))
def rename_from_temp_collection(config, from_index, no_confirm=False):
    # check that the collection exists before changing anything
    sc = GeneDocSyncer(config)
    if from_index not in sc._db.collection_names():
        logging.error("Collection '%s' does not exist" % from_index)
        return
    from_col = sc._db.get_collection(from_index)
    orig_name = sc._target_col.name

    logging.info("Backing up timestamp from '%s'" % orig_name)
    if no_confirm or ask('Continue?') == 'Y':
        bckfile = backup_timestamp_main([config]).pop()
    else:
        bckfile = None

    # rename the existing current collection for backup purposes
    bck_name = orig_name + "_bck_%s" % time.strftime('%Y%m%d%H%M%S')
    logging.info("Renaming %s to %s" % (orig_name, bck_name))
    if no_confirm or ask('Continue?') == 'Y':
        sc._target_col.rename(bck_name)

    logging.info("Renaming %s to %s" % (from_col.name, orig_name))
    if no_confirm or ask('Continue?') == 'Y':
        from_col.rename(orig_name)

    if bckfile is None:
        try:
            pat = "%s_current_tsbk_*.txt.bz" % config
            logging.info("Looking for '%s'" % pat)
            bckfile = sorted(glob.glob(pat))[0]
            if ask("Apply timestamp from file '%s' to collection '%s'?"
                   % (bckfile, sc._target_col.name)) != 'Y':
                return
        except IndexError:
            logging.error("Can't find any timestamp file to apply, giving up...")
            return
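
# The backup-then-swap above relies on MongoDB's collection rename. A minimal
# self-contained sketch of that pattern with pymongo (the function name and
# collection names are illustrative; `db` is a pymongo Database):

def swap_in_collection(db, current_name, staged_name):
    # keep the old data under a timestamped backup name, then promote staging
    bck_name = current_name + '_bck_' + time.strftime('%Y%m%d%H%M%S')
    db[current_name].rename(bck_name)
    db[staged_name].rename(current_name)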
pat = prefix + r'_(\d{8})_\w{8}'
_li = []
for index in index_li:
    mat = re.match(pat, index)
    if mat:
        _li.append((mat.group(1), index))
_li.sort()   # oldest index appears first
# keep only the newest `keep_last` indices
index_to_remove = [x[1] for x in _li[:-keep_last]]
if len(index_to_remove) > 0:
    print("{} \"{}*\" indices will be removed.".format(len(index_to_remove), prefix))
    if verbose:
        for index in index_to_remove:
            print('\t', index)
    if noconfirm or ask("Continue?") == 'Y':
        for index in index_to_remove:
            if dryrun:
                print("dryrun=True, nothing is actually deleted")
            else:
                conn.indices.delete(index)
        print("Done. [%s indices removed]" % len(index_to_remove))
    else:
        print("Aborted.")
else:
    print("Nothing needs to be removed.")