    for exp_array in exp_arrays:
        job_name = get_exp_name(exp_array[0][0], collection.name)
        output_dir_path = get_output_dir_path(exp_array[0][0])
        slurm_config = exp_array[0][0]['slurm']
        del slurm_config['experiments_per_job']
        start_slurm_job(collection, exp_array, unobserved, post_mortem,
                        name=job_name, output_dir_path=output_dir_path, **slurm_config)
else:
    login_node_name = 'fs'
    if login_node_name in os.uname()[1]:
        logging.error("Refusing to run a compute experiment on a login node. "
                      "Please use Slurm or a compute node.")
        sys.exit(1)
    [get_output_dir_path(exp) for exp in exps_list]  # Check that the output directories exist
    logging.info(f'Starting local worker thread that will run up to {nexps} experiment{s_if(nexps)}, '
                 f'until no queued experiments remain.')
    if not unobserved:
        collection.update_many({'_id': {'$in': [e['_id'] for e in exps_list]}},
                               {"$set": {"status": "PENDING"}})
    num_exceptions = 0
    tq = tqdm(enumerate(exps_list))
    for i_exp, exp in tq:
        # Write output to a per-experiment file unless console output was requested.
        if output_to_file:
            output_dir_path = get_output_dir_path(exp)
        else:
            output_dir_path = None
        success = start_local_job(collection, exp, unobserved, post_mortem, output_dir_path)
        if success is False:
            num_exceptions += 1
        tq.set_postfix(failed=f"{num_exceptions}/{i_exp} experiments")
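# The snippets above and below repeatedly use a small pluralisation helper, `s_if`. Its definition is
# not part of these snippets; a minimal sketch consistent with how it is used (adding an "s" suffix
# except for a count of exactly one) could look like this:
def s_if(n: int) -> str:
    """Return a plural 's' suffix unless the count is exactly one."""
    return '' if n == 1 else 's'

# Example: f"{3} experiment{s_if(3)}" -> "3 experiments"; f"{1} experiment{s_if(1)}" -> "1 experiment".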
def delete_experiments(db_collection_name, sacred_id, filter_states, batch_id, filter_dict):
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        # Update the status of dead experiments first so that the filter sees them as KILLED.
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(db_collection_name, print_detected=False)
        filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
        ndelete = collection.count_documents(filter_dict)
        batch_ids = collection.find(filter_dict, {'batch_id'})
        batch_ids_in_del = set([x['batch_id'] for x in batch_ids])

        if ndelete >= 10:
            if input(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            logging.info(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection.")
        collection.delete_many(filter_dict)
    else:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            logging.error(f"No experiment found with ID {sacred_id}.")
            sys.exit(1)
        else:
            logging.info(f"Deleting experiment with ID {sacred_id}.")
            batch_ids_in_del = set([exp['batch_id']])
            collection.delete_one({'_id': sacred_id})

    if len(batch_ids_in_del) > 0:
        # Clean up the uploaded sources if no experiments of a batch remain.
        delete_orphaned_sources(collection, batch_ids_in_del)
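# `delete_orphaned_sources` itself is not part of these snippets. A minimal sketch of the idea described
# in the comment above (remove the uploaded source files once no experiment of a batch remains) could
# look roughly like the following. The helper `get_source_file_ids_for_batch` and the use of GridFS for
# storing sources are illustrative assumptions, not necessarily seml's actual storage layout.
import gridfs

def delete_orphaned_sources_sketch(collection, batch_ids_in_del):
    fs = gridfs.GridFS(collection.database)
    for batch_id in batch_ids_in_del:
        if collection.count_documents({'batch_id': batch_id}) > 0:
            # The batch still has experiments left; keep its sources.
            continue
        for file_id in get_source_file_ids_for_batch(batch_id):  # hypothetical lookup helper
            fs.delete(file_id)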
    # Remember which conda environment (if any) the experiment should run in.
    if 'conda_environment' in exp['seml']:
        configs.append((exe, exp['seml']['conda_environment'], config))
    else:
        configs.append((exe, None, config))
return configs
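# Each entry appended above is an (executable, conda_environment, config) tuple. How these tuples are
# consumed is not shown in this snippet; a plausible, purely illustrative use (not seml's actual command
# construction) would be to turn each one into a Sacred-style command line:
for exe, conda_env, config in configs:
    config_args = ' '.join(f"{key}={value}" for key, value in config.items())
    prefix = f"conda run -n {conda_env} " if conda_env is not None else ""
    print(f"{prefix}python {exe} with {config_args}")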
elif slurm:
    if not output_to_file:
        logging.error("Output cannot be written to stdout in Slurm mode. "
                      "Remove the '--output-to-console' argument.")
        sys.exit(1)
    # Group the experiments into Slurm jobs and job arrays.
    exp_chunks = chunk_list(exps_list)
    exp_arrays = batch_chunks(exp_chunks)
    njobs = len(exp_chunks)
    narrays = len(exp_arrays)

    logging.info(f"Starting {nexps} experiment{s_if(nexps)} in "
                 f"{njobs} Slurm job{s_if(njobs)} in {narrays} Slurm job array{s_if(narrays)}.")
    filter_states: list of strings or None
        List of statuses to filter for. Will cancel all jobs from the database collection
        with one of the given statuses.
    batch_id: int or None
        The ID of the batch of experiments to cancel. All experiments that are queued together
        (i.e., within the same command-line call) have the same batch ID.
    filter_dict: dict or None
        Arbitrary filter dictionary to use for cancelling experiments. Any experiments whose database
        entries match all keys/values of the dictionary will be cancelled.

    Returns
    -------
    None
    """
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        # If no ID is provided, we check whether there are Slurm jobs for which no RUNNING experiment
        # remains after this action; those Slurm jobs can be killed altogether.
        # However, it is currently NOT possible to cancel a single experiment inside a Slurm job that
        # has multiple running experiments.
        try:
            if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
                detect_killed(db_collection_name, print_detected=False)
            filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
            ncancel = collection.count_documents(filter_dict)
            if ncancel >= 10:
                if input(f"Cancelling {ncancel} experiment{s_if(ncancel)}. "
                         f"Are you sure? (y/n) ").lower() != "y":
                    exit()
            else:
                logging.info(f"Cancelling {ncancel} experiment{s_if(ncancel)}.")

            filter_dict_new = filter_dict.copy()
            filter_dict_new.update({'slurm.array_id': {'$exists': True}})
            exps = list(collection.find(filter_dict_new,
                                        {'_id': 1, 'status': 1, 'slurm.array_id': 1, 'slurm.task_id': 1}))
            # Set of Slurm (array ID, task ID) pairs present in the database.
            slurm_ids = set([(e['slurm']['array_id'], e['slurm']['task_id']) for e in exps])
            # Set of experiment IDs to be cancelled.
            exp_ids = set([e['_id'] for e in exps])
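# The (array_id, task_id) pairs collected above identify individual Slurm array tasks; presumably they
# are passed to scancel further down in the function (not shown here). Slurm's scancel accepts the
# "<array_id>_<task_id>" syntax for single array tasks, so a minimal sketch of that step could be:
import subprocess

def cancel_slurm_tasks_sketch(slurm_ids):
    for array_id, task_id in slurm_ids:
        subprocess.run(['scancel', f'{array_id}_{task_id}'], check=False)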
def reset_experiments(db_collection_name, sacred_id, filter_states, batch_id, filter_dict):
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(db_collection_name, print_detected=False)
        filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
        nreset = collection.count_documents(filter_dict)
        exps = collection.find(filter_dict)

        if nreset >= 10:
            if input(f"Resetting the state of {nreset} experiment{s_if(nreset)}. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            logging.info(f"Resetting the state of {nreset} experiment{s_if(nreset)}.")
        for exp in exps:
            reset_single_experiment(collection, exp)
    else:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            logging.error(f"No experiment found with ID {sacred_id}.")
if 'torch' in sys.modules:
    import torch
    stats['pytorch'] = {}
    if torch.cuda.is_available():
        stats['pytorch']['gpu_max_memory_bytes'] = torch.cuda.max_memory_allocated()

if 'tensorflow' in sys.modules:
    import tensorflow as tf
    stats['tensorflow'] = {}
    if int(tf.__version__.split('.')[0]) < 2:
        if tf.test.is_gpu_available():
            stats['tensorflow']['gpu_max_memory_bytes'] = tf.contrib.memory_stats.MaxBytesInUse()
    else:
        if len(tf.config.experimental.list_physical_devices('GPU')) >= 1:
            logging.info("SEML stats: There is currently no way to get the actual GPU memory usage in TensorFlow 2.")

collection = db_utils.get_collection(run.config['db_collection'])
collection.update_one(
    {'_id': exp_id},
    {'$set': {'stats': stats}})
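# Once stored, the stats can be read back from the experiment's database entry with a plain MongoDB
# query, e.g. (illustrative):
entry = collection.find_one({'_id': exp_id}, {'stats': 1})
print(entry['stats'].get('pytorch', {}).get('gpu_max_memory_bytes'))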
def report_status(db_collection_name):
    detect_killed(db_collection_name, print_detected=False)
    collection = get_collection(db_collection_name)
    queued = collection.count_documents({'status': 'QUEUED'})
    pending = collection.count_documents({'status': 'PENDING'})
    failed = collection.count_documents({'status': 'FAILED'})
    killed = collection.count_documents({'status': 'KILLED'})
    interrupted = collection.count_documents({'status': 'INTERRUPTED'})
    running = collection.count_documents({'status': 'RUNNING'})
    completed = collection.count_documents({'status': 'COMPLETED'})
    title = f"********** Report for database collection '{db_collection_name}' **********"
    logging.info(title)
    logging.info(f"* - {queued:3d} queued experiment{s_if(queued)}")
    logging.info(f"* - {pending:3d} pending experiment{s_if(pending)}")
    logging.info(f"* - {running:3d} running experiment{s_if(running)}")
    logging.info(f"* - {completed:3d} completed experiment{s_if(completed)}")
    logging.info(f"* - {interrupted:3d} interrupted experiment{s_if(interrupted)}")
    logging.info(f"* - {failed:3d} failed experiment{s_if(failed)}")
    logging.info(f"* - {killed:3d} killed experiment{s_if(killed)}")
    logging.info("*" * len(title))