    for exp_array in exp_arrays:
        job_name = get_exp_name(exp_array[0][0], collection.name)
        output_dir_path = get_output_dir_path(exp_array[0][0])
        slurm_config = exp_array[0][0]['slurm']
        del slurm_config['experiments_per_job']
        start_slurm_job(collection, exp_array, unobserved, post_mortem,
                        name=job_name, output_dir_path=output_dir_path, **slurm_config)
else:
    login_node_name = 'fs'
    if login_node_name in os.uname()[1]:
        logging.error("Refusing to run a compute experiment on a login node. "
                      "Please use Slurm or a compute node.")
        sys.exit(1)
    [get_output_dir_path(exp) for exp in exps_list]  # Check that the output directories exist
    logging.info(f'Starting local worker thread that will run up to {nexps} experiment{s_if(nexps)}, '
                 f'until no queued experiments remain.')
    if not unobserved:
        collection.update_many({'_id': {'$in': [e['_id'] for e in exps_list]}},
                               {"$set": {"status": "PENDING"}})
    num_exceptions = 0
    tq = tqdm(enumerate(exps_list))
    for i_exp, exp in tq:
        # Write output to a per-experiment file unless console output was requested.
        if output_to_file:
            output_dir_path = get_output_dir_path(exp)
        else:
            output_dir_path = None
        success = start_local_job(collection, exp, unobserved, post_mortem, output_dir_path)
        if success is False:
            num_exceptions += 1
        tq.set_postfix(failed=f"{num_exceptions}/{i_exp} experiments")
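# The snippets above and below repeatedly use a small pluralisation helper, `s_if`. Its definition is
# not part of these snippets; a minimal sketch consistent with how it is used (adding an "s" suffix
# except for a count of exactly one) could look like this:
def s_if(n: int) -> str:
    """Return a plural 's' suffix unless the count is exactly one."""
    return '' if n == 1 else 's'

# Example: f"{3} experiment{s_if(3)}" -> "3 experiments"; f"{1} experiment{s_if(1)}" -> "1 experiment".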
def delete_experiments(db_collection_name, sacred_id, filter_states, batch_id, filter_dict):
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        # Update the status of dead experiments first so that the filter sees them as KILLED.
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(db_collection_name, print_detected=False)
        filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
        ndelete = collection.count_documents(filter_dict)
        batch_ids = collection.find(filter_dict, {'batch_id'})
        batch_ids_in_del = set([x['batch_id'] for x in batch_ids])

        if ndelete >= 10:
            if input(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            logging.info(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection.")
        collection.delete_many(filter_dict)
    else:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            logging.error(f"No experiment found with ID {sacred_id}.")
            sys.exit(1)
        else:
            logging.info(f"Deleting experiment with ID {sacred_id}.")
            batch_ids_in_del = set([exp['batch_id']])
            collection.delete_one({'_id': sacred_id})

    if len(batch_ids_in_del) > 0:
        # Clean up the uploaded sources if no experiments of a batch remain.
        delete_orphaned_sources(collection, batch_ids_in_del)
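# `delete_orphaned_sources` itself is not part of these snippets. A minimal sketch of the idea described
# in the comment above (remove the uploaded source files once no experiment of a batch remains) could
# look roughly like the following. The helper `get_source_file_ids_for_batch` and the use of GridFS for
# storing sources are illustrative assumptions, not necessarily seml's actual storage layout.
import gridfs

def delete_orphaned_sources_sketch(collection, batch_ids_in_del):
    fs = gridfs.GridFS(collection.database)
    for batch_id in batch_ids_in_del:
        if collection.count_documents({'batch_id': batch_id}) > 0:
            # The batch still has experiments left; keep its sources.
            continue
        for file_id in get_source_file_ids_for_batch(batch_id):  # hypothetical lookup helper
            fs.delete(file_id)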
    # Remember which conda environment (if any) the experiment should run in.
    if 'conda_environment' in exp['seml']:
        configs.append((exe, exp['seml']['conda_environment'], config))
    else:
        configs.append((exe, None, config))
return configs
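# Each entry appended above is an (executable, conda_environment, config) tuple. How these tuples are
# consumed is not shown in this snippet; a plausible, purely illustrative use (not seml's actual command
# construction) would be to turn each one into a Sacred-style command line:
for exe, conda_env, config in configs:
    config_args = ' '.join(f"{key}={value}" for key, value in config.items())
    prefix = f"conda run -n {conda_env} " if conda_env is not None else ""
    print(f"{prefix}python {exe} with {config_args}")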
elif slurm:
    if not output_to_file:
        logging.error("Output cannot be written to stdout in Slurm mode. "
                      "Remove the '--output-to-console' argument.")
        sys.exit(1)
    # Group the experiments into Slurm jobs and job arrays.
    exp_chunks = chunk_list(exps_list)
    exp_arrays = batch_chunks(exp_chunks)
    njobs = len(exp_chunks)
    narrays = len(exp_arrays)

    logging.info(f"Starting {nexps} experiment{s_if(nexps)} in "
                 f"{njobs} Slurm job{s_if(njobs)} in {narrays} Slurm job array{s_if(narrays)}.")
    filter_states: list of strings or None
        List of statuses to filter for. Will cancel all jobs from the database collection
        with one of the given statuses.
    batch_id: int or None
        The ID of the batch of experiments to cancel. All experiments that are queued together
        (i.e., within the same command-line call) have the same batch ID.
    filter_dict: dict or None
        Arbitrary filter dictionary to use for cancelling experiments. Any experiments whose database
        entries match all keys/values of the dictionary will be cancelled.

    Returns
    -------
    None
    """
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        # If no ID is provided, we check whether there are Slurm jobs for which no RUNNING experiment
        # remains after this action; those Slurm jobs can be killed altogether.
        # However, it is currently NOT possible to cancel a single experiment inside a Slurm job that
        # has multiple running experiments.
        try:
            if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
                detect_killed(db_collection_name, print_detected=False)
            filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
            ncancel = collection.count_documents(filter_dict)
            if ncancel >= 10:
                if input(f"Cancelling {ncancel} experiment{s_if(ncancel)}. "
                         f"Are you sure? (y/n) ").lower() != "y":
                    exit()
            else:
                logging.info(f"Cancelling {ncancel} experiment{s_if(ncancel)}.")

            filter_dict_new = filter_dict.copy()
            filter_dict_new.update({'slurm.array_id': {'$exists': True}})
            exps = list(collection.find(filter_dict_new,
                                        {'_id': 1, 'status': 1, 'slurm.array_id': 1, 'slurm.task_id': 1}))
            # Set of Slurm (array ID, task ID) pairs present in the database.
            slurm_ids = set([(e['slurm']['array_id'], e['slurm']['task_id']) for e in exps])
            # Set of experiment IDs to be cancelled.
            exp_ids = set([e['_id'] for e in exps])
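# The (array_id, task_id) pairs collected above identify individual Slurm array tasks; presumably they
# are passed to scancel further down in the function (not shown here). Slurm's scancel accepts the
# "<array_id>_<task_id>" syntax for single array tasks, so a minimal sketch of that step could be:
import subprocess

def cancel_slurm_tasks_sketch(slurm_ids):
    for array_id, task_id in slurm_ids:
        subprocess.run(['scancel', f'{array_id}_{task_id}'], check=False)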
def reset_experiments(db_collection_name, sacred_id, filter_states, batch_id, filter_dict):
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(db_collection_name, print_detected=False)
        filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
        nreset = collection.count_documents(filter_dict)
        exps = collection.find(filter_dict)

        if nreset >= 10:
            if input(f"Resetting the state of {nreset} experiment{s_if(nreset)}. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            logging.info(f"Resetting the state of {nreset} experiment{s_if(nreset)}.")
        for exp in exps:
            reset_single_experiment(collection, exp)
    else:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            logging.error(f"No experiment found with ID {sacred_id}.")
if 'torch' in sys.modules:
    import torch
    stats['pytorch'] = {}
    if torch.cuda.is_available():
        stats['pytorch']['gpu_max_memory_bytes'] = torch.cuda.max_memory_allocated()

if 'tensorflow' in sys.modules:
    import tensorflow as tf
    stats['tensorflow'] = {}
    if int(tf.__version__.split('.')[0]) < 2:
        if tf.test.is_gpu_available():
            stats['tensorflow']['gpu_max_memory_bytes'] = tf.contrib.memory_stats.MaxBytesInUse()
    else:
        if len(tf.config.experimental.list_physical_devices('GPU')) >= 1:
            logging.info("SEML stats: There is currently no way to get the actual GPU memory usage in TensorFlow 2.")

collection = db_utils.get_collection(run.config['db_collection'])
collection.update_one(
    {'_id': exp_id},
    {'$set': {'stats': stats}})
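# Once stored, the stats can be read back from the experiment's database entry with a plain MongoDB
# query, e.g. (illustrative):
entry = collection.find_one({'_id': exp_id}, {'stats': 1})
print(entry['stats'].get('pytorch', {}).get('gpu_max_memory_bytes'))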
def report_status(db_collection_name):
    detect_killed(db_collection_name, print_detected=False)
    collection = get_collection(db_collection_name)
    queued = collection.count_documents({'status': 'QUEUED'})
    pending = collection.count_documents({'status': 'PENDING'})
    failed = collection.count_documents({'status': 'FAILED'})
    killed = collection.count_documents({'status': 'KILLED'})
    interrupted = collection.count_documents({'status': 'INTERRUPTED'})
    running = collection.count_documents({'status': 'RUNNING'})
    completed = collection.count_documents({'status': 'COMPLETED'})
    title = f"********** Report for database collection '{db_collection_name}' **********"
    logging.info(title)
    logging.info(f"* - {queued:3d} queued experiment{s_if(queued)}")
    logging.info(f"* - {pending:3d} pending experiment{s_if(pending)}")
    logging.info(f"* - {running:3d} running experiment{s_if(running)}")
    logging.info(f"* - {completed:3d} completed experiment{s_if(completed)}")
    logging.info(f"* - {interrupted:3d} interrupted experiment{s_if(interrupted)}")
    logging.info(f"* - {failed:3d} failed experiment{s_if(failed)}")
    logging.info(f"* - {killed:3d} killed experiment{s_if(killed)}")
    logging.info("*" * len(title))