if len(deps) == 1 and not deps[0]['child_path']:
    fields['hard_dependencies'] = 'Reference:\n %s\n' % (
        path_util.safe_join(deps[0]['parent_uuid'], deps[0]['parent_path']),)
else:
    fields['hard_dependencies'] = 'References:\n%s\n' % ('\n'.join(
        ' %s:%s' % (
            dep['child_path'],
            path_util.safe_join(dep['parent_uuid'], dep['parent_path']),
        ) for dep in sorted(deps, key=lambda dep: dep['child_path'])
    ))
# Compute a nicely-formatted failure message, if this bundle failed.
# It is possible for bundles that are not failed to have failure messages:
# for example, if a bundle is killed in the database after running for too
# long but then succeeds afterwards, it will be in this state.
fields['failure_message'] = ''
if info['state'] == State.FAILED and metadata['failure_message']:
    fields['failure_message'] = 'Failure message:\n %s\n' % ('\n '.join(
        metadata['failure_message'].split('\n')
    ))
# Return the formatted summary of the bundle info.
return '''
{bundle_type}: {name}
{description}
UUID: {uuid}
Hash: {data_hash}
State: {state}
{stats}{hard_dependencies}{failure_message}
'''.format(**fields).strip()
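
# --- Illustrative sketch, not part of the original source ---
# The 'Failure message' formatting above indents continuation lines by
# splitting on '\n' and re-joining with '\n ' so that multi-line messages
# stay aligned under the header. The same idiom in isolation, applied to a
# made-up message:
def _demo_indent_failure_message(message):
    return 'Failure message:\n %s\n' % ('\n '.join(message.split('\n')))
# _demo_indent_failure_message('out of memory\nexit code 137') returns
# 'Failure message:\n out of memory\n exit code 137\n'.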

def wait(self, bundle_spec):
    '''
    Block on the execution of the given bundle. Return READY or FAILED
    based on whether it was computed successfully.
    '''
    # Constants for a simple exponential backoff routine that will decrease the
    # frequency at which we check this bundle's state from 1s to 1m.
    period = 1.0
    backoff = 1.1
    max_period = 60.0
    info = self.info(bundle_spec)
    while info['state'] not in (State.READY, State.FAILED):
        time.sleep(period)
        period = min(backoff * period, max_period)
        info = self.info(bundle_spec)
    return info['state']
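
# --- Illustrative sketch, not part of the original source ---
# With period=1.0, backoff=1.1 and max_period=60.0, the polling interval used
# by wait() grows geometrically and then saturates at one minute. A standalone
# sketch of just the schedule (the bundle-state check itself is elided):
def _demo_backoff_schedule(steps, period=1.0, backoff=1.1, max_period=60.0):
    intervals = []
    for _ in range(steps):
        intervals.append(period)
        period = min(backoff * period, max_period)
    return intervals
# _demo_backoff_schedule(3) yields roughly [1.0, 1.1, 1.21]; the interval
# reaches the 60-second cap after about 43 iterations.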

def _do_bundle_action(self, bundle_uuid, worker_message, action_string):
    """
    Sends the message to the worker to do the bundle action, and adds the
    action string to the bundle metadata.
    """
    bundle = self.model.get_bundle(bundle_uuid)
    if bundle.state != State.RUNNING:
        raise UsageError('Cannot execute this action on a bundle that is not running.')
    worker = self.worker_model.get_bundle_worker(bundle_uuid)
    precondition(
        self.worker_model.send_json_message(worker['socket_id'], worker_message, 60),
        'Unable to reach worker.')
    new_actions = getattr(bundle.metadata, 'actions', []) + [action_string]
    db_update = {'metadata': {'actions': new_actions}}
    self.model.update_bundle(bundle, db_update)
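
# --- Illustrative sketch, not part of the original source ---
# _do_bundle_action appends to the bundle's action list via
# getattr(bundle.metadata, 'actions', []), so it also works for bundles whose
# metadata does not define 'actions' yet. The same pattern in isolation, using
# a made-up metadata object:
class _DemoMetadata(object):
    pass
def _demo_append_action(metadata, action_string):
    return getattr(metadata, 'actions', []) + [action_string]
# _demo_append_action(_DemoMetadata(), 'kill') returns ['kill'] even though
# the demo metadata object has no 'actions' attribute.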

def watch(self, bundle_spec, fns):
    '''
    Block on the execution of the given bundle.
    fns should be a list of functions that return strings.
    Periodically execute fns and print their output.
    Return READY or FAILED based on whether it was computed successfully.
    '''
    # Constants for a simple exponential backoff routine that will decrease the
    # frequency at which we check this bundle's state from 1s to 1m.
    period = 1.0
    backoff = 1.1
    max_period = 60.0
    info = self.info(bundle_spec)
    while info['state'] not in (State.READY, State.FAILED):
        # Update bundle info.
        info = self.info(bundle_spec)
        # Call the update functions and print any new output.
        change = False
        for fn in fns:
            result = fn()
            while result != '':
                change = True
                stdout.write(result)
                result = fn()
            stdout.flush()
        # Sleep only if nothing happened, and back off the polling period.
        if not change:
            time.sleep(period)
            period = min(backoff * period, max_period)
    return info['state']
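
# --- Illustrative sketch, not part of the original source ---
# watch() drains each fn by calling it until it returns '', so every poll
# prints all output that accumulated since the previous poll. A standalone
# sketch of that drain loop, with a made-up fn built from a list of chunks:
def _demo_make_fn(chunks):
    buffered = list(chunks)
    def fn():
        return buffered.pop(0) if buffered else ''
    return fn
def _demo_drain(fn):
    pieces = []
    result = fn()
    while result != '':
        pieces.append(result)
        result = fn()
    return ''.join(pieces)
# _demo_drain(_demo_make_fn(['line 1\n', 'line 2\n'])) returns 'line 1\nline 2\n'.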

def _delete_finished_torque_workers(self, workers):
    """
    Shut down Torque workers once their runs have finished. We use this
    mechanism, instead of shutting workers down automatically after they
    finish a single run, since it also handles exceptional cases such as the
    run message going missing.
    """
    running_job_handles = set()
    running_states = [State.WAITING_FOR_WORKER_STARTUP, State.STARTING, State.RUNNING]
    for bundle in self._model.batch_get_bundles(state=running_states, bundle_type='run'):
        if hasattr(bundle.metadata, 'job_handle'):
            running_job_handles.add(bundle.metadata.job_handle)
    for worker in workers.user_owned_workers(self._model.root_user_id):
        job_handle = worker['worker_id']
        if job_handle not in running_job_handles:
            if (job_handle in self._last_delete_attempt and
                    time.time() - self._last_delete_attempt[job_handle] < 60):
                # Throttle the deletes in case there is a Torque problem.
                continue
            self._last_delete_attempt[job_handle] = time.time()
            logger.info('Deleting Torque worker with handle %s', job_handle)
            # Delete the worker job.
            command = self._torque_ssh_command(['qdel', job_handle])
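
# --- Illustrative sketch, not part of the original source ---
# The delete loop above throttles qdel attempts per job handle so that a
# transient Torque problem does not trigger a burst of delete commands. The
# same throttle in isolation, assuming the last-attempt timestamps live in a
# plain dict keyed by job handle:
import time
def _demo_should_attempt_delete(last_attempt, job_handle, min_interval=60):
    now = time.time()
    if job_handle in last_attempt and now - last_attempt[job_handle] < min_interval:
        return False  # Attempted recently; skip this round.
    last_attempt[job_handle] = now
    return True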

def _fail_on_bad_torque_start(self):
    """
    Fail the bundle and clean up the Torque worker if the Torque worker
    failed to start. This can happen if:
    1) The Torque worker outputs some errors.
    2) Torque fails to schedule the worker at all, for example, when the
       user has requested too many resources.
    """
    for bundle in self._model.batch_get_bundles(state=State.WAITING_FOR_WORKER_STARTUP, bundle_type='run'):
        failure_message = self._read_torque_error_log(bundle.metadata.job_handle)
        if failure_message is None and time.time() - bundle.metadata.last_updated > 20 * 60:
            failure_message = 'Worker failed to start. You may have requested too many resources.'
        if failure_message is not None:
            logger.info('Failing %s: %s', bundle.uuid, failure_message)
            self._model.update_bundle(
                bundle, {'state': State.FAILED,
                         'metadata': {'failure_message': failure_message}})

    status['bundle'] = bundle
    new_statuses.append(status)
statuses = new_statuses
# Now that we have the bundle information and thus the temporary directory,
# we can fetch the rest of the status.
for status in statuses:
    if 'bundle_handler' in status:
        status.update(status['bundle_handler'](status['bundle']))
        del status['bundle_handler']
# Make a note of running jobs (according to the database) which aren't
# mentioned in statuses. These are probably zombies, and we want to get rid
# of them if they have been issued a kill action.
status_bundle_uuids = set(status['bundle'].uuid for status in statuses)
running_bundles = self.model.batch_get_bundles(state=State.RUNNING)
for bundle in running_bundles:
    if bundle.uuid in status_bundle_uuids: continue  # Exists, skip
    if BundleAction.KILL not in getattr(bundle.metadata, 'actions', set()): continue  # Not killing
    status = {'state': State.FAILED, 'bundle': bundle}
    print 'work_manager: %s (%s): killing zombie %s' % (bundle.uuid, bundle.state, status)
    self.update_running_bundle(status)
# Update the status of these bundles.
for status in statuses:
    bundle = status['bundle']
    if bundle.state in [State.READY, State.FAILED]:  # Skip bundles that have already completed.
        continue
    print 'work_manager: %s (%s): %s' % (bundle.uuid, bundle.state, status)
    self.update_running_bundle(status)
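
# --- Illustrative sketch, not part of the original source ---
# The zombie check above reduces to: a bundle the database still marks as
# RUNNING, that no worker reported a status for, and that has been asked to be
# killed, gets forced to FAILED. The predicate in isolation; 'kill' stands in
# for BundleAction.KILL, whose exact value is an assumption here:
def _demo_is_zombie_to_kill(bundle_uuid, bundle_actions, status_bundle_uuids, kill_action='kill'):
    return bundle_uuid not in status_bundle_uuids and kill_action in bundle_actions
# _demo_is_zombie_to_kill('0xdead', ['kill'], set(['0xbeef'])) returns True.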