Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_reservation_enviroment_not_exists_get_server_ip_return_actual_host_ip(self):
    # When no TFOS_SERVER_HOST override exists in the environment,
    # get_server_ip() should fall back to this host's own IP address.
    server = Server(5)
    expected_ip = util.get_ip_address()
    assert server.get_server_ip() == expected_ip
def get_server_ip(self):
    """Return the host address this server should use.

    Prefers the ``TFOS_SERVER_HOST`` environment variable when it is set
    to a non-empty value; otherwise falls back to this machine's own IP
    address as reported by ``util.get_ip_address()``.
    """
    # Read the env var once: the original called os.getenv() twice, which
    # was redundant and racy if the environment changed between the two
    # reads. `or` preserves the truthiness semantics exactly (an unset or
    # empty value falls back to the local IP).
    return os.getenv(TFOS_SERVER_HOST) or util.get_ip_address()
def _train(iter):
# Feed one Spark RDD partition into this node's shared TFManager input
# queue for training.
# NOTE(review): indentation appears to have been stripped from this chunk,
# and the `else` branch below only logs — the actual item-feeding loop is
# presumably cut off by the chunk boundary; confirm against the full file.
# get shared queue, reconnecting if necessary
mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id())
try:
queue = mgr.get_queue(qname)
equeue = mgr.get_queue('error')
except (AttributeError, KeyError):
# Either queue is missing on this node: surface a hint that the real
# failure likely happened on a different executor.
msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname)
raise Exception(msg)
state = str(mgr.get('state'))
logger.info("mgr.state={0}".format(state))
# The manager state is stored repr()-style, hence the quoted comparison.
terminating = state == "'terminating'"
if terminating:
# Cluster is shutting down: drain the partition without feeding it so
# Spark considers the task complete.
logger.info("mgr is terminating, skipping partition")
count = sum(1 for item in iter)
logger.info("Skipped {0} items from partition".format(count))
else:
logger.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue))
def _inference(iter):
# Feed one Spark RDD partition into this node's shared TFManager input
# queue for inference, then mark the end of the partition.
# NOTE(review): `count` is computed but unused in the visible span — the
# result-collection half of this function is presumably cut off by the
# chunk boundary; confirm against the full file.
# get shared queue, reconnecting if necessary
mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id())
try:
queue_in = mgr.get_queue(qname)
equeue = mgr.get_queue('error')
except (AttributeError, KeyError):
# Either queue is missing on this node: surface a hint that the real
# failure likely happened on a different executor.
msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname)
raise Exception(msg)
logger.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue_in))
count = 0
for item in iter:
count += 1
# block=True applies backpressure if the TF side consumes slowly.
queue_in.put(item, block=True)
# signal "end of partition"
queue_in.put(marker.EndPartition())
def _shutdown(iter):
# Shut down this executor's TF node: stop its TensorBoard process (if
# any) and terminate the shared queues.
# NOTE(review): the `for q in queues:` loop body at the end is cut off by
# the chunk boundary; confirm against the full file.
host = util.get_ip_address()
executor_id = util.read_executor_id()
# reconnect to shared queue
mgr = _get_manager(cluster_info, host, executor_id)
# send SIGTERM to Tensorboard proc (if running)
for node in cluster_info:
# Match this executor's (host, executor_id) entry in the cluster map.
if node['host'] == host and node['executor_id'] == executor_id:
tb_pid = node['tb_pid']
# tb_pid == 0 means no TensorBoard was launched on this node.
if tb_pid != 0:
logger.info("Stopping tensorboard (pid={0})".format(tb_pid))
subprocess.Popen(["kill", str(tb_pid)])
# terminate any listening queues
logger.info("Stopping all queues")
for q in queues:
# NOTE(review): this span is the interior of an enclosing function whose
# header and tail lie outside this chunk (per-executor node startup:
# GPU allocation, TF job/task assignment, and TFManager setup).
# Reserve the requested number of GPUs for this executor.
gpus_to_use = gpu_info.get_gpus(num_gpus)
# assign TF job/task based on provided cluster_spec template (or use default/null values)
job_name = 'default'
task_index = -1
cluster_id = cluster_meta['id']
cluster_template = cluster_meta['cluster_template']
for jobtype in cluster_template:
nodes = cluster_template[jobtype]
# The executor's position within its job-type list becomes its TF task index.
if executor_id in nodes:
job_name = jobtype
task_index = nodes.index(executor_id)
break
# get unique key (hostname, executor_id) for this executor
host = util.get_ip_address()
util.write_executor_id(executor_id)
port = 0
# check for existing TFManagers
# State is stored repr()-style, hence the quoted "'stopped'" comparison.
if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'":
if TFSparkNode.cluster_id == cluster_id:
# raise an exception to force Spark to retry this "reservation" task on another executor
raise Exception("TFManager already started on {0}, executor={1}, state={2}".format(host, executor_id, str(TFSparkNode.mgr.get("state"))))
else:
# old state, just continue with creating new manager
logger.warn("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(TFSparkNode.cluster_id, cluster_id))
# start a TFManager and get a free port
# use a random uuid as the authkey
authkey = uuid.uuid4().bytes
addr = None