import threading
import logging
import queue
import time
import Pyro4


class Job(object):
    def __init__(self, id, **kwargs):
        self.id = id
        self.kwargs = kwargs
        self.timestamps = {}

        self.result = None
        self.exception = None
        self.worker_name = None

    def time_it(self, which_time):
        self.timestamps[which_time] = time.time()

    def __repr__(self):
        return(
            "job_id: " + str(self.id) + "\n" +
            "kwargs: " + str(self.kwargs) + "\n" +
            "result: " + str(self.result) + "\n" +
            "exception: " + str(self.exception) + "\n"
        )
    def recreate_from_run(self, run):
        # Note: this body only reads the fields of the given Run object and
        # does not assign them to the Job; it is effectively a stub.
        run.config_id
        run.budget
        run.error_logs
        run.loss
        run.info
        run.time_stamps
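
# Note: a Job's timestamps dict is filled in by the Dispatcher as the job moves
# through its lifecycle: 'submitted' (submit_job), 'started' (job_runner) and
# 'finished' (register_result).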


class Worker(object):
    def __init__(self, name, uri):
        self.name = name
        self.proxy = Pyro4.Proxy(uri)
        self.runs_job = None

    def is_alive(self):
        try:
            self.proxy._pyroReconnect(1)
        except Pyro4.errors.ConnectionClosedError:
            return False
        except:
            raise
        return(True)

    def shutdown(self):
        self.proxy.shutdown()

    def is_busy(self):
        return(self.proxy.is_busy())

    def __repr__(self):
        return(self.name)


class Dispatcher(object):
    """
    The dispatcher is responsible for assigning tasks to free workers, reporting
    results back to the master, and communicating with the nameserver.
    """
    def __init__(self, new_result_callback, run_id='0',
                 ping_interval=10, nameserver='localhost',
                 nameserver_port=None,
                 host=None, logger=None, queue_callback=None):
"""
Parameters
----------
new_result_callback: function
function that will be called with a `Job instance <hpbandster.core.dispatcher.Job>`_ as argument.
From the `Job` the result can be read and e.g. logged.
run_id: str
unique run_id associated with the HPB run
ping_interval: int
how often to ping for workers (in seconds)
nameserver: str
address of the Pyro4 nameserver
nameserver_port: int
port of Pyro4 nameserver
host: str
ip (or name that resolves to that) of the network interface to use
logger: logging.Logger
logger-instance for info and debug
queue_callback: function
gets called with the number of workers in the pool on every update-cycle
"""
        self.new_result_callback = new_result_callback
        self.queue_callback = queue_callback
        self.run_id = run_id
        self.nameserver = nameserver
        self.nameserver_port = nameserver_port
        self.host = host
        self.ping_interval = int(ping_interval)
        self.shutdown_all_threads = False

        if logger is None:
            self.logger = logging.getLogger('hpbandster')
        else:
            self.logger = logger

        self.worker_pool = {}

        self.waiting_jobs = queue.Queue()
        self.running_jobs = {}
        self.idle_workers = set()

        self.thread_lock = threading.Lock()
        self.runner_cond = threading.Condition(self.thread_lock)
        self.discover_cond = threading.Condition(self.thread_lock)

        self.pyro_id = "hpbandster.run_%s.dispatcher"%self.run_id
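
    # A minimal usage sketch (illustrative only): it assumes a Pyro4 nameserver
    # and at least one hpbandster worker for this run_id are already running,
    # and `handle_result` is a hypothetical user-defined callback.
    #
    #   def handle_result(job):
    #       print(job.id, job.result, job.exception)
    #
    #   dispatcher = Dispatcher(new_result_callback=handle_result, run_id='0',
    #                           nameserver='localhost')
    #   dispatcher.run()  # blocks in the Pyro4 request loop until shutdown() is called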
    def run(self):
        with self.discover_cond:
            t1 = threading.Thread(target=self.discover_workers, name='discover_workers')
            t1.start()
            self.logger.info('DISPATCHER: started the \'discover_worker\' thread')

            t2 = threading.Thread(target=self.job_runner, name='job_runner')
            t2.start()
            self.logger.info('DISPATCHER: started the \'job_runner\' thread')

            self.pyro_daemon = Pyro4.core.Daemon(host=self.host)

            with Pyro4.locateNS(host=self.nameserver, port=self.nameserver_port) as ns:
                uri = self.pyro_daemon.register(self, self.pyro_id)
                ns.register(self.pyro_id, uri)

        self.logger.info("DISPATCHER: Pyro daemon running on %s"%(self.pyro_daemon.locationStr))

        self.pyro_daemon.requestLoop()

        with self.discover_cond:
            self.shutdown_all_threads = True
            self.logger.info('DISPATCHER: Dispatcher shutting down')
            self.runner_cond.notify_all()
            self.discover_cond.notify_all()

        with Pyro4.locateNS(self.nameserver, port=self.nameserver_port) as ns:
            ns.remove(self.pyro_id)

        t1.join()
        self.logger.debug('DISPATCHER: \'discover_worker\' thread exited')
        t2.join()
        self.logger.debug('DISPATCHER: \'job_runner\' thread exited')
        self.logger.info('DISPATCHER: shut down complete')
    def shutdown_all_workers(self, rediscover=False):
        with self.discover_cond:
            for worker in self.worker_pool.values():
                worker.shutdown()
            if rediscover:
                time.sleep(1)
                self.discover_cond.notify()
    def shutdown(self, shutdown_workers=False):
        if shutdown_workers:
            self.shutdown_all_workers()

        with self.runner_cond:
            self.pyro_daemon.shutdown()
    @Pyro4.expose
    @Pyro4.oneway
    def trigger_discover_worker(self):
        self.logger.info("DISPATCHER: A new worker triggered discover_worker")
        with self.discover_cond:
            self.discover_cond.notify()
    def discover_workers(self):
        self.discover_cond.acquire()
        sleep_interval = 1

        while True:
            self.logger.debug('DISPATCHER: Starting worker discovery')
            update = False

            with Pyro4.locateNS(host=self.nameserver, port=self.nameserver_port) as ns:
                worker_names = ns.list(prefix="hpbandster.run_%s.worker."%self.run_id)
                self.logger.debug("DISPATCHER: Found %i potential workers, %i currently in the pool."%(len(worker_names), len(self.worker_pool)))

                for wn, uri in worker_names.items():
                    if wn not in self.worker_pool:
                        w = Worker(wn, uri)
                        if not w.is_alive():
                            self.logger.debug('DISPATCHER: skipping dead worker, %s'%wn)
                            continue
                        update = True
                        self.logger.info('DISPATCHER: discovered new worker, %s'%wn)
                        self.worker_pool[wn] = w

            # check the current list of workers
            crashed_jobs = set()

            all_workers = list(self.worker_pool.keys())
            for wn in all_workers:
                # remove dead workers from the pool
                if not self.worker_pool[wn].is_alive():
                    self.logger.info('DISPATCHER: removing dead worker, %s'%wn)
                    update = True
                    # todo: check if there were jobs running on that worker that need to be rescheduled
                    current_job = self.worker_pool[wn].runs_job
                    if current_job is not None:
                        self.logger.info('Job %s was not completed'%str(current_job))
                        crashed_jobs.add(current_job)

                    del self.worker_pool[wn]
                    self.idle_workers.discard(wn)
                    continue

                if not self.worker_pool[wn].is_busy():
                    self.idle_workers.add(wn)

            # try to submit more jobs if something changed
            if update:
                if self.queue_callback is not None:
                    self.discover_cond.release()
                    self.queue_callback(len(self.worker_pool))
                    self.discover_cond.acquire()
                self.runner_cond.notify()

            for crashed_job in crashed_jobs:
                self.discover_cond.release()
                self.register_result(crashed_job, {'result': None, 'exception': 'Worker died unexpectedly.'})
                self.discover_cond.acquire()

            self.logger.debug('DISPATCHER: Finished worker discovery')

            # if len(self.worker_pool) == 0:  # ping for new workers if no workers are currently available
            #     self.logger.debug('No workers available! Keep pinging')
            #     self.discover_cond.wait(sleep_interval)
            #     sleep_interval *= 2
            # else:
            self.discover_cond.wait(self.ping_interval)

            if self.shutdown_all_threads:
                self.logger.debug('DISPATCHER: discover_workers shutting down')
                self.runner_cond.notify()
                self.discover_cond.release()
                return
    def number_of_workers(self):
        with self.discover_cond:
            return(len(self.worker_pool))
    def job_runner(self):
        self.runner_cond.acquire()
        while True:

            while self.waiting_jobs.empty() or len(self.idle_workers) == 0:
                self.logger.debug('DISPATCHER: jobs to submit = %i, number of idle workers = %i -> waiting!'%(self.waiting_jobs.qsize(), len(self.idle_workers)))
                self.runner_cond.wait()
                self.logger.debug('DISPATCHER: Trying to submit another job.')
                if self.shutdown_all_threads:
                    self.logger.debug('DISPATCHER: job_runner shutting down')
                    self.discover_cond.notify()
                    self.runner_cond.release()
                    return

            job = self.waiting_jobs.get()
            wn = self.idle_workers.pop()
            worker = self.worker_pool[wn]
            self.logger.debug('DISPATCHER: starting job %s on %s'%(str(job.id), worker.name))

            job.time_it('started')
            worker.runs_job = job.id
            worker.proxy.start_computation(self, job.id, **job.kwargs)

            job.worker_name = wn
            self.running_jobs[job.id] = job

            self.logger.debug('DISPATCHER: job %s dispatched on %s'%(str(job.id), worker.name))
    def submit_job(self, id, **kwargs):
        self.logger.debug('DISPATCHER: trying to submit job %s'%str(id))
        with self.runner_cond:
            job = Job(id, **kwargs)
            job.time_it('submitted')
            self.waiting_jobs.put(job)
            self.logger.debug('DISPATCHER: trying to notify the job_runner thread.')
            self.runner_cond.notify()
    @Pyro4.expose
    @Pyro4.callback
    @Pyro4.oneway
    def register_result(self, id=None, result=None):
        self.logger.debug('DISPATCHER: job %s finished'%(str(id)))
        with self.runner_cond:
            self.logger.debug('DISPATCHER: register_result: lock acquired')
            # fill in missing information
            job = self.running_jobs[id]
            job.time_it('finished')
            job.result = result['result']
            job.exception = result['exception']

            self.logger.debug('DISPATCHER: job %s on %s finished'%(str(job.id), job.worker_name))
            self.logger.debug(str(job))

            # delete job
            del self.running_jobs[id]

            # label the worker as idle again
            try:
                self.worker_pool[job.worker_name].runs_job = None
                self.worker_pool[job.worker_name].proxy._pyroRelease()
                self.idle_workers.add(job.worker_name)
                # notify the job_runner to check for more jobs to run
                self.runner_cond.notify()
            except KeyError:
                # happens for crashed workers, but we can just continue
                pass
            except:
                raise

        # call the user's callback function to register the result;
        # this needs to happen with the condition released, as the master can
        # call submit_job quickly enough to cause a deadlock otherwise
        self.new_result_callback(job)