Development of an internal social media platform with personalised dashboards for students
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

asynpool.py 47KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270
  1. # -*- coding: utf-8 -*-
  2. """
  3. celery.concurrency.asynpool
  4. ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  5. .. note::
  6. This module will be moved soon, so don't use it directly.
  7. Non-blocking version of :class:`multiprocessing.Pool`.
  8. This code deals with three major challenges:
  9. 1) Starting up child processes and keeping them running.
  10. 2) Sending jobs to the processes and receiving results back.
  11. 3) Safely shutting down this system.
  12. """
  13. from __future__ import absolute_import
  14. import errno
  15. import gc
  16. import os
  17. import select
  18. import socket
  19. import struct
  20. import sys
  21. import time
  22. from collections import deque, namedtuple
  23. from io import BytesIO
  24. from pickle import HIGHEST_PROTOCOL
  25. from time import sleep
  26. from weakref import WeakValueDictionary, ref
  27. from amqp.utils import promise
  28. from billiard.pool import RUN, TERMINATE, ACK, NACK, WorkersJoined
  29. from billiard import pool as _pool
  30. from billiard.compat import buf_t, setblocking, isblocking
  31. from billiard.einfo import ExceptionInfo
  32. from billiard.queues import _SimpleQueue
  33. from kombu.async import READ, WRITE, ERR
  34. from kombu.serialization import pickle as _pickle
  35. from kombu.utils import fxrange
  36. from kombu.utils.compat import get_errno
  37. from kombu.utils.eventio import SELECT_BAD_FD
  38. from celery.five import Counter, items, string_t, text_t, values
  39. from celery.utils.log import get_logger
  40. from celery.utils.text import truncate
  41. from celery.worker import state as worker_state
  # Select the low-level read/unpack primitives used by the result handler.
  # ``readcanbuf`` tells callers whether ``__read__`` is handed a buffer
  # slice (True) or a BytesIO object that chunks are appended to (False).
  try:
      from _billiard import read as __read__
      from struct import unpack_from as _unpack_from
      # NOTE(review): presumably this raises NameError on interpreters
      # without a ``memoryview`` builtin, selecting the fallback below.
      memoryview = memoryview
      readcanbuf = True

      if sys.version_info[0] == 2 and sys.version_info < (2, 7, 6):

          def unpack_from(fmt, view, _unpack_from=_unpack_from):  # noqa
              # copy the view to bytes before unpacking on these versions.
              return _unpack_from(fmt, view.tobytes())  # <- memoryview
      else:
          # unpack_from supports memoryview in 2.7.6 and 3.3+
          unpack_from = _unpack_from  # noqa
  except (ImportError, NameError):  # pragma: no cover

      def __read__(fd, buf, size, read=os.read):  # noqa
          # Read up to ``size`` bytes from ``fd``, append them to ``buf``
          # and return the number of bytes read (0 means nothing was read).
          chunk = read(fd, size)
          n = len(chunk)
          if n != 0:
              buf.write(chunk)
          return n
      readcanbuf = False  # noqa

      def unpack_from(fmt, iobuf, unpack=struct.unpack):  # noqa
          # Unpack from the complete contents accumulated in the BytesIO.
          return unpack(fmt, iobuf.getvalue())  # <-- BytesIO
  logger = get_logger(__name__)
  error, debug = logger.error, logger.debug

  #: errno values for which the read/write coroutines in this module
  #: suspend and retry later instead of raising.
  UNAVAIL = frozenset([errno.EAGAIN, errno.EINTR])

  #: Constant sent by child process when started (ready to accept work)
  WORKER_UP = 15

  #: A process must have started before this timeout (in secs.) expires.
  PROC_ALIVE_TIMEOUT = 4.0

  #: Scheduling strategy identifiers.
  SCHED_STRATEGY_PREFETCH = 1
  SCHED_STRATEGY_FAIR = 4

  #: Maps the ``sched_strategy`` argument to a strategy identifier;
  #: ``None`` selects the prefetch strategy.
  SCHED_STRATEGIES = {
      None: SCHED_STRATEGY_PREFETCH,
      'fair': SCHED_STRATEGY_FAIR,
  }

  #: Default maximum length passed to ``truncate`` by
  #: ``Worker.prepare_result``.
  RESULT_MAXLEN = 128

  #: Record built for an ack response: job id, target fd and payload.
  Ack = namedtuple('Ack', ('id', 'fd', 'payload'))
  def gen_not_started(gen):
      """Return true if the generator ``gen`` has not been started yet.

      A generator counts as "not started" when it has been created but
      ``next()`` has never been called on it.  Running, suspended and
      finished generators all give a false result.

      :param gen: a generator object.
      :returns: ``True`` only for a freshly created generator,
          ``False`` otherwise.
      """
      try:
          # Checking ``gi_frame.f_lasti == -1`` is not reliable on every
          # interpreter version, so ask the stdlib for the generator state
          # when it is available.
          from inspect import GEN_CREATED, getgeneratorstate
      except ImportError:  # Python 2 has no getgeneratorstate
          # gi_frame is None when generator stopped.
          frame = gen.gi_frame
          return frame is not None and frame.f_lasti == -1
      return getgeneratorstate(gen) == GEN_CREATED
  81. def _get_job_writer(job):
  82. try:
  83. writer = job._writer
  84. except AttributeError:
  85. pass
  86. else:
  87. return writer() # is a weakref
  88. def _select(readers=None, writers=None, err=None, timeout=0):
  89. """Simple wrapper to :class:`~select.select`.
  90. :param readers: Set of reader fds to test if readable.
  91. :param writers: Set of writer fds to test if writable.
  92. :param err: Set of fds to test for error condition.
  93. All fd sets passed must be mutable as this function
  94. will remove non-working fds from them, this also means
  95. the caller must make sure there are still fds in the sets
  96. before calling us again.
  97. :returns: tuple of ``(readable, writable, again)``, where
  98. ``readable`` is a set of fds that have data available for read,
  99. ``writable`` is a set of fds that is ready to be written to
  100. and ``again`` is a flag that if set means the caller must
  101. throw away the result and call us again.
  102. """
  103. readers = set() if readers is None else readers
  104. writers = set() if writers is None else writers
  105. err = set() if err is None else err
  106. try:
  107. r, w, e = select.select(readers, writers, err, timeout)
  108. if e:
  109. r = list(set(r) | set(e))
  110. return r, w, 0
  111. except (select.error, socket.error) as exc:
  112. if get_errno(exc) == errno.EINTR:
  113. return [], [], 1
  114. elif get_errno(exc) in SELECT_BAD_FD:
  115. for fd in readers | writers | err:
  116. try:
  117. select.select([fd], [], [], 0)
  118. except (select.error, socket.error) as exc:
  119. if get_errno(exc) not in SELECT_BAD_FD:
  120. raise
  121. readers.discard(fd)
  122. writers.discard(fd)
  123. err.discard(fd)
  124. return [], [], 1
  125. else:
  126. raise
  127. def _repr_result(obj):
  128. try:
  129. return repr(obj)
  130. except Exception as orig_exc:
  131. try:
  132. return text_t(obj)
  133. except UnicodeDecodeError:
  134. if isinstance(obj, string_t):
  135. try:
  136. return obj.decode('utf-8', errors='replace')
  137. except Exception:
  138. pass
  139. return '<Unrepresentable: {0!r} (o.__repr__ returns unicode?)>'.format(
  140. orig_exc,
  141. )
  class Worker(_pool.Worker):
      """Pool worker process."""

      dead = False

      def on_loop_start(self, pid):
          """Announce on the out-queue that this process is ready.

          Our version sends a WORKER_UP message when the process is ready
          to accept work; this tells the parent that the inqueue fd is
          writable.
          """
          ready_message = (WORKER_UP, (pid, ))
          self.outq.put(ready_message)

      def prepare_result(self, result, maxlen=RESULT_MAXLEN, truncate=truncate):
          """Return ``result`` in the form sent back to the parent.

          :class:`ExceptionInfo` instances are passed through unchanged;
          anything else is replaced by its text representation truncated
          to ``maxlen``.
          """
          if isinstance(result, ExceptionInfo):
              return result
          return truncate(_repr_result(result), maxlen)
  class ResultHandler(_pool.ResultHandler):
      """Handles messages from the pool processes."""

      def __init__(self, *args, **kwargs):
          """Set up the handler.

          Requires the keyword arguments ``fileno_to_outq`` (mapping of
          out-queue fd to process) and ``on_process_alive`` (callback
          installed as the handler for ``WORKER_UP`` messages); remaining
          arguments go to the parent class.
          """
          self.fileno_to_outq = kwargs.pop('fileno_to_outq')
          self.on_process_alive = kwargs.pop('on_process_alive')
          super(ResultHandler, self).__init__(*args, **kwargs)
          # add our custom message handler
          self.state_handlers[WORKER_UP] = self.on_process_alive

      def _recv_message(self, add_reader, fd, callback,
                        __read__=__read__, readcanbuf=readcanbuf,
                        BytesIO=BytesIO, unpack_from=unpack_from,
                        load=_pickle.load):
          """Generator that reads one framed message from ``fd``.

          A message is a 4 byte header (unpacked with ``'>i'``) giving the
          body size, followed by the body, which is loaded with ``load``.
          The generator yields whenever a read fails with an errno in
          ``UNAVAIL`` so it can be resumed later.  Once the message is
          complete, ``self.handle_event`` is registered for ``fd`` again
          and ``callback`` is called with the message if it is truthy.

          :raises EOFError: if end of file is hit before any byte of the
              header or body was read.
          :raises OSError: if end of file is hit in the middle of the
              header or body, or on any read error not in ``UNAVAIL``.
          """
          Hr = Br = 0
          if readcanbuf:
              buf = bytearray(4)
              bufv = memoryview(buf)
          else:
              buf = bufv = BytesIO()
          # header

          while Hr < 4:
              try:
                  n = __read__(
                      fd, bufv[Hr:] if readcanbuf else bufv, 4 - Hr,
                  )
              except OSError as exc:
                  if get_errno(exc) not in UNAVAIL:
                      raise
                  # nothing to read right now: suspend until resumed.
                  yield
              else:
                  if n == 0:
                      raise (OSError('End of file during message') if Hr
                             else EOFError())
                  Hr += n

          body_size, = unpack_from('>i', bufv)
          if readcanbuf:
              buf = bytearray(body_size)
              bufv = memoryview(buf)
          else:
              buf = bufv = BytesIO()

          while Br < body_size:
              try:
                  n = __read__(
                      fd, bufv[Br:] if readcanbuf else bufv, body_size - Br,
                  )
              except OSError as exc:
                  if get_errno(exc) not in UNAVAIL:
                      raise
                  yield
              else:
                  if n == 0:
                      raise (OSError('End of file during message') if Br
                             else EOFError())
                  Br += n
          # message complete: go back to the regular readable handler.
          add_reader(fd, self.handle_event, fd)
          if readcanbuf:
              message = load(BytesIO(bufv))
          else:
              bufv.seek(0)
              message = load(bufv)
          if message:
              callback(message)

      def _make_process_result(self, hub):
          """Coroutine that reads messages from the pool processes
          and calls the appropriate handler.

          Returns the ``on_result_readable(fileno)`` callback bound to
          ``hub``.
          """
          fileno_to_outq = self.fileno_to_outq
          on_state_change = self.on_state_change
          add_reader = hub.add_reader
          remove_reader = hub.remove_reader
          recv_message = self._recv_message

          def on_result_readable(fileno):
              try:
                  fileno_to_outq[fileno]
              except KeyError:  # process gone
                  return remove_reader(fileno)
              it = recv_message(add_reader, fileno, on_state_change)
              try:
                  next(it)
              except StopIteration:
                  # whole message was read in one go.
                  pass
              except (IOError, OSError, EOFError):
                  remove_reader(fileno)
              else:
                  # partial read: let the hub resume the generator.
                  add_reader(fileno, it)
          return on_result_readable

      def register_with_event_loop(self, hub):
          """Install the event-loop driven ``handle_event`` callback."""
          self.handle_event = self._make_process_result(hub)

      def handle_event(self, fileno):
          """Placeholder replaced by :meth:`register_with_event_loop`."""
          raise RuntimeError('Not registered with event loop')

      def on_stop_not_started(self):
          """This method is always used to stop when the helper thread is not
          started.

          Keeps flushing the out-queues of the processes until the cache is
          empty, no out-queues remain, the state becomes ``TERMINATE`` or
          all workers have been joined.
          """
          cache = self.cache
          check_timeouts = self.check_timeouts
          fileno_to_outq = self.fileno_to_outq
          on_state_change = self.on_state_change
          join_exited_workers = self.join_exited_workers

          # flush the processes outqueues until they have all terminated.
          outqueues = set(fileno_to_outq)
          while cache and outqueues and self._state != TERMINATE:
              if check_timeouts is not None:
                  # make sure tasks with a time limit will time out.
                  check_timeouts()
              # cannot iterate and remove at the same time
              pending_remove_fd = set()
              for fd in outqueues:
                  # collect dead fds here (``add``, not ``discard``) so that
                  # they are actually dropped from ``outqueues`` below.
                  self._flush_outqueue(
                      fd, pending_remove_fd.add, fileno_to_outq,
                      on_state_change,
                  )
                  try:
                      join_exited_workers(shutdown=True)
                  except WorkersJoined:
                      return debug('result handler: all workers terminated')
              outqueues.difference_update(pending_remove_fd)

      def _flush_outqueue(self, fd, remove, process_index, on_state_change):
          """Read at most one pending message from the out-queue at ``fd``.

          :param fd: out-queue file descriptor to flush.
          :param remove: callback called with ``fd`` when the queue can no
              longer be used (process gone or reader broken).
          :param process_index: mapping of fd to process.
          :param on_state_change: callback called with the received message.
          """
          try:
              proc = process_index[fd]
          except KeyError:
              # process already found terminated
              # which means its outqueue has already been processed
              # by the worker lost handler.
              return remove(fd)

          reader = proc.outq._reader
          try:
              setblocking(reader, 1)
          except (OSError, IOError):
              return remove(fd)
          try:
              if reader.poll(0):
                  task = reader.recv()
              else:
                  task = None
                  # nothing pending: back off before the next pass.
                  sleep(0.5)
          except (IOError, EOFError):
              return remove(fd)
          else:
              if task:
                  on_state_change(task)
          finally:
              # always restore non-blocking mode on the reader.
              try:
                  setblocking(reader, 0)
              except (OSError, IOError):
                  return remove(fd)
  297. class AsynPool(_pool.Pool):
  298. """Pool version that uses AIO instead of helper threads."""
  299. ResultHandler = ResultHandler
  300. Worker = Worker
  def __init__(self, processes=None, synack=False,
               sched_strategy=None, *args, **kwargs):
      """Create the pool and the bookkeeping used by the event loop.

      :param processes: number of processes; defaults to
          ``self.cpu_count()``.
      :param synack: stored as ``self.synack``.
      :param sched_strategy: key into ``SCHED_STRATEGIES``; values not
          found there are stored unchanged.

      Remaining arguments are passed on to the parent pool.
      """
      self.sched_strategy = SCHED_STRATEGIES.get(
          sched_strategy, sched_strategy,
      )
      if processes is None:
          processes = self.cpu_count()
      self.synack = synack

      # create queue-pairs for all our processes in advance.
      self._queues = {
          self.create_process_queues(): None for _ in range(processes)
      }

      # fileno -> process mappings for the inqueues, outqueues
      # and synqueues respectively.
      self._fileno_to_inq = {}
      self._fileno_to_outq = {}
      self._fileno_to_synq = {}

      # We keep track of processes that have not yet
      # sent a WORKER_UP message.  If a process fails to send
      # this message within proc_up_timeout we terminate it
      # and hope the next process will recover.
      self._proc_alive_timeout = PROC_ALIVE_TIMEOUT
      self._waiting_to_start = set()

      # denormalized set of all inqueues.
      self._all_inqueues = set()

      # Set of fds being written to (busy)
      self._active_writes = set()

      # Set of active co-routines currently writing jobs.
      self._active_writers = set()

      # Set of fds that are busy (executing task)
      self._busy_workers = set()
      self._mark_worker_as_available = self._busy_workers.discard

      # Holds jobs waiting to be written to child processes.
      self.outbound_buffer = deque()

      self.write_stats = Counter()

      super(AsynPool, self).__init__(processes, *args, **kwargs)

      # create initial mappings, these will be updated
      # as processes are recycled, or found lost elsewhere.
      for worker in self._pool:
          self._fileno_to_outq[worker.outqR_fd] = worker
          self._fileno_to_synq[worker.synqW_fd] = worker

      # timeout callbacks stay None unless a timeout handler exists.
      self.on_soft_timeout = self.on_hard_timeout = None
      timeout_handler = self._timeout_handler
      if timeout_handler:
          self.on_soft_timeout = timeout_handler.on_soft_timeout
          self.on_hard_timeout = timeout_handler.on_hard_timeout
  def _create_worker_process(self, i):
      """Run a garbage collection, then let the parent create worker ``i``."""
      # Issue #2927
      gc.collect()
      parent = super(AsynPool, self)
      return parent._create_worker_process(i)
  347. def _event_process_exit(self, hub, proc):
  348. # This method is called whenever the process sentinel is readable.
  349. self._untrack_child_process(proc, hub)
  350. self.maintain_pool()
  351. def _track_child_process(self, proc, hub):
  352. try:
  353. fd = proc._sentinel_poll
  354. except AttributeError:
  355. # we need to duplicate the fd here to carefully
  356. # control when the fd is removed from the process table,
  357. # as once the original fd is closed we cannot unregister
  358. # the fd from epoll(7) anymore, causing a 100% CPU poll loop.
  359. fd = proc._sentinel_poll = os.dup(proc._popen.sentinel)
  360. hub.add_reader(fd, self._event_process_exit, hub, proc)
  361. def _untrack_child_process(self, proc, hub):
  362. if proc._sentinel_poll is not None:
  363. fd, proc._sentinel_poll = proc._sentinel_poll, None
  364. hub.remove(fd)
  365. os.close(fd)
  def register_with_event_loop(self, hub):
      """Registers the async pool with the current event loop."""
      result_handler = self._result_handler
      result_handler.register_with_event_loop(hub)
      self.handle_result_event = result_handler.handle_event
      self._create_timelimit_handlers(hub)
      self._create_process_handlers(hub)
      self._create_write_handlers(hub)

      # Add handler for when a process exits (calls maintain_pool)
      for worker in self._pool:
          self._track_child_process(worker, hub)

      # Handle_result_event is called whenever one of the
      # result queues are readable.
      for outq_fd in self._fileno_to_outq:
          hub.add_reader(outq_fd, self.handle_result_event, outq_fd)

      # Timers include calling maintain_pool at a regular interval
      # to be certain processes are restarted.
      for timer_handler, every in items(self.timers):
          hub.call_repeatedly(every, timer_handler)

      hub.on_tick.add(self.on_poll_start)
  384. def _create_timelimit_handlers(self, hub, now=time.time):
  385. """For async pool this sets up the handlers used
  386. to implement time limits."""
  387. call_later = hub.call_later
  388. trefs = self._tref_for_id = WeakValueDictionary()
  389. def on_timeout_set(R, soft, hard):
  390. if soft:
  391. trefs[R._job] = call_later(
  392. soft, self._on_soft_timeout, R._job, soft, hard, hub,
  393. )
  394. elif hard:
  395. trefs[R._job] = call_later(
  396. hard, self._on_hard_timeout, R._job,
  397. )
  398. self.on_timeout_set = on_timeout_set
  399. def _discard_tref(job):
  400. try:
  401. tref = trefs.pop(job)
  402. tref.cancel()
  403. del(tref)
  404. except (KeyError, AttributeError):
  405. pass # out of scope
  406. self._discard_tref = _discard_tref
  407. def on_timeout_cancel(R):
  408. _discard_tref(R._job)
  409. self.on_timeout_cancel = on_timeout_cancel
  410. def _on_soft_timeout(self, job, soft, hard, hub, now=time.time):
  411. # only used by async pool.
  412. if hard:
  413. self._tref_for_id[job] = hub.call_at(
  414. now() + (hard - soft), self._on_hard_timeout, job,
  415. )
  416. try:
  417. result = self._cache[job]
  418. except KeyError:
  419. pass # job ready
  420. else:
  421. self.on_soft_timeout(result)
  422. finally:
  423. if not hard:
  424. # remove tref
  425. self._discard_tref(job)
  426. def _on_hard_timeout(self, job):
  427. # only used by async pool.
  428. try:
  429. result = self._cache[job]
  430. except KeyError:
  431. pass # job ready
  432. else:
  433. self.on_hard_timeout(result)
  434. finally:
  435. # remove tref
  436. self._discard_tref(job)
  def on_job_ready(self, job, i, obj, inqW_fd):
      """Mark the worker writing to ``inqW_fd`` as available again.

      Only ``inqW_fd`` is used: it is discarded from the set of busy
      workers.  ``job``, ``i`` and ``obj`` are accepted but ignored.
      """
      self._mark_worker_as_available(inqW_fd)
  def _create_process_handlers(self, hub, READ=READ, ERR=ERR):
      """For async pool this will create the handlers called
      when a process is up/down and etc.

      Installs ``self.on_process_up`` and ``self.on_process_down``, both
      closures over ``hub`` and the pool's fd indices.
      """
      add_reader, remove_reader, remove_writer = (
          hub.add_reader, hub.remove_reader, hub.remove_writer,
      )
      # bind frequently used attributes to locals for the closures below.
      cache = self._cache
      all_inqueues = self._all_inqueues
      fileno_to_inq = self._fileno_to_inq
      fileno_to_outq = self._fileno_to_outq
      fileno_to_synq = self._fileno_to_synq
      busy_workers = self._busy_workers
      handle_result_event = self.handle_result_event
      process_flush_queues = self.process_flush_queues
      waiting_to_start = self._waiting_to_start

      def verify_process_alive(proc):
          # Timer callback scheduled by on_process_up: kills the process
          # if it is alive but still in the waiting-to-start set.
          proc = proc()  # is a weakref
          if (proc is not None and proc._is_alive() and
                  proc in waiting_to_start):
              assert proc.outqR_fd in fileno_to_outq
              assert fileno_to_outq[proc.outqR_fd] is proc
              assert proc.outqR_fd in hub.readers
              error('Timed out waiting for UP message from %r', proc)
              os.kill(proc.pid, 9)

      def on_process_up(proc):
          """Called when a process has started."""
          # If we got the same fd as a previous process then we will also
          # receive jobs in the old buffer, so we need to reset the
          # job._write_to and job._scheduled_for attributes used to recover
          # message boundaries when processes exit.
          infd = proc.inqW_fd
          for job in values(cache):
              if job._write_to and job._write_to.inqW_fd == infd:
                  job._write_to = proc
              if job._scheduled_for and job._scheduled_for.inqW_fd == infd:
                  job._scheduled_for = proc
          fileno_to_outq[proc.outqR_fd] = proc
          # maintain_pool is called whenever a process exits.
          self._track_child_process(proc, hub)
          assert not isblocking(proc.outq._reader)
          # handle_result_event is called when the processes outqueue is
          # readable.
          add_reader(proc.outqR_fd, handle_result_event, proc.outqR_fd)
          # the process stays in this set until removed elsewhere;
          # verify_process_alive checks it after the timeout.
          waiting_to_start.add(proc)
          hub.call_later(
              self._proc_alive_timeout, verify_process_alive, ref(proc),
          )
      self.on_process_up = on_process_up

      def _remove_from_index(obj, proc, index, remove_fun, callback=None):
          # this remove the file descriptors for a process from
          # the indices.  we have to make sure we don't overwrite
          # another processes fds, as the fds may be reused.
          #
          # Returns the fd of ``obj``, or None if its fileno() raised.
          try:
              fd = obj.fileno()
          except (IOError, OSError):
              return
          try:
              if index[fd] is proc:
                  # fd has not been reused so we can remove it from index.
                  index.pop(fd, None)
          except KeyError:
              pass
          else:
              # NOTE(review): this branch runs whenever ``fd`` was present
              # in ``index``, including when it maps to a different
              # process -- confirm that is intended.
              remove_fun(fd)
              if callback is not None:
                  callback(fd)
          return fd

      def on_process_down(proc):
          """Called when a worker process exits."""
          if getattr(proc, 'dead', None):
              return
          process_flush_queues(proc)
          _remove_from_index(
              proc.outq._reader, proc, fileno_to_outq, remove_reader,
          )
          if proc.synq:
              _remove_from_index(
                  proc.synq._writer, proc, fileno_to_synq, remove_writer,
              )
          inq = _remove_from_index(
              proc.inq._writer, proc, fileno_to_inq, remove_writer,
              callback=all_inqueues.discard,
          )
          if inq:
              busy_workers.discard(inq)
          self._untrack_child_process(proc, hub)
          waiting_to_start.discard(proc)
          self._active_writes.discard(proc.inqW_fd)
          # unregister the queue objects themselves from the hub as well.
          remove_writer(proc.inq._writer)
          remove_reader(proc.outq._reader)
          if proc.synqR_fd:
              remove_reader(proc.synq._reader)
          if proc.synqW_fd:
              self._active_writes.discard(proc.synqW_fd)
              # NOTE(review): remove_reader is used on the synq *writer*
              # here -- confirm remove_writer was not intended.
              remove_reader(proc.synq._writer)
      self.on_process_down = on_process_down
  535. def _create_write_handlers(self, hub,
  536. pack=struct.pack, dumps=_pickle.dumps,
  537. protocol=HIGHEST_PROTOCOL):
  538. """For async pool this creates the handlers used to write data to
  539. child processes."""
  540. fileno_to_inq = self._fileno_to_inq
  541. fileno_to_synq = self._fileno_to_synq
  542. outbound = self.outbound_buffer
  543. pop_message = outbound.popleft
  544. append_message = outbound.append
  545. put_back_message = outbound.appendleft
  546. all_inqueues = self._all_inqueues
  547. active_writes = self._active_writes
  548. active_writers = self._active_writers
  549. busy_workers = self._busy_workers
  550. diff = all_inqueues.difference
  551. add_writer = hub.add_writer
  552. hub_add, hub_remove = hub.add, hub.remove
  553. mark_write_fd_as_active = active_writes.add
  554. mark_write_gen_as_active = active_writers.add
  555. mark_worker_as_busy = busy_workers.add
  556. write_generator_done = active_writers.discard
  557. get_job = self._cache.__getitem__
  558. write_stats = self.write_stats
  559. is_fair_strategy = self.sched_strategy == SCHED_STRATEGY_FAIR
  560. revoked_tasks = worker_state.revoked
  561. getpid = os.getpid
  562. precalc = {ACK: self._create_payload(ACK, (0, )),
  563. NACK: self._create_payload(NACK, (0, ))}
  564. def _put_back(job, _time=time.time):
  565. # puts back at the end of the queue
  566. if job._terminated is not None or \
  567. job.correlation_id in revoked_tasks:
  568. if not job._accepted:
  569. job._ack(None, _time(), getpid(), None)
  570. job._set_terminated(job._terminated)
  571. else:
  572. # XXX linear lookup, should find a better way,
  573. # but this happens rarely and is here to protect against races.
  574. if job not in outbound:
  575. outbound.appendleft(job)
  576. self._put_back = _put_back
  577. # called for every event loop iteration, and if there
  578. # are messages pending this will schedule writing one message
  579. # by registering the 'schedule_writes' function for all currently
  580. # inactive inqueues (not already being written to)
  581. # consolidate means the event loop will merge them
  582. # and call the callback once with the list writable fds as
  583. # argument. Using this means we minimize the risk of having
  584. # the same fd receive every task if the pipe read buffer is not
  585. # full.
  586. if is_fair_strategy:
  587. def on_poll_start():
  588. if outbound and len(busy_workers) < len(all_inqueues):
  589. inactive = diff(active_writes)
  590. [hub_add(fd, None, WRITE | ERR, consolidate=True)
  591. for fd in inactive]
  592. else:
  593. [hub_remove(fd) for fd in diff(active_writes)]
  594. else:
  595. def on_poll_start(): # noqa
  596. if outbound:
  597. [hub_add(fd, None, WRITE | ERR, consolidate=True)
  598. for fd in diff(active_writes)]
  599. else:
  600. [hub_remove(fd) for fd in diff(active_writes)]
  601. self.on_poll_start = on_poll_start
  602. def on_inqueue_close(fd, proc):
  603. # Makes sure the fd is removed from tracking when
  604. # the connection is closed, this is essential as fds may be reused.
  605. busy_workers.discard(fd)
  606. try:
  607. if fileno_to_inq[fd] is proc:
  608. fileno_to_inq.pop(fd, None)
  609. active_writes.discard(fd)
  610. all_inqueues.discard(fd)
  611. hub_remove(fd)
  612. except KeyError:
  613. pass
  614. self.on_inqueue_close = on_inqueue_close
  615. def schedule_writes(ready_fds, curindex=[0]):
  616. # Schedule write operation to ready file descriptor.
  617. # The file descriptor is writeable, but that does not
  618. # mean the process is currently reading from the socket.
  619. # The socket is buffered so writeable simply means that
  620. # the buffer can accept at least 1 byte of data.
  621. # This means we have to cycle between the ready fds.
  622. # the first version used shuffle, but using i % total
  623. # is about 30% faster with many processes. The latter
  624. # also shows more fairness in write stats when used with
  625. # many processes [XXX On OS X, this may vary depending
  626. # on event loop implementation (i.e select vs epoll), so
  627. # have to test further]
  628. total = len(ready_fds)
  629. for i in range(total):
  630. ready_fd = ready_fds[curindex[0] % total]
  631. if ready_fd in active_writes:
  632. # already writing to this fd
  633. curindex[0] += 1
  634. continue
  635. if is_fair_strategy and ready_fd in busy_workers:
  636. # worker is already busy with another task
  637. curindex[0] += 1
  638. continue
  639. if ready_fd not in all_inqueues:
  640. hub_remove(ready_fd)
  641. curindex[0] += 1
  642. continue
  643. try:
  644. job = pop_message()
  645. except IndexError:
  646. # no more messages, remove all inactive fds from the hub.
  647. # this is important since the fds are always writeable
  648. # as long as there's 1 byte left in the buffer, and so
  649. # this may create a spinloop where the event loop
  650. # always wakes up.
  651. for inqfd in diff(active_writes):
  652. hub_remove(inqfd)
  653. break
  654. else:
  655. if not job._accepted: # job not accepted by another worker
  656. try:
  657. # keep track of what process the write operation
  658. # was scheduled for.
  659. proc = job._scheduled_for = fileno_to_inq[ready_fd]
  660. except KeyError:
  661. # write was scheduled for this fd but the process
  662. # has since exited and the message must be sent to
  663. # another process.
  664. put_back_message(job)
  665. curindex[0] += 1
  666. continue
  667. cor = _write_job(proc, ready_fd, job)
  668. job._writer = ref(cor)
  669. mark_write_gen_as_active(cor)
  670. mark_write_fd_as_active(ready_fd)
  671. mark_worker_as_busy(ready_fd)
  672. # Try to write immediately, in case there's an error.
  673. try:
  674. next(cor)
  675. except StopIteration:
  676. pass
  677. except OSError as exc:
  678. if get_errno(exc) != errno.EBADF:
  679. raise
  680. else:
  681. add_writer(ready_fd, cor)
  682. curindex[0] += 1
  683. hub.consolidate_callback = schedule_writes
  684. def send_job(tup):
  685. # Schedule writing job request for when one of the process
  686. # inqueues are writable.
  687. body = dumps(tup, protocol=protocol)
  688. body_size = len(body)
  689. header = pack('>I', body_size)
  690. # index 1,0 is the job ID.
  691. job = get_job(tup[1][0])
  692. job._payload = buf_t(header), buf_t(body), body_size
  693. append_message(job)
  694. self._quick_put = send_job
  695. def on_not_recovering(proc, fd, job, exc):
  696. error('Process inqueue damaged: %r %r: %r',
  697. proc, proc.exitcode, exc, exc_info=1)
  698. if proc._is_alive():
  699. proc.terminate()
  700. hub.remove(fd)
  701. self._put_back(job)
  702. def _write_job(proc, fd, job):
  703. # writes job to the worker process.
  704. # Operation must complete if more than one byte of data
  705. # was written. If the broker connection is lost
  706. # and no data was written the operation shall be canceled.
  707. header, body, body_size = job._payload
  708. errors = 0
  709. try:
  710. # job result keeps track of what process the job is sent to.
  711. job._write_to = proc
  712. send = proc.send_job_offset
  713. Hw = Bw = 0
  714. # write header
  715. while Hw < 4:
  716. try:
  717. Hw += send(header, Hw)
  718. except Exception as exc:
  719. if get_errno(exc) not in UNAVAIL:
  720. raise
  721. # suspend until more data
  722. errors += 1
  723. if errors > 100:
  724. on_not_recovering(proc, fd, job, exc)
  725. raise StopIteration()
  726. yield
  727. else:
  728. errors = 0
  729. # write body
  730. while Bw < body_size:
  731. try:
  732. Bw += send(body, Bw)
  733. except Exception as exc:
  734. if get_errno(exc) not in UNAVAIL:
  735. raise
  736. # suspend until more data
  737. errors += 1
  738. if errors > 100:
  739. on_not_recovering(proc, fd, job, exc)
  740. raise StopIteration()
  741. yield
  742. else:
  743. errors = 0
  744. finally:
  745. hub_remove(fd)
  746. write_stats[proc.index] += 1
  747. # message written, so this fd is now available
  748. active_writes.discard(fd)
  749. write_generator_done(job._writer()) # is a weakref
  750. def send_ack(response, pid, job, fd, WRITE=WRITE, ERR=ERR):
  751. # Only used when synack is enabled.
  752. # Schedule writing ack response for when the fd is writeable.
  753. msg = Ack(job, fd, precalc[response])
  754. callback = promise(write_generator_done)
  755. cor = _write_ack(fd, msg, callback=callback)
  756. mark_write_gen_as_active(cor)
  757. mark_write_fd_as_active(fd)
  758. callback.args = (cor, )
  759. add_writer(fd, cor)
  760. self.send_ack = send_ack
  761. def _write_ack(fd, ack, callback=None):
  762. # writes ack back to the worker if synack enabled.
  763. # this operation *MUST* complete, otherwise
  764. # the worker process will hang waiting for the ack.
  765. header, body, body_size = ack[2]
  766. try:
  767. try:
  768. proc = fileno_to_synq[fd]
  769. except KeyError:
  770. # process died, we can safely discard the ack at this
  771. # point.
  772. raise StopIteration()
  773. send = proc.send_syn_offset
  774. Hw = Bw = 0
  775. # write header
  776. while Hw < 4:
  777. try:
  778. Hw += send(header, Hw)
  779. except Exception as exc:
  780. if get_errno(exc) not in UNAVAIL:
  781. raise
  782. yield
  783. # write body
  784. while Bw < body_size:
  785. try:
  786. Bw += send(body, Bw)
  787. except Exception as exc:
  788. if get_errno(exc) not in UNAVAIL:
  789. raise
  790. # suspend until more data
  791. yield
  792. finally:
  793. if callback:
  794. callback()
  795. # message written, so this fd is now available
  796. active_writes.discard(fd)
  797. def flush(self):
  798. if self._state == TERMINATE:
  799. return
  800. # cancel all tasks that have not been accepted so that NACK is sent.
  801. for job in values(self._cache):
  802. if not job._accepted:
  803. job._cancel()
  804. # clear the outgoing buffer as the tasks will be redelivered by
  805. # the broker anyway.
  806. if self.outbound_buffer:
  807. self.outbound_buffer.clear()
  808. self.maintain_pool()
  809. try:
  810. # ...but we must continue writing the payloads we already started
  811. # to keep message boundaries.
  812. # The messages may be NACK'ed later if synack is enabled.
  813. if self._state == RUN:
  814. # flush outgoing buffers
  815. intervals = fxrange(0.01, 0.1, 0.01, repeatlast=True)
  816. owned_by = {}
  817. for job in values(self._cache):
  818. writer = _get_job_writer(job)
  819. if writer is not None:
  820. owned_by[writer] = job
  821. while self._active_writers:
  822. writers = list(self._active_writers)
  823. for gen in writers:
  824. if (gen.__name__ == '_write_job' and
  825. gen_not_started(gen)):
  826. # has not started writing the job so can
  827. # discard the task, but we must also remove
  828. # it from the Pool._cache.
  829. try:
  830. job = owned_by[gen]
  831. except KeyError:
  832. pass
  833. else:
  834. # removes from Pool._cache
  835. job.discard()
  836. self._active_writers.discard(gen)
  837. else:
  838. try:
  839. job = owned_by[gen]
  840. except KeyError:
  841. pass
  842. else:
  843. job_proc = job._write_to
  844. if job_proc._is_alive():
  845. self._flush_writer(job_proc, gen)
  846. # workers may have exited in the meantime.
  847. self.maintain_pool()
  848. sleep(next(intervals)) # don't busyloop
  849. finally:
  850. self.outbound_buffer.clear()
  851. self._active_writers.clear()
  852. self._active_writes.clear()
  853. self._busy_workers.clear()
  854. def _flush_writer(self, proc, writer):
  855. fds = set([proc.inq._writer])
  856. try:
  857. while fds:
  858. if not proc._is_alive():
  859. break # process exited
  860. readable, writable, again = _select(
  861. writers=fds, err=fds, timeout=0.5,
  862. )
  863. if not again and (writable or readable):
  864. try:
  865. next(writer)
  866. except (StopIteration, OSError, IOError, EOFError):
  867. break
  868. finally:
  869. self._active_writers.discard(writer)
  870. def get_process_queues(self):
  871. """Get queues for a new process.
  872. Here we will find an unused slot, as there should always
  873. be one available when we start a new process.
  874. """
  875. return next(q for q, owner in items(self._queues)
  876. if owner is None)
  877. def on_grow(self, n):
  878. """Grow the pool by ``n`` proceses."""
  879. diff = max(self._processes - len(self._queues), 0)
  880. if diff:
  881. self._queues.update(
  882. dict((self.create_process_queues(), None) for _ in range(diff))
  883. )
  884. def on_shrink(self, n):
  885. """Shrink the pool by ``n`` processes."""
  886. pass
  887. def create_process_queues(self):
  888. """Creates new in, out (and optionally syn) queues,
  889. returned as a tuple."""
  890. # NOTE: Pipes must be set O_NONBLOCK at creation time (the original
  891. # fd), otherwise it will not be possible to change the flags until
  892. # there is an actual reader/writer on the other side.
  893. inq = _SimpleQueue(wnonblock=True)
  894. outq = _SimpleQueue(rnonblock=True)
  895. synq = None
  896. assert isblocking(inq._reader)
  897. assert not isblocking(inq._writer)
  898. assert not isblocking(outq._reader)
  899. assert isblocking(outq._writer)
  900. if self.synack:
  901. synq = _SimpleQueue(wnonblock=True)
  902. assert isblocking(synq._reader)
  903. assert not isblocking(synq._writer)
  904. return inq, outq, synq
  905. def on_process_alive(self, pid):
  906. """Handler called when the :const:`WORKER_UP` message is received
  907. from a child process, which marks the process as ready
  908. to receive work."""
  909. try:
  910. proc = next(w for w in self._pool if w.pid == pid)
  911. except StopIteration:
  912. return logger.warning('process with pid=%s already exited', pid)
  913. assert proc.inqW_fd not in self._fileno_to_inq
  914. assert proc.inqW_fd not in self._all_inqueues
  915. self._waiting_to_start.discard(proc)
  916. self._fileno_to_inq[proc.inqW_fd] = proc
  917. self._fileno_to_synq[proc.synqW_fd] = proc
  918. self._all_inqueues.add(proc.inqW_fd)
  919. def on_job_process_down(self, job, pid_gone):
  920. """Handler called for each job when the process it was assigned to
  921. exits."""
  922. if job._write_to and not job._write_to._is_alive():
  923. # job was partially written
  924. self.on_partial_read(job, job._write_to)
  925. elif job._scheduled_for and not job._scheduled_for._is_alive():
  926. # job was only scheduled to be written to this process,
  927. # but no data was sent so put it back on the outbound_buffer.
  928. self._put_back(job)
  929. def on_job_process_lost(self, job, pid, exitcode):
  930. """Handler called for each *started* job when the process it
  931. was assigned to exited by mysterious means (error exitcodes and
  932. signals)"""
  933. self.mark_as_worker_lost(job, exitcode)
  934. def human_write_stats(self):
  935. if self.write_stats is None:
  936. return 'N/A'
  937. vals = list(values(self.write_stats))
  938. total = sum(vals)
  939. def per(v, total):
  940. return '{0:.2f}%'.format((float(v) / total) * 100.0 if v else 0)
  941. return {
  942. 'total': total,
  943. 'avg': per(total / len(self.write_stats) if total else 0, total),
  944. 'all': ', '.join(per(v, total) for v in vals),
  945. 'raw': ', '.join(map(str, vals)),
  946. 'inqueues': {
  947. 'total': len(self._all_inqueues),
  948. 'active': len(self._active_writes),
  949. }
  950. }
  951. def _process_cleanup_queues(self, proc):
  952. """Handler called to clean up a processes queues after process
  953. exit."""
  954. if not proc.dead:
  955. try:
  956. self._queues[self._find_worker_queues(proc)] = None
  957. except (KeyError, ValueError):
  958. pass
  959. @staticmethod
  960. def _stop_task_handler(task_handler):
  961. """Called at shutdown to tell processes that we are shutting down."""
  962. for proc in task_handler.pool:
  963. try:
  964. setblocking(proc.inq._writer, 1)
  965. except (OSError, IOError):
  966. pass
  967. else:
  968. try:
  969. proc.inq.put(None)
  970. except OSError as exc:
  971. if get_errno(exc) != errno.EBADF:
  972. raise
  973. def create_result_handler(self):
  974. return super(AsynPool, self).create_result_handler(
  975. fileno_to_outq=self._fileno_to_outq,
  976. on_process_alive=self.on_process_alive,
  977. )
  978. def _process_register_queues(self, proc, queues):
  979. """Marks new ownership for ``queues`` so that the fileno indices are
  980. updated."""
  981. assert queues in self._queues
  982. b = len(self._queues)
  983. self._queues[queues] = proc
  984. assert b == len(self._queues)
  985. def _find_worker_queues(self, proc):
  986. """Find the queues owned by ``proc``."""
  987. try:
  988. return next(q for q, owner in items(self._queues)
  989. if owner == proc)
  990. except StopIteration:
  991. raise ValueError(proc)
  992. def _setup_queues(self):
  993. # this is only used by the original pool which uses a shared
  994. # queue for all processes.
  995. # these attributes makes no sense for us, but we will still
  996. # have to initialize them.
  997. self._inqueue = self._outqueue = \
  998. self._quick_put = self._quick_get = self._poll_result = None
  999. def process_flush_queues(self, proc):
  1000. """Flushes all queues, including the outbound buffer, so that
  1001. all tasks that have not been started will be discarded.
  1002. In Celery this is called whenever the transport connection is lost
  1003. (consumer restart).
  1004. """
  1005. resq = proc.outq._reader
  1006. on_state_change = self._result_handler.on_state_change
  1007. fds = set([resq])
  1008. while fds and not resq.closed and self._state != TERMINATE:
  1009. readable, _, again = _select(fds, None, fds, timeout=0.01)
  1010. if readable:
  1011. try:
  1012. task = resq.recv()
  1013. except (OSError, IOError, EOFError) as exc:
  1014. if get_errno(exc) == errno.EINTR:
  1015. continue
  1016. elif get_errno(exc) == errno.EAGAIN:
  1017. break
  1018. else:
  1019. debug('got %r while flushing process %r',
  1020. exc, proc, exc_info=1)
  1021. if get_errno(exc) not in UNAVAIL:
  1022. debug('got %r while flushing process %r',
  1023. exc, proc, exc_info=1)
  1024. break
  1025. else:
  1026. if task is None:
  1027. debug('got sentinel while flushing process %r', proc)
  1028. break
  1029. else:
  1030. on_state_change(task)
  1031. else:
  1032. break
  1033. def on_partial_read(self, job, proc):
  1034. """Called when a job was only partially written to a child process
  1035. and it exited."""
  1036. # worker terminated by signal:
  1037. # we cannot reuse the sockets again, because we don't know if
  1038. # the process wrote/read anything frmo them, and if so we cannot
  1039. # restore the message boundaries.
  1040. if not job._accepted:
  1041. # job was not acked, so find another worker to send it to.
  1042. self._put_back(job)
  1043. writer = _get_job_writer(job)
  1044. if writer:
  1045. self._active_writers.discard(writer)
  1046. del(writer)
  1047. if not proc.dead:
  1048. proc.dead = True
  1049. # Replace queues to avoid reuse
  1050. before = len(self._queues)
  1051. try:
  1052. queues = self._find_worker_queues(proc)
  1053. if self.destroy_queues(queues, proc):
  1054. self._queues[self.create_process_queues()] = None
  1055. except ValueError:
  1056. pass
  1057. assert len(self._queues) == before
  1058. def destroy_queues(self, queues, proc):
  1059. """Destroy queues that can no longer be used, so that they
  1060. be replaced by new sockets."""
  1061. assert not proc._is_alive()
  1062. self._waiting_to_start.discard(proc)
  1063. removed = 1
  1064. try:
  1065. self._queues.pop(queues)
  1066. except KeyError:
  1067. removed = 0
  1068. try:
  1069. self.on_inqueue_close(queues[0]._writer.fileno(), proc)
  1070. except IOError:
  1071. pass
  1072. for queue in queues:
  1073. if queue:
  1074. for sock in (queue._reader, queue._writer):
  1075. if not sock.closed:
  1076. try:
  1077. sock.close()
  1078. except (IOError, OSError):
  1079. pass
  1080. return removed
  1081. def _create_payload(self, type_, args,
  1082. dumps=_pickle.dumps, pack=struct.pack,
  1083. protocol=HIGHEST_PROTOCOL):
  1084. body = dumps((type_, args), protocol=protocol)
  1085. size = len(body)
  1086. header = pack('>I', size)
  1087. return header, body, size
  1088. @classmethod
  1089. def _set_result_sentinel(cls, _outqueue, _pool):
  1090. # unused
  1091. pass
  1092. def _help_stuff_finish_args(self):
  1093. # Pool._help_stuff_finished is a classmethod so we have to use this
  1094. # trick to modify the arguments passed to it.
  1095. return (self._pool, )
  1096. @classmethod
  1097. def _help_stuff_finish(cls, pool):
  1098. debug(
  1099. 'removing tasks from inqueue until task handler finished',
  1100. )
  1101. fileno_to_proc = {}
  1102. inqR = set()
  1103. for w in pool:
  1104. try:
  1105. fd = w.inq._reader.fileno()
  1106. inqR.add(fd)
  1107. fileno_to_proc[fd] = w
  1108. except IOError:
  1109. pass
  1110. while inqR:
  1111. readable, _, again = _select(inqR, timeout=0.5)
  1112. if again:
  1113. continue
  1114. if not readable:
  1115. break
  1116. for fd in readable:
  1117. fileno_to_proc[fd].inq._reader.recv()
  1118. sleep(0)
  1119. @property
  1120. def timers(self):
  1121. return {self.maintain_pool: 5.0}