Linux Kernel 3.7.1
kernel/workqueue.c
1 /*
2  * kernel/workqueue.c - generic async execution with shared worker pool
3  *
4  * Copyright (C) 2002 Ingo Molnar
5  *
6  * Derived from the taskqueue/keventd code by:
7  * David Woodhouse <[email protected]>
8  * Andrew Morton
9  * Kai Petzke <[email protected]>
10  * Theodore Ts'o <[email protected]>
11  *
12  * Made to use alloc_percpu by Christoph Lameter.
13  *
14  * Copyright (C) 2010 SUSE Linux Products GmbH
15  * Copyright (C) 2010 Tejun Heo <[email protected]>
16  *
17  * This is the generic async execution mechanism. Work items are
18  * executed in process context. The worker pool is shared and
19  * automatically managed. There is one worker pool for each CPU and
20  * one extra for works which are better served by workers which are
21  * not bound to any specific CPU.
22  *
23  * Please read Documentation/workqueue.txt for details.
24  */
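/*
 * Editor's note: a minimal, hypothetical usage sketch (not part of the
 * original file) of the API this file implements; my_work_fn and
 * my_work are illustrative names. schedule_work() queues the item on
 * system_wq, one of the shared workqueues declared below.
 *
 *	#include <linux/workqueue.h>
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		pr_info("running in process context on a shared worker\n");
 *	}
 *	static DECLARE_WORK(my_work, my_work_fn);
 *
 *	static int __init my_init(void)
 *	{
 *		schedule_work(&my_work);
 *		return 0;
 *	}
 */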
25 
26 #include <linux/export.h>
27 #include <linux/kernel.h>
28 #include <linux/sched.h>
29 #include <linux/init.h>
30 #include <linux/signal.h>
31 #include <linux/completion.h>
32 #include <linux/workqueue.h>
33 #include <linux/slab.h>
34 #include <linux/cpu.h>
35 #include <linux/notifier.h>
36 #include <linux/kthread.h>
37 #include <linux/hardirq.h>
38 #include <linux/mempolicy.h>
39 #include <linux/freezer.h>
40 #include <linux/kallsyms.h>
41 #include <linux/debug_locks.h>
42 #include <linux/lockdep.h>
43 #include <linux/idr.h>
44 
45 #include "workqueue_sched.h"
46 
47 enum {
48  /*
49  * global_cwq flags
50  *
51  * A bound gcwq is either associated or disassociated with its CPU.
52  * While associated (!DISASSOCIATED), all workers are bound to the
53  * CPU and none has %WORKER_UNBOUND set and concurrency management
54  * is in effect.
55  *
56  * While DISASSOCIATED, the cpu may be offline and all workers have
57  * %WORKER_UNBOUND set and concurrency management disabled, and may
58  * be executing on any CPU. The gcwq behaves as an unbound one.
59  *
60  * Note that DISASSOCIATED can be flipped only while holding
61  * assoc_mutex of all pools on the gcwq to avoid changing binding
62  * state while create_worker() is in progress.
63  */
64  GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65  GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66 
67  /* pool flags */
68  POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69  POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
70 
71  /* worker flags */
72  WORKER_STARTED = 1 << 0, /* started */
73  WORKER_DIE = 1 << 1, /* die die die */
74  WORKER_IDLE = 1 << 2, /* is idle */
75  WORKER_PREP = 1 << 3, /* preparing to run works */
76  WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
77  WORKER_UNBOUND = 1 << 7, /* worker is unbound */
78 
79  WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
80  WORKER_CPU_INTENSIVE,
81 
82  NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
83 
84  BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85  BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
86  BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
87 
88  MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
89  IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
90 
91  MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
92  /* call for help after 10ms
93  (min two ticks) */
94  MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
95  CREATE_COOLDOWN = HZ, /* time to breathe after a failure */
96 
97  /*
98  * Rescue workers are used only in emergencies and are shared by
99  * all cpus. Give them -20.
100  */
101  RESCUER_NICE_LEVEL = -20,
102  HIGHPRI_NICE_LEVEL = -20,
103 };
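/*
 * Editor's note (added for clarity): a worked example of the
 * MAYDAY_INITIAL_TIMEOUT expression above. With HZ=1000, HZ/100 is 10
 * jiffies (~10ms) and is used as-is; with HZ=100, HZ/100 is only 1
 * jiffy, so the 2-tick minimum (~20ms) is used instead.
 */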
104 
105 /*
106  * Structure fields follow one of the following exclusion rules.
107  *
108  * I: Modifiable by initialization/destruction paths and read-only for
109  * everyone else.
110  *
111  * P: Preemption protected. Disabling preemption is enough and should
112  * only be modified and accessed from the local cpu.
113  *
114  * L: gcwq->lock protected. Access with gcwq->lock held.
115  *
116  * X: During normal operation, modification requires gcwq->lock and
117  * should be done only from local cpu. Either disabling preemption
118  * on local cpu or grabbing gcwq->lock is enough for read access.
119  * If GCWQ_DISASSOCIATED is set, it's identical to L.
120  *
121  * F: wq->flush_mutex protected.
122  *
123  * W: workqueue_lock protected.
124  */
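/*
 * Editor's note (added for clarity): as an example of the rules above,
 * worker->flags below is marked "X:"; it is modified only from the
 * local CPU with gcwq->lock held, so either disabling preemption
 * locally or taking gcwq->lock is enough to read it. worker->id is
 * marked "I:" and never changes after creation.
 */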
125 
126 struct global_cwq;
127 struct worker_pool;
128 
129 /*
130  * The poor guys doing the actual heavy lifting. All on-duty workers
131  * are either serving the manager role, on idle list or on busy hash.
132  */
133 struct worker {
134  /* on idle list while idle, on busy hash table while busy */
135  union {
136  struct list_head entry; /* L: while idle */
137  struct hlist_node hentry; /* L: while busy */
138  };
139 
140  struct work_struct *current_work; /* L: work being processed */
141  struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142  struct list_head scheduled; /* L: scheduled works */
143  struct task_struct *task; /* I: worker task */
144  struct worker_pool *pool; /* I: the associated pool */
145  /* 64 bytes boundary on 64bit, 32 on 32bit */
146  unsigned long last_active; /* L: last active timestamp */
147  unsigned int flags; /* X: flags */
148  int id; /* I: worker id */
149 
150  /* for rebinding worker to CPU */
151  struct work_struct rebind_work; /* L: for busy worker */
152 };
153 
154 struct worker_pool {
155  struct global_cwq *gcwq; /* I: the owning gcwq */
156  unsigned int flags; /* X: flags */
157 
158  struct list_head worklist; /* L: list of pending works */
159  int nr_workers; /* L: total number of workers */
160 
161  /* nr_idle includes the ones off idle_list for rebinding */
162  int nr_idle; /* L: currently idle ones */
163 
164  struct list_head idle_list; /* X: list of idle workers */
165  struct timer_list idle_timer; /* L: worker idle timeout */
166  struct timer_list mayday_timer; /* L: SOS timer for workers */
167 
168  struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
169  struct ida worker_ida; /* L: for worker IDs */
170 };
171 
172 /*
173  * Global per-cpu workqueue. There's one and only one for each cpu
174  * and all works are queued and processed here regardless of their
175  * target workqueues.
176  */
177 struct global_cwq {
178  spinlock_t lock; /* the gcwq lock */
179  unsigned int cpu; /* I: the associated cpu */
180  unsigned int flags; /* L: GCWQ_* flags */
181 
182  /* workers are chained either in busy_hash or pool idle_list */
183  struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184  /* L: hash of busy workers */
185 
186  struct worker_pool pools[NR_WORKER_POOLS];
187  /* normal and highpri pools */
188 } ____cacheline_aligned_in_smp;
189 
190 /*
191  * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
192  * work_struct->data are used for flags and thus cwqs need to be
193  * aligned at two's power of the number of flag bits.
194  */
195 struct cpu_workqueue_struct {
196  struct worker_pool *pool; /* I: the associated pool */
197  struct workqueue_struct *wq; /* I: the owning workqueue */
198  int work_color; /* L: current color */
199  int flush_color; /* L: flushing color */
200  int nr_in_flight[WORK_NR_COLORS];
201  /* L: nr of in_flight works */
202  int nr_active; /* L: nr of active works */
203  int max_active; /* L: max active works */
204  struct list_head delayed_works; /* L: delayed works */
205 };
206 
207 /*
208  * Structure used to wait for workqueue flush.
209  */
210 struct wq_flusher {
211  struct list_head list; /* F: list of flushers */
212  int flush_color; /* F: flush color waiting for */
213  struct completion done; /* flush completion */
214 };
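/*
 * Editor's note: a hypothetical caller-side sketch (not part of the
 * original file) of the flush machinery this structure supports; my_wq
 * and my_work are illustrative names. flush_workqueue() makes the
 * caller a flusher for the current work color and sleeps on ->done
 * until all work items of that color have finished.
 *
 *	struct workqueue_struct *my_wq = alloc_workqueue("my_wq", 0, 0);
 *
 *	queue_work(my_wq, &my_work);
 *	flush_workqueue(my_wq);
 *	destroy_workqueue(my_wq);
 */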
215 
216 /*
217  * All cpumasks are assumed to be always set on UP and thus can't be
218  * used to determine whether there's something to be done.
219  */
220 #ifdef CONFIG_SMP
221 typedef cpumask_var_t mayday_mask_t;
222 #define mayday_test_and_set_cpu(cpu, mask) \
223  cpumask_test_and_set_cpu((cpu), (mask))
224 #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
225 #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
226 #define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
227 #define free_mayday_mask(mask) free_cpumask_var((mask))
228 #else
229 typedef unsigned long mayday_mask_t;
230 #define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
231 #define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
232 #define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
233 #define alloc_mayday_mask(maskp, gfp) true
234 #define free_mayday_mask(mask) do { } while (0)
235 #endif
236 
237 /*
238  * The externally visible workqueue abstraction is an array of
239  * per-CPU workqueues:
240  */
241 struct workqueue_struct {
242  unsigned int flags; /* W: WQ_* flags */
243  union {
244  struct cpu_workqueue_struct __percpu *pcpu;
245  struct cpu_workqueue_struct *single;
246  unsigned long v;
247  } cpu_wq; /* I: cwq's */
248  struct list_head list; /* W: list of all workqueues */
249 
250  struct mutex flush_mutex; /* protects wq flushing */
251  int work_color; /* F: current work color */
252  int flush_color; /* F: current flush color */
253  atomic_t nr_cwqs_to_flush; /* flush in progress */
254  struct wq_flusher *first_flusher; /* F: first flusher */
255  struct list_head flusher_queue; /* F: flush waiters */
256  struct list_head flusher_overflow; /* F: flush overflow list */
257 
258  mayday_mask_t mayday_mask; /* cpus requesting rescue */
259  struct worker *rescuer; /* I: rescue worker */
260 
261  int nr_drainers; /* W: drain in progress */
262  int saved_max_active; /* W: saved cwq max_active */
263 #ifdef CONFIG_LOCKDEP
264  struct lockdep_map lockdep_map;
265 #endif
266  char name[]; /* I: workqueue name */
267 };
268 
269 struct workqueue_struct *system_wq __read_mostly;
270 EXPORT_SYMBOL_GPL(system_wq);
271 struct workqueue_struct *system_highpri_wq __read_mostly;
272 EXPORT_SYMBOL_GPL(system_highpri_wq);
273 struct workqueue_struct *system_long_wq __read_mostly;
274 EXPORT_SYMBOL_GPL(system_long_wq);
275 struct workqueue_struct *system_unbound_wq __read_mostly;
276 EXPORT_SYMBOL_GPL(system_unbound_wq);
277 struct workqueue_struct *system_freezable_wq __read_mostly;
278 EXPORT_SYMBOL_GPL(system_freezable_wq);
279 
280 #define CREATE_TRACE_POINTS
281 #include <trace/events/workqueue.h>
282 
283 #define for_each_worker_pool(pool, gcwq) \
284  for ((pool) = &(gcwq)->pools[0]; \
285  (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
286 
287 #define for_each_busy_worker(worker, i, pos, gcwq) \
288  for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
289  hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
290 
291 static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
292  unsigned int sw)
293 {
294  if (cpu < nr_cpu_ids) {
295  if (sw & 1) {
296  cpu = cpumask_next(cpu, mask);
297  if (cpu < nr_cpu_ids)
298  return cpu;
299  }
300  if (sw & 2)
301  return WORK_CPU_UNBOUND;
302  }
303  return WORK_CPU_NONE;
304 }
305 
306 static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
307  struct workqueue_struct *wq)
308 {
309  return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
310 }
311 
312 /*
313  * CPU iterators
314  *
315  * An extra gcwq is defined for an invalid cpu number
316  * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
317  * specific CPU. The following iterators are similar to
318  * for_each_*_cpu() iterators but also considers the unbound gcwq.
319  *
320  * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
321  * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
322  * for_each_cwq_cpu() : possible CPUs for bound workqueues,
323  * WORK_CPU_UNBOUND for unbound workqueues
324  */
325 #define for_each_gcwq_cpu(cpu) \
326  for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
327  (cpu) < WORK_CPU_NONE; \
328  (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
329 
330 #define for_each_online_gcwq_cpu(cpu) \
331  for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
332  (cpu) < WORK_CPU_NONE; \
333  (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
334 
335 #define for_each_cwq_cpu(cpu, wq) \
336  for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
337  (cpu) < WORK_CPU_NONE; \
338  (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
339 
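/*
 * Editor's note: an illustrative sketch (not part of the original file)
 * of how the iterators above are used, e.g. to walk every gcwq
 * including the unbound one, much like is_chained_work() does below:
 *
 *	unsigned int cpu;
 *
 *	for_each_gcwq_cpu(cpu) {
 *		struct global_cwq *gcwq = get_gcwq(cpu);
 *
 *		spin_lock_irq(&gcwq->lock);
 *		...
 *		spin_unlock_irq(&gcwq->lock);
 *	}
 */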
340 #ifdef CONFIG_DEBUG_OBJECTS_WORK
341 
342 static struct debug_obj_descr work_debug_descr;
343 
344 static void *work_debug_hint(void *addr)
345 {
346  return ((struct work_struct *) addr)->func;
347 }
348 
349 /*
350  * fixup_init is called when:
351  * - an active object is initialized
352  */
353 static int work_fixup_init(void *addr, enum debug_obj_state state)
354 {
355  struct work_struct *work = addr;
356 
357  switch (state) {
358  case ODEBUG_STATE_ACTIVE:
359  cancel_work_sync(work);
360  debug_object_init(work, &work_debug_descr);
361  return 1;
362  default:
363  return 0;
364  }
365 }
366 
367 /*
368  * fixup_activate is called when:
369  * - an active object is activated
370  * - an unknown object is activated (might be a statically initialized object)
371  */
372 static int work_fixup_activate(void *addr, enum debug_obj_state state)
373 {
374  struct work_struct *work = addr;
375 
376  switch (state) {
377 
378  case ODEBUG_STATE_NOTAVAILABLE:
379  /*
380  * This is not really a fixup. The work struct was
381  * statically initialized. We just make sure that it
382  * is tracked in the object tracker.
383  */
384  if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
385  debug_object_init(work, &work_debug_descr);
386  debug_object_activate(work, &work_debug_descr);
387  return 0;
388  }
389  WARN_ON_ONCE(1);
390  return 0;
391 
392  case ODEBUG_STATE_ACTIVE:
393  WARN_ON(1);
394 
395  default:
396  return 0;
397  }
398 }
399 
400 /*
401  * fixup_free is called when:
402  * - an active object is freed
403  */
404 static int work_fixup_free(void *addr, enum debug_obj_state state)
405 {
406  struct work_struct *work = addr;
407 
408  switch (state) {
409  case ODEBUG_STATE_ACTIVE:
410  cancel_work_sync(work);
411  debug_object_free(work, &work_debug_descr);
412  return 1;
413  default:
414  return 0;
415  }
416 }
417 
418 static struct debug_obj_descr work_debug_descr = {
419  .name = "work_struct",
420  .debug_hint = work_debug_hint,
421  .fixup_init = work_fixup_init,
422  .fixup_activate = work_fixup_activate,
423  .fixup_free = work_fixup_free,
424 };
425 
426 static inline void debug_work_activate(struct work_struct *work)
427 {
428  debug_object_activate(work, &work_debug_descr);
429 }
430 
431 static inline void debug_work_deactivate(struct work_struct *work)
432 {
433  debug_object_deactivate(work, &work_debug_descr);
434 }
435 
436 void __init_work(struct work_struct *work, int onstack)
437 {
438  if (onstack)
439  debug_object_init_on_stack(work, &work_debug_descr);
440  else
441  debug_object_init(work, &work_debug_descr);
442 }
443 EXPORT_SYMBOL_GPL(__init_work);
444 
445 void destroy_work_on_stack(struct work_struct *work)
446 {
447  debug_object_free(work, &work_debug_descr);
448 }
449 EXPORT_SYMBOL_GPL(destroy_work_on_stack);
450 
451 #else
452 static inline void debug_work_activate(struct work_struct *work) { }
453 static inline void debug_work_deactivate(struct work_struct *work) { }
454 #endif
455 
456 /* Serializes the accesses to the list of workqueues. */
457 static DEFINE_SPINLOCK(workqueue_lock);
458 static LIST_HEAD(workqueues);
459 static bool workqueue_freezing; /* W: have wqs started freezing? */
460 
461 /*
462  * The almighty global cpu workqueues. nr_running is the only field
463  * which is expected to be used frequently by other cpus via
464  * try_to_wake_up(). Put it in a separate cacheline.
465  */
466 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
467 static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
468 
469 /*
470  * Global cpu workqueue and nr_running counter for unbound gcwq. The
471  * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
472  * workers have WORKER_UNBOUND set.
473  */
474 static struct global_cwq unbound_global_cwq;
475 static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
476  [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
477 };
478 
479 static int worker_thread(void *__worker);
480 
481 static int worker_pool_pri(struct worker_pool *pool)
482 {
483  return pool - pool->gcwq->pools;
484 }
485 
486 static struct global_cwq *get_gcwq(unsigned int cpu)
487 {
488  if (cpu != WORK_CPU_UNBOUND)
489  return &per_cpu(global_cwq, cpu);
490  else
491  return &unbound_global_cwq;
492 }
493 
494 static atomic_t *get_pool_nr_running(struct worker_pool *pool)
495 {
496  int cpu = pool->gcwq->cpu;
497  int idx = worker_pool_pri(pool);
498 
499  if (cpu != WORK_CPU_UNBOUND)
500  return &per_cpu(pool_nr_running, cpu)[idx];
501  else
502  return &unbound_pool_nr_running[idx];
503 }
504 
505 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
506  struct workqueue_struct *wq)
507 {
508  if (!(wq->flags & WQ_UNBOUND)) {
509  if (likely(cpu < nr_cpu_ids))
510  return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
511  } else if (likely(cpu == WORK_CPU_UNBOUND))
512  return wq->cpu_wq.single;
513  return NULL;
514 }
515 
516 static unsigned int work_color_to_flags(int color)
517 {
518  return color << WORK_STRUCT_COLOR_SHIFT;
519 }
520 
521 static int get_work_color(struct work_struct *work)
522 {
523  return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
524  ((1 << WORK_STRUCT_COLOR_BITS) - 1);
525 }
526 
527 static int work_next_color(int color)
528 {
529  return (color + 1) % WORK_NR_COLORS;
530 }
531 
532 /*
533  * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
534  * contain the pointer to the queued cwq. Once execution starts, the flag
535  * is cleared and the high bits contain OFFQ flags and CPU number.
536  *
537  * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
538  * and clear_work_data() can be used to set the cwq, cpu or clear
539  * work->data. These functions should only be called while the work is
540  * owned - ie. while the PENDING bit is set.
541  *
542  * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
543  * a work. gcwq is available once the work has been queued anywhere after
544  * initialization until it is sync canceled. cwq is available only while
545  * the work item is queued.
546  *
547  * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548  * canceled. While being canceled, a work item may have its PENDING set
549  * but stay off timer and worklist for arbitrarily long and nobody should
550  * try to steal the PENDING bit.
551  */
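/*
 * Editor's note (added for clarity): the two layouts described above,
 * as decoded by get_work_cwq() and get_work_gcwq() below:
 *
 *	queued:		data = cwq pointer | WORK_STRUCT_CWQ | other flags
 *			cwq  = data & WORK_STRUCT_WQ_DATA_MASK
 *	off queue:	data = (last cpu << WORK_OFFQ_CPU_SHIFT) | OFFQ flags
 *			cpu  = data >> WORK_OFFQ_CPU_SHIFT
 */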
552 static inline void set_work_data(struct work_struct *work, unsigned long data,
553  unsigned long flags)
554 {
555  BUG_ON(!work_pending(work));
556  atomic_long_set(&work->data, data | flags | work_static(work));
557 }
558 
559 static void set_work_cwq(struct work_struct *work,
560  struct cpu_workqueue_struct *cwq,
561  unsigned long extra_flags)
562 {
563  set_work_data(work, (unsigned long)cwq,
564  WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
565 }
566 
567 static void set_work_cpu_and_clear_pending(struct work_struct *work,
568  unsigned int cpu)
569 {
570  /*
571  * The following wmb is paired with the implied mb in
572  * test_and_set_bit(PENDING) and ensures all updates to @work made
573  * here are visible to and precede any updates by the next PENDING
574  * owner.
575  */
576  smp_wmb();
577  set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
578 }
579 
580 static void clear_work_data(struct work_struct *work)
581 {
582  smp_wmb(); /* see set_work_cpu_and_clear_pending() */
583  set_work_data(work, WORK_STRUCT_NO_CPU, 0);
584 }
585 
586 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
587 {
588  unsigned long data = atomic_long_read(&work->data);
589 
590  if (data & WORK_STRUCT_CWQ)
591  return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
592  else
593  return NULL;
594 }
595 
596 static struct global_cwq *get_work_gcwq(struct work_struct *work)
597 {
598  unsigned long data = atomic_long_read(&work->data);
599  unsigned int cpu;
600 
601  if (data & WORK_STRUCT_CWQ)
602  return ((struct cpu_workqueue_struct *)
603  (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
604 
605  cpu = data >> WORK_OFFQ_CPU_SHIFT;
606  if (cpu == WORK_CPU_NONE)
607  return NULL;
608 
609  BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
610  return get_gcwq(cpu);
611 }
612 
613 static void mark_work_canceling(struct work_struct *work)
614 {
615  struct global_cwq *gcwq = get_work_gcwq(work);
616  unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617 
618  set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
619  WORK_STRUCT_PENDING);
620 }
621 
622 static bool work_is_canceling(struct work_struct *work)
623 {
624  unsigned long data = atomic_long_read(&work->data);
625 
626  return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
627 }
628 
629 /*
630  * Policy functions. These define the policies on how the global worker
631  * pools are managed. Unless noted otherwise, these functions assume that
632  * they're being called with gcwq->lock held.
633  */
634 
635 static bool __need_more_worker(struct worker_pool *pool)
636 {
637  return !atomic_read(get_pool_nr_running(pool));
638 }
639 
640 /*
641  * Need to wake up a worker? Called from anything but currently
642  * running workers.
643  *
644  * Note that, because unbound workers never contribute to nr_running, this
645  * function will always return %true for unbound gcwq as long as the
646  * worklist isn't empty.
647  */
648 static bool need_more_worker(struct worker_pool *pool)
649 {
650  return !list_empty(&pool->worklist) && __need_more_worker(pool);
651 }
652 
653 /* Can I start working? Called from busy but !running workers. */
654 static bool may_start_working(struct worker_pool *pool)
655 {
656  return pool->nr_idle;
657 }
658 
659 /* Do I need to keep working? Called from currently running workers. */
660 static bool keep_working(struct worker_pool *pool)
661 {
662  atomic_t *nr_running = get_pool_nr_running(pool);
663 
664  return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
665 }
666 
667 /* Do we need a new worker? Called from manager. */
668 static bool need_to_create_worker(struct worker_pool *pool)
669 {
670  return need_more_worker(pool) && !may_start_working(pool);
671 }
672 
673 /* Do I need to be the manager? */
674 static bool need_to_manage_workers(struct worker_pool *pool)
675 {
676  return need_to_create_worker(pool) ||
677  (pool->flags & POOL_MANAGE_WORKERS);
678 }
679 
680 /* Do we have too many workers and should some go away? */
681 static bool too_many_workers(struct worker_pool *pool)
682 {
683  bool managing = pool->flags & POOL_MANAGING_WORKERS;
684  int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
685  int nr_busy = pool->nr_workers - nr_idle;
686 
687  /*
688  * nr_idle and idle_list may disagree if idle rebinding is in
689  * progress. Never return %true if idle_list is empty.
690  */
691  if (list_empty(&pool->idle_list))
692  return false;
693 
694  return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
695 }
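/*
 * Editor's note (added for clarity): a worked example of the check
 * above. With MAX_IDLE_WORKERS_RATIO == 4 and 16 busy workers, up to 5
 * idle workers are tolerated; once nr_idle reaches 6, (6 - 2) * 4 >= 16
 * holds, too_many_workers() returns true and the idle timer can start
 * retiring the excess workers.
 */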
696 
697 /*
698  * Wake up functions.
699  */
700 
701 /* Return the first worker. Safe with preemption disabled */
702 static struct worker *first_worker(struct worker_pool *pool)
703 {
704  if (unlikely(list_empty(&pool->idle_list)))
705  return NULL;
706 
707  return list_first_entry(&pool->idle_list, struct worker, entry);
708 }
709 
719 static void wake_up_worker(struct worker_pool *pool)
720 {
721  struct worker *worker = first_worker(pool);
722 
723  if (likely(worker))
724  wake_up_process(worker->task);
725 }
726 
738 void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739 {
740  struct worker *worker = kthread_data(task);
741 
742  if (!(worker->flags & WORKER_NOT_RUNNING))
743  atomic_inc(get_pool_nr_running(worker->pool));
744 }
745 
761 struct task_struct *wq_worker_sleeping(struct task_struct *task,
762  unsigned int cpu)
763 {
764  struct worker *worker = kthread_data(task), *to_wakeup = NULL;
765  struct worker_pool *pool = worker->pool;
766  atomic_t *nr_running = get_pool_nr_running(pool);
767 
768  if (worker->flags & WORKER_NOT_RUNNING)
769  return NULL;
770 
771  /* this can only happen on the local cpu */
772  BUG_ON(cpu != raw_smp_processor_id());
773 
774  /*
775  * The counterpart of the following dec_and_test, implied mb,
776  * worklist not empty test sequence is in insert_work().
777  * Please read comment there.
778  *
779  * NOT_RUNNING is clear. This means that we're bound to and
780  * running on the local cpu w/ rq lock held and preemption
781  * disabled, which in turn means that no one else could be
782  * manipulating idle_list, so dereferencing idle_list without gcwq
783  * lock is safe.
784  */
785  if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
786  to_wakeup = first_worker(pool);
787  return to_wakeup ? to_wakeup->task : NULL;
788 }
789 
803 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
804  bool wakeup)
805 {
806  struct worker_pool *pool = worker->pool;
807 
808  WARN_ON_ONCE(worker->task != current);
809 
810  /*
811  * If transitioning into NOT_RUNNING, adjust nr_running and
812  * wake up an idle worker as necessary if requested by
813  * @wakeup.
814  */
815  if ((flags & WORKER_NOT_RUNNING) &&
816  !(worker->flags & WORKER_NOT_RUNNING)) {
817  atomic_t *nr_running = get_pool_nr_running(pool);
818 
819  if (wakeup) {
820  if (atomic_dec_and_test(nr_running) &&
821  !list_empty(&pool->worklist))
822  wake_up_worker(pool);
823  } else
824  atomic_dec(nr_running);
825  }
826 
827  worker->flags |= flags;
828 }
829 
840 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
841 {
842  struct worker_pool *pool = worker->pool;
843  unsigned int oflags = worker->flags;
844 
845  WARN_ON_ONCE(worker->task != current);
846 
847  worker->flags &= ~flags;
848 
849  /*
850  * If transitioning out of NOT_RUNNING, increment nr_running. Note
851  * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
852  * of multiple flags, not a single flag.
853  */
854  if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
855  if (!(worker->flags & WORKER_NOT_RUNNING))
856  atomic_inc(get_pool_nr_running(pool));
857 }
858 
872 static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
873  struct work_struct *work)
874 {
875  const int base_shift = ilog2(sizeof(struct work_struct));
876  unsigned long v = (unsigned long)work;
877 
878  /* simple shift and fold hash, do we need something better? */
879  v >>= base_shift;
880  v += v >> BUSY_WORKER_HASH_ORDER;
881  v &= BUSY_WORKER_HASH_MASK;
882 
883  return &gcwq->busy_hash[v];
884 }
885 
903 static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
904  struct hlist_head *bwh,
905  struct work_struct *work)
906 {
907  struct worker *worker;
908  struct hlist_node *tmp;
909 
910  hlist_for_each_entry(worker, tmp, bwh, hentry)
911  if (worker->current_work == work)
912  return worker;
913  return NULL;
914 }
915 
932 static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
933  struct work_struct *work)
934 {
935  return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
936  work);
937 }
938 
956 static void move_linked_works(struct work_struct *work, struct list_head *head,
957  struct work_struct **nextp)
958 {
959  struct work_struct *n;
960 
961  /*
962  * Linked worklist will always end before the end of the list,
963  * use NULL for list head.
964  */
965  list_for_each_entry_safe_from(work, n, NULL, entry) {
966  list_move_tail(&work->entry, head);
967  if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
968  break;
969  }
970 
971  /*
972  * If we're already inside safe list traversal and have moved
973  * multiple works to the scheduled queue, the next position
974  * needs to be updated.
975  */
976  if (nextp)
977  *nextp = n;
978 }
979 
980 static void cwq_activate_delayed_work(struct work_struct *work)
981 {
982  struct cpu_workqueue_struct *cwq = get_work_cwq(work);
983 
984  trace_workqueue_activate_work(work);
985  move_linked_works(work, &cwq->pool->worklist, NULL);
986  __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
987  cwq->nr_active++;
988 }
989 
990 static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
991 {
992  struct work_struct *work = list_first_entry(&cwq->delayed_works,
993  struct work_struct, entry);
994 
995  cwq_activate_delayed_work(work);
996 }
997 
1009 static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1010 {
1011  /* ignore uncolored works */
1012  if (color == WORK_NO_COLOR)
1013  return;
1014 
1015  cwq->nr_in_flight[color]--;
1016 
1017  cwq->nr_active--;
1018  if (!list_empty(&cwq->delayed_works)) {
1019  /* one down, submit a delayed one */
1020  if (cwq->nr_active < cwq->max_active)
1021  cwq_activate_first_delayed(cwq);
1022  }
1023 
1024  /* is flush in progress and are we at the flushing tip? */
1025  if (likely(cwq->flush_color != color))
1026  return;
1027 
1028  /* are there still in-flight works? */
1029  if (cwq->nr_in_flight[color])
1030  return;
1031 
1032  /* this cwq is done, clear flush_color */
1033  cwq->flush_color = -1;
1034 
1035  /*
1036  * If this was the last cwq, wake up the first flusher. It
1037  * will handle the rest.
1038  */
1039  if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1040  complete(&cwq->wq->first_flusher->done);
1041 }
1042 
1068 static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1069  unsigned long *flags)
1070 {
1071  struct global_cwq *gcwq;
1072 
1073  local_irq_save(*flags);
1074 
1075  /* try to steal the timer if it exists */
1076  if (is_dwork) {
1077  struct delayed_work *dwork = to_delayed_work(work);
1078 
1079  /*
1080  * dwork->timer is irqsafe. If del_timer() fails, it's
1081  * guaranteed that the timer is not queued anywhere and not
1082  * running on the local CPU.
1083  */
1084  if (likely(del_timer(&dwork->timer)))
1085  return 1;
1086  }
1087 
1088  /* try to claim PENDING the normal way */
1089  if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1090  return 0;
1091 
1092  /*
1093  * The queueing is in progress, or it is already queued. Try to
1094  * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1095  */
1096  gcwq = get_work_gcwq(work);
1097  if (!gcwq)
1098  goto fail;
1099 
1100  spin_lock(&gcwq->lock);
1101  if (!list_empty(&work->entry)) {
1102  /*
1103  * This work is queued, but perhaps we locked the wrong gcwq.
1104  * In that case we must see the new value after rmb(), see
1105  * insert_work()->wmb().
1106  */
1107  smp_rmb();
1108  if (gcwq == get_work_gcwq(work)) {
1109  debug_work_deactivate(work);
1110 
1111  /*
1112  * A delayed work item cannot be grabbed directly
1113  * because it might have linked NO_COLOR work items
1114  * which, if left on the delayed_list, will confuse
1115  * cwq->nr_active management later on and cause
1116  * stall. Make sure the work item is activated
1117  * before grabbing.
1118  */
1119  if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1120  cwq_activate_delayed_work(work);
1121 
1122  list_del_init(&work->entry);
1123  cwq_dec_nr_in_flight(get_work_cwq(work),
1124  get_work_color(work));
1125 
1126  spin_unlock(&gcwq->lock);
1127  return 1;
1128  }
1129  }
1130  spin_unlock(&gcwq->lock);
1131 fail:
1132  local_irq_restore(*flags);
1133  if (work_is_canceling(work))
1134  return -ENOENT;
1135  cpu_relax();
1136  return -EAGAIN;
1137 }
1138 
1152 static void insert_work(struct cpu_workqueue_struct *cwq,
1153  struct work_struct *work, struct list_head *head,
1154  unsigned int extra_flags)
1155 {
1156  struct worker_pool *pool = cwq->pool;
1157 
1158  /* we own @work, set data and link */
1159  set_work_cwq(work, cwq, extra_flags);
1160 
1161  /*
1162  * Ensure that we get the right work->data if we see the
1163  * result of list_add() below, see try_to_grab_pending().
1164  */
1165  smp_wmb();
1166 
1167  list_add_tail(&work->entry, head);
1168 
1169  /*
1170  * Ensure either worker_sched_deactivated() sees the above
1171  * list_add_tail() or we see zero nr_running to avoid workers
1172  * lying around lazily while there are works to be processed.
1173  */
1174  smp_mb();
1175 
1176  if (__need_more_worker(pool))
1177  wake_up_worker(pool);
1178 }
1179 
1180 /*
1181  * Test whether @work is being queued from another work executing on the
1182  * same workqueue. This is rather expensive and should only be used from
1183  * cold paths.
1184  */
1185 static bool is_chained_work(struct workqueue_struct *wq)
1186 {
1187  unsigned long flags;
1188  unsigned int cpu;
1189 
1190  for_each_gcwq_cpu(cpu) {
1191  struct global_cwq *gcwq = get_gcwq(cpu);
1192  struct worker *worker;
1193  struct hlist_node *pos;
1194  int i;
1195 
1196  spin_lock_irqsave(&gcwq->lock, flags);
1197  for_each_busy_worker(worker, i, pos, gcwq) {
1198  if (worker->task != current)
1199  continue;
1200  spin_unlock_irqrestore(&gcwq->lock, flags);
1201  /*
1202  * I'm @worker, no locking necessary. See if @work
1203  * is headed to the same workqueue.
1204  */
1205  return worker->current_cwq->wq == wq;
1206  }
1207  spin_unlock_irqrestore(&gcwq->lock, flags);
1208  }
1209  return false;
1210 }
1211 
1212 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1213  struct work_struct *work)
1214 {
1215  struct global_cwq *gcwq;
1216  struct cpu_workqueue_struct *cwq;
1217  struct list_head *worklist;
1218  unsigned int work_flags;
1219  unsigned int req_cpu = cpu;
1220 
1221  /*
1222  * While a work item is PENDING && off queue, a task trying to
1223  * steal the PENDING will busy-loop waiting for it to either get
1224  * queued or lose PENDING. Grabbing PENDING and queueing should
1225  * happen with IRQ disabled.
1226  */
1227  WARN_ON_ONCE(!irqs_disabled());
1228 
1229  debug_work_activate(work);
1230 
1231  /* if dying, only works from the same workqueue are allowed */
1232  if (unlikely(wq->flags & WQ_DRAINING) &&
1233  WARN_ON_ONCE(!is_chained_work(wq)))
1234  return;
1235 
1236  /* determine gcwq to use */
1237  if (!(wq->flags & WQ_UNBOUND)) {
1238  struct global_cwq *last_gcwq;
1239 
1240  if (cpu == WORK_CPU_UNBOUND)
1241  cpu = raw_smp_processor_id();
1242 
1243  /*
1244  * It's multi cpu. If @work was previously on a different
1245  * cpu, it might still be running there, in which case the
1246  * work needs to be queued on that cpu to guarantee
1247  * non-reentrancy.
1248  */
1249  gcwq = get_gcwq(cpu);
1250  last_gcwq = get_work_gcwq(work);
1251 
1252  if (last_gcwq && last_gcwq != gcwq) {
1253  struct worker *worker;
1254 
1255  spin_lock(&last_gcwq->lock);
1256 
1257  worker = find_worker_executing_work(last_gcwq, work);
1258 
1259  if (worker && worker->current_cwq->wq == wq)
1260  gcwq = last_gcwq;
1261  else {
1262  /* meh... not running there, queue here */
1263  spin_unlock(&last_gcwq->lock);
1264  spin_lock(&gcwq->lock);
1265  }
1266  } else {
1267  spin_lock(&gcwq->lock);
1268  }
1269  } else {
1270  gcwq = get_gcwq(WORK_CPU_UNBOUND);
1271  spin_lock(&gcwq->lock);
1272  }
1273 
1274  /* gcwq determined, get cwq and queue */
1275  cwq = get_cwq(gcwq->cpu, wq);
1276  trace_workqueue_queue_work(req_cpu, cwq, work);
1277 
1278  if (WARN_ON(!list_empty(&work->entry))) {
1279  spin_unlock(&gcwq->lock);
1280  return;
1281  }
1282 
1283  cwq->nr_in_flight[cwq->work_color]++;
1284  work_flags = work_color_to_flags(cwq->work_color);
1285 
1286  if (likely(cwq->nr_active < cwq->max_active)) {
1287  trace_workqueue_activate_work(work);
1288  cwq->nr_active++;
1289  worklist = &cwq->pool->worklist;
1290  } else {
1291  work_flags |= WORK_STRUCT_DELAYED;
1292  worklist = &cwq->delayed_works;
1293  }
1294 
1295  insert_work(cwq, work, worklist, work_flags);
1296 
1297  spin_unlock(&gcwq->lock);
1298 }
1299 
1311 bool queue_work_on(int cpu, struct workqueue_struct *wq,
1312  struct work_struct *work)
1313 {
1314  bool ret = false;
1315  unsigned long flags;
1316 
1317  local_irq_save(flags);
1318 
1319  if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1320  __queue_work(cpu, wq, work);
1321  ret = true;
1322  }
1323 
1324  local_irq_restore(flags);
1325  return ret;
1326 }
1328 
1339 bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1340 {
1341  return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1342 }
1344 
1345 void delayed_work_timer_fn(unsigned long __data)
1346 {
1347  struct delayed_work *dwork = (struct delayed_work *)__data;
1348  struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1349 
1350  /* should have been called from irqsafe timer with irq already off */
1351  __queue_work(dwork->cpu, cwq->wq, &dwork->work);
1352 }
1354 
1355 static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1356  struct delayed_work *dwork, unsigned long delay)
1357 {
1358  struct timer_list *timer = &dwork->timer;
1359  struct work_struct *work = &dwork->work;
1360  unsigned int lcpu;
1361 
1361 
1362  WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1363  timer->data != (unsigned long)dwork);
1364  WARN_ON_ONCE(timer_pending(timer));
1365  WARN_ON_ONCE(!list_empty(&work->entry));
1366 
1367  /*
1368  * If @delay is 0, queue @dwork->work immediately. This is for
1369  * both optimization and correctness. The earliest @timer can
1370  * expire is on the closest next tick and delayed_work users depend
1371  * on that there's no such delay when @delay is 0.
1372  */
1373  if (!delay) {
1374  __queue_work(cpu, wq, &dwork->work);
1375  return;
1376  }
1377 
1378  timer_stats_timer_set_start_info(&dwork->timer);
1379 
1380  /*
1381  * This stores cwq for the moment, for the timer_fn. Note that the
1382  * work's gcwq is preserved to allow reentrance detection for
1383  * delayed works.
1384  */
1385  if (!(wq->flags & WQ_UNBOUND)) {
1386  struct global_cwq *gcwq = get_work_gcwq(work);
1387 
1388  /*
1389  * If we cannot get the last gcwq from @work directly,
1390  * select the last CPU such that it avoids unnecessarily
1391  * triggering non-reentrancy check in __queue_work().
1392  */
1393  lcpu = cpu;
1394  if (gcwq)
1395  lcpu = gcwq->cpu;
1396  if (lcpu == WORK_CPU_UNBOUND)
1397  lcpu = raw_smp_processor_id();
1398  } else {
1399  lcpu = WORK_CPU_UNBOUND;
1400  }
1401 
1402  set_work_cwq(work, get_cwq(lcpu, wq), 0);
1403 
1404  dwork->cpu = cpu;
1405  timer->expires = jiffies + delay;
1406 
1407  if (unlikely(cpu != WORK_CPU_UNBOUND))
1408  add_timer_on(timer, cpu);
1409  else
1410  add_timer(timer);
1411 }
1412 
1424 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1425  struct delayed_work *dwork, unsigned long delay)
1426 {
1427  struct work_struct *work = &dwork->work;
1428  bool ret = false;
1429  unsigned long flags;
1430 
1431  /* read the comment in __queue_work() */
1432  local_irq_save(flags);
1433 
1434  if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1435  __queue_delayed_work(cpu, wq, dwork, delay);
1436  ret = true;
1437  }
1438 
1439  local_irq_restore(flags);
1440  return ret;
1441 }
1443 
1452 bool queue_delayed_work(struct workqueue_struct *wq,
1453  struct delayed_work *dwork, unsigned long delay)
1454 {
1455  return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1456 }
1458 
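/*
 * Editor's note: a hypothetical usage sketch (not part of the original
 * file) of the delayed-work interface implemented above; my_dwork_fn
 * and my_dwork are illustrative names. mod_delayed_work(), defined
 * below, re-arms the timer whether or not the item was already pending.
 *
 *	static void my_dwork_fn(struct work_struct *work)
 *	{
 *		pr_info("ran at least HZ jiffies after queueing\n");
 *	}
 *	static DECLARE_DELAYED_WORK(my_dwork, my_dwork_fn);
 *
 *	queue_delayed_work(system_wq, &my_dwork, HZ);
 *	mod_delayed_work(system_wq, &my_dwork, 2 * HZ);
 */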
1477 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1478  struct delayed_work *dwork, unsigned long delay)
1479 {
1480  unsigned long flags;
1481  int ret;
1482 
1483  do {
1484  ret = try_to_grab_pending(&dwork->work, true, &flags);
1485  } while (unlikely(ret == -EAGAIN));
1486 
1487  if (likely(ret >= 0)) {
1488  __queue_delayed_work(cpu, wq, dwork, delay);
1489  local_irq_restore(flags);
1490  }
1491 
1492  /* -ENOENT from try_to_grab_pending() becomes %true */
1493  return ret;
1494 }
1496 
1505 bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1506  unsigned long delay)
1507 {
1508  return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1509 }
1511 
1522 static void worker_enter_idle(struct worker *worker)
1523 {
1524  struct worker_pool *pool = worker->pool;
1525  struct global_cwq *gcwq = pool->gcwq;
1526 
1527  BUG_ON(worker->flags & WORKER_IDLE);
1528  BUG_ON(!list_empty(&worker->entry) &&
1529  (worker->hentry.next || worker->hentry.pprev));
1530 
1531  /* can't use worker_set_flags(), also called from start_worker() */
1532  worker->flags |= WORKER_IDLE;
1533  pool->nr_idle++;
1534  worker->last_active = jiffies;
1535 
1536  /* idle_list is LIFO */
1537  list_add(&worker->entry, &pool->idle_list);
1538 
1539  if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1540  mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1541 
1542  /*
1543  * Sanity check nr_running. Because gcwq_unbind_fn() releases
1544  * gcwq->lock between setting %WORKER_UNBOUND and zapping
1545  * nr_running, the warning may trigger spuriously. Check iff
1546  * unbind is not in progress.
1547  */
1548  WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1549  pool->nr_workers == pool->nr_idle &&
1550  atomic_read(get_pool_nr_running(pool)));
1551 }
1552 
1562 static void worker_leave_idle(struct worker *worker)
1563 {
1564  struct worker_pool *pool = worker->pool;
1565 
1566  BUG_ON(!(worker->flags & WORKER_IDLE));
1567  worker_clr_flags(worker, WORKER_IDLE);
1568  pool->nr_idle--;
1569  list_del_init(&worker->entry);
1570 }
1571 
1602 static bool worker_maybe_bind_and_lock(struct worker *worker)
1603 __acquires(&gcwq->lock)
1604 {
1605  struct global_cwq *gcwq = worker->pool->gcwq;
1606  struct task_struct *task = worker->task;
1607 
1608  while (true) {
1609  /*
1610  * The following call may fail, succeed or succeed
1611  * without actually migrating the task to the cpu if
1612  * it races with cpu hotunplug operation. Verify
1613  * against GCWQ_DISASSOCIATED.
1614  */
1615  if (!(gcwq->flags & GCWQ_DISASSOCIATED))
1616  set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
1617 
1618  spin_lock_irq(&gcwq->lock);
1619  if (gcwq->flags & GCWQ_DISASSOCIATED)
1620  return false;
1621  if (task_cpu(task) == gcwq->cpu &&
1622  cpumask_equal(&current->cpus_allowed,
1623  get_cpu_mask(gcwq->cpu)))
1624  return true;
1625  spin_unlock_irq(&gcwq->lock);
1626 
1627  /*
1628  * We've raced with CPU hot[un]plug. Give it a breather
1629  * and retry migration. cond_resched() is required here;
1630  * otherwise, we might deadlock against cpu_stop trying to
1631  * bring down the CPU on non-preemptive kernel.
1632  */
1633  cpu_relax();
1634  cond_resched();
1635  }
1636 }
1637 
1638 /*
1639  * Rebind an idle @worker to its CPU. worker_thread() will test
1640  * list_empty(@worker->entry) before leaving idle and call this function.
1641  */
1642 static void idle_worker_rebind(struct worker *worker)
1643 {
1644  struct global_cwq *gcwq = worker->pool->gcwq;
1645 
1646  /* CPU may go down again in between, clear UNBOUND only on success */
1647  if (worker_maybe_bind_and_lock(worker))
1648  worker_clr_flags(worker, WORKER_UNBOUND);
1649 
1650  /* rebind complete, become available again */
1651  list_add(&worker->entry, &worker->pool->idle_list);
1652  spin_unlock_irq(&gcwq->lock);
1653 }
1654 
1655 /*
1656  * Function for @worker->rebind.work used to rebind unbound busy workers to
1657  * the associated cpu which is coming back online. This is scheduled by
1658  * cpu up but can race with other cpu hotplug operations and may be
1659  * executed twice without intervening cpu down.
1660  */
1661 static void busy_worker_rebind_fn(struct work_struct *work)
1662 {
1663  struct worker *worker = container_of(work, struct worker, rebind_work);
1664  struct global_cwq *gcwq = worker->pool->gcwq;
1665 
1666  if (worker_maybe_bind_and_lock(worker))
1667  worker_clr_flags(worker, WORKER_UNBOUND);
1668 
1669  spin_unlock_irq(&gcwq->lock);
1670 }
1671 
1694 static void rebind_workers(struct global_cwq *gcwq)
1695 {
1696  struct worker_pool *pool;
1697  struct worker *worker, *n;
1698  struct hlist_node *pos;
1699  int i;
1700 
1701  lockdep_assert_held(&gcwq->lock);
1702 
1703  for_each_worker_pool(pool, gcwq)
1704  lockdep_assert_held(&pool->assoc_mutex);
1705 
1706  /* dequeue and kick idle ones */
1707  for_each_worker_pool(pool, gcwq) {
1708  list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1709  /*
1710  * idle workers should be off @pool->idle_list
1711  * until rebind is complete to avoid receiving
1712  * premature local wake-ups.
1713  */
1714  list_del_init(&worker->entry);
1715 
1716  /*
1717  * worker_thread() will see the above dequeuing
1718  * and call idle_worker_rebind().
1719  */
1720  wake_up_process(worker->task);
1721  }
1722  }
1723 
1724  /* rebind busy workers */
1725  for_each_busy_worker(worker, i, pos, gcwq) {
1726  struct work_struct *rebind_work = &worker->rebind_work;
1727  struct workqueue_struct *wq;
1728 
1729  if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1730  work_data_bits(rebind_work)))
1731  continue;
1732 
1733  debug_work_activate(rebind_work);
1734 
1735  /*
1736  * wq doesn't really matter but let's keep @worker->pool
1737  * and @cwq->pool consistent for sanity.
1738  */
1739  if (worker_pool_pri(worker->pool))
1740  wq = system_highpri_wq;
1741  else
1742  wq = system_wq;
1743 
1744  insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
1745  worker->scheduled.next,
1746  work_color_to_flags(WORK_NO_COLOR));
1747  }
1748 }
1749 
1750 static struct worker *alloc_worker(void)
1751 {
1752  struct worker *worker;
1753 
1754  worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1755  if (worker) {
1756  INIT_LIST_HEAD(&worker->entry);
1757  INIT_LIST_HEAD(&worker->scheduled);
1758  INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1759  /* on creation a worker is in !idle && prep state */
1760  worker->flags = WORKER_PREP;
1761  }
1762  return worker;
1763 }
1764 
1779 static struct worker *create_worker(struct worker_pool *pool)
1780 {
1781  struct global_cwq *gcwq = pool->gcwq;
1782  const char *pri = worker_pool_pri(pool) ? "H" : "";
1783  struct worker *worker = NULL;
1784  int id = -1;
1785 
1786  spin_lock_irq(&gcwq->lock);
1787  while (ida_get_new(&pool->worker_ida, &id)) {
1788  spin_unlock_irq(&gcwq->lock);
1789  if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1790  goto fail;
1791  spin_lock_irq(&gcwq->lock);
1792  }
1793  spin_unlock_irq(&gcwq->lock);
1794 
1795  worker = alloc_worker();
1796  if (!worker)
1797  goto fail;
1798 
1799  worker->pool = pool;
1800  worker->id = id;
1801 
1802  if (gcwq->cpu != WORK_CPU_UNBOUND)
1803  worker->task = kthread_create_on_node(worker_thread,
1804  worker, cpu_to_node(gcwq->cpu),
1805  "kworker/%u:%d%s", gcwq->cpu, id, pri);
1806  else
1807  worker->task = kthread_create(worker_thread, worker,
1808  "kworker/u:%d%s", id, pri);
1809  if (IS_ERR(worker->task))
1810  goto fail;
1811 
1812  if (worker_pool_pri(pool))
1813  set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1814 
1815  /*
1816  * Determine CPU binding of the new worker depending on
1817  * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
1818  * flag remains stable across this function. See the comments
1819  * above the flag definition for details.
1820  *
1821  * As an unbound worker may later become a regular one if CPU comes
1822  * online, make sure every worker has %PF_THREAD_BOUND set.
1823  */
1824  if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
1825  kthread_bind(worker->task, gcwq->cpu);
1826  } else {
1827  worker->task->flags |= PF_THREAD_BOUND;
1828  worker->flags |= WORKER_UNBOUND;
1829  }
1830 
1831  return worker;
1832 fail:
1833  if (id >= 0) {
1834  spin_lock_irq(&gcwq->lock);
1835  ida_remove(&pool->worker_ida, id);
1836  spin_unlock_irq(&gcwq->lock);
1837  }
1838  kfree(worker);
1839  return NULL;
1840 }
1841 
1851 static void start_worker(struct worker *worker)
1852 {
1853  worker->flags |= WORKER_STARTED;
1854  worker->pool->nr_workers++;
1855  worker_enter_idle(worker);
1856  wake_up_process(worker->task);
1857 }
1858 
1868 static void destroy_worker(struct worker *worker)
1869 {
1870  struct worker_pool *pool = worker->pool;
1871  struct global_cwq *gcwq = pool->gcwq;
1872  int id = worker->id;
1873 
1874  /* sanity check frenzy */
1875  BUG_ON(worker->current_work);
1876  BUG_ON(!list_empty(&worker->scheduled));
1877 
1878  if (worker->flags & WORKER_STARTED)
1879  pool->nr_workers--;
1880  if (worker->flags & WORKER_IDLE)
1881  pool->nr_idle--;
1882 
1883  list_del_init(&worker->entry);
1884  worker->flags |= WORKER_DIE;
1885 
1886  spin_unlock_irq(&gcwq->lock);
1887 
1888  kthread_stop(worker->task);
1889  kfree(worker);
1890 
1891  spin_lock_irq(&gcwq->lock);
1892  ida_remove(&pool->worker_ida, id);
1893 }
1894 
1895 static void idle_worker_timeout(unsigned long __pool)
1896 {
1897  struct worker_pool *pool = (void *)__pool;
1898  struct global_cwq *gcwq = pool->gcwq;
1899 
1900  spin_lock_irq(&gcwq->lock);
1901 
1902  if (too_many_workers(pool)) {
1903  struct worker *worker;
1904  unsigned long expires;
1905 
1906  /* idle_list is kept in LIFO order, check the last one */
1907  worker = list_entry(pool->idle_list.prev, struct worker, entry);
1908  expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1909 
1910  if (time_before(jiffies, expires))
1911  mod_timer(&pool->idle_timer, expires);
1912  else {
1913  /* it's been idle for too long, wake up manager */
1914  pool->flags |= POOL_MANAGE_WORKERS;
1915  wake_up_worker(pool);
1916  }
1917  }
1918 
1919  spin_unlock_irq(&gcwq->lock);
1920 }
1921 
1922 static bool send_mayday(struct work_struct *work)
1923 {
1924  struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1925  struct workqueue_struct *wq = cwq->wq;
1926  unsigned int cpu;
1927 
1928  if (!(wq->flags & WQ_RESCUER))
1929  return false;
1930 
1931  /* mayday mayday mayday */
1932  cpu = cwq->pool->gcwq->cpu;
1933  /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1934  if (cpu == WORK_CPU_UNBOUND)
1935  cpu = 0;
1936  if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1937  wake_up_process(wq->rescuer->task);
1938  return true;
1939 }
1940 
1941 static void gcwq_mayday_timeout(unsigned long __pool)
1942 {
1943  struct worker_pool *pool = (void *)__pool;
1944  struct global_cwq *gcwq = pool->gcwq;
1945  struct work_struct *work;
1946 
1947  spin_lock_irq(&gcwq->lock);
1948 
1949  if (need_to_create_worker(pool)) {
1950  /*
1951  * We've been trying to create a new worker but
1952  * haven't been successful. We might be hitting an
1953  * allocation deadlock. Send distress signals to
1954  * rescuers.
1955  */
1956  list_for_each_entry(work, &pool->worklist, entry)
1957  send_mayday(work);
1958  }
1959 
1960  spin_unlock_irq(&gcwq->lock);
1961 
1962  mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1963 }
1964 
1987 static bool maybe_create_worker(struct worker_pool *pool)
1988 __releases(&gcwq->lock)
1989 __acquires(&gcwq->lock)
1990 {
1991  struct global_cwq *gcwq = pool->gcwq;
1992 
1993  if (!need_to_create_worker(pool))
1994  return false;
1995 restart:
1996  spin_unlock_irq(&gcwq->lock);
1997 
1998  /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1999  mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
2000 
2001  while (true) {
2002  struct worker *worker;
2003 
2004  worker = create_worker(pool);
2005  if (worker) {
2006  del_timer_sync(&pool->mayday_timer);
2007  spin_lock_irq(&gcwq->lock);
2008  start_worker(worker);
2009  BUG_ON(need_to_create_worker(pool));
2010  return true;
2011  }
2012 
2013  if (!need_to_create_worker(pool))
2014  break;
2015 
2016  __set_current_state(TASK_INTERRUPTIBLE);
2017  schedule_timeout(CREATE_COOLDOWN);
2018 
2019  if (!need_to_create_worker(pool))
2020  break;
2021  }
2022 
2023  del_timer_sync(&pool->mayday_timer);
2024  spin_lock_irq(&gcwq->lock);
2025  if (need_to_create_worker(pool))
2026  goto restart;
2027  return true;
2028 }
2029 
2045 static bool maybe_destroy_workers(struct worker_pool *pool)
2046 {
2047  bool ret = false;
2048 
2049  while (too_many_workers(pool)) {
2050  struct worker *worker;
2051  unsigned long expires;
2052 
2053  worker = list_entry(pool->idle_list.prev, struct worker, entry);
2054  expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2055 
2056  if (time_before(jiffies, expires)) {
2057  mod_timer(&pool->idle_timer, expires);
2058  break;
2059  }
2060 
2061  destroy_worker(worker);
2062  ret = true;
2063  }
2064 
2065  return ret;
2066 }
2067 
2088 static bool manage_workers(struct worker *worker)
2089 {
2090  struct worker_pool *pool = worker->pool;
2091  bool ret = false;
2092 
2093  if (pool->flags & POOL_MANAGING_WORKERS)
2094  return ret;
2095 
2096  pool->flags |= POOL_MANAGING_WORKERS;
2097 
2098  /*
2099  * To simplify both worker management and CPU hotplug, hold off
2100  * management while hotplug is in progress. CPU hotplug path can't
2101  * grab %POOL_MANAGING_WORKERS to achieve this because that can
2102  * lead to idle worker depletion (all become busy thinking someone
2103  * else is managing) which in turn can result in deadlock under
2104  * extreme circumstances. Use @pool->assoc_mutex to synchronize
2105  * manager against CPU hotplug.
2106  *
2107  * assoc_mutex would always be free unless CPU hotplug is in
2108  * progress. trylock first without dropping @gcwq->lock.
2109  */
2110  if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2111  spin_unlock_irq(&pool->gcwq->lock);
2112  mutex_lock(&pool->assoc_mutex);
2113  /*
2114  * CPU hotplug could have happened while we were waiting
2115  * for assoc_mutex. Hotplug itself can't handle us
2116  * because manager isn't either on idle or busy list, and
2117  * @gcwq's state and ours could have deviated.
2118  *
2119  * As hotplug is now excluded via assoc_mutex, we can
2120  * simply try to bind. It will succeed or fail depending
2121  * on @gcwq's current state. Try it and adjust
2122  * %WORKER_UNBOUND accordingly.
2123  */
2124  if (worker_maybe_bind_and_lock(worker))
2125  worker->flags &= ~WORKER_UNBOUND;
2126  else
2127  worker->flags |= WORKER_UNBOUND;
2128 
2129  ret = true;
2130  }
2131 
2132  pool->flags &= ~POOL_MANAGE_WORKERS;
2133 
2134  /*
2135  * Destroy and then create so that may_start_working() is true
2136  * on return.
2137  */
2138  ret |= maybe_destroy_workers(pool);
2139  ret |= maybe_create_worker(pool);
2140 
2141  pool->flags &= ~POOL_MANAGING_WORKERS;
2142  mutex_unlock(&pool->assoc_mutex);
2143  return ret;
2144 }
2145 
2160 static void process_one_work(struct worker *worker, struct work_struct *work)
2161 __releases(&gcwq->lock)
2162 __acquires(&gcwq->lock)
2163 {
2164  struct cpu_workqueue_struct *cwq = get_work_cwq(work);
2165  struct worker_pool *pool = worker->pool;
2166  struct global_cwq *gcwq = pool->gcwq;
2167  struct hlist_head *bwh = busy_worker_head(gcwq, work);
2168  bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2169  work_func_t f = work->func;
2170  int work_color;
2171  struct worker *collision;
2172 #ifdef CONFIG_LOCKDEP
2173  /*
2174  * It is permissible to free the struct work_struct from
2175  * inside the function that is called from it, this we need to
2176  * take into account for lockdep too. To avoid bogus "held
2177  * lock freed" warnings as well as problems when looking into
2178  * work->lockdep_map, make a copy and use that here.
2179  */
2180  struct lockdep_map lockdep_map;
2181 
2182  lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2183 #endif
2184  /*
2185  * Ensure we're on the correct CPU. DISASSOCIATED test is
2186  * necessary to avoid spurious warnings from rescuers servicing the
2187  * unbound or a disassociated gcwq.
2188  */
2189  WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2190  !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2191  raw_smp_processor_id() != gcwq->cpu);
2192 
2193  /*
2194  * A single work shouldn't be executed concurrently by
2195  * multiple workers on a single cpu. Check whether anyone is
2196  * already processing the work. If so, defer the work to the
2197  * currently executing one.
2198  */
2199  collision = __find_worker_executing_work(gcwq, bwh, work);
2200  if (unlikely(collision)) {
2201  move_linked_works(work, &collision->scheduled, NULL);
2202  return;
2203  }
2204 
2205  /* claim and dequeue */
2206  debug_work_deactivate(work);
2207  hlist_add_head(&worker->hentry, bwh);
2208  worker->current_work = work;
2209  worker->current_cwq = cwq;
2210  work_color = get_work_color(work);
2211 
2212  list_del_init(&work->entry);
2213 
2214  /*
2215  * CPU intensive works don't participate in concurrency
2216  * management. They're the scheduler's responsibility.
2217  */
2218  if (unlikely(cpu_intensive))
2219  worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2220 
2221  /*
2222  * Unbound gcwq isn't concurrency managed and work items should be
2223  * executed ASAP. Wake up another worker if necessary.
2224  */
2225  if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2226  wake_up_worker(pool);
2227 
2228  /*
2229  * Record the last CPU and clear PENDING which should be the last
2230  * update to @work. Also, do this inside @gcwq->lock so that
2231  * PENDING and queued state changes happen together while IRQ is
2232  * disabled.
2233  */
2234  set_work_cpu_and_clear_pending(work, gcwq->cpu);
2235 
2236  spin_unlock_irq(&gcwq->lock);
2237 
2238  lock_map_acquire_read(&cwq->wq->lockdep_map);
2239  lock_map_acquire(&lockdep_map);
2240  trace_workqueue_execute_start(work);
2241  f(work);
2242  /*
2243  * While we must be careful to not use "work" after this, the trace
2244  * point will only record its address.
2245  */
2246  trace_workqueue_execute_end(work);
2247  lock_map_release(&lockdep_map);
2248  lock_map_release(&cwq->wq->lockdep_map);
2249 
2250  if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2251  pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2252  " last function: %pf\n",
2253  current->comm, preempt_count(), task_pid_nr(current), f);
2254  debug_show_held_locks(current);
2255  dump_stack();
2256  }
2257 
2258  spin_lock_irq(&gcwq->lock);
2259 
2260  /* clear cpu intensive status */
2261  if (unlikely(cpu_intensive))
2262  worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2263 
2264  /* we're done with it, release */
2265  hlist_del_init(&worker->hentry);
2266  worker->current_work = NULL;
2267  worker->current_cwq = NULL;
2268  cwq_dec_nr_in_flight(cwq, work_color);
2269 }
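/*
 * Editorial example (not part of workqueue.c): a minimal sketch of the user
 * side of the path above. process_one_work() ends up calling the work
 * function in process context, so it may sleep and can recover its container
 * with container_of(). All names (my_device, my_work_fn, my_device_init)
 * are hypothetical.
 */
#include <linux/workqueue.h>

struct my_device {
	struct work_struct refresh_work;
	int pending_events;
};

static void my_work_fn(struct work_struct *work)
{
	struct my_device *dev = container_of(work, struct my_device,
					     refresh_work);

	/* runs in a worker's process context; sleeping is allowed */
	dev->pending_events = 0;
}

static void my_device_init(struct my_device *dev)
{
	INIT_WORK(&dev->refresh_work, my_work_fn);
	schedule_work(&dev->refresh_work);	/* queued on system_wq */
}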
2270 
2283 static void process_scheduled_works(struct worker *worker)
2284 {
2285  while (!list_empty(&worker->scheduled)) {
2286  struct work_struct *work = list_first_entry(&worker->scheduled,
2287  struct work_struct, entry);
2288  process_one_work(worker, work);
2289  }
2290 }
2291 
2302 static int worker_thread(void *__worker)
2303 {
2304  struct worker *worker = __worker;
2305  struct worker_pool *pool = worker->pool;
2306  struct global_cwq *gcwq = pool->gcwq;
2307 
2308  /* tell the scheduler that this is a workqueue worker */
2309  worker->task->flags |= PF_WQ_WORKER;
2310 woke_up:
2311  spin_lock_irq(&gcwq->lock);
2312 
2313  /* we are off idle list if destruction or rebind is requested */
2314  if (unlikely(list_empty(&worker->entry))) {
2315  spin_unlock_irq(&gcwq->lock);
2316 
2317  /* if DIE is set, destruction is requested */
2318  if (worker->flags & WORKER_DIE) {
2319  worker->task->flags &= ~PF_WQ_WORKER;
2320  return 0;
2321  }
2322 
2323  /* otherwise, rebind */
2324  idle_worker_rebind(worker);
2325  goto woke_up;
2326  }
2327 
2328  worker_leave_idle(worker);
2329 recheck:
2330  /* no more worker necessary? */
2331  if (!need_more_worker(pool))
2332  goto sleep;
2333 
2334  /* do we need to manage? */
2335  if (unlikely(!may_start_working(pool)) && manage_workers(worker))
2336  goto recheck;
2337 
2338  /*
2339  * ->scheduled list can only be filled while a worker is
2340  * preparing to process a work or actually processing it.
2341  * Make sure nobody diddled with it while I was sleeping.
2342  */
2343  BUG_ON(!list_empty(&worker->scheduled));
2344 
2345  /*
2346  * When control reaches this point, we're guaranteed to have
2347  * at least one idle worker or that someone else has already
2348  * assumed the manager role.
2349  */
2350  worker_clr_flags(worker, WORKER_PREP);
2351 
2352  do {
2353  struct work_struct *work =
2354  list_first_entry(&pool->worklist,
2355  struct work_struct, entry);
2356 
2357  if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
2358  /* optimization path, not strictly necessary */
2359  process_one_work(worker, work);
2360  if (unlikely(!list_empty(&worker->scheduled)))
2361  process_scheduled_works(worker);
2362  } else {
2363  move_linked_works(work, &worker->scheduled, NULL);
2364  process_scheduled_works(worker);
2365  }
2366  } while (keep_working(pool));
2367 
2368  worker_set_flags(worker, WORKER_PREP, false);
2369 sleep:
2370  if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2371  goto recheck;
2372 
2373  /*
2374  * gcwq->lock is held and there's no work to process and no
2375  * need to manage, sleep. Workers are woken up only while
2376  * holding gcwq->lock or from local cpu, so setting the
2377  * current state before releasing gcwq->lock is enough to
2378  * prevent losing any event.
2379  */
2380  worker_enter_idle(worker);
2381  __set_current_state(TASK_INTERRUPTIBLE);
2382  spin_unlock_irq(&gcwq->lock);
2383  schedule();
2384  goto woke_up;
2385 }
2386 
2406 static int rescuer_thread(void *__wq)
2407 {
2408  struct workqueue_struct *wq = __wq;
2409  struct worker *rescuer = wq->rescuer;
2410  struct list_head *scheduled = &rescuer->scheduled;
2411  bool is_unbound = wq->flags & WQ_UNBOUND;
2412  unsigned int cpu;
2413 
2414  set_user_nice(current, RESCUER_NICE_LEVEL);
2415 repeat:
2416  set_current_state(TASK_INTERRUPTIBLE);
2417 
2418  if (kthread_should_stop()) {
2419  __set_current_state(TASK_RUNNING);
2420  return 0;
2421  }
2422 
2423  /*
2424  * See whether any cpu is asking for help. Unbound
2425  * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
2426  */
2427  for_each_mayday_cpu(cpu, wq->mayday_mask) {
2428  unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2429  struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2430  struct worker_pool *pool = cwq->pool;
2431  struct global_cwq *gcwq = pool->gcwq;
2432  struct work_struct *work, *n;
2433 
2434  __set_current_state(TASK_RUNNING);
2435  mayday_clear_cpu(cpu, wq->mayday_mask);
2436 
2437  /* migrate to the target cpu if possible */
2438  rescuer->pool = pool;
2439  worker_maybe_bind_and_lock(rescuer);
2440 
2441  /*
2442  * Slurp in all works issued via this workqueue and
2443  * process'em.
2444  */
2445  BUG_ON(!list_empty(&rescuer->scheduled));
2446  list_for_each_entry_safe(work, n, &pool->worklist, entry)
2447  if (get_work_cwq(work) == cwq)
2448  move_linked_works(work, scheduled, &n);
2449 
2450  process_scheduled_works(rescuer);
2451 
2452  /*
2453  * Leave this gcwq. If keep_working() is %true, notify a
2454  * regular worker; otherwise, we end up with 0 concurrency
2455  * and stalling the execution.
2456  */
2457  if (keep_working(pool))
2458  wake_up_worker(pool);
2459 
2460  spin_unlock_irq(&gcwq->lock);
2461  }
2462 
2463  schedule();
2464  goto repeat;
2465 }
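/*
 * Editorial example (not part of workqueue.c): a rescuer thread is only
 * created for workqueues allocated with WQ_MEM_RECLAIM (which implies
 * WQ_RESCUER here), guaranteeing forward progress when new workers cannot
 * be created under memory pressure. "my_writeback_wq" is a hypothetical name.
 */
#include <linux/workqueue.h>
#include <linux/init.h>
#include <linux/errno.h>

static struct workqueue_struct *my_writeback_wq;

static int __init my_writeback_init(void)
{
	/* used on the memory-reclaim path, so it gets a rescuer thread */
	my_writeback_wq = alloc_workqueue("my_writeback", WQ_MEM_RECLAIM, 1);
	return my_writeback_wq ? 0 : -ENOMEM;
}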
2466 
2467 struct wq_barrier {
2468  struct work_struct work;
2469  struct completion done;
2470 };
2471 
2472 static void wq_barrier_func(struct work_struct *work)
2473 {
2474  struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2475  complete(&barr->done);
2476 }
2477 
2502 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2503  struct wq_barrier *barr,
2504  struct work_struct *target, struct worker *worker)
2505 {
2506  struct list_head *head;
2507  unsigned int linked = 0;
2508 
2509  /*
2510  * debugobject calls are safe here even with gcwq->lock locked
2511  * as we know for sure that this will not trigger any of the
2512  * checks and call back into the fixup functions where we
2513  * might deadlock.
2514  */
2515  INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2516  __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2517  init_completion(&barr->done);
2518 
2519  /*
2520  * If @target is currently being executed, schedule the
2521  * barrier to the worker; otherwise, put it after @target.
2522  */
2523  if (worker)
2524  head = worker->scheduled.next;
2525  else {
2526  unsigned long *bits = work_data_bits(target);
2527 
2528  head = target->entry.next;
2529  /* there can already be other linked works, inherit and set */
2530  linked = *bits & WORK_STRUCT_LINKED;
2531  __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2532  }
2533 
2534  debug_work_activate(&barr->work);
2535  insert_work(cwq, &barr->work, head,
2536  work_color_to_flags(WORK_NO_COLOR) | linked);
2537 }
2538 
2570 static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2571  int flush_color, int work_color)
2572 {
2573  bool wait = false;
2574  unsigned int cpu;
2575 
2576  if (flush_color >= 0) {
2577  BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
2578  atomic_set(&wq->nr_cwqs_to_flush, 1);
2579  }
2580 
2581  for_each_cwq_cpu(cpu, wq) {
2582  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2583  struct global_cwq *gcwq = cwq->pool->gcwq;
2584 
2585  spin_lock_irq(&gcwq->lock);
2586 
2587  if (flush_color >= 0) {
2588  BUG_ON(cwq->flush_color != -1);
2589 
2590  if (cwq->nr_in_flight[flush_color]) {
2591  cwq->flush_color = flush_color;
2592  atomic_inc(&wq->nr_cwqs_to_flush);
2593  wait = true;
2594  }
2595  }
2596 
2597  if (work_color >= 0) {
2598  BUG_ON(work_color != work_next_color(cwq->work_color));
2599  cwq->work_color = work_color;
2600  }
2601 
2602  spin_unlock_irq(&gcwq->lock);
2603  }
2604 
2605  if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2606  complete(&wq->first_flusher->done);
2607 
2608  return wait;
2609 }
2610 
2621 void flush_workqueue(struct workqueue_struct *wq)
2622 {
2623  struct wq_flusher this_flusher = {
2624  .list = LIST_HEAD_INIT(this_flusher.list),
2625  .flush_color = -1,
2626  .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2627  };
2628  int next_color;
2629 
2630  lock_map_acquire(&wq->lockdep_map);
2631  lock_map_release(&wq->lockdep_map);
2632 
2633  mutex_lock(&wq->flush_mutex);
2634 
2635  /*
2636  * Start-to-wait phase
2637  */
2638  next_color = work_next_color(wq->work_color);
2639 
2640  if (next_color != wq->flush_color) {
2641  /*
2642  * Color space is not full. The current work_color
2643  * becomes our flush_color and work_color is advanced
2644  * by one.
2645  */
2646  BUG_ON(!list_empty(&wq->flusher_overflow));
2647  this_flusher.flush_color = wq->work_color;
2648  wq->work_color = next_color;
2649 
2650  if (!wq->first_flusher) {
2651  /* no flush in progress, become the first flusher */
2652  BUG_ON(wq->flush_color != this_flusher.flush_color);
2653 
2654  wq->first_flusher = &this_flusher;
2655 
2656  if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2657  wq->work_color)) {
2658  /* nothing to flush, done */
2659  wq->flush_color = next_color;
2660  wq->first_flusher = NULL;
2661  goto out_unlock;
2662  }
2663  } else {
2664  /* wait in queue */
2665  BUG_ON(wq->flush_color == this_flusher.flush_color);
2666  list_add_tail(&this_flusher.list, &wq->flusher_queue);
2667  flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2668  }
2669  } else {
2670  /*
2671  * Oops, color space is full, wait on overflow queue.
2672  * The next flush completion will assign us
2673  * flush_color and transfer to flusher_queue.
2674  */
2675  list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2676  }
2677 
2678  mutex_unlock(&wq->flush_mutex);
2679 
2680  wait_for_completion(&this_flusher.done);
2681 
2682  /*
2683  * Wake-up-and-cascade phase
2684  *
2685  * First flushers are responsible for cascading flushes and
2686  * handling overflow. Non-first flushers can simply return.
2687  */
2688  if (wq->first_flusher != &this_flusher)
2689  return;
2690 
2691  mutex_lock(&wq->flush_mutex);
2692 
2693  /* we might have raced, check again with mutex held */
2694  if (wq->first_flusher != &this_flusher)
2695  goto out_unlock;
2696 
2697  wq->first_flusher = NULL;
2698 
2699  BUG_ON(!list_empty(&this_flusher.list));
2700  BUG_ON(wq->flush_color != this_flusher.flush_color);
2701 
2702  while (true) {
2703  struct wq_flusher *next, *tmp;
2704 
2705  /* complete all the flushers sharing the current flush color */
2706  list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2707  if (next->flush_color != wq->flush_color)
2708  break;
2709  list_del_init(&next->list);
2710  complete(&next->done);
2711  }
2712 
2713  BUG_ON(!list_empty(&wq->flusher_overflow) &&
2714  wq->flush_color != work_next_color(wq->work_color));
2715 
2716  /* this flush_color is finished, advance by one */
2717  wq->flush_color = work_next_color(wq->flush_color);
2718 
2719  /* one color has been freed, handle overflow queue */
2720  if (!list_empty(&wq->flusher_overflow)) {
2721  /*
2722  * Assign the same color to all overflowed
2723  * flushers, advance work_color and append to
2724  * flusher_queue. This is the start-to-wait
2725  * phase for these overflowed flushers.
2726  */
2727  list_for_each_entry(tmp, &wq->flusher_overflow, list)
2728  tmp->flush_color = wq->work_color;
2729 
2730  wq->work_color = work_next_color(wq->work_color);
2731 
2732  list_splice_tail_init(&wq->flusher_overflow,
2733  &wq->flusher_queue);
2734  flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2735  }
2736 
2737  if (list_empty(&wq->flusher_queue)) {
2738  BUG_ON(wq->flush_color != wq->work_color);
2739  break;
2740  }
2741 
2742  /*
2743  * Need to flush more colors. Make the next flusher
2744  * the new first flusher and arm cwqs.
2745  */
2746  BUG_ON(wq->flush_color == wq->work_color);
2747  BUG_ON(wq->flush_color != next->flush_color);
2748 
2749  list_del_init(&next->list);
2750  wq->first_flusher = next;
2751 
2752  if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2753  break;
2754 
2755  /*
2756  * Meh... this color is already done, clear first
2757  * flusher and repeat cascading.
2758  */
2759  wq->first_flusher = NULL;
2760  }
2761 
2762 out_unlock:
2763  mutex_unlock(&wq->flush_mutex);
2764 }
2766 
2778 void drain_workqueue(struct workqueue_struct *wq)
2779 {
2780  unsigned int flush_cnt = 0;
2781  unsigned int cpu;
2782 
2783  /*
2784  * __queue_work() needs to test whether there are drainers, is much
2785  * hotter than drain_workqueue() and already looks at @wq->flags.
2786  * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2787  */
2788  spin_lock(&workqueue_lock);
2789  if (!wq->nr_drainers++)
2790  wq->flags |= WQ_DRAINING;
2791  spin_unlock(&workqueue_lock);
2792 reflush:
2793  flush_workqueue(wq);
2794 
2795  for_each_cwq_cpu(cpu, wq) {
2796  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2797  bool drained;
2798 
2799  spin_lock_irq(&cwq->pool->gcwq->lock);
2800  drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2801  spin_unlock_irq(&cwq->pool->gcwq->lock);
2802 
2803  if (drained)
2804  continue;
2805 
2806  if (++flush_cnt == 10 ||
2807  (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2808  pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
2809  wq->name, flush_cnt);
2810  goto reflush;
2811  }
2812 
2813  spin_lock(&workqueue_lock);
2814  if (!--wq->nr_drainers)
2815  wq->flags &= ~WQ_DRAINING;
2816  spin_unlock(&workqueue_lock);
2817 }
2819 
2820 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2821 {
2822  struct worker *worker = NULL;
2823  struct global_cwq *gcwq;
2824  struct cpu_workqueue_struct *cwq;
2825 
2826  might_sleep();
2827  gcwq = get_work_gcwq(work);
2828  if (!gcwq)
2829  return false;
2830 
2831  spin_lock_irq(&gcwq->lock);
2832  if (!list_empty(&work->entry)) {
2833  /*
2834  * See the comment near try_to_grab_pending()->smp_rmb().
2835  * If it was re-queued to a different gcwq under us, we
2836  * are not going to wait.
2837  */
2838  smp_rmb();
2839  cwq = get_work_cwq(work);
2840  if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2841  goto already_gone;
2842  } else {
2843  worker = find_worker_executing_work(gcwq, work);
2844  if (!worker)
2845  goto already_gone;
2846  cwq = worker->current_cwq;
2847  }
2848 
2849  insert_wq_barrier(cwq, barr, work, worker);
2850  spin_unlock_irq(&gcwq->lock);
2851 
2852  /*
2853  * If @max_active is 1 or rescuer is in use, flushing another work
2854  * item on the same workqueue may lead to deadlock. Make sure the
2855  * flusher is not running on the same workqueue by verifying write
2856  * access.
2857  */
2858  if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2859  lock_map_acquire(&cwq->wq->lockdep_map);
2860  else
2861  lock_map_acquire_read(&cwq->wq->lockdep_map);
2862  lock_map_release(&cwq->wq->lockdep_map);
2863 
2864  return true;
2865 already_gone:
2866  spin_unlock_irq(&gcwq->lock);
2867  return false;
2868 }
2869 
2881 bool flush_work(struct work_struct *work)
2882 {
2883  struct wq_barrier barr;
2884 
2885  lock_map_acquire(&work->lockdep_map);
2886  lock_map_release(&work->lockdep_map);
2887 
2888  if (start_flush_work(work, &barr)) {
2889  wait_for_completion(&barr.done);
2890  destroy_work_on_stack(&barr.work);
2891  return true;
2892  } else {
2893  return false;
2894  }
2895 }
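/*
 * Editorial example (not part of workqueue.c): flush_work() waits for one
 * specific work item, via the wq_barrier inserted by start_flush_work()
 * above, rather than flushing the whole workqueue. Names are hypothetical.
 */
#include <linux/workqueue.h>

static void my_refresh_fn(struct work_struct *work)
{
	/* do the deferred update; may sleep */
}
static DECLARE_WORK(my_refresh_work, my_refresh_fn);

static void my_sync_refresh(void)
{
	schedule_work(&my_refresh_work);
	/* returns true if it waited, false if the work was already idle */
	flush_work(&my_refresh_work);
}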
2897 
2898 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2899 {
2900  unsigned long flags;
2901  int ret;
2902 
2903  do {
2904  ret = try_to_grab_pending(work, is_dwork, &flags);
2905  /*
2906  * If someone else is canceling, wait for the same event it
2907  * would be waiting for before retrying.
2908  */
2909  if (unlikely(ret == -ENOENT))
2910  flush_work(work);
2911  } while (unlikely(ret < 0));
2912 
2913  /* tell other tasks trying to grab @work to back off */
2914  mark_work_canceling(work);
2915  local_irq_restore(flags);
2916 
2917  flush_work(work);
2918  clear_work_data(work);
2919  return ret;
2920 }
2921 
2940 bool cancel_work_sync(struct work_struct *work)
2941 {
2942  return __cancel_work_timer(work, false);
2943 }
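/*
 * Editorial example (not part of workqueue.c): typical teardown order --
 * stop queueing new work first (driver specific), then cancel_work_sync()
 * guarantees the item is neither pending nor running when it returns.
 * Names are hypothetical.
 */
#include <linux/workqueue.h>

static void my_stats_fn(struct work_struct *work)
{
	/* periodic bookkeeping; may sleep */
}
static DECLARE_WORK(my_stats_work, my_stats_fn);

static void my_module_teardown(void)
{
	/* after this returns, my_stats_work is neither pending nor running */
	cancel_work_sync(&my_stats_work);
}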
2945 
2958 bool flush_delayed_work(struct delayed_work *dwork)
2959 {
2960  local_irq_disable();
2961  if (del_timer_sync(&dwork->timer))
2962  __queue_work(dwork->cpu,
2963  get_work_cwq(&dwork->work)->wq, &dwork->work);
2964  local_irq_enable();
2965  return flush_work(&dwork->work);
2966 }
2968 
2981 bool cancel_delayed_work(struct delayed_work *dwork)
2982 {
2983  unsigned long flags;
2984  int ret;
2985 
2986  do {
2987  ret = try_to_grab_pending(&dwork->work, true, &flags);
2988  } while (unlikely(ret == -EAGAIN));
2989 
2990  if (unlikely(ret < 0))
2991  return false;
2992 
2993  set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
2994  local_irq_restore(flags);
2995  return ret;
2996 }
2998 
3008 bool cancel_delayed_work_sync(struct delayed_work *dwork)
3009 {
3010  return __cancel_work_timer(&dwork->work, true);
3011 }
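/*
 * Editorial example (not part of workqueue.c): a self-rearming delayed work
 * item and its shutdown. cancel_delayed_work_sync() also kills the timer,
 * so a work item that re-arms itself is still stopped reliably. Names are
 * hypothetical.
 */
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* poll the hardware, then re-arm one second from now */
	schedule_delayed_work(&my_poll_work, HZ);
}

static void my_poll_start(void)
{
	schedule_delayed_work(&my_poll_work, msecs_to_jiffies(100));
}

static void my_poll_stop(void)
{
	cancel_delayed_work_sync(&my_poll_work);
}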
3013 
3021 bool schedule_work_on(int cpu, struct work_struct *work)
3022 {
3023  return queue_work_on(cpu, system_wq, work);
3024 }
3026 
3038 bool schedule_work(struct work_struct *work)
3039 {
3040  return queue_work(system_wq, work);
3041 }
3043 
3053 bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
3054  unsigned long delay)
3055 {
3056  return queue_delayed_work_on(cpu, system_wq, dwork, delay);
3057 }
3059 
3068 bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
3069 {
3070  return queue_delayed_work(system_wq, dwork, delay);
3071 }
3073 
3085 int schedule_on_each_cpu(work_func_t func)
3086 {
3087  int cpu;
3088  struct work_struct __percpu *works;
3089 
3090  works = alloc_percpu(struct work_struct);
3091  if (!works)
3092  return -ENOMEM;
3093 
3094  get_online_cpus();
3095 
3096  for_each_online_cpu(cpu) {
3097  struct work_struct *work = per_cpu_ptr(works, cpu);
3098 
3099  INIT_WORK(work, func);
3100  schedule_work_on(cpu, work);
3101  }
3102 
3103  for_each_online_cpu(cpu)
3104  flush_work(per_cpu_ptr(works, cpu));
3105 
3106  put_online_cpus();
3107  free_percpu(works);
3108  return 0;
3109 }
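/*
 * Editorial example (not part of workqueue.c): schedule_on_each_cpu() runs
 * the function once on every online CPU and waits for all of them; it may
 * sleep and can fail with -ENOMEM. Names are hypothetical.
 */
#include <linux/workqueue.h>

static void my_flush_percpu_caches(struct work_struct *unused)
{
	/* runs once on each online CPU, in process context */
}

static int my_flush_all_cpus(void)
{
	return schedule_on_each_cpu(my_flush_percpu_caches);
}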
3110 
3135 void flush_scheduled_work(void)
3136 {
3137  flush_workqueue(system_wq);
3138 }
3140 
3153 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3154 {
3155  if (!in_interrupt()) {
3156  fn(&ew->work);
3157  return 0;
3158  }
3159 
3160  INIT_WORK(&ew->work, fn);
3161  schedule_work(&ew->work);
3162 
3163  return 1;
3164 }
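/*
 * Editorial example (not part of workqueue.c): execute_in_process_context()
 * runs @fn immediately when the caller is not in interrupt context and
 * defers it through the caller-provided execute_work otherwise; it returns
 * 0 when run inline and 1 when deferred. Names are hypothetical.
 */
#include <linux/workqueue.h>

static struct execute_work my_release_ew;

static void my_release_fn(struct work_struct *work)
{
	/* may sleep; either called directly or from a worker */
}

static void my_release(void)
{
	execute_in_process_context(my_release_fn, &my_release_ew);
}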
3166 
3167 int keventd_up(void)
3168 {
3169  return system_wq != NULL;
3170 }
3171 
3172 static int alloc_cwqs(struct workqueue_struct *wq)
3173 {
3174  /*
3175  * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3176  * Make sure that the alignment isn't lower than that of
3177  * unsigned long long.
3178  */
3179  const size_t size = sizeof(struct cpu_workqueue_struct);
3180  const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3181  __alignof__(unsigned long long));
3182 
3183  if (!(wq->flags & WQ_UNBOUND))
3184  wq->cpu_wq.pcpu = __alloc_percpu(size, align);
3185  else {
3186  void *ptr;
3187 
3188  /*
3189  * Allocate enough room to align cwq and put an extra
3190  * pointer at the end pointing back to the originally
3191  * allocated pointer which will be used for free.
3192  */
3193  ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3194  if (ptr) {
3195  wq->cpu_wq.single = PTR_ALIGN(ptr, align);
3196  *(void **)(wq->cpu_wq.single + 1) = ptr;
3197  }
3198  }
3199 
3200  /* just in case, make sure it's actually aligned */
3201  BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
3202  return wq->cpu_wq.v ? 0 : -ENOMEM;
3203 }
3204 
3205 static void free_cwqs(struct workqueue_struct *wq)
3206 {
3207  if (!(wq->flags & WQ_UNBOUND))
3208  free_percpu(wq->cpu_wq.pcpu);
3209  else if (wq->cpu_wq.single) {
3210  /* the pointer to free is stored right after the cwq */
3211  kfree(*(void **)(wq->cpu_wq.single + 1));
3212  }
3213 }
3214 
3215 static int wq_clamp_max_active(int max_active, unsigned int flags,
3216  const char *name)
3217 {
3218  int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
3219 
3220  if (max_active < 1 || max_active > lim)
3221  pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
3222  max_active, name, 1, lim);
3223 
3224  return clamp_val(max_active, 1, lim);
3225 }
3226 
3227 struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3228  unsigned int flags,
3229  int max_active,
3230  struct lock_class_key *key,
3231  const char *lock_name, ...)
3232 {
3233  va_list args, args1;
3234  struct workqueue_struct *wq;
3235  unsigned int cpu;
3236  size_t namelen;
3237 
3238  /* determine namelen, allocate wq and format name */
3239  va_start(args, lock_name);
3240  va_copy(args1, args);
3241  namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3242 
3243  wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
3244  if (!wq)
3245  goto err;
3246 
3247  vsnprintf(wq->name, namelen, fmt, args1);
3248  va_end(args);
3249  va_end(args1);
3250 
3251  /*
3252  * Workqueues which may be used during memory reclaim should
3253  * have a rescuer to guarantee forward progress.
3254  */
3255  if (flags & WQ_MEM_RECLAIM)
3256  flags |= WQ_RESCUER;
3257 
3258  max_active = max_active ?: WQ_DFL_ACTIVE;
3259  max_active = wq_clamp_max_active(max_active, flags, wq->name);
3260 
3261  /* init wq */
3262  wq->flags = flags;
3263  wq->saved_max_active = max_active;
3264  mutex_init(&wq->flush_mutex);
3265  atomic_set(&wq->nr_cwqs_to_flush, 0);
3266  INIT_LIST_HEAD(&wq->flusher_queue);
3267  INIT_LIST_HEAD(&wq->flusher_overflow);
3268 
3269  lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3270  INIT_LIST_HEAD(&wq->list);
3271 
3272  if (alloc_cwqs(wq) < 0)
3273  goto err;
3274 
3275  for_each_cwq_cpu(cpu, wq) {
3276  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3277  struct global_cwq *gcwq = get_gcwq(cpu);
3278  int pool_idx = (bool)(flags & WQ_HIGHPRI);
3279 
3280  BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3281  cwq->pool = &gcwq->pools[pool_idx];
3282  cwq->wq = wq;
3283  cwq->flush_color = -1;
3284  cwq->max_active = max_active;
3285  INIT_LIST_HEAD(&cwq->delayed_works);
3286  }
3287 
3288  if (flags & WQ_RESCUER) {
3289  struct worker *rescuer;
3290 
3291  if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
3292  goto err;
3293 
3294  wq->rescuer = rescuer = alloc_worker();
3295  if (!rescuer)
3296  goto err;
3297 
3298  rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3299  wq->name);
3300  if (IS_ERR(rescuer->task))
3301  goto err;
3302 
3303  rescuer->task->flags |= PF_THREAD_BOUND;
3304  wake_up_process(rescuer->task);
3305  }
3306 
3307  /*
3308  * workqueue_lock protects global freeze state and workqueues
3309  * list. Grab it, set max_active accordingly and add the new
3310  * workqueue to workqueues list.
3311  */
3312  spin_lock(&workqueue_lock);
3313 
3314  if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3315  for_each_cwq_cpu(cpu, wq)
3316  get_cwq(cpu, wq)->max_active = 0;
3317 
3318  list_add(&wq->list, &workqueues);
3319 
3320  spin_unlock(&workqueue_lock);
3321 
3322  return wq;
3323 err:
3324  if (wq) {
3325  free_cwqs(wq);
3326  free_mayday_mask(wq->mayday_mask);
3327  kfree(wq->rescuer);
3328  kfree(wq);
3329  }
3330  return NULL;
3331 }
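/*
 * Editorial example (not part of workqueue.c): alloc_workqueue() and
 * alloc_ordered_workqueue() are the usual front ends to
 * __alloc_workqueue_key(); the flags map onto the setup above (WQ_HIGHPRI
 * selects the highpri pool, WQ_UNBOUND uses a single cwq instead of per-cpu
 * ones, max_active feeds cwq->max_active). Names are hypothetical.
 */
#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *my_wq;
static struct workqueue_struct *my_ordered_wq;

static int my_init_queues(void)
{
	/* per-cpu, freezable, at most 4 in-flight work items per CPU */
	my_wq = alloc_workqueue("my_events", WQ_FREEZABLE, 4);

	/* unbound, strictly one work item at a time, in queueing order */
	my_ordered_wq = alloc_ordered_workqueue("my_ordered", 0);

	if (!my_wq || !my_ordered_wq)
		return -ENOMEM;
	return 0;
}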
3333 
3340 void destroy_workqueue(struct workqueue_struct *wq)
3341 {
3342  unsigned int cpu;
3343 
3344  /* drain it before proceeding with destruction */
3345  drain_workqueue(wq);
3346 
3347  /*
3348  * wq list is used to freeze wq, remove from list after
3349  * flushing is complete in case freeze races us.
3350  */
3351  spin_lock(&workqueue_lock);
3352  list_del(&wq->list);
3353  spin_unlock(&workqueue_lock);
3354 
3355  /* sanity check */
3356  for_each_cwq_cpu(cpu, wq) {
3357  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3358  int i;
3359 
3360  for (i = 0; i < WORK_NR_COLORS; i++)
3361  BUG_ON(cwq->nr_in_flight[i]);
3362  BUG_ON(cwq->nr_active);
3363  BUG_ON(!list_empty(&cwq->delayed_works));
3364  }
3365 
3366  if (wq->flags & WQ_RESCUER) {
3367  kthread_stop(wq->rescuer->task);
3368  free_mayday_mask(wq->mayday_mask);
3369  kfree(wq->rescuer);
3370  }
3371 
3372  free_cwqs(wq);
3373  kfree(wq);
3374 }
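/*
 * Editorial example (not part of workqueue.c): destroy_workqueue() drains
 * the queue itself, so callers only need to stop queueing new work before
 * calling it. This pairs with the hypothetical my_init_queues() sketch above.
 */
static void my_exit_queues(void)
{
	destroy_workqueue(my_ordered_wq);
	destroy_workqueue(my_wq);
}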
3376 
3388 static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
3389 {
3390  cwq->max_active = max_active;
3391 
3392  while (!list_empty(&cwq->delayed_works) &&
3393  cwq->nr_active < cwq->max_active)
3394  cwq_activate_first_delayed(cwq);
3395 }
3396 
3407 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3408 {
3409  unsigned int cpu;
3410 
3411  max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3412 
3413  spin_lock(&workqueue_lock);
3414 
3415  wq->saved_max_active = max_active;
3416 
3417  for_each_cwq_cpu(cpu, wq) {
3418  struct global_cwq *gcwq = get_gcwq(cpu);
3419 
3420  spin_lock_irq(&gcwq->lock);
3421 
3422  if (!(wq->flags & WQ_FREEZABLE) ||
3423  !(gcwq->flags & GCWQ_FREEZING))
3424  cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
3425 
3426  spin_unlock_irq(&gcwq->lock);
3427  }
3428 
3429  spin_unlock(&workqueue_lock);
3430 }
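/*
 * Editorial example (not part of workqueue.c): max_active can be retuned at
 * runtime; the new value is clamped by wq_clamp_max_active() and applied to
 * each cwq as above. Names are hypothetical.
 */
#include <linux/workqueue.h>

static void my_throttle(struct workqueue_struct *wq, bool busy_system)
{
	workqueue_set_max_active(wq, busy_system ? 1 : 16);
}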
3432 
3445 bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3446 {
3447  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3448 
3449  return !list_empty(&cwq->delayed_works);
3450 }
3452 
3460 unsigned int work_cpu(struct work_struct *work)
3461 {
3462  struct global_cwq *gcwq = get_work_gcwq(work);
3463 
3464  return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3465 }
3467 
3481 unsigned int work_busy(struct work_struct *work)
3482 {
3483  struct global_cwq *gcwq = get_work_gcwq(work);
3484  unsigned long flags;
3485  unsigned int ret = 0;
3486 
3487  if (!gcwq)
3488  return false;
3489 
3490  spin_lock_irqsave(&gcwq->lock, flags);
3491 
3492  if (work_pending(work))
3493  ret |= WORK_BUSY_PENDING;
3494  if (find_worker_executing_work(gcwq, work))
3495  ret |= WORK_BUSY_RUNNING;
3496 
3497  spin_unlock_irqrestore(&gcwq->lock, flags);
3498 
3499  return ret;
3500 }
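/*
 * Editorial example (not part of workqueue.c): work_busy() is a racy
 * snapshot, suitable for statistics or debug output only. Names are
 * hypothetical.
 */
#include <linux/workqueue.h>
#include <linux/printk.h>

static void my_report_work(struct work_struct *work)
{
	unsigned int state = work_busy(work);

	pr_info("work %p:%s%s\n", work,
		(state & WORK_BUSY_PENDING) ? " PENDING" : "",
		(state & WORK_BUSY_RUNNING) ? " RUNNING" : "");
}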
3502 
3503 /*
3504  * CPU hotplug.
3505  *
3506  * There are two challenges in supporting CPU hotplug. Firstly, there
3507  * are a lot of assumptions on strong associations among work, cwq and
3508  * gcwq which make migrating pending and scheduled works very
3509  * difficult to implement without impacting hot paths. Secondly,
3510  * gcwqs serve a mix of short, long and very long running works, making
3511  * blocked draining impractical.
3512  *
3513  * This is solved by allowing a gcwq to be disassociated from the CPU,
3514  * running as an unbound one, and allowing it to be reattached later if
3515  * the cpu comes back online.
3516  */
3517 
3518 /* claim manager positions of all pools */
3519 static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
3520 {
3521  struct worker_pool *pool;
3522 
3523  for_each_worker_pool(pool, gcwq)
3524  mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
3525  spin_lock_irq(&gcwq->lock);
3526 }
3527 
3528 /* release manager positions */
3529 static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
3530 {
3531  struct worker_pool *pool;
3532 
3533  spin_unlock_irq(&gcwq->lock);
3534  for_each_worker_pool(pool, gcwq)
3535  mutex_unlock(&pool->assoc_mutex);
3536 }
3537 
3538 static void gcwq_unbind_fn(struct work_struct *work)
3539 {
3540  struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3541  struct worker_pool *pool;
3542  struct worker *worker;
3543  struct hlist_node *pos;
3544  int i;
3545 
3546  BUG_ON(gcwq->cpu != smp_processor_id());
3547 
3548  gcwq_claim_assoc_and_lock(gcwq);
3549 
3550  /*
3551  * We've claimed all manager positions. Make all workers unbound
3552  * and set DISASSOCIATED. Before this, all workers except for the
3553  * ones which are still executing works from before the last CPU
3554  * down must be on the cpu. After this, they may become diasporas.
3555  */
3556  for_each_worker_pool(pool, gcwq)
3557  list_for_each_entry(worker, &pool->idle_list, entry)
3558  worker->flags |= WORKER_UNBOUND;
3559 
3560  for_each_busy_worker(worker, i, pos, gcwq)
3561  worker->flags |= WORKER_UNBOUND;
3562 
3563  gcwq->flags |= GCWQ_DISASSOCIATED;
3564 
3565  gcwq_release_assoc_and_unlock(gcwq);
3566 
3567  /*
3568  * Call schedule() so that we cross rq->lock and thus can guarantee
3569  * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3570  * as scheduler callbacks may be invoked from other cpus.
3571  */
3572  schedule();
3573 
3574  /*
3575  * Sched callbacks are disabled now. Zap nr_running. After this,
3576  * nr_running stays zero and need_more_worker() and keep_working()
3577  * are always true as long as the worklist is not empty. @gcwq now
3578  * behaves as unbound (in terms of concurrency management) gcwq
3579  * which is served by workers tied to the CPU.
3580  *
3581  * On return from this function, the current worker would trigger
3582  * unbound chain execution of pending work items if other workers
3583  * didn't already.
3584  */
3585  for_each_worker_pool(pool, gcwq)
3586  atomic_set(get_pool_nr_running(pool), 0);
3587 }
3588 
3589 /*
3590  * Workqueues should be brought up before normal priority CPU notifiers.
3591  * This will be registered high priority CPU notifier.
3592  */
3593 static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3594  unsigned long action,
3595  void *hcpu)
3596 {
3597  unsigned int cpu = (unsigned long)hcpu;
3598  struct global_cwq *gcwq = get_gcwq(cpu);
3599  struct worker_pool *pool;
3600 
3601  switch (action & ~CPU_TASKS_FROZEN) {
3602  case CPU_UP_PREPARE:
3603  for_each_worker_pool(pool, gcwq) {
3604  struct worker *worker;
3605 
3606  if (pool->nr_workers)
3607  continue;
3608 
3609  worker = create_worker(pool);
3610  if (!worker)
3611  return NOTIFY_BAD;
3612 
3613  spin_lock_irq(&gcwq->lock);
3614  start_worker(worker);
3615  spin_unlock_irq(&gcwq->lock);
3616  }
3617  break;
3618 
3619  case CPU_DOWN_FAILED:
3620  case CPU_ONLINE:
3621  gcwq_claim_assoc_and_lock(gcwq);
3622  gcwq->flags &= ~GCWQ_DISASSOCIATED;
3623  rebind_workers(gcwq);
3624  gcwq_release_assoc_and_unlock(gcwq);
3625  break;
3626  }
3627  return NOTIFY_OK;
3628 }
3629 
3630 /*
3631  * Workqueues should be brought down after normal priority CPU notifiers.
3632  * This will be registered as low priority CPU notifier.
3633  */
3634 static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3635  unsigned long action,
3636  void *hcpu)
3637 {
3638  unsigned int cpu = (unsigned long)hcpu;
3639  struct work_struct unbind_work;
3640 
3641  switch (action & ~CPU_TASKS_FROZEN) {
3642  case CPU_DOWN_PREPARE:
3643  /* unbinding should happen on the local CPU */
3644  INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
3645  queue_work_on(cpu, system_highpri_wq, &unbind_work);
3646  flush_work(&unbind_work);
3647  break;
3648  }
3649  return NOTIFY_OK;
3650 }
3651 
3652 #ifdef CONFIG_SMP
3653 
3654 struct work_for_cpu {
3655  struct work_struct work;
3656  long (*fn)(void *);
3657  void *arg;
3658  long ret;
3659 };
3660 
3661 static void work_for_cpu_fn(struct work_struct *work)
3662 {
3663  struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
3664 
3665  wfc->ret = wfc->fn(wfc->arg);
3666 }
3667 
3678 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3679 {
3680  struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3681 
3682  INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
3683  schedule_work_on(cpu, &wfc.work);
3684  flush_work(&wfc.work);
3685  return wfc.ret;
3686 }
3687 EXPORT_SYMBOL_GPL(work_on_cpu);
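/*
 * Editorial example (not part of workqueue.c): work_on_cpu() runs a function
 * synchronously in a worker bound to @cpu and returns its long result; the
 * caller must be able to sleep. Names are hypothetical.
 */
#include <linux/workqueue.h>

static long my_read_cpu_state(void *arg)
{
	/* executes on the CPU passed to work_on_cpu() */
	return 0;
}

static long my_query_cpu(unsigned int cpu)
{
	return work_on_cpu(cpu, my_read_cpu_state, NULL);
}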
3688 #endif /* CONFIG_SMP */
3689 
3690 #ifdef CONFIG_FREEZER
3691 
3702 void freeze_workqueues_begin(void)
3703 {
3704  unsigned int cpu;
3705 
3706  spin_lock(&workqueue_lock);
3707 
3708  BUG_ON(workqueue_freezing);
3709  workqueue_freezing = true;
3710 
3711  for_each_gcwq_cpu(cpu) {
3712  struct global_cwq *gcwq = get_gcwq(cpu);
3713  struct workqueue_struct *wq;
3714 
3715  spin_lock_irq(&gcwq->lock);
3716 
3717  BUG_ON(gcwq->flags & GCWQ_FREEZING);
3718  gcwq->flags |= GCWQ_FREEZING;
3719 
3720  list_for_each_entry(wq, &workqueues, list) {
3721  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3722 
3723  if (cwq && wq->flags & WQ_FREEZABLE)
3724  cwq->max_active = 0;
3725  }
3726 
3727  spin_unlock_irq(&gcwq->lock);
3728  }
3729 
3730  spin_unlock(&workqueue_lock);
3731 }
3732 
3746 bool freeze_workqueues_busy(void)
3747 {
3748  unsigned int cpu;
3749  bool busy = false;
3750 
3751  spin_lock(&workqueue_lock);
3752 
3753  BUG_ON(!workqueue_freezing);
3754 
3755  for_each_gcwq_cpu(cpu) {
3756  struct workqueue_struct *wq;
3757  /*
3758  * nr_active is monotonically decreasing. It's safe
3759  * to peek without lock.
3760  */
3761  list_for_each_entry(wq, &workqueues, list) {
3762  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3763 
3764  if (!cwq || !(wq->flags & WQ_FREEZABLE))
3765  continue;
3766 
3767  BUG_ON(cwq->nr_active < 0);
3768  if (cwq->nr_active) {
3769  busy = true;
3770  goto out_unlock;
3771  }
3772  }
3773  }
3774 out_unlock:
3775  spin_unlock(&workqueue_lock);
3776  return busy;
3777 }
3778 
3788 void thaw_workqueues(void)
3789 {
3790  unsigned int cpu;
3791 
3792  spin_lock(&workqueue_lock);
3793 
3794  if (!workqueue_freezing)
3795  goto out_unlock;
3796 
3797  for_each_gcwq_cpu(cpu) {
3798  struct global_cwq *gcwq = get_gcwq(cpu);
3799  struct worker_pool *pool;
3800  struct workqueue_struct *wq;
3801 
3802  spin_lock_irq(&gcwq->lock);
3803 
3804  BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3805  gcwq->flags &= ~GCWQ_FREEZING;
3806 
3807  list_for_each_entry(wq, &workqueues, list) {
3808  struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3809 
3810  if (!cwq || !(wq->flags & WQ_FREEZABLE))
3811  continue;
3812 
3813  /* restore max_active and repopulate worklist */
3814  cwq_set_max_active(cwq, wq->saved_max_active);
3815  }
3816 
3817  for_each_worker_pool(pool, gcwq)
3818  wake_up_worker(pool);
3819 
3820  spin_unlock_irq(&gcwq->lock);
3821  }
3822 
3823  workqueue_freezing = false;
3824 out_unlock:
3825  spin_unlock(&workqueue_lock);
3826 }
3827 #endif /* CONFIG_FREEZER */
3828 
3829 static int __init init_workqueues(void)
3830 {
3831  unsigned int cpu;
3832  int i;
3833 
3834  /* make sure we have enough bits for OFFQ CPU number */
3835  BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
3836  WORK_CPU_LAST);
3837 
3838  cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3839  hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3840 
3841  /* initialize gcwqs */
3842  for_each_gcwq_cpu(cpu) {
3843  struct global_cwq *gcwq = get_gcwq(cpu);
3844  struct worker_pool *pool;
3845 
3846  spin_lock_init(&gcwq->lock);
3847  gcwq->cpu = cpu;
3848  gcwq->flags |= GCWQ_DISASSOCIATED;
3849 
3850  for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3851  INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3852 
3853  for_each_worker_pool(pool, gcwq) {
3854  pool->gcwq = gcwq;
3855  INIT_LIST_HEAD(&pool->worklist);
3856  INIT_LIST_HEAD(&pool->idle_list);
3857 
3858  init_timer_deferrable(&pool->idle_timer);
3859  pool->idle_timer.function = idle_worker_timeout;
3860  pool->idle_timer.data = (unsigned long)pool;
3861 
3862  setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
3863  (unsigned long)pool);
3864 
3865  mutex_init(&pool->assoc_mutex);
3866  ida_init(&pool->worker_ida);
3867  }
3868  }
3869 
3870  /* create the initial worker */
3871  for_each_online_gcwq_cpu(cpu) {
3872  struct global_cwq *gcwq = get_gcwq(cpu);
3873  struct worker_pool *pool;
3874 
3875  if (cpu != WORK_CPU_UNBOUND)
3876  gcwq->flags &= ~GCWQ_DISASSOCIATED;
3877 
3878  for_each_worker_pool(pool, gcwq) {
3879  struct worker *worker;
3880 
3881  worker = create_worker(pool);
3882  BUG_ON(!worker);
3883  spin_lock_irq(&gcwq->lock);
3884  start_worker(worker);
3885  spin_unlock_irq(&gcwq->lock);
3886  }
3887  }
3888 
3889  system_wq = alloc_workqueue("events", 0, 0);
3890  system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
3891  system_long_wq = alloc_workqueue("events_long", 0, 0);
3892  system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3893  WQ_UNBOUND_MAX_ACTIVE);
3894  system_freezable_wq = alloc_workqueue("events_freezable",
3895  WQ_FREEZABLE, 0);
3896  BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
3897  !system_unbound_wq || !system_freezable_wq);
3898  return 0;
3899 }
3900 early_initcall(init_workqueues);