Linux Kernel  3.7.1
core.c
1 /*
2  * kernel/sched/core.c
3  *
4  * Kernel scheduler and related syscalls
5  *
6  * Copyright (C) 1991-2002 Linus Torvalds
7  *
8  * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9  * make semaphores SMP safe
10  * 1998-11-19 Implemented schedule_timeout() and related stuff
11  * by Andrea Arcangeli
12  * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13  * hybrid priority-list and round-robin design with
14  * an array-switch method of distributing timeslices
15  * and per-CPU runqueues. Cleanups and useful suggestions
16  * by Davide Libenzi, preemptible kernel bits by Robert Love.
17  * 2003-09-03 Interactivity tuning by Con Kolivas.
18  * 2004-04-02 Scheduler domains code by Nick Piggin
19  * 2007-04-15 Work begun on replacing all interactivity tuning with a
20  * fair scheduling design by Con Kolivas.
21  * 2007-05-05 Load balancing (smp-nice) and other improvements
22  * by Peter Williams
23  * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24  * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25  * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26  * Thomas Gleixner, Mike Kravetz
27  */
28 
29 #include <linux/mm.h>
30 #include <linux/module.h>
31 #include <linux/nmi.h>
32 #include <linux/init.h>
33 #include <linux/uaccess.h>
34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h>
37 #include <linux/capability.h>
38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h>
42 #include <linux/security.h>
43 #include <linux/notifier.h>
44 #include <linux/profile.h>
45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h>
48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h>
51 #include <linux/threads.h>
52 #include <linux/timer.h>
53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h>
55 #include <linux/cpuset.h>
56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h>
59 #include <linux/sysctl.h>
60 #include <linux/syscalls.h>
61 #include <linux/times.h>
62 #include <linux/tsacct_kern.h>
63 #include <linux/kprobes.h>
64 #include <linux/delayacct.h>
65 #include <linux/unistd.h>
66 #include <linux/pagemap.h>
67 #include <linux/hrtimer.h>
68 #include <linux/tick.h>
69 #include <linux/debugfs.h>
70 #include <linux/ctype.h>
71 #include <linux/ftrace.h>
72 #include <linux/slab.h>
73 #include <linux/init_task.h>
74 #include <linux/binfmts.h>
75 
76 #include <asm/switch_to.h>
77 #include <asm/tlb.h>
78 #include <asm/irq_regs.h>
79 #include <asm/mutex.h>
80 #ifdef CONFIG_PARAVIRT
81 #include <asm/paravirt.h>
82 #endif
83 
84 #include "sched.h"
85 #include "../workqueue_sched.h"
86 #include "../smpboot.h"
87 
88 #define CREATE_TRACE_POINTS
89 #include <trace/events/sched.h>
90 
91 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
92 {
93  unsigned long delta;
94  ktime_t soft, hard, now;
95 
96  for (;;) {
97  if (hrtimer_active(period_timer))
98  break;
99 
100  now = hrtimer_cb_get_time(period_timer);
101  hrtimer_forward(period_timer, now, period);
102 
103  soft = hrtimer_get_softexpires(period_timer);
104  hard = hrtimer_get_expires(period_timer);
105  delta = ktime_to_ns(ktime_sub(hard, soft));
106  __hrtimer_start_range_ns(period_timer, soft, delta,
107  HRTIMER_MODE_ABS_PINNED, 0);
108  }
109 }
110 
111 DEFINE_MUTEX(sched_domains_mutex);
112 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
113 
114 static void update_rq_clock_task(struct rq *rq, s64 delta);
115 
116 void update_rq_clock(struct rq *rq)
117 {
118  s64 delta;
119 
120  if (rq->skip_clock_update > 0)
121  return;
122 
123  delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
124  rq->clock += delta;
125  update_rq_clock_task(rq, delta);
126 }
127 
128 /*
129  * Debugging: various feature bits
130  */
131 
132 #define SCHED_FEAT(name, enabled) \
133  (1UL << __SCHED_FEAT_##name) * enabled |
134 
135 const_debug unsigned int sysctl_sched_features =
136 #include "features.h"
137  0;
138 
139 #undef SCHED_FEAT
140 
141 #ifdef CONFIG_SCHED_DEBUG
142 #define SCHED_FEAT(name, enabled) \
143  #name ,
144 
145 static const char * const sched_feat_names[] = {
146 #include "features.h"
147 };
148 
149 #undef SCHED_FEAT
150 
151 static int sched_feat_show(struct seq_file *m, void *v)
152 {
153  int i;
154 
155  for (i = 0; i < __SCHED_FEAT_NR; i++) {
156  if (!(sysctl_sched_features & (1UL << i)))
157  seq_puts(m, "NO_");
158  seq_printf(m, "%s ", sched_feat_names[i]);
159  }
160  seq_puts(m, "\n");
161 
162  return 0;
163 }
164 
165 #ifdef HAVE_JUMP_LABEL
166 
167 #define jump_label_key__true STATIC_KEY_INIT_TRUE
168 #define jump_label_key__false STATIC_KEY_INIT_FALSE
169 
170 #define SCHED_FEAT(name, enabled) \
171  jump_label_key__##enabled ,
172 
173 struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
174 #include "features.h"
175 };
176 
177 #undef SCHED_FEAT
178 
179 static void sched_feat_disable(int i)
180 {
181  if (static_key_enabled(&sched_feat_keys[i]))
182  static_key_slow_dec(&sched_feat_keys[i]);
183 }
184 
185 static void sched_feat_enable(int i)
186 {
187  if (!static_key_enabled(&sched_feat_keys[i]))
188  static_key_slow_inc(&sched_feat_keys[i]);
189 }
190 #else
191 static void sched_feat_disable(int i) { };
192 static void sched_feat_enable(int i) { };
193 #endif /* HAVE_JUMP_LABEL */
194 
195 static ssize_t
196 sched_feat_write(struct file *filp, const char __user *ubuf,
197  size_t cnt, loff_t *ppos)
198 {
199  char buf[64];
200  char *cmp;
201  int neg = 0;
202  int i;
203 
204  if (cnt > 63)
205  cnt = 63;
206 
207  if (copy_from_user(&buf, ubuf, cnt))
208  return -EFAULT;
209 
210  buf[cnt] = 0;
211  cmp = strstrip(buf);
212 
213  if (strncmp(cmp, "NO_", 3) == 0) {
214  neg = 1;
215  cmp += 3;
216  }
217 
218  for (i = 0; i < __SCHED_FEAT_NR; i++) {
219  if (strcmp(cmp, sched_feat_names[i]) == 0) {
220  if (neg) {
221  sysctl_sched_features &= ~(1UL << i);
222  sched_feat_disable(i);
223  } else {
224  sysctl_sched_features |= (1UL << i);
225  sched_feat_enable(i);
226  }
227  break;
228  }
229  }
230 
231  if (i == __SCHED_FEAT_NR)
232  return -EINVAL;
233 
234  *ppos += cnt;
235 
236  return cnt;
237 }
238 
239 static int sched_feat_open(struct inode *inode, struct file *filp)
240 {
241  return single_open(filp, sched_feat_show, NULL);
242 }
243 
244 static const struct file_operations sched_feat_fops = {
245  .open = sched_feat_open,
246  .write = sched_feat_write,
247  .read = seq_read,
248  .llseek = seq_lseek,
249  .release = single_release,
250 };
251 
252 static __init int sched_init_debug(void)
253 {
254  debugfs_create_file("sched_features", 0644, NULL, NULL,
255  &sched_feat_fops);
256 
257  return 0;
258 }
259 late_initcall(sched_init_debug);
260 #endif /* CONFIG_SCHED_DEBUG */
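/*
 * Editor's note: illustrative userspace sketch, not part of core.c. It shows
 * how the debugfs file created by sched_init_debug() above is driven; the
 * helper name sched_feat_set() and the path assume debugfs is mounted at
 * /sys/kernel/debug. Writing "<name>" sets a feature, "NO_<name>" clears it,
 * exactly as sched_feat_write() parses.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int sched_feat_set(const char *feat)
{
	int fd = open("/sys/kernel/debug/sched_features", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, feat, strlen(feat));	/* e.g. "HRTICK" or "NO_HRTICK" */
	close(fd);
	return ret < 0 ? -1 : 0;
}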
261 
262 /*
263  * Number of tasks to iterate in a single balance run.
264  * Limited because this is done with IRQs disabled.
265  */
266 const_debug unsigned int sysctl_sched_nr_migrate = 32;
267 
268 /*
269  * period over which we average the RT time consumption, measured
270  * in ms.
271  *
272  * default: 1s
273  */
274 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
275 
276 /*
277  * period over which we measure -rt task cpu usage in us.
278  * default: 1s
279  */
280 unsigned int sysctl_sched_rt_period = 1000000;
281 
282 __read_mostly int scheduler_running;
283 
284 /*
285  * part of the period that we allow rt tasks to run in us.
286  * default: 0.95s
287  */
288 int sysctl_sched_rt_runtime = 950000;
289 
290 
291 
292 /*
293  * __task_rq_lock - lock the rq @p resides on.
294  */
295 static inline struct rq *__task_rq_lock(struct task_struct *p)
296  __acquires(rq->lock)
297 {
298  struct rq *rq;
299 
300  lockdep_assert_held(&p->pi_lock);
301 
302  for (;;) {
303  rq = task_rq(p);
304  raw_spin_lock(&rq->lock);
305  if (likely(rq == task_rq(p)))
306  return rq;
307  raw_spin_unlock(&rq->lock);
308  }
309 }
310 
311 /*
312  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
313  */
314 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
315  __acquires(p->pi_lock)
316  __acquires(rq->lock)
317 {
318  struct rq *rq;
319 
320  for (;;) {
321  raw_spin_lock_irqsave(&p->pi_lock, *flags);
322  rq = task_rq(p);
323  raw_spin_lock(&rq->lock);
324  if (likely(rq == task_rq(p)))
325  return rq;
326  raw_spin_unlock(&rq->lock);
327  raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
328  }
329 }
330 
331 static void __task_rq_unlock(struct rq *rq)
332  __releases(rq->lock)
333 {
334  raw_spin_unlock(&rq->lock);
335 }
336 
337 static inline void
338 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
339  __releases(rq->lock)
340  __releases(p->pi_lock)
341 {
342  raw_spin_unlock(&rq->lock);
343  raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
344 }
345 
346 /*
347  * this_rq_lock - lock this runqueue and disable interrupts.
348  */
349 static struct rq *this_rq_lock(void)
350  __acquires(rq->lock)
351 {
352  struct rq *rq;
353 
354  local_irq_disable();
355  rq = this_rq();
356  raw_spin_lock(&rq->lock);
357 
358  return rq;
359 }
360 
361 #ifdef CONFIG_SCHED_HRTICK
362 /*
363  * Use HR-timers to deliver accurate preemption points.
364  *
365  * It's all a bit involved since we cannot program an hrtimer while holding the
366  * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
367  * reschedule event.
368  *
369  * When we get rescheduled we reprogram the hrtick_timer outside of the
370  * rq->lock.
371  */
372 
373 static void hrtick_clear(struct rq *rq)
374 {
375  if (hrtimer_active(&rq->hrtick_timer))
376  hrtimer_cancel(&rq->hrtick_timer);
377 }
378 
379 /*
380  * High-resolution timer tick.
381  * Runs from hardirq context with interrupts disabled.
382  */
383 static enum hrtimer_restart hrtick(struct hrtimer *timer)
384 {
385  struct rq *rq = container_of(timer, struct rq, hrtick_timer);
386 
387  WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
388 
389  raw_spin_lock(&rq->lock);
390  update_rq_clock(rq);
391  rq->curr->sched_class->task_tick(rq, rq->curr, 1);
392  raw_spin_unlock(&rq->lock);
393 
394  return HRTIMER_NORESTART;
395 }
396 
397 #ifdef CONFIG_SMP
398 /*
399  * called from hardirq (IPI) context
400  */
401 static void __hrtick_start(void *arg)
402 {
403  struct rq *rq = arg;
404 
405  raw_spin_lock(&rq->lock);
406  hrtimer_restart(&rq->hrtick_timer);
407  rq->hrtick_csd_pending = 0;
408  raw_spin_unlock(&rq->lock);
409 }
410 
411 /*
412  * Called to set the hrtick timer state.
413  *
414  * called with rq->lock held and irqs disabled
415  */
416 void hrtick_start(struct rq *rq, u64 delay)
417 {
418  struct hrtimer *timer = &rq->hrtick_timer;
419  ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
420 
421  hrtimer_set_expires(timer, time);
422 
423  if (rq == this_rq()) {
424  hrtimer_restart(timer);
425  } else if (!rq->hrtick_csd_pending) {
426  __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
427  rq->hrtick_csd_pending = 1;
428  }
429 }
430 
431 static int
432 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
433 {
434  int cpu = (int)(long)hcpu;
435 
436  switch (action) {
437  case CPU_UP_CANCELED:
438  case CPU_UP_CANCELED_FROZEN:
439  case CPU_DOWN_PREPARE:
440  case CPU_DOWN_PREPARE_FROZEN:
441  case CPU_DEAD:
442  case CPU_DEAD_FROZEN:
443  hrtick_clear(cpu_rq(cpu));
444  return NOTIFY_OK;
445  }
446 
447  return NOTIFY_DONE;
448 }
449 
450 static __init void init_hrtick(void)
451 {
452  hotcpu_notifier(hotplug_hrtick, 0);
453 }
454 #else
455 /*
456  * Called to set the hrtick timer state.
457  *
458  * called with rq->lock held and irqs disabled
459  */
460 void hrtick_start(struct rq *rq, u64 delay)
461 {
462  __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
463  HRTIMER_MODE_REL_PINNED, 0);
464 }
465 
466 static inline void init_hrtick(void)
467 {
468 }
469 #endif /* CONFIG_SMP */
470 
471 static void init_rq_hrtick(struct rq *rq)
472 {
473 #ifdef CONFIG_SMP
474  rq->hrtick_csd_pending = 0;
475 
476  rq->hrtick_csd.flags = 0;
477  rq->hrtick_csd.func = __hrtick_start;
478  rq->hrtick_csd.info = rq;
479 #endif
480 
481  hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
482  rq->hrtick_timer.function = hrtick;
483 }
484 #else /* CONFIG_SCHED_HRTICK */
485 static inline void hrtick_clear(struct rq *rq)
486 {
487 }
488 
489 static inline void init_rq_hrtick(struct rq *rq)
490 {
491 }
492 
493 static inline void init_hrtick(void)
494 {
495 }
496 #endif /* CONFIG_SCHED_HRTICK */
497 
498 /*
499  * resched_task - mark a task 'to be rescheduled now'.
500  *
501  * On UP this means the setting of the need_resched flag, on SMP it
502  * might also involve a cross-CPU call to trigger the scheduler on
503  * the target CPU.
504  */
505 #ifdef CONFIG_SMP
506 
507 #ifndef tsk_is_polling
508 #define tsk_is_polling(t) 0
509 #endif
510 
511 void resched_task(struct task_struct *p)
512 {
513  int cpu;
514 
515  assert_raw_spin_locked(&task_rq(p)->lock);
516 
517  if (test_tsk_need_resched(p))
518  return;
519 
520  set_tsk_need_resched(p);
521 
522  cpu = task_cpu(p);
523  if (cpu == smp_processor_id())
524  return;
525 
526  /* NEED_RESCHED must be visible before we test polling */
527  smp_mb();
528  if (!tsk_is_polling(p))
529  smp_send_reschedule(cpu);
530 }
531 
532 void resched_cpu(int cpu)
533 {
534  struct rq *rq = cpu_rq(cpu);
535  unsigned long flags;
536 
537  if (!raw_spin_trylock_irqsave(&rq->lock, flags))
538  return;
539  resched_task(cpu_curr(cpu));
540  raw_spin_unlock_irqrestore(&rq->lock, flags);
541 }
542 
543 #ifdef CONFIG_NO_HZ
544 /*
545  * In the semi idle case, use the nearest busy cpu for migrating timers
546  * from an idle cpu. This is good for power-savings.
547  *
548  * We don't do similar optimization for completely idle system, as
549  * selecting an idle cpu will add more delays to the timers than intended
550  * (as that cpu's timer base may not be up to date w.r.t. jiffies etc).
551  */
552 int get_nohz_timer_target(void)
553 {
554  int cpu = smp_processor_id();
555  int i;
556  struct sched_domain *sd;
557 
558  rcu_read_lock();
559  for_each_domain(cpu, sd) {
560  for_each_cpu(i, sched_domain_span(sd)) {
561  if (!idle_cpu(i)) {
562  cpu = i;
563  goto unlock;
564  }
565  }
566  }
567 unlock:
568  rcu_read_unlock();
569  return cpu;
570 }
571 /*
572  * When add_timer_on() enqueues a timer into the timer wheel of an
573  * idle CPU then this timer might expire before the next timer event
574  * which is scheduled to wake up that CPU. In case of a completely
575  * idle system the next event might even be infinite time into the
576  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
577  * leaves the inner idle loop so the newly added timer is taken into
578  * account when the CPU goes back to idle and evaluates the timer
579  * wheel for the next timer event.
580  */
581 void wake_up_idle_cpu(int cpu)
582 {
583  struct rq *rq = cpu_rq(cpu);
584 
585  if (cpu == smp_processor_id())
586  return;
587 
588  /*
589  * This is safe, as this function is called with the timer
590  * wheel base lock of (cpu) held. When the CPU is on the way
591  * to idle and has not yet set rq->curr to idle then it will
592  * be serialized on the timer wheel base lock and take the new
593  * timer into account automatically.
594  */
595  if (rq->curr != rq->idle)
596  return;
597 
598  /*
599  * We can set TIF_RESCHED on the idle task of the other CPU
600  * lockless. The worst case is that the other CPU runs the
601  * idle task through an additional NOOP schedule()
602  */
603  set_tsk_need_resched(rq->idle);
604 
605  /* NEED_RESCHED must be visible before we test polling */
606  smp_mb();
607  if (!tsk_is_polling(rq->idle))
608  smp_send_reschedule(cpu);
609 }
610 
611 static inline bool got_nohz_idle_kick(void)
612 {
613  int cpu = smp_processor_id();
614  return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
615 }
616 
617 #else /* CONFIG_NO_HZ */
618 
619 static inline bool got_nohz_idle_kick(void)
620 {
621  return false;
622 }
623 
624 #endif /* CONFIG_NO_HZ */
625 
626 void sched_avg_update(struct rq *rq)
627 {
628  s64 period = sched_avg_period();
629 
630  while ((s64)(rq->clock - rq->age_stamp) > period) {
631  /*
632  * Inline assembly required to prevent the compiler
633  * optimising this loop into a divmod call.
634  * See __iter_div_u64_rem() for another example of this.
635  */
636  asm("" : "+rm" (rq->age_stamp));
637  rq->age_stamp += period;
638  rq->rt_avg /= 2;
639  }
640 }
641 
642 #else /* !CONFIG_SMP */
643 void resched_task(struct task_struct *p)
644 {
645  assert_raw_spin_locked(&task_rq(p)->lock);
646  set_tsk_need_resched(p);
647 }
648 #endif /* CONFIG_SMP */
649 
650 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
651  (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
652 /*
653  * Iterate task_group tree rooted at *from, calling @down when first entering a
654  * node and @up when leaving it for the final time.
655  *
656  * Caller must hold rcu_lock or sufficient equivalent.
657  */
658 int walk_tg_tree_from(struct task_group *from,
659  tg_visitor down, tg_visitor up, void *data)
660 {
661  struct task_group *parent, *child;
662  int ret;
663 
664  parent = from;
665 
666 down:
667  ret = (*down)(parent, data);
668  if (ret)
669  goto out;
670  list_for_each_entry_rcu(child, &parent->children, siblings) {
671  parent = child;
672  goto down;
673 
674 up:
675  continue;
676  }
677  ret = (*up)(parent, data);
678  if (ret || parent == from)
679  goto out;
680 
681  child = parent;
682  parent = parent->parent;
683  if (parent)
684  goto up;
685 out:
686  return ret;
687 }
688 
689 int tg_nop(struct task_group *tg, void *data)
690 {
691  return 0;
692 }
693 #endif
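/*
 * Editor's note: illustrative sketch, not part of core.c. It shows the
 * calling convention of walk_tg_tree_from() above: the @down visitor runs on
 * first entry to each group, the @up visitor on final exit, and a non-zero
 * return from either aborts the walk. count_tg() and count_task_groups()
 * are hypothetical helpers.
 */
static int count_tg(struct task_group *tg, void *data)
{
	(*(int *)data)++;	/* count each group on the way down */
	return 0;		/* keep walking */
}

static int count_task_groups(struct task_group *from)
{
	int nr = 0;

	rcu_read_lock();	/* the tree is RCU-protected, see comment above */
	walk_tg_tree_from(from, count_tg, tg_nop, &nr);
	rcu_read_unlock();

	return nr;
}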
694 
695 static void set_load_weight(struct task_struct *p)
696 {
697  int prio = p->static_prio - MAX_RT_PRIO;
698  struct load_weight *load = &p->se.load;
699 
700  /*
701  * SCHED_IDLE tasks get minimal weight:
702  */
703  if (p->policy == SCHED_IDLE) {
704  load->weight = scale_load(WEIGHT_IDLEPRIO);
705  load->inv_weight = WMULT_IDLEPRIO;
706  return;
707  }
708 
709  load->weight = scale_load(prio_to_weight[prio]);
710  load->inv_weight = prio_to_wmult[prio];
711 }
712 
713 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
714 {
715  update_rq_clock(rq);
716  sched_info_queued(p);
717  p->sched_class->enqueue_task(rq, p, flags);
718 }
719 
720 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
721 {
722  update_rq_clock(rq);
723  sched_info_dequeued(p);
724  p->sched_class->dequeue_task(rq, p, flags);
725 }
726 
727 void activate_task(struct rq *rq, struct task_struct *p, int flags)
728 {
729  if (task_contributes_to_load(p))
730  rq->nr_uninterruptible--;
731 
732  enqueue_task(rq, p, flags);
733 }
734 
735 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
736 {
737  if (task_contributes_to_load(p))
738  rq->nr_uninterruptible++;
739 
740  dequeue_task(rq, p, flags);
741 }
742 
743 static void update_rq_clock_task(struct rq *rq, s64 delta)
744 {
745 /*
746  * In theory, the compiler should just see 0 here, and optimize out the call
747  * to sched_rt_avg_update. But I don't trust it...
748  */
749 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
750  s64 steal = 0, irq_delta = 0;
751 #endif
752 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
753  irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
754 
755  /*
756  * Since irq_time is only updated on {soft,}irq_exit, we might run into
757  * this case when a previous update_rq_clock() happened inside a
758  * {soft,}irq region.
759  *
760  * When this happens, we stop ->clock_task and only update the
761  * prev_irq_time stamp to account for the part that fit, so that a next
762  * update will consume the rest. This ensures ->clock_task is
763  * monotonic.
764  *
765  * It does however cause some slight misattribution of {soft,}irq
766  * time, a more accurate solution would be to update the irq_time using
767  * the current rq->clock timestamp, except that would require using
768  * atomic ops.
769  */
770  if (irq_delta > delta)
771  irq_delta = delta;
772 
773  rq->prev_irq_time += irq_delta;
774  delta -= irq_delta;
775 #endif
776 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
777  if (static_key_false((&paravirt_steal_rq_enabled))) {
778  u64 st;
779 
780  steal = paravirt_steal_clock(cpu_of(rq));
781  steal -= rq->prev_steal_time_rq;
782 
783  if (unlikely(steal > delta))
784  steal = delta;
785 
786  st = steal_ticks(steal);
787  steal = st * TICK_NSEC;
788 
789  rq->prev_steal_time_rq += steal;
790 
791  delta -= steal;
792  }
793 #endif
794 
795  rq->clock_task += delta;
796 
797 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
798  if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
799  sched_rt_avg_update(rq, irq_delta + steal);
800 #endif
801 }
802 
803 void sched_set_stop_task(int cpu, struct task_struct *stop)
804 {
805  struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
806  struct task_struct *old_stop = cpu_rq(cpu)->stop;
807 
808  if (stop) {
809  /*
810  * Make it appear like a SCHED_FIFO task, it's something
811  * userspace knows about and won't get confused about.
812  *
813  * Also, it will make PI more or less work without too
814  * much confusion -- but then, stop work should not
815  * rely on PI working anyway.
816  */
817  sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
818 
819  stop->sched_class = &stop_sched_class;
820  }
821 
822  cpu_rq(cpu)->stop = stop;
823 
824  if (old_stop) {
825  /*
826  * Reset it back to a normal scheduling class so that
827  * it can die in pieces.
828  */
829  old_stop->sched_class = &rt_sched_class;
830  }
831 }
832 
833 /*
834  * __normal_prio - return the priority that is based on the static prio
835  */
836 static inline int __normal_prio(struct task_struct *p)
837 {
838  return p->static_prio;
839 }
840 
841 /*
842  * Calculate the expected normal priority: i.e. priority
843  * without taking RT-inheritance into account. Might be
844  * boosted by interactivity modifiers. Changes upon fork,
845  * setprio syscalls, and whenever the interactivity
846  * estimator recalculates.
847  */
848 static inline int normal_prio(struct task_struct *p)
849 {
850  int prio;
851 
852  if (task_has_rt_policy(p))
853  prio = MAX_RT_PRIO-1 - p->rt_priority;
854  else
855  prio = __normal_prio(p);
856  return prio;
857 }
858 
859 /*
860  * Calculate the current priority, i.e. the priority
861  * taken into account by the scheduler. This value might
862  * be boosted by RT tasks, or might be boosted by
863  * interactivity modifiers. Will be RT if the task got
864  * RT-boosted. If not then it returns p->normal_prio.
865  */
866 static int effective_prio(struct task_struct *p)
867 {
868  p->normal_prio = normal_prio(p);
869  /*
870  * If we are RT tasks or we were boosted to RT priority,
871  * keep the priority unchanged. Otherwise, update priority
872  * to the normal priority:
873  */
874  if (!rt_prio(p->prio))
875  return p->normal_prio;
876  return p->prio;
877 }
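/*
 * Editor's note, not part of core.c: concrete values for the helpers above,
 * using the usual MAX_RT_PRIO = 100 layout (lower kernel prio means more
 * important):
 *
 *	SCHED_FIFO,   rt_priority = 10  ->  normal_prio = 100 - 1 - 10 = 89
 *	SCHED_NORMAL, nice  0           ->  normal_prio = static_prio = 120
 *	SCHED_NORMAL, nice -5           ->  normal_prio = static_prio = 115
 *
 * effective_prio() only deviates from normal_prio() while the task is
 * PI-boosted into the RT range.
 */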
878 
883 inline int task_curr(const struct task_struct *p)
884 {
885  return cpu_curr(task_cpu(p)) == p;
886 }
887 
888 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
889  const struct sched_class *prev_class,
890  int oldprio)
891 {
892  if (prev_class != p->sched_class) {
893  if (prev_class->switched_from)
894  prev_class->switched_from(rq, p);
895  p->sched_class->switched_to(rq, p);
896  } else if (oldprio != p->prio)
897  p->sched_class->prio_changed(rq, p, oldprio);
898 }
899 
900 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
901 {
902  const struct sched_class *class;
903 
904  if (p->sched_class == rq->curr->sched_class) {
905  rq->curr->sched_class->check_preempt_curr(rq, p, flags);
906  } else {
907  for_each_class(class) {
908  if (class == rq->curr->sched_class)
909  break;
910  if (class == p->sched_class) {
911  resched_task(rq->curr);
912  break;
913  }
914  }
915  }
916 
917  /*
918  * A queue event has occurred, and we're going to schedule. In
919  * this case, we can save a useless back to back clock update.
920  */
921  if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
922  rq->skip_clock_update = 1;
923 }
924 
925 #ifdef CONFIG_SMP
926 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
927 {
928 #ifdef CONFIG_SCHED_DEBUG
929  /*
930  * We should never call set_task_cpu() on a blocked task,
931  * ttwu() will sort out the placement.
932  */
933  WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
934  !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
935 
936 #ifdef CONFIG_LOCKDEP
937  /*
938  * The caller should hold either p->pi_lock or rq->lock, when changing
939  * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
940  *
941  * sched_move_task() holds both and thus holding either pins the cgroup,
942  * see task_group().
943  *
944  * Furthermore, all task_rq users should acquire both locks, see
945  * task_rq_lock().
946  */
947  WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
948  lockdep_is_held(&task_rq(p)->lock)));
949 #endif
950 #endif
951 
952  trace_sched_migrate_task(p, new_cpu);
953 
954  if (task_cpu(p) != new_cpu) {
955  p->se.nr_migrations++;
956  perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
957  }
958 
959  __set_task_cpu(p, new_cpu);
960 }
961 
962 struct migration_arg {
963  struct task_struct *task;
964  int dest_cpu;
965 };
966 
967 static int migration_cpu_stop(void *data);
968 
969 /*
970  * wait_task_inactive - wait for a thread to unschedule.
971  *
972  * If @match_state is nonzero, it's the @p->state value just checked and
973  * not expected to change. If it changes, i.e. @p might have woken up,
974  * then return zero. When we succeed in waiting for @p to be off its CPU,
975  * we return a positive number (its total switch count). If a second call
976  * a short while later returns the same number, the caller can be sure that
977  * @p has remained unscheduled the whole time.
978  *
979  * The caller must ensure that the task *will* unschedule sometime soon,
980  * else this function might spin for a *long* time. This function can't
981  * be called with interrupts off, or it may introduce deadlock with
982  * smp_call_function() if an IPI is sent by the same process we are
983  * waiting to become inactive.
984  */
985 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
986 {
987  unsigned long flags;
988  int running, on_rq;
989  unsigned long ncsw;
990  struct rq *rq;
991 
992  for (;;) {
993  /*
994  * We do the initial early heuristics without holding
995  * any task-queue locks at all. We'll only try to get
996  * the runqueue lock when things look like they will
997  * work out!
998  */
999  rq = task_rq(p);
1000 
1001  /*
1002  * If the task is actively running on another CPU
1003  * still, just relax and busy-wait without holding
1004  * any locks.
1005  *
1006  * NOTE! Since we don't hold any locks, it's not
1007  * even sure that "rq" stays as the right runqueue!
1008  * But we don't care, since "task_running()" will
1009  * return false if the runqueue has changed and p
1010  * is actually now running somewhere else!
1011  */
1012  while (task_running(rq, p)) {
1013  if (match_state && unlikely(p->state != match_state))
1014  return 0;
1015  cpu_relax();
1016  }
1017 
1018  /*
1019  * Ok, time to look more closely! We need the rq
1020  * lock now, to be *sure*. If we're wrong, we'll
1021  * just go back and repeat.
1022  */
1023  rq = task_rq_lock(p, &flags);
1024  trace_sched_wait_task(p);
1025  running = task_running(rq, p);
1026  on_rq = p->on_rq;
1027  ncsw = 0;
1028  if (!match_state || p->state == match_state)
1029  ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1030  task_rq_unlock(rq, p, &flags);
1031 
1032  /*
1033  * If it changed from the expected state, bail out now.
1034  */
1035  if (unlikely(!ncsw))
1036  break;
1037 
1038  /*
1039  * Was it really running after all now that we
1040  * checked with the proper locks actually held?
1041  *
1042  * Oops. Go back and try again..
1043  */
1044  if (unlikely(running)) {
1045  cpu_relax();
1046  continue;
1047  }
1048 
1049  /*
1050  * It's not enough that it's not actively running,
1051  * it must be off the runqueue _entirely_, and not
1052  * preempted!
1053  *
1054  * So if it was still runnable (but just not actively
1055  * running right now), it's preempted, and we should
1056  * yield - it could be a while.
1057  */
1058  if (unlikely(on_rq)) {
1059  ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1060 
1061  set_current_state(TASK_UNINTERRUPTIBLE);
1062  schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1063  continue;
1064  }
1065 
1066  /*
1067  * Ahh, all good. It wasn't running, and it wasn't
1068  * runnable, which means that it will never become
1069  * running in the future either. We're all done!
1070  */
1071  break;
1072  }
1073 
1074  return ncsw;
1075 }
1076 
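/*
 * Editor's note: illustrative sketch, not part of core.c. It shows the
 * double-call pattern described above: identical non-zero switch counts from
 * two calls prove the task never ran in between. ensure_stopped() is a
 * hypothetical helper in the style of the ptrace code.
 */
static int ensure_stopped(struct task_struct *p)
{
	unsigned long ncsw, again;

	ncsw = wait_task_inactive(p, TASK_TRACED);	/* 0: state changed, bail */
	if (!ncsw)
		return -EAGAIN;

	/* ... inspect the (now off-CPU) task here ... */

	again = wait_task_inactive(p, TASK_TRACED);
	if (again != ncsw)
		return -EAGAIN;		/* it was scheduled in between, retry */

	return 0;
}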
1077 /***
1078  * kick_process - kick a running thread to enter/exit the kernel
1079  * @p: the to-be-kicked thread
1080  *
1081  * Cause a process which is running on another CPU to enter
1082  * kernel-mode, without any delay. (to get signals handled.)
1083  *
1084  * NOTE: this function doesn't have to take the runqueue lock,
1085  * because all it wants to ensure is that the remote task enters
1086  * the kernel. If the IPI races and the task has been migrated
1087  * to another CPU then no harm is done and the purpose has been
1088  * achieved as well.
1089  */
1090 void kick_process(struct task_struct *p)
1091 {
1092  int cpu;
1093 
1094  preempt_disable();
1095  cpu = task_cpu(p);
1096  if ((cpu != smp_processor_id()) && task_curr(p))
1097  smp_send_reschedule(cpu);
1098  preempt_enable();
1099 }
1100 EXPORT_SYMBOL_GPL(kick_process);
1101 #endif /* CONFIG_SMP */
1102 
1103 #ifdef CONFIG_SMP
1104 /*
1105  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1106  */
1107 static int select_fallback_rq(int cpu, struct task_struct *p)
1108 {
1109  const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1110  enum { cpuset, possible, fail } state = cpuset;
1111  int dest_cpu;
1112 
1113  /* Look for allowed, online CPU in same node. */
1114  for_each_cpu(dest_cpu, nodemask) {
1115  if (!cpu_online(dest_cpu))
1116  continue;
1117  if (!cpu_active(dest_cpu))
1118  continue;
1119  if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1120  return dest_cpu;
1121  }
1122 
1123  for (;;) {
1124  /* Any allowed, online CPU? */
1125  for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1126  if (!cpu_online(dest_cpu))
1127  continue;
1128  if (!cpu_active(dest_cpu))
1129  continue;
1130  goto out;
1131  }
1132 
1133  switch (state) {
1134  case cpuset:
1135  /* No more Mr. Nice Guy. */
1136  cpuset_cpus_allowed_fallback(p);
1137  state = possible;
1138  break;
1139 
1140  case possible:
1141  do_set_cpus_allowed(p, cpu_possible_mask);
1142  state = fail;
1143  break;
1144 
1145  case fail:
1146  BUG();
1147  break;
1148  }
1149  }
1150 
1151 out:
1152  if (state != cpuset) {
1153  /*
1154  * Don't tell them about moving exiting tasks or
1155  * kernel threads (both mm NULL), since they never
1156  * leave kernel.
1157  */
1158  if (p->mm && printk_ratelimit()) {
1159  printk_sched("process %d (%s) no longer affine to cpu%d\n",
1160  task_pid_nr(p), p->comm, cpu);
1161  }
1162  }
1163 
1164  return dest_cpu;
1165 }
1166 
1167 /*
1168  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1169  */
1170 static inline
1171 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1172 {
1173  int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1174 
1175  /*
1176  * In order not to call set_task_cpu() on a blocking task we need
1177  * to rely on ttwu() to place the task on a valid ->cpus_allowed
1178  * cpu.
1179  *
1180  * Since this is common to all placement strategies, this lives here.
1181  *
1182  * [ this allows ->select_task() to simply return task_cpu(p) and
1183  * not worry about this generic constraint ]
1184  */
1185  if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1186  !cpu_online(cpu)))
1187  cpu = select_fallback_rq(task_cpu(p), p);
1188 
1189  return cpu;
1190 }
1191 
1192 static void update_avg(u64 *avg, u64 sample)
1193 {
1194  s64 diff = sample - *avg;
1195  *avg += diff >> 3;
1196 }
1197 #endif
1198 
1199 static void
1200 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1201 {
1202 #ifdef CONFIG_SCHEDSTATS
1203  struct rq *rq = this_rq();
1204 
1205 #ifdef CONFIG_SMP
1206  int this_cpu = smp_processor_id();
1207 
1208  if (cpu == this_cpu) {
1209  schedstat_inc(rq, ttwu_local);
1210  schedstat_inc(p, se.statistics.nr_wakeups_local);
1211  } else {
1212  struct sched_domain *sd;
1213 
1214  schedstat_inc(p, se.statistics.nr_wakeups_remote);
1215  rcu_read_lock();
1216  for_each_domain(this_cpu, sd) {
1217  if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1218  schedstat_inc(sd, ttwu_wake_remote);
1219  break;
1220  }
1221  }
1222  rcu_read_unlock();
1223  }
1224 
1225  if (wake_flags & WF_MIGRATED)
1226  schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1227 
1228 #endif /* CONFIG_SMP */
1229 
1230  schedstat_inc(rq, ttwu_count);
1231  schedstat_inc(p, se.statistics.nr_wakeups);
1232 
1233  if (wake_flags & WF_SYNC)
1234  schedstat_inc(p, se.statistics.nr_wakeups_sync);
1235 
1236 #endif /* CONFIG_SCHEDSTATS */
1237 }
1238 
1239 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1240 {
1241  activate_task(rq, p, en_flags);
1242  p->on_rq = 1;
1243 
1244  /* if a worker is waking up, notify workqueue */
1245  if (p->flags & PF_WQ_WORKER)
1246  wq_worker_waking_up(p, cpu_of(rq));
1247 }
1248 
1249 /*
1250  * Mark the task runnable and perform wakeup-preemption.
1251  */
1252 static void
1253 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1254 {
1255  trace_sched_wakeup(p, true);
1256  check_preempt_curr(rq, p, wake_flags);
1257 
1258  p->state = TASK_RUNNING;
1259 #ifdef CONFIG_SMP
1260  if (p->sched_class->task_woken)
1261  p->sched_class->task_woken(rq, p);
1262 
1263  if (rq->idle_stamp) {
1264  u64 delta = rq->clock - rq->idle_stamp;
1265  u64 max = 2*sysctl_sched_migration_cost;
1266 
1267  if (delta > max)
1268  rq->avg_idle = max;
1269  else
1270  update_avg(&rq->avg_idle, delta);
1271  rq->idle_stamp = 0;
1272  }
1273 #endif
1274 }
1275 
1276 static void
1277 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1278 {
1279 #ifdef CONFIG_SMP
1280  if (p->sched_contributes_to_load)
1281  rq->nr_uninterruptible--;
1282 #endif
1283 
1284  ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1285  ttwu_do_wakeup(rq, p, wake_flags);
1286 }
1287 
1288 /*
1289  * Called in case the task @p isn't fully descheduled from its runqueue,
1290  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1291  * since all we need to do is flip p->state to TASK_RUNNING, since
1292  * the task is still ->on_rq.
1293  */
1294 static int ttwu_remote(struct task_struct *p, int wake_flags)
1295 {
1296  struct rq *rq;
1297  int ret = 0;
1298 
1299  rq = __task_rq_lock(p);
1300  if (p->on_rq) {
1301  ttwu_do_wakeup(rq, p, wake_flags);
1302  ret = 1;
1303  }
1304  __task_rq_unlock(rq);
1305 
1306  return ret;
1307 }
1308 
1309 #ifdef CONFIG_SMP
1310 static void sched_ttwu_pending(void)
1311 {
1312  struct rq *rq = this_rq();
1313  struct llist_node *llist = llist_del_all(&rq->wake_list);
1314  struct task_struct *p;
1315 
1316  raw_spin_lock(&rq->lock);
1317 
1318  while (llist) {
1319  p = llist_entry(llist, struct task_struct, wake_entry);
1320  llist = llist_next(llist);
1321  ttwu_do_activate(rq, p, 0);
1322  }
1323 
1324  raw_spin_unlock(&rq->lock);
1325 }
1326 
1327 void scheduler_ipi(void)
1328 {
1329  if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1330  return;
1331 
1332  /*
1333  * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1334  * traditionally all their work was done from the interrupt return
1335  * path. Now that we actually do some work, we need to make sure
1336  * we do call them.
1337  *
1338  * Some archs already do call them, luckily irq_enter/exit nest
1339  * properly.
1340  *
1341  * Arguably we should visit all archs and update all handlers,
1342  * however a fair share of IPIs are still resched only so this would
1343  * somewhat pessimize the simple resched case.
1344  */
1345  irq_enter();
1346  sched_ttwu_pending();
1347 
1348  /*
1349  * Check if someone kicked us for doing the nohz idle load balance.
1350  */
1351  if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1352  this_rq()->idle_balance = 1;
1353  raise_softirq_irqoff(SCHED_SOFTIRQ);
1354  }
1355  irq_exit();
1356 }
1357 
1358 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1359 {
1360  if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1361  smp_send_reschedule(cpu);
1362 }
1363 
1364 bool cpus_share_cache(int this_cpu, int that_cpu)
1365 {
1366  return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1367 }
1368 #endif /* CONFIG_SMP */
1369 
1370 static void ttwu_queue(struct task_struct *p, int cpu)
1371 {
1372  struct rq *rq = cpu_rq(cpu);
1373 
1374 #if defined(CONFIG_SMP)
1375  if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1376  sched_clock_cpu(cpu); /* sync clocks x-cpu */
1377  ttwu_queue_remote(p, cpu);
1378  return;
1379  }
1380 #endif
1381 
1382  raw_spin_lock(&rq->lock);
1383  ttwu_do_activate(rq, p, 0);
1384  raw_spin_unlock(&rq->lock);
1385 }
1386 
1402 static int
1403 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1404 {
1405  unsigned long flags;
1406  int cpu, success = 0;
1407 
1408  smp_wmb();
1409  raw_spin_lock_irqsave(&p->pi_lock, flags);
1410  if (!(p->state & state))
1411  goto out;
1412 
1413  success = 1; /* we're going to change ->state */
1414  cpu = task_cpu(p);
1415 
1416  if (p->on_rq && ttwu_remote(p, wake_flags))
1417  goto stat;
1418 
1419 #ifdef CONFIG_SMP
1420  /*
1421  * If the owning (remote) cpu is still in the middle of schedule() with
1422  * this task as prev, wait until it's done referencing the task.
1423  */
1424  while (p->on_cpu)
1425  cpu_relax();
1426  /*
1427  * Pairs with the smp_wmb() in finish_lock_switch().
1428  */
1429  smp_rmb();
1430 
1431  p->sched_contributes_to_load = !!task_contributes_to_load(p);
1432  p->state = TASK_WAKING;
1433 
1434  if (p->sched_class->task_waking)
1435  p->sched_class->task_waking(p);
1436 
1437  cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1438  if (task_cpu(p) != cpu) {
1439  wake_flags |= WF_MIGRATED;
1440  set_task_cpu(p, cpu);
1441  }
1442 #endif /* CONFIG_SMP */
1443 
1444  ttwu_queue(p, cpu);
1445 stat:
1446  ttwu_stat(p, cpu, wake_flags);
1447 out:
1448  raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1449 
1450  return success;
1451 }
1452 
1461 static void try_to_wake_up_local(struct task_struct *p)
1462 {
1463  struct rq *rq = task_rq(p);
1464 
1465  BUG_ON(rq != this_rq());
1466  BUG_ON(p == current);
1467  lockdep_assert_held(&rq->lock);
1468 
1469  if (!raw_spin_trylock(&p->pi_lock)) {
1470  raw_spin_unlock(&rq->lock);
1471  raw_spin_lock(&p->pi_lock);
1472  raw_spin_lock(&rq->lock);
1473  }
1474 
1475  if (!(p->state & TASK_NORMAL))
1476  goto out;
1477 
1478  if (!p->on_rq)
1479  ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1480 
1481  ttwu_do_wakeup(rq, p, 0);
1482  ttwu_stat(p, smp_processor_id(), 0);
1483 out:
1484  raw_spin_unlock(&p->pi_lock);
1485 }
1486 
1498 int wake_up_process(struct task_struct *p)
1499 {
1500  return try_to_wake_up(p, TASK_ALL, 0);
1501 }
1502 EXPORT_SYMBOL(wake_up_process);
1503 
1504 int wake_up_state(struct task_struct *p, unsigned int state)
1505 {
1506  return try_to_wake_up(p, state, 0);
1507 }
1508 
1509 /*
1510  * Perform scheduler related setup for a newly forked process p.
1511  * p is forked by current.
1512  *
1513  * __sched_fork() is basic setup used by init_idle() too:
1514  */
1515 static void __sched_fork(struct task_struct *p)
1516 {
1517  p->on_rq = 0;
1518 
1519  p->se.on_rq = 0;
1520  p->se.exec_start = 0;
1521  p->se.sum_exec_runtime = 0;
1522  p->se.prev_sum_exec_runtime = 0;
1523  p->se.nr_migrations = 0;
1524  p->se.vruntime = 0;
1525  INIT_LIST_HEAD(&p->se.group_node);
1526 
1527 #ifdef CONFIG_SCHEDSTATS
1528  memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1529 #endif
1530 
1531  INIT_LIST_HEAD(&p->rt.run_list);
1532 
1533 #ifdef CONFIG_PREEMPT_NOTIFIERS
1534  INIT_HLIST_HEAD(&p->preempt_notifiers);
1535 #endif
1536 }
1537 
1538 /*
1539  * fork()/clone()-time setup:
1540  */
1541 void sched_fork(struct task_struct *p)
1542 {
1543  unsigned long flags;
1544  int cpu = get_cpu();
1545 
1546  __sched_fork(p);
1547  /*
1548  * We mark the process as running here. This guarantees that
1549  * nobody will actually run it, and a signal or other external
1550  * event cannot wake it up and insert it on the runqueue either.
1551  */
1552  p->state = TASK_RUNNING;
1553 
1554  /*
1555  * Make sure we do not leak PI boosting priority to the child.
1556  */
1557  p->prio = current->normal_prio;
1558 
1559  /*
1560  * Revert to default priority/policy on fork if requested.
1561  */
1562  if (unlikely(p->sched_reset_on_fork)) {
1563  if (task_has_rt_policy(p)) {
1564  p->policy = SCHED_NORMAL;
1565  p->static_prio = NICE_TO_PRIO(0);
1566  p->rt_priority = 0;
1567  } else if (PRIO_TO_NICE(p->static_prio) < 0)
1568  p->static_prio = NICE_TO_PRIO(0);
1569 
1570  p->prio = p->normal_prio = __normal_prio(p);
1571  set_load_weight(p);
1572 
1573  /*
1574  * We don't need the reset flag anymore after the fork. It has
1575  * fulfilled its duty:
1576  */
1577  p->sched_reset_on_fork = 0;
1578  }
1579 
1580  if (!rt_prio(p->prio))
1581  p->sched_class = &fair_sched_class;
1582 
1583  if (p->sched_class->task_fork)
1584  p->sched_class->task_fork(p);
1585 
1586  /*
1587  * The child is not yet in the pid-hash so no cgroup attach races,
1588  * and the cgroup is pinned to this child because cgroup_fork()
1589  * is run before sched_fork().
1590  *
1591  * Silence PROVE_RCU.
1592  */
1593  raw_spin_lock_irqsave(&p->pi_lock, flags);
1594  set_task_cpu(p, cpu);
1595  raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1596 
1597 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1598  if (likely(sched_info_on()))
1599  memset(&p->sched_info, 0, sizeof(p->sched_info));
1600 #endif
1601 #if defined(CONFIG_SMP)
1602  p->on_cpu = 0;
1603 #endif
1604 #ifdef CONFIG_PREEMPT_COUNT
1605  /* Want to start with kernel preemption disabled. */
1606  task_thread_info(p)->preempt_count = 1;
1607 #endif
1608 #ifdef CONFIG_SMP
1609  plist_node_init(&p->pushable_tasks, MAX_PRIO);
1610 #endif
1611 
1612  put_cpu();
1613 }
1614 
1615 /*
1616  * wake_up_new_task - wake up a newly created task for the first time.
1617  *
1618  * This function will do some initial scheduler statistics housekeeping
1619  * that must be done for every newly created context, then puts the task
1620  * on the runqueue and wakes it.
1621  */
1622 void wake_up_new_task(struct task_struct *p)
1623 {
1624  unsigned long flags;
1625  struct rq *rq;
1626 
1627  raw_spin_lock_irqsave(&p->pi_lock, flags);
1628 #ifdef CONFIG_SMP
1629  /*
1630  * Fork balancing, do it here and not earlier because:
1631  * - cpus_allowed can change in the fork path
1632  * - any previously selected cpu might disappear through hotplug
1633  */
1634  set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1635 #endif
1636 
1637  rq = __task_rq_lock(p);
1638  activate_task(rq, p, 0);
1639  p->on_rq = 1;
1640  trace_sched_wakeup_new(p, true);
1641  check_preempt_curr(rq, p, WF_FORK);
1642 #ifdef CONFIG_SMP
1643  if (p->sched_class->task_woken)
1644  p->sched_class->task_woken(rq, p);
1645 #endif
1646  task_rq_unlock(rq, p, &flags);
1647 }
1648 
1649 #ifdef CONFIG_PREEMPT_NOTIFIERS
1650 
1655 void preempt_notifier_register(struct preempt_notifier *notifier)
1656 {
1657  hlist_add_head(&notifier->link, &current->preempt_notifiers);
1658 }
1659 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1660 
1667 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1668 {
1669  hlist_del(&notifier->link);
1670 }
1671 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1672 
1673 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1674 {
1675  struct preempt_notifier *notifier;
1676  struct hlist_node *node;
1677 
1678  hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1679  notifier->ops->sched_in(notifier, raw_smp_processor_id());
1680 }
1681 
1682 static void
1683 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1684  struct task_struct *next)
1685 {
1686  struct preempt_notifier *notifier;
1687  struct hlist_node *node;
1688 
1689  hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1690  notifier->ops->sched_out(notifier, next);
1691 }
1692 
1693 #else /* !CONFIG_PREEMPT_NOTIFIERS */
1694 
1695 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1696 {
1697 }
1698 
1699 static void
1700 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1701  struct task_struct *next)
1702 {
1703 }
1704 
1705 #endif /* CONFIG_PREEMPT_NOTIFIERS */
1706 
1720 static inline void
1721 prepare_task_switch(struct rq *rq, struct task_struct *prev,
1722  struct task_struct *next)
1723 {
1724  trace_sched_switch(prev, next);
1725  sched_info_switch(prev, next);
1726  perf_event_task_sched_out(prev, next);
1727  fire_sched_out_preempt_notifiers(prev, next);
1728  prepare_lock_switch(rq, next);
1729  prepare_arch_switch(next);
1730 }
1731 
1747 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1748  __releases(rq->lock)
1749 {
1750  struct mm_struct *mm = rq->prev_mm;
1751  long prev_state;
1752 
1753  rq->prev_mm = NULL;
1754 
1755  /*
1756  * A task struct has one reference for the use as "current".
1757  * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1758  * schedule one last time. The schedule call will never return, and
1759  * the scheduled task must drop that reference.
1760  * The test for TASK_DEAD must occur while the runqueue locks are
1761  * still held, otherwise prev could be scheduled on another cpu, die
1762  * there before we look at prev->state, and then the reference would
1763  * be dropped twice.
1764  * Manfred Spraul <[email protected]>
1765  */
1766  prev_state = prev->state;
1769  perf_event_task_sched_in(prev, current);
1770  finish_lock_switch(rq, prev);
1772 
1773  fire_sched_in_preempt_notifiers(current);
1774  if (mm)
1775  mmdrop(mm);
1776  if (unlikely(prev_state == TASK_DEAD)) {
1777  /*
1778  * Remove function-return probe instances associated with this
1779  * task and put them back on the free list.
1780  */
1781  kprobe_flush_task(prev);
1782  put_task_struct(prev);
1783  }
1784 }
1785 
1786 #ifdef CONFIG_SMP
1787 
1788 /* assumes rq->lock is held */
1789 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1790 {
1791  if (prev->sched_class->pre_schedule)
1792  prev->sched_class->pre_schedule(rq, prev);
1793 }
1794 
1795 /* rq->lock is NOT held, but preemption is disabled */
1796 static inline void post_schedule(struct rq *rq)
1797 {
1798  if (rq->post_schedule) {
1799  unsigned long flags;
1800 
1801  raw_spin_lock_irqsave(&rq->lock, flags);
1802  if (rq->curr->sched_class->post_schedule)
1803  rq->curr->sched_class->post_schedule(rq);
1804  raw_spin_unlock_irqrestore(&rq->lock, flags);
1805 
1806  rq->post_schedule = 0;
1807  }
1808 }
1809 
1810 #else
1811 
1812 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1813 {
1814 }
1815 
1816 static inline void post_schedule(struct rq *rq)
1817 {
1818 }
1819 
1820 #endif
1821 
1826 asmlinkage void schedule_tail(struct task_struct *prev)
1827  __releases(rq->lock)
1828 {
1829  struct rq *rq = this_rq();
1830 
1831  finish_task_switch(rq, prev);
1832 
1833  /*
1834  * FIXME: do we need to worry about rq being invalidated by the
1835  * task_switch?
1836  */
1837  post_schedule(rq);
1838 
1839 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1840  /* In this case, finish_task_switch does not reenable preemption */
1841  preempt_enable();
1842 #endif
1843  if (current->set_child_tid)
1844  put_user(task_pid_vnr(current), current->set_child_tid);
1845 }
1846 
1847 /*
1848  * context_switch - switch to the new MM and the new
1849  * thread's register state.
1850  */
1851 static inline void
1852 context_switch(struct rq *rq, struct task_struct *prev,
1853  struct task_struct *next)
1854 {
1855  struct mm_struct *mm, *oldmm;
1856 
1857  prepare_task_switch(rq, prev, next);
1858 
1859  mm = next->mm;
1860  oldmm = prev->active_mm;
1861  /*
1862  * For paravirt, this is coupled with an exit in switch_to to
1863  * combine the page table reload and the switch backend into
1864  * one hypercall.
1865  */
1866  arch_start_context_switch(prev);
1867 
1868  if (!mm) {
1869  next->active_mm = oldmm;
1870  atomic_inc(&oldmm->mm_count);
1871  enter_lazy_tlb(oldmm, next);
1872  } else
1873  switch_mm(oldmm, mm, next);
1874 
1875  if (!prev->mm) {
1876  prev->active_mm = NULL;
1877  rq->prev_mm = oldmm;
1878  }
1879  /*
1880  * The runqueue lock will be released by the next
1881  * task (which is an invalid locking op but in the case
1882  * of the scheduler it's an obvious special-case), so we
1883  * do an early lockdep release here:
1884  */
1885 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1886  spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887 #endif
1888 
1889  /* Here we just switch the register state and the stack. */
1890  rcu_switch(prev, next);
1891  switch_to(prev, next, prev);
1892 
1893  barrier();
1894  /*
1895  * this_rq must be evaluated again because prev may have moved
1896  * CPUs since it called schedule(), thus the 'rq' on its stack
1897  * frame will be invalid.
1898  */
1899  finish_task_switch(this_rq(), prev);
1900 }
1901 
1902 /*
1903  * nr_running, nr_uninterruptible and nr_context_switches:
1904  *
1905  * externally visible scheduler statistics: current number of runnable
1906  * threads, current number of uninterruptible-sleeping threads, total
1907  * number of context switches performed since bootup.
1908  */
1909 unsigned long nr_running(void)
1910 {
1911  unsigned long i, sum = 0;
1912 
1913  for_each_online_cpu(i)
1914  sum += cpu_rq(i)->nr_running;
1915 
1916  return sum;
1917 }
1918 
1919 unsigned long nr_uninterruptible(void)
1920 {
1921  unsigned long i, sum = 0;
1922 
1923  for_each_possible_cpu(i)
1924  sum += cpu_rq(i)->nr_uninterruptible;
1925 
1926  /*
1927  * Since we read the counters lockless, it might be slightly
1928  * inaccurate. Do not allow it to go below zero though:
1929  */
1930  if (unlikely((long)sum < 0))
1931  sum = 0;
1932 
1933  return sum;
1934 }
1935 
1936 unsigned long long nr_context_switches(void)
1937 {
1938  int i;
1939  unsigned long long sum = 0;
1940 
1941  for_each_possible_cpu(i)
1942  sum += cpu_rq(i)->nr_switches;
1943 
1944  return sum;
1945 }
1946 
1947 unsigned long nr_iowait(void)
1948 {
1949  unsigned long i, sum = 0;
1950 
1951  for_each_possible_cpu(i)
1952  sum += atomic_read(&cpu_rq(i)->nr_iowait);
1953 
1954  return sum;
1955 }
1956 
1957 unsigned long nr_iowait_cpu(int cpu)
1958 {
1959  struct rq *this = cpu_rq(cpu);
1960  return atomic_read(&this->nr_iowait);
1961 }
1962 
1963 unsigned long this_cpu_load(void)
1964 {
1965  struct rq *this = this_rq();
1966  return this->cpu_load[0];
1967 }
1968 
1969 
1970 /*
1971  * Global load-average calculations
1972  *
1973  * We take a distributed and async approach to calculating the global load-avg
1974  * in order to minimize overhead.
1975  *
1976  * The global load average is an exponentially decaying average of nr_running +
1977  * nr_uninterruptible.
1978  *
1979  * Once every LOAD_FREQ:
1980  *
1981  * nr_active = 0;
1982  * for_each_possible_cpu(cpu)
1983  * nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
1984  *
1985  * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
1986  *
1987  * Due to a number of reasons the above turns in the mess below:
1988  *
1989  * - for_each_possible_cpu() is prohibitively expensive on machines with
1990  * serious number of cpus, therefore we need to take a distributed approach
1991  * to calculating nr_active.
1992  *
1993  * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
1994  * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
1995  *
1996  * So assuming nr_active := 0 when we start out -- true per definition, we
1997  * can simply take per-cpu deltas and fold those into a global accumulate
1998  * to obtain the same result. See calc_load_fold_active().
1999  *
2000  * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2001  * across the machine, we assume 10 ticks is sufficient time for every
2002  * cpu to have completed this task.
2003  *
2004  * This places an upper-bound on the IRQ-off latency of the machine. Then
2005  * again, being late doesn't lose the delta, just wrecks the sample.
2006  *
2007  * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2008  * this would add another cross-cpu cacheline miss and atomic operation
2009  * to the wakeup path. Instead we increment on whatever cpu the task ran
2010  * when it went into uninterruptible state and decrement on whatever cpu
2011  * did the wakeup. This means that only the sum of nr_uninterruptible over
2012  * all cpus yields the correct result.
2013  *
2014  * This covers the NO_HZ=n code, for extra headaches, see the comment below.
2015  */
2016 
2017 /* Variables and functions for calc_load */
2018 static atomic_long_t calc_load_tasks;
2019 static unsigned long calc_load_update;
2020 unsigned long avenrun[3];
2021 EXPORT_SYMBOL(avenrun); /* should be removed */
2022 
2031 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2032 {
2033  loads[0] = (avenrun[0] + offset) << shift;
2034  loads[1] = (avenrun[1] + offset) << shift;
2035  loads[2] = (avenrun[2] + offset) << shift;
2036 }
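/*
 * Editor's note: illustrative sketch, not part of core.c. avenrun[] is fixed
 * point with FSHIFT fractional bits (FIXED_1 == 1 << FSHIFT); this is how a
 * /proc/loadavg-style reader turns it into "X.YY". show_load_1min() is a
 * hypothetical helper.
 */
static void show_load_1min(void)
{
	unsigned long loads[3];

	get_avenrun(loads, FIXED_1/200, 0);	/* +1/200 rounds to 2 decimals */
	printk("load: %lu.%02lu\n",
	       loads[0] >> FSHIFT,				/* integer part */
	       ((loads[0] & (FIXED_1 - 1)) * 100) >> FSHIFT);	/* fraction */
}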
2037 
2038 static long calc_load_fold_active(struct rq *this_rq)
2039 {
2040  long nr_active, delta = 0;
2041 
2042  nr_active = this_rq->nr_running;
2043  nr_active += (long) this_rq->nr_uninterruptible;
2044 
2045  if (nr_active != this_rq->calc_load_active) {
2046  delta = nr_active - this_rq->calc_load_active;
2047  this_rq->calc_load_active = nr_active;
2048  }
2049 
2050  return delta;
2051 }
2052 
2053 /*
2054  * a1 = a0 * e + a * (1 - e)
2055  */
2056 static unsigned long
2057 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2058 {
2059  load *= exp;
2060  load += active * (FIXED_1 - exp);
2061  load += 1UL << (FSHIFT - 1);
2062  return load >> FSHIFT;
2063 }
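/*
 * Editor's note, not part of core.c: a worked instance of the update above,
 * assuming the usual constants FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884.
 * One LOAD_FREQ sample moving a 1-minute average of 0.50 toward 2 runnable
 * tasks:
 *
 *	load   = 0.50 * 2048 = 1024
 *	active = 2    * 2048 = 4096
 *	new    = (1024*1884 + 4096*(2048 - 1884) + 1024) >> 11
 *	       = (1929216 + 671744 + 1024) >> 11 = 1270  ->  ~0.62
 *
 * i.e. each 5 s sample covers roughly 8% (164/2048) of the distance to the
 * target, which is what produces the familiar 1-minute decay.
 */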
2064 
2065 #ifdef CONFIG_NO_HZ
2066 /*
2067  * Handle NO_HZ for the global load-average.
2068  *
2069  * Since the above described distributed algorithm to compute the global
2070  * load-average relies on per-cpu sampling from the tick, it is affected by
2071  * NO_HZ.
2072  *
2073  * The basic idea is to fold the nr_active delta into a global idle-delta upon
2074  * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2075  * when we read the global state.
2076  *
2077  * Obviously reality has to ruin such a delightfully simple scheme:
2078  *
2079  * - When we go NO_HZ idle during the window, we can negate our sample
2080  * contribution, causing under-accounting.
2081  *
2082  * We avoid this by keeping two idle-delta counters and flipping them
2083  * when the window starts, thus separating old and new NO_HZ load.
2084  *
2085  * The only trick is the slight shift in index flip for read vs write.
2086  *
2087  *  0s            5s            10s           15s
2088  *    +10           +10           +10           +10
2089  *  |-|-----------|-|-----------|-|-----------|-|
2090  * r:0 0 1           1 0           0 1           1 0
2091  * w:0 1 1           0 0 1           1 0           0
2092  *
2093  * This ensures we'll fold the old idle contribution in this window while
2094  * accumulating the new one.
2095  *
2096  * - When we wake up from NO_HZ idle during the window, we push up our
2097  * contribution, since we effectively move our sample point to a known
2098  * busy state.
2099  *
2100  * This is solved by pushing the window forward, and thus skipping the
2101  * sample, for this cpu (effectively using the idle-delta for this cpu which
2102  * was in effect at the time the window opened). This also solves the issue
2103  * of having to deal with a cpu having been in NOHZ idle for multiple
2104  * LOAD_FREQ intervals.
2105  *
2106  * When making the ILB scale, we should try to pull this in as well.
2107  */
2108 static atomic_long_t calc_load_idle[2];
2109 static int calc_load_idx;
2110 
2111 static inline int calc_load_write_idx(void)
2112 {
2113  int idx = calc_load_idx;
2114 
2115  /*
2116  * See calc_global_nohz(), if we observe the new index, we also
2117  * need to observe the new update time.
2118  */
2119  smp_rmb();
2120 
2121  /*
2122  * If the folding window started, make sure we start writing in the
2123  * next idle-delta.
2124  */
2125  if (!time_before(jiffies, calc_load_update))
2126  idx++;
2127 
2128  return idx & 1;
2129 }
2130 
2131 static inline int calc_load_read_idx(void)
2132 {
2133  return calc_load_idx & 1;
2134 }
2135 
2136 void calc_load_enter_idle(void)
2137 {
2138  struct rq *this_rq = this_rq();
2139  long delta;
2140 
2141  /*
2142  * We're going into NOHZ mode, if there's any pending delta, fold it
2143  * into the pending idle delta.
2144  */
2145  delta = calc_load_fold_active(this_rq);
2146  if (delta) {
2147  int idx = calc_load_write_idx();
2148  atomic_long_add(delta, &calc_load_idle[idx]);
2149  }
2150 }
2151 
2152 void calc_load_exit_idle(void)
2153 {
2154  struct rq *this_rq = this_rq();
2155 
2156  /*
2157  * If we're still before the sample window, we're done.
2158  */
2159  if (time_before(jiffies, this_rq->calc_load_update))
2160  return;
2161 
2162  /*
2163  * We woke inside or after the sample window, this means we're already
2164  * accounted through the nohz accounting, so skip the entire deal and
2165  * sync up for the next window.
2166  */
2167  this_rq->calc_load_update = calc_load_update;
2168  if (time_before(jiffies, this_rq->calc_load_update + 10))
2169  this_rq->calc_load_update += LOAD_FREQ;
2170 }
2171 
2172 static long calc_load_fold_idle(void)
2173 {
2174  int idx = calc_load_read_idx();
2175  long delta = 0;
2176 
2177  if (atomic_long_read(&calc_load_idle[idx]))
2178  delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2179 
2180  return delta;
2181 }
2182 
2198 static unsigned long
2199 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2200 {
2201  unsigned long result = 1UL << frac_bits;
2202 
2203  if (n) for (;;) {
2204  if (n & 1) {
2205  result *= x;
2206  result += 1UL << (frac_bits - 1);
2207  result >>= frac_bits;
2208  }
2209  n >>= 1;
2210  if (!n)
2211  break;
2212  x *= x;
2213  x += 1UL << (frac_bits - 1);
2214  x >>= frac_bits;
2215  }
2216 
2217  return result;
2218 }
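/*
 * Editor's illustration (not part of core.c): exercising the square-and-
 * multiply idea of fixed_power_int() above from userspace.  The routine is
 * copied verbatim; 0.5 in 11-bit fixed point is 1024, and 0.5^8 = 1/256,
 * which is 8 on the FIXED_1 = 2048 scale.
 */
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;   /* 1.0 */

        if (n) for (;;) {
                if (n & 1) {                       /* multiply in odd powers  */
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;                            /* square for the next bit */
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

int main(void)
{
        unsigned long half = FIXED_1 / 2;

        printf("0.5^8 = %lu/2048 (expect 8)\n", fixed_power_int(half, FSHIFT, 8));
        printf("0.5^0 = %lu/2048 (expect 2048)\n", fixed_power_int(half, FSHIFT, 0));
        return 0;
}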
2219 
2220 /*
2221  * a1 = a0 * e + a * (1 - e)
2222  *
2223  * a2 = a1 * e + a * (1 - e)
2224  * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2225  * = a0 * e^2 + a * (1 - e) * (1 + e)
2226  *
2227  * a3 = a2 * e + a * (1 - e)
2228  * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2229  * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2230  *
2231  * ...
2232  *
2233  * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1))   [1]
2234  * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2235  * = a0 * e^n + a * (1 - e^n)
2236  *
2237  * [1] application of the geometric series:
2238  *
2239  *              n          1 - x^(n+1)
2240  *     S_n := \Sum x^i  =  -----------
2241  *             i=0            1 - x
2242  */
2243 static unsigned long
2244 calc_load_n(unsigned long load, unsigned long exp,
2245  unsigned long active, unsigned int n)
2246 {
2247 
2248  return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2249 }
2250 
2251 /*
2252  * NO_HZ can leave us missing all per-cpu ticks calling
2253  * calc_load_account_active(), but since an idle CPU folds its delta into
2254  * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2255  * in the pending idle delta if our idle period crossed a load cycle boundary.
2256  *
2257  * Once we've updated the global active value, we need to apply the exponential
2258  * weights adjusted to the number of cycles missed.
2259  */
2260 static void calc_global_nohz(void)
2261 {
2262  long delta, active, n;
2263 
2264  if (!time_before(jiffies, calc_load_update + 10)) {
2265  /*
2266  * Catch-up, fold however many we are behind still
2267  */
2268  delta = jiffies - calc_load_update - 10;
2269  n = 1 + (delta / LOAD_FREQ);
2270 
2271  active = atomic_long_read(&calc_load_tasks);
2272  active = active > 0 ? active * FIXED_1 : 0;
2273 
2274  avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2275  avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2276  avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2277 
2278  calc_load_update += n * LOAD_FREQ;
2279  }
2280 
2281  /*
2282  * Flip the idle index...
2283  *
2284  * Make sure we first write the new time then flip the index, so that
2285  * calc_load_write_idx() will see the new time when it reads the new
2286  * index; this avoids a double flip messing things up.
2287  */
2288  smp_wmb();
2289  calc_load_idx++;
2290 }
2291 #else /* !CONFIG_NO_HZ */
2292 
2293 static inline long calc_load_fold_idle(void) { return 0; }
2294 static inline void calc_global_nohz(void) { }
2295 
2296 #endif /* CONFIG_NO_HZ */
2297 
2298 /*
2299  * calc_load - update the avenrun load estimates 10 ticks after the
2300  * CPUs have updated calc_load_tasks.
2301  */
2302 void calc_global_load(unsigned long ticks)
2303 {
2304  long active, delta;
2305 
2306  if (time_before(jiffies, calc_load_update + 10))
2307  return;
2308 
2309  /*
2310  * Fold the 'old' idle-delta to include all NO_HZ cpus.
2311  */
2312  delta = calc_load_fold_idle();
2313  if (delta)
2314  atomic_long_add(delta, &calc_load_tasks);
2315 
2316  active = atomic_long_read(&calc_load_tasks);
2317  active = active > 0 ? active * FIXED_1 : 0;
2318 
2319  avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2320  avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2321  avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2322 
2323  calc_load_update += LOAD_FREQ;
2324 
2325  /*
2326  * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2327  */
2328  calc_global_nohz();
2329 }
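/*
 * Editor's illustration (not part of core.c): turning an avenrun[] value
 * maintained by calc_global_load() into the familiar "0.61"-style figure.
 * LOAD_INT/LOAD_FRAC follow the helper macros in include/linux/sched.h that
 * the /proc/loadavg code uses; the sample value 1259 is made up.
 */
#include <stdio.h>

#define FSHIFT       11
#define FIXED_1      (1UL << FSHIFT)
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avenrun0 = 1259;   /* hypothetical 1-minute average */

        /* 1259/2048 ~= 0.61, so this prints "loadavg: 0.61" */
        printf("loadavg: %lu.%02lu\n", LOAD_INT(avenrun0), LOAD_FRAC(avenrun0));
        return 0;
}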
2330 
2331 /*
2332  * Called from update_cpu_load() to periodically update this CPU's
2333  * active count.
2334  */
2335 static void calc_load_account_active(struct rq *this_rq)
2336 {
2337  long delta;
2338 
2339  if (time_before(jiffies, this_rq->calc_load_update))
2340  return;
2341 
2342  delta = calc_load_fold_active(this_rq);
2343  if (delta)
2344  atomic_long_add(delta, &calc_load_tasks);
2345 
2346  this_rq->calc_load_update += LOAD_FREQ;
2347 }
2348 
2349 /*
2350  * End of global load-average stuff
2351  */
2352 
2353 /*
2354  * The exact cpuload at various idx values, calculated at every tick would be
2355  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2356  *
2357  * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2358  * on nth tick when cpu may be busy, then we have:
2359  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2360  * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2361  *
2362  * decay_load_missed() below does efficient calculation of
2363  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2364  * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2365  *
2366  * The calculation is approximated on a 128 point scale.
2367  * degrade_zero_ticks is the number of ticks after which load at any
2368  * particular idx is approximated to be zero.
2369  * degrade_factor is a precomputed table, a row for each load idx.
2370  * Each column corresponds to degradation factor for a power of two ticks,
2371  * based on 128 point scale.
2372  * Example:
2373  * row 2, col 3 (=12) says that the degradation at load idx 2 after
2374  * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2375  *
2376  * With this power of 2 load factors, we can degrade the load n times
2377  * by looking at 1 bits in n and doing as many mult/shift instead of
2378  * n mult/shifts needed by the exact degradation.
2379  */
2380 #define DEGRADE_SHIFT 7
2381 static const unsigned char
2382  degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2383 static const unsigned char
2384  degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2385  {0, 0, 0, 0, 0, 0, 0, 0},
2386  {64, 32, 8, 0, 0, 0, 0, 0},
2387  {96, 72, 40, 12, 1, 0, 0},
2388  {112, 98, 75, 43, 15, 1, 0},
2389  {120, 112, 98, 76, 45, 16, 2} };
2390 
2391 /*
2392  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2393  * would be when CPU is idle and so we just decay the old load without
2394  * adding any new load.
2395  */
2396 static unsigned long
2397 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2398 {
2399  int j = 0;
2400 
2401  if (!missed_updates)
2402  return load;
2403 
2404  if (missed_updates >= degrade_zero_ticks[idx])
2405  return 0;
2406 
2407  if (idx == 1)
2408  return load >> missed_updates;
2409 
2410  while (missed_updates) {
2411  if (missed_updates % 2)
2412  load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2413 
2414  missed_updates >>= 1;
2415  j++;
2416  }
2417  return load;
2418 }
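/*
 * Editor's illustration (not part of core.c): a userspace copy of
 * decay_load_missed() above and its tables, showing that decaying a load of
 * 1000 over 13 missed ticks at idx 2 only multiplies by the precomputed
 * factors for 1, 4 and 8 ticks (the set bits of 13).
 */
#include <stdio.h>

#define DEGRADE_SHIFT    7
#define CPU_LOAD_IDX_MAX 5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
        { 0, 8, 32, 64, 128 };
static const unsigned char
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
        {   0,   0,  0,  0,  0,  0, 0, 0 },
        {  64,  32,  8,  0,  0,  0, 0, 0 },
        {  96,  72, 40, 12,  1,  0, 0, 0 },
        { 112,  98, 75, 43, 15,  1, 0, 0 },
        { 120, 112, 98, 76, 45, 16, 2, 0 } };

static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
        int j = 0;

        if (!missed_updates)
                return load;
        if (missed_updates >= degrade_zero_ticks[idx])
                return 0;                       /* decayed to nothing      */
        if (idx == 1)
                return load >> missed_updates;  /* factor is exactly 1/2   */

        while (missed_updates) {
                if (missed_updates % 2)         /* one mult per set bit    */
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed_updates >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        printf("load 1000, 13 missed ticks, idx 2 -> %lu\n",
               decay_load_missed(1000, 13, 2));
        return 0;
}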
2419 
2420 /*
2421  * Update rq->cpu_load[] statistics. This function is usually called every
2422  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2423  * every tick. We fix it up based on jiffies.
2424  */
2425 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2426  unsigned long pending_updates)
2427 {
2428  int i, scale;
2429 
2430  this_rq->nr_load_updates++;
2431 
2432  /* Update our load: */
2433  this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2434  for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2435  unsigned long old_load, new_load;
2436 
2437  /* scale is effectively 1 << i now, and >> i divides by scale */
2438 
2439  old_load = this_rq->cpu_load[i];
2440  old_load = decay_load_missed(old_load, pending_updates - 1, i);
2441  new_load = this_load;
2442  /*
2443  * Round up the averaging division if load is increasing. This
2444  * prevents us from getting stuck on 9 if the load is 10, for
2445  * example.
2446  */
2447  if (new_load > old_load)
2448  new_load += scale - 1;
2449 
2450  this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2451  }
2452 
2453  sched_avg_update(this_rq);
2454 }
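/*
 * Editor's illustration (not part of core.c): how quickly each cpu_load[]
 * index tracks a step from 0 to 1024 under the update rule above,
 *   cpu_load[i] = (old * (2^i - 1) + new) >> i,
 * with the round-up applied while the load is rising.  No ticks are missed
 * here, so the decay_load_missed() path is not exercised.
 */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

int main(void)
{
        unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
        unsigned long this_load = 1024;          /* new per-tick sample  */
        int tick, i, scale;

        for (tick = 0; tick < 16; tick++) {
                cpu_load[0] = this_load;         /* fast-track for idx 0 */
                for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                        unsigned long old_load = cpu_load[i];
                        unsigned long new_load = this_load;

                        if (new_load > old_load) /* round up while rising */
                                new_load += scale - 1;
                        cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
                }
        }
        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                printf("cpu_load[%d] after 16 busy ticks: %lu\n", i, cpu_load[i]);
        return 0;
}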
2455 
2456 #ifdef CONFIG_NO_HZ
2457 /*
2458  * There is no sane way to deal with nohz on smp when using jiffies because the
2459  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2460  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2461  *
2462  * Therefore we cannot use the delta approach from the regular tick since that
2463  * would seriously skew the load calculation. However we'll make do for those
2464  * updates happening while idle (nohz_idle_balance) or coming out of idle
2465  * (tick_nohz_idle_exit).
2466  *
2467  * This means we might still be one tick off for nohz periods.
2468  */
2469 
2470 /*
2471  * Called from nohz_idle_balance() to update the load ratings before doing the
2472  * idle balance.
2473  */
2474 void update_idle_cpu_load(struct rq *this_rq)
2475 {
2476  unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2477  unsigned long load = this_rq->load.weight;
2478  unsigned long pending_updates;
2479 
2480  /*
2481  * bail if there's load or we're actually up-to-date.
2482  */
2483  if (load || curr_jiffies == this_rq->last_load_update_tick)
2484  return;
2485 
2486  pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2487  this_rq->last_load_update_tick = curr_jiffies;
2488 
2489  __update_cpu_load(this_rq, load, pending_updates);
2490 }
2491 
2492 /*
2493  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2494  */
2495 void update_cpu_load_nohz(void)
2496 {
2497  struct rq *this_rq = this_rq();
2498  unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2499  unsigned long pending_updates;
2500 
2501  if (curr_jiffies == this_rq->last_load_update_tick)
2502  return;
2503 
2504  raw_spin_lock(&this_rq->lock);
2505  pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2506  if (pending_updates) {
2507  this_rq->last_load_update_tick = curr_jiffies;
2508  /*
2509  * We were idle, this means load 0, the current load might be
2510  * !0 due to remote wakeups and the sort.
2511  * !0 due to remote wakeups and the like.
2512  __update_cpu_load(this_rq, 0, pending_updates);
2513  }
2514  raw_spin_unlock(&this_rq->lock);
2515 }
2516 #endif /* CONFIG_NO_HZ */
2517 
2518 /*
2519  * Called from scheduler_tick()
2520  */
2521 static void update_cpu_load_active(struct rq *this_rq)
2522 {
2523  /*
2524  * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2525  */
2526  this_rq->last_load_update_tick = jiffies;
2527  __update_cpu_load(this_rq, this_rq->load.weight, 1);
2528 
2529  calc_load_account_active(this_rq);
2530 }
2531 
2532 #ifdef CONFIG_SMP
2533 
2534 /*
2535  * sched_exec - execve() is a valuable balancing opportunity, because at
2536  * this point the task has the smallest effective memory and cache footprint.
2537  */
2538 void sched_exec(void)
2539 {
2540  struct task_struct *p = current;
2541  unsigned long flags;
2542  int dest_cpu;
2543 
2544  raw_spin_lock_irqsave(&p->pi_lock, flags);
2545  dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2546  if (dest_cpu == smp_processor_id())
2547  goto unlock;
2548 
2549  if (likely(cpu_active(dest_cpu))) {
2550  struct migration_arg arg = { p, dest_cpu };
2551 
2552  raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2553  stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2554  return;
2555  }
2556 unlock:
2557  raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2558 }
2559 
2560 #endif
2561 
2564 
2567 
2568 /*
2569  * Return any ns on the sched_clock that have not yet been accounted in
2570  * @p in case that task is currently running.
2571  *
2572  * Called with task_rq_lock() held on @rq.
2573  */
2574 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2575 {
2576  u64 ns = 0;
2577 
2578  if (task_current(rq, p)) {
2579  update_rq_clock(rq);
2580  ns = rq->clock_task - p->se.exec_start;
2581  if ((s64)ns < 0)
2582  ns = 0;
2583  }
2584 
2585  return ns;
2586 }
2587 
2588 unsigned long long task_delta_exec(struct task_struct *p)
2589 {
2590  unsigned long flags;
2591  struct rq *rq;
2592  u64 ns = 0;
2593 
2594  rq = task_rq_lock(p, &flags);
2595  ns = do_task_delta_exec(p, rq);
2596  task_rq_unlock(rq, p, &flags);
2597 
2598  return ns;
2599 }
2600 
2601 /*
2602  * Return accounted runtime for the task.
2603  * In case the task is currently running, return the runtime plus current's
2604  * pending runtime that has not been accounted yet.
2605  */
2606 unsigned long long task_sched_runtime(struct task_struct *p)
2607 {
2608  unsigned long flags;
2609  struct rq *rq;
2610  u64 ns = 0;
2611 
2612  rq = task_rq_lock(p, &flags);
2613  ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2614  task_rq_unlock(rq, p, &flags);
2615 
2616  return ns;
2617 }
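/*
 * Editor's illustration (not part of core.c): from userspace, the per-task
 * runtime accounted above is, as far as I understand, what ultimately backs
 * the POSIX per-thread CPU clock, so a rough way to observe it is
 * clock_gettime() with CLOCK_THREAD_CPUTIME_ID (link with -lrt on older
 * glibc).
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;
        volatile unsigned long spin;

        for (spin = 0; spin < 100000000UL; spin++)
                ;                                /* burn some CPU time */

        if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
                printf("this thread has consumed %ld.%09ld s of CPU\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}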
2618 
2619 /*
2620  * This function gets called by the timer code, with HZ frequency.
2621  * We call it with interrupts disabled.
2622  */
2623 void scheduler_tick(void)
2624 {
2625  int cpu = smp_processor_id();
2626  struct rq *rq = cpu_rq(cpu);
2627  struct task_struct *curr = rq->curr;
2628 
2629  sched_clock_tick();
2630 
2631  raw_spin_lock(&rq->lock);
2632  update_rq_clock(rq);
2633  update_cpu_load_active(rq);
2634  curr->sched_class->task_tick(rq, curr, 0);
2635  raw_spin_unlock(&rq->lock);
2636 
2638 
2639 #ifdef CONFIG_SMP
2640  rq->idle_balance = idle_cpu(cpu);
2641  trigger_load_balance(rq, cpu);
2642 #endif
2643 }
2644 
2645 notrace unsigned long get_parent_ip(unsigned long addr)
2646 {
2647  if (in_lock_functions(addr)) {
2648  addr = CALLER_ADDR2;
2649  if (in_lock_functions(addr))
2650  addr = CALLER_ADDR3;
2651  }
2652  return addr;
2653 }
2654 
2655 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2656  defined(CONFIG_PREEMPT_TRACER))
2657 
2658 void __kprobes add_preempt_count(int val)
2659 {
2660 #ifdef CONFIG_DEBUG_PREEMPT
2661  /*
2662  * Underflow?
2663  */
2664  if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2665  return;
2666 #endif
2667  preempt_count() += val;
2668 #ifdef CONFIG_DEBUG_PREEMPT
2669  /*
2670  * Spinlock count overflowing soon?
2671  */
2672  DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2673  PREEMPT_MASK - 10);
2674 #endif
2675  if (preempt_count() == val)
2676  trace_preempt_off(CALLER_ADDR0, CALLER_ADDR1);
2677 }
2679 
2680 void __kprobes sub_preempt_count(int val)
2681 {
2682 #ifdef CONFIG_DEBUG_PREEMPT
2683  /*
2684  * Underflow?
2685  */
2686  if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2687  return;
2688  /*
2689  * Is the spinlock portion underflowing?
2690  */
2691  if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2692  !(preempt_count() & PREEMPT_MASK)))
2693  return;
2694 #endif
2695 
2696  if (preempt_count() == val)
2697  trace_preempt_on(CALLER_ADDR0, CALLER_ADDR1);
2698  preempt_count() -= val;
2699 }
2701 
2702 #endif
2703 
2704 /*
2705  * Print scheduling while atomic bug:
2706  */
2707 static noinline void __schedule_bug(struct task_struct *prev)
2708 {
2709  if (oops_in_progress)
2710  return;
2711 
2712  printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2713  prev->comm, prev->pid, preempt_count());
2714 
2715  debug_show_held_locks(prev);
2716  print_modules();
2717  if (irqs_disabled())
2718  print_irqtrace_events(prev);
2719  dump_stack();
2721 }
2722 
2723 /*
2724  * Various schedule()-time debugging checks and statistics:
2725  */
2726 static inline void schedule_debug(struct task_struct *prev)
2727 {
2728  /*
2729  * Test if we are atomic. Since do_exit() needs to call into
2730  * schedule() atomically, we ignore that path for now.
2731  * Otherwise, whine if we are scheduling when we should not be.
2732  */
2733  if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2734  __schedule_bug(prev);
2735  rcu_sleep_check();
2736 
2737  profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2738 
2739  schedstat_inc(this_rq(), sched_count);
2740 }
2741 
2742 static void put_prev_task(struct rq *rq, struct task_struct *prev)
2743 {
2744  if (prev->on_rq || rq->skip_clock_update < 0)
2745  update_rq_clock(rq);
2746  prev->sched_class->put_prev_task(rq, prev);
2747 }
2748 
2749 /*
2750  * Pick up the highest-prio task:
2751  */
2752 static inline struct task_struct *
2753 pick_next_task(struct rq *rq)
2754 {
2755  const struct sched_class *class;
2756  struct task_struct *p;
2757 
2758  /*
2759  * Optimization: we know that if all tasks are in
2760  * the fair class we can call that function directly:
2761  */
2762  if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2763  p = fair_sched_class.pick_next_task(rq);
2764  if (likely(p))
2765  return p;
2766  }
2767 
2768  for_each_class(class) {
2769  p = class->pick_next_task(rq);
2770  if (p)
2771  return p;
2772  }
2773 
2774  BUG(); /* the idle class will always have a runnable task */
2775 }
2776 
2777 /*
2778  * __schedule() is the main scheduler function.
2779  *
2780  * The main means of driving the scheduler and thus entering this function are:
2781  *
2782  * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2783  *
2784  * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2785  * paths. For example, see arch/x86/entry_64.S.
2786  *
2787  * To drive preemption between tasks, the scheduler sets the flag in timer
2788  * interrupt handler scheduler_tick().
2789  *
2790  * 3. Wakeups don't really cause entry into schedule(). They add a
2791  * task to the run-queue and that's it.
2792  *
2793  * Now, if the new task added to the run-queue preempts the current
2794  * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2795  * called on the nearest possible occasion:
2796  *
2797  * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2798  *
2799  * - in syscall or exception context, at the next outermost
2800  * preempt_enable(). (this might be as soon as the wake_up()'s
2801  * spin_unlock()!)
2802  *
2803  * - in IRQ context, return from interrupt-handler to
2804  * preemptible context
2805  *
2806  * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2807  * then at the next:
2808  *
2809  * - cond_resched() call
2810  * - explicit schedule() call
2811  * - return from syscall or exception to user-space
2812  * - return from interrupt-handler to user-space
2813  */
2814 static void __sched __schedule(void)
2815 {
2816  struct task_struct *prev, *next;
2817  unsigned long *switch_count;
2818  struct rq *rq;
2819  int cpu;
2820 
2821 need_resched:
2822  preempt_disable();
2823  cpu = smp_processor_id();
2824  rq = cpu_rq(cpu);
2826  prev = rq->curr;
2827 
2828  schedule_debug(prev);
2829 
2830  if (sched_feat(HRTICK))
2831  hrtick_clear(rq);
2832 
2833  raw_spin_lock_irq(&rq->lock);
2834 
2835  switch_count = &prev->nivcsw;
2836  if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2837  if (unlikely(signal_pending_state(prev->state, prev))) {
2838  prev->state = TASK_RUNNING;
2839  } else {
2840  deactivate_task(rq, prev, DEQUEUE_SLEEP);
2841  prev->on_rq = 0;
2842 
2843  /*
2844  * If a worker went to sleep, notify and ask workqueue
2845  * whether it wants to wake up a task to maintain
2846  * concurrency.
2847  */
2848  if (prev->flags & PF_WQ_WORKER) {
2849  struct task_struct *to_wakeup;
2850 
2851  to_wakeup = wq_worker_sleeping(prev, cpu);
2852  if (to_wakeup)
2853  try_to_wake_up_local(to_wakeup);
2854  }
2855  }
2856  switch_count = &prev->nvcsw;
2857  }
2858 
2859  pre_schedule(rq, prev);
2860 
2861  if (unlikely(!rq->nr_running))
2862  idle_balance(cpu, rq);
2863 
2864  put_prev_task(rq, prev);
2865  next = pick_next_task(rq);
2866  clear_tsk_need_resched(prev);
2867  rq->skip_clock_update = 0;
2868 
2869  if (likely(prev != next)) {
2870  rq->nr_switches++;
2871  rq->curr = next;
2872  ++*switch_count;
2873 
2874  context_switch(rq, prev, next); /* unlocks the rq */
2875  /*
2876  * The context switch has flipped the stack from under us
2877  * and restored the local variables which were saved when
2878  * this task called schedule() in the past. prev == current
2879  * is still correct, but it can be moved to another cpu/rq.
2880  */
2881  cpu = smp_processor_id();
2882  rq = cpu_rq(cpu);
2883  } else
2884  raw_spin_unlock_irq(&rq->lock);
2885 
2886  post_schedule(rq);
2887 
2889  if (need_resched())
2890  goto need_resched;
2891 }
2892 
2893 static inline void sched_submit_work(struct task_struct *tsk)
2894 {
2895  if (!tsk->state || tsk_is_pi_blocked(tsk))
2896  return;
2897  /*
2898  * If we are going to sleep and we have plugged IO queued,
2899  * make sure to submit it to avoid deadlocks.
2900  */
2901  if (blk_needs_flush_plug(tsk))
2902  blk_schedule_flush_plug(tsk);
2903 }
2904 
2905 asmlinkage void __sched schedule(void)
2906 {
2907  struct task_struct *tsk = current;
2908 
2909  sched_submit_work(tsk);
2910  __schedule();
2911 }
2913 
2914 #ifdef CONFIG_RCU_USER_QS
2915 asmlinkage void __sched schedule_user(void)
2916 {
2917  /*
2918  * If we come here after a random call to set_need_resched(),
2919  * or we have been woken up remotely but the IPI has not yet arrived,
2920  * we haven't yet exited the RCU idle mode. Do it here manually until
2921  * we find a better solution.
2922  */
2923  rcu_user_exit();
2924  schedule();
2925  rcu_user_enter();
2926 }
2927 #endif
2928 
2934 void __sched schedule_preempt_disabled(void)
2935 {
2936  sched_preempt_enable_no_resched();
2937  schedule();
2938  preempt_disable();
2939 }
2940 
2941 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
2942 
2943 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
2944 {
2945  if (lock->owner != owner)
2946  return false;
2947 
2948  /*
2949  * Ensure we emit the owner->on_cpu dereference _after_ checking
2950  * lock->owner still matches owner, if that fails, owner might
2951  * point to free()d memory, if it still matches, the rcu_read_lock()
2952  * ensures the memory stays valid.
2953  */
2954  barrier();
2955 
2956  return owner->on_cpu;
2957 }
2958 
2959 /*
2960  * Look out! "owner" is an entirely speculative pointer
2961  * access and not reliable.
2962  */
2963 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
2964 {
2965  if (!sched_feat(OWNER_SPIN))
2966  return 0;
2967 
2968  rcu_read_lock();
2969  while (owner_running(lock, owner)) {
2970  if (need_resched())
2971  break;
2972 
2974  }
2975  rcu_read_unlock();
2976 
2977  /*
2978  * We break out the loop above on need_resched() and when the
2979  * owner changed, which is a sign for heavy contention. Return
2980  * success only when lock->owner is NULL.
2981  */
2982  return lock->owner == NULL;
2983 }
2984 #endif
2985 
2986 #ifdef CONFIG_PREEMPT
2987 /*
2988  * This is the entry point to schedule() from in-kernel preemption
2989  * off of preempt_enable(). Kernel preemption from the return-from-interrupt
2990  * path occurs in preempt_schedule_irq() below, which calls __schedule() directly.
2991  */
2992 asmlinkage void __sched notrace preempt_schedule(void)
2993 {
2994  struct thread_info *ti = current_thread_info();
2995 
2996  /*
2997  * If there is a non-zero preempt_count or interrupts are disabled,
2998  * we do not want to preempt the current task. Just return..
2999  */
3000  if (likely(ti->preempt_count || irqs_disabled()))
3001  return;
3002 
3003  do {
3004  add_preempt_count_notrace(PREEMPT_ACTIVE);
3005  __schedule();
3006  sub_preempt_count_notrace(PREEMPT_ACTIVE);
3007 
3008  /*
3009  * Check again in case we missed a preemption opportunity
3010  * between schedule and now.
3011  */
3012  barrier();
3013  } while (need_resched());
3014 }
3015 EXPORT_SYMBOL(preempt_schedule);
3016 
3017 /*
3018  * this is the entry point to schedule() from kernel preemption
3019  * off of irq context.
3020  * Note, that this is called and return with irqs disabled. This will
3021  * protect us against recursive calling from irq.
3022  */
3023 asmlinkage void __sched preempt_schedule_irq(void)
3024 {
3025  struct thread_info *ti = current_thread_info();
3026 
3027  /* Catch callers which need to be fixed */
3028  BUG_ON(ti->preempt_count || !irqs_disabled());
3029 
3030  rcu_user_exit();
3031  do {
3032  add_preempt_count(PREEMPT_ACTIVE);
3033  local_irq_enable();
3034  __schedule();
3035  local_irq_disable();
3036  sub_preempt_count(PREEMPT_ACTIVE);
3037 
3038  /*
3039  * Check again in case we missed a preemption opportunity
3040  * between schedule and now.
3041  */
3042  barrier();
3043  } while (need_resched());
3044 }
3045 
3046 #endif /* CONFIG_PREEMPT */
3047 
3048 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3049  void *key)
3050 {
3051  return try_to_wake_up(curr->private, mode, wake_flags);
3052 }
3054 
3055 /*
3056  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3057  * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3058  * number) then we wake all the non-exclusive tasks and one exclusive task.
3059  *
3060  * There are circumstances in which we can try to wake a task which has already
3061  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3062  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3063  */
3064 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3065  int nr_exclusive, int wake_flags, void *key)
3066 {
3067  wait_queue_t *curr, *next;
3068 
3069  list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3070  unsigned flags = curr->flags;
3071 
3072  if (curr->func(curr, mode, wake_flags, key) &&
3073  (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3074  break;
3075  }
3076 }
3077 
3088 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3089  int nr_exclusive, void *key)
3090 {
3091  unsigned long flags;
3092 
3093  spin_lock_irqsave(&q->lock, flags);
3094  __wake_up_common(q, mode, nr_exclusive, 0, key);
3095  spin_unlock_irqrestore(&q->lock, flags);
3096 }
3098 
3099 /*
3100  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3101  */
3102 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3103 {
3104  __wake_up_common(q, mode, nr, 0, NULL);
3105 }
3107 
3108 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3109 {
3110  __wake_up_common(q, mode, 1, 0, key);
3111 }
3113 
3131 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3132  int nr_exclusive, void *key)
3133 {
3134  unsigned long flags;
3135  int wake_flags = WF_SYNC;
3136 
3137  if (unlikely(!q))
3138  return;
3139 
3140  if (unlikely(!nr_exclusive))
3141  wake_flags = 0;
3142 
3143  spin_lock_irqsave(&q->lock, flags);
3144  __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3145  spin_unlock_irqrestore(&q->lock, flags);
3146 }
3148 
3149 /*
3150  * __wake_up_sync - see __wake_up_sync_key()
3151  */
3152 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3153 {
3154  __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3155 }
3156 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3157 
3170 void complete(struct completion *x)
3171 {
3172  unsigned long flags;
3173 
3174  spin_lock_irqsave(&x->wait.lock, flags);
3175  x->done++;
3176  __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3177  spin_unlock_irqrestore(&x->wait.lock, flags);
3178 }
3180 
3190 void complete_all(struct completion *x)
3191 {
3192  unsigned long flags;
3193 
3194  spin_lock_irqsave(&x->wait.lock, flags);
3195  x->done += UINT_MAX/2;
3196  __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3197  spin_unlock_irqrestore(&x->wait.lock, flags);
3198 }
3200 
3201 static inline long __sched
3202 do_wait_for_common(struct completion *x, long timeout, int state)
3203 {
3204  if (!x->done) {
3205  DECLARE_WAITQUEUE(wait, current);
3206 
3207  __add_wait_queue_tail_exclusive(&x->wait, &wait);
3208  do {
3209  if (signal_pending_state(state, current)) {
3210  timeout = -ERESTARTSYS;
3211  break;
3212  }
3213  __set_current_state(state);
3214  spin_unlock_irq(&x->wait.lock);
3215  timeout = schedule_timeout(timeout);
3216  spin_lock_irq(&x->wait.lock);
3217  } while (!x->done && timeout);
3218  __remove_wait_queue(&x->wait, &wait);
3219  if (!x->done)
3220  return timeout;
3221  }
3222  x->done--;
3223  return timeout ?: 1;
3224 }
3225 
3226 static long __sched
3227 wait_for_common(struct completion *x, long timeout, int state)
3228 {
3229  might_sleep();
3230 
3231  spin_lock_irq(&x->wait.lock);
3232  timeout = do_wait_for_common(x, timeout, state);
3233  spin_unlock_irq(&x->wait.lock);
3234  return timeout;
3235 }
3236 
3247 void __sched wait_for_completion(struct completion *x)
3248 {
3249  wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3250 }
3252 
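/*
 * Editor's illustration (not part of core.c): a hypothetical minimal module
 * showing the usual pairing of the completion primitives defined above --
 * one context sleeps in wait_for_completion() while a worker thread signals
 * complete() when its job is done.
 */
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/delay.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
        msleep(100);                   /* pretend to do some setup work */
        complete(&setup_done);         /* wake exactly one waiter       */
        return 0;
}

static int __init completion_demo_init(void)
{
        struct task_struct *t;

        t = kthread_run(worker_fn, NULL, "completion-demo");
        if (IS_ERR(t))
                return PTR_ERR(t);

        /* sleeps TASK_UNINTERRUPTIBLE until worker_fn() calls complete() */
        wait_for_completion(&setup_done);
        pr_info("completion-demo: setup finished\n");
        return 0;
}

static void __exit completion_demo_exit(void)
{
}

module_init(completion_demo_init);
module_exit(completion_demo_exit);
MODULE_LICENSE("GPL");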
3265 unsigned long __sched
3266 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3267 {
3268  return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3269 }
3271 
3281 int __sched wait_for_completion_interruptible(struct completion *x)
3282 {
3283  long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3284  if (t == -ERESTARTSYS)
3285  return t;
3286  return 0;
3287 }
3289 
3301 long __sched
3302 wait_for_completion_interruptible_timeout(struct completion *x,
3303  unsigned long timeout)
3304 {
3305  return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3306 }
3308 
3318 int __sched wait_for_completion_killable(struct completion *x)
3319 {
3320  long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3321  if (t == -ERESTARTSYS)
3322  return t;
3323  return 0;
3324 }
3326 
3339 long __sched
3340 wait_for_completion_killable_timeout(struct completion *x,
3341  unsigned long timeout)
3342 {
3343  return wait_for_common(x, timeout, TASK_KILLABLE);
3344 }
3346 
3359 bool try_wait_for_completion(struct completion *x)
3360 {
3361  unsigned long flags;
3362  int ret = 1;
3363 
3364  spin_lock_irqsave(&x->wait.lock, flags);
3365  if (!x->done)
3366  ret = 0;
3367  else
3368  x->done--;
3369  spin_unlock_irqrestore(&x->wait.lock, flags);
3370  return ret;
3371 }
3373 
3382 bool completion_done(struct completion *x)
3383 {
3384  unsigned long flags;
3385  int ret = 1;
3386 
3387  spin_lock_irqsave(&x->wait.lock, flags);
3388  if (!x->done)
3389  ret = 0;
3390  spin_unlock_irqrestore(&x->wait.lock, flags);
3391  return ret;
3392 }
3394 
3395 static long __sched
3396 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3397 {
3398  unsigned long flags;
3399  wait_queue_t wait;
3400 
3401  init_waitqueue_entry(&wait, current);
3402 
3403  __set_current_state(state);
3404 
3405  spin_lock_irqsave(&q->lock, flags);
3406  __add_wait_queue(q, &wait);
3407  spin_unlock(&q->lock);
3408  timeout = schedule_timeout(timeout);
3409  spin_lock_irq(&q->lock);
3410  __remove_wait_queue(q, &wait);
3411  spin_unlock_irqrestore(&q->lock, flags);
3412 
3413  return timeout;
3414 }
3415 
3416 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3417 {
3418  sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3419 }
3421 
3422 long __sched
3423 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3424 {
3425  return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3426 }
3428 
3429 void __sched sleep_on(wait_queue_head_t *q)
3430 {
3431  sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3432 }
3434 
3435 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3436 {
3437  return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3438 }
3440 
3441 #ifdef CONFIG_RT_MUTEXES
3442 
3443 /*
3444  * rt_mutex_setprio - set the current priority of a task
3445  * @p: task
3446  * @prio: prio value (kernel-internal form)
3447  *
3448  * This function changes the 'effective' priority of a task. It does
3449  * not touch ->normal_prio like __setscheduler().
3450  *
3451  * Used by the rt_mutex code to implement priority inheritance logic.
3452  */
3453 void rt_mutex_setprio(struct task_struct *p, int prio)
3454 {
3455  int oldprio, on_rq, running;
3456  struct rq *rq;
3457  const struct sched_class *prev_class;
3458 
3459  BUG_ON(prio < 0 || prio > MAX_PRIO);
3460 
3461  rq = __task_rq_lock(p);
3462 
3463  /*
3464  * Idle task boosting is a no-no in general. There is one
3465  * exception, when PREEMPT_RT and NOHZ are active:
3466  *
3467  * The idle task calls get_next_timer_interrupt() and holds
3468  * the timer wheel base->lock on the CPU and another CPU wants
3469  * to access the timer (probably to cancel it). We can safely
3470  * ignore the boosting request, as the idle CPU runs this code
3471  * with interrupts disabled and will complete the lock
3472  * protected section without being interrupted. So there is no
3473  * real need to boost.
3474  */
3475  if (unlikely(p == rq->idle)) {
3476  WARN_ON(p != rq->curr);
3477  WARN_ON(p->pi_blocked_on);
3478  goto out_unlock;
3479  }
3480 
3481  trace_sched_pi_setprio(p, prio);
3482  oldprio = p->prio;
3483  prev_class = p->sched_class;
3484  on_rq = p->on_rq;
3485  running = task_current(rq, p);
3486  if (on_rq)
3487  dequeue_task(rq, p, 0);
3488  if (running)
3489  p->sched_class->put_prev_task(rq, p);
3490 
3491  if (rt_prio(prio))
3492  p->sched_class = &rt_sched_class;
3493  else
3494  p->sched_class = &fair_sched_class;
3495 
3496  p->prio = prio;
3497 
3498  if (running)
3499  p->sched_class->set_curr_task(rq);
3500  if (on_rq)
3501  enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3502 
3503  check_class_changed(rq, p, prev_class, oldprio);
3504 out_unlock:
3505  __task_rq_unlock(rq);
3506 }
3507 #endif
3508 void set_user_nice(struct task_struct *p, long nice)
3509 {
3510  int old_prio, delta, on_rq;
3511  unsigned long flags;
3512  struct rq *rq;
3513 
3514  if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3515  return;
3516  /*
3517  * We have to be careful, if called from sys_setpriority(),
3518  * the task might be in the middle of scheduling on another CPU.
3519  */
3520  rq = task_rq_lock(p, &flags);
3521  /*
3522  * The RT priorities are set via sched_setscheduler(), but we still
3523  * allow the 'normal' nice value to be set - but as expected
3524  * it won't have any effect on scheduling as long as the task is
3525  * SCHED_FIFO/SCHED_RR:
3526  */
3527  if (task_has_rt_policy(p)) {
3528  p->static_prio = NICE_TO_PRIO(nice);
3529  goto out_unlock;
3530  }
3531  on_rq = p->on_rq;
3532  if (on_rq)
3533  dequeue_task(rq, p, 0);
3534 
3535  p->static_prio = NICE_TO_PRIO(nice);
3536  set_load_weight(p);
3537  old_prio = p->prio;
3538  p->prio = effective_prio(p);
3539  delta = p->prio - old_prio;
3540 
3541  if (on_rq) {
3542  enqueue_task(rq, p, 0);
3543  /*
3544  * If the task increased its priority or is running and
3545  * lowered its priority, then reschedule its CPU:
3546  */
3547  if (delta < 0 || (delta > 0 && task_running(rq, p)))
3548  resched_task(rq->curr);
3549  }
3550 out_unlock:
3551  task_rq_unlock(rq, p, &flags);
3552 }
3554 
3555 /*
3556  * can_nice - check if a task can reduce its nice value
3557  * @p: task
3558  * @nice: nice value
3559  */
3560 int can_nice(const struct task_struct *p, const int nice)
3561 {
3562  /* convert nice value [19,-20] to rlimit style value [1,40] */
3563  int nice_rlim = 20 - nice;
3564 
3565  return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3566  capable(CAP_SYS_NICE));
3567 }
3568 
3569 #ifdef __ARCH_WANT_SYS_NICE
3570 
3571 /*
3572  * sys_nice - change the priority of the current process.
3573  * @increment: priority increment
3574  *
3575  * sys_setpriority is a more generic, but much slower function that
3576  * does similar things.
3577  */
3578 SYSCALL_DEFINE1(nice, int, increment)
3579 {
3580  long nice, retval;
3581 
3582  /*
3583  * Setpriority might change our priority at the same moment.
3584  * We don't have to worry. Conceptually one call occurs first
3585  * and we have a single winner.
3586  */
3587  if (increment < -40)
3588  increment = -40;
3589  if (increment > 40)
3590  increment = 40;
3591 
3592  nice = TASK_NICE(current) + increment;
3593  if (nice < -20)
3594  nice = -20;
3595  if (nice > 19)
3596  nice = 19;
3597 
3598  if (increment < 0 && !can_nice(current, nice))
3599  return -EPERM;
3600 
3601  retval = security_task_setnice(current, nice);
3602  if (retval)
3603  return retval;
3604 
3605  set_user_nice(current, nice);
3606  return 0;
3607 }
3608 
3609 #endif
3610 
3619 int task_prio(const struct task_struct *p)
3620 {
3621  return p->prio - MAX_RT_PRIO;
3622 }
3623 
3628 int task_nice(const struct task_struct *p)
3629 {
3630  return TASK_NICE(p);
3631 }
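/*
 * Editor's illustration (not part of core.c): the arithmetic behind
 * task_prio() and task_nice() above.  MAX_RT_PRIO and the NICE_TO_PRIO /
 * PRIO_TO_NICE macros are copied to match the definitions this kernel uses
 * in include/linux/sched.h.
 */
#include <stdio.h>

#define MAX_RT_PRIO        100
#define MAX_PRIO           (MAX_RT_PRIO + 40)            /* 140 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice += 13) {
                /* static_prio, which equals p->prio for SCHED_NORMAL tasks */
                int prio = NICE_TO_PRIO(nice);

                printf("nice %3d -> prio %3d -> task_prio() %3d -> task_nice() %3d\n",
                       nice, prio, prio - MAX_RT_PRIO, PRIO_TO_NICE(prio));
        }
        return 0;
}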
3633 
3638 int idle_cpu(int cpu)
3639 {
3640  struct rq *rq = cpu_rq(cpu);
3641 
3642  if (rq->curr != rq->idle)
3643  return 0;
3644 
3645  if (rq->nr_running)
3646  return 0;
3647 
3648 #ifdef CONFIG_SMP
3649  if (!llist_empty(&rq->wake_list))
3650  return 0;
3651 #endif
3652 
3653  return 1;
3654 }
3655 
3660 struct task_struct *idle_task(int cpu)
3661 {
3662  return cpu_rq(cpu)->idle;
3663 }
3664 
3669 static struct task_struct *find_process_by_pid(pid_t pid)
3670 {
3671  return pid ? find_task_by_vpid(pid) : current;
3672 }
3673 
3674 /* Actually do priority change: must hold rq lock. */
3675 static void
3676 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3677 {
3678  p->policy = policy;
3679  p->rt_priority = prio;
3680  p->normal_prio = normal_prio(p);
3681  /* we are holding p->pi_lock already */
3682  p->prio = rt_mutex_getprio(p);
3683  if (rt_prio(p->prio))
3684  p->sched_class = &rt_sched_class;
3685  else
3686  p->sched_class = &fair_sched_class;
3687  set_load_weight(p);
3688 }
3689 
3690 /*
3691  * check the target process has a UID that matches the current process's
3692  */
3693 static bool check_same_owner(struct task_struct *p)
3694 {
3695  const struct cred *cred = current_cred(), *pcred;
3696  bool match;
3697 
3698  rcu_read_lock();
3699  pcred = __task_cred(p);
3700  match = (uid_eq(cred->euid, pcred->euid) ||
3701  uid_eq(cred->euid, pcred->uid));
3702  rcu_read_unlock();
3703  return match;
3704 }
3705 
3706 static int __sched_setscheduler(struct task_struct *p, int policy,
3707  const struct sched_param *param, bool user)
3708 {
3709  int retval, oldprio, oldpolicy = -1, on_rq, running;
3710  unsigned long flags;
3711  const struct sched_class *prev_class;
3712  struct rq *rq;
3713  int reset_on_fork;
3714 
3715  /* may grab non-irq protected spin_locks */
3716  BUG_ON(in_interrupt());
3717 recheck:
3718  /* double check policy once rq lock held */
3719  if (policy < 0) {
3720  reset_on_fork = p->sched_reset_on_fork;
3721  policy = oldpolicy = p->policy;
3722  } else {
3723  reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3724  policy &= ~SCHED_RESET_ON_FORK;
3725 
3726  if (policy != SCHED_FIFO && policy != SCHED_RR &&
3727  policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3728  policy != SCHED_IDLE)
3729  return -EINVAL;
3730  }
3731 
3732  /*
3733  * Valid priorities for SCHED_FIFO and SCHED_RR are
3734  * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3735  * SCHED_BATCH and SCHED_IDLE is 0.
3736  */
3737  if (param->sched_priority < 0 ||
3738  (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3739  (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3740  return -EINVAL;
3741  if (rt_policy(policy) != (param->sched_priority != 0))
3742  return -EINVAL;
3743 
3744  /*
3745  * Allow unprivileged RT tasks to decrease priority:
3746  */
3747  if (user && !capable(CAP_SYS_NICE)) {
3748  if (rt_policy(policy)) {
3749  unsigned long rlim_rtprio =
3750  task_rlimit(p, RLIMIT_RTPRIO);
3751 
3752  /* can't set/change the rt policy */
3753  if (policy != p->policy && !rlim_rtprio)
3754  return -EPERM;
3755 
3756  /* can't increase priority */
3757  if (param->sched_priority > p->rt_priority &&
3758  param->sched_priority > rlim_rtprio)
3759  return -EPERM;
3760  }
3761 
3762  /*
3763  * Treat SCHED_IDLE as nice 20. Only allow a switch to
3764  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3765  */
3766  if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3767  if (!can_nice(p, TASK_NICE(p)))
3768  return -EPERM;
3769  }
3770 
3771  /* can't change other user's priorities */
3772  if (!check_same_owner(p))
3773  return -EPERM;
3774 
3775  /* Normal users shall not reset the sched_reset_on_fork flag */
3776  if (p->sched_reset_on_fork && !reset_on_fork)
3777  return -EPERM;
3778  }
3779 
3780  if (user) {
3781  retval = security_task_setscheduler(p);
3782  if (retval)
3783  return retval;
3784  }
3785 
3786  /*
3787  * make sure no PI-waiters arrive (or leave) while we are
3788  * changing the priority of the task:
3789  *
3790  * To be able to change p->policy safely, the appropriate
3791  * runqueue lock must be held.
3792  */
3793  rq = task_rq_lock(p, &flags);
3794 
3795  /*
3796  * Changing the policy of the stop threads is a very bad idea
3797  */
3798  if (p == rq->stop) {
3799  task_rq_unlock(rq, p, &flags);
3800  return -EINVAL;
3801  }
3802 
3803  /*
3804  * If not changing anything there's no need to proceed further:
3805  */
3806  if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3807  param->sched_priority == p->rt_priority))) {
3808  task_rq_unlock(rq, p, &flags);
3809  return 0;
3810  }
3811 
3812 #ifdef CONFIG_RT_GROUP_SCHED
3813  if (user) {
3814  /*
3815  * Do not allow realtime tasks into groups that have no runtime
3816  * assigned.
3817  */
3818  if (rt_bandwidth_enabled() && rt_policy(policy) &&
3819  task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3820  !task_group_is_autogroup(task_group(p))) {
3821  task_rq_unlock(rq, p, &flags);
3822  return -EPERM;
3823  }
3824  }
3825 #endif
3826 
3827  /* recheck policy now with rq lock held */
3828  if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3829  policy = oldpolicy = -1;
3830  task_rq_unlock(rq, p, &flags);
3831  goto recheck;
3832  }
3833  on_rq = p->on_rq;
3834  running = task_current(rq, p);
3835  if (on_rq)
3836  dequeue_task(rq, p, 0);
3837  if (running)
3838  p->sched_class->put_prev_task(rq, p);
3839 
3840  p->sched_reset_on_fork = reset_on_fork;
3841 
3842  oldprio = p->prio;
3843  prev_class = p->sched_class;
3844  __setscheduler(rq, p, policy, param->sched_priority);
3845 
3846  if (running)
3847  p->sched_class->set_curr_task(rq);
3848  if (on_rq)
3849  enqueue_task(rq, p, 0);
3850 
3851  check_class_changed(rq, p, prev_class, oldprio);
3852  task_rq_unlock(rq, p, &flags);
3853 
3854  rt_mutex_adjust_pi(p);
3855 
3856  return 0;
3857 }
3858 
3867 int sched_setscheduler(struct task_struct *p, int policy,
3868  const struct sched_param *param)
3869 {
3870  return __sched_setscheduler(p, policy, param, true);
3871 }
3873 
3885 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3886  const struct sched_param *param)
3887 {
3888  return __sched_setscheduler(p, policy, param, false);
3889 }
3890 
3891 static int
3892 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3893 {
3894  struct sched_param lparam;
3895  struct task_struct *p;
3896  int retval;
3897 
3898  if (!param || pid < 0)
3899  return -EINVAL;
3900  if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3901  return -EFAULT;
3902 
3903  rcu_read_lock();
3904  retval = -ESRCH;
3905  p = find_process_by_pid(pid);
3906  if (p != NULL)
3907  retval = sched_setscheduler(p, policy, &lparam);
3908  rcu_read_unlock();
3909 
3910  return retval;
3911 }
3912 
3919 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3920  struct sched_param __user *, param)
3921 {
3922  /* negative values for policy are not valid */
3923  if (policy < 0)
3924  return -EINVAL;
3925 
3926  return do_sched_setscheduler(pid, policy, param);
3927 }
3928 
3934 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3935 {
3936  return do_sched_setscheduler(pid, -1, param);
3937 }
3938 
3943 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3944 {
3945  struct task_struct *p;
3946  int retval;
3947 
3948  if (pid < 0)
3949  return -EINVAL;
3950 
3951  retval = -ESRCH;
3952  rcu_read_lock();
3953  p = find_process_by_pid(pid);
3954  if (p) {
3955  retval = security_task_getscheduler(p);
3956  if (!retval)
3957  retval = p->policy
3958  | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3959  }
3960  rcu_read_unlock();
3961  return retval;
3962 }
3963 
3969 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3970 {
3971  struct sched_param lp;
3972  struct task_struct *p;
3973  int retval;
3974 
3975  if (!param || pid < 0)
3976  return -EINVAL;
3977 
3978  rcu_read_lock();
3979  p = find_process_by_pid(pid);
3980  retval = -ESRCH;
3981  if (!p)
3982  goto out_unlock;
3983 
3984  retval = security_task_getscheduler(p);
3985  if (retval)
3986  goto out_unlock;
3987 
3988  lp.sched_priority = p->rt_priority;
3989  rcu_read_unlock();
3990 
3991  /*
3992  * This one might sleep, we cannot do it with a spinlock held ...
3993  */
3994  retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3995 
3996  return retval;
3997 
3998 out_unlock:
3999  rcu_read_unlock();
4000  return retval;
4001 }
4002 
4003 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4004 {
4005  cpumask_var_t cpus_allowed, new_mask;
4006  struct task_struct *p;
4007  int retval;
4008 
4009  get_online_cpus();
4010  rcu_read_lock();
4011 
4012  p = find_process_by_pid(pid);
4013  if (!p) {
4014  rcu_read_unlock();
4015  put_online_cpus();
4016  return -ESRCH;
4017  }
4018 
4019  /* Prevent p going away */
4020  get_task_struct(p);
4021  rcu_read_unlock();
4022 
4023  if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4024  retval = -ENOMEM;
4025  goto out_put_task;
4026  }
4027  if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4028  retval = -ENOMEM;
4029  goto out_free_cpus_allowed;
4030  }
4031  retval = -EPERM;
4032  if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4033  goto out_unlock;
4034 
4035  retval = security_task_setscheduler(p);
4036  if (retval)
4037  goto out_unlock;
4038 
4039  cpuset_cpus_allowed(p, cpus_allowed);
4040  cpumask_and(new_mask, in_mask, cpus_allowed);
4041 again:
4042  retval = set_cpus_allowed_ptr(p, new_mask);
4043 
4044  if (!retval) {
4045  cpuset_cpus_allowed(p, cpus_allowed);
4046  if (!cpumask_subset(new_mask, cpus_allowed)) {
4047  /*
4048  * We must have raced with a concurrent cpuset
4049  * update. Just reset the cpus_allowed to the
4050  * cpuset's cpus_allowed
4051  */
4052  cpumask_copy(new_mask, cpus_allowed);
4053  goto again;
4054  }
4055  }
4056 out_unlock:
4057  free_cpumask_var(new_mask);
4058 out_free_cpus_allowed:
4059  free_cpumask_var(cpus_allowed);
4060 out_put_task:
4061  put_task_struct(p);
4062  put_online_cpus();
4063  return retval;
4064 }
4065 
4066 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4067  struct cpumask *new_mask)
4068 {
4069  if (len < cpumask_size())
4070  cpumask_clear(new_mask);
4071  else if (len > cpumask_size())
4072  len = cpumask_size();
4073 
4074  return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4075 }
4076 
4083 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4084  unsigned long __user *, user_mask_ptr)
4085 {
4086  cpumask_var_t new_mask;
4087  int retval;
4088 
4089  if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4090  return -ENOMEM;
4091 
4092  retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4093  if (retval == 0)
4094  retval = sched_setaffinity(pid, new_mask);
4095  free_cpumask_var(new_mask);
4096  return retval;
4097 }
4098 
4099 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4100 {
4101  struct task_struct *p;
4102  unsigned long flags;
4103  int retval;
4104 
4105  get_online_cpus();
4106  rcu_read_lock();
4107 
4108  retval = -ESRCH;
4109  p = find_process_by_pid(pid);
4110  if (!p)
4111  goto out_unlock;
4112 
4113  retval = security_task_getscheduler(p);
4114  if (retval)
4115  goto out_unlock;
4116 
4117  raw_spin_lock_irqsave(&p->pi_lock, flags);
4118  cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4119  raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4120 
4121 out_unlock:
4122  rcu_read_unlock();
4123  put_online_cpus();
4124 
4125  return retval;
4126 }
4127 
4134 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4135  unsigned long __user *, user_mask_ptr)
4136 {
4137  int ret;
4138  cpumask_var_t mask;
4139 
4140  if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4141  return -EINVAL;
4142  if (len & (sizeof(unsigned long)-1))
4143  return -EINVAL;
4144 
4145  if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4146  return -ENOMEM;
4147 
4148  ret = sched_getaffinity(pid, mask);
4149  if (ret == 0) {
4150  size_t retlen = min_t(size_t, len, cpumask_size());
4151 
4152  if (copy_to_user(user_mask_ptr, mask, retlen))
4153  ret = -EFAULT;
4154  else
4155  ret = retlen;
4156  }
4157  free_cpumask_var(mask);
4158 
4159  return ret;
4160 }
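/*
 * Editor's illustration (not part of core.c): exercising the two affinity
 * syscalls above through the glibc wrappers -- pin the calling process to
 * CPU 0, then read the mask back.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int cpu;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* allow CPU 0 only */
        if (sched_setaffinity(0, sizeof(set), &set) != 0) {
                perror("sched_setaffinity");
                return 1;
        }

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) != 0) {
                perror("sched_getaffinity");
                return 1;
        }
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &set))
                        printf("allowed on cpu %d\n", cpu);
        return 0;
}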
4161 
4168 SYSCALL_DEFINE0(sched_yield)
4169 {
4170  struct rq *rq = this_rq_lock();
4171 
4172  schedstat_inc(rq, yld_count);
4173  current->sched_class->yield_task(rq);
4174 
4175  /*
4176  * Since we are going to call schedule() anyway, there's
4177  * no need to preempt or enable interrupts:
4178  */
4179  __release(rq->lock);
4180  spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4181  do_raw_spin_unlock(&rq->lock);
4183 
4184  schedule();
4185 
4186  return 0;
4187 }
4188 
4189 static inline int should_resched(void)
4190 {
4191  return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4192 }
4193 
4194 static void __cond_resched(void)
4195 {
4196  add_preempt_count(PREEMPT_ACTIVE);
4197  __schedule();
4198  sub_preempt_count(PREEMPT_ACTIVE);
4199 }
4200 
4201 int __sched _cond_resched(void)
4202 {
4203  if (should_resched()) {
4204  __cond_resched();
4205  return 1;
4206  }
4207  return 0;
4208 }
4210 
4211 /*
4212  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4213  * call schedule, and on return reacquire the lock.
4214  *
4215  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4216  * operations here to prevent schedule() from being called twice (once via
4217  * spin_unlock(), once by hand).
4218  */
4219 int __cond_resched_lock(spinlock_t *lock)
4220 {
4221  int resched = should_resched();
4222  int ret = 0;
4223 
4224  lockdep_assert_held(lock);
4225 
4226  if (spin_needbreak(lock) || resched) {
4227  spin_unlock(lock);
4228  if (resched)
4229  __cond_resched();
4230  else
4231  cpu_relax();
4232  ret = 1;
4233  spin_lock(lock);
4234  }
4235  return ret;
4236 }
4238 
4239 int __sched __cond_resched_softirq(void)
4240 {
4241  BUG_ON(!in_softirq());
4242 
4243  if (should_resched()) {
4244  local_bh_enable();
4245  __cond_resched();
4246  local_bh_disable();
4247  return 1;
4248  }
4249  return 0;
4250 }
4252 
4275 void __sched yield(void)
4276 {
4278  sys_sched_yield();
4279 }
4281 
4294 bool __sched yield_to(struct task_struct *p, bool preempt)
4295 {
4296  struct task_struct *curr = current;
4297  struct rq *rq, *p_rq;
4298  unsigned long flags;
4299  bool yielded = 0;
4300 
4301  local_irq_save(flags);
4302  rq = this_rq();
4303 
4304 again:
4305  p_rq = task_rq(p);
4306  double_rq_lock(rq, p_rq);
4307  while (task_rq(p) != p_rq) {
4308  double_rq_unlock(rq, p_rq);
4309  goto again;
4310  }
4311 
4312  if (!curr->sched_class->yield_to_task)
4313  goto out;
4314 
4315  if (curr->sched_class != p->sched_class)
4316  goto out;
4317 
4318  if (task_running(p_rq, p) || p->state)
4319  goto out;
4320 
4321  yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4322  if (yielded) {
4323  schedstat_inc(rq, yld_count);
4324  /*
4325  * Make p's CPU reschedule; pick_next_entity takes care of
4326  * fairness.
4327  */
4328  if (preempt && rq != p_rq)
4329  resched_task(p_rq->curr);
4330  }
4331 
4332 out:
4333  double_rq_unlock(rq, p_rq);
4334  local_irq_restore(flags);
4335 
4336  if (yielded)
4337  schedule();
4338 
4339  return yielded;
4340 }
4342 
4343 /*
4344  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4345  * that process accounting knows that this is a task in IO wait state.
4346  */
4347 void __sched io_schedule(void)
4348 {
4349  struct rq *rq = raw_rq();
4350 
4351  delayacct_blkio_start();
4352  atomic_inc(&rq->nr_iowait);
4353  blk_flush_plug(current);
4354  current->in_iowait = 1;
4355  schedule();
4356  current->in_iowait = 0;
4357  atomic_dec(&rq->nr_iowait);
4358  delayacct_blkio_end();
4359 }
4361 
4362 long __sched io_schedule_timeout(long timeout)
4363 {
4364  struct rq *rq = raw_rq();
4365  long ret;
4366 
4367  delayacct_blkio_start();
4368  atomic_inc(&rq->nr_iowait);
4369  blk_flush_plug(current);
4370  current->in_iowait = 1;
4371  ret = schedule_timeout(timeout);
4372  current->in_iowait = 0;
4373  atomic_dec(&rq->nr_iowait);
4374  delayacct_blkio_end();
4375  return ret;
4376 }
4377 
4385 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4386 {
4387  int ret = -EINVAL;
4388 
4389  switch (policy) {
4390  case SCHED_FIFO:
4391  case SCHED_RR:
4392  ret = MAX_USER_RT_PRIO-1;
4393  break;
4394  case SCHED_NORMAL:
4395  case SCHED_BATCH:
4396  case SCHED_IDLE:
4397  ret = 0;
4398  break;
4399  }
4400  return ret;
4401 }
4402 
4410 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4411 {
4412  int ret = -EINVAL;
4413 
4414  switch (policy) {
4415  case SCHED_FIFO:
4416  case SCHED_RR:
4417  ret = 1;
4418  break;
4419  case SCHED_NORMAL:
4420  case SCHED_BATCH:
4421  case SCHED_IDLE:
4422  ret = 0;
4423  }
4424  return ret;
4425 }
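/*
 * Editor's illustration (not part of core.c): querying the range reported
 * by the two syscalls above and switching the caller to SCHED_FIFO one step
 * below the maximum.  Succeeds only with CAP_SYS_NICE or a sufficient
 * RLIMIT_RTPRIO, as enforced by __sched_setscheduler() earlier in this file.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp;
        int max = sched_get_priority_max(SCHED_FIFO);
        int min = sched_get_priority_min(SCHED_FIFO);

        printf("SCHED_FIFO priority range: %d..%d\n", min, max);

        sp.sched_priority = max - 1;
        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("now SCHED_FIFO at priority %d\n", sp.sched_priority);
        return 0;
}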
4426 
4435 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4436  struct timespec __user *, interval)
4437 {
4438  struct task_struct *p;
4439  unsigned int time_slice;
4440  unsigned long flags;
4441  struct rq *rq;
4442  int retval;
4443  struct timespec t;
4444 
4445  if (pid < 0)
4446  return -EINVAL;
4447 
4448  retval = -ESRCH;
4449  rcu_read_lock();
4450  p = find_process_by_pid(pid);
4451  if (!p)
4452  goto out_unlock;
4453 
4454  retval = security_task_getscheduler(p);
4455  if (retval)
4456  goto out_unlock;
4457 
4458  rq = task_rq_lock(p, &flags);
4459  time_slice = p->sched_class->get_rr_interval(rq, p);
4460  task_rq_unlock(rq, p, &flags);
4461 
4462  rcu_read_unlock();
4463  jiffies_to_timespec(time_slice, &t);
4464  retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4465  return retval;
4466 
4467 out_unlock:
4468  rcu_read_unlock();
4469  return retval;
4470 }
4471 
4472 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4473 
4474 void sched_show_task(struct task_struct *p)
4475 {
4476  unsigned long free = 0;
4477  unsigned state;
4478 
4479  state = p->state ? __ffs(p->state) + 1 : 0;
4480  printk(KERN_INFO "%-15.15s %c", p->comm,
4481  state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4482 #if BITS_PER_LONG == 32
4483  if (state == TASK_RUNNING)
4484  printk(KERN_CONT " running ");
4485  else
4486  printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4487 #else
4488  if (state == TASK_RUNNING)
4489  printk(KERN_CONT " running task ");
4490  else
4491  printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4492 #endif
4493 #ifdef CONFIG_DEBUG_STACK_USAGE
4494  free = stack_not_used(p);
4495 #endif
4496  printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4497  task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
4498  (unsigned long)task_thread_info(p)->flags);
4499 
4500  show_stack(p, NULL);
4501 }
4502 
4503 void show_state_filter(unsigned long state_filter)
4504 {
4505  struct task_struct *g, *p;
4506 
4507 #if BITS_PER_LONG == 32
4508  printk(KERN_INFO
4509  " task PC stack pid father\n");
4510 #else
4511  printk(KERN_INFO
4512  " task PC stack pid father\n");
4513 #endif
4514  rcu_read_lock();
4515  do_each_thread(g, p) {
4516  /*
4517  * reset the NMI-timeout, listing all tasks on a slow
4518  * console might take a lot of time:
4519  */
4521  if (!state_filter || (p->state & state_filter))
4522  sched_show_task(p);
4523  } while_each_thread(g, p);
4524 
4526 
4527 #ifdef CONFIG_SCHED_DEBUG
4529 #endif
4530  rcu_read_unlock();
4531  /*
4532  * Only show locks if all tasks are dumped:
4533  */
4534  if (!state_filter)
4536 }
4537 
4538 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4539 {
4540  idle->sched_class = &idle_sched_class;
4541 }
4542 
4551 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4552 {
4553  struct rq *rq = cpu_rq(cpu);
4554  unsigned long flags;
4555 
4556  raw_spin_lock_irqsave(&rq->lock, flags);
4557 
4558  __sched_fork(idle);
4559  idle->state = TASK_RUNNING;
4560  idle->se.exec_start = sched_clock();
4561 
4562  do_set_cpus_allowed(idle, cpumask_of(cpu));
4563  /*
4564  * We're having a chicken and egg problem, even though we are
4565  * holding rq->lock, the cpu isn't yet set to this cpu so the
4566  * lockdep check in task_group() will fail.
4567  *
4568  * Similar case to sched_fork(). / Alternatively we could
4569  * use task_rq_lock() here and obtain the other rq->lock.
4570  *
4571  * Silence PROVE_RCU
4572  */
4573  rcu_read_lock();
4574  __set_task_cpu(idle, cpu);
4575  rcu_read_unlock();
4576 
4577  rq->curr = rq->idle = idle;
4578 #if defined(CONFIG_SMP)
4579  idle->on_cpu = 1;
4580 #endif
4581  raw_spin_unlock_irqrestore(&rq->lock, flags);
4582 
4583  /* Set the preempt count _outside_ the spinlocks! */
4584  task_thread_info(idle)->preempt_count = 0;
4585 
4586  /*
4587  * The idle tasks have their own, simple scheduling class:
4588  */
4589  idle->sched_class = &idle_sched_class;
4590  ftrace_graph_init_idle_task(idle, cpu);
4591 #if defined(CONFIG_SMP)
4592  sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4593 #endif
4594 }
4595 
4596 #ifdef CONFIG_SMP
4597 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4598 {
4599  if (p->sched_class && p->sched_class->set_cpus_allowed)
4600  p->sched_class->set_cpus_allowed(p, new_mask);
4601 
4602  cpumask_copy(&p->cpus_allowed, new_mask);
4603  p->nr_cpus_allowed = cpumask_weight(new_mask);
4604 }
4605 
4606 /*
4607  * This is how migration works:
4608  *
4609  * 1) we invoke migration_cpu_stop() on the target CPU using
4610  * stop_one_cpu().
4611  * 2) stopper starts to run (implicitly forcing the migrated thread
4612  * off the CPU)
4613  * 3) it checks whether the migrated task is still in the wrong runqueue.
4614  * 4) if it's in the wrong runqueue then the migration thread removes
4615  * it and puts it into the right queue.
4616  * 5) stopper completes and stop_one_cpu() returns and the migration
4617  * is done.
4618  */
4619 
4620 /*
4621  * Change a given task's CPU affinity. Migrate the thread to a
4622  * proper CPU and schedule it away if the CPU it's executing on
4623  * is removed from the allowed bitmask.
4624  *
4625  * NOTE: the caller must have a valid reference to the task, the
4626  * task must not exit() & deallocate itself prematurely. The
4627  * call is not atomic; no spinlocks may be held.
4628  */
4629 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4630 {
4631  unsigned long flags;
4632  struct rq *rq;
4633  unsigned int dest_cpu;
4634  int ret = 0;
4635 
4636  rq = task_rq_lock(p, &flags);
4637 
4638  if (cpumask_equal(&p->cpus_allowed, new_mask))
4639  goto out;
4640 
4641  if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4642  ret = -EINVAL;
4643  goto out;
4644  }
4645 
4646  if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4647  ret = -EINVAL;
4648  goto out;
4649  }
4650 
4651  do_set_cpus_allowed(p, new_mask);
4652 
4653  /* Can the task run on the task's current CPU? If so, we're done */
4654  if (cpumask_test_cpu(task_cpu(p), new_mask))
4655  goto out;
4656 
4657  dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4658  if (p->on_rq) {
4659  struct migration_arg arg = { p, dest_cpu };
4660  /* Need help from migration thread: drop lock and wait. */
4661  task_rq_unlock(rq, p, &flags);
4662  stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4663  tlb_migrate_finish(p->mm);
4664  return 0;
4665  }
4666 out:
4667  task_rq_unlock(rq, p, &flags);
4668 
4669  return ret;
4670 }
4671 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
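/*
 * Illustrative sketch, not part of core.c: one way a caller might use the
 * exported set_cpus_allowed_ptr() to pin a kernel thread to a single CPU,
 * which triggers the migration sequence described above whenever the thread
 * is currently running somewhere outside the new mask.  The thread function,
 * its name and the cpu argument are hypothetical.
 */
#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
	/* Idle loop; a real worker would do useful per-CPU work here. */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int example_pin_kthread_to_cpu(unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_run(example_thread_fn, NULL, "example/%u", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* May block on the stopper thread; no spinlocks may be held here. */
	return set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
}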
4672 
4673 /*
4674  * Move (not current) task off this cpu, onto dest cpu. We're doing
4675  * this because either it can't run here any more (set_cpus_allowed()
4676  * away from this CPU, or CPU going down), or because we're
4677  * attempting to rebalance this task on exec (sched_exec).
4678  *
4679  * So we race with normal scheduler movements, but that's OK, as long
4680  * as the task is no longer on this CPU.
4681  *
4682  * Returns non-zero if task was successfully migrated.
4683  */
4684 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4685 {
4686  struct rq *rq_dest, *rq_src;
4687  int ret = 0;
4688 
4689  if (unlikely(!cpu_active(dest_cpu)))
4690  return ret;
4691 
4692  rq_src = cpu_rq(src_cpu);
4693  rq_dest = cpu_rq(dest_cpu);
4694 
4695  raw_spin_lock(&p->pi_lock);
4696  double_rq_lock(rq_src, rq_dest);
4697  /* Already moved. */
4698  if (task_cpu(p) != src_cpu)
4699  goto done;
4700  /* Affinity changed (again). */
4701  if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4702  goto fail;
4703 
4704  /*
4705  * If we're not on a rq, the next wake-up will ensure we're
4706  * placed properly.
4707  */
4708  if (p->on_rq) {
4709  dequeue_task(rq_src, p, 0);
4710  set_task_cpu(p, dest_cpu);
4711  enqueue_task(rq_dest, p, 0);
4712  check_preempt_curr(rq_dest, p, 0);
4713  }
4714 done:
4715  ret = 1;
4716 fail:
4717  double_rq_unlock(rq_src, rq_dest);
4718  raw_spin_unlock(&p->pi_lock);
4719  return ret;
4720 }
4721 
4722 /*
4723  * migration_cpu_stop - this will be executed by a highprio stopper thread
4724  * and performs thread migration by bumping thread off CPU then
4725  * 'pushing' onto another runqueue.
4726  */
4727 static int migration_cpu_stop(void *data)
4728 {
4729  struct migration_arg *arg = data;
4730 
4731  /*
4732  * The original target cpu might have gone down and we might
4733  * be on another cpu but it doesn't matter.
4734  */
4735  local_irq_disable();
4736  __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4737  local_irq_enable();
4738  return 0;
4739 }
4740 
4741 #ifdef CONFIG_HOTPLUG_CPU
4742 
4743 /*
4744  * Ensures that the idle task is using init_mm right before its cpu goes
4745  * offline.
4746  */
4747 void idle_task_exit(void)
4748 {
4749  struct mm_struct *mm = current->active_mm;
4750 
4751  BUG_ON(cpu_online(smp_processor_id()));
4752 
4753  if (mm != &init_mm)
4754  switch_mm(mm, &init_mm, current);
4755  mmdrop(mm);
4756 }
4757 
4758 /*
4759  * Since this CPU is going 'away' for a while, fold any nr_active delta
4760  * we might have. Assumes we're called after migrate_tasks() so that the
4761  * nr_active count is stable.
4762  *
4763  * Also see the comment "Global load-average calculations".
4764  */
4765 static void calc_load_migrate(struct rq *rq)
4766 {
4767  long delta = calc_load_fold_active(rq);
4768  if (delta)
4769  atomic_long_add(delta, &calc_load_tasks);
4770 }
4771 
4772 /*
4773  * Migrate all tasks from the rq, sleeping tasks will be migrated by
4774  * try_to_wake_up()->select_task_rq().
4775  *
4776  * Called with rq->lock held even though we're in stop_machine() and
4777  * there's no concurrency possible, we hold the required locks anyway
4778  * because of lock validation efforts.
4779  */
4780 static void migrate_tasks(unsigned int dead_cpu)
4781 {
4782  struct rq *rq = cpu_rq(dead_cpu);
4783  struct task_struct *next, *stop = rq->stop;
4784  int dest_cpu;
4785 
4786  /*
4787  * Fudge the rq selection such that the below task selection loop
4788  * doesn't get stuck on the currently eligible stop task.
4789  *
4790  * We're currently inside stop_machine() and the rq is either stuck
4791  * in the stop_machine_cpu_stop() loop, or we're executing this code,
4792  * either way we should never end up calling schedule() until we're
4793  * done here.
4794  */
4795  rq->stop = NULL;
4796 
4797  for ( ; ; ) {
4798  /*
4799  * There's this thread running, bail when that's the only
4800  * remaining thread.
4801  */
4802  if (rq->nr_running == 1)
4803  break;
4804 
4805  next = pick_next_task(rq);
4806  BUG_ON(!next);
4807  next->sched_class->put_prev_task(rq, next);
4808 
4809  /* Find suitable destination for @next, with force if needed. */
4810  dest_cpu = select_fallback_rq(dead_cpu, next);
4811  raw_spin_unlock(&rq->lock);
4812 
4813  __migrate_task(next, dead_cpu, dest_cpu);
4814 
4815  raw_spin_lock(&rq->lock);
4816  }
4817 
4818  rq->stop = stop;
4819 }
4820 
4821 #endif /* CONFIG_HOTPLUG_CPU */
4822 
4823 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4824 
4825 static struct ctl_table sd_ctl_dir[] = {
4826  {
4827  .procname = "sched_domain",
4828  .mode = 0555,
4829  },
4830  {}
4831 };
4832 
4833 static struct ctl_table sd_ctl_root[] = {
4834  {
4835  .procname = "kernel",
4836  .mode = 0555,
4837  .child = sd_ctl_dir,
4838  },
4839  {}
4840 };
4841 
4842 static struct ctl_table *sd_alloc_ctl_entry(int n)
4843 {
4844  struct ctl_table *entry =
4845  kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4846 
4847  return entry;
4848 }
4849 
4850 static void sd_free_ctl_entry(struct ctl_table **tablep)
4851 {
4852  struct ctl_table *entry;
4853 
4854  /*
4855  * In the intermediate directories, both the child directory and
4856  * procname are dynamically allocated and could fail but the mode
4857  * will always be set. In the lowest directory the names are
4858  * static strings and all have proc handlers.
4859  */
4860  for (entry = *tablep; entry->mode; entry++) {
4861  if (entry->child)
4862  sd_free_ctl_entry(&entry->child);
4863  if (entry->proc_handler == NULL)
4864  kfree(entry->procname);
4865  }
4866 
4867  kfree(*tablep);
4868  *tablep = NULL;
4869 }
4870 
4871 static int min_load_idx = 0;
4872 static int max_load_idx = CPU_LOAD_IDX_MAX;
4873 
4874 static void
4875 set_table_entry(struct ctl_table *entry,
4876  const char *procname, void *data, int maxlen,
4877  umode_t mode, proc_handler *proc_handler,
4878  bool load_idx)
4879 {
4880  entry->procname = procname;
4881  entry->data = data;
4882  entry->maxlen = maxlen;
4883  entry->mode = mode;
4884  entry->proc_handler = proc_handler;
4885 
4886  if (load_idx) {
4887  entry->extra1 = &min_load_idx;
4888  entry->extra2 = &max_load_idx;
4889  }
4890 }
4891 
4892 static struct ctl_table *
4893 sd_alloc_ctl_domain_table(struct sched_domain *sd)
4894 {
4895  struct ctl_table *table = sd_alloc_ctl_entry(13);
4896 
4897  if (table == NULL)
4898  return NULL;
4899 
4900  set_table_entry(&table[0], "min_interval", &sd->min_interval,
4901  sizeof(long), 0644, proc_doulongvec_minmax, false);
4902  set_table_entry(&table[1], "max_interval", &sd->max_interval,
4903  sizeof(long), 0644, proc_doulongvec_minmax, false);
4904  set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4905  sizeof(int), 0644, proc_dointvec_minmax, true);
4906  set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4907  sizeof(int), 0644, proc_dointvec_minmax, true);
4908  set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4909  sizeof(int), 0644, proc_dointvec_minmax, true);
4910  set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4911  sizeof(int), 0644, proc_dointvec_minmax, true);
4912  set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4913  sizeof(int), 0644, proc_dointvec_minmax, true);
4914  set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4915  sizeof(int), 0644, proc_dointvec_minmax, false);
4916  set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4917  sizeof(int), 0644, proc_dointvec_minmax, false);
4918  set_table_entry(&table[9], "cache_nice_tries",
4919  &sd->cache_nice_tries,
4920  sizeof(int), 0644, proc_dointvec_minmax, false);
4921  set_table_entry(&table[10], "flags", &sd->flags,
4922  sizeof(int), 0644, proc_dointvec_minmax, false);
4923  set_table_entry(&table[11], "name", sd->name,
4924  CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4925  /* &table[12] is terminator */
4926 
4927  return table;
4928 }
4929 
4930 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4931 {
4932  struct ctl_table *entry, *table;
4933  struct sched_domain *sd;
4934  int domain_num = 0, i;
4935  char buf[32];
4936 
4937  for_each_domain(cpu, sd)
4938  domain_num++;
4939  entry = table = sd_alloc_ctl_entry(domain_num + 1);
4940  if (table == NULL)
4941  return NULL;
4942 
4943  i = 0;
4944  for_each_domain(cpu, sd) {
4945  snprintf(buf, 32, "domain%d", i);
4946  entry->procname = kstrdup(buf, GFP_KERNEL);
4947  entry->mode = 0555;
4948  entry->child = sd_alloc_ctl_domain_table(sd);
4949  entry++;
4950  i++;
4951  }
4952  return table;
4953 }
4954 
4955 static struct ctl_table_header *sd_sysctl_header;
4956 static void register_sched_domain_sysctl(void)
4957 {
4958  int i, cpu_num = num_possible_cpus();
4959  struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4960  char buf[32];
4961 
4962  WARN_ON(sd_ctl_dir[0].child);
4963  sd_ctl_dir[0].child = entry;
4964 
4965  if (entry == NULL)
4966  return;
4967 
4968  for_each_possible_cpu(i) {
4969  snprintf(buf, 32, "cpu%d", i);
4970  entry->procname = kstrdup(buf, GFP_KERNEL);
4971  entry->mode = 0555;
4972  entry->child = sd_alloc_ctl_cpu_table(i);
4973  entry++;
4974  }
4975 
4976  WARN_ON(sd_sysctl_header);
4977  sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4978 }
4979 
4980 /* may be called multiple times per register */
4981 static void unregister_sched_domain_sysctl(void)
4982 {
4983  if (sd_sysctl_header)
4984  unregister_sysctl_table(sd_sysctl_header);
4985  sd_sysctl_header = NULL;
4986  if (sd_ctl_dir[0].child)
4987  sd_free_ctl_entry(&sd_ctl_dir[0].child);
4988 }
4989 #else
4990 static void register_sched_domain_sysctl(void)
4991 {
4992 }
4993 static void unregister_sched_domain_sysctl(void)
4994 {
4995 }
4996 #endif
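/*
 * Illustrative sketch, not part of core.c: with CONFIG_SCHED_DEBUG and
 * CONFIG_SYSCTL enabled, the tables built above show up under procfs roughly
 * as below (entry names taken from sd_alloc_ctl_domain_table(); the number
 * of cpus and domain levels is hypothetical):
 *
 *   /proc/sys/kernel/sched_domain/
 *       cpu0/
 *           domain0/    min_interval  max_interval  busy_idx  idle_idx
 *                       newidle_idx   wake_idx      forkexec_idx
 *                       busy_factor   imbalance_pct cache_nice_tries
 *                       flags         name
 *           domain1/    ...
 *       cpu1/
 *           ...
 */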
4997 
4998 static void set_rq_online(struct rq *rq)
4999 {
5000  if (!rq->online) {
5001  const struct sched_class *class;
5002 
5003  cpumask_set_cpu(rq->cpu, rq->rd->online);
5004  rq->online = 1;
5005 
5006  for_each_class(class) {
5007  if (class->rq_online)
5008  class->rq_online(rq);
5009  }
5010  }
5011 }
5012 
5013 static void set_rq_offline(struct rq *rq)
5014 {
5015  if (rq->online) {
5016  const struct sched_class *class;
5017 
5018  for_each_class(class) {
5019  if (class->rq_offline)
5020  class->rq_offline(rq);
5021  }
5022 
5023  cpumask_clear_cpu(rq->cpu, rq->rd->online);
5024  rq->online = 0;
5025  }
5026 }
5027 
5028 /*
5029  * migration_call - callback that gets triggered when a CPU is added.
5030  * Here we can start up the necessary migration thread for the new CPU.
5031  */
5032 static int __cpuinit
5033 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5034 {
5035  int cpu = (long)hcpu;
5036  unsigned long flags;
5037  struct rq *rq = cpu_rq(cpu);
5038 
5039  switch (action & ~CPU_TASKS_FROZEN) {
5040 
5041  case CPU_UP_PREPARE:
5042  rq->calc_load_update = calc_load_update;
5043  break;
5044 
5045  case CPU_ONLINE:
5046  /* Update our root-domain */
5047  raw_spin_lock_irqsave(&rq->lock, flags);
5048  if (rq->rd) {
5049  BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5050 
5051  set_rq_online(rq);
5052  }
5053  raw_spin_unlock_irqrestore(&rq->lock, flags);
5054  break;
5055 
5056 #ifdef CONFIG_HOTPLUG_CPU
5057  case CPU_DYING:
5058  sched_ttwu_pending();
5059  /* Update our root-domain */
5060  raw_spin_lock_irqsave(&rq->lock, flags);
5061  if (rq->rd) {
5062  BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5063  set_rq_offline(rq);
5064  }
5065  migrate_tasks(cpu);
5066  BUG_ON(rq->nr_running != 1); /* the migration thread */
5067  raw_spin_unlock_irqrestore(&rq->lock, flags);
5068  break;
5069 
5070  case CPU_DEAD:
5071  calc_load_migrate(rq);
5072  break;
5073 #endif
5074  }
5075 
5076  update_max_interval();
5077 
5078  return NOTIFY_OK;
5079 }
5080 
5081 /*
5082  * Register at high priority so that task migration (migrate_all_tasks)
5083  * happens before everything else. This has to be lower priority than
5084  * the notifier in the perf_event subsystem, though.
5085  */
5086 static struct notifier_block __cpuinitdata migration_notifier = {
5087  .notifier_call = migration_call,
5088  .priority = CPU_PRI_MIGRATION,
5089 };
5090 
5091 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5092  unsigned long action, void *hcpu)
5093 {
5094  switch (action & ~CPU_TASKS_FROZEN) {
5095  case CPU_STARTING:
5096  case CPU_DOWN_FAILED:
5097  set_cpu_active((long)hcpu, true);
5098  return NOTIFY_OK;
5099  default:
5100  return NOTIFY_DONE;
5101  }
5102 }
5103 
5104 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5105  unsigned long action, void *hcpu)
5106 {
5107  switch (action & ~CPU_TASKS_FROZEN) {
5108  case CPU_DOWN_PREPARE:
5109  set_cpu_active((long)hcpu, false);
5110  return NOTIFY_OK;
5111  default:
5112  return NOTIFY_DONE;
5113  }
5114 }
5115 
5116 static int __init migration_init(void)
5117 {
5118  void *cpu = (void *)(long)smp_processor_id();
5119  int err;
5120 
5121  /* Initialize migration for the boot CPU */
5122  err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5123  BUG_ON(err == NOTIFY_BAD);
5124  migration_call(&migration_notifier, CPU_ONLINE, cpu);
5125  register_cpu_notifier(&migration_notifier);
5126 
5127  /* Register cpu active notifiers */
5128  cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5129  cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5130 
5131  return 0;
5132 }
5133 early_initcall(migration_init);
5134 #endif
5135 
5136 #ifdef CONFIG_SMP
5137 
5138 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5139 
5140 #ifdef CONFIG_SCHED_DEBUG
5141 
5142 static __read_mostly int sched_debug_enabled;
5143 
5144 static int __init sched_debug_setup(char *str)
5145 {
5146  sched_debug_enabled = 1;
5147 
5148  return 0;
5149 }
5150 early_param("sched_debug", sched_debug_setup);
5151 
5152 static inline bool sched_debug(void)
5153 {
5154  return sched_debug_enabled;
5155 }
5156 
5157 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5158  struct cpumask *groupmask)
5159 {
5160  struct sched_group *group = sd->groups;
5161  char str[256];
5162 
5163  cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5164  cpumask_clear(groupmask);
5165 
5166  printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5167 
5168  if (!(sd->flags & SD_LOAD_BALANCE)) {
5169  printk("does not load-balance\n");
5170  if (sd->parent)
5171  printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5172  " has parent");
5173  return -1;
5174  }
5175 
5176  printk(KERN_CONT "span %s level %s\n", str, sd->name);
5177 
5178  if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5179  printk(KERN_ERR "ERROR: domain->span does not contain "
5180  "CPU%d\n", cpu);
5181  }
5182  if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5183  printk(KERN_ERR "ERROR: domain->groups does not contain"
5184  " CPU%d\n", cpu);
5185  }
5186 
5187  printk(KERN_DEBUG "%*s groups:", level + 1, "");
5188  do {
5189  if (!group) {
5190  printk("\n");
5191  printk(KERN_ERR "ERROR: group is NULL\n");
5192  break;
5193  }
5194 
5195  /*
5196  * Even though we initialize ->power to something semi-sane,
5197  * we leave power_orig unset. This allows us to detect if
5198  * domain iteration is still funny without causing /0 traps.
5199  */
5200  if (!group->sgp->power_orig) {
5201  printk(KERN_CONT "\n");
5202  printk(KERN_ERR "ERROR: domain->cpu_power not "
5203  "set\n");
5204  break;
5205  }
5206 
5207  if (!cpumask_weight(sched_group_cpus(group))) {
5208  printk(KERN_CONT "\n");
5209  printk(KERN_ERR "ERROR: empty group\n");
5210  break;
5211  }
5212 
5213  if (!(sd->flags & SD_OVERLAP) &&
5214  cpumask_intersects(groupmask, sched_group_cpus(group))) {
5215  printk(KERN_CONT "\n");
5216  printk(KERN_ERR "ERROR: repeated CPUs\n");
5217  break;
5218  }
5219 
5220  cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5221 
5222  cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5223 
5224  printk(KERN_CONT " %s", str);
5225  if (group->sgp->power != SCHED_POWER_SCALE) {
5226  printk(KERN_CONT " (cpu_power = %d)",
5227  group->sgp->power);
5228  }
5229 
5230  group = group->next;
5231  } while (group != sd->groups);
5232  printk(KERN_CONT "\n");
5233 
5234  if (!cpumask_equal(sched_domain_span(sd), groupmask))
5235  printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5236 
5237  if (sd->parent &&
5238  !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5239  printk(KERN_ERR "ERROR: parent span is not a superset "
5240  "of domain->span\n");
5241  return 0;
5242 }
5243 
5244 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5245 {
5246  int level = 0;
5247 
5248  if (!sched_debug_enabled)
5249  return;
5250 
5251  if (!sd) {
5252  printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5253  return;
5254  }
5255 
5256  printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5257 
5258  for (;;) {
5259  if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5260  break;
5261  level++;
5262  sd = sd->parent;
5263  if (!sd)
5264  break;
5265  }
5266 }
5267 #else /* !CONFIG_SCHED_DEBUG */
5268 # define sched_domain_debug(sd, cpu) do { } while (0)
5269 static inline bool sched_debug(void)
5270 {
5271  return false;
5272 }
5273 #endif /* CONFIG_SCHED_DEBUG */
5274 
5275 static int sd_degenerate(struct sched_domain *sd)
5276 {
5277  if (cpumask_weight(sched_domain_span(sd)) == 1)
5278  return 1;
5279 
5280  /* Following flags need at least 2 groups */
5281  if (sd->flags & (SD_LOAD_BALANCE |
5282  SD_BALANCE_NEWIDLE |
5283  SD_BALANCE_FORK |
5284  SD_BALANCE_EXEC |
5285  SD_SHARE_CPUPOWER |
5286  SD_SHARE_PKG_RESOURCES)) {
5287  if (sd->groups != sd->groups->next)
5288  return 0;
5289  }
5290 
5291  /* Following flags don't use groups */
5292  if (sd->flags & (SD_WAKE_AFFINE))
5293  return 0;
5294 
5295  return 1;
5296 }
5297 
5298 static int
5299 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5300 {
5301  unsigned long cflags = sd->flags, pflags = parent->flags;
5302 
5303  if (sd_degenerate(parent))
5304  return 1;
5305 
5306  if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5307  return 0;
5308 
5309  /* Flags needing groups don't count if only 1 group in parent */
5310  if (parent->groups == parent->groups->next) {
5311  pflags &= ~(SD_LOAD_BALANCE |
5312  SD_BALANCE_NEWIDLE |
5313  SD_BALANCE_FORK |
5314  SD_BALANCE_EXEC |
5315  SD_SHARE_CPUPOWER |
5316  SD_SHARE_PKG_RESOURCES);
5317  if (nr_node_ids == 1)
5318  pflags &= ~SD_SERIALIZE;
5319  }
5320  if (~cflags & pflags)
5321  return 0;
5322 
5323  return 1;
5324 }
5325 
5326 static void free_rootdomain(struct rcu_head *rcu)
5327 {
5328  struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5329 
5330  cpupri_cleanup(&rd->cpupri);
5331  free_cpumask_var(rd->rto_mask);
5332  free_cpumask_var(rd->online);
5333  free_cpumask_var(rd->span);
5334  kfree(rd);
5335 }
5336 
5337 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5338 {
5339  struct root_domain *old_rd = NULL;
5340  unsigned long flags;
5341 
5342  raw_spin_lock_irqsave(&rq->lock, flags);
5343 
5344  if (rq->rd) {
5345  old_rd = rq->rd;
5346 
5347  if (cpumask_test_cpu(rq->cpu, old_rd->online))
5348  set_rq_offline(rq);
5349 
5350  cpumask_clear_cpu(rq->cpu, old_rd->span);
5351 
5352  /*
5353  * If we don't want to free the old_rd yet then
5354  * set old_rd to NULL to skip the freeing later
5355  * in this function:
5356  */
5357  if (!atomic_dec_and_test(&old_rd->refcount))
5358  old_rd = NULL;
5359  }
5360 
5361  atomic_inc(&rd->refcount);
5362  rq->rd = rd;
5363 
5364  cpumask_set_cpu(rq->cpu, rd->span);
5365  if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5366  set_rq_online(rq);
5367 
5368  raw_spin_unlock_irqrestore(&rq->lock, flags);
5369 
5370  if (old_rd)
5371  call_rcu_sched(&old_rd->rcu, free_rootdomain);
5372 }
5373 
5374 static int init_rootdomain(struct root_domain *rd)
5375 {
5376  memset(rd, 0, sizeof(*rd));
5377 
5378  if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5379  goto out;
5380  if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5381  goto free_span;
5382  if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5383  goto free_online;
5384 
5385  if (cpupri_init(&rd->cpupri) != 0)
5386  goto free_rto_mask;
5387  return 0;
5388 
5389 free_rto_mask:
5390  free_cpumask_var(rd->rto_mask);
5391 free_online:
5392  free_cpumask_var(rd->online);
5393 free_span:
5394  free_cpumask_var(rd->span);
5395 out:
5396  return -ENOMEM;
5397 }
5398 
5399 /*
5400  * By default the system creates a single root-domain with all cpus as
5401  * members (mimicking the global state we have today).
5402  */
5403 struct root_domain def_root_domain;
5404 
5405 static void init_defrootdomain(void)
5406 {
5407  init_rootdomain(&def_root_domain);
5408 
5409  atomic_set(&def_root_domain.refcount, 1);
5410 }
5411 
5412 static struct root_domain *alloc_rootdomain(void)
5413 {
5414  struct root_domain *rd;
5415 
5416  rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5417  if (!rd)
5418  return NULL;
5419 
5420  if (init_rootdomain(rd) != 0) {
5421  kfree(rd);
5422  return NULL;
5423  }
5424 
5425  return rd;
5426 }
5427 
5428 static void free_sched_groups(struct sched_group *sg, int free_sgp)
5429 {
5430  struct sched_group *tmp, *first;
5431 
5432  if (!sg)
5433  return;
5434 
5435  first = sg;
5436  do {
5437  tmp = sg->next;
5438 
5439  if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5440  kfree(sg->sgp);
5441 
5442  kfree(sg);
5443  sg = tmp;
5444  } while (sg != first);
5445 }
5446 
5447 static void free_sched_domain(struct rcu_head *rcu)
5448 {
5449  struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5450 
5451  /*
5452  * If its an overlapping domain it has private groups, iterate and
5453  * nuke them all.
5454  */
5455  if (sd->flags & SD_OVERLAP) {
5456  free_sched_groups(sd->groups, 1);
5457  } else if (atomic_dec_and_test(&sd->groups->ref)) {
5458  kfree(sd->groups->sgp);
5459  kfree(sd->groups);
5460  }
5461  kfree(sd);
5462 }
5463 
5464 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5465 {
5466  call_rcu(&sd->rcu, free_sched_domain);
5467 }
5468 
5469 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5470 {
5471  for (; sd; sd = sd->parent)
5472  destroy_sched_domain(sd, cpu);
5473 }
5474 
5475 /*
5476  * Keep a special pointer to the highest sched_domain that has
5477  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain); this
5478  * allows us to avoid some pointer chasing in select_idle_sibling().
5479  *
5480  * Also keep a unique ID per domain (we use the first cpu number in
5481  * the cpumask of the domain), this allows us to quickly tell if
5482  * two cpus are in the same cache domain, see cpus_share_cache().
5483  */
5484 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5485 DEFINE_PER_CPU(int, sd_llc_id);
5486 
5487 static void update_top_cache_domain(int cpu)
5488 {
5489  struct sched_domain *sd;
5490  int id = cpu;
5491 
5492  sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5493  if (sd)
5494  id = cpumask_first(sched_domain_span(sd));
5495 
5496  rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5497  per_cpu(sd_llc_id, cpu) = id;
5498 }
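/*
 * Illustrative sketch, not part of this hunk: the per-cpu sd_llc_id written
 * above is what lets cpus_share_cache() (defined earlier in this file) skip
 * the domain walk; it reduces to a comparison along these lines.
 */
static inline bool example_cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}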
5499 
5500 /*
5501  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5502  * hold the hotplug lock.
5503  */
5504 static void
5505 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5506 {
5507  struct rq *rq = cpu_rq(cpu);
5508  struct sched_domain *tmp;
5509 
5510  /* Remove the sched domains which do not contribute to scheduling. */
5511  for (tmp = sd; tmp; ) {
5512  struct sched_domain *parent = tmp->parent;
5513  if (!parent)
5514  break;
5515 
5516  if (sd_parent_degenerate(tmp, parent)) {
5517  tmp->parent = parent->parent;
5518  if (parent->parent)
5519  parent->parent->child = tmp;
5520  destroy_sched_domain(parent, cpu);
5521  } else
5522  tmp = tmp->parent;
5523  }
5524 
5525  if (sd && sd_degenerate(sd)) {
5526  tmp = sd;
5527  sd = sd->parent;
5528  destroy_sched_domain(tmp, cpu);
5529  if (sd)
5530  sd->child = NULL;
5531  }
5532 
5533  sched_domain_debug(sd, cpu);
5534 
5535  rq_attach_root(rq, rd);
5536  tmp = rq->sd;
5537  rcu_assign_pointer(rq->sd, sd);
5538  destroy_sched_domains(tmp, cpu);
5539 
5540  update_top_cache_domain(cpu);
5541 }
5542 
5543 /* cpus with isolated domains */
5544 static cpumask_var_t cpu_isolated_map;
5545 
5546 /* Setup the mask of cpus configured for isolated domains */
5547 static int __init isolated_cpu_setup(char *str)
5548 {
5549  alloc_bootmem_cpumask_var(&cpu_isolated_map);
5550  cpulist_parse(str, cpu_isolated_map);
5551  return 1;
5552 }
5553 
5554 __setup("isolcpus=", isolated_cpu_setup);
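/*
 * Illustrative usage, not part of core.c: booting with, for example,
 *
 *     isolcpus=2,3
 *
 * makes cpulist_parse() above mark CPUs 2 and 3 in cpu_isolated_map.  Those
 * CPUs are then excluded from the domains built by init_sched_domains() /
 * partition_sched_domains() below, so the load balancer never places tasks
 * on them; only explicit affinity (sched_setaffinity(), taskset, ...) does.
 */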
5555 
5556 static const struct cpumask *cpu_cpu_mask(int cpu)
5557 {
5558  return cpumask_of_node(cpu_to_node(cpu));
5559 }
5560 
5561 struct sd_data {
5562  struct sched_domain **__percpu sd;
5563  struct sched_group **__percpu sg;
5564  struct sched_group_power **__percpu sgp;
5565 };
5566 
5567 struct s_data {
5568  struct sched_domain ** __percpu sd;
5569  struct root_domain *rd;
5570 };
5571 
5572 enum s_alloc {
5573  sa_rootdomain,
5574  sa_sd,
5575  sa_sd_storage,
5576  sa_none,
5577 };
5578 
5579 struct sched_domain_topology_level;
5580 
5581 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5582 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5583 
5584 #define SDTL_OVERLAP 0x01
5585 
5586 struct sched_domain_topology_level {
5587  sched_domain_init_f init;
5588  sched_domain_mask_f mask;
5589  int flags;
5590  int numa_level;
5591  struct sd_data data;
5592 };
5593 
5594 /*
5595  * Build an iteration mask that can exclude certain CPUs from the upwards
5596  * domain traversal.
5597  *
5598  * Asymmetric node setups can result in situations where the domain tree is of
5599  * unequal depth; make sure to skip domains that already cover the entire
5600  * range.
5601  *
5602  * In that case build_sched_domains() will have terminated the iteration early
5603  * and our sibling sd spans will be empty. Domains should always include the
5604  * cpu they're built on, so check that.
5605  *
5606  */
5607 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5608 {
5609  const struct cpumask *span = sched_domain_span(sd);
5610  struct sd_data *sdd = sd->private;
5611  struct sched_domain *sibling;
5612  int i;
5613 
5614  for_each_cpu(i, span) {
5615  sibling = *per_cpu_ptr(sdd->sd, i);
5616  if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5617  continue;
5618 
5619  cpumask_set_cpu(i, sched_group_mask(sg));
5620  }
5621 }
5622 
5623 /*
5624  * Return the canonical balance cpu for this group; this is the first cpu
5625  * of this group that's also in the iteration mask.
5626  */
5627 int group_balance_cpu(struct sched_group *sg)
5628 {
5629  return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5630 }
5631 
5632 static int
5633 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5634 {
5635  struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5636  const struct cpumask *span = sched_domain_span(sd);
5637  struct cpumask *covered = sched_domains_tmpmask;
5638  struct sd_data *sdd = sd->private;
5639  struct sched_domain *child;
5640  int i;
5641 
5642  cpumask_clear(covered);
5643 
5644  for_each_cpu(i, span) {
5645  struct cpumask *sg_span;
5646 
5647  if (cpumask_test_cpu(i, covered))
5648  continue;
5649 
5650  child = *per_cpu_ptr(sdd->sd, i);
5651 
5652  /* See the comment near build_group_mask(). */
5653  if (!cpumask_test_cpu(i, sched_domain_span(child)))
5654  continue;
5655 
5656  sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5657  GFP_KERNEL, cpu_to_node(cpu));
5658 
5659  if (!sg)
5660  goto fail;
5661 
5662  sg_span = sched_group_cpus(sg);
5663  if (child->child) {
5664  child = child->child;
5665  cpumask_copy(sg_span, sched_domain_span(child));
5666  } else
5667  cpumask_set_cpu(i, sg_span);
5668 
5669  cpumask_or(covered, covered, sg_span);
5670 
5671  sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5672  if (atomic_inc_return(&sg->sgp->ref) == 1)
5673  build_group_mask(sd, sg);
5674 
5675  /*
5676  * Initialize sgp->power such that even if we mess up the
5677  * domains and no possible iteration will get us here, we won't
5678  * die on a /0 trap.
5679  */
5680  sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5681 
5682  /*
5683  * Make sure the first group of this domain contains the
5684  * canonical balance cpu. Otherwise the sched_domain iteration
5685  * breaks. See update_sg_lb_stats().
5686  */
5687  if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5688  group_balance_cpu(sg) == cpu)
5689  groups = sg;
5690 
5691  if (!first)
5692  first = sg;
5693  if (last)
5694  last->next = sg;
5695  last = sg;
5696  last->next = first;
5697  }
5698  sd->groups = groups;
5699 
5700  return 0;
5701 
5702 fail:
5703  free_sched_groups(first, 0);
5704 
5705  return -ENOMEM;
5706 }
5707 
5708 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5709 {
5710  struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5711  struct sched_domain *child = sd->child;
5712 
5713  if (child)
5714  cpu = cpumask_first(sched_domain_span(child));
5715 
5716  if (sg) {
5717  *sg = *per_cpu_ptr(sdd->sg, cpu);
5718  (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5719  atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
5720  }
5721 
5722  return cpu;
5723 }
5724 
5725 /*
5726  * build_sched_groups will build a circular linked list of the groups
5727  * covered by the given span, and will set each group's ->cpumask correctly,
5728  * and ->cpu_power to 0.
5729  *
5730  * Assumes the sched_domain tree is fully constructed
5731  */
5732 static int
5733 build_sched_groups(struct sched_domain *sd, int cpu)
5734 {
5735  struct sched_group *first = NULL, *last = NULL;
5736  struct sd_data *sdd = sd->private;
5737  const struct cpumask *span = sched_domain_span(sd);
5738  struct cpumask *covered;
5739  int i;
5740 
5741  get_group(cpu, sdd, &sd->groups);
5742  atomic_inc(&sd->groups->ref);
5743 
5744  if (cpu != cpumask_first(sched_domain_span(sd)))
5745  return 0;
5746 
5747  lockdep_assert_held(&sched_domains_mutex);
5748  covered = sched_domains_tmpmask;
5749 
5750  cpumask_clear(covered);
5751 
5752  for_each_cpu(i, span) {
5753  struct sched_group *sg;
5754  int group = get_group(i, sdd, &sg);
5755  int j;
5756 
5757  if (cpumask_test_cpu(i, covered))
5758  continue;
5759 
5760  cpumask_clear(sched_group_cpus(sg));
5761  sg->sgp->power = 0;
5762  cpumask_setall(sched_group_mask(sg));
5763 
5764  for_each_cpu(j, span) {
5765  if (get_group(j, sdd, NULL) != group)
5766  continue;
5767 
5768  cpumask_set_cpu(j, covered);
5769  cpumask_set_cpu(j, sched_group_cpus(sg));
5770  }
5771 
5772  if (!first)
5773  first = sg;
5774  if (last)
5775  last->next = sg;
5776  last = sg;
5777  }
5778  last->next = first;
5779 
5780  return 0;
5781 }
5782 
5783 /*
5784  * Initialize sched groups cpu_power.
5785  *
5786  * cpu_power indicates the capacity of sched group, which is used while
5787  * distributing the load between different sched groups in a sched domain.
5788  * Typically, cpu_power for all the groups in a sched domain will be the same
5789  * unless there are asymmetries in the topology. If there are asymmetries, the
5790  * group having more cpu_power will pick up more load than the group having
5791  * less cpu_power.
5792  */
5793 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5794 {
5795  struct sched_group *sg = sd->groups;
5796 
5797  WARN_ON(!sd || !sg);
5798 
5799  do {
5800  sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5801  sg = sg->next;
5802  } while (sg != sd->groups);
5803 
5804  if (cpu != group_balance_cpu(sg))
5805  return;
5806 
5807  update_group_power(sd, cpu);
5808  atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5809 }
5810 
5811 int __weak arch_sd_sibling_asym_packing(void)
5812 {
5813  return 0*SD_ASYM_PACKING;
5814 }
5815 
5816 /*
5817  * Initializers for schedule domains
5818  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5819  */
5820 
5821 #ifdef CONFIG_SCHED_DEBUG
5822 # define SD_INIT_NAME(sd, type) sd->name = #type
5823 #else
5824 # define SD_INIT_NAME(sd, type) do { } while (0)
5825 #endif
5826 
5827 #define SD_INIT_FUNC(type) \
5828 static noinline struct sched_domain * \
5829 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5830 { \
5831  struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5832  *sd = SD_##type##_INIT; \
5833  SD_INIT_NAME(sd, type); \
5834  sd->private = &tl->data; \
5835  return sd; \
5836 }
5837 
5838 SD_INIT_FUNC(CPU)
5839 #ifdef CONFIG_SCHED_SMT
5840  SD_INIT_FUNC(SIBLING)
5841 #endif
5842 #ifdef CONFIG_SCHED_MC
5843  SD_INIT_FUNC(MC)
5844 #endif
5845 #ifdef CONFIG_SCHED_BOOK
5846  SD_INIT_FUNC(BOOK)
5847 #endif
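/*
 * Illustrative sketch, not part of core.c: with CONFIG_SCHED_DEBUG enabled,
 * SD_INIT_FUNC(CPU) above expands to roughly the function below
 * (SD_CPU_INIT is the per-type initializer from include/linux/topology.h);
 * the "example_" prefix is only to keep this sketch distinct.
 */
static noinline struct sched_domain *
example_sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

	*sd = SD_CPU_INIT;
	sd->name = "CPU";		/* from SD_INIT_NAME(sd, CPU) */
	sd->private = &tl->data;
	return sd;
}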
5848 
5849 static int default_relax_domain_level = -1;
5850 int sched_domain_level_max;
5851 
5852 static int __init setup_relax_domain_level(char *str)
5853 {
5854  if (kstrtoint(str, 0, &default_relax_domain_level))
5855  pr_warn("Unable to set relax_domain_level\n");
5856 
5857  return 1;
5858 }
5859 __setup("relax_domain_level=", setup_relax_domain_level);
5860 
5861 static void set_domain_attribute(struct sched_domain *sd,
5862  struct sched_domain_attr *attr)
5863 {
5864  int request;
5865 
5866  if (!attr || attr->relax_domain_level < 0) {
5867  if (default_relax_domain_level < 0)
5868  return;
5869  else
5870  request = default_relax_domain_level;
5871  } else
5872  request = attr->relax_domain_level;
5873  if (request < sd->level) {
5874  /* turn off idle balance on this domain */
5875  sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5876  } else {
5877  /* turn on idle balance on this domain */
5878  sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5879  }
5880 }
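/*
 * Illustrative usage, not part of core.c: booting with, for example,
 *
 *     relax_domain_level=1
 *
 * stores 1 in default_relax_domain_level above.  Domains at levels 0 and 1
 * then get SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE set, while any higher-level
 * (wider) domains have those flags cleared, i.e. idle/wake balancing is
 * only performed up to the requested level.
 */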
5881 
5882 static void __sdt_free(const struct cpumask *cpu_map);
5883 static int __sdt_alloc(const struct cpumask *cpu_map);
5884 
5885 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5886  const struct cpumask *cpu_map)
5887 {
5888  switch (what) {
5889  case sa_rootdomain:
5890  if (!atomic_read(&d->rd->refcount))
5891  free_rootdomain(&d->rd->rcu); /* fall through */
5892  case sa_sd:
5893  free_percpu(d->sd); /* fall through */
5894  case sa_sd_storage:
5895  __sdt_free(cpu_map); /* fall through */
5896  case sa_none:
5897  break;
5898  }
5899 }
5900 
5901 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5902  const struct cpumask *cpu_map)
5903 {
5904  memset(d, 0, sizeof(*d));
5905 
5906  if (__sdt_alloc(cpu_map))
5907  return sa_sd_storage;
5908  d->sd = alloc_percpu(struct sched_domain *);
5909  if (!d->sd)
5910  return sa_sd_storage;
5911  d->rd = alloc_rootdomain();
5912  if (!d->rd)
5913  return sa_sd;
5914  return sa_rootdomain;
5915 }
5916 
5917 /*
5918  * NULL the sd_data elements we've used to build the sched_domain and
5919  * sched_group structure so that the subsequent __free_domain_allocs()
5920  * will not free the data we're using.
5921  */
5922 static void claim_allocations(int cpu, struct sched_domain *sd)
5923 {
5924  struct sd_data *sdd = sd->private;
5925 
5926  WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5927  *per_cpu_ptr(sdd->sd, cpu) = NULL;
5928 
5929  if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5930  *per_cpu_ptr(sdd->sg, cpu) = NULL;
5931 
5932  if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5933  *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5934 }
5935 
5936 #ifdef CONFIG_SCHED_SMT
5937 static const struct cpumask *cpu_smt_mask(int cpu)
5938 {
5939  return topology_thread_cpumask(cpu);
5940 }
5941 #endif
5942 
5943 /*
5944  * Topology list, bottom-up.
5945  */
5946 static struct sched_domain_topology_level default_topology[] = {
5947 #ifdef CONFIG_SCHED_SMT
5948  { sd_init_SIBLING, cpu_smt_mask, },
5949 #endif
5950 #ifdef CONFIG_SCHED_MC
5951  { sd_init_MC, cpu_coregroup_mask, },
5952 #endif
5953 #ifdef CONFIG_SCHED_BOOK
5954  { sd_init_BOOK, cpu_book_mask, },
5955 #endif
5956  { sd_init_CPU, cpu_cpu_mask, },
5957  { NULL, },
5958 };
5959 
5960 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5961 
5962 #ifdef CONFIG_NUMA
5963 
5964 static int sched_domains_numa_levels;
5965 static int *sched_domains_numa_distance;
5966 static struct cpumask ***sched_domains_numa_masks;
5967 static int sched_domains_curr_level;
5968 
5969 static inline int sd_local_flags(int level)
5970 {
5971  if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5972  return 0;
5973 
5974  return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5975 }
5976 
5977 static struct sched_domain *
5978 sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5979 {
5980  struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5981  int level = tl->numa_level;
5982  int sd_weight = cpumask_weight(
5983  sched_domains_numa_masks[level][cpu_to_node(cpu)]);
5984 
5985  *sd = (struct sched_domain){
5986  .min_interval = sd_weight,
5987  .max_interval = 2*sd_weight,
5988  .busy_factor = 32,
5989  .imbalance_pct = 125,
5990  .cache_nice_tries = 2,
5991  .busy_idx = 3,
5992  .idle_idx = 2,
5993  .newidle_idx = 0,
5994  .wake_idx = 0,
5995  .forkexec_idx = 0,
5996 
5997  .flags = 1*SD_LOAD_BALANCE
5998  | 1*SD_BALANCE_NEWIDLE
5999  | 0*SD_BALANCE_EXEC
6000  | 0*SD_BALANCE_FORK
6001  | 0*SD_BALANCE_WAKE
6002  | 0*SD_WAKE_AFFINE
6003  | 0*SD_SHARE_CPUPOWER
6004  | 0*SD_SHARE_PKG_RESOURCES
6005  | 1*SD_SERIALIZE
6006  | 0*SD_PREFER_SIBLING
6007  | sd_local_flags(level)
6008  ,
6009  .last_balance = jiffies,
6010  .balance_interval = sd_weight,
6011  };
6012  SD_INIT_NAME(sd, NUMA);
6013  sd->private = &tl->data;
6014 
6015  /*
6016  * Ugly hack to pass state to sd_numa_mask()...
6017  */
6018  sched_domains_curr_level = tl->numa_level;
6019 
6020  return sd;
6021 }
6022 
6023 static const struct cpumask *sd_numa_mask(int cpu)
6024 {
6025  return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6026 }
6027 
6028 static void sched_numa_warn(const char *str)
6029 {
6030  static int done = false;
6031  int i,j;
6032 
6033  if (done)
6034  return;
6035 
6036  done = true;
6037 
6038  printk(KERN_WARNING "ERROR: %s\n\n", str);
6039 
6040  for (i = 0; i < nr_node_ids; i++) {
6041  printk(KERN_WARNING " ");
6042  for (j = 0; j < nr_node_ids; j++)
6043  printk(KERN_CONT "%02d ", node_distance(i,j));
6044  printk(KERN_CONT "\n");
6045  }
6046  printk(KERN_WARNING "\n");
6047 }
6048 
6049 static bool find_numa_distance(int distance)
6050 {
6051  int i;
6052 
6053  if (distance == node_distance(0, 0))
6054  return true;
6055 
6056  for (i = 0; i < sched_domains_numa_levels; i++) {
6057  if (sched_domains_numa_distance[i] == distance)
6058  return true;
6059  }
6060 
6061  return false;
6062 }
6063 
6064 static void sched_init_numa(void)
6065 {
6066  int next_distance, curr_distance = node_distance(0, 0);
6067  struct sched_domain_topology_level *tl;
6068  int level = 0;
6069  int i, j, k;
6070 
6071  sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6072  if (!sched_domains_numa_distance)
6073  return;
6074 
6075  /*
6076  * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6077  * unique distances in the node_distance() table.
6078  *
6079  * Assumes node_distance(0,j) includes all distances in
6080  * node_distance(i,j) in order to avoid cubic time.
6081  */
6082  next_distance = curr_distance;
6083  for (i = 0; i < nr_node_ids; i++) {
6084  for (j = 0; j < nr_node_ids; j++) {
6085  for (k = 0; k < nr_node_ids; k++) {
6086  int distance = node_distance(i, k);
6087 
6088  if (distance > curr_distance &&
6089  (distance < next_distance ||
6090  next_distance == curr_distance))
6091  next_distance = distance;
6092 
6093  /*
6094  * While not a strong assumption it would be nice to know
6095  * about cases where if node A is connected to B, B is not
6096  * equally connected to A.
6097  */
6098  if (sched_debug() && node_distance(k, i) != distance)
6099  sched_numa_warn("Node-distance not symmetric");
6100 
6101  if (sched_debug() && i && !find_numa_distance(distance))
6102  sched_numa_warn("Node-0 not representative");
6103  }
6104  if (next_distance != curr_distance) {
6105  sched_domains_numa_distance[level++] = next_distance;
6106  sched_domains_numa_levels = level;
6107  curr_distance = next_distance;
6108  } else break;
6109  }
6110 
6111  /*
6112  * In case of sched_debug() we verify the above assumption.
6113  */
6114  if (!sched_debug())
6115  break;
6116  }
6117  /*
6118  * 'level' contains the number of unique distances, excluding the
6119  * identity distance node_distance(i,i).
6120  *
6121  * The sched_domains_numa_distance[] array includes the actual distance
6122  * numbers.
6123  */
6124 
6125  /*
6126  * Here, we should temporarily reset sched_domains_numa_levels to 0.
6127  * If it fails to allocate memory for array sched_domains_numa_masks[][],
6128  * the array will contain fewer than 'level' members. This could be
6129  * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6130  * in other functions.
6131  *
6132  * We reset it to 'level' at the end of this function.
6133  */
6134  sched_domains_numa_levels = 0;
6135 
6136  sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6137  if (!sched_domains_numa_masks)
6138  return;
6139 
6140  /*
6141  * Now for each level, construct a mask per node which contains all
6142  * cpus of nodes that are that many hops away from us.
6143  */
6144  for (i = 0; i < level; i++) {
6145  sched_domains_numa_masks[i] =
6146  kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6147  if (!sched_domains_numa_masks[i])
6148  return;
6149 
6150  for (j = 0; j < nr_node_ids; j++) {
6151  struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6152  if (!mask)
6153  return;
6154 
6155  sched_domains_numa_masks[i][j] = mask;
6156 
6157  for (k = 0; k < nr_node_ids; k++) {
6158  if (node_distance(j, k) > sched_domains_numa_distance[i])
6159  continue;
6160 
6161  cpumask_or(mask, mask, cpumask_of_node(k));
6162  }
6163  }
6164  }
6165 
6166  tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6167  sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6168  if (!tl)
6169  return;
6170 
6171  /*
6172  * Copy the default topology bits..
6173  */
6174  for (i = 0; default_topology[i].init; i++)
6175  tl[i] = default_topology[i];
6176 
6177  /*
6178  * .. and append 'j' levels of NUMA goodness.
6179  */
6180  for (j = 0; j < level; i++, j++) {
6181  tl[i] = (struct sched_domain_topology_level){
6182  .init = sd_numa_init,
6183  .mask = sd_numa_mask,
6184  .flags = SDTL_OVERLAP,
6185  .numa_level = j,
6186  };
6187  }
6188 
6189  sched_domain_topology = tl;
6190 
6191  sched_domains_numa_levels = level;
6192 }
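/*
 * Illustrative walk-through, not part of core.c, for a hypothetical 4-node
 * machine whose (symmetric) node_distance() table is:
 *
 *        10 20 20 30
 *        20 10 20 20
 *        20 20 10 20
 *        30 20 20 10
 *
 * Starting from curr_distance = node_distance(0, 0) = 10, the selection
 * sort above first finds 20, then 30, then stops, so
 * sched_domains_numa_distance[] = { 20, 30 } and level = 2.  Level 0 gets
 * one mask per node covering all nodes at distance <= 20, level 1 covers
 * everything at distance <= 30 (here: all nodes), and two NUMA entries are
 * appended to the copy of default_topology[].
 */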
6193 
6194 static void sched_domains_numa_masks_set(int cpu)
6195 {
6196  int i, j;
6197  int node = cpu_to_node(cpu);
6198 
6199  for (i = 0; i < sched_domains_numa_levels; i++) {
6200  for (j = 0; j < nr_node_ids; j++) {
6201  if (node_distance(j, node) <= sched_domains_numa_distance[i])
6202  cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6203  }
6204  }
6205 }
6206 
6207 static void sched_domains_numa_masks_clear(int cpu)
6208 {
6209  int i, j;
6210  for (i = 0; i < sched_domains_numa_levels; i++) {
6211  for (j = 0; j < nr_node_ids; j++)
6212  cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6213  }
6214 }
6215 
6216 /*
6217  * Update sched_domains_numa_masks[level][node] array when new cpus
6218  * are onlined.
6219  */
6220 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6221  unsigned long action,
6222  void *hcpu)
6223 {
6224  int cpu = (long)hcpu;
6225 
6226  switch (action & ~CPU_TASKS_FROZEN) {
6227  case CPU_ONLINE:
6228  sched_domains_numa_masks_set(cpu);
6229  break;
6230 
6231  case CPU_DEAD:
6232  sched_domains_numa_masks_clear(cpu);
6233  break;
6234 
6235  default:
6236  return NOTIFY_DONE;
6237  }
6238 
6239  return NOTIFY_OK;
6240 }
6241 #else
6242 static inline void sched_init_numa(void)
6243 {
6244 }
6245 
6246 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6247  unsigned long action,
6248  void *hcpu)
6249 {
6250  return 0;
6251 }
6252 #endif /* CONFIG_NUMA */
6253 
6254 static int __sdt_alloc(const struct cpumask *cpu_map)
6255 {
6256  struct sched_domain_topology_level *tl;
6257  int j;
6258 
6259  for (tl = sched_domain_topology; tl->init; tl++) {
6260  struct sd_data *sdd = &tl->data;
6261 
6262  sdd->sd = alloc_percpu(struct sched_domain *);
6263  if (!sdd->sd)
6264  return -ENOMEM;
6265 
6266  sdd->sg = alloc_percpu(struct sched_group *);
6267  if (!sdd->sg)
6268  return -ENOMEM;
6269 
6270  sdd->sgp = alloc_percpu(struct sched_group_power *);
6271  if (!sdd->sgp)
6272  return -ENOMEM;
6273 
6274  for_each_cpu(j, cpu_map) {
6275  struct sched_domain *sd;
6276  struct sched_group *sg;
6277  struct sched_group_power *sgp;
6278 
6279  sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6280  GFP_KERNEL, cpu_to_node(j));
6281  if (!sd)
6282  return -ENOMEM;
6283 
6284  *per_cpu_ptr(sdd->sd, j) = sd;
6285 
6286  sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6287  GFP_KERNEL, cpu_to_node(j));
6288  if (!sg)
6289  return -ENOMEM;
6290 
6291  sg->next = sg;
6292 
6293  *per_cpu_ptr(sdd->sg, j) = sg;
6294 
6295  sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6296  GFP_KERNEL, cpu_to_node(j));
6297  if (!sgp)
6298  return -ENOMEM;
6299 
6300  *per_cpu_ptr(sdd->sgp, j) = sgp;
6301  }
6302  }
6303 
6304  return 0;
6305 }
6306 
6307 static void __sdt_free(const struct cpumask *cpu_map)
6308 {
6309  struct sched_domain_topology_level *tl;
6310  int j;
6311 
6312  for (tl = sched_domain_topology; tl->init; tl++) {
6313  struct sd_data *sdd = &tl->data;
6314 
6315  for_each_cpu(j, cpu_map) {
6316  struct sched_domain *sd;
6317 
6318  if (sdd->sd) {
6319  sd = *per_cpu_ptr(sdd->sd, j);
6320  if (sd && (sd->flags & SD_OVERLAP))
6321  free_sched_groups(sd->groups, 0);
6322  kfree(*per_cpu_ptr(sdd->sd, j));
6323  }
6324 
6325  if (sdd->sg)
6326  kfree(*per_cpu_ptr(sdd->sg, j));
6327  if (sdd->sgp)
6328  kfree(*per_cpu_ptr(sdd->sgp, j));
6329  }
6330  free_percpu(sdd->sd);
6331  sdd->sd = NULL;
6332  free_percpu(sdd->sg);
6333  sdd->sg = NULL;
6334  free_percpu(sdd->sgp);
6335  sdd->sgp = NULL;
6336  }
6337 }
6338 
6339 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6340  struct s_data *d, const struct cpumask *cpu_map,
6341  struct sched_domain_attr *attr, struct sched_domain *child,
6342  int cpu)
6343 {
6344  struct sched_domain *sd = tl->init(tl, cpu);
6345  if (!sd)
6346  return child;
6347 
6348  cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6349  if (child) {
6350  sd->level = child->level + 1;
6351  sched_domain_level_max = max(sched_domain_level_max, sd->level);
6352  child->parent = sd;
6353  }
6354  sd->child = child;
6355  set_domain_attribute(sd, attr);
6356 
6357  return sd;
6358 }
6359 
6360 /*
6361  * Build sched domains for a given set of cpus and attach the sched domains
6362  * to the individual cpus
6363  */
6364 static int build_sched_domains(const struct cpumask *cpu_map,
6365  struct sched_domain_attr *attr)
6366 {
6367  enum s_alloc alloc_state = sa_none;
6368  struct sched_domain *sd;
6369  struct s_data d;
6370  int i, ret = -ENOMEM;
6371 
6372  alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6373  if (alloc_state != sa_rootdomain)
6374  goto error;
6375 
6376  /* Set up domains for cpus specified by the cpu_map. */
6377  for_each_cpu(i, cpu_map) {
6378  struct sched_domain_topology_level *tl;
6379 
6380  sd = NULL;
6381  for (tl = sched_domain_topology; tl->init; tl++) {
6382  sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6383  if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6384  sd->flags |= SD_OVERLAP;
6385  if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6386  break;
6387  }
6388 
6389  while (sd->child)
6390  sd = sd->child;
6391 
6392  *per_cpu_ptr(d.sd, i) = sd;
6393  }
6394 
6395  /* Build the groups for the domains */
6396  for_each_cpu(i, cpu_map) {
6397  for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6398  sd->span_weight = cpumask_weight(sched_domain_span(sd));
6399  if (sd->flags & SD_OVERLAP) {
6400  if (build_overlap_sched_groups(sd, i))
6401  goto error;
6402  } else {
6403  if (build_sched_groups(sd, i))
6404  goto error;
6405  }
6406  }
6407  }
6408 
6409  /* Calculate CPU power for physical packages and nodes */
6410  for (i = nr_cpumask_bits-1; i >= 0; i--) {
6411  if (!cpumask_test_cpu(i, cpu_map))
6412  continue;
6413 
6414  for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6415  claim_allocations(i, sd);
6416  init_sched_groups_power(i, sd);
6417  }
6418  }
6419 
6420  /* Attach the domains */
6421  rcu_read_lock();
6422  for_each_cpu(i, cpu_map) {
6423  sd = *per_cpu_ptr(d.sd, i);
6424  cpu_attach_domain(sd, d.rd, i);
6425  }
6426  rcu_read_unlock();
6427 
6428  ret = 0;
6429 error:
6430  __free_domain_allocs(&d, alloc_state, cpu_map);
6431  return ret;
6432 }
6433 
6434 static cpumask_var_t *doms_cur; /* current sched domains */
6435 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6436 static struct sched_domain_attr *dattr_cur;
6437  /* attributes of custom domains in 'doms_cur' */
6438 
6439 /*
6440  * Special case: If a kmalloc of a doms_cur partition (array of
6441  * cpumask) fails, then fallback to a single sched domain,
6442  * as determined by the single cpumask fallback_doms.
6443  */
6444 static cpumask_var_t fallback_doms;
6445 
6446 /*
6447  * arch_update_cpu_topology lets virtualized architectures update the
6448  * cpu core maps. It is supposed to return 1 if the topology changed
6449  * or 0 if it stayed the same.
6450  */
6451 int __attribute__((weak)) arch_update_cpu_topology(void)
6452 {
6453  return 0;
6454 }
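/*
 * Illustrative sketch, not part of core.c: an architecture (or a platform
 * running under a hypervisor) can supply a strong definition that overrides
 * the __weak stub above.  The body here is hypothetical; only the function
 * name and return convention are real.
 */
int arch_update_cpu_topology(void)
{
	bool changed = false;

	/* Re-read the platform's core/package maps here and set 'changed'. */

	return changed ? 1 : 0;	/* non-zero => sched domains get rebuilt */
}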
6455 
6456 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6457 {
6458  int i;
6459  cpumask_var_t *doms;
6460 
6461  doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6462  if (!doms)
6463  return NULL;
6464  for (i = 0; i < ndoms; i++) {
6465  if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6466  free_sched_domains(doms, i);
6467  return NULL;
6468  }
6469  }
6470  return doms;
6471 }
6472 
6473 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6474 {
6475  unsigned int i;
6476  for (i = 0; i < ndoms; i++)
6477  free_cpumask_var(doms[i]);
6478  kfree(doms);
6479 }
6480 
6481 /*
6482  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6483  * For now this just excludes isolated cpus, but could be used to
6484  * exclude other special cases in the future.
6485  */
6486 static int init_sched_domains(const struct cpumask *cpu_map)
6487 {
6488  int err;
6489 
6490  arch_update_cpu_topology();
6491  ndoms_cur = 1;
6492  doms_cur = alloc_sched_domains(ndoms_cur);
6493  if (!doms_cur)
6494  doms_cur = &fallback_doms;
6495  cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6496  err = build_sched_domains(doms_cur[0], NULL);
6497  register_sched_domain_sysctl();
6498 
6499  return err;
6500 }
6501 
6502 /*
6503  * Detach sched domains from a group of cpus specified in cpu_map
6504  * These cpus will now be attached to the NULL domain
6505  */
6506 static void detach_destroy_domains(const struct cpumask *cpu_map)
6507 {
6508  int i;
6509 
6510  rcu_read_lock();
6511  for_each_cpu(i, cpu_map)
6512  cpu_attach_domain(NULL, &def_root_domain, i);
6513  rcu_read_unlock();
6514 }
6515 
6516 /* handle null as "default" */
6517 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6518  struct sched_domain_attr *new, int idx_new)
6519 {
6520  struct sched_domain_attr tmp;
6521 
6522  /* fast path */
6523  if (!new && !cur)
6524  return 1;
6525 
6526  tmp = SD_ATTR_INIT;
6527  return !memcmp(cur ? (cur + idx_cur) : &tmp,
6528  new ? (new + idx_new) : &tmp,
6529  sizeof(struct sched_domain_attr));
6530 }
6531 
6532 /*
6533  * Partition sched domains as specified by the 'ndoms_new'
6534  * cpumasks in the array doms_new[] of cpumasks. This compares
6535  * doms_new[] to the current sched domain partitioning, doms_cur[].
6536  * It destroys each deleted domain and builds each new domain.
6537  *
6538  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6539  * The masks don't intersect (don't overlap). We should set up one
6540  * sched domain for each mask. CPUs not in any of the cpumasks will
6541  * not be load balanced. If the same cpumask appears both in the
6542  * current 'doms_cur' domains and in the new 'doms_new', we can leave
6543  * it as it is.
6544  *
6545  * The passed in 'doms_new' should be allocated using
6546  * alloc_sched_domains. This routine takes ownership of it and will
6547  * free_sched_domains it when done with it. If the caller failed the
6548  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6549  * and partition_sched_domains() will fall back to the single partition
6550  * 'fallback_doms'; it also forces the domains to be rebuilt.
6551  *
6552  * If doms_new == NULL it will be replaced with cpu_online_mask.
6553  * ndoms_new == 0 is a special case for destroying existing domains,
6554  * and it will not create the default domain.
6555  *
6556  * Call with hotplug lock held
6557  */
6558 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6559  struct sched_domain_attr *dattr_new)
6560 {
6561  int i, j, n;
6562  int new_topology;
6563 
6564  mutex_lock(&sched_domains_mutex);
6565 
6566  /* always unregister in case we don't destroy any domains */
6567  unregister_sched_domain_sysctl();
6568 
6569  /* Let architecture update cpu core mappings. */
6570  new_topology = arch_update_cpu_topology();
6571 
6572  n = doms_new ? ndoms_new : 0;
6573 
6574  /* Destroy deleted domains */
6575  for (i = 0; i < ndoms_cur; i++) {
6576  for (j = 0; j < n && !new_topology; j++) {
6577  if (cpumask_equal(doms_cur[i], doms_new[j])
6578  && dattrs_equal(dattr_cur, i, dattr_new, j))
6579  goto match1;
6580  }
6581  /* no match - a current sched domain not in new doms_new[] */
6582  detach_destroy_domains(doms_cur[i]);
6583 match1:
6584  ;
6585  }
6586 
6587  if (doms_new == NULL) {
6588  ndoms_cur = 0;
6589  doms_new = &fallback_doms;
6590  cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6591  WARN_ON_ONCE(dattr_new);
6592  }
6593 
6594  /* Build new domains */
6595  for (i = 0; i < ndoms_new; i++) {
6596  for (j = 0; j < ndoms_cur && !new_topology; j++) {
6597  if (cpumask_equal(doms_new[i], doms_cur[j])
6598  && dattrs_equal(dattr_new, i, dattr_cur, j))
6599  goto match2;
6600  }
6601  /* no match - add a new doms_new */
6602  build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6603 match2:
6604  ;
6605  }
6606 
6607  /* Remember the new sched domains */
6608  if (doms_cur != &fallback_doms)
6609  free_sched_domains(doms_cur, ndoms_cur);
6610  kfree(dattr_cur); /* kfree(NULL) is safe */
6611  doms_cur = doms_new;
6612  dattr_cur = dattr_new;
6613  ndoms_cur = ndoms_new;
6614 
6615  register_sched_domain_sysctl();
6616 
6617  mutex_unlock(&sched_domains_mutex);
6618 }
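
For illustration only (not part of core.c): a minimal sketch of how a caller might hand a new partitioning to partition_sched_domains() while following the ownership rules spelled out in the comment above. The two-mask split and the build_two_partitions() helper are hypothetical; alloc_sched_domains() is the allocator the comment refers to.

/* Hypothetical caller sketch; assumes the hotplug lock is already held. */
static void build_two_partitions(const struct cpumask *a, const struct cpumask *b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms) {
		/* Allocation failed: fall back to one domain over the active CPUs. */
		partition_sched_domains(1, NULL, NULL);
		return;
	}
	cpumask_copy(doms[0], a);
	cpumask_copy(doms[1], b);

	partition_sched_domains(2, doms, NULL);
	/* partition_sched_domains() now owns 'doms'; the caller must not free it. */
}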
6619 
6620 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
6621 
6622 /*
6623  * Update cpusets according to cpu_active mask. If cpusets are
6624  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6625  * around partition_sched_domains().
6626  *
6627  * If we come here as part of a suspend/resume, don't touch cpusets because we
6628  * want to restore them to their original state upon resume anyway.
6629  */
6630 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6631  void *hcpu)
6632 {
6633  switch (action) {
6634  case CPU_ONLINE_FROZEN:
6635  case CPU_DOWN_FAILED_FROZEN:
6636 
6637  /*
6638  * num_cpus_frozen tracks how many CPUs are involved in the suspend/
6639  * resume sequence. As long as this is not the last online
6640  * operation in the resume sequence, just build a single sched
6641  * domain, ignoring cpusets.
6642  */
6643  num_cpus_frozen--;
6644  if (likely(num_cpus_frozen)) {
6645  partition_sched_domains(1, NULL, NULL);
6646  break;
6647  }
6648 
6649  /*
6650  * This is the last CPU online operation. So fall through and
6651  * restore the original sched domains by considering the
6652  * cpuset configurations.
6653  */
6654 
6655  case CPU_ONLINE:
6656  case CPU_DOWN_FAILED:
6657  cpuset_update_active_cpus(true);
6658  break;
6659  default:
6660  return NOTIFY_DONE;
6661  }
6662  return NOTIFY_OK;
6663 }
6664 
6665 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6666  void *hcpu)
6667 {
6668  switch (action) {
6669  case CPU_DOWN_PREPARE:
6670  cpuset_update_active_cpus(false);
6671  break;
6672  case CPU_DOWN_PREPARE_FROZEN:
6673  num_cpus_frozen++;
6674  partition_sched_domains(1, NULL, NULL);
6675  break;
6676  default:
6677  return NOTIFY_DONE;
6678  }
6679  return NOTIFY_OK;
6680 }
6681 
6682 void __init sched_init_smp(void)
6683 {
6684  cpumask_var_t non_isolated_cpus;
6685 
6686  alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6687  alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6688 
6689  sched_init_numa();
6690 
6691  get_online_cpus();
6692  mutex_lock(&sched_domains_mutex);
6693  init_sched_domains(cpu_active_mask);
6694  cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6695  if (cpumask_empty(non_isolated_cpus))
6696  cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6697  mutex_unlock(&sched_domains_mutex);
6698  put_online_cpus();
6699 
6700  hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6701  hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6702  hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6703 
6704  /* RT runtime code needs to handle some hotplug events */
6705  hotcpu_notifier(update_runtime, 0);
6706 
6707  init_hrtick();
6708 
6709  /* Move init over to a non-isolated CPU */
6710  if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6711  BUG();
6712  sched_init_granularity();
6713  free_cpumask_var(non_isolated_cpus);
6714 
6715  init_sched_rt_class();
6716 }
6717 #else
6718 void __init sched_init_smp(void)
6719 {
6720  sched_init_granularity();
6721 }
6722 #endif /* CONFIG_SMP */
6723 
6724 const_debug unsigned int sysctl_timer_migration = 1;
6725 
6726 int in_sched_functions(unsigned long addr)
6727 {
6728  return in_lock_functions(addr) ||
6729  (addr >= (unsigned long)__sched_text_start
6730  && addr < (unsigned long)__sched_text_end);
6731 }
6732 
6733 #ifdef CONFIG_CGROUP_SCHED
6734 struct task_group root_task_group;
6735 LIST_HEAD(task_groups);
6736 #endif
6737 
6738 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6739 
6740 void __init sched_init(void)
6741 {
6742  int i, j;
6743  unsigned long alloc_size = 0, ptr;
6744 
6745 #ifdef CONFIG_FAIR_GROUP_SCHED
6746  alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6747 #endif
6748 #ifdef CONFIG_RT_GROUP_SCHED
6749  alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6750 #endif
6751 #ifdef CONFIG_CPUMASK_OFFSTACK
6752  alloc_size += num_possible_cpus() * cpumask_size();
6753 #endif
6754  if (alloc_size) {
6755  ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6756 
6757 #ifdef CONFIG_FAIR_GROUP_SCHED
6758  root_task_group.se = (struct sched_entity **)ptr;
6759  ptr += nr_cpu_ids * sizeof(void **);
6760 
6761  root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6762  ptr += nr_cpu_ids * sizeof(void **);
6763 
6764 #endif /* CONFIG_FAIR_GROUP_SCHED */
6765 #ifdef CONFIG_RT_GROUP_SCHED
6766  root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6767  ptr += nr_cpu_ids * sizeof(void **);
6768 
6769  root_task_group.rt_rq = (struct rt_rq **)ptr;
6770  ptr += nr_cpu_ids * sizeof(void **);
6771 
6772 #endif /* CONFIG_RT_GROUP_SCHED */
6773 #ifdef CONFIG_CPUMASK_OFFSTACK
6774  for_each_possible_cpu(i) {
6775  per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6776  ptr += cpumask_size();
6777  }
6778 #endif /* CONFIG_CPUMASK_OFFSTACK */
6779  }
6780 
6781 #ifdef CONFIG_SMP
6782  init_defrootdomain();
6783 #endif
6784 
6785  init_rt_bandwidth(&def_rt_bandwidth,
6786  global_rt_period(), global_rt_runtime());
6787 
6788 #ifdef CONFIG_RT_GROUP_SCHED
6789  init_rt_bandwidth(&root_task_group.rt_bandwidth,
6790  global_rt_period(), global_rt_runtime());
6791 #endif /* CONFIG_RT_GROUP_SCHED */
6792 
6793 #ifdef CONFIG_CGROUP_SCHED
6794  list_add(&root_task_group.list, &task_groups);
6795  INIT_LIST_HEAD(&root_task_group.children);
6796  INIT_LIST_HEAD(&root_task_group.siblings);
6797  autogroup_init(&init_task);
6798 
6799 #endif /* CONFIG_CGROUP_SCHED */
6800 
6801 #ifdef CONFIG_CGROUP_CPUACCT
6802  root_cpuacct.cpustat = &kernel_cpustat;
6803  root_cpuacct.cpuusage = alloc_percpu(u64);
6804  /* Too early, not expected to fail */
6805  BUG_ON(!root_cpuacct.cpuusage);
6806 #endif
6807  for_each_possible_cpu(i) {
6808  struct rq *rq;
6809 
6810  rq = cpu_rq(i);
6811  raw_spin_lock_init(&rq->lock);
6812  rq->nr_running = 0;
6813  rq->calc_load_active = 0;
6814  rq->calc_load_update = jiffies + LOAD_FREQ;
6815  init_cfs_rq(&rq->cfs);
6816  init_rt_rq(&rq->rt, rq);
6817 #ifdef CONFIG_FAIR_GROUP_SCHED
6818  root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6819  INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6820  /*
6821  * How much cpu bandwidth does root_task_group get?
6822  *
6823  * In the case of task groups formed through the cgroup filesystem, it
6824  * gets 100% of the cpu resources in the system. This overall
6825  * system cpu resource is divided among the tasks of
6826  * root_task_group and its child task-groups in a fair manner,
6827  * based on each entity's (task or task-group's) weight
6828  * (se->load.weight).
6829  *
6830  * In other words, if root_task_group has 10 tasks of weight
6831  * 1024 and two child groups A0 and A1 (of weight 1024 each),
6832  * then A0's share of the cpu resource is:
6833  *
6834  * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6835  *
6836  * We achieve this by letting root_task_group's tasks sit
6837  * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
6838  */
6839  init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6840  init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6841 #endif /* CONFIG_FAIR_GROUP_SCHED */
6842 
6843  rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6844 #ifdef CONFIG_RT_GROUP_SCHED
6845  INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6846  init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6847 #endif
6848 
6849  for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6850  rq->cpu_load[j] = 0;
6851 
6852  rq->last_load_update_tick = jiffies;
6853 
6854 #ifdef CONFIG_SMP
6855  rq->sd = NULL;
6856  rq->rd = NULL;
6857  rq->cpu_power = SCHED_POWER_SCALE;
6858  rq->post_schedule = 0;
6859  rq->active_balance = 0;
6860  rq->next_balance = jiffies;
6861  rq->push_cpu = 0;
6862  rq->cpu = i;
6863  rq->online = 0;
6864  rq->idle_stamp = 0;
6865  rq->avg_idle = 2*sysctl_sched_migration_cost;
6866 
6867  INIT_LIST_HEAD(&rq->cfs_tasks);
6868 
6869  rq_attach_root(rq, &def_root_domain);
6870 #ifdef CONFIG_NO_HZ
6871  rq->nohz_flags = 0;
6872 #endif
6873 #endif
6874  init_rq_hrtick(rq);
6875  atomic_set(&rq->nr_iowait, 0);
6876  }
6877 
6878  set_load_weight(&init_task);
6879 
6880 #ifdef CONFIG_PREEMPT_NOTIFIERS
6881  INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6882 #endif
6883 
6884 #ifdef CONFIG_RT_MUTEXES
6885  plist_head_init(&init_task.pi_waiters);
6886 #endif
6887 
6888  /*
6889  * The boot idle thread does lazy MMU switching as well:
6890  */
6891  atomic_inc(&init_mm.mm_count);
6892  enter_lazy_tlb(&init_mm, current);
6893 
6894  /*
6895  * Make us the idle thread. Technically, schedule() should not be
6896  * called from this thread; however, somewhere below it might be.
6897  * Because we are the idle thread, we just pick up running again
6898  * when this runqueue becomes "idle".
6899  */
6900  init_idle(current, smp_processor_id());
6901 
6902  calc_load_update = jiffies + LOAD_FREQ;
6903 
6904  /*
6905  * During early bootup we pretend to be a normal task:
6906  */
6907  current->sched_class = &fair_sched_class;
6908 
6909 #ifdef CONFIG_SMP
6910  zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6911  /* May be allocated at isolcpus cmdline parse time */
6912  if (cpu_isolated_map == NULL)
6913  zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6914  idle_thread_set_boot_cpu();
6915 #endif
6916  init_sched_fair_class();
6917 
6918  scheduler_running = 1;
6919 }
6920 
6921 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6922 static inline int preempt_count_equals(int preempt_offset)
6923 {
6924  int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6925 
6926  return (nested == preempt_offset);
6927 }
6928 
6929 void __might_sleep(const char *file, int line, int preempt_offset)
6930 {
6931  static unsigned long prev_jiffy; /* ratelimiting */
6932 
6933  rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6934  if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
6935  system_state != SYSTEM_RUNNING || oops_in_progress)
6936  return;
6937  if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6938  return;
6939  prev_jiffy = jiffies;
6940 
6942  "BUG: sleeping function called from invalid context at %s:%d\n",
6943  file, line);
6945  "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6946  in_atomic(), irqs_disabled(),
6947  current->pid, current->comm);
6948 
6949  debug_show_held_locks(current);
6950  if (irqs_disabled())
6951  print_irqtrace_events(current);
6952  dump_stack();
6953 }
6954 EXPORT_SYMBOL(__might_sleep);
6955 #endif
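
For illustration only: __might_sleep() is normally reached through the might_sleep() annotation, which (with CONFIG_DEBUG_ATOMIC_SLEEP) roughly expands to __might_sleep(__FILE__, __LINE__, 0). Functions that can block place it at their entry, so the check above fires as soon as they are called from atomic context, not only when they actually sleep. A minimal, hypothetical sketch:

#include <linux/kernel.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);

/* Hypothetical helper: it may block, so it announces that up front. */
static void example_blocking_helper(void)
{
	might_sleep();			/* warns here if called with preemption or IRQs disabled */
	mutex_lock(&example_lock);
	/* ... do work that may sleep ... */
	mutex_unlock(&example_lock);
}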
6956 
6957 #ifdef CONFIG_MAGIC_SYSRQ
6958 static void normalize_task(struct rq *rq, struct task_struct *p)
6959 {
6960  const struct sched_class *prev_class = p->sched_class;
6961  int old_prio = p->prio;
6962  int on_rq;
6963 
6964  on_rq = p->on_rq;
6965  if (on_rq)
6966  dequeue_task(rq, p, 0);
6967  __setscheduler(rq, p, SCHED_NORMAL, 0);
6968  if (on_rq) {
6969  enqueue_task(rq, p, 0);
6970  resched_task(rq->curr);
6971  }
6972 
6973  check_class_changed(rq, p, prev_class, old_prio);
6974 }
6975 
6976 void normalize_rt_tasks(void)
6977 {
6978  struct task_struct *g, *p;
6979  unsigned long flags;
6980  struct rq *rq;
6981 
6982  read_lock_irqsave(&tasklist_lock, flags);
6983  do_each_thread(g, p) {
6984  /*
6985  * Only normalize user tasks:
6986  */
6987  if (!p->mm)
6988  continue;
6989 
6990  p->se.exec_start = 0;
6991 #ifdef CONFIG_SCHEDSTATS
6992  p->se.statistics.wait_start = 0;
6993  p->se.statistics.sleep_start = 0;
6994  p->se.statistics.block_start = 0;
6995 #endif
6996 
6997  if (!rt_task(p)) {
6998  /*
6999  * Renice negative nice level userspace
7000  * tasks back to 0:
7001  */
7002  if (TASK_NICE(p) < 0 && p->mm)
7003  set_user_nice(p, 0);
7004  continue;
7005  }
7006 
7007  raw_spin_lock(&p->pi_lock);
7008  rq = __task_rq_lock(p);
7009 
7010  normalize_task(rq, p);
7011 
7012  __task_rq_unlock(rq);
7013  raw_spin_unlock(&p->pi_lock);
7014  } while_each_thread(g, p);
7015 
7016  read_unlock_irqrestore(&tasklist_lock, flags);
7017 }
7018 
7019 #endif /* CONFIG_MAGIC_SYSRQ */
7020 
7021 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7022 /*
7023  * These functions are only useful for the IA64 MCA handling, or kdb.
7024  *
7025  * They can only be called when the whole system has been
7026  * stopped - every CPU needs to be quiescent, and no scheduling
7027  * activity can take place. Using them for anything else would
7028  * be a serious bug, and as a result, they aren't even visible
7029  * under any other configuration.
7030  */
7031 
7038 struct task_struct *curr_task(int cpu)
7039 {
7040  return cpu_curr(cpu);
7041 }
7042 
7043 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7044 
7045 #ifdef CONFIG_IA64
7046 
7061 void set_curr_task(int cpu, struct task_struct *p)
7062 {
7063  cpu_curr(cpu) = p;
7064 }
7065 
7066 #endif
7067 
7068 #ifdef CONFIG_CGROUP_SCHED
7069 /* task_group_lock serializes the addition/removal of task groups */
7070 static DEFINE_SPINLOCK(task_group_lock);
7071 
7072 static void free_sched_group(struct task_group *tg)
7073 {
7074  free_fair_sched_group(tg);
7075  free_rt_sched_group(tg);
7076  autogroup_free(tg);
7077  kfree(tg);
7078 }
7079 
7080 /* allocate runqueue etc for a new task group */
7081 struct task_group *sched_create_group(struct task_group *parent)
7082 {
7083  struct task_group *tg;
7084  unsigned long flags;
7085 
7086  tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7087  if (!tg)
7088  return ERR_PTR(-ENOMEM);
7089 
7090  if (!alloc_fair_sched_group(tg, parent))
7091  goto err;
7092 
7093  if (!alloc_rt_sched_group(tg, parent))
7094  goto err;
7095 
7096  spin_lock_irqsave(&task_group_lock, flags);
7097  list_add_rcu(&tg->list, &task_groups);
7098 
7099  WARN_ON(!parent); /* root should already exist */
7100 
7101  tg->parent = parent;
7102  INIT_LIST_HEAD(&tg->children);
7103  list_add_rcu(&tg->siblings, &parent->children);
7104  spin_unlock_irqrestore(&task_group_lock, flags);
7105 
7106  return tg;
7107 
7108 err:
7109  free_sched_group(tg);
7110  return ERR_PTR(-ENOMEM);
7111 }
7112 
7113 /* rcu callback to free various structures associated with a task group */
7114 static void free_sched_group_rcu(struct rcu_head *rhp)
7115 {
7116  /* now it should be safe to free those cfs_rqs */
7117  free_sched_group(container_of(rhp, struct task_group, rcu));
7118 }
7119 
7120 /* Destroy runqueue etc associated with a task group */
7121 void sched_destroy_group(struct task_group *tg)
7122 {
7123  unsigned long flags;
7124  int i;
7125 
7126  /* end participation in shares distribution */
7127  for_each_possible_cpu(i)
7128  unregister_fair_sched_group(tg, i);
7129 
7130  spin_lock_irqsave(&task_group_lock, flags);
7131  list_del_rcu(&tg->list);
7132  list_del_rcu(&tg->siblings);
7133  spin_unlock_irqrestore(&task_group_lock, flags);
7134 
7135  /* wait for possible concurrent references to cfs_rqs to complete */
7136  call_rcu(&tg->rcu, free_sched_group_rcu);
7137 }
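
For illustration only: the cpu cgroup code later in this file is the real user of these two entry points, but the create/destroy pairing reduces to the sketch below. The helper names are hypothetical.

/* Hypothetical sketch of pairing sched_create_group() with sched_destroy_group(). */
static struct task_group *make_child_group(void)
{
	struct task_group *tg = sched_create_group(&root_task_group);

	if (IS_ERR(tg))
		return NULL;		/* allocating the per-cpu cfs/rt runqueues failed */
	return tg;
}

static void drop_child_group(struct task_group *tg)
{
	sched_destroy_group(tg);	/* actual freeing happens later, via RCU */
}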
7138 
7139 /* change task's runqueue when it moves between groups.
7140  * The caller of this function should have put the task in its new group
7141  * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7142  * reflect its new group.
7143  */
7144 void sched_move_task(struct task_struct *tsk)
7145 {
7146  struct task_group *tg;
7147  int on_rq, running;
7148  unsigned long flags;
7149  struct rq *rq;
7150 
7151  rq = task_rq_lock(tsk, &flags);
7152 
7153  running = task_current(rq, tsk);
7154  on_rq = tsk->on_rq;
7155 
7156  if (on_rq)
7157  dequeue_task(rq, tsk, 0);
7158  if (unlikely(running))
7159  tsk->sched_class->put_prev_task(rq, tsk);
7160 
7161  tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7162  lockdep_is_held(&tsk->sighand->siglock)),
7163  struct task_group, css);
7164  tg = autogroup_task_group(tsk, tg);
7165  tsk->sched_task_group = tg;
7166 
7167 #ifdef CONFIG_FAIR_GROUP_SCHED
7168  if (tsk->sched_class->task_move_group)
7169  tsk->sched_class->task_move_group(tsk, on_rq);
7170  else
7171 #endif
7172  set_task_rq(tsk, task_cpu(tsk));
7173 
7174  if (unlikely(running))
7175  tsk->sched_class->set_curr_task(rq);
7176  if (on_rq)
7177  enqueue_task(rq, tsk, 0);
7178 
7179  task_rq_unlock(rq, tsk, &flags);
7180 }
7181 #endif /* CONFIG_CGROUP_SCHED */
7182 
7183 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7184 static unsigned long to_ratio(u64 period, u64 runtime)
7185 {
7186  if (runtime == RUNTIME_INF)
7187  return 1ULL << 20;
7188 
7189  return div64_u64(runtime << 20, period);
7190 }
7191 #endif
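
For illustration only: to_ratio() expresses runtime/period as a fixed-point fraction scaled by 2^20, so RUNTIME_INF maps to exactly 1.0 in that representation and the default RT limit of 950000us out of 1000000us comes out just under full scale. A standalone userspace check of the arithmetic (not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Userspace mirror of to_ratio() for a quick sanity check. */
static unsigned long ratio(uint64_t period, uint64_t runtime)
{
	return (unsigned long)((runtime << 20) / period);
}

int main(void)
{
	/* Default RT limits: 950000us of runtime per 1000000us period. */
	printf("%lu of %lu\n", ratio(1000000, 950000), 1UL << 20);
	/* Prints 996147 of 1048576, i.e. roughly 95%. */
	return 0;
}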
7192 
7193 #ifdef CONFIG_RT_GROUP_SCHED
7194 /*
7195  * Ensure that the real time constraints are schedulable.
7196  */
7197 static DEFINE_MUTEX(rt_constraints_mutex);
7198 
7199 /* Must be called with tasklist_lock held */
7200 static inline int tg_has_rt_tasks(struct task_group *tg)
7201 {
7202  struct task_struct *g, *p;
7203 
7204  do_each_thread(g, p) {
7205  if (rt_task(p) && task_rq(p)->rt.tg == tg)
7206  return 1;
7207  } while_each_thread(g, p);
7208 
7209  return 0;
7210 }
7211 
7212 struct rt_schedulable_data {
7213  struct task_group *tg;
7214  u64 rt_period;
7215  u64 rt_runtime;
7216 };
7217 
7218 static int tg_rt_schedulable(struct task_group *tg, void *data)
7219 {
7220  struct rt_schedulable_data *d = data;
7221  struct task_group *child;
7222  unsigned long total, sum = 0;
7223  u64 period, runtime;
7224 
7225  period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7226  runtime = tg->rt_bandwidth.rt_runtime;
7227 
7228  if (tg == d->tg) {
7229  period = d->rt_period;
7230  runtime = d->rt_runtime;
7231  }
7232 
7233  /*
7234  * Cannot have more runtime than the period.
7235  */
7236  if (runtime > period && runtime != RUNTIME_INF)
7237  return -EINVAL;
7238 
7239  /*
7240  * Ensure we don't starve existing RT tasks.
7241  */
7242  if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7243  return -EBUSY;
7244 
7245  total = to_ratio(period, runtime);
7246 
7247  /*
7248  * Nobody can have more than the global setting allows.
7249  */
7250  if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7251  return -EINVAL;
7252 
7253  /*
7254  * The sum of our children's runtime should not exceed our own.
7255  */
7256  list_for_each_entry_rcu(child, &tg->children, siblings) {
7257  period = ktime_to_ns(child->rt_bandwidth.rt_period);
7258  runtime = child->rt_bandwidth.rt_runtime;
7259 
7260  if (child == d->tg) {
7261  period = d->rt_period;
7262  runtime = d->rt_runtime;
7263  }
7264 
7265  sum += to_ratio(period, runtime);
7266  }
7267 
7268  if (sum > total)
7269  return -EINVAL;
7270 
7271  return 0;
7272 }
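
For illustration only: a standalone userspace sketch of the admission arithmetic tg_rt_schedulable() performs, using the same 2^20 fixed-point ratios. The example numbers are made up; only the global 95% figure matches the scheduler's defaults.

#include <stdio.h>
#include <stdint.h>

static uint64_t ratio(uint64_t period_us, uint64_t runtime_us)
{
	return (runtime_us << 20) / period_us;
}

int main(void)
{
	uint64_t global  = ratio(1000000, 950000);	/* global cap: 95% */
	uint64_t parent  = ratio(1000000, 400000);	/* a group asking for 40% */
	uint64_t child_a = ratio(1000000, 250000);	/* two children asking for 25% each */
	uint64_t child_b = ratio(1000000, 250000);

	/* Nobody can have more than the global setting allows. */
	printf("parent within global: %d\n", parent <= global);		/* prints 1 */
	/* The children's sum must not exceed the parent, so this case would be -EINVAL. */
	printf("children fit in parent: %d\n", child_a + child_b <= parent);	/* prints 0 */
	return 0;
}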
7273 
7274 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7275 {
7276  int ret;
7277 
7278  struct rt_schedulable_data data = {
7279  .tg = tg,
7280  .rt_period = period,
7281  .rt_runtime = runtime,
7282  };
7283 
7284  rcu_read_lock();
7285  ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7286  rcu_read_unlock();
7287 
7288  return ret;
7289 }
7290 
7291 static int tg_set_rt_bandwidth(struct task_group *tg,
7292  u64 rt_period, u64 rt_runtime)
7293 {
7294  int i, err = 0;
7295 
7296  mutex_lock(&rt_constraints_mutex);
7297  read_lock(&tasklist_lock);
7298  err = __rt_schedulable(tg, rt_period, rt_runtime);
7299  if (err)
7300  goto unlock;
7301 
7302  raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7303  tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7304  tg->rt_bandwidth.rt_runtime = rt_runtime;
7305 
7306  for_each_possible_cpu(i) {
7307  struct rt_rq *rt_rq = tg->rt_rq[i];
7308 
7309  raw_spin_lock(&rt_rq->rt_runtime_lock);
7310  rt_rq->rt_runtime = rt_runtime;
7311  raw_spin_unlock(&rt_rq->rt_runtime_lock);
7312  }
7313  raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7314 unlock:
7315  read_unlock(&tasklist_lock);
7316  mutex_unlock(&rt_constraints_mutex);
7317 
7318  return err;
7319 }
7320 
7321 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7322 {
7323  u64 rt_runtime, rt_period;
7324 
7325  rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7326  rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7327  if (rt_runtime_us < 0)
7328  rt_runtime = RUNTIME_INF;
7329 
7330  return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7331 }
7332 
7333 long sched_group_rt_runtime(struct task_group *tg)
7334 {
7335  u64 rt_runtime_us;
7336 
7337  if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7338  return -1;
7339 
7340  rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7341  do_div(rt_runtime_us, NSEC_PER_USEC);
7342  return rt_runtime_us;
7343 }
7344 
7345 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7346 {
7347  u64 rt_runtime, rt_period;
7348 
7349  rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7350  rt_runtime = tg->rt_bandwidth.rt_runtime;
7351 
7352  if (rt_period == 0)
7353  return -EINVAL;
7354 
7355  return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7356 }
7357 
7358 long sched_group_rt_period(struct task_group *tg)
7359 {
7360  u64 rt_period_us;
7361 
7362  rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7363  do_div(rt_period_us, NSEC_PER_USEC);
7364  return rt_period_us;
7365 }
7366 
7367 static int sched_rt_global_constraints(void)
7368 {
7369  u64 runtime, period;
7370  int ret = 0;
7371 
7372  if (sysctl_sched_rt_period <= 0)
7373  return -EINVAL;
7374 
7375  runtime = global_rt_runtime();
7376  period = global_rt_period();
7377 
7378  /*
7379  * Sanity check on the sysctl variables.
7380  */
7381  if (runtime > period && runtime != RUNTIME_INF)
7382  return -EINVAL;
7383 
7384  mutex_lock(&rt_constraints_mutex);
7385  read_lock(&tasklist_lock);
7386  ret = __rt_schedulable(NULL, 0, 0);
7387  read_unlock(&tasklist_lock);
7388  mutex_unlock(&rt_constraints_mutex);
7389 
7390  return ret;
7391 }
7392 
7393 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7394 {
7395  /* Don't accept realtime tasks when there is no way for them to run */
7396  if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7397  return 0;
7398 
7399  return 1;
7400 }
7401 
7402 #else /* !CONFIG_RT_GROUP_SCHED */
7403 static int sched_rt_global_constraints(void)
7404 {
7405  unsigned long flags;
7406  int i;
7407 
7408  if (sysctl_sched_rt_period <= 0)
7409  return -EINVAL;
7410 
7411  /*
7412  * There are always some RT tasks in the root group
7413  * -- migration, kstopmachine etc.
7414  */
7415  if (sysctl_sched_rt_runtime == 0)
7416  return -EBUSY;
7417 
7418  raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7419  for_each_possible_cpu(i) {
7420  struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7421 
7422  raw_spin_lock(&rt_rq->rt_runtime_lock);
7423  rt_rq->rt_runtime = global_rt_runtime();
7424  raw_spin_unlock(&rt_rq->rt_runtime_lock);
7425  }
7426  raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7427 
7428  return 0;
7429 }
7430 #endif /* CONFIG_RT_GROUP_SCHED */
7431 
7432 int sched_rt_handler(struct ctl_table *table, int write,
7433  void __user *buffer, size_t *lenp,
7434  loff_t *ppos)
7435 {
7436  int ret;
7437  int old_period, old_runtime;
7438  static DEFINE_MUTEX(mutex);
7439 
7440  mutex_lock(&mutex);
7441  old_period = sysctl_sched_rt_period;
7442  old_runtime = sysctl_sched_rt_runtime;
7443 
7444  ret = proc_dointvec(table, write, buffer, lenp, ppos);
7445 
7446  if (!ret && write) {
7447  ret = sched_rt_global_constraints();
7448  if (ret) {
7449  sysctl_sched_rt_period = old_period;
7450  sysctl_sched_rt_runtime = old_runtime;
7451  } else {
7452  def_rt_bandwidth.rt_runtime = global_rt_runtime();
7453  def_rt_bandwidth.rt_period =
7454  ns_to_ktime(global_rt_period());
7455  }
7456  }
7457  mutex_unlock(&mutex);
7458 
7459  return ret;
7460 }
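
For context (a sketch, not a verbatim copy of kernel/sysctl.c): sched_rt_handler() is wired up as the proc_handler of the sched_rt_period_us and sched_rt_runtime_us sysctls, so a write to /proc/sys/kernel/sched_rt_runtime_us goes through proc_dointvec() and the global constraint check before the new value sticks. The table entries have approximately this shape:

/* Approximate shape of the sysctl entries that point at sched_rt_handler(). */
static struct ctl_table sched_rt_sysctls[] = {
	{
		.procname	= "sched_rt_period_us",
		.data		= &sysctl_sched_rt_period,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= sched_rt_handler,
	},
	{
		.procname	= "sched_rt_runtime_us",
		.data		= &sysctl_sched_rt_runtime,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= sched_rt_handler,
	},
	{ }
};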
7461 
7462 #ifdef CONFIG_CGROUP_SCHED
7463 
7464 /* return corresponding task_group object of a cgroup */
7465 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7466 {
7467  return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7468  struct task_group, css);
7469 }
7470 
7471 static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7472 {
7473  struct task_group *tg, *parent;
7474 
7475  if (!cgrp->parent) {
7476  /* This is early initialization for the top cgroup */
7477  return &root_task_group.css;
7478  }
7479 
7480  parent = cgroup_tg(cgrp->parent);
7481  tg = sched_create_group(parent);
7482  if (IS_ERR(tg))
7483  return ERR_PTR(-ENOMEM);
7484 
7485  return &tg->css;
7486 }
7487 
7488 static void cpu_cgroup_destroy(struct cgroup *cgrp)
7489 {
7490  struct task_group *tg = cgroup_tg(cgrp);
7491 
7492  sched_destroy_group(tg);
7493 }
7494 
7495 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7496  struct cgroup_taskset *tset)
7497 {
7498  struct task_struct *task;
7499 
7500  cgroup_taskset_for_each(task, cgrp, tset) {
7501 #ifdef CONFIG_RT_GROUP_SCHED
7502  if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7503  return -EINVAL;
7504 #else
7505  /* We don't support RT-tasks being in separate groups */
7506  if (task->sched_class != &fair_sched_class)
7507  return -EINVAL;
7508 #endif
7509  }
7510  return 0;
7511 }
7512 
7513 static void cpu_cgroup_attach(struct cgroup *cgrp,
7514  struct cgroup_taskset *tset)
7515 {
7516  struct task_struct *task;
7517 
7518  cgroup_taskset_for_each(task, cgrp, tset)
7519  sched_move_task(task);
7520 }
7521 
7522 static void
7523 cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7524  struct task_struct *task)
7525 {
7526  /*
7527  * cgroup_exit() is called in the copy_process() failure path.
7528  * Ignore this case since the task hasn't run yet; this avoids
7529  * trying to poke at half-freed task state from generic code.
7530  */
7531  if (!(task->flags & PF_EXITING))
7532  return;
7533 
7534  sched_move_task(task);
7535 }
7536 
7537 #ifdef CONFIG_FAIR_GROUP_SCHED
7538 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7539  u64 shareval)
7540 {
7541  return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7542 }
7543 
7544 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7545 {
7546  struct task_group *tg = cgroup_tg(cgrp);
7547 
7548  return (u64) scale_load_down(tg->shares);
7549 }
7550 
7551 #ifdef CONFIG_CFS_BANDWIDTH
7552 static DEFINE_MUTEX(cfs_constraints_mutex);
7553 
7554 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7555 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7556 
7557 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7558 
7559 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7560 {
7561  int i, ret = 0, runtime_enabled, runtime_was_enabled;
7562  struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7563 
7564  if (tg == &root_task_group)
7565  return -EINVAL;
7566 
7567  /*
7568  * Ensure we have at least some amount of bandwidth every period. This is
7569  * to prevent reaching a state of large arrears when throttled via
7570  * entity_tick() resulting in prolonged exit starvation.
7571  */
7572  if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7573  return -EINVAL;
7574 
7575  /*
7576  * Likewise, bound things on the other side by preventing insane quota
7577  * periods. This also allows us to normalize in computing quota
7578  * feasibility.
7579  */
7580  if (period > max_cfs_quota_period)
7581  return -EINVAL;
7582 
7583  mutex_lock(&cfs_constraints_mutex);
7584  ret = __cfs_schedulable(tg, period, quota);
7585  if (ret)
7586  goto out_unlock;
7587 
7588  runtime_enabled = quota != RUNTIME_INF;
7589  runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7590  account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7591  raw_spin_lock_irq(&cfs_b->lock);
7592  cfs_b->period = ns_to_ktime(period);
7593  cfs_b->quota = quota;
7594 
7595  __refill_cfs_bandwidth_runtime(cfs_b);
7596  /* restart the period timer (if active) to handle new period expiry */
7597  if (runtime_enabled && cfs_b->timer_active) {
7598  /* force a reprogram */
7599  cfs_b->timer_active = 0;
7600  __start_cfs_bandwidth(cfs_b);
7601  }
7602  raw_spin_unlock_irq(&cfs_b->lock);
7603 
7604  for_each_possible_cpu(i) {
7605  struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7606  struct rq *rq = cfs_rq->rq;
7607 
7608  raw_spin_lock_irq(&rq->lock);
7609  cfs_rq->runtime_enabled = runtime_enabled;
7610  cfs_rq->runtime_remaining = 0;
7611 
7612  if (cfs_rq->throttled)
7613  unthrottle_cfs_rq(cfs_rq);
7614  raw_spin_unlock_irq(&rq->lock);
7615  }
7616 out_unlock:
7617  mutex_unlock(&cfs_constraints_mutex);
7618 
7619  return ret;
7620 }
7621 
7622 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7623 {
7624  u64 quota, period;
7625 
7626  period = ktime_to_ns(tg->cfs_bandwidth.period);
7627  if (cfs_quota_us < 0)
7628  quota = RUNTIME_INF;
7629  else
7630  quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7631 
7632  return tg_set_cfs_bandwidth(tg, period, quota);
7633 }
7634 
7635 long tg_get_cfs_quota(struct task_group *tg)
7636 {
7637  u64 quota_us;
7638 
7639  if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7640  return -1;
7641 
7642  quota_us = tg->cfs_bandwidth.quota;
7643  do_div(quota_us, NSEC_PER_USEC);
7644 
7645  return quota_us;
7646 }
7647 
7648 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7649 {
7650  u64 quota, period;
7651 
7652  period = (u64)cfs_period_us * NSEC_PER_USEC;
7653  quota = tg->cfs_bandwidth.quota;
7654 
7655  return tg_set_cfs_bandwidth(tg, period, quota);
7656 }
7657 
7658 long tg_get_cfs_period(struct task_group *tg)
7659 {
7660  u64 cfs_period_us;
7661 
7662  cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7663  do_div(cfs_period_us, NSEC_PER_USEC);
7664 
7665  return cfs_period_us;
7666 }
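
For illustration only: the four helpers above just convert between the microseconds used at the cgroup interface and the nanosecond/ktime values stored in struct cfs_bandwidth, with -1 standing for an unlimited quota. A hypothetical in-kernel caller might use them like this:

/* Hypothetical sketch: cap a group at half of one CPU (50ms every 100ms). */
static int give_half_cpu(struct task_group *tg)
{
	int ret;

	ret = tg_set_cfs_period(tg, 100000);	/* period in us; must be within 1ms..1s */
	if (ret)
		return ret;

	ret = tg_set_cfs_quota(tg, 50000);	/* quota in us; -1 would mean no limit */
	if (ret)
		return ret;

	/* tg_get_cfs_quota() and tg_get_cfs_period() now read back 50000 and 100000. */
	return 0;
}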
7667 
7668 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7669 {
7670  return tg_get_cfs_quota(cgroup_tg(cgrp));
7671 }
7672 
7673 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7674  s64 cfs_quota_us)
7675 {
7676  return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7677 }
7678 
7679 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7680 {
7681  return tg_get_cfs_period(cgroup_tg(cgrp));
7682 }
7683 
7684 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7685  u64 cfs_period_us)
7686 {
7687  return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7688 }
7689 
7690 struct cfs_schedulable_data {
7691  struct task_group *tg;
7692  u64 period, quota;
7693 };
7694 
7695 /*
7696  * normalize group quota/period to be quota/max_period
7697  * note: units are usecs
7698  */
7699 static u64 normalize_cfs_quota(struct task_group *tg,
7700  struct cfs_schedulable_data *d)
7701 {
7702  u64 quota, period;
7703 
7704  if (tg == d->tg) {
7705  period = d->period;
7706  quota = d->quota;
7707  } else {
7708  period = tg_get_cfs_period(tg);
7709  quota = tg_get_cfs_quota(tg);
7710  }
7711 
7712  /* note: these should typically be equivalent */
7713  if (quota == RUNTIME_INF || quota == -1)
7714  return RUNTIME_INF;
7715 
7716  return to_ratio(period, quota);
7717 }
7718 
7719 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7720 {
7721  struct cfs_schedulable_data *d = data;
7722  struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7723  s64 quota = 0, parent_quota = -1;
7724 
7725  if (!tg->parent) {
7726  quota = RUNTIME_INF;
7727  } else {
7728  struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7729 
7730  quota = normalize_cfs_quota(tg, d);
7731  parent_quota = parent_b->hierarchal_quota;
7732 
7733  /*
7734  * ensure max(child_quota) <= parent_quota, inherit when no
7735  * limit is set
7736  */
7737  if (quota == RUNTIME_INF)
7738  quota = parent_quota;
7739  else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7740  return -EINVAL;
7741  }
7742  cfs_b->hierarchal_quota = quota;
7743 
7744  return 0;
7745 }
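
For illustration only: a standalone userspace sketch of the top-down rule tg_cfs_schedulable_down() enforces, namely that a child's normalized quota may not exceed its parent's, and that a child with no limit simply inherits the parent's value.

#include <stdio.h>
#include <stdint.h>

#define RUNTIME_INF	(~0ULL)

/* Same 2^20 scaling as the kernel's to_ratio()/normalize_cfs_quota(). */
static uint64_t norm(uint64_t period_us, uint64_t quota_us)
{
	if (quota_us == RUNTIME_INF)
		return RUNTIME_INF;
	return (quota_us << 20) / period_us;
}

int main(void)
{
	uint64_t parent = norm(100000, 50000);	/* parent limited to 50% of a CPU */
	uint64_t child  = norm(100000, 80000);	/* child asks for 80% */

	if (child == RUNTIME_INF)
		child = parent;			/* no limit set: inherit the parent's */

	printf("child admitted: %d\n", child <= parent);	/* prints 0, i.e. -EINVAL */
	return 0;
}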
7746 
7747 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7748 {
7749  int ret;
7750  struct cfs_schedulable_data data = {
7751  .tg = tg,
7752  .period = period,
7753  .quota = quota,
7754  };
7755 
7756  if (quota != RUNTIME_INF) {
7757  do_div(data.period, NSEC_PER_USEC);
7758  do_div(data.quota, NSEC_PER_USEC);
7759  }
7760 
7761  rcu_read_lock();
7762  ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7763  rcu_read_unlock();
7764 
7765  return ret;
7766 }
7767 
7768 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7769  struct cgroup_map_cb *cb)
7770 {
7771  struct task_group *tg = cgroup_tg(cgrp);
7772  struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7773 
7774  cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7775  cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7776  cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7777 
7778  return 0;
7779 }
7780 #endif /* CONFIG_CFS_BANDWIDTH */
7781 #endif /* CONFIG_FAIR_GROUP_SCHED */
7782 
7783 #ifdef CONFIG_RT_GROUP_SCHED
7784 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7785  s64 val)
7786 {
7787  return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7788 }
7789 
7790 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7791 {
7792  return sched_group_rt_runtime(cgroup_tg(cgrp));
7793 }
7794 
7795 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7796  u64 rt_period_us)
7797 {
7798  return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7799 }
7800 
7801 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7802 {
7803  return sched_group_rt_period(cgroup_tg(cgrp));
7804 }
7805 #endif /* CONFIG_RT_GROUP_SCHED */
7806 
7807 static struct cftype cpu_files[] = {
7808 #ifdef CONFIG_FAIR_GROUP_SCHED
7809  {
7810  .name = "shares",
7811  .read_u64 = cpu_shares_read_u64,
7812  .write_u64 = cpu_shares_write_u64,
7813  },
7814 #endif
7815 #ifdef CONFIG_CFS_BANDWIDTH
7816  {
7817  .name = "cfs_quota_us",
7818  .read_s64 = cpu_cfs_quota_read_s64,
7819  .write_s64 = cpu_cfs_quota_write_s64,
7820  },
7821  {
7822  .name = "cfs_period_us",
7823  .read_u64 = cpu_cfs_period_read_u64,
7824  .write_u64 = cpu_cfs_period_write_u64,
7825  },
7826  {
7827  .name = "stat",
7828  .read_map = cpu_stats_show,
7829  },
7830 #endif
7831 #ifdef CONFIG_RT_GROUP_SCHED
7832  {
7833  .name = "rt_runtime_us",
7834  .read_s64 = cpu_rt_runtime_read,
7835  .write_s64 = cpu_rt_runtime_write,
7836  },
7837  {
7838  .name = "rt_period_us",
7839  .read_u64 = cpu_rt_period_read_uint,
7840  .write_u64 = cpu_rt_period_write_uint,
7841  },
7842 #endif
7843  { } /* terminate */
7844 };
7845 
7846 struct cgroup_subsys cpu_cgroup_subsys = {
7847  .name = "cpu",
7848  .create = cpu_cgroup_create,
7849  .destroy = cpu_cgroup_destroy,
7850  .can_attach = cpu_cgroup_can_attach,
7851  .attach = cpu_cgroup_attach,
7852  .exit = cpu_cgroup_exit,
7853  .subsys_id = cpu_cgroup_subsys_id,
7854  .base_cftypes = cpu_files,
7855  .early_init = 1,
7856 };
7857 
7858 #endif /* CONFIG_CGROUP_SCHED */
7859 
7860 #ifdef CONFIG_CGROUP_CPUACCT
7861 
7862 /*
7863  * CPU accounting code for task groups.
7864  *
7865  * Based on the work by Paul Menage ([email protected]) and Balbir Singh
7866  * ([email protected]).
7867  */
7868 
7869 struct cpuacct root_cpuacct;
7870 
7871 /* create a new cpu accounting group */
7872 static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
7873 {
7874  struct cpuacct *ca;
7875 
7876  if (!cgrp->parent)
7877  return &root_cpuacct.css;
7878 
7879  ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7880  if (!ca)
7881  goto out;
7882 
7883  ca->cpuusage = alloc_percpu(u64);
7884  if (!ca->cpuusage)
7885  goto out_free_ca;
7886 
7887  ca->cpustat = alloc_percpu(struct kernel_cpustat);
7888  if (!ca->cpustat)
7889  goto out_free_cpuusage;
7890 
7891  return &ca->css;
7892 
7893 out_free_cpuusage:
7894  free_percpu(ca->cpuusage);
7895 out_free_ca:
7896  kfree(ca);
7897 out:
7898  return ERR_PTR(-ENOMEM);
7899 }
7900 
7901 /* destroy an existing cpu accounting group */
7902 static void cpuacct_destroy(struct cgroup *cgrp)
7903 {
7904  struct cpuacct *ca = cgroup_ca(cgrp);
7905 
7906  free_percpu(ca->cpustat);
7907  free_percpu(ca->cpuusage);
7908  kfree(ca);
7909 }
7910 
7911 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
7912 {
7913  u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7914  u64 data;
7915 
7916 #ifndef CONFIG_64BIT
7917  /*
7918  * Take rq->lock to make 64-bit read safe on 32-bit platforms.
7919  */
7920  raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7921  data = *cpuusage;
7922  raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
7923 #else
7924  data = *cpuusage;
7925 #endif
7926 
7927  return data;
7928 }
7929 
7930 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
7931 {
7932  u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7933 
7934 #ifndef CONFIG_64BIT
7935  /*
7936  * Take rq->lock to make 64-bit write safe on 32-bit platforms.
7937  */
7938  raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7939  *cpuusage = val;
7940  raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
7941 #else
7942  *cpuusage = val;
7943 #endif
7944 }
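
For illustration only: the #ifndef CONFIG_64BIT branches above exist because a plain 64-bit load or store is not atomic on 32-bit machines, so without the lock a reader could observe a half-updated counter (one stale and one new 32-bit half). The same pattern in a generic, hypothetical sketch:

#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical example: a u64 counter that 32-bit readers must not see torn. */
struct guarded_counter {
	raw_spinlock_t	lock;	/* initialised elsewhere with raw_spin_lock_init() */
	u64		value;
};

static u64 guarded_read(struct guarded_counter *c)
{
	u64 v;

#ifndef CONFIG_64BIT
	raw_spin_lock_irq(&c->lock);	/* serialise against 2x32-bit updates */
	v = c->value;
	raw_spin_unlock_irq(&c->lock);
#else
	v = c->value;			/* one aligned 64-bit load is atomic here */
#endif
	return v;
}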
7945 
7946 /* return total cpu usage (in nanoseconds) of a group */
7947 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
7948 {
7949  struct cpuacct *ca = cgroup_ca(cgrp);
7950  u64 totalcpuusage = 0;
7951  int i;
7952 
7953  for_each_present_cpu(i)
7954  totalcpuusage += cpuacct_cpuusage_read(ca, i);
7955 
7956  return totalcpuusage;
7957 }
7958 
7959 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
7960  u64 reset)
7961 {
7962  struct cpuacct *ca = cgroup_ca(cgrp);
7963  int err = 0;
7964  int i;
7965 
7966  if (reset) {
7967  err = -EINVAL;
7968  goto out;
7969  }
7970 
7971  for_each_present_cpu(i)
7972  cpuacct_cpuusage_write(ca, i, 0);
7973 
7974 out:
7975  return err;
7976 }
7977 
7978 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
7979  struct seq_file *m)
7980 {
7981  struct cpuacct *ca = cgroup_ca(cgroup);
7982  u64 percpu;
7983  int i;
7984 
7985  for_each_present_cpu(i) {
7986  percpu = cpuacct_cpuusage_read(ca, i);
7987  seq_printf(m, "%llu ", (unsigned long long) percpu);
7988  }
7989  seq_printf(m, "\n");
7990  return 0;
7991 }
7992 
7993 static const char *cpuacct_stat_desc[] = {
7994  [CPUACCT_STAT_USER] = "user",
7995  [CPUACCT_STAT_SYSTEM] = "system",
7996 };
7997 
7998 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
7999  struct cgroup_map_cb *cb)
8000 {
8001  struct cpuacct *ca = cgroup_ca(cgrp);
8002  int cpu;
8003  s64 val = 0;
8004 
8005  for_each_online_cpu(cpu) {
8006  struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8007  val += kcpustat->cpustat[CPUTIME_USER];
8008  val += kcpustat->cpustat[CPUTIME_NICE];
8009  }
8010  val = cputime64_to_clock_t(val);
8011  cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8012 
8013  val = 0;
8014  for_each_online_cpu(cpu) {
8015  struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8016  val += kcpustat->cpustat[CPUTIME_SYSTEM];
8017  val += kcpustat->cpustat[CPUTIME_IRQ];
8018  val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8019  }
8020 
8021  val = cputime64_to_clock_t(val);
8022  cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8023 
8024  return 0;
8025 }
8026 
8027 static struct cftype files[] = {
8028  {
8029  .name = "usage",
8030  .read_u64 = cpuusage_read,
8031  .write_u64 = cpuusage_write,
8032  },
8033  {
8034  .name = "usage_percpu",
8035  .read_seq_string = cpuacct_percpu_seq_read,
8036  },
8037  {
8038  .name = "stat",
8039  .read_map = cpuacct_stats_show,
8040  },
8041  { } /* terminate */
8042 };
8043 
8044 /*
8045  * charge this task's execution time to its accounting group.
8046  *
8047  * called with rq->lock held.
8048  */
8049 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8050 {
8051  struct cpuacct *ca;
8052  int cpu;
8053 
8054  if (unlikely(!cpuacct_subsys.active))
8055  return;
8056 
8057  cpu = task_cpu(tsk);
8058 
8059  rcu_read_lock();
8060 
8061  ca = task_ca(tsk);
8062 
8063  for (; ca; ca = parent_ca(ca)) {
8064  u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8065  *cpuusage += cputime;
8066  }
8067 
8068  rcu_read_unlock();
8069 }
8070 
8071 struct cgroup_subsys cpuacct_subsys = {
8072  .name = "cpuacct",
8073  .create = cpuacct_create,
8074  .destroy = cpuacct_destroy,
8075  .subsys_id = cpuacct_subsys_id,
8076  .base_cftypes = files,
8077 };
8078 #endif /* CONFIG_CGROUP_CPUACCT */