24 #include <linux/sched.h>
26 #include <linux/slab.h>
71 static unsigned int sched_nr_latency = 8;
99 #ifdef CONFIG_CFS_BANDWIDTH
110 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
122 static int get_update_sysctl_factor(void)
136 factor = 1 + ilog2(cpus);
143 static void update_sysctl(void)
145 unsigned int factor = get_update_sysctl_factor();
147 #define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
160 #if BITS_PER_LONG == 32
161 # define WMULT_CONST (~0UL)
163 # define WMULT_CONST (1UL << 32)
166 #define WMULT_SHIFT 32
171 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
177 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
190 tmp = (u64)delta_exec;
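/*
 * Illustrative aside (not part of fair.c): calc_delta_mine() approximates
 * delta_exec * weight / lw->weight without a 64-bit division on the hot
 * path, multiplying by a cached inverse weight and shifting by WMULT_SHIFT
 * with SRR() rounding. A user-space sketch of the same arithmetic; the
 * names and the __int128 temporary are local assumptions:
 */
#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT	32
/* round-to-nearest right shift, as the SRR() macro above does */
#define SRR(x, y)	(((x) + (1ULL << ((y) - 1))) >> (y))

static uint64_t scale_delta(uint64_t delta_exec, uint32_t weight, uint32_t lw_weight)
{
	uint64_t inv_weight = (1ULL << WMULT_SHIFT) / lw_weight;	/* ~ 2^32 / lw */
	unsigned __int128 tmp = (unsigned __int128)delta_exec * weight * inv_weight;

	return (uint64_t)SRR(tmp, WMULT_SHIFT);
}

int main(void)
{
	/* 1 ms of runtime for a weight-1024 entity on a queue weighing 3072 */
	printf("%llu\n", (unsigned long long)scale_delta(1000000, 1024, 3072));
	return 0;	/* prints approximately 333333: a third of the time */
}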
222 #ifdef CONFIG_FAIR_GROUP_SCHED
231 #define entity_is_task(se) (!se->my_q)
235 #ifdef CONFIG_SCHED_DEBUG
242 #define for_each_sched_entity(se) \
243 for (; se; se = se->parent)
262 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
264 if (!cfs_rq->on_list) {
271 if (cfs_rq->tg->parent &&
272 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
273 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
274 &rq_of(cfs_rq)->leaf_cfs_rq_list);
276 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
277 &rq_of(cfs_rq)->leaf_cfs_rq_list);
284 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
286 if (cfs_rq->on_list) {
287 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
293 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
294 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
300 if (se->cfs_rq == pse->cfs_rq)
325 int se_depth, pse_depth;
335 se_depth = depth_se(*se);
336 pse_depth = depth_se(*pse);
338 while (se_depth > pse_depth) {
340 *se = parent_entity(*se);
343 while (pse_depth > se_depth) {
345 *pse = parent_entity(*pse);
348 while (!is_same_group(*se, *pse)) {
349 *se = parent_entity(*se);
350 *pse = parent_entity(*pse);
361 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
366 #define entity_is_task(se) 1
368 #define for_each_sched_entity(se) \
369 for (; se; se = NULL)
371 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
376 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
385 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
390 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
394 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
398 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
399 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
420 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
435 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
437 s64 delta = (s64)(vruntime - min_vruntime);
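/*
 * Illustrative aside: min_vruntime() compares two u64 vruntimes through a
 * signed difference, so the comparison stays correct even if the counters
 * wrap around. A stand-alone sketch of the same idiom:
 */
#include <stdint.h>
#include <assert.h>

static uint64_t min_vruntime_sketch(uint64_t min_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	if (delta < 0)
		min_vruntime = vruntime;
	return min_vruntime;
}

int main(void)
{
	/* a vruntime that has just wrapped past zero is still "ahead" of 2^64 - 5 */
	uint64_t near_wrap = (uint64_t)-5;

	assert(min_vruntime_sketch(near_wrap, near_wrap + 10) == near_wrap);
	return 0;
}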
450 static void update_min_vruntime(struct cfs_rq *cfs_rq)
455 vruntime = cfs_rq->curr->vruntime;
465 vruntime = min_vruntime(vruntime, se->vruntime);
478 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 if (entity_before(se, entry)) {
510 rb_link_node(&se->run_node, parent, link);
514 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
546 #ifdef CONFIG_SCHED_DEBUG
562 void __user *buffer, size_t *lenp,
566 int factor = get_update_sysctl_factor();
574 #define WRT_SYSCTL(name) \
575 (normalized_sysctl_##name = sysctl_##name / (factor))
576 WRT_SYSCTL(sched_min_granularity);
577 WRT_SYSCTL(sched_latency);
578 WRT_SYSCTL(sched_wakeup_granularity);
588 static inline unsigned long
589 calc_delta_fair(unsigned long delta, struct sched_entity *se)
608 unsigned long nr_latency = sched_nr_latency;
610 if (unlikely(nr_running > nr_latency)) {
632 cfs_rq = cfs_rq_of(se);
633 load = &cfs_rq->load;
638 update_load_add(&lw, se->load.weight);
641 slice = calc_delta_mine(slice, se->load.weight, load);
651 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se);
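/*
 * Illustrative aside: sched_slice() hands each entity a share of the
 * scheduling period proportional to its weight, and the period itself grows
 * once more than sched_nr_latency (8, line 71) tasks are runnable. A
 * user-space sketch; the 6 ms latency and 0.75 ms granularity used here are
 * the normalized defaults and are assumptions (the kernel scales them):
 */
#include <stdint.h>
#include <stdio.h>

#define SCHED_LATENCY_NS	6000000ULL
#define MIN_GRANULARITY_NS	750000ULL
#define SCHED_NR_LATENCY	8

static uint64_t sched_period(unsigned int nr_running)
{
	if (nr_running > SCHED_NR_LATENCY)
		return nr_running * MIN_GRANULARITY_NS;
	return SCHED_LATENCY_NS;
}

static uint64_t sched_slice_sketch(unsigned int nr_running,
				   unsigned long se_weight,
				   unsigned long cfs_rq_weight)
{
	return sched_period(nr_running) * se_weight / cfs_rq_weight;
}

int main(void)
{
	/* three nice-0 tasks (weight 1024 each): each gets ~2 ms per period */
	printf("%llu ns\n",
	       (unsigned long long)sched_slice_sketch(3, 1024, 3 * 1024));
	return 0;
}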
656 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657 static void update_cfs_shares(struct cfs_rq *cfs_rq);
665 unsigned long delta_exec)
667 unsigned long delta_exec_weighted;
670 max((u64)delta_exec, curr->statistics.exec_max));
674 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
676 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq);
679 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
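/*
 * Illustrative aside: __update_curr() advances vruntime by the executed time
 * scaled inversely with the entity's weight, so heavier (lower nice) tasks
 * accumulate vruntime more slowly and run longer before the rbtree picks
 * someone else. Minimal sketch, assuming NICE_0_LOAD = 1024:
 */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD	1024UL

static uint64_t weighted_vruntime_delta(uint64_t delta_exec, unsigned long weight)
{
	if (weight == NICE_0_LOAD)
		return delta_exec;		/* common case, no scaling */
	return delta_exec * NICE_0_LOAD / weight;
}

int main(void)
{
	/* 1 ms of CPU: nice 0 pays 1 ms of vruntime, a weight-3121 task ~0.33 ms */
	printf("%llu %llu\n",
	       (unsigned long long)weighted_vruntime_delta(1000000, 1024),
	       (unsigned long long)weighted_vruntime_delta(1000000, 3121));
	return 0;
}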
684 static void update_curr(struct cfs_rq *cfs_rq)
687 u64 now = rq_of(cfs_rq)->clock_task;
688 unsigned long delta_exec;
702 __update_curr(cfs_rq, curr, delta_exec);
708 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
709 cpuacct_charge(curtask, delta_exec);
710 account_group_exec_runtime(curtask, delta_exec);
713 account_cfs_rq_runtime(cfs_rq, delta_exec);
717 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
725 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
731 if (se != cfs_rq->curr)
732 update_stats_wait_start(cfs_rq, se);
736 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739 rq_of(cfs_rq)->clock - se->statistics.wait_start));
740 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
741 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
742 rq_of(cfs_rq)->clock - se->statistics.wait_start);
743 #ifdef CONFIG_SCHEDSTATS
745 trace_sched_stat_wait(task_of(se),
746 rq_of(cfs_rq)->clock - se->statistics.wait_start);
753 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
759 if (se != cfs_rq->curr)
760 update_stats_wait_end(cfs_rq, se);
767 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
780 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
782 update_load_add(&cfs_rq->load, se->load.weight);
783 if (!parent_entity(se))
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
793 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
795 update_load_sub(&cfs_rq->load, se->load.weight);
796 if (!parent_entity(se))
797 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
803 #ifdef CONFIG_FAIR_GROUP_SCHED
805 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
807 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
810 struct task_group *tg = cfs_rq->tg;
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
818 cfs_rq->load_contribution += load_avg;
822 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
826 unsigned long load = cfs_rq->load.weight;
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
855 while (cfs_rq->load_period > period) {
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
867 list_del_leaf_cfs_rq(cfs_rq);
870 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
880 tg_weight -= cfs_rq->load_contribution;
881 tg_weight += cfs_rq->load.weight;
886 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
888 long tg_weight, load, shares;
890 tg_weight = calc_tg_weight(tg, cfs_rq);
891 load = cfs_rq->load.weight;
893 shares = (tg->shares * load);
897 if (shares < MIN_SHARES)
899 if (shares > tg->shares)
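/*
 * Illustrative aside: calc_cfs_shares() gives each per-CPU group entity a
 * slice of the group's total shares proportional to how much of the group's
 * load sits on that CPU, clamped to [MIN_SHARES, tg->shares]. Sketch, with
 * MIN_SHARES = 2 assumed:
 */
#include <stdio.h>

#define MIN_SHARES	2L

static long cfs_shares_sketch(long tg_shares, long cfs_rq_load, long tg_weight)
{
	long shares = tg_shares * cfs_rq_load;

	if (tg_weight)
		shares /= tg_weight;
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* a group with 1024 shares and 3/4 of its load on this CPU: ~768 here */
	printf("%ld\n", cfs_shares_sketch(1024, 768, 1024));
	return 0;
}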
905 static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
913 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
917 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
922 static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
926 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
931 if (cfs_rq->curr == se)
933 account_entity_dequeue(cfs_rq, se);
936 update_load_set(&se->load, weight);
939 account_entity_enqueue(cfs_rq, se);
942 static void update_cfs_shares(struct cfs_rq *cfs_rq)
944 struct task_group *tg;
949 se = tg->se[cpu_of(rq_of(cfs_rq))];
950 if (!se || throttled_hierarchy(cfs_rq))
956 shares = calc_cfs_shares(cfs_rq, tg);
958 reweight_entity(cfs_rq_of(se), se, shares);
961 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
965 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
969 static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
974 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
976 #ifdef CONFIG_SCHEDSTATS
982 if (se->statistics.sleep_start) {
983 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
988 if (unlikely(delta > se->statistics.sleep_max))
989 se->statistics.sleep_max = delta;
991 se->statistics.sleep_start = 0;
992 se->statistics.sum_sleep_runtime += delta;
995 account_scheduler_latency(tsk, delta >> 10, 1);
996 trace_sched_stat_sleep(tsk, delta);
999 if (se->statistics.block_start) {
1000 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1005 if (unlikely(delta > se->statistics.block_max))
1006 se->statistics.block_max = delta;
1008 se->statistics.block_start = 0;
1009 se->statistics.sum_sleep_runtime += delta;
1013 se->statistics.iowait_sum += delta;
1014 se->statistics.iowait_count++;
1015 trace_sched_stat_iowait(tsk, delta);
1018 trace_sched_stat_blocked(tsk, delta);
1030 account_scheduler_latency(tsk, delta >> 10, 0);
1036 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1038 #ifdef CONFIG_SCHED_DEBUG
1050 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1061 vruntime += sched_vslice(cfs_rq, se);
1078 vruntime = max_vruntime(se->vruntime, vruntime);
1083 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
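/*
 * Illustrative aside: place_entity() seeds a (re)queued entity's vruntime
 * relative to the queue's min_vruntime: newly forked tasks are pushed one
 * vslice into the future, while waking sleepers get a bounded credit so a
 * long sleep cannot turn into unlimited catch-up time. Sketch; halving the
 * latency mirrors the GENTLE_FAIR_SLEEPERS behaviour and is an assumption:
 */
#include <stdint.h>

uint64_t place_vruntime_sketch(uint64_t min_vruntime, uint64_t se_vruntime,
			       uint64_t vslice, uint64_t latency, int initial)
{
	uint64_t vruntime = min_vruntime;

	if (initial)
		vruntime += vslice;		/* debit a full slice for new tasks */
	else
		vruntime -= latency / 2;	/* bounded sleeper credit */

	/* never let an entity gain time by being placed backwards */
	return vruntime > se_vruntime ? vruntime : se_vruntime;
}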
1098 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0);
1100 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq);
1103 if (flags & ENQUEUE_WAKEUP) {
1104 place_entity(cfs_rq, se, 0);
1105 enqueue_sleeper(cfs_rq, se);
1108 update_stats_enqueue(cfs_rq, se);
1109 check_spread(cfs_rq, se);
1110 if (se != cfs_rq->curr)
1111 __enqueue_entity(cfs_rq, se);
1115 list_add_leaf_cfs_rq(cfs_rq);
1116 check_enqueue_throttle(cfs_rq);
1120 static void __clear_buddies_last(struct sched_entity *se)
1123 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1124 if (cfs_rq->last == se)
1131 static void __clear_buddies_next(struct sched_entity *se)
1134 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1135 if (cfs_rq->next == se)
1142 static void __clear_buddies_skip(struct sched_entity *se)
1145 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1146 if (cfs_rq->skip == se)
1153 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1155 if (cfs_rq->last == se)
1156 __clear_buddies_last(se);
1158 if (cfs_rq->next == se)
1159 __clear_buddies_next(se);
1161 if (cfs_rq->skip == se)
1162 __clear_buddies_skip(se);
1165 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1168 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1173 update_curr(cfs_rq);
1175 update_stats_dequeue(cfs_rq, se);
1177 #ifdef CONFIG_SCHEDSTATS
1182 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1184 se->statistics.block_start = rq_of(cfs_rq)->clock;
1189 clear_buddies(cfs_rq, se);
1191 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se);
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se);
1202 if (!(flags & DEQUEUE_SLEEP))
1206 return_cfs_rq_runtime(cfs_rq);
1208 update_min_vruntime(cfs_rq);
1209 update_cfs_shares(cfs_rq);
1216 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1218 unsigned long ideal_runtime, delta_exec;
1222 ideal_runtime = sched_slice(cfs_rq, curr);
1224 if (delta_exec > ideal_runtime) {
1230 clear_buddies(cfs_rq, curr);
1248 if (delta > ideal_runtime)
1253 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1262 update_stats_wait_end(cfs_rq, se);
1263 __dequeue_entity(cfs_rq, se);
1266 update_stats_curr_start(cfs_rq, se);
1268 #ifdef CONFIG_SCHEDSTATS
1274 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1275 se->statistics.slice_max = max(se->statistics.slice_max,
1292 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1301 if (cfs_rq->skip == se) {
1303 if (second && wakeup_preempt_entity(second, left) < 1)
1310 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1316 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1319 clear_buddies(cfs_rq, se);
1324 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1326 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1333 update_curr(cfs_rq);
1336 check_cfs_rq_runtime(cfs_rq);
1338 check_spread(cfs_rq, prev);
1340 update_stats_wait_start(cfs_rq, prev);
1342 __enqueue_entity(cfs_rq, prev);
1348 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq);
1358 update_entity_shares_tick(cfs_rq);
1360 #ifdef CONFIG_SCHED_HRTICK
1373 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1378 check_preempt_tick(cfs_rq, curr);
1386 #ifdef CONFIG_CFS_BANDWIDTH
1388 #ifdef HAVE_JUMP_LABEL
1389 static struct static_key __cfs_bandwidth_used;
1391 static inline bool cfs_bandwidth_used(void)
1393 return static_key_false(&__cfs_bandwidth_used);
1399 if (enabled && !was_enabled)
1400 static_key_slow_inc(&__cfs_bandwidth_used);
1401 else if (!enabled && was_enabled)
1402 static_key_slow_dec(&__cfs_bandwidth_used);
1405 static bool cfs_bandwidth_used(void)
1417 static inline u64 default_cfs_period(void)
1419 return 100000000ULL;
1422 static inline u64 sched_cfs_bandwidth_slice(void)
1434 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1442 cfs_b->runtime = cfs_b->quota;
1443 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1446 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth;
1452 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1454 struct task_group *tg = cfs_rq->tg;
1456 u64 amount = 0, min_amount, expires;
1459 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1463 amount = min_amount;
1471 if (!cfs_b->timer_active) {
1472 __refill_cfs_bandwidth_runtime(cfs_b);
1473 __start_cfs_bandwidth(cfs_b);
1476 if (cfs_b->runtime > 0) {
1477 amount = min(cfs_b->runtime, min_amount);
1478 cfs_b->runtime -= amount;
1482 expires = cfs_b->runtime_expires;
1485 cfs_rq->runtime_remaining += amount;
1491 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1492 cfs_rq->runtime_expires = expires;
1494 return cfs_rq->runtime_remaining > 0;
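/*
 * Illustrative aside: with CONFIG_CFS_BANDWIDTH each cfs_rq consumes runtime
 * locally and refills from the group-wide pool in slices set by
 * sysctl_sched_cfs_bandwidth_slice (5000 us, line 110), while the pool
 * itself is refreshed to the quota once per period (100 ms by default,
 * line 1419). A user-space sketch of that two-level accounting:
 */
#include <stdint.h>
#include <stdio.h>

struct pool { int64_t runtime; };		/* group-wide, refilled each period */
struct local { int64_t runtime_remaining; };	/* per cfs_rq */

#define SLICE_NS	5000000LL		/* 5000 us */

static int assign_runtime(struct pool *p, struct local *l)
{
	int64_t want = SLICE_NS - l->runtime_remaining;
	int64_t got = p->runtime < want ? p->runtime : want;

	p->runtime -= got;
	l->runtime_remaining += got;
	return l->runtime_remaining > 0;	/* 0 means: throttle this cfs_rq */
}

int main(void)
{
	struct pool p = { .runtime = 20000000LL };	/* 20 ms of quota left */
	struct local l = { .runtime_remaining = -1000000LL };

	printf("%d local=%lld pool=%lld\n", assign_runtime(&p, &l),
	       (long long)l.runtime_remaining, (long long)p.runtime);
	return 0;	/* 1 local=5000000 pool=14000000 */
}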
1501 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1504 struct rq *rq = rq_of(cfs_rq);
1510 if (cfs_rq->runtime_remaining < 0)
1522 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1527 cfs_rq->runtime_remaining = 0;
1531 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1532 unsigned long delta_exec)
1535 cfs_rq->runtime_remaining -= delta_exec;
1536 expire_cfs_rq_runtime(cfs_rq);
1538 if (likely(cfs_rq->runtime_remaining > 0))
1545 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1550 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1555 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1558 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1560 return cfs_bandwidth_used() && cfs_rq->throttled;
1564 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1566 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1574 static inline int throttled_lb_pair(struct task_group *tg,
1575 int src_cpu, int dest_cpu)
1577 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1579 src_cfs_rq = tg->cfs_rq[src_cpu];
1580 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1582 return throttled_hierarchy(src_cfs_rq) ||
1583 throttled_hierarchy(dest_cfs_rq);
1587 static int tg_unthrottle_up(struct task_group *tg, void *data)
1589 struct rq *rq = data;
1590 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1592 cfs_rq->throttle_count--;
1594 if (!cfs_rq->throttle_count) {
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1602 update_cfs_shares(cfs_rq);
1609 static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1615 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0);
1617 cfs_rq->throttle_count++;
1622 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1624 struct rq *rq = rq_of(cfs_rq);
1627 long task_delta, dequeue = 1;
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1638 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1644 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1647 if (qcfs_rq->load.weight)
1654 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock;
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1661 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1663 struct rq *rq = rq_of(cfs_rq);
1669 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1671 cfs_rq->throttled = 0;
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1674 list_del_rcu(&cfs_rq->throttled_list);
1676 cfs_rq->throttled_timestamp = 0;
1680 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1682 if (!cfs_rq->load.weight)
1690 cfs_rq = cfs_rq_of(se);
1692 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1695 if (cfs_rq_throttled(cfs_rq))
1708 u64 remaining, u64 expires)
1710 struct cfs_rq *cfs_rq;
1711 u64 runtime = remaining;
1714 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1716 struct rq *rq = rq_of(cfs_rq);
1719 if (!cfs_rq_throttled(cfs_rq))
1722 runtime = -cfs_rq->runtime_remaining + 1;
1723 if (runtime > remaining)
1724 runtime = remaining;
1725 remaining -= runtime;
1727 cfs_rq->runtime_remaining += runtime;
1728 cfs_rq->runtime_expires = expires;
1731 if (cfs_rq->runtime_remaining > 0)
1732 unthrottle_cfs_rq(cfs_rq);
1753 u64 runtime, runtime_expires;
1754 int idle = 1, throttled;
1761 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1763 idle = cfs_b->idle && !throttled;
1770 __refill_cfs_bandwidth_runtime(cfs_b);
1779 cfs_b->nr_throttled += overrun;
1787 runtime = cfs_b->runtime;
1788 runtime_expires = cfs_b->runtime_expires;
1796 while (throttled && runtime > 0) {
1799 runtime = distribute_cfs_runtime(cfs_b, runtime,
1803 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1807 cfs_b->runtime = runtime;
1817 cfs_b->timer_active = 0;
1831 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1833 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1837 if (hrtimer_callback_running(refresh_timer))
1841 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1842 if (remaining < min_expire)
1848 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1850 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1853 if (runtime_refresh_within(cfs_b, min_left))
1857 ns_to_ktime(cfs_bandwidth_slack_period));
1861 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1864 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1866 if (slack_runtime <= 0)
1871 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1872 cfs_b->runtime += slack_runtime;
1875 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1876 !list_empty(&cfs_b->throttled_cfs_rq))
1877 start_cfs_slack_bandwidth(cfs_b);
1882 cfs_rq->runtime_remaining -= slack_runtime;
1885 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1887 if (!cfs_bandwidth_used())
1890 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1893 __return_cfs_rq_runtime(cfs_rq);
1900 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1902 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1906 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1910 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1911 runtime = cfs_b->runtime;
1914 expires = cfs_b->runtime_expires;
1920 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1923 if (expires == cfs_b->runtime_expires)
1924 cfs_b->runtime = runtime;
1933 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1935 if (!cfs_bandwidth_used())
1939 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1943 if (cfs_rq_throttled(cfs_rq))
1947 account_cfs_rq_runtime(cfs_rq, 0);
1948 if (cfs_rq->runtime_remaining <= 0)
1949 throttle_cfs_rq(cfs_rq);
1953 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1955 if (!cfs_bandwidth_used())
1958 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1965 if (cfs_rq_throttled(cfs_rq))
1968 throttle_cfs_rq(cfs_rq);
1971 static inline u64 default_cfs_period(void);
1972 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1973 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1979 do_sched_cfs_slack_timer(cfs_b);
1993 now = hrtimer_cb_get_time(timer);
1999 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2010 cfs_b->period = ns_to_ktime(default_cfs_period());
2012 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2014 cfs_b->period_timer.function = sched_cfs_period_timer;
2016 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2019 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2021 cfs_rq->runtime_enabled = 0;
2022 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2034 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2041 if (cfs_b->timer_active)
2045 cfs_b->timer_active = 1;
2049 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2055 static void unthrottle_offline_cfs_rqs(struct rq *rq)
2057 struct cfs_rq *cfs_rq;
2062 if (!cfs_rq->runtime_enabled)
2069 cfs_rq->runtime_remaining = cfs_b->quota;
2070 if (cfs_rq_throttled(cfs_rq))
2071 unthrottle_cfs_rq(cfs_rq);
2077 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2078 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2082 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2087 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2092 static inline int throttled_lb_pair(struct task_group *tg,
2093 int src_cpu, int dest_cpu)
2100 #ifdef CONFIG_FAIR_GROUP_SCHED
2101 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2104 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2108 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2109 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2117 #ifdef CONFIG_SCHED_HRTICK
2118 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2121 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2126 u64 slice = sched_slice(cfs_rq, se);
2128 s64 delta = slice - ran;
2143 hrtick_start(rq, delta);
2152 static void hrtick_update(struct rq *rq)
2159 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2160 hrtick_start_fair(rq, curr);
2164 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2168 static inline void hrtick_update(struct rq *rq)
2179 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2181 struct cfs_rq *cfs_rq;
2187 cfs_rq = cfs_rq_of(se);
2188 enqueue_entity(cfs_rq, se, flags);
2196 if (cfs_rq_throttled(cfs_rq))
2204 cfs_rq = cfs_rq_of(se);
2207 if (cfs_rq_throttled(cfs_rq))
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq);
2226 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2228 struct cfs_rq *cfs_rq;
2233 cfs_rq = cfs_rq_of(se);
2234 dequeue_entity(cfs_rq, se, flags);
2242 if (cfs_rq_throttled(cfs_rq))
2247 if (cfs_rq->load.weight) {
2252 if (task_sleep && parent_entity(se))
2253 set_next_buddy(parent_entity(se));
2256 se = parent_entity(se);
2263 cfs_rq = cfs_rq_of(se);
2266 if (cfs_rq_throttled(cfs_rq))
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq);
2280 static unsigned long weighted_cpuload(const int cpu)
2282 return cpu_rq(cpu)->load.weight;
2292 static unsigned long source_load(int cpu, int type)
2294 struct rq *rq = cpu_rq(cpu);
2295 unsigned long total = weighted_cpuload(cpu);
2307 static unsigned long target_load(int cpu, int type)
2309 struct rq *rq = cpu_rq(cpu);
2310 unsigned long total = weighted_cpuload(cpu);
2318 static unsigned long power_of(int cpu)
2320 return cpu_rq(cpu)->cpu_power;
2323 static unsigned long cpu_avg_load_per_task(int cpu)
2325 struct rq *rq = cpu_rq(cpu);
2335 static void task_waking_fair(struct task_struct *p)
2338 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2341 #ifndef CONFIG_64BIT
2348 } while (min_vruntime != min_vruntime_copy);
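/*
 * Illustrative aside: on 32-bit kernels a u64 cannot be loaded atomically,
 * so the writer updates min_vruntime_copy after min_vruntime (behind a write
 * barrier) and readers such as task_waking_fair() retry until both values
 * agree. A user-space approximation of that pairing, with compiler/full
 * barriers standing in for smp_wmb()/smp_rmb():
 */
#include <stdint.h>

struct cfs_rq_sketch {
	volatile uint64_t min_vruntime;
	volatile uint64_t min_vruntime_copy;
};

void writer(struct cfs_rq_sketch *cfs_rq, uint64_t v)
{
	cfs_rq->min_vruntime = v;
	__sync_synchronize();			/* stands in for smp_wmb() */
	cfs_rq->min_vruntime_copy = v;
}

uint64_t reader(struct cfs_rq_sketch *cfs_rq)
{
	uint64_t min_vruntime, min_vruntime_copy;

	do {
		min_vruntime_copy = cfs_rq->min_vruntime_copy;
		__sync_synchronize();		/* stands in for smp_rmb() */
		min_vruntime = cfs_rq->min_vruntime;
	} while (min_vruntime != min_vruntime_copy);

	return min_vruntime;
}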
2356 #ifdef CONFIG_FAIR_GROUP_SCHED
2407 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2422 W = wg + calc_tg_weight(tg, se->my_q);
2427 w = se->my_q->load.weight + wl;
2433 wl = (w * tg->shares) / W;
2442 if (wl < MIN_SHARES)
2448 wl -= se->load.weight;
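/*
 * Illustrative aside: at each level of the group hierarchy effective_load()
 * re-derives what the per-CPU group entity's weight would become if wl were
 * added to this CPU's queue (w * shares / W, line 2433) and turns that into
 * a delta by subtracting the entity's current weight (line 2448). A rough
 * one-level sketch of that step; all names here are local assumptions:
 */
long effective_load_one_level(long tg_shares, long tg_weight,
			      long rq_weight, long se_weight,
			      long wl, long wg)
{
	long W = wg + tg_weight;	/* total group weight, including wg */
	long w = rq_weight + wl;	/* this CPU's queue weight with wl added */
	long new_se_weight = W ? w * tg_shares / W : tg_shares;

	return new_se_weight - se_weight;	/* change seen by the parent level */
}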
2464 static inline unsigned long effective_load(struct task_group *tg, int cpu,
2465 unsigned long wl, unsigned long wg)
2472 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2475 int idx, this_cpu, prev_cpu;
2476 unsigned long tl_per_task;
2477 struct task_group *tg;
2483 prev_cpu = task_cpu(p);
2484 load = source_load(prev_cpu, idx);
2485 this_load = target_load(this_cpu, idx);
2494 weight = current->se.load.weight;
2496 this_load += effective_load(tg, this_cpu, -weight, -weight);
2497 load += effective_load(tg, prev_cpu, 0, -weight);
2501 weight = p->se.load.weight;
2512 if (this_load > 0) {
2513 s64 this_eff_load, prev_eff_load;
2515 this_eff_load = 100;
2516 this_eff_load *= power_of(prev_cpu);
2517 this_eff_load *= this_load +
2518 effective_load(tg, this_cpu, weight, weight);
2520 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2521 prev_eff_load *= power_of(this_cpu);
2522 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2524 balanced = this_eff_load <= prev_eff_load;
2533 if (sync && balanced)
2537 tl_per_task = cpu_avg_load_per_task(this_cpu);
2540 (this_load <= load &&
2541 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
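/*
 * Illustrative aside: the wake_affine() decision above cross-multiplies the
 * two candidate loads with the other CPU's power and an imbalance_pct bias
 * instead of dividing, to stay in integer arithmetic. A compact restatement
 * of that comparison (imbalance_pct of 125 is a typical value, assumed here):
 */
#include <stdbool.h>

bool wake_affine_balanced(unsigned long this_load, unsigned long this_power,
			  unsigned long prev_load, unsigned long prev_power,
			  unsigned int imbalance_pct)
{
	long long this_eff_load = 100LL * prev_power * this_load;
	long long prev_eff_load = (100LL + (imbalance_pct - 100) / 2) *
				  this_power * prev_load;

	/* pulling the wakee to this CPU is fine if it does not overload it */
	return this_eff_load <= prev_eff_load;
}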
2559 static struct sched_group *
2560 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2561 int this_cpu, int load_idx)
2563 struct sched_group *idlest = NULL, *group = sd->groups;
2564 unsigned long min_load = ULONG_MAX, this_load = 0;
2565 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2568 unsigned long load, avg_load;
2573 if (!cpumask_intersects(sched_group_cpus(group),
2578 sched_group_cpus(group));
2586 load = source_load(i, load_idx);
2588 load = target_load(i, load_idx);
2597 this_load = avg_load;
2598 } else if (avg_load < min_load) {
2599 min_load = avg_load;
2602 } while (group = group->next, group != sd->groups);
2604 if (!idlest || 100*this_load < imbalance*min_load)
2613 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2621 load = weighted_cpuload(i);
2623 if (load < min_load || (load == min_load && i == this_cpu)) {
2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2647 if (target == cpu && idle_cpu(cpu))
2654 if (target == prev_cpu && idle_cpu(prev_cpu))
2661 for_each_lower_domain(sd) {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2678 } while (sg != sd->groups);
2696 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2698 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
2700 int prev_cpu = task_cpu(p);
2702 int want_affine = 0;
2703 int sync = wake_flags & WF_SYNC;
2708 if (sd_flag & SD_BALANCE_WAKE) {
2715 for_each_domain(cpu, tmp) {
2716 if (!(tmp->flags & SD_LOAD_BALANCE))
2723 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2729 if (tmp->flags & sd_flag)
2734 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
2737 new_cpu = select_idle_sibling(p, prev_cpu);
2742 int load_idx = sd->forkexec_idx;
2743 struct sched_group *group;
2746 if (!(sd->flags & sd_flag)) {
2751 if (sd_flag & SD_BALANCE_WAKE)
2752 load_idx = sd->wake_idx;
2754 group = find_idlest_group(sd, p, cpu, load_idx);
2760 new_cpu = find_idlest_cpu(group, p, cpu);
2761 if (new_cpu == -1 || new_cpu == cpu) {
2769 weight = sd->span_weight;
2771 for_each_domain(cpu, tmp) {
2772 if (weight <= tmp->span_weight)
2774 if (tmp->flags & sd_flag)
2786 static unsigned long
2804 return calc_delta_fair(gran, se);
2829 gran = wakeup_gran(curr, se);
2842 cfs_rq_of(se)->last = se;
2851 cfs_rq_of(se)->next = se;
2857 cfs_rq_of(se)->skip = se;
2867 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2868 int scale = cfs_rq->nr_running >= sched_nr_latency;
2869 int next_buddy_marked = 0;
2880 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2884 set_next_buddy(pse);
2885 next_buddy_marked = 1;
2898 if (test_tsk_need_resched(curr))
2913 find_matching_se(&se, &pse);
2914 update_curr(cfs_rq_of(se));
2916 if (wakeup_preempt_entity(se, pse) == 1) {
2921 if (!next_buddy_marked)
2922 set_next_buddy(pse);
2946 static struct task_struct *pick_next_task_fair(struct rq *rq)
2949 struct cfs_rq *cfs_rq = &rq->cfs;
2956 se = pick_next_entity(cfs_rq);
2957 set_next_entity(cfs_rq, se);
2958 cfs_rq = group_cfs_rq(se);
2962 if (hrtick_enabled(rq))
2963 hrtick_start_fair(rq, p);
2971 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
2974 struct cfs_rq *cfs_rq;
2977 cfs_rq = cfs_rq_of(se);
2978 put_prev_entity(cfs_rq, se);
2987 static void yield_task_fair(struct rq *rq)
2990 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2999 clear_buddies(cfs_rq, se);
3006 update_curr(cfs_rq);
3018 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3023 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3029 yield_task_fair(rq);
3041 #define LBF_ALL_PINNED 0x01
3042 #define LBF_NEED_BREAK 0x02
3043 #define LBF_SOME_PINNED 0x04
3046 struct sched_domain *sd;
3064 unsigned int loop_break;
3065 unsigned int loop_max;
3075 set_task_cpu(p, env->dst_cpu);
3084 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3098 (&p->se == cfs_rq_of(&p->se)->next ||
3099 &p->se == cfs_rq_of(&p->se)->last))
3107 delta = now - p->se.exec_start;
3118 int tsk_cache_hot = 0;
3128 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3138 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3143 if (new_dst_cpu < nr_cpu_ids) {
3144 env->flags |= LBF_SOME_PINNED;
3145 env->new_dst_cpu = new_dst_cpu;
3151 env->flags &= ~LBF_ALL_PINNED;
3153 if (task_running(env->src_rq, p)) {
3154 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3164 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3165 if (!tsk_cache_hot ||
3166 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3167 #ifdef CONFIG_SCHEDSTATS
3168 if (tsk_cache_hot) {
3176 if (tsk_cache_hot) {
3190 static int move_one_task(struct lb_env *env)
3195 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3198 if (!can_migrate_task(p, env))
3213 static unsigned long task_h_load(struct task_struct *p);
3215 static const unsigned int sched_nr_migrate_break = 32;
3224 static int move_tasks(struct lb_env *env)
3226 struct list_head *tasks = &env->src_rq->cfs_tasks;
3231 if (env->imbalance <= 0)
3234 while (!list_empty(tasks)) {
3239 if (env->loop > env->loop_max)
3243 if (env->loop > env->loop_break) {
3244 env->loop_break += sched_nr_migrate_break;
3245 env->flags |= LBF_NEED_BREAK;
3249 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3252 load = task_h_load(p);
3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3257 if ((load / 2) > env->imbalance)
3260 if (!can_migrate_task(p, env))
3265 env->imbalance -= load;
3267 #ifdef CONFIG_PREEMPT
3281 if (env->imbalance <= 0)
3286 list_move_tail(&p->se.group_node, tasks);
3299 #ifdef CONFIG_FAIR_GROUP_SCHED
3303 static int update_shares_cpu(struct task_group *tg, int cpu)
3305 struct cfs_rq *cfs_rq;
3306 unsigned long flags;
3313 cfs_rq = tg->cfs_rq[cpu];
3318 update_cfs_load(cfs_rq, 1);
3324 update_cfs_shares(cfs_rq);
3331 static void update_shares(int cpu)
3333 struct cfs_rq *cfs_rq;
3334 struct rq *rq = cpu_rq(cpu);
3343 if (throttled_hierarchy(cfs_rq))
3346 update_shares_cpu(cfs_rq->tg, cpu);
3356 static int tg_load_down(struct task_group *tg, void *data)
3359 long cpu = (long)data;
3362 load = cpu_rq(cpu)->load.weight;
3364 load = tg->parent->cfs_rq[cpu]->h_load;
3365 load *= tg->se[cpu]->load.weight;
3366 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
3369 tg->cfs_rq[cpu]->h_load = load;
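/*
 * Illustrative aside: tg_load_down() propagates the root's load down the
 * group tree; each level keeps the fraction of its parent's h_load that
 * corresponds to its entity's weight in the parent's queue (lines
 * 3364-3366), with "+ 1" guarding against a zero divisor. Sketch:
 */
#include <stdio.h>

static unsigned long child_h_load(unsigned long parent_h_load,
				  unsigned long se_weight,
				  unsigned long parent_rq_weight)
{
	unsigned long load = parent_h_load * se_weight;

	return load / (parent_rq_weight + 1);
}

int main(void)
{
	/* root rq weight 2048, group entity weighs 1024 of it: about half */
	printf("%lu\n", child_h_load(2048, 1024, 2048));
	return 0;	/* prints 1023 (the +1 rounds slightly down) */
}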
3374 static void update_h_load(long cpu)
3376 struct rq *rq = cpu_rq(cpu);
3379 if (rq->h_load_throttle == now)
3382 rq->h_load_throttle = now;
3385 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3389 static unsigned long task_h_load(struct task_struct *p)
3391 struct cfs_rq *cfs_rq = task_cfs_rq(p);
3394 load = p->se.load.weight;
3395 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3400 static inline void update_shares(int cpu)
3404 static inline void update_h_load(long cpu)
3408 static unsigned long task_h_load(struct task_struct *p)
3410 return p->se.load.weight;
3419 struct sd_lb_stats {
3420 struct sched_group *busiest;
3421 struct sched_group *this;
3422 unsigned long total_load;
3423 unsigned long total_pwr;
3424 unsigned long avg_load;
3427 unsigned long this_load;
3428 unsigned long this_load_per_task;
3429 unsigned long this_nr_running;
3430 unsigned long this_has_capacity;
3431 unsigned int this_idle_cpus;
3434 unsigned int busiest_idle_cpus;
3435 unsigned long max_load;
3436 unsigned long busiest_load_per_task;
3437 unsigned long busiest_nr_running;
3438 unsigned long busiest_group_capacity;
3439 unsigned long busiest_has_capacity;
3440 unsigned int busiest_group_weight;
3448 struct sg_lb_stats {
3449 unsigned long avg_load;
3450 unsigned long group_load;
3451 unsigned long sum_nr_running;
3452 unsigned long sum_weighted_load;
3453 unsigned long group_capacity;
3454 unsigned long idle_cpus;
3455 unsigned long group_weight;
3457 int group_has_capacity;
3465 static inline int get_sd_load_idx(struct sched_domain *sd,
3472 load_idx = sd->busy_idx;
3476 load_idx = sd->newidle_idx;
3479 load_idx = sd->idle_idx;
3486 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3493 return default_scale_freq_power(sd, cpu);
3496 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3498 unsigned long weight = sd->span_weight;
3499 unsigned long smt_gain = sd->smt_gain;
3506 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3508 return default_scale_smt_power(sd, cpu);
3511 unsigned long scale_rt_power(int cpu)
3513 struct rq *rq = cpu_rq(cpu);
3523 total = sched_avg_period() + (rq->clock - age_stamp);
3529 available = total - avg;
3537 return div_u64(available, total);
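/*
 * Illustrative aside: scale_rt_power() shrinks a CPU's power by the share of
 * the averaging window eaten by RT and irq work, so the fair load balancer
 * treats that CPU as smaller. Sketch of the net scaling; SCHED_POWER_SCALE
 * of 1024 is assumed here:
 */
#include <stdint.h>
#include <stdio.h>

#define SCHED_POWER_SCALE	1024ULL

static uint64_t scaled_power(uint64_t base_power, uint64_t rt_avg, uint64_t total)
{
	uint64_t available = total > rt_avg ? total - rt_avg : 0;

	return base_power * available / total;	/* fraction of time left for CFS */
}

int main(void)
{
	/* 25% of the window went to RT work: power drops from 1024 to 768 */
	printf("%llu\n",
	       (unsigned long long)scaled_power(SCHED_POWER_SCALE, 250, 1000));
	return 0;
}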
3540 static void update_cpu_power(struct sched_domain *sd, int cpu)
3542 unsigned long weight = sd->span_weight;
3544 struct sched_group *sdg = sd->groups;
3546 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3548 power *= arch_scale_smt_power(sd, cpu);
3550 power *= default_scale_smt_power(sd, cpu);
3555 sdg->sgp->power_orig = power;
3560 power *= default_scale_freq_power(sd, cpu);
3564 power *= scale_rt_power(cpu);
3570 cpu_rq(cpu)->cpu_power = power;
3571 sdg->sgp->power = power;
3576 struct sched_domain *child = sd->child;
3577 struct sched_group *group, *sdg = sd->groups;
3578 unsigned long power;
3582 interval = clamp(interval, 1UL, max_load_balance_interval);
3586 update_cpu_power(sd, cpu);
3592 if (child->flags & SD_OVERLAP) {
3599 power += power_of(cpu);
3606 group = child->groups;
3608 power += group->sgp->power;
3609 group = group->next;
3610 } while (group != child->groups);
3613 sdg->sgp->power_orig = sdg->sgp->power = power;
3624 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3629 if (!(sd->flags & SD_SHARE_CPUPOWER))
3635 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
3650 static inline void update_sg_lb_stats(struct lb_env *env,
3651 struct sched_group *group, int load_idx,
3652 int local_group, int *balance, struct sg_lb_stats *sgs)
3654 unsigned long nr_running, max_nr_running, min_nr_running;
3655 unsigned long load, max_cpu_load, min_cpu_load;
3656 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3657 unsigned long avg_load_per_task = 0;
3661 balance_cpu = group_balance_cpu(group);
3665 min_cpu_load = ~0UL;
3667 min_nr_running = ~0UL;
3670 struct rq *rq = cpu_rq(i);
3676 if (idle_cpu(i) && !first_idle_cpu &&
3682 load = target_load(i, load_idx);
3684 load = source_load(i, load_idx);
3685 if (load > max_cpu_load)
3686 max_cpu_load = load;
3687 if (min_cpu_load > load)
3688 min_cpu_load = load;
3690 if (nr_running > max_nr_running)
3692 if (min_nr_running > nr_running)
3696 sgs->group_load += load;
3698 sgs->sum_weighted_load += weighted_cpuload(i);
3711 if (balance_cpu != env->dst_cpu) {
3732 if (sgs->sum_nr_running)
3733 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3735 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3736 (max_nr_running - min_nr_running) > 1)
3741 if (!sgs->group_capacity)
3742 sgs->group_capacity = fix_small_capacity(env->sd, group);
3743 sgs->group_weight = group->group_weight;
3745 if (sgs->group_capacity > sgs->sum_nr_running)
3746 sgs->group_has_capacity = 1;
3759 static bool update_sd_pick_busiest(struct lb_env *env,
3760 struct sd_lb_stats *sds,
3761 struct sched_group *sg,
3762 struct sg_lb_stats *sgs)
3764 if (sgs->avg_load <= sds->max_load)
3767 if (sgs->sum_nr_running > sgs->group_capacity)
3778 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3779 env->dst_cpu < group_first_cpu(sg)) {
3783 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
3796 static inline void update_sd_lb_stats(struct lb_env *env,
3797 int *balance, struct sd_lb_stats *sds)
3799 struct sched_domain *child = env->sd->child;
3800 struct sched_group *sg = env->sd->groups;
3801 struct sg_lb_stats sgs;
3802 int load_idx, prefer_sibling = 0;
3804 if (child && child->flags & SD_PREFER_SIBLING)
3807 load_idx = get_sd_load_idx(env->sd, env->idle);
3813 memset(&sgs, 0, sizeof(sgs));
3814 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3816 if (local_group && !(*balance))
3819 sds->total_load += sgs.group_load;
3820 sds->total_pwr += sg->sgp->power;
3832 if (prefer_sibling && !local_group && sds->this_has_capacity)
3833 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3836 sds->this_load = sgs.avg_load;
3838 sds->this_nr_running = sgs.sum_nr_running;
3839 sds->this_load_per_task = sgs.sum_weighted_load;
3840 sds->this_has_capacity = sgs.group_has_capacity;
3841 sds->this_idle_cpus = sgs.idle_cpus;
3842 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3843 sds->max_load = sgs.avg_load;
3845 sds->busiest_nr_running = sgs.sum_nr_running;
3846 sds->busiest_idle_cpus = sgs.idle_cpus;
3847 sds->busiest_group_capacity = sgs.group_capacity;
3848 sds->busiest_load_per_task = sgs.sum_weighted_load;
3849 sds->busiest_has_capacity = sgs.group_has_capacity;
3850 sds->busiest_group_weight = sgs.group_weight;
3851 sds->group_imb = sgs.group_imb;
3855 } while (sg != env->sd->groups);
3881 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3885 if (!(env->sd->flags & SD_ASYM_PACKING))
3891 busiest_cpu = group_first_cpu(sds->busiest);
3892 if (env->dst_cpu > busiest_cpu)
3909 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3911 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3912 unsigned int imbn = 2;
3913 unsigned long scaled_busy_load_per_task;
3915 if (sds->this_nr_running) {
3916 sds->this_load_per_task /= sds->this_nr_running;
3917 if (sds->busiest_load_per_task >
3918 sds->this_load_per_task)
3921 sds->this_load_per_task =
3922 cpu_avg_load_per_task(env->dst_cpu);
3925 scaled_busy_load_per_task = sds->busiest_load_per_task
3927 scaled_busy_load_per_task /= sds->busiest->sgp->power;
3929 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
3930 (scaled_busy_load_per_task * imbn)) {
3931 env->imbalance = sds->busiest_load_per_task;
3941 pwr_now += sds->busiest->sgp->power *
3942 min(sds->busiest_load_per_task, sds->max_load);
3943 pwr_now += sds->this->sgp->power *
3944 min(sds->this_load_per_task, sds->this_load);
3949 sds->busiest->sgp->power;
3950 if (sds->max_load > tmp)
3951 pwr_move += sds->busiest->sgp->power *
3952 min(sds->busiest_load_per_task, sds->max_load - tmp);
3955 if (sds->max_load * sds->busiest->sgp->power <
3957 tmp = (sds->max_load * sds->busiest->sgp->power) /
3958 sds->this->sgp->power;
3961 sds->this->sgp->power;
3962 pwr_move += sds->this->sgp->power *
3963 min(sds->this_load_per_task, sds->this_load + tmp);
3967 if (pwr_move > pwr_now)
3968 env->imbalance = sds->busiest_load_per_task;
3977 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3979 unsigned long max_pull, load_above_capacity = ~0UL;
3981 sds->busiest_load_per_task /= sds->busiest_nr_running;
3982 if (sds->group_imb) {
3983 sds->busiest_load_per_task =
3984 min(sds->busiest_load_per_task, sds->avg_load);
3992 if (sds->max_load < sds->avg_load) {
3994 return fix_small_imbalance(env, sds);
3997 if (!sds->group_imb) {
4001 load_above_capacity = (sds->busiest_nr_running -
4002 sds->busiest_group_capacity);
4006 load_above_capacity /= sds->busiest->sgp->power;
4019 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4022 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4023 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4032 if (env->imbalance < sds->busiest_load_per_task)
4033 return fix_small_imbalance(env, sds);
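/*
 * Illustrative aside: calculate_imbalance() never tries to pull more than
 * would push this group above the domain average, nor more than the busiest
 * group holds above its capacity. A compact restatement of lines 4019-4023;
 * SCHED_POWER_SCALE of 1024 is assumed:
 */
#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL

static unsigned long imbalance_sketch(unsigned long max_load, unsigned long avg_load,
				      unsigned long this_load,
				      unsigned long load_above_capacity,
				      unsigned long busiest_power,
				      unsigned long this_power)
{
	unsigned long max_pull = max_load - avg_load;
	unsigned long a, b;

	if (load_above_capacity < max_pull)
		max_pull = load_above_capacity;

	a = max_pull * busiest_power;
	b = (avg_load - this_load) * this_power;

	return (a < b ? a : b) / SCHED_POWER_SCALE;
}

int main(void)
{
	/* busiest at 2048, we are at 512, domain average 1280, equal power */
	printf("%lu\n", imbalance_sketch(2048, 1280, 512, ~0UL, 1024, 1024));
	return 0;	/* prints 768: move up to the average, not beyond it */
}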
4058 static struct sched_group *
4059 find_busiest_group(struct lb_env *env, int *balance)
4061 struct sd_lb_stats sds;
4063 memset(&sds, 0, sizeof(sds));
4069 update_sd_lb_stats(env, balance, &sds);
4079 check_asym_packing(env, &sds))
4083 if (!sds.busiest || sds.busiest_nr_running == 0)
4098 !sds.busiest_has_capacity)
4105 if (sds.this_load >= sds.max_load)
4112 if (sds.this_load >= sds.avg_load)
4122 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4123 sds.busiest_nr_running <= sds.busiest_group_weight)
4130 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4136 calculate_imbalance(env, &sds);
4148 static struct rq *find_busiest_queue(struct lb_env *env,
4149 struct sched_group *group)
4151 struct rq *busiest = NULL, *rq;
4152 unsigned long max_load = 0;
4156 unsigned long power = power_of(i);
4162 capacity = fix_small_capacity(env->sd, group);
4168 wl = weighted_cpuload(i);
4174 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4185 if (wl > max_load) {
4198 #define MAX_PINNED_INTERVAL 512
4203 static int need_active_balance(struct lb_env *env)
4205 struct sched_domain *sd = env->sd;
4214 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4218 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4221 static int active_load_balance_cpu_stop(void *data);
4227 static int load_balance(int this_cpu, struct rq *this_rq,
4231 int ld_moved, cur_ld_moved, active_balance = 0;
4232 int lb_iterations, max_lb_iterations;
4233 struct sched_group *group;
4235 unsigned long flags;
4238 struct lb_env env = {
4240 .dst_cpu = this_cpu,
4242 .dst_grpmask = sched_group_cpus(sd->groups),
4244 .loop_break = sched_nr_migrate_break,
4248 cpumask_copy(cpus, cpu_active_mask);
4249 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4254 group = find_busiest_group(&env, balance);
4264 busiest = find_busiest_queue(&env, group);
4270 BUG_ON(busiest == env.dst_rq);
4283 env.flags |= LBF_ALL_PINNED;
4284 env.src_cpu = busiest->cpu;
4285 env.src_rq = busiest;
4288 update_h_load(env.src_cpu);
4291 double_rq_lock(env.dst_rq, busiest);
4297 cur_ld_moved = move_tasks(&env);
4298 ld_moved += cur_ld_moved;
4299 double_rq_unlock(env.dst_rq, busiest);
4302 if (env.flags & LBF_NEED_BREAK) {
4303 env.flags &= ~LBF_NEED_BREAK;
4332 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4333 lb_iterations++ < max_lb_iterations) {
4335 env.dst_rq = cpu_rq(env.new_dst_cpu);
4336 env.dst_cpu = env.new_dst_cpu;
4337 env.flags &= ~LBF_SOME_PINNED;
4339 env.loop_break = sched_nr_migrate_break;
4348 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4349 cpumask_clear_cpu(cpu_of(busiest), cpus);
4350 if (!cpumask_empty(cpus)) {
4352 env.loop_break = sched_nr_migrate_break;
4368 sd->nr_balance_failed++;
4370 if (need_active_balance(&env)) {
4381 env.flags |= LBF_ALL_PINNED;
4382 goto out_one_pinned;
4390 if (!busiest->active_balance) {
4391 busiest->active_balance = 1;
4392 busiest->push_cpu = this_cpu;
4397 if (active_balance) {
4399 active_load_balance_cpu_stop, busiest,
4400 &busiest->active_balance_work);
4407 sd->nr_balance_failed = sd->cache_nice_tries+1;
4410 sd->nr_balance_failed = 0;
4412 if (likely(!active_balance)) {
4414 sd->balance_interval = sd->min_interval;
4422 if (sd->balance_interval < sd->max_interval)
4423 sd->balance_interval *= 2;
4431 sd->nr_balance_failed = 0;
4435 if (((env.flags & LBF_ALL_PINNED) &&
4436 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4437 (sd->balance_interval < sd->max_interval))
4438 sd->balance_interval *= 2;
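/*
 * Illustrative aside: load_balance() backs off exponentially when balancing
 * keeps failing or every candidate task is pinned, and snaps back to the
 * minimum interval as soon as a balance succeeds, which is what the interval
 * handling around lines 4414-4438 implements. Sketch of that policy:
 */
struct sd_interval {
	unsigned long balance_interval;
	unsigned long min_interval;
	unsigned long max_interval;
};

void balance_result(struct sd_interval *sd, int moved)
{
	if (moved) {
		sd->balance_interval = sd->min_interval;	/* success: retry soon */
		return;
	}
	if (sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;			/* failure: back off */
}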
4449 void idle_balance(int this_cpu, struct rq *this_rq)
4451 struct sched_domain *sd;
4452 int pulled_task = 0;
4453 unsigned long next_balance = jiffies + HZ;
4455 this_rq->idle_stamp = this_rq->clock;
4465 update_shares(this_cpu);
4467 for_each_domain(this_cpu, sd) {
4471 if (!(sd->flags & SD_LOAD_BALANCE))
4474 if (sd->flags & SD_BALANCE_NEWIDLE) {
4476 pulled_task = load_balance(this_cpu, this_rq,
4481 if (time_after(next_balance, sd->last_balance + interval))
4482 next_balance = sd->last_balance + interval;
4484 this_rq->idle_stamp = 0;
4507 static int active_load_balance_cpu_stop(void *data)
4509 struct rq *busiest_rq = data;
4510 int busiest_cpu = cpu_of(busiest_rq);
4511 int target_cpu = busiest_rq->push_cpu;
4512 struct rq *target_rq = cpu_rq(target_cpu);
4513 struct sched_domain *sd;
4519 !busiest_rq->active_balance))
4531 BUG_ON(busiest_rq == target_rq);
4534 double_lock_balance(busiest_rq, target_rq);
4538 for_each_domain(target_cpu, sd) {
4539 if ((sd->flags & SD_LOAD_BALANCE) &&
4545 struct lb_env env = {
4547 .dst_cpu = target_cpu,
4548 .dst_rq = target_rq,
4549 .src_cpu = busiest_rq->cpu,
4550 .src_rq = busiest_rq,
4556 if (move_one_task(&env))
4562 double_unlock_balance(busiest_rq, target_rq);
4564 busiest_rq->active_balance = 0;
4579 unsigned long next_balance;
4582 static inline int find_new_ilb(int call_cpu)
4584 int ilb = cpumask_first(nohz.idle_cpus_mask);
4586 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4597 static void nohz_balancer_kick(int cpu)
4601 nohz.next_balance++;
4603 ilb_cpu = find_new_ilb(cpu);
4605 if (ilb_cpu >= nr_cpu_ids)
4620 static inline void nohz_balance_exit_idle(int cpu)
4623 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4625 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4629 static inline void set_cpu_sd_state_busy(void)
4631 struct sched_domain *sd;
4634 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4639 for_each_domain(cpu, sd)
4644 void set_cpu_sd_state_idle(void)
4646 struct sched_domain *sd;
4649 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4651 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4654 for_each_domain(cpu, sd)
4663 void nohz_balance_enter_idle(int cpu)
4671 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4674 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4676 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4680 unsigned long action, void *hcpu)
4709 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4712 struct rq *rq = cpu_rq(cpu);
4714 struct sched_domain *sd;
4716 unsigned long next_balance = jiffies + 60*HZ;
4717 int update_next_balance = 0;
4723 for_each_domain(cpu, sd) {
4724 if (!(sd->flags & SD_LOAD_BALANCE))
4727 interval = sd->balance_interval;
4729 interval *= sd->busy_factor;
4733 interval = clamp(interval, 1UL, max_load_balance_interval);
4735 need_serialize = sd->flags & SD_SERIALIZE;
4737 if (need_serialize) {
4738 if (!spin_trylock(&balancing))
4743 if (load_balance(cpu, rq, sd, idle, &balance)) {
4753 spin_unlock(&balancing);
4755 if (time_after(next_balance, sd->last_balance + interval)) {
4756 next_balance = sd->last_balance + interval;
4757 update_next_balance = 1;
4775 if (likely(update_next_balance))
4784 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4786 struct rq *this_rq = cpu_rq(this_cpu);
4791 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
4795 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4806 rq = cpu_rq(balance_cpu);
4813 rebalance_domains(balance_cpu, CPU_IDLE);
4820 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4832 static inline int nohz_kick_needed(struct rq *rq, int cpu)
4835 struct sched_domain *sd;
4844 set_cpu_sd_state_busy();
4845 nohz_balance_exit_idle(cpu);
4861 for_each_domain(cpu, sd) {
4862 struct sched_group *sg = sd->groups;
4863 struct sched_group_power *sgp = sg->sgp;
4866 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4867 goto need_kick_unlock;
4869 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
4871 sched_domain_span(sd)) < cpu))
4872 goto need_kick_unlock;
4874 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
4886 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
4896 struct rq *this_rq = cpu_rq(this_cpu);
4900 rebalance_domains(this_cpu, idle);
4907 nohz_idle_balance(this_cpu, idle);
4910 static inline int on_null_domain(int cpu)
4918 void trigger_load_balance(struct rq *rq, int cpu)
4922 likely(!on_null_domain(cpu)))
4925 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4926 nohz_balancer_kick(cpu);
4930 static void rq_online_fair(struct rq *rq)
4935 static void rq_offline_fair(struct rq *rq)
4940 unthrottle_offline_cfs_rqs(rq);
4948 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4950 struct cfs_rq *cfs_rq;
4954 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued);
4966 struct cfs_rq *cfs_rq;
4970 unsigned long flags;
4976 cfs_rq = task_cfs_rq(current);
4977 curr = cfs_rq->curr;
4979 if (unlikely(task_cpu(p) != this_cpu)) {
4981 __set_task_cpu(p, this_cpu);
4985 update_curr(cfs_rq);
4989 place_entity(cfs_rq, se, 1);
4991 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5010 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5020 if (rq->curr == p) {
5021 if (p->prio > oldprio)
5027 static void switched_from_fair(struct rq *rq, struct task_struct *p)
5030 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5046 place_entity(cfs_rq, se, 0);
5054 static void switched_to_fair(struct rq *rq, struct task_struct *p)
5075 static void set_curr_task_fair(struct rq *rq)
5080 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5082 set_next_entity(cfs_rq, se);
5084 account_cfs_rq_runtime(cfs_rq, 0);
5092 #ifndef CONFIG_64BIT
5097 #ifdef CONFIG_FAIR_GROUP_SCHED
5098 static void task_move_group_fair(struct task_struct *p, int on_rq)
5129 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5130 set_task_rq(p, task_cpu(p));
5132 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5139 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5143 kfree(tg->cfs_rq[i]);
5154 struct cfs_rq *cfs_rq;
5158 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5161 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5170 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5181 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5194 struct rq *rq = cpu_rq(cpu);
5195 unsigned long flags;
5201 if (!tg->cfs_rq[cpu]->on_list)
5205 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5209 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5213 struct rq *rq = cpu_rq(cpu);
5219 cfs_rq->load_stamp = 1;
5221 init_cfs_rq_runtime(cfs_rq);
5223 tg->cfs_rq[cpu] = cfs_rq;
5231 se->cfs_rq = &rq->cfs;
5233 se->cfs_rq = parent->my_q;
5236 update_load_set(&se->load, 0);
5237 se->parent = parent;
5242 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5245 unsigned long flags;
5256 if (tg->shares == shares)
5259 tg->shares = shares;
5261 struct rq *rq = cpu_rq(i);
5268 update_cfs_shares(group_cfs_rq(se));
5290 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5293 unsigned int rr_interval = 0;
5299 if (rq->cfs.load.weight)
5310 .enqueue_task = enqueue_task_fair,
5311 .dequeue_task = dequeue_task_fair,
5312 .yield_task = yield_task_fair,
5313 .yield_to_task = yield_to_task_fair,
5315 .check_preempt_curr = check_preempt_wakeup,
5317 .pick_next_task = pick_next_task_fair,
5318 .put_prev_task = put_prev_task_fair,
5321 .select_task_rq = select_task_rq_fair,
5323 .rq_online = rq_online_fair,
5324 .rq_offline = rq_offline_fair,
5326 .task_waking = task_waking_fair,
5329 .set_curr_task = set_curr_task_fair,
5330 .task_tick = task_tick_fair,
5331 .task_fork = task_fork_fair,
5333 .prio_changed = prio_changed_fair,
5334 .switched_from = switched_from_fair,
5335 .switched_to = switched_to_fair,
5337 .get_rr_interval = get_rr_interval_fair,
5339 #ifdef CONFIG_FAIR_GROUP_SCHED
5340 .task_move_group = task_move_group_fair,
5344 #ifdef CONFIG_SCHED_DEBUG
5347 struct cfs_rq *cfs_rq;
5363 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);