29 #include <linux/errno.h>
34 #include <linux/kernel.h>
36 #include <linux/list.h>
37 #include <linux/mempolicy.h>
40 #include <linux/export.h>
46 #include <linux/sched.h>
49 #include <linux/slab.h>
51 #include <linux/stat.h>
52 #include <linux/string.h>
53 #include <linux/time.h>
57 #include <asm/uaccess.h>
92 struct cgroup_subsys_state css;
113 static inline struct cpuset *cgroup_cs(struct cgroup *cont)
115 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
122 return container_of(task_subsys_state(task, cpuset_subsys_id),
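/* cgroup_cs() (line 113) and task_cs() recover the enclosing cpuset from its
 * embedded cgroup_subsys_state (line 92) with container_of(). A minimal
 * stand-alone sketch of that pattern; the struct names and the local
 * container_of definition below are illustrative, not kernel code. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css_like { int refcnt; };        /* stands in for cgroup_subsys_state */
struct cpuset_like {                    /* stands in for struct cpuset */
	struct css_like css;            /* embedded subsystem state */
	int flags;
};

static struct cpuset_like *to_cpuset(struct css_like *css)
{
	return container_of(css, struct cpuset_like, css);
}

int main(void)
{
	struct cpuset_like cs = { .flags = 7 };
	struct css_like *css = &cs.css;         /* what the cgroup core hands back */

	printf("flags = %d\n", to_cpuset(css)->flags);   /* prints 7 */
	return 0;
}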
127 static inline bool task_has_mempolicy(struct task_struct *task)
129 return task->mempolicy;
132 static inline bool task_has_mempolicy(struct task_struct *task)
157 static inline int is_cpu_exclusive(const struct cpuset *cs)
162 static inline int is_mem_exclusive(const struct cpuset *cs)
167 static inline int is_mem_hardwall(const struct cpuset *cs)
172 static inline int is_sched_load_balance(const struct cpuset *cs)
177 static inline int is_memory_migrate(const struct cpuset *cs)
182 static inline int is_spread_page(const struct cpuset *cs)
187 static inline int is_spread_slab(const struct cpuset *cs)
192 static struct cpuset top_cpuset = {
242 #define CPUSET_NAME_LEN (128)
243 #define CPUSET_NODELIST_LEN (256)
254 int flags, const char *unused_dev_name, void *data)
261 "release_agent=/sbin/cpuset_release_agent";
262 ret = cgroup_fs->mount(cgroup_fs, flags,
263 unused_dev_name, mountopts);
271 .mount = cpuset_mount,
288 static void guarantee_online_cpus(const struct cpuset *cs,
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
296 cpumask_copy(pmask, cpu_online_mask);
297 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
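/* The loop at line 291 climbs toward the root until it finds a cpuset whose
 * cpus_allowed still intersects cpu_online_mask, and falls back to the online
 * mask itself (line 296) if nothing intersects. A user-space sketch of that
 * walk over a toy hierarchy; plain unsigned ints stand in for cpumasks and
 * all names are illustrative. */
#include <stdio.h>

struct node {
	struct node *parent;
	unsigned int cpus_allowed;      /* toy stand-in for a cpumask */
};

/* Walk up until an ancestor overlaps the online mask; if none does,
 * use the online mask itself. */
static unsigned int online_cpus_for(const struct node *n, unsigned int online)
{
	while (n && !(n->cpus_allowed & online))
		n = n->parent;
	return n ? (n->cpus_allowed & online) : online;
}

int main(void)
{
	struct node root  = { .parent = NULL,  .cpus_allowed = 0xff };
	struct node child = { .parent = &root, .cpus_allowed = 0x30 };

	/* CPUs 4-5 went offline: the child's mask no longer intersects. */
	printf("%#x\n", online_cpus_for(&child, 0x0f));  /* prints 0xf, taken from the root */
	return 0;
}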
313 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 static void cpuset_update_task_spread_flag(struct cpuset *cs,
334 if (is_spread_page(cs))
338 if (is_spread_slab(cs))
352 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
356 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
357 is_mem_exclusive(p) <= is_mem_exclusive(q);
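/* Lines 356-357 compare the exclusive flags with "<=": for 0/1 values this
 * reads as logical implication, so p is only a "subset" of q if every
 * exclusive bit set in p is also set in q. The same trick in isolation
 * (stand-alone sketch, names invented): */
#include <assert.h>
#include <stdbool.h>

/* For boolean flags, "p <= q" means "p implies q". */
static bool implies(bool p, bool q)
{
	return p <= q;
}

int main(void)
{
	assert(implies(false, false));
	assert(implies(false, true));
	assert(implies(true,  true));
	assert(!implies(true, false));  /* exclusive child under a non-exclusive parent */
	return 0;
}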
364 static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
385 static void free_trial_cpuset(struct cpuset *trial)
411 static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
418 if (!is_cpuset_subset(cgroup_cs(cont), trial))
423 if (cur == &top_cpuset)
429 if (!is_cpuset_subset(trial, par))
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
470 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
478 update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
483 while (!list_empty(&q)) {
494 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp);
498 child = cgroup_cs(cont);
567 struct sched_domain_attr *dattr;
576 if (is_sched_load_balance(&top_cpuset)) {
578 doms = alloc_sched_domains(ndoms);
584 *dattr = SD_ATTR_INIT;
585 update_domain_attr_tree(dattr, &top_cpuset);
598 while (!list_empty(&q)) {
614 if (is_sched_load_balance(cp)) {
620 child = cgroup_cs(cont);
625 for (i = 0; i < csn; i++)
631 for (i = 0; i < csn; i++) {
635 for (j = 0; j < csn; j++) {
639 if (apn != bpn && cpusets_overlap(a, b)) {
640 for (k = 0; k < csn; k++) {
656 doms = alloc_sched_domains(ndoms);
666 for (nslot = 0, i = 0; i < csn; i++) {
678 if (nslot == ndoms) {
679 static int warnings = 10;
682 "rebuild_sched_domains confused:"
683 " nslot %d, ndoms %d, csn %d, i %d,"
685 nslot, ndoms, csn, i, apn);
693 *(dattr + nslot) = SD_ATTR_INIT;
694 for (j = i; j < csn; j++) {
700 update_domain_attr_tree(dattr + nslot, b);
737 struct sched_domain_attr *attr;
745 ndoms = generate_sched_domains(&doms, &attr);
749 partition_sched_domains(ndoms, doms, attr);
759 struct sched_domain_attr **attributes)
766 static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
787 static void async_rebuild_sched_domains(void)
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
803 do_rebuild_sched_domains(NULL);
816 static int cpuset_test_cpumask(struct task_struct *tsk,
817 struct cgroup_scanner *scan)
820 (cgroup_cs(scan->cg))->cpus_allowed);
834 static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan)
837 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
853 static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
855 struct cgroup_scanner scan;
857 scan.cg = cs->css.cgroup;
858 scan.test_task = cpuset_test_cpumask;
859 scan.process_task = cpuset_change_cpumask;
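/* update_tasks_cpumask() fills a cgroup_scanner with an optional test_task
 * predicate and a process_task action (lines 858-859) and lets the cgroup
 * core apply them to every task in the group. A stand-alone sketch of that
 * filter-then-apply callback shape; the mini scanner, task array and helper
 * names below are invented for illustration. */
#include <stdio.h>

struct task { int id; int cpu_mask; };

struct scanner {
	int (*test_task)(struct task *t, void *data);     /* may be NULL: process all */
	void (*process_task)(struct task *t, void *data);
	void *data;
};

static void scan_tasks(struct task *tasks, int n, struct scanner *s)
{
	for (int i = 0; i < n; i++)
		if (!s->test_task || s->test_task(&tasks[i], s->data))
			s->process_task(&tasks[i], s->data);
}

/* Analogue of cpuset_test_cpumask(): skip tasks already on the new mask. */
static int needs_update(struct task *t, void *data) { return t->cpu_mask != *(int *)data; }
/* Analogue of cpuset_change_cpumask(): apply the cpuset's mask to the task. */
static void apply_mask(struct task *t, void *data)  { t->cpu_mask = *(int *)data; }

int main(void)
{
	struct task tasks[] = { {1, 0x1}, {2, 0x3}, {3, 0x1} };
	int new_mask = 0x3;
	struct scanner s = { needs_update, apply_mask, &new_mask };

	scan_tasks(tasks, 3, &s);
	for (int i = 0; i < 3; i++)
		printf("task %d -> %#x\n", tasks[i].id, tasks[i].cpu_mask);
	return 0;
}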
869 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
874 int is_load_balanced;
877 if (cs == &top_cpuset)
893 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
896 retval = validate_change(cs, trialcs);
908 is_load_balanced = is_sched_load_balance(trialcs);
918 update_tasks_cpumask(cs, &heap);
922 if (is_load_balanced)
923 async_rebuild_sched_domains();
952 tsk->mems_allowed = *to;
956 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
968 static void cpuset_change_task_nodemask(struct task_struct *tsk,
989 need_loop = task_has_mempolicy(tsk) ||
993 write_seqcount_begin(&tsk->mems_allowed_seq);
995 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
999 tsk->mems_allowed = *newmems;
1002 write_seqcount_end(&tsk->mems_allowed_seq);
1012 static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan)
1021 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems);
1024 cpuset_change_task_nodemask(p, &newmems);
1030 migrate = is_memory_migrate(cs);
1038 static void *cpuset_being_rebound;
1050 static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1053 struct cgroup_scanner scan;
1055 cpuset_being_rebound = cs;
1057 scan.cg = cs->css.cgroup;
1058 scan.test_task = NULL;
1059 scan.process_task = cpuset_change_nodemask;
1076 cpuset_being_rebound = NULL;
1092 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1106 if (cs == &top_cpuset) {
1135 retval = validate_change(cs, trialcs);
1147 update_tasks_nodemask(cs, oldmem, &heap);
1157 return task_cs(current) == cpuset_being_rebound;
1160 static int update_relax_domain_level(struct cpuset *cs, s64 val)
1163 if (val < -1 || val >= sched_domain_level_max)
1170 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains();
1187 static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan)
1190 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1206 static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1208 struct cgroup_scanner scan;
1210 scan.cg = cs->css.cgroup;
1211 scan.test_task = NULL;
1212 scan.process_task = cpuset_change_flag;
1230 int balance_flag_changed;
1231 int spread_flag_changed;
1235 trialcs = alloc_trial_cpuset(cs);
1244 err = validate_change(cs, trialcs);
1252 balance_flag_changed = (is_sched_load_balance(cs) !=
1253 is_sched_load_balance(trialcs));
1255 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1256 || (is_spread_page(cs) != is_spread_page(trialcs)));
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains();
1265 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap);
1269 free_trial_cpuset(trialcs);
1319 #define FM_MAXTICKS ((time_t)99)
1320 #define FM_MAXCNT 1000000
1321 #define FM_SCALE 1000
1324 static void fmeter_init(struct fmeter *fmp)
1333 static void fmeter_update(struct fmeter *fmp)
1351 static void fmeter_markevent(struct fmeter *fmp)
1353 spin_lock(&fmp->lock);
1356 spin_unlock(&fmp->lock);
1360 static int fmeter_getrate(struct fmeter *fmp)
1364 spin_lock(&fmp->lock);
1367 spin_unlock(&fmp->lock);
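/* The fmeter is a lock-protected, exponentially decaying event-rate meter:
 * markevent batches an event count, and every update ages the running value
 * once per elapsed second before folding the batch in. Only the caps at
 * lines 1319-1321 and the lock/update/unlock shape at lines 1353-1356 and
 * 1364-1367 appear in this listing; the decay coefficient and the exact
 * update arithmetic below are assumptions for a user-space sketch. */
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define FM_MAXTICKS ((time_t)99)   /* as at line 1319 */
#define FM_MAXCNT   1000000        /* as at line 1320 */
#define FM_SCALE    1000           /* as at line 1321 */
#define FM_COEF     900            /* assumed per-second decay factor */

struct fmeter_sketch {
	int cnt;                   /* events batched since the last update */
	int val;                   /* decayed rate, fixed point in FM_SCALE */
	time_t time;               /* time of last update */
	pthread_mutex_t lock;
};

/* Age the value once per elapsed second (capped), then fold in the batch. */
static void fmeter_update_sketch(struct fmeter_sketch *fmp)
{
	time_t now = time(NULL);
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;
	if (ticks > FM_MAXTICKS)
		ticks = FM_MAXTICKS;
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

static void fmeter_markevent_sketch(struct fmeter_sketch *fmp)
{
	pthread_mutex_lock(&fmp->lock);
	fmeter_update_sketch(fmp);
	if (fmp->cnt < FM_MAXCNT)
		fmp->cnt++;
	pthread_mutex_unlock(&fmp->lock);
}

static int fmeter_getrate_sketch(struct fmeter_sketch *fmp)
{
	int val;

	pthread_mutex_lock(&fmp->lock);
	fmeter_update_sketch(fmp);
	val = fmp->val;
	pthread_mutex_unlock(&fmp->lock);
	return val;
}

int main(void)
{
	struct fmeter_sketch fm = {
		.time = time(NULL), .lock = PTHREAD_MUTEX_INITIALIZER,
	};

	for (int i = 0; i < 50; i++)
		fmeter_markevent_sketch(&fm);
	sleep(1);                          /* events surface on the next tick */
	printf("decayed event rate: %d/%d\n", fmeter_getrate_sketch(&fm), FM_SCALE);
	return 0;
}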
1377 static nodemask_t cpuset_attach_nodemask_from;
1381 static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1383 struct cpuset *cs = cgroup_cs(cgrp);
1390 cgroup_taskset_for_each(task, cgrp, tset) {
1407 if (cs == &top_cpuset)
1408 cpumask_copy(cpus_attach, cpu_possible_mask);
1410 guarantee_online_cpus(cs, cpus_attach);
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1417 static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1426 cgroup_taskset_for_each(task, cgrp, tset) {
1433 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1434 cpuset_update_task_spread_flag(cs, task);
1446 if (is_memory_migrate(cs))
1447 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1448 &cpuset_attach_nodemask_to);
1470 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1473 struct cpuset *cs = cgroup_cs(cgrp);
1496 cpuset_memory_pressure_enabled = !!val;
1515 static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1518 struct cpuset *cs = cgroup_cs(cgrp);
1526 retval = update_relax_domain_level(cs, val);
1539 static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1543 struct cpuset *cs = cgroup_cs(cgrp);
1549 trialcs = alloc_trial_cpuset(cs);
1555 switch (cft->private) {
1557 retval = update_cpumask(cs, trialcs, buf);
1560 retval = update_nodemask(cs, trialcs, buf);
1567 free_trial_cpuset(trialcs);
1585 static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1596 static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1607 static ssize_t cpuset_common_file_read(struct cgroup *cont,
1611 size_t nbytes, loff_t *ppos)
1613 struct cpuset *cs = cgroup_cs(cont);
1626 s += cpuset_sprintf_cpulist(s, cs);
1629 s += cpuset_sprintf_memlist(s, cs);
1643 static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1645 struct cpuset *cs = cgroup_cs(cont);
1649 return is_cpu_exclusive(cs);
1651 return is_mem_exclusive(cs);
1653 return is_mem_hardwall(cs);
1655 return is_sched_load_balance(cs);
1657 return is_memory_migrate(cs);
1659 return cpuset_memory_pressure_enabled;
1661 return fmeter_getrate(&cs->fmeter);
1663 return is_spread_page(cs);
1665 return is_spread_slab(cs);
1674 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1676 struct cpuset *cs = cgroup_cs(cont);
1694 static struct cftype files[] = {
1697 .read = cpuset_common_file_read,
1698 .write_string = cpuset_write_resmask,
1699 .max_write_len = (100U + 6 * NR_CPUS),
1705 .read = cpuset_common_file_read,
1706 .write_string = cpuset_write_resmask,
1712 .name = "cpu_exclusive",
1713 .read_u64 = cpuset_read_u64,
1714 .write_u64 = cpuset_write_u64,
1719 .name = "mem_exclusive",
1720 .read_u64 = cpuset_read_u64,
1721 .write_u64 = cpuset_write_u64,
1726 .name = "mem_hardwall",
1727 .read_u64 = cpuset_read_u64,
1728 .write_u64 = cpuset_write_u64,
1733 .name = "sched_load_balance",
1734 .read_u64 = cpuset_read_u64,
1735 .write_u64 = cpuset_write_u64,
1740 .name = "sched_relax_domain_level",
1741 .read_s64 = cpuset_read_s64,
1742 .write_s64 = cpuset_write_s64,
1747 .name = "memory_migrate",
1748 .read_u64 = cpuset_read_u64,
1749 .write_u64 = cpuset_write_u64,
1754 .name = "memory_pressure",
1755 .read_u64 = cpuset_read_u64,
1756 .write_u64 = cpuset_write_u64,
1762 .name = "memory_spread_page",
1763 .read_u64 = cpuset_read_u64,
1764 .write_u64 = cpuset_write_u64,
1769 .name = "memory_spread_slab",
1770 .read_u64 = cpuset_read_u64,
1771 .write_u64 = cpuset_write_u64,
1776 .name = "memory_pressure_enabled",
1777 .flags = CFTYPE_ONLY_ON_ROOT,
1778 .read_u64 = cpuset_read_u64,
1779 .write_u64 = cpuset_write_u64,
1802 static void cpuset_post_clone(struct cgroup *cgroup)
1804 struct cgroup *parent, *child;
1807 parent = cgroup->parent;
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1828 static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1833 if (!cont->parent) {
1834 return &top_cpuset.css;
1836 parent = cgroup_cs(cont->parent);
1846 if (is_spread_page(parent))
1848 if (is_spread_slab(parent))
1853 fmeter_init(&cs->fmeter);
1857 number_of_cpusets++;
1867 static void cpuset_destroy(struct cgroup *cont)
1869 struct cpuset *cs = cgroup_cs(cont);
1871 if (is_sched_load_balance(cs))
1874 number_of_cpusets--;
1881 .create = cpuset_create,
1882 .destroy = cpuset_destroy,
1883 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files,
1907 fmeter_init(&top_cpuset.fmeter);
1915 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1918 number_of_cpusets = 1;
1930 static void cpuset_do_move_task(struct task_struct *tsk,
1931 struct cgroup_scanner *scan)
1933 struct cgroup *new_cgroup = scan->data;
1949 static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1951 struct cgroup_scanner scan;
1953 scan.cg = from->css.cgroup;
1954 scan.test_task = NULL;
1955 scan.process_task = cpuset_do_move_task;
1957 scan.data = to->css.cgroup;
1961 "cgroup_scan_tasks failed\n");
1974 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1983 if (list_empty(&cs->css.cgroup->css_sets))
1995 move_member_tasks_to_cpuset(cs, parent);
2008 struct cgroup *cont;
2010 if (list_empty(queue))
2016 child = cgroup_cs(cont);
2051 while ((cp = cpuset_next(&queue)) != NULL) {
2065 remove_tasks_in_empty_cpuset(cp);
2067 update_tasks_cpumask(cp, NULL);
2072 while ((cp = cpuset_next(&queue)) != NULL) {
2089 remove_tasks_in_empty_cpuset(cp);
2091 update_tasks_nodemask(cp, &oldmems, NULL);
2116 struct sched_domain_attr *attr;
2128 ndoms = generate_sched_domains(&doms, &attr);
2132 partition_sched_domains(ndoms, doms, attr);
2135 #ifdef CONFIG_MEMORY_HOTPLUG
2141 static int cpuset_track_online_nodes(struct notifier_block *self,
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2203 guarantee_online_cpus(task_cs(tsk), pmask);
2258 guarantee_online_mems(task_cs(tsk), &mask);
2282 static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2284 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2366 if (gfp_mask & __GFP_HARDWALL)
2376 cs = nearest_hardwall_ancestor(task_cs(current));
2460 static int cpuset_spread_node(int *rotor)
2474 current->cpuset_mem_spread_rotor =
2477 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2483 current->cpuset_slab_spread_rotor =
2486 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
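/* cpuset_spread_node() (line 2460) advances a per-task rotor so that
 * successive page and slab allocations land on successive nodes of
 * mems_allowed, spreading cache pressure across the cpuset. A user-space
 * sketch of the rotor idea over a small allowed-node array; the helper and
 * variable names are invented. */
#include <stdio.h>

/* Round-robin over an allowed-node list, advancing the caller's rotor. */
static int spread_node(int *rotor, const int *allowed, int n_allowed)
{
	int node = allowed[*rotor % n_allowed];

	*rotor = (*rotor + 1) % n_allowed;
	return node;
}

int main(void)
{
	int allowed[] = { 0, 2, 3 };    /* toy mems_allowed */
	int mem_rotor = 0;              /* per-task cpuset_mem_spread_rotor analogue */

	for (int i = 0; i < 5; i++)
		printf("allocation %d goes to node %d\n",
		       i, spread_node(&mem_rotor, allowed, 3));
	return 0;
}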
2520 dentry = task_cs(tsk)->css.cgroup->dentry;
2521 spin_lock(&cpuset_buffer_lock);
2523 dentry ? (const char *)dentry->d_name.name : "/");
2527 tsk->comm, cpuset_name, cpuset_nodelist);
2528 spin_unlock(&cpuset_buffer_lock);
2564 #ifdef CONFIG_PROC_PID_CPUSET
2574 static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2579 struct cgroup_subsys_state *css;
2595 css = task_subsys_state(tsk, cpuset_subsys_id);
2603 put_task_struct(tsk);
2610 static int cpuset_open(struct inode *inode, struct file *file)
2612 struct pid *pid = PROC_I(inode)->pid;
2617 .open = cpuset_open,
2628 seq_nodemask(m, &task->mems_allowed);
2631 seq_nodemask_list(m, &task->mems_allowed);