68 #include <linux/mempolicy.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
76 #include <linux/slab.h>
77 #include <linux/string.h>
78 #include <linux/export.h>
91 #include <linux/ctype.h>
94 #include <asm/tlbflush.h>
95 #include <asm/uaccess.h>
96 #include <linux/random.h>
101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)
102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)
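/* default_policy: prefer local allocation; used whenever a task or VMA has no policy of its own. */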
114 static struct mempolicy default_policy = {
120 static const struct mempolicy_operations {
141 static int is_valid_nodemask(const nodemask_t *nodemask)
158 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
175 pol->v.nodes = *nodes;
192 if (!is_valid_nodemask(nodes))
194 pol->v.nodes = *nodes;
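/* mpol_set_nodemask(): contextualize the user nodemask against the current cpuset, then call the mode's create() hook. */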
207 static int mpol_set_nodemask(struct mempolicy *pol,
224 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
228 if (mpol_store_user_nodemask(pol))
229 pol->w.user_nodemask = *nodes;
231 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
236 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
238 ret = mpol_ops[pol->mode].create(pol, NULL);
251 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
252 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
279 policy->flags = flags;
309 nodes_and(tmp, pol->w.user_nodemask, *nodes);
311 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
319 pol->w.cpuset_mems_allowed, *nodes);
320 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
322 tmp = pol->w.cpuset_mems_allowed;
323 pol->w.cpuset_mems_allowed = *nodes;
332 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
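/* mpol_rebind_preferred(): remap a preferred-node policy when the owning cpuset's mems_allowed changes. */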
347 static void mpol_rebind_preferred(struct mempolicy *pol,
357 pol->v.preferred_node = node;
362 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
365 pol->v.preferred_node = node_remap(pol->v.preferred_node,
366 pol->w.cpuset_mems_allowed,
368 pol->w.cpuset_mems_allowed = *nodes;
410 mpol_ops[pol->mode].rebind(pol, newmask, step);
423 mpol_rebind_policy(tsk->mempolicy, new, step);
442 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
444 .rebind = mpol_rebind_default,
447 .create = mpol_new_interleave,
448 .rebind = mpol_rebind_nodemask,
451 .create = mpol_new_preferred,
452 .rebind = mpol_rebind_preferred,
455 .create = mpol_new_bind,
456 .rebind = mpol_rebind_nodemask,
461 unsigned long flags);
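/* check_pte_range(): scan a PTE range and collect pages that live on nodes outside the allowed nodemask. */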
465 unsigned long addr, unsigned long end,
473 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
488 if (PageReserved(page) || PageKsm(page))
490 nid = page_to_nid(page);
495 migrate_page_add(page, private, flags);
498 } while (pte++, addr += PAGE_SIZE, addr != end);
499 pte_unmap_unlock(orig_pte, ptl);
504 unsigned long addr, unsigned long end,
515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
517 if (check_pte_range(vma, pmd, addr, next, nodes,
520 } while (pmd++, addr = next, addr != end);
525 unsigned long addr, unsigned long end,
535 if (pud_none_or_clear_bad(pud))
537 if (check_pmd_range(vma, pud, addr, next, nodes,
540 } while (pud++, addr = next, addr != end);
545 unsigned long addr, unsigned long end,
554 next = pgd_addr_end(addr, end);
555 if (pgd_none_or_clear_bad(pgd))
557 if (check_pud_range(vma, pgd, addr, next, nodes,
560 } while (pgd++, addr = next, addr != end);
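/* check_range(): walk every VMA in [start, end) and apply the page-table checks above. */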
571 const nodemask_t *nodes, unsigned long flags, void *private)
588 if (!is_vm_hugetlb_page(vma) &&
591 vma_migratable(vma)))) {
592 unsigned long endvma = vma->vm_end;
598 err = check_pgd_range(vma, start, endvma, nodes,
601 first = ERR_PTR(err);
621 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
631 err = vma->vm_ops->set_policy(vma, new);
636 old = vma->vm_policy;
637 vma->vm_policy = new;
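/* mbind_range(): install new_pol on every VMA in [start, end), splitting VMAs where the range does not cover them fully. */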
647 static int mbind_range(struct mm_struct *mm, unsigned long start,
648 unsigned long end, struct mempolicy *new_pol)
655 unsigned long vmstart;
668 vmstart = max(start, vma->vm_start);
669 vmend = min(end, vma->vm_end);
674 pgoff = vma->vm_pgoff +
677 vma->anon_vma, vma->vm_file, pgoff,
684 if (vma->vm_start != vmstart) {
685 err = split_vma(vma->vm_mm, vma, vmstart, 1);
689 if (vma->vm_end != vmend) {
690 err = split_vma(vma->vm_mm, vma, vmend, 0);
694 err = vma_replace_policy(vma, new_pol);
728 static void mpol_set_task_struct_flag(void)
734 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
745 new = mpol_new(mode, flags, nodes);
759 ret = mpol_set_nodemask(new, nodes, scratch);
769 mpol_set_task_struct_flag();
792 if (p == &default_policy)
803 node_set(p->v.preferred_node, *nodes);
811 static int lookup_node(struct mm_struct *mm, unsigned long addr)
818 err = page_to_nid(p);
825 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
826 unsigned long addr, unsigned long flags)
854 vma = find_vma_intersection(mm, addr, addr+1);
860 pol = vma->vm_ops->get_policy(vma, addr);
862 pol = vma->vm_policy;
867 pol = &default_policy;
870 if (flags & MPOL_F_ADDR) {
871 err = lookup_node(mm, addr);
875 } else if (pol == current->mempolicy &&
899 if (mpol_store_user_nodemask(pol)) {
900 *nmask = pol->w.user_nodemask;
903 get_policy_nodemask(pol, nmask);
915 #ifdef CONFIG_MIGRATION
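/* Page-migration helpers backing MPOL_MF_MOVE and MPOL_MF_MOVE_ALL. */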
919 static void migrate_page_add(struct page *page, struct list_head *pagelist,
929 page_is_file_cache(page));
934 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
960 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
962 if (!list_empty(&pagelist)) {
991 err = migrate_vmas(mm, from, to, flags);
1068 err = migrate_to_node(mm, source, dest, flags);
1089 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1108 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1109 unsigned long flags)
1119 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
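/* do_mbind(): the core of the mbind() system call; installs the policy and optionally migrates misplaced pages. */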
1125 static long do_mbind(unsigned long start, unsigned long len,
1126 unsigned short mode, unsigned short mode_flags,
1136 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1146 flags &= ~MPOL_MF_STRICT;
1156 new = mpol_new(mode, mode_flags, nmask);
1158 return PTR_ERR(new);
1167 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1168 start, start + len, mode, mode_flags,
1182 err = mpol_set_nodemask(new, nmask, scratch);
1194 flags | MPOL_MF_INVERT, &pagelist);
1200 err = mbind_range(mm, start, end, new);
1202 if (!list_empty(&pagelist)) {
1210 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
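/* get_nodes()/copy_nodes_to_user(): copy nodemasks between user and kernel space for the syscall entry points. */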
1226 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1227 unsigned long maxnode)
1230 unsigned long nlongs;
1231 unsigned long endmask;
1235 if (maxnode == 0 || !nmask)
1255 if (k == nlongs - 1) {
1272 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1275 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1278 if (copy > nbytes) {
1281 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1289 unsigned long, mode, unsigned long __user *, nmask,
1290 unsigned long, maxnode, unsigned, flags)
1294 unsigned short mode_flags;
1297 mode &= ~MPOL_MODE_FLAGS;
1303 err = get_nodes(&nodes, nmask, maxnode);
1306 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1311 unsigned long, maxnode)
1315 unsigned short flags;
1318 mode &= ~MPOL_MODE_FLAGS;
1319 if ((unsigned int)mode >= MPOL_MAX)
1323 err = get_nodes(&nodes, nmask, maxnode);
1326 return do_set_mempolicy(mode, flags, &nodes);
1330 const unsigned long __user *, old_nodes,
1331 const unsigned long __user *, new_nodes)
1348 err = get_nodes(old, old_nodes, maxnode);
1352 err = get_nodes(new, new_nodes, maxnode);
1375 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1376 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1401 put_task_struct(task);
1418 put_task_struct(task);
1426 unsigned long __user *, nmask, unsigned long, maxnode,
1427 unsigned long, addr, unsigned long, flags)
1436 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1441 if (policy && put_user(pval, policy))
1445 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1450 #ifdef CONFIG_COMPAT
1452 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1459 unsigned long nr_bits, alloc_size;
1463 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1470 if (!err && nmask) {
1471 unsigned long copy_size;
1472 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1487 unsigned long nr_bits, alloc_size;
1491 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1511 unsigned long nr_bits, alloc_size;
1515 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1526 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1549 struct mempolicy *pol = task->mempolicy;
1557 } else if (vma->vm_policy) {
1558 pol = vma->vm_policy;
1566 if (mpol_needs_cond_ref(pol))
1571 pol = &default_policy;
1585 return &policy->v.nodes;
1594 switch (policy->mode) {
1597 nd = policy->v.preferred_node;
1613 return node_zonelist(nd, gfp);
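/* interleave_nodes(): pick the next node in round-robin order for MPOL_INTERLEAVE. */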
1617 static unsigned interleave_nodes(struct mempolicy *policy)
1650 switch (policy->mode) {
1655 return policy->v.preferred_node;
1658 return interleave_nodes(policy);
1669 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1681 static unsigned offset_il_node(struct mempolicy *pol,
1691 target = (unsigned int)off % nnodes;
1696 } while (c <= target);
1701 static inline unsigned interleave_nid(struct mempolicy *pol,
1716 off += (addr - vma->vm_start) >> shift;
1717 return offset_il_node(pol, vma, off);
1719 return interleave_nodes(pol);
1737 #ifdef CONFIG_HUGETLBFS
1763 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1766 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1768 *nodemask = &(*mpol)->v.nodes;
1789 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1794 if (!(mask && current->mempolicy))
1798 mempolicy = current->mempolicy;
1799 switch (mempolicy->mode) {
1804 nid = mempolicy->v.preferred_node;
1805 init_nodemask_of_node(mask, nid);
1811 *mask = mempolicy->v.nodes;
1836 struct mempolicy *mempolicy;
1842 mempolicy = tsk->mempolicy;
1846 switch (mempolicy->mode) {
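/* Policy-aware page allocation: alloc_page_interleave(), alloc_pages_vma() and alloc_pages_current(). */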
1869 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1875 zl = node_zonelist(nid, gfp);
1876 page = __alloc_pages(gfp, order, zl);
1877 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1907 unsigned long addr, int node)
1909 struct mempolicy *pol;
1912 unsigned int cpuset_mems_cookie;
1916 cpuset_mems_cookie = get_mems_allowed();
1921 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1923 page = alloc_page_interleave(gfp, order, nid);
1924 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1929 zl = policy_zonelist(gfp, pol, node);
1930 if (unlikely(mpol_needs_cond_ref(pol))) {
1935 zl, policy_nodemask(gfp, pol));
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1945 policy_nodemask(gfp, pol));
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1972 struct mempolicy *pol = current->mempolicy;
1974 unsigned int cpuset_mems_cookie;
1977 pol = &default_policy;
1980 cpuset_mems_cookie = get_mems_allowed();
1987 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1991 policy_nodemask(gfp, pol));
1993 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2020 if (old == current->mempolicy) {
2045 if (a->mode != b->mode)
2047 if (a->flags != b->flags)
2049 if (mpol_store_user_nodemask(a))
2050 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2059 return a->v.preferred_node == b->v.preferred_node;
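/* Shared policies: a red-black tree of per-range policies, used by shmem and hugetlbfs mappings. */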
2077 static struct sp_node *
2078 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2080 struct rb_node *n = sp->root.rb_node;
2083 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2085 if (start >= p->end)
2087 else if (end <= p->start)
2095 struct sp_node *w = NULL;
2099 w = rb_entry(prev, struct sp_node, nd);
2100 if (w->end <= start)
2104 return rb_entry(n, struct sp_node, nd);
2109 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2111 struct rb_node **p = &sp->root.rb_node;
2117 nd = rb_entry(parent, struct sp_node, nd);
2118 if (new->start < nd->start)
2120 else if (new->end > nd->end)
2125 rb_link_node(&new->nd, parent, p);
2127 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2128 new->policy ? new->policy->mode : 0);
2135 struct mempolicy *pol = NULL;
2138 if (!sp->root.rb_node)
2141 sn = sp_lookup(sp, idx, idx+1);
2143 mpol_get(sn->policy);
2150 static void sp_free(struct sp_node *n)
2152 mpol_put(n->policy);
2156 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2158 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2163 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2164 struct mempolicy *pol)
2167 struct mempolicy *newpol;
2173 newpol = mpol_dup(pol);
2174 if (IS_ERR(newpol)) {
2188 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2189 unsigned long end, struct sp_node *new)
2195 n = sp_lookup(sp, start, end);
2197 while (n && n->start < end) {
2199 if (n->start >= start) {
2207 struct sp_node *new2;
2208 new2 = sp_alloc(end, n->end, n->policy);
2214 sp_insert(sp, new2);
2221 n = rb_entry(next, struct sp_node, nd);
2249 struct mempolicy *new;
2255 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2260 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2283 struct sp_node *new = NULL;
2284 unsigned long sz = vma_pages(vma);
2286 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2288 sz, npol ? npol->mode : -1,
2289 npol ? npol->flags : -1,
2309 if (!p->root.rb_node)
2314 n = rb_entry(next, struct sp_node, nd);
2325 unsigned long largest = 0;
2326 int nid, prefer = 0;
2329 sizeof(struct mempolicy),
2333 sizeof(struct sp_node),
2346 if (largest < total_pages) {
2347 largest = total_pages;
2352 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2358 node_set(prefer, interleave_nodes);
2361 printk("numa_policy_init: interleaving failed\n");
2378 #define MPOL_LOCAL MPOL_MAX
2379 static const char * const policy_modes[] =
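/* mpol_parse_str(): parse a textual policy such as the tmpfs "mpol=" mount option. */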
2408 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2410 struct mempolicy *new = NULL;
2411 unsigned short mode;
2414 char *nodelist = strchr(str, ':');
2415 char *flags = strchr(str, '=');
2432 if (!strcmp(str, policy_modes[mode])) {
2445 char *rest = nodelist;
2488 if (!strcmp(flags, "static"))
2490 else if (!strcmp(flags, "relative"))
2496 new = mpol_new(mode, mode_flags, &nodes);
2502 new->w.user_nodemask = nodes;
2508 ret = mpol_set_nodemask(new, &nodes, scratch);
2548 unsigned short mode;
2549 unsigned short flags = pol ? pol->flags : 0;
2556 if (!pol || pol == &default_policy)
2571 node_set(pol->v.preferred_node, nodes);
2578 nodes = pol->w.user_nodemask;
2580 nodes = pol->v.nodes;
2587 l = strlen(policy_modes[mode]);
2588 if (buffer + maxlen < p + l + 1)
2591 strcpy(p, policy_modes[mode]);
2595 if (buffer + maxlen < p + 2)
2603 p += snprintf(p, buffer + maxlen - p, "static");
2605 p += snprintf(p, buffer + maxlen - p, "relative");
2609 if (buffer + maxlen < p + 2)