Linux Kernel 3.7.1
migrate.c
1 /*
2  * Memory Migration functionality - linux/mm/migration.c
3  *
4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5  *
6  * Page migration was first developed in the context of the memory hotplug
7  * project. The main authors of the migration code are:
8  *
9  * IWAMOTO Toshihiro <[email protected]>
10  * Hirokazu Takahashi <[email protected]>
11  * Dave Hansen <[email protected]>
12  * Christoph Lameter
13  */
14 
15 #include <linux/migrate.h>
16 #include <linux/export.h>
17 #include <linux/swap.h>
18 #include <linux/swapops.h>
19 #include <linux/pagemap.h>
20 #include <linux/buffer_head.h>
21 #include <linux/mm_inline.h>
22 #include <linux/nsproxy.h>
23 #include <linux/pagevec.h>
24 #include <linux/ksm.h>
25 #include <linux/rmap.h>
26 #include <linux/topology.h>
27 #include <linux/cpu.h>
28 #include <linux/cpuset.h>
29 #include <linux/writeback.h>
30 #include <linux/mempolicy.h>
31 #include <linux/vmalloc.h>
32 #include <linux/security.h>
33 #include <linux/memcontrol.h>
34 #include <linux/syscalls.h>
35 #include <linux/hugetlb.h>
36 #include <linux/hugetlb_cgroup.h>
37 #include <linux/gfp.h>
38 
39 #include <asm/tlbflush.h>
40 
41 #include "internal.h"
42 
43 /*
44  * migrate_prep() needs to be called before we start compiling a list of pages
45  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
46  * undesirable, use migrate_prep_local()
47  */
48 int migrate_prep(void)
49 {
50  /*
51  * Clear the LRU lists so pages can be isolated.
52  * Note that pages may be moved off the LRU after we have
53  * drained them. Those pages will fail to migrate like other
54  * pages that may be busy.
55  */
56  lru_add_drain_all();
57 
58  return 0;
59 }
60 
61 /* Do the necessary work of migrate_prep but not if it involves other CPUs */
62 int migrate_prep_local(void)
63 {
64  lru_add_drain();
65 
66  return 0;
67 }
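As a usage illustration only (not part of this file): a minimal sketch of how a caller is expected to pair migrate_prep() with isolate_lru_page() and migrate_pages(); alloc_dest_page() and migrate_one_lru_page() are hypothetical names introduced for this sketch.

/* Hypothetical sketch of the expected calling sequence. */
static struct page *alloc_dest_page(struct page *page, unsigned long private,
				    int **resultp)
{
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}

static int migrate_one_lru_page(struct page *page)
{
	LIST_HEAD(pagelist);
	int err;

	migrate_prep();			/* drain per-CPU LRU pagevecs everywhere */

	err = isolate_lru_page(page);	/* caller must already hold a page ref */
	if (err)
		return err;

	inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
	list_add_tail(&page->lru, &pagelist);

	err = migrate_pages(&pagelist, alloc_dest_page, 0, false, MIGRATE_SYNC);
	if (err)
		putback_lru_pages(&pagelist);	/* put back whatever did not move */
	return err;
}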
68 
69 /*
70  * Add isolated pages on the list back to the LRU under page lock
71  * to avoid leaking evictable pages back onto unevictable list.
72  */
73 void putback_lru_pages(struct list_head *l)
74 {
75  struct page *page;
76  struct page *page2;
77 
78  list_for_each_entry_safe(page, page2, l, lru) {
79  list_del(&page->lru);
80  dec_zone_page_state(page, NR_ISOLATED_ANON +
81  page_is_file_cache(page));
82  putback_lru_page(page);
83  }
84 }
85 
86 /*
87  * Restore a potential migration pte to a working pte entry
88  */
89 static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
90  unsigned long addr, void *old)
91 {
92  struct mm_struct *mm = vma->vm_mm;
93  swp_entry_t entry;
94  pgd_t *pgd;
95  pud_t *pud;
96  pmd_t *pmd;
97  pte_t *ptep, pte;
98  spinlock_t *ptl;
99 
100  if (unlikely(PageHuge(new))) {
101  ptep = huge_pte_offset(mm, addr);
102  if (!ptep)
103  goto out;
104  ptl = &mm->page_table_lock;
105  } else {
106  pgd = pgd_offset(mm, addr);
107  if (!pgd_present(*pgd))
108  goto out;
109 
110  pud = pud_offset(pgd, addr);
111  if (!pud_present(*pud))
112  goto out;
113 
114  pmd = pmd_offset(pud, addr);
115  if (pmd_trans_huge(*pmd))
116  goto out;
117  if (!pmd_present(*pmd))
118  goto out;
119 
120  ptep = pte_offset_map(pmd, addr);
121 
122  /*
123  * Peek to check is_swap_pte() before taking ptlock? No, we
124  * can race mremap's move_ptes(), which skips anon_vma lock.
125  */
126 
127  ptl = pte_lockptr(mm, pmd);
128  }
129 
130  spin_lock(ptl);
131  pte = *ptep;
132  if (!is_swap_pte(pte))
133  goto unlock;
134 
135  entry = pte_to_swp_entry(pte);
136 
137  if (!is_migration_entry(entry) ||
138  migration_entry_to_page(entry) != old)
139  goto unlock;
140 
141  get_page(new);
142  pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
143  if (is_write_migration_entry(entry))
144  pte = pte_mkwrite(pte);
145 #ifdef CONFIG_HUGETLB_PAGE
146  if (PageHuge(new))
147  pte = pte_mkhuge(pte);
148 #endif
149  flush_cache_page(vma, addr, pte_pfn(pte));
150  set_pte_at(mm, addr, ptep, pte);
151 
152  if (PageHuge(new)) {
153  if (PageAnon(new))
154  hugepage_add_anon_rmap(new, vma, addr);
155  else
156  page_dup_rmap(new);
157  } else if (PageAnon(new))
158  page_add_anon_rmap(new, vma, addr);
159  else
160  page_add_file_rmap(new);
161 
162  /* No need to invalidate - it was non-present before */
163  update_mmu_cache(vma, addr, ptep);
164 unlock:
165  pte_unmap_unlock(ptep, ptl);
166 out:
167  return SWAP_AGAIN;
168 }
169 
170 /*
171  * Get rid of all migration entries and replace them by
172  * references to the indicated page.
173  */
174 static void remove_migration_ptes(struct page *old, struct page *new)
175 {
176  rmap_walk(new, remove_migration_pte, old);
177 }
178 
179 /*
180  * Something used the pte of a page under migration. We need to
181  * get to the page and wait until migration is finished.
182  * When we return from this function the fault will be retried.
183  */
184 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
185  unsigned long address)
186 {
187  pte_t *ptep, pte;
188  spinlock_t *ptl;
189  swp_entry_t entry;
190  struct page *page;
191 
192  ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
193  pte = *ptep;
194  if (!is_swap_pte(pte))
195  goto out;
196 
197  entry = pte_to_swp_entry(pte);
198  if (!is_migration_entry(entry))
199  goto out;
200 
201  page = migration_entry_to_page(entry);
202 
203  /*
204  * Once the radix-tree replacement step of page migration has started,
205  * page_count *must* be zero. And we don't want to call
206  * wait_on_page_locked() against a page without get_page().
207  * So we use get_page_unless_zero() here. Even if it fails, the page
208  * fault will simply occur again.
209  */
210  if (!get_page_unless_zero(page))
211  goto out;
212  pte_unmap_unlock(ptep, ptl);
213  wait_on_page_locked(page);
214  put_page(page);
215  return;
216 out:
217  pte_unmap_unlock(ptep, ptl);
218 }
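For context, a hedged sketch (not verbatim kernel code) of how a fault handler uses the entry point above: on seeing a migration entry in a pte it waits for migration to finish and lets the fault retry. The helper name below is made up.

/* Hypothetical helper illustrating the fault-path usage of migration_entry_wait(). */
static int wait_if_migration_entry(struct mm_struct *mm, pmd_t *pmd,
				   unsigned long address, pte_t orig_pte)
{
	swp_entry_t entry;

	if (!is_swap_pte(orig_pte))
		return 0;			/* present or none: not our case */

	entry = pte_to_swp_entry(orig_pte);
	if (!is_migration_entry(entry))
		return 0;			/* an ordinary swap entry */

	/* Sleep until the migration that installed this entry completes. */
	migration_entry_wait(mm, pmd, address);
	return 1;				/* caller should retry the fault */
}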
219 
220 #ifdef CONFIG_BLOCK
221 /* Returns true if all buffers are successfully locked */
222 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
223  enum migrate_mode mode)
224 {
225  struct buffer_head *bh = head;
226 
227  /* Simple case, sync compaction */
228  if (mode != MIGRATE_ASYNC) {
229  do {
230  get_bh(bh);
231  lock_buffer(bh);
232  bh = bh->b_this_page;
233 
234  } while (bh != head);
235 
236  return true;
237  }
238 
239  /* async case, we cannot block on lock_buffer so use trylock_buffer */
240  do {
241  get_bh(bh);
242  if (!trylock_buffer(bh)) {
243  /*
244  * We failed to lock the buffer and cannot stall in
245  * async migration. Release the taken locks
246  */
247  struct buffer_head *failed_bh = bh;
248  put_bh(failed_bh);
249  bh = head;
250  while (bh != failed_bh) {
251  unlock_buffer(bh);
252  put_bh(bh);
253  bh = bh->b_this_page;
254  }
255  return false;
256  }
257 
258  bh = bh->b_this_page;
259  } while (bh != head);
260  return true;
261 }
262 #else
263 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
264  enum migrate_mode mode)
265 {
266  return true;
267 }
268 #endif /* CONFIG_BLOCK */
269 
270 /*
271  * Replace the page in the mapping.
272  *
273  * The number of remaining references must be:
274  * 1 for anonymous pages without a mapping
275  * 2 for pages with a mapping
276  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
277  */
278 static int migrate_page_move_mapping(struct address_space *mapping,
279  struct page *newpage, struct page *page,
280  struct buffer_head *head, enum migrate_mode mode)
281 {
282  int expected_count;
283  void **pslot;
284 
285  if (!mapping) {
286  /* Anonymous page without mapping */
287  if (page_count(page) != 1)
288  return -EAGAIN;
289  return 0;
290  }
291 
292  spin_lock_irq(&mapping->tree_lock);
293 
294  pslot = radix_tree_lookup_slot(&mapping->page_tree,
295  page_index(page));
296 
297  expected_count = 2 + page_has_private(page);
298  if (page_count(page) != expected_count ||
299  radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
300  spin_unlock_irq(&mapping->tree_lock);
301  return -EAGAIN;
302  }
303 
304  if (!page_freeze_refs(page, expected_count)) {
305  spin_unlock_irq(&mapping->tree_lock);
306  return -EAGAIN;
307  }
308 
309  /*
310  * In the async migration case of moving a page with buffers, lock the
311  * buffers using trylock before the mapping is moved. If the mapping
312  * were moved first and we then failed to lock the buffers, we could not
313  * move the mapping back due to an elevated page count and would have to
314  * block waiting on other references to be dropped.
315  */
316  if (mode == MIGRATE_ASYNC && head &&
317  !buffer_migrate_lock_buffers(head, mode)) {
318  page_unfreeze_refs(page, expected_count);
319  spin_unlock_irq(&mapping->tree_lock);
320  return -EAGAIN;
321  }
322 
323  /*
324  * Now we know that no one else is looking at the page.
325  */
326  get_page(newpage); /* add cache reference */
327  if (PageSwapCache(page)) {
328  SetPageSwapCache(newpage);
329  set_page_private(newpage, page_private(page));
330  }
331 
332  radix_tree_replace_slot(pslot, newpage);
333 
334  /*
335  * Drop cache reference from old page by unfreezing
336  * to one less reference.
337  * We know this isn't the last reference.
338  */
339  page_unfreeze_refs(page, expected_count - 1);
340 
341  /*
342  * If moved to a different zone then also account
343  * the page for that zone. Other VM counters will be
344  * taken care of when we establish references to the
345  * new page and drop references to the old page.
346  *
347  * Note that anonymous pages are accounted for
348  * via NR_FILE_PAGES and NR_ANON_PAGES if they
349  * are mapped to swap space.
350  */
351  __dec_zone_page_state(page, NR_FILE_PAGES);
352  __inc_zone_page_state(newpage, NR_FILE_PAGES);
353  if (!PageSwapCache(page) && PageSwapBacked(page)) {
354  __dec_zone_page_state(page, NR_SHMEM);
355  __inc_zone_page_state(newpage, NR_SHMEM);
356  }
357  spin_unlock_irq(&mapping->tree_lock);
358 
359  return 0;
360 }
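The reference-count rule from the comment above migrate_page_move_mapping(), restated as an illustrative helper that is not part of the kernel:

/* Illustrative only: expected remaining references before the mapping is moved. */
static inline int migration_expected_refs(struct address_space *mapping,
					  struct page *page)
{
	if (!mapping)
		return 1;	/* anonymous page: only the migration caller */
	/* page cache reference + caller reference, +1 if PagePrivate/PagePrivate2 */
	return 2 + page_has_private(page);
}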
361 
362 /*
363  * The expected number of remaining references is the same as that
364  * of migrate_page_move_mapping().
365  */
366 int migrate_huge_page_move_mapping(struct address_space *mapping,
367  struct page *newpage, struct page *page)
368 {
369  int expected_count;
370  void **pslot;
371 
372  if (!mapping) {
373  if (page_count(page) != 1)
374  return -EAGAIN;
375  return 0;
376  }
377 
378  spin_lock_irq(&mapping->tree_lock);
379 
380  pslot = radix_tree_lookup_slot(&mapping->page_tree,
381  page_index(page));
382 
383  expected_count = 2 + page_has_private(page);
384  if (page_count(page) != expected_count ||
385  radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
386  spin_unlock_irq(&mapping->tree_lock);
387  return -EAGAIN;
388  }
389 
390  if (!page_freeze_refs(page, expected_count)) {
391  spin_unlock_irq(&mapping->tree_lock);
392  return -EAGAIN;
393  }
394 
395  get_page(newpage);
396 
397  radix_tree_replace_slot(pslot, newpage);
398 
399  page_unfreeze_refs(page, expected_count - 1);
400 
401  spin_unlock_irq(&mapping->tree_lock);
402  return 0;
403 }
404 
405 /*
406  * Copy the page to its new location
407  */
408 void migrate_page_copy(struct page *newpage, struct page *page)
409 {
410  if (PageHuge(page))
411  copy_huge_page(newpage, page);
412  else
413  copy_highpage(newpage, page);
414 
415  if (PageError(page))
416  SetPageError(newpage);
417  if (PageReferenced(page))
418  SetPageReferenced(newpage);
419  if (PageUptodate(page))
420  SetPageUptodate(newpage);
421  if (TestClearPageActive(page)) {
422  VM_BUG_ON(PageUnevictable(page));
423  SetPageActive(newpage);
424  } else if (TestClearPageUnevictable(page))
425  SetPageUnevictable(newpage);
426  if (PageChecked(page))
427  SetPageChecked(newpage);
428  if (PageMappedToDisk(page))
429  SetPageMappedToDisk(newpage);
430 
431  if (PageDirty(page)) {
432  clear_page_dirty_for_io(page);
433  /*
434  * Want to mark the page and the radix tree as dirty, and
435  * redo the accounting that clear_page_dirty_for_io undid,
436  * but we can't use set_page_dirty because that function
437  * is actually a signal that all of the page has become dirty.
438  * Whereas only part of our page may be dirty.
439  */
440  if (PageSwapBacked(page))
441  SetPageDirty(newpage);
442  else
443  __set_page_dirty_nobuffers(newpage);
444  }
445 
446  mlock_migrate_page(newpage, page);
447  ksm_migrate_page(newpage, page);
448 
449  ClearPageSwapCache(page);
450  ClearPagePrivate(page);
451  set_page_private(page, 0);
452 
453  /*
454  * If any waiters have accumulated on the new page then
455  * wake them up.
456  */
457  if (PageWriteback(newpage))
458  end_page_writeback(newpage);
459 }
460 
461 /************************************************************
462  * Migration functions
463  ***********************************************************/
464 
465 /* Always fail migration. Used for mappings that are not movable */
466 int fail_migrate_page(struct address_space *mapping,
467  struct page *newpage, struct page *page)
468 {
469  return -EIO;
470 }
471 EXPORT_SYMBOL(fail_migrate_page);
472 
473 /*
474  * Common logic to directly migrate a single page suitable for
475  * pages that do not use PagePrivate/PagePrivate2.
476  *
477  * Pages are locked upon entry and exit.
478  */
479 int migrate_page(struct address_space *mapping,
480  struct page *newpage, struct page *page,
481  enum migrate_mode mode)
482 {
483  int rc;
484 
485  BUG_ON(PageWriteback(page)); /* Writeback must be complete */
486 
487  rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 
489  if (rc)
490  return rc;
491 
492  migrate_page_copy(newpage, page);
493  return 0;
494 }
495 EXPORT_SYMBOL(migrate_page);
496 
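Filesystems whose pages carry no private data can simply point their address_space_operations at this helper; a sketch of that hookup (example_aops is a placeholder, other callbacks omitted):

static const struct address_space_operations example_aops = {
	.migratepage	= migrate_page,		/* no buffers/private data to move */
	/* .readpage, .writepage, ... omitted */
};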
497 #ifdef CONFIG_BLOCK
498 /*
499  * Migration function for pages with buffers. This function can only be used
500  * if the underlying filesystem guarantees that no other references to "page"
501  * exist.
502  */
503 int buffer_migrate_page(struct address_space *mapping,
504  struct page *newpage, struct page *page, enum migrate_mode mode)
505 {
506  struct buffer_head *bh, *head;
507  int rc;
508 
509  if (!page_has_buffers(page))
510  return migrate_page(mapping, newpage, page, mode);
511 
512  head = page_buffers(page);
513 
514  rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 
516  if (rc)
517  return rc;
518 
519  /*
520  * In the async case, migrate_page_move_mapping locked the buffers
521  * with an IRQ-safe spinlock held. In the sync case, the buffers
522  * need to be locked now
523  */
524  if (mode != MIGRATE_ASYNC)
525  BUG_ON(!buffer_migrate_lock_buffers(head, mode));
526 
527  ClearPagePrivate(page);
528  set_page_private(newpage, page_private(page));
529  set_page_private(page, 0);
530  put_page(page);
531  get_page(newpage);
532 
533  bh = head;
534  do {
535  set_bh_page(bh, newpage, bh_offset(bh));
536  bh = bh->b_this_page;
537 
538  } while (bh != head);
539 
540  SetPagePrivate(newpage);
541 
542  migrate_page_copy(newpage, page);
543 
544  bh = head;
545  do {
546  unlock_buffer(bh);
547  put_bh(bh);
548  bh = bh->b_this_page;
549 
550  } while (bh != head);
551 
552  return 0;
553 }
554 EXPORT_SYMBOL(buffer_migrate_page);
555 #endif
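Block filesystems whose pages have buffer_heads attached would hook up the buffer-aware variant instead; again a placeholder sketch:

#ifdef CONFIG_BLOCK
static const struct address_space_operations example_blkfs_aops = {
	.migratepage	= buffer_migrate_page,	/* moves the attached buffer_heads too */
	/* other callbacks omitted */
};
#endif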
556 
557 /*
558  * Writeback a page to clean the dirty state
559  */
560 static int writeout(struct address_space *mapping, struct page *page)
561 {
562  struct writeback_control wbc = {
563  .sync_mode = WB_SYNC_NONE,
564  .nr_to_write = 1,
565  .range_start = 0,
566  .range_end = LLONG_MAX,
567  .for_reclaim = 1
568  };
569  int rc;
570 
571  if (!mapping->a_ops->writepage)
572  /* No write method for the address space */
573  return -EINVAL;
574 
575  if (!clear_page_dirty_for_io(page))
576  /* Someone else already triggered a write */
577  return -EAGAIN;
578 
579  /*
580  * A dirty page may imply that the underlying filesystem has
581  * the page on some queue. So the page must be clean for
582  * migration. Writeout may mean we lose the lock and the
583  * page state is no longer what we checked for earlier.
584  * At this point we know that the migration attempt cannot
585  * be successful.
586  */
587  remove_migration_ptes(page, page);
588 
589  rc = mapping->a_ops->writepage(page, &wbc);
590 
591  if (rc != AOP_WRITEPAGE_ACTIVATE)
592  /* unlocked. Relock */
593  lock_page(page);
594 
595  return (rc < 0) ? -EIO : -EAGAIN;
596 }
597 
598 /*
599  * Default handling if a filesystem does not provide a migration function.
600  */
601 static int fallback_migrate_page(struct address_space *mapping,
602  struct page *newpage, struct page *page, enum migrate_mode mode)
603 {
604  if (PageDirty(page)) {
605  /* Only writeback pages in full synchronous migration */
606  if (mode != MIGRATE_SYNC)
607  return -EBUSY;
608  return writeout(mapping, page);
609  }
610 
611  /*
612  * Buffers may be managed in a filesystem specific way.
613  * We must have no buffers or drop them.
614  */
615  if (page_has_private(page) &&
616  !try_to_release_page(page, GFP_KERNEL))
617  return -EAGAIN;
618 
619  return migrate_page(mapping, newpage, page, mode);
620 }
621 
622 /*
623  * Move a page to a newly allocated page
624  * The page is locked and all ptes have been successfully removed.
625  *
626  * The new page will have replaced the old page if this function
627  * is successful.
628  *
629  * Return value:
630  * < 0 - error code
631  * == 0 - success
632  */
633 static int move_to_new_page(struct page *newpage, struct page *page,
634  int remap_swapcache, enum migrate_mode mode)
635 {
636  struct address_space *mapping;
637  int rc;
638 
639  /*
640  * Block others from accessing the page when we get around to
641  * establishing additional references. We are the only one
642  * holding a reference to the new page at this point.
643  */
644  if (!trylock_page(newpage))
645  BUG();
646 
647  /* Prepare mapping for the new page.*/
648  newpage->index = page->index;
649  newpage->mapping = page->mapping;
650  if (PageSwapBacked(page))
651  SetPageSwapBacked(newpage);
652 
653  mapping = page_mapping(page);
654  if (!mapping)
655  rc = migrate_page(mapping, newpage, page, mode);
656  else if (mapping->a_ops->migratepage)
657  /*
658  * Most pages have a mapping and most filesystems provide a
659  * migratepage callback. Anonymous pages are part of swap
660  * space which also has its own migratepage callback. This
661  * is the most common path for page migration.
662  */
663  rc = mapping->a_ops->migratepage(mapping,
664  newpage, page, mode);
665  else
666  rc = fallback_migrate_page(mapping, newpage, page, mode);
667 
668  if (rc) {
669  newpage->mapping = NULL;
670  } else {
671  if (remap_swapcache)
672  remove_migration_ptes(page, newpage);
673  page->mapping = NULL;
674  }
675 
676  unlock_page(newpage);
677 
678  return rc;
679 }
680 
681 static int __unmap_and_move(struct page *page, struct page *newpage,
682  int force, bool offlining, enum migrate_mode mode)
683 {
684  int rc = -EAGAIN;
685  int remap_swapcache = 1;
686  struct mem_cgroup *mem;
687  struct anon_vma *anon_vma = NULL;
688 
689  if (!trylock_page(page)) {
690  if (!force || mode == MIGRATE_ASYNC)
691  goto out;
692 
693  /*
694  * It's not safe for direct compaction to call lock_page.
695  * For example, during page readahead pages are added locked
696  * to the LRU. Later, when the IO completes the pages are
697  * marked uptodate and unlocked. However, the queueing
698  * could be merging multiple pages for one bio (e.g.
699  * mpage_readpages). If an allocation happens for the
700  * second or third page, the process can end up locking
701  * the same page twice and deadlocking. Rather than
702  * trying to be clever about what pages can be locked,
703  * avoid the use of lock_page for direct compaction
704  * altogether.
705  */
706  if (current->flags & PF_MEMALLOC)
707  goto out;
708 
709  lock_page(page);
710  }
711 
712  /*
713  * Only memory hotplug's offline_pages() caller has locked out KSM,
714  * and can safely migrate a KSM page. The other cases have skipped
715  * PageKsm along with PageReserved - but it is only now when we have
716  * the page lock that we can be certain it will not go KSM beneath us
717  * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
718  * its pagecount raised, but only here do we take the page lock which
719  * serializes that).
720  */
721  if (PageKsm(page) && !offlining) {
722  rc = -EBUSY;
723  goto unlock;
724  }
725 
726  /* charge against new page */
727  mem_cgroup_prepare_migration(page, newpage, &mem);
728 
729  if (PageWriteback(page)) {
730  /*
731  * Only in the case of a full synchronous migration is it
732  * necessary to wait for PageWriteback. In the async case,
733  * the retry loop is too short and in the sync-light case,
734  * the overhead of stalling is too much
735  */
736  if (mode != MIGRATE_SYNC) {
737  rc = -EBUSY;
738  goto uncharge;
739  }
740  if (!force)
741  goto uncharge;
742  wait_on_page_writeback(page);
743  }
744  /*
745  * By try_to_unmap(), page->mapcount goes down to 0 here. In this case
746  * we cannot notice that the anon_vma is freed while we migrate a page.
747  * This get_anon_vma() delays freeing the anon_vma pointer until the end
748  * of migration. File cache pages are no problem because of page_lock():
749  * file caches may use writepage() or lock_page() during migration, so
750  * only anonymous pages need this care here.
751  */
752  if (PageAnon(page)) {
753  /*
754  * Only page_lock_anon_vma() understands the subtleties of
755  * getting a hold on an anon_vma from outside one of its mms.
756  */
757  anon_vma = page_get_anon_vma(page);
758  if (anon_vma) {
759  /*
760  * Anon page
761  */
762  } else if (PageSwapCache(page)) {
763  /*
764  * We cannot be sure that the anon_vma of an unmapped
765  * swapcache page is safe to use because we don't
766  * know in advance if the VMA that this page belonged
767  * to still exists. If the VMA and others sharing the
768  * data have been freed, then the anon_vma could
769  * already be invalid.
770  *
771  * To avoid this possibility, swapcache pages get
772  * migrated but are not remapped when migration
773  * completes
774  */
775  remap_swapcache = 0;
776  } else {
777  goto uncharge;
778  }
779  }
780 
781  /*
782  * Corner case handling:
783  * 1. When a new swap-cache page is read into, it is added to the LRU
784  * and treated as swapcache but it has no rmap yet.
785  * Calling try_to_unmap() against a page->mapping==NULL page will
786  * trigger a BUG. So handle it here.
787  * 2. An orphaned page (see truncate_complete_page) might have
788  * fs-private metadata. The page can be picked up due to memory
789  * offlining. Everywhere else except page reclaim, the page is
790  * invisible to the vm, so the page can not be migrated. So try to
791  * free the metadata, so the page can be freed.
792  */
793  if (!page->mapping) {
794  VM_BUG_ON(PageAnon(page));
795  if (page_has_private(page)) {
796  try_to_free_buffers(page);
797  goto uncharge;
798  }
799  goto skip_unmap;
800  }
801 
802  /* Establish migration ptes or remove ptes */
803  try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
804 
805 skip_unmap:
806  if (!page_mapped(page))
807  rc = move_to_new_page(newpage, page, remap_swapcache, mode);
808 
809  if (rc && remap_swapcache)
810  remove_migration_ptes(page, page);
811 
812  /* Drop an anon_vma reference if we took one */
813  if (anon_vma)
814  put_anon_vma(anon_vma);
815 
816 uncharge:
817  mem_cgroup_end_migration(mem, page, newpage, rc == 0);
818 unlock:
819  unlock_page(page);
820 out:
821  return rc;
822 }
823 
824 /*
825  * Obtain the lock on page, remove all ptes and migrate the page
826  * to the newly allocated page in newpage.
827  */
828 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
829  struct page *page, int force, bool offlining,
830  enum migrate_mode mode)
831 {
832  int rc = 0;
833  int *result = NULL;
834  struct page *newpage = get_new_page(page, private, &result);
835 
836  if (!newpage)
837  return -ENOMEM;
838 
839  if (page_count(page) == 1) {
840  /* page was freed from under us. So we are done. */
841  goto out;
842  }
843 
844  if (unlikely(PageTransHuge(page)))
845  if (unlikely(split_huge_page(page)))
846  goto out;
847 
848  rc = __unmap_and_move(page, newpage, force, offlining, mode);
849 out:
850  if (rc != -EAGAIN) {
851  /*
852  * A page that has been migrated has all references
853  * removed and will be freed. A page that has not been
854  * migrated will have kept its references and be
855  * restored.
856  */
857  list_del(&page->lru);
858  dec_zone_page_state(page, NR_ISOLATED_ANON +
859  page_is_file_cache(page));
860  putback_lru_page(page);
861  }
862  /*
863  * Move the new page to the LRU. If migration was not successful
864  * then this will free the page.
865  */
866  putback_lru_page(newpage);
867  if (result) {
868  if (rc)
869  *result = rc;
870  else
871  *result = page_to_nid(newpage);
872  }
873  return rc;
874 }
875 
876 /*
877  * Counterpart of unmap_and_move_page() for hugepage migration.
878  *
879  * This function doesn't wait for the completion of hugepage I/O
880  * because there is no race between I/O and migration for hugepage.
881  * Note that currently hugepage I/O occurs only in direct I/O
882  * where no lock is held and PG_writeback is irrelevant,
883  * and writeback status of all subpages are counted in the reference
884  * count of the head page (i.e. if all subpages of a 2MB hugepage are
885  * under direct I/O, the reference of the head page is 512 and a bit more.)
886  * This means that when we try to migrate hugepage whose subpages are
887  * doing direct I/O, some references remain after try_to_unmap() and
888  * hugepage migration fails without data corruption.
889  *
890  * There is also no race when direct I/O is issued on the page under migration,
891  * because then pte is replaced with migration swap entry and direct I/O code
892  * will wait in the page fault for migration to complete.
893  */
894 static int unmap_and_move_huge_page(new_page_t get_new_page,
895  unsigned long private, struct page *hpage,
896  int force, bool offlining,
897  enum migrate_mode mode)
898 {
899  int rc = 0;
900  int *result = NULL;
901  struct page *new_hpage = get_new_page(hpage, private, &result);
902  struct anon_vma *anon_vma = NULL;
903 
904  if (!new_hpage)
905  return -ENOMEM;
906 
907  rc = -EAGAIN;
908 
909  if (!trylock_page(hpage)) {
910  if (!force || mode != MIGRATE_SYNC)
911  goto out;
912  lock_page(hpage);
913  }
914 
915  if (PageAnon(hpage))
916  anon_vma = page_get_anon_vma(hpage);
917 
918  try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
919 
920  if (!page_mapped(hpage))
921  rc = move_to_new_page(new_hpage, hpage, 1, mode);
922 
923  if (rc)
924  remove_migration_ptes(hpage, hpage);
925 
926  if (anon_vma)
927  put_anon_vma(anon_vma);
928 
929  if (!rc)
930  hugetlb_cgroup_migrate(hpage, new_hpage);
931 
932  unlock_page(hpage);
933 out:
934  put_page(new_hpage);
935  if (result) {
936  if (rc)
937  *result = rc;
938  else
939  *result = page_to_nid(new_hpage);
940  }
941  return rc;
942 }
943 
944 /*
945  * migrate_pages
946  *
947  * The function takes one list of pages to migrate and a function
948  * that determines from the page to be migrated and the private data
949  * the target of the move and allocates the page.
950  *
951  * The function returns after 10 attempts or if no pages
952  * are movable anymore because the list has become empty
953  * or no retryable pages exist anymore.
954  * Caller should call putback_lru_pages to return pages to the LRU
955  * or free list only if ret != 0.
956  *
957  * Return: Number of pages not migrated or error code.
958  */
959 int migrate_pages(struct list_head *from,
960  new_page_t get_new_page, unsigned long private, bool offlining,
961  enum migrate_mode mode)
962 {
963  int retry = 1;
964  int nr_failed = 0;
965  int pass = 0;
966  struct page *page;
967  struct page *page2;
968  int swapwrite = current->flags & PF_SWAPWRITE;
969  int rc;
970 
971  if (!swapwrite)
972  current->flags |= PF_SWAPWRITE;
973 
974  for(pass = 0; pass < 10 && retry; pass++) {
975  retry = 0;
976 
977  list_for_each_entry_safe(page, page2, from, lru) {
978  cond_resched();
979 
980  rc = unmap_and_move(get_new_page, private,
981  page, pass > 2, offlining,
982  mode);
983 
984  switch(rc) {
985  case -ENOMEM:
986  goto out;
987  case -EAGAIN:
988  retry++;
989  break;
990  case 0:
991  break;
992  default:
993  /* Permanent failure */
994  nr_failed++;
995  break;
996  }
997  }
998  }
999  rc = 0;
1000 out:
1001  if (!swapwrite)
1002  current->flags &= ~PF_SWAPWRITE;
1003 
1004  if (rc)
1005  return rc;
1006 
1007  return nr_failed + retry;
1008 }
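A hedged sketch of the new_page_t callback contract that migrate_pages() relies on (compare new_page_node() further down); alloc_on_node() is a made-up name, and packing the target node into 'private' is purely illustrative:

static struct page *alloc_on_node(struct page *page, unsigned long private,
				  int **resultp)
{
	int nid = (int)private;		/* caller chose to pass the target node here */

	/* A callback may point *resultp at a per-page status slot; none used here. */
	return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}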
1009 
1010 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1011  unsigned long private, bool offlining,
1012  enum migrate_mode mode)
1013 {
1014  int pass, rc;
1015 
1016  for (pass = 0; pass < 10; pass++) {
1017  rc = unmap_and_move_huge_page(get_new_page,
1018  private, hpage, pass > 2, offlining,
1019  mode);
1020  switch (rc) {
1021  case -ENOMEM:
1022  goto out;
1023  case -EAGAIN:
1024  /* try again */
1025  cond_resched();
1026  break;
1027  case 0:
1028  goto out;
1029  default:
1030  rc = -EIO;
1031  goto out;
1032  }
1033  }
1034 out:
1035  return rc;
1036 }
1037 
1038 #ifdef CONFIG_NUMA
1039 /*
1040  * Move a list of individual pages
1041  */
1042 struct page_to_node {
1043  unsigned long addr;
1044  struct page *page;
1045  int node;
1046  int status;
1047 };
1048 
1049 static struct page *new_page_node(struct page *p, unsigned long private,
1050  int **result)
1051 {
1052  struct page_to_node *pm = (struct page_to_node *)private;
1053 
1054  while (pm->node != MAX_NUMNODES && pm->page != p)
1055  pm++;
1056 
1057  if (pm->node == MAX_NUMNODES)
1058  return NULL;
1059 
1060  *result = &pm->status;
1061 
1062  return alloc_pages_exact_node(pm->node,
1063  GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1064 }
1065 
1066 /*
1067  * Move a set of pages as indicated in the pm array. The addr
1068  * field must be set to the virtual address of the page to be moved
1069  * and the node number must contain a valid target node.
1070  * The pm array ends with node = MAX_NUMNODES.
1071  */
1072 static int do_move_page_to_node_array(struct mm_struct *mm,
1073  struct page_to_node *pm,
1074  int migrate_all)
1075 {
1076  int err;
1077  struct page_to_node *pp;
1078  LIST_HEAD(pagelist);
1079 
1080  down_read(&mm->mmap_sem);
1081 
1082  /*
1083  * Build a list of pages to migrate
1084  */
1085  for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1086  struct vm_area_struct *vma;
1087  struct page *page;
1088 
1089  err = -EFAULT;
1090  vma = find_vma(mm, pp->addr);
1091  if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1092  goto set_status;
1093 
1094  page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1095 
1096  err = PTR_ERR(page);
1097  if (IS_ERR(page))
1098  goto set_status;
1099 
1100  err = -ENOENT;
1101  if (!page)
1102  goto set_status;
1103 
1104  /* Use PageReserved to check for zero page */
1105  if (PageReserved(page) || PageKsm(page))
1106  goto put_and_set;
1107 
1108  pp->page = page;
1109  err = page_to_nid(page);
1110 
1111  if (err == pp->node)
1112  /*
1113  * Node already in the right place
1114  */
1115  goto put_and_set;
1116 
1117  err = -EACCES;
1118  if (page_mapcount(page) > 1 &&
1119  !migrate_all)
1120  goto put_and_set;
1121 
1122  err = isolate_lru_page(page);
1123  if (!err) {
1124  list_add_tail(&page->lru, &pagelist);
1125  inc_zone_page_state(page, NR_ISOLATED_ANON +
1126  page_is_file_cache(page));
1127  }
1128 put_and_set:
1129  /*
1130  * Either remove the duplicate refcount from
1131  * isolate_lru_page() or drop the page ref if it was
1132  * not isolated.
1133  */
1134  put_page(page);
1135 set_status:
1136  pp->status = err;
1137  }
1138 
1139  err = 0;
1140  if (!list_empty(&pagelist)) {
1141  err = migrate_pages(&pagelist, new_page_node,
1142  (unsigned long)pm, 0, MIGRATE_SYNC);
1143  if (err)
1144  putback_lru_pages(&pagelist);
1145  }
1146 
1147  up_read(&mm->mmap_sem);
1148  return err;
1149 }
1150 
1151 /*
1152  * Migrate an array of page addresses onto an array of nodes and fill
1153  * the corresponding array of status values.
1154  */
1155 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1156  unsigned long nr_pages,
1157  const void __user * __user *pages,
1158  const int __user *nodes,
1159  int __user *status, int flags)
1160 {
1161  struct page_to_node *pm;
1162  unsigned long chunk_nr_pages;
1163  unsigned long chunk_start;
1164  int err;
1165 
1166  err = -ENOMEM;
1167  pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1168  if (!pm)
1169  goto out;
1170 
1171  migrate_prep();
1172 
1173  /*
1174  * Store a chunk of page_to_node array in a page,
1175  * but keep the last one as a marker
1176  */
1177  chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1178 
1179  for (chunk_start = 0;
1180  chunk_start < nr_pages;
1181  chunk_start += chunk_nr_pages) {
1182  int j;
1183 
1184  if (chunk_start + chunk_nr_pages > nr_pages)
1185  chunk_nr_pages = nr_pages - chunk_start;
1186 
1187  /* fill the chunk pm with addrs and nodes from user-space */
1188  for (j = 0; j < chunk_nr_pages; j++) {
1189  const void __user *p;
1190  int node;
1191 
1192  err = -EFAULT;
1193  if (get_user(p, pages + j + chunk_start))
1194  goto out_pm;
1195  pm[j].addr = (unsigned long) p;
1196 
1197  if (get_user(node, nodes + j + chunk_start))
1198  goto out_pm;
1199 
1200  err = -ENODEV;
1201  if (node < 0 || node >= MAX_NUMNODES)
1202  goto out_pm;
1203 
1204  if (!node_state(node, N_HIGH_MEMORY))
1205  goto out_pm;
1206 
1207  err = -EACCES;
1208  if (!node_isset(node, task_nodes))
1209  goto out_pm;
1210 
1211  pm[j].node = node;
1212  }
1213 
1214  /* End marker for this chunk */
1215  pm[chunk_nr_pages].node = MAX_NUMNODES;
1216 
1217  /* Migrate this chunk */
1218  err = do_move_page_to_node_array(mm, pm,
1219  flags & MPOL_MF_MOVE_ALL);
1220  if (err < 0)
1221  goto out_pm;
1222 
1223  /* Return status information */
1224  for (j = 0; j < chunk_nr_pages; j++)
1225  if (put_user(pm[j].status, status + j + chunk_start)) {
1226  err = -EFAULT;
1227  goto out_pm;
1228  }
1229  }
1230  err = 0;
1231 
1232 out_pm:
1233  free_page((unsigned long)pm);
1234 out:
1235  return err;
1236 }
1237 
1238 /*
1239  * Determine the nodes of an array of pages and store them in an array of status values.
1240  */
1241 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1242  const void __user **pages, int *status)
1243 {
1244  unsigned long i;
1245 
1246  down_read(&mm->mmap_sem);
1247 
1248  for (i = 0; i < nr_pages; i++) {
1249  unsigned long addr = (unsigned long)(*pages);
1250  struct vm_area_struct *vma;
1251  struct page *page;
1252  int err = -EFAULT;
1253 
1254  vma = find_vma(mm, addr);
1255  if (!vma || addr < vma->vm_start)
1256  goto set_status;
1257 
1258  page = follow_page(vma, addr, 0);
1259 
1260  err = PTR_ERR(page);
1261  if (IS_ERR(page))
1262  goto set_status;
1263 
1264  err = -ENOENT;
1265  /* Use PageReserved to check for zero page */
1266  if (!page || PageReserved(page) || PageKsm(page))
1267  goto set_status;
1268 
1269  err = page_to_nid(page);
1270 set_status:
1271  *status = err;
1272 
1273  pages++;
1274  status++;
1275  }
1276 
1277  up_read(&mm->mmap_sem);
1278 }
1279 
1280 /*
1281  * Determine the nodes of a user array of pages and store it in
1282  * a user array of status.
1283  */
1284 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1285  const void __user * __user *pages,
1286  int __user *status)
1287 {
1288 #define DO_PAGES_STAT_CHUNK_NR 16
1289  const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1290  int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1291 
1292  while (nr_pages) {
1293  unsigned long chunk_nr;
1294 
1295  chunk_nr = nr_pages;
1296  if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1297  chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1298 
1299  if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1300  break;
1301 
1302  do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1303 
1304  if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1305  break;
1306 
1307  pages += chunk_nr;
1308  status += chunk_nr;
1309  nr_pages -= chunk_nr;
1310  }
1311  return nr_pages ? -EFAULT : 0;
1312 }
1313 
1314 /*
1315  * Move a list of pages in the address space of the currently executing
1316  * process.
1317  */
1318 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1319  const void __user * __user *, pages,
1320  const int __user *, nodes,
1321  int __user *, status, int, flags)
1322 {
1323  const struct cred *cred = current_cred(), *tcred;
1324  struct task_struct *task;
1325  struct mm_struct *mm;
1326  int err;
1327  nodemask_t task_nodes;
1328 
1329  /* Check flags */
1330  if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1331  return -EINVAL;
1332 
1333  if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1334  return -EPERM;
1335 
1336  /* Find the mm_struct */
1337  rcu_read_lock();
1338  task = pid ? find_task_by_vpid(pid) : current;
1339  if (!task) {
1340  rcu_read_unlock();
1341  return -ESRCH;
1342  }
1343  get_task_struct(task);
1344 
1345  /*
1346  * Check if this process has the right to modify the specified
1347  * process. The right exists if the process has administrative
1348  * capabilities, superuser privileges or the same
1349  * userid as the target process.
1350  */
1351  tcred = __task_cred(task);
1352  if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1353  !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1354  !capable(CAP_SYS_NICE)) {
1355  rcu_read_unlock();
1356  err = -EPERM;
1357  goto out;
1358  }
1359  rcu_read_unlock();
1360 
1361  err = security_task_movememory(task);
1362  if (err)
1363  goto out;
1364 
1365  task_nodes = cpuset_mems_allowed(task);
1366  mm = get_task_mm(task);
1367  put_task_struct(task);
1368 
1369  if (!mm)
1370  return -EINVAL;
1371 
1372  if (nodes)
1373  err = do_pages_move(mm, task_nodes, nr_pages, pages,
1374  nodes, status, flags);
1375  else
1376  err = do_pages_stat(mm, nr_pages, pages, status);
1377 
1378  mmput(mm);
1379  return err;
1380 
1381 out:
1382  put_task_struct(task);
1383  return err;
1384 }
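From userspace this syscall is normally reached through libnuma's move_pages(3) wrapper; a hedged example (link with -lnuma) that asks for one page of the calling process to be moved to node 0 and prints where it ended up:

#include <numaif.h>		/* move_pages(), MPOL_MF_MOVE */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = malloc(psz);
	void *pages[1] = { buf };
	int nodes[1] = { 0 };		/* requested target node */
	int status[1] = { -1 };		/* filled with the resulting node or -errno */

	if (!buf)
		return 1;
	memset(buf, 0, psz);		/* touch the page so it is actually present */

	if (move_pages(0 /* this process */, 1, pages, nodes, status,
		       MPOL_MF_MOVE) < 0)
		perror("move_pages");
	else
		printf("page now on node %d (negative means -errno)\n", status[0]);

	free(buf);
	return 0;
}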
1385 
1386 /*
1387  * Call migration functions in the vma_ops that may prepare
1388  * memory in a vm for migration. migration functions may perform
1389  * the migration for vmas that do not have an underlying page struct.
1390  */
1391 int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1392  const nodemask_t *from, unsigned long flags)
1393 {
1394  struct vm_area_struct *vma;
1395  int err = 0;
1396 
1397  for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1398  if (vma->vm_ops && vma->vm_ops->migrate) {
1399  err = vma->vm_ops->migrate(vma, to, from, flags);
1400  if (err)
1401  break;
1402  }
1403  }
1404  return err;
1405 }
1406 #endif