Linux Kernel 3.7.1
memory_hotplug.c
1 /*
2  * linux/mm/memory_hotplug.c
3  *
4  * Copyright (C)
5  */
6 
7 #include <linux/stddef.h>
8 #include <linux/mm.h>
9 #include <linux/swap.h>
10 #include <linux/interrupt.h>
11 #include <linux/pagemap.h>
12 #include <linux/bootmem.h>
13 #include <linux/compiler.h>
14 #include <linux/export.h>
15 #include <linux/pagevec.h>
16 #include <linux/writeback.h>
17 #include <linux/slab.h>
18 #include <linux/sysctl.h>
19 #include <linux/cpu.h>
20 #include <linux/memory.h>
21 #include <linux/memory_hotplug.h>
22 #include <linux/highmem.h>
23 #include <linux/vmalloc.h>
24 #include <linux/ioport.h>
25 #include <linux/delay.h>
26 #include <linux/migrate.h>
27 #include <linux/page-isolation.h>
28 #include <linux/pfn.h>
29 #include <linux/suspend.h>
30 #include <linux/mm_inline.h>
31 #include <linux/firmware-map.h>
32 
33 #include <asm/tlbflush.h>
34 
35 #include "internal.h"
36 
37 /*
38  * online_page_callback contains a pointer to the current page-onlining
39  * function. Initially it is generic_online_page(). If required, it can be
40  * changed by calling set_online_page_callback() to register a callback and
41  * restore_online_page_callback() to restore the generic callback.
42  */
43 
44 static void generic_online_page(struct page *page);
45 
46 static online_page_callback_t online_page_callback = generic_online_page;
47 
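The callback hook above is what lets a driver (for example a memory-ballooning driver) temporarily take over onlining of hot-added pages. A minimal sketch of how such a registration might look; the driver name and callback body are hypothetical and not part of this file:

/* Illustrative sketch only -- my_balloon_online_page() is a made-up driver callback. */
#include <linux/memory_hotplug.h>
#include <linux/mm.h>

static void my_balloon_online_page(struct page *page)
{
	/* Keep the counters consistent, but hold on to the page instead of
	 * handing it to the buddy allocator the way generic_online_page()
	 * would via __online_page_free(). */
	__online_page_set_limits(page);
	/* ... driver-specific bookkeeping for the hot-added page ... */
}

static int my_driver_start(void)
{
	/* Fails with -EINVAL if a non-generic callback is already registered. */
	return set_online_page_callback(&my_balloon_online_page);
}

static void my_driver_stop(void)
{
	/* Hand page onlining back to generic_online_page(). */
	restore_online_page_callback(&my_balloon_online_page);
}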
48 DEFINE_MUTEX(mem_hotplug_mutex);
49 
50 void lock_memory_hotplug(void)
51 {
52  mutex_lock(&mem_hotplug_mutex);
53 
54  /* for exclusive hibernation if CONFIG_HIBERNATION=y */
55  lock_system_sleep();
56 }
57 
58 void unlock_memory_hotplug(void)
59 {
60  unlock_system_sleep();
61  mutex_unlock(&mem_hotplug_mutex);
62 }
63 
64 
65 /* add this memory to iomem resource */
66 static struct resource *register_memory_resource(u64 start, u64 size)
67 {
68  struct resource *res;
69  res = kzalloc(sizeof(struct resource), GFP_KERNEL);
70  BUG_ON(!res);
71 
72  res->name = "System RAM";
73  res->start = start;
74  res->end = start + size - 1;
75  res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76  if (request_resource(&iomem_resource, res) < 0) {
77  printk("System RAM resource %pR cannot be added\n", res);
78  kfree(res);
79  res = NULL;
80  }
81  return res;
82 }
83 
84 static void release_memory_resource(struct resource *res)
85 {
86  if (!res)
87  return;
88  release_resource(res);
89  kfree(res);
90  return;
91 }
92 
93 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
94 #ifndef CONFIG_SPARSEMEM_VMEMMAP
95 static void get_page_bootmem(unsigned long info, struct page *page,
96  unsigned long type)
97 {
98  page->lru.next = (struct list_head *) type;
99  SetPagePrivate(page);
100  set_page_private(page, info);
101  atomic_inc(&page->_count);
102 }
103 
104 /* The reference to __meminit __free_pages_bootmem is valid,
105  * so use __ref to tell modpost not to generate a warning */
106 void __ref put_page_bootmem(struct page *page)
107 {
108  unsigned long type;
109 
110  type = (unsigned long) page->lru.next;
111  BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
112  type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
113 
114  if (atomic_dec_return(&page->_count) == 1) {
115  ClearPagePrivate(page);
116  set_page_private(page, 0);
117  INIT_LIST_HEAD(&page->lru);
118  __free_pages_bootmem(page, 0);
119  }
120 
121 }
122 
123 static void register_page_bootmem_info_section(unsigned long start_pfn)
124 {
125  unsigned long *usemap, mapsize, section_nr, i;
126  struct mem_section *ms;
127  struct page *page, *memmap;
128 
129  section_nr = pfn_to_section_nr(start_pfn);
130  ms = __nr_to_section(section_nr);
131 
132  /* Get section's memmap address */
133  memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
134 
135  /*
136  * Get page for the memmap's phys address
137  * XXX: need more consideration for sparse_vmemmap...
138  */
139  page = virt_to_page(memmap);
140  mapsize = sizeof(struct page) * PAGES_PER_SECTION;
141  mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
142 
143  /* remember memmap's page */
144  for (i = 0; i < mapsize; i++, page++)
145  get_page_bootmem(section_nr, page, SECTION_INFO);
146 
147  usemap = __nr_to_section(section_nr)->pageblock_flags;
148  page = virt_to_page(usemap);
149 
150  mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
151 
152  for (i = 0; i < mapsize; i++, page++)
153  get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
154 
155 }
156 
157 void register_page_bootmem_info_node(struct pglist_data *pgdat)
158 {
159  unsigned long i, pfn, end_pfn, nr_pages;
160  int node = pgdat->node_id;
161  struct page *page;
162  struct zone *zone;
163 
164  nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
165  page = virt_to_page(pgdat);
166 
167  for (i = 0; i < nr_pages; i++, page++)
168  get_page_bootmem(node, page, NODE_INFO);
169 
170  zone = &pgdat->node_zones[0];
171  for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
172  if (zone->wait_table) {
173  nr_pages = zone->wait_table_hash_nr_entries
174  * sizeof(wait_queue_head_t);
175  nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
176  page = virt_to_page(zone->wait_table);
177 
178  for (i = 0; i < nr_pages; i++, page++)
179  get_page_bootmem(node, page, NODE_INFO);
180  }
181  }
182 
183  pfn = pgdat->node_start_pfn;
184  end_pfn = pfn + pgdat->node_spanned_pages;
185 
186  /* register_section info */
187  for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
188  /*
189  * Some platforms can assign the same pfn to multiple nodes - on
190  * node0 as well as nodeN. To avoid registering a pfn against
191  * multiple nodes we check that this pfn does not already
192  * reside in some other node.
193  */
194  if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
195  register_page_bootmem_info_section(pfn);
196  }
197 }
198 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
199 
200 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
201  unsigned long end_pfn)
202 {
203  unsigned long old_zone_end_pfn;
204 
205  zone_span_writelock(zone);
206 
207  old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208  if (start_pfn < zone->zone_start_pfn)
209  zone->zone_start_pfn = start_pfn;
210 
211  zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
212  zone->zone_start_pfn;
213 
214  zone_span_writeunlock(zone);
215 }
216 
217 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218  unsigned long end_pfn)
219 {
220  unsigned long old_pgdat_end_pfn =
221  pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 
223  if (start_pfn < pgdat->node_start_pfn)
224  pgdat->node_start_pfn = start_pfn;
225 
226  pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
227  pgdat->node_start_pfn;
228 }
229 
230 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
231 {
232  struct pglist_data *pgdat = zone->zone_pgdat;
233  int nr_pages = PAGES_PER_SECTION;
234  int nid = pgdat->node_id;
235  int zone_type;
236  unsigned long flags;
237 
238  zone_type = zone - pgdat->node_zones;
239  if (!zone->wait_table) {
240  int ret;
241 
242  ret = init_currently_empty_zone(zone, phys_start_pfn,
243  nr_pages, MEMMAP_HOTPLUG);
244  if (ret)
245  return ret;
246  }
247  pgdat_resize_lock(zone->zone_pgdat, &flags);
248  grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
249  grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
250  phys_start_pfn + nr_pages);
251  pgdat_resize_unlock(zone->zone_pgdat, &flags);
252  memmap_init_zone(nr_pages, nid, zone_type,
253  phys_start_pfn, MEMMAP_HOTPLUG);
254  return 0;
255 }
256 
257 static int __meminit __add_section(int nid, struct zone *zone,
258  unsigned long phys_start_pfn)
259 {
260  int nr_pages = PAGES_PER_SECTION;
261  int ret;
262 
263  if (pfn_valid(phys_start_pfn))
264  return -EEXIST;
265 
266  ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
267 
268  if (ret < 0)
269  return ret;
270 
271  ret = __add_zone(zone, phys_start_pfn);
272 
273  if (ret < 0)
274  return ret;
275 
276  return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
277 }
278 
279 #ifdef CONFIG_SPARSEMEM_VMEMMAP
280 static int __remove_section(struct zone *zone, struct mem_section *ms)
281 {
282  /*
283  * XXX: Freeing memmap with vmemmap is not implemented yet.
284  * This should be removed later.
285  */
286  return -EBUSY;
287 }
288 #else
289 static int __remove_section(struct zone *zone, struct mem_section *ms)
290 {
291  unsigned long flags;
292  struct pglist_data *pgdat = zone->zone_pgdat;
293  int ret = -EINVAL;
294 
295  if (!valid_section(ms))
296  return ret;
297 
298  ret = unregister_memory_section(ms);
299  if (ret)
300  return ret;
301 
302  pgdat_resize_lock(pgdat, &flags);
303  sparse_remove_one_section(zone, ms);
304  pgdat_resize_unlock(pgdat, &flags);
305  return 0;
306 }
307 #endif
308 
309 /*
310  * Reasonably generic function for adding memory. It is
311  * expected that archs that support memory hotplug will
312  * call this function after deciding the zone to which to
313  * add the new pages.
314  */
315 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
316  unsigned long nr_pages)
317 {
318  unsigned long i;
319  int err = 0;
320  int start_sec, end_sec;
321  /* align the hot-added range to section boundaries when initializing the mem_map */
322  start_sec = pfn_to_section_nr(phys_start_pfn);
323  end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
324 
325  for (i = start_sec; i <= end_sec; i++) {
326  err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
327 
328  /*
329  * -EEXIST is finally dealt with by the ioresource collision check;
330  * see add_memory() => register_memory_resource().
331  * A warning is printed if there is a collision.
332  */
333  if (err && (err != -EEXIST))
334  break;
335  err = 0;
336  }
337 
338  return err;
339 }
340 EXPORT_SYMBOL_GPL(__add_pages);
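As the comment above says, an architecture's arch_add_memory() decides the target zone and then hands the section-aligned pfn range to __add_pages(). A simplified, hypothetical sketch of such a caller; the zone choice and the mapping step are illustrative, and real implementations live under arch/:

/* Hypothetical arch_add_memory() sketch -- not copied from any particular arch. */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	/* Hot-added RAM typically goes into ZONE_NORMAL (or ZONE_MOVABLE). */
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	/* A real arch would first extend its direct mapping / page tables here. */

	return __add_pages(nid, zone, start_pfn, nr_pages);
}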
341 
342 /**
343  * __remove_pages() - remove sections of pages from a zone
344  * @zone: zone from which pages need to be removed
345  * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
346  * @nr_pages: number of pages to remove (must be multiple of section size)
347  *
348  * Generic helper function to remove section mappings and sysfs entries
349  * for the section of the memory we are removing. Caller needs to make
350  * sure that pages are marked reserved and zones are adjusted properly by
351  * calling offline_pages().
352  */
353 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
354  unsigned long nr_pages)
355 {
356  unsigned long i, ret = 0;
357  int sections_to_remove;
358 
359  /*
360  * We can only remove entire sections
361  */
362  BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
363  BUG_ON(nr_pages % PAGES_PER_SECTION);
364 
365  release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
366 
367  sections_to_remove = nr_pages / PAGES_PER_SECTION;
368  for (i = 0; i < sections_to_remove; i++) {
369  unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
370  ret = __remove_section(zone, __pfn_to_section(pfn));
371  if (ret)
372  break;
373  }
374  return ret;
375 }
376 EXPORT_SYMBOL_GPL(__remove_pages);
377 
378 int set_online_page_callback(online_page_callback_t callback)
379 {
380  int rc = -EINVAL;
381 
382  lock_memory_hotplug();
383 
384  if (online_page_callback == generic_online_page) {
385  online_page_callback = callback;
386  rc = 0;
387  }
388 
389  unlock_memory_hotplug();
390 
391  return rc;
392 }
393 EXPORT_SYMBOL_GPL(set_online_page_callback);
394 
395 int restore_online_page_callback(online_page_callback_t callback)
396 {
397  int rc = -EINVAL;
398 
399  lock_memory_hotplug();
400 
401  if (online_page_callback == callback) {
402  online_page_callback = generic_online_page;
403  rc = 0;
404  }
405 
406  unlock_memory_hotplug();
407 
408  return rc;
409 }
410 EXPORT_SYMBOL_GPL(restore_online_page_callback);
411 
412 void __online_page_set_limits(struct page *page)
413 {
414  unsigned long pfn = page_to_pfn(page);
415 
416  if (pfn >= num_physpages)
417  num_physpages = pfn + 1;
418 }
419 EXPORT_SYMBOL_GPL(__online_page_set_limits);
420 
421 void __online_page_increment_counters(struct page *page)
422 {
423  totalram_pages++;
424 
425 #ifdef CONFIG_HIGHMEM
426  if (PageHighMem(page))
427  totalhigh_pages++;
428 #endif
429 }
430 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
431 
432 void __online_page_free(struct page *page)
433 {
434  ClearPageReserved(page);
435  init_page_count(page);
436  __free_page(page);
437 }
438 EXPORT_SYMBOL_GPL(__online_page_free);
439 
440 static void generic_online_page(struct page *page)
441 {
442  __online_page_set_limits(page);
443  __online_page_increment_counters(page);
444  __online_page_free(page);
445 }
446 
447 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
448  void *arg)
449 {
450  unsigned long i;
451  unsigned long onlined_pages = *(unsigned long *)arg;
452  struct page *page;
453  if (PageReserved(pfn_to_page(start_pfn)))
454  for (i = 0; i < nr_pages; i++) {
455  page = pfn_to_page(start_pfn + i);
456  (*online_page_callback)(page);
457  onlined_pages++;
458  }
459  *(unsigned long *)arg = onlined_pages;
460  return 0;
461 }
462 
463 
464 int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
465 {
466  unsigned long onlined_pages = 0;
467  struct zone *zone;
468  int need_zonelists_rebuild = 0;
469  int nid;
470  int ret;
471  struct memory_notify arg;
472 
473  lock_memory_hotplug();
474  arg.start_pfn = pfn;
475  arg.nr_pages = nr_pages;
476  arg.status_change_nid = -1;
477 
478  nid = page_to_nid(pfn_to_page(pfn));
479  if (node_present_pages(nid) == 0)
480  arg.status_change_nid = nid;
481 
482  ret = memory_notify(MEM_GOING_ONLINE, &arg);
483  ret = notifier_to_errno(ret);
484  if (ret) {
485  memory_notify(MEM_CANCEL_ONLINE, &arg);
486  unlock_memory_hotplug();
487  return ret;
488  }
489  /*
490  * This doesn't need a lock to do pfn_to_page().
491  * The section can't be removed here because of the
492  * memory_block->state_mutex.
493  */
494  zone = page_zone(pfn_to_page(pfn));
495  /*
496  * If this zone is not populated, then it is not in the zonelist.
497  * This means the page allocator ignores this zone.
498  * So, the zonelist must be updated after onlining.
499  */
500  mutex_lock(&zonelists_mutex);
501  if (!populated_zone(zone))
502  need_zonelists_rebuild = 1;
503 
504  ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505  online_pages_range);
506  if (ret) {
507  mutex_unlock(&zonelists_mutex);
508  printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509  (unsigned long long) pfn << PAGE_SHIFT,
510  (((unsigned long long) pfn + nr_pages)
511  << PAGE_SHIFT) - 1);
512  memory_notify(MEM_CANCEL_ONLINE, &arg);
513  unlock_memory_hotplug();
514  return ret;
515  }
516 
517  zone->present_pages += onlined_pages;
518  zone->zone_pgdat->node_present_pages += onlined_pages;
519  if (onlined_pages) {
520  node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
521  if (need_zonelists_rebuild)
522  build_all_zonelists(NULL, zone);
523  else
524  zone_pcp_update(zone);
525  }
526 
527  mutex_unlock(&zonelists_mutex);
528 
529  init_per_zone_wmark_min();
530 
531  if (onlined_pages)
532  kswapd_run(zone_to_nid(zone));
533 
534  vm_total_pages = nr_free_pagecache_pages();
535 
536  writeback_set_ratelimit();
537 
538  if (onlined_pages)
539  memory_notify(MEM_ONLINE, &arg);
540  unlock_memory_hotplug();
541 
542  return 0;
543 }
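online_pages() publishes MEM_GOING_ONLINE, MEM_ONLINE and, on failure, MEM_CANCEL_ONLINE through memory_notify(). A minimal sketch of how a subsystem might consume those events with a memory notifier; the callback name and its bookkeeping are hypothetical:

/* Illustrative memory-hotplug notifier -- names and bookkeeping are made up. */
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int my_memory_callback(struct notifier_block *self,
			      unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_ONLINE:
		/* Prepare per-range state; returning notifier_from_errno(-ENOMEM)
		 * here would make online_pages() cancel the operation. */
		break;
	case MEM_ONLINE:
		/* mn->start_pfn and mn->nr_pages describe the onlined range. */
		pr_debug("onlined %lu pages at pfn %lu\n",
			 mn->nr_pages, mn->start_pfn);
		break;
	case MEM_CANCEL_ONLINE:
		/* Undo whatever MEM_GOING_ONLINE set up. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_memory_nb = {
	.notifier_call = my_memory_callback,
};

static int __init my_subsys_init(void)
{
	return register_memory_notifier(&my_memory_nb);
}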
544 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
545 
546 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
547 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
548 {
549  struct pglist_data *pgdat;
550  unsigned long zones_size[MAX_NR_ZONES] = {0};
551  unsigned long zholes_size[MAX_NR_ZONES] = {0};
552  unsigned long start_pfn = start >> PAGE_SHIFT;
553 
554  pgdat = arch_alloc_nodedata(nid);
555  if (!pgdat)
556  return NULL;
557 
558  arch_refresh_nodedata(nid, pgdat);
559 
560  /* we can use NODE_DATA(nid) from here */
561 
562  /* init node's zones as empty zones, we don't have any present pages.*/
563  free_area_init_node(nid, zones_size, start_pfn, zholes_size);
564 
565  /*
566  * The node we allocated has no zone fallback lists. To avoid accessing
567  * an uninitialized zonelist, build it here.
568  */
569  mutex_lock(&zonelists_mutex);
570  build_all_zonelists(pgdat, NULL);
571  mutex_unlock(&zonelists_mutex);
572 
573  return pgdat;
574 }
575 
576 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
577 {
578  arch_refresh_nodedata(nid, NULL);
579  arch_free_nodedata(pgdat);
580  return;
581 }
582 
583 
584 /*
585  * called by cpu_up() to online a node without onlined memory.
586  */
587 int mem_online_node(int nid)
588 {
589  pg_data_t *pgdat;
590  int ret;
591 
592  lock_memory_hotplug();
593  pgdat = hotadd_new_pgdat(nid, 0);
594  if (!pgdat) {
595  ret = -ENOMEM;
596  goto out;
597  }
598  node_set_online(nid);
599  ret = register_one_node(nid);
600  BUG_ON(ret);
601 
602 out:
603  unlock_memory_hotplug();
604  return ret;
605 }
606 
607 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
608 int __ref add_memory(int nid, u64 start, u64 size)
609 {
610  pg_data_t *pgdat = NULL;
611  int new_pgdat = 0;
612  struct resource *res;
613  int ret;
614 
615  lock_memory_hotplug();
616 
617  res = register_memory_resource(start, size);
618  ret = -EEXIST;
619  if (!res)
620  goto out;
621 
622  if (!node_online(nid)) {
623  pgdat = hotadd_new_pgdat(nid, start);
624  ret = -ENOMEM;
625  if (!pgdat)
626  goto error;
627  new_pgdat = 1;
628  }
629 
630  /* call arch's memory hotadd */
631  ret = arch_add_memory(nid, start, size);
632 
633  if (ret < 0)
634  goto error;
635 
636  /* we online the node here; we can't roll back from here. */
637  node_set_online(nid);
638 
639  if (new_pgdat) {
640  ret = register_one_node(nid);
641  /*
642  * If the sysfs file of the new node cannot be created, CPUs on the
643  * node cannot be hot-added. There is no rollback path now, so
644  * reluctantly check for this with BUG_ON().
645  */
646  BUG_ON(ret);
647  }
648 
649  /* create new memmap entry */
650  firmware_map_add_hotplug(start, start + size, "System RAM");
651 
652  goto out;
653 
654 error:
655  /* rollback pgdat allocation and others */
656  if (new_pgdat)
657  rollback_node_hotadd(nid, pgdat);
658  if (res)
659  release_memory_resource(res);
660 
661 out:
662  unlock_memory_hotplug();
663  return ret;
664 }
665 EXPORT_SYMBOL_GPL(add_memory);
666 
667 #ifdef CONFIG_MEMORY_HOTREMOVE
668 /*
669  * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
670  * set and the size of the free page is given by page_order(). Using this,
671  * the function determines if the pageblock contains only free pages.
672  * Due to buddy constraints, a free page at least the size of a pageblock
673  * will be located at the start of the pageblock.
674  */
675 static inline int pageblock_free(struct page *page)
676 {
677  return PageBuddy(page) && page_order(page) >= pageblock_order;
678 }
679 
680 /* Return the start of the next active pageblock after a given page */
681 static struct page *next_active_pageblock(struct page *page)
682 {
683  /* Ensure the starting page is pageblock-aligned */
684  BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
685 
686  /* If the entire pageblock is free, move to the end of free page */
687  if (pageblock_free(page)) {
688  int order;
689  /* be careful. we don't have locks, page_order can be changed.*/
690  order = page_order(page);
691  if ((order < MAX_ORDER) && (order >= pageblock_order))
692  return page + (1 << order);
693  }
694 
695  return page + pageblock_nr_pages;
696 }
697 
698 /* Checks if this range of memory is likely to be hot-removable. */
699 int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
700 {
701  struct page *page = pfn_to_page(start_pfn);
702  struct page *end_page = page + nr_pages;
703 
704  /* Check the starting page of each pageblock within the range */
705  for (; page < end_page; page = next_active_pageblock(page)) {
706  if (!is_pageblock_removable_nolock(page))
707  return 0;
708  cond_resched();
709  }
710 
711  /* All pageblocks in the memory block are likely to be hot-removable */
712  return 1;
713 }
714 
715 /*
716  * Confirm that all pages in the range [start, end) belong to the same zone.
717  */
718 static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
719 {
720  unsigned long pfn;
721  struct zone *zone = NULL;
722  struct page *page;
723  int i;
724  for (pfn = start_pfn;
725  pfn < end_pfn;
726  pfn += MAX_ORDER_NR_PAGES) {
727  i = 0;
728  /* This is just a CONFIG_HOLES_IN_ZONE check.*/
729  while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
730  i++;
731  if (i == MAX_ORDER_NR_PAGES)
732  continue;
733  page = pfn_to_page(pfn + i);
734  if (zone && page_zone(page) != zone)
735  return 0;
736  zone = page_zone(page);
737  }
738  return 1;
739 }
740 
741 /*
742  * Scanning pfns is much easier than scanning the LRU list.
743  * Scan pfns from start to end and return the first LRU page found.
744  */
745 static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
746 {
747  unsigned long pfn;
748  struct page *page;
749  for (pfn = start; pfn < end; pfn++) {
750  if (pfn_valid(pfn)) {
751  page = pfn_to_page(pfn);
752  if (PageLRU(page))
753  return pfn;
754  }
755  }
756  return 0;
757 }
758 
759 #define NR_OFFLINE_AT_ONCE_PAGES (256)
760 static int
761 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
762 {
763  unsigned long pfn;
764  struct page *page;
765  int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
766  int not_managed = 0;
767  int ret = 0;
768  LIST_HEAD(source);
769 
770  for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
771  if (!pfn_valid(pfn))
772  continue;
773  page = pfn_to_page(pfn);
774  if (!get_page_unless_zero(page))
775  continue;
776  /*
777  * We can skip free pages. And we can only deal with pages on
778  * LRU.
779  */
780  ret = isolate_lru_page(page);
781  if (!ret) { /* Success */
782  put_page(page);
783  list_add_tail(&page->lru, &source);
784  move_pages--;
785  inc_zone_page_state(page, NR_ISOLATED_ANON +
786  page_is_file_cache(page));
787 
788  } else {
789 #ifdef CONFIG_DEBUG_VM
790  printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
791  pfn);
792  dump_page(page);
793 #endif
794  put_page(page);
795  /* Because we don't hold the big zone->lock, we should
796  check this again here. */
797  if (page_count(page)) {
798  not_managed++;
799  ret = -EBUSY;
800  break;
801  }
802  }
803  }
804  if (!list_empty(&source)) {
805  if (not_managed) {
806  putback_lru_pages(&source);
807  goto out;
808  }
809 
810  /*
811  * alloc_migrate_target should be improooooved!!
812  * migrate_pages returns # of failed pages.
813  */
815  true, MIGRATE_SYNC);
816  if (ret)
818  }
819 out:
820  return ret;
821 }
822 
823 /*
824  * remove from free_area[] and mark all as Reserved.
825  */
826 static int
827 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
828  void *data)
829 {
830  __offline_isolated_pages(start, start + nr_pages);
831  return 0;
832 }
833 
834 static void
835 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
836 {
837  walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
838  offline_isolated_pages_cb);
839 }
840 
841 /*
842  * Check that all pages in the range, recorded as a memory resource, are isolated.
843  */
844 static int
845 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
846  void *data)
847 {
848  int ret;
849  long offlined = *(long *)data;
850  ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
851  offlined = nr_pages;
852  if (!ret)
853  *(long *)data += offlined;
854  return ret;
855 }
856 
857 static long
858 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
859 {
860  long offlined = 0;
861  int ret;
862 
863  ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
864  check_pages_isolated_cb);
865  if (ret < 0)
866  offlined = (long)ret;
867  return offlined;
868 }
869 
870 static int __ref __offline_pages(unsigned long start_pfn,
871  unsigned long end_pfn, unsigned long timeout)
872 {
873  unsigned long pfn, nr_pages, expire;
874  long offlined_pages;
875  int ret, drain, retry_max, node;
876  struct zone *zone;
877  struct memory_notify arg;
878 
879  BUG_ON(start_pfn >= end_pfn);
880  /* at least, alignment against pageblock is necessary */
881  if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
882  return -EINVAL;
883  if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
884  return -EINVAL;
885  /* This makes hotplug much easier (and more readable);
886  we assume this for now. */
887  if (!test_pages_in_a_zone(start_pfn, end_pfn))
888  return -EINVAL;
889 
890  lock_memory_hotplug();
891 
892  zone = page_zone(pfn_to_page(start_pfn));
893  node = zone_to_nid(zone);
894  nr_pages = end_pfn - start_pfn;
895 
896  /* set above range as isolated */
897  ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
898  if (ret)
899  goto out;
900 
901  arg.start_pfn = start_pfn;
902  arg.nr_pages = nr_pages;
903  arg.status_change_nid = -1;
904  if (nr_pages >= node_present_pages(node))
905  arg.status_change_nid = node;
906 
907  ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908  ret = notifier_to_errno(ret);
909  if (ret)
910  goto failed_removal;
911 
912  pfn = start_pfn;
913  expire = jiffies + timeout;
914  drain = 0;
915  retry_max = 5;
916 repeat:
917  /* start memory hot removal */
918  ret = -EAGAIN;
919  if (time_after(jiffies, expire))
920  goto failed_removal;
921  ret = -EINTR;
922  if (signal_pending(current))
923  goto failed_removal;
924  ret = 0;
925  if (drain) {
926  lru_add_drain_all();
927  cond_resched();
928  drain_all_pages();
929  }
930 
931  pfn = scan_lru_pages(start_pfn, end_pfn);
932  if (pfn) { /* We have page on LRU */
933  ret = do_migrate_range(pfn, end_pfn);
934  if (!ret) {
935  drain = 1;
936  goto repeat;
937  } else {
938  if (ret < 0)
939  if (--retry_max == 0)
940  goto failed_removal;
941  yield();
942  drain = 1;
943  goto repeat;
944  }
945  }
946  /* drain all zones' LRU pagevecs; this is asynchronous... */
947  lru_add_drain_all();
948  yield();
949  /* drain pcp pages; this is synchronous. */
950  drain_all_pages();
951  /* check again */
952  offlined_pages = check_pages_isolated(start_pfn, end_pfn);
953  if (offlined_pages < 0) {
954  ret = -EBUSY;
955  goto failed_removal;
956  }
957  printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
958  /* OK, all of our target range is isolated.
959  We cannot do rollback at this point. */
960  offline_isolated_pages(start_pfn, end_pfn);
961  /* reset pagetype flags and make the migrate type MOVABLE */
962  undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
963  /* removal success */
964  zone->present_pages -= offlined_pages;
965  zone->zone_pgdat->node_present_pages -= offlined_pages;
966  totalram_pages -= offlined_pages;
967 
968  init_per_zone_wmark_min();
969 
970  if (!populated_zone(zone)) {
971  zone_pcp_reset(zone);
972  mutex_lock(&zonelists_mutex);
973  build_all_zonelists(NULL, NULL);
974  mutex_unlock(&zonelists_mutex);
975  } else
976  zone_pcp_update(zone);
977 
978  if (!node_present_pages(node)) {
979  node_clear_state(node, N_HIGH_MEMORY);
980  kswapd_stop(node);
981  }
982 
983  vm_total_pages = nr_free_pagecache_pages();
984  writeback_set_ratelimit();
985 
986  memory_notify(MEM_OFFLINE, &arg);
987  unlock_memory_hotplug();
988  return 0;
989 
990 failed_removal:
991  printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
992  (unsigned long long) start_pfn << PAGE_SHIFT,
993  ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
994  memory_notify(MEM_CANCEL_OFFLINE, &arg);
995  /* pushback to free area */
996  undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
997 
998 out:
999  unlock_memory_hotplug();
1000  return ret;
1001 }
1002 
1003 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1004 {
1005  return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1006 }
1007 
1008 int remove_memory(u64 start, u64 size)
1009 {
1010  struct memory_block *mem = NULL;
1011  struct mem_section *section;
1012  unsigned long start_pfn, end_pfn;
1013  unsigned long pfn, section_nr;
1014  int ret;
1015 
1016  start_pfn = PFN_DOWN(start);
1017  end_pfn = start_pfn + PFN_DOWN(size);
1018 
1019  for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1020  section_nr = pfn_to_section_nr(pfn);
1021  if (!present_section_nr(section_nr))
1022  continue;
1023 
1024  section = __nr_to_section(section_nr);
1025  /* same memblock? */
1026  if (mem)
1027  if ((section_nr >= mem->start_section_nr) &&
1028  (section_nr <= mem->end_section_nr))
1029  continue;
1030 
1031  mem = find_memory_block_hinted(section, mem);
1032  if (!mem)
1033  continue;
1034 
1035  ret = offline_memory_block(mem);
1036  if (ret) {
1037  kobject_put(&mem->dev.kobj);
1038  return ret;
1039  }
1040  }
1041 
1042  if (mem)
1043  kobject_put(&mem->dev.kobj);
1044 
1045  return 0;
1046 }
1047 #else
1048 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1049 {
1050  return -EINVAL;
1051 }
1052 int remove_memory(u64 start, u64 size)
1053 {
1054  return -EINVAL;
1055 }
1056 #endif /* CONFIG_MEMORY_HOTREMOVE */