#include <linux/errno.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/oom.h>
#include <asm/tlbflush.h>
#define SEQNR_MASK	0x0ff
#define UNSTABLE_FLAG	0x100
#define STABLE_FLAG	0x200
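/*
 * These flags live in the low bits of rmap_item->address: SEQNR_MASK
 * holds the scan sequence number recorded when the item was placed in
 * the unstable tree, while UNSTABLE_FLAG and STABLE_FLAG record which
 * tree (if any) the rmap_item currently belongs to.
 */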
#define MM_SLOTS_HASH_SHIFT 10
#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
static struct mm_slot ksm_mm_head = {
/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;
/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;
/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
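/*
 * The run mode is driven from userspace through /sys/kernel/mm/ksm/run:
 * writing 0 stops ksmd, 1 lets it scan and merge, and 2 unmerges
 * everything merged so far ("echo 1 > /sys/kernel/mm/ksm/run" is an
 * illustrative invocation).
 */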
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
static int __init ksm_slab_init(void)
	if (!rmap_item_cache)
	if (!stable_node_cache)
static void __init ksm_slab_free(void)
	mm_slot_cache = NULL;
static inline struct rmap_item *alloc_rmap_item(void)
	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
static inline void free_rmap_item(struct rmap_item *rmap_item)
static inline struct stable_node *alloc_stable_node(void)
static inline struct mm_slot *alloc_mm_slot(void)
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
		if (mm == mm_slot->mm)
static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
	hlist_add_head(&mm_slot->link, bucket);
static inline int in_stable_tree(struct rmap_item *rmap_item)
static inline bool ksm_test_exit(struct mm_struct *mm)
		if (IS_ERR_OR_NULL(page))
			ret = VM_FAULT_WRITE;
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));

	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
	if (ksm_test_exit(mm))
static void break_cow(struct rmap_item *rmap_item)
	unsigned long addr = rmap_item->address;

	vma = find_mergeable_vma(mm, addr);
		break_ksm(vma, addr);
static struct page *page_trans_compound_anon(struct page *page)
	if (PageTransCompound(page)) {
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
	unsigned long addr = rmap_item->address;

	vma = find_mergeable_vma(mm, addr);
	if (IS_ERR_OR_NULL(page))
	if (PageAnon(page) || page_trans_compound_anon(page)) {
		flush_anon_page(vma, page, addr);
	struct rmap_item *rmap_item;

		if (rmap_item->hlist.next)
	free_stable_node(stable_node);
static struct page *get_ksm_page(struct stable_node *stable_node)
	void *expected_mapping;

	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
	if (page->mapping != expected_mapping)
	if (!get_page_unless_zero(page))
	if (page->mapping != expected_mapping) {

	remove_node_from_stable_tree(stable_node);
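/*
 * get_ksm_page(), in outline: a stable-tree page's page->mapping encodes
 * its stable_node pointer together with PAGE_MAPPING_ANON|PAGE_MAPPING_KSM,
 * so comparing against expected_mapping (before and after taking a page
 * reference) tells whether the node is still live; a stale node is
 * removed from the stable tree.
 */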
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
		struct stable_node *stable_node;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node);

		hlist_del(&rmap_item->hlist);

		if (stable_node->hlist.first)
		ksm_pages_unshared--;
static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
		struct rmap_item *rmap_item = *rmap_list;

		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
			     unsigned long start, unsigned long end)
	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			err = break_ksm(vma, addr);
static int unmerge_and_remove_all_rmap_items(void)
	struct mm_slot *mm_slot;

	spin_lock(&ksm_mmlist_lock);
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
			if (ksm_test_exit(mm))
			err = unmerge_ksm_pages(vma,

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);

		spin_lock(&ksm_mmlist_lock);
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			spin_unlock(&ksm_mmlist_lock);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
static u32 calc_checksum(struct page *page)
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
static int memcmp_pages(struct page *page1, struct page *page2)

static inline int pages_identical(struct page *page1, struct page *page2)
	return !memcmp_pages(page1, page2);
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
	unsigned long mmun_start;
	unsigned long mmun_end;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = page_check_address(page, mm, addr, &ptl, 0);

		swapped = PageSwapCache(page);
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {

	pte_unmap_unlock(ptep, ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
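/*
 * write_protect_page(), in outline: with the pte located via
 * page_check_address(), the page may only be merged if nobody else can
 * write to it concurrently; the page_mapcount() + swapped vs
 * page_count() check guards against hidden references (e.g. a racing
 * get_user_pages), and the pte is made read-only with the old value
 * handed back through orig_pte for later use by replace_page().
 */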
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
	unsigned long mmun_start;
	unsigned long mmun_end;

	BUG_ON(pmd_trans_huge(*pmd));

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
		pte_unmap_unlock(ptep, ptl);

	if (!page_mapped(page))

	pte_unmap_unlock(ptep, ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
static int page_trans_compound_anon_split(struct page *page)
	struct page *transhuge_head = page_trans_compound_anon(page);

	if (transhuge_head) {
		if (get_page_unless_zero(transhuge_head)) {
			if (PageAnon(transhuge_head))
				 struct page *page, struct page *kpage)
	if (!(vma->vm_flags & VM_MERGEABLE))
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
	BUG_ON(PageTransCompound(page));

	if (!trylock_page(page))

	if (write_protect_page(vma, page, &orig_pte) == 0) {
			set_page_stable_node(page, NULL);
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		if (!PageMlocked(kpage)) {
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
	if (ksm_test_exit(mm))

	err = try_to_merge_one_page(vma, page, kpage);
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
		err = try_to_merge_with_ksm_page(tree_rmap_item,
			break_cow(rmap_item);
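/*
 * try_to_merge_two_pages(), in outline: the first call with a NULL
 * kpage turns "page" itself into a write-protected PageKsm page; the
 * second call then merges tree_page into it. If the second merge
 * fails, break_cow() undoes the first so the caller is left with
 * ordinary anonymous pages again.
 */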
static struct page *stable_tree_search(struct page *page)
	struct stable_node *stable_node;

	stable_node = page_stable_node(page);
		struct page *tree_page;

		stable_node = rb_entry(node, struct stable_node, node);
		tree_page = get_ksm_page(stable_node);

		ret = memcmp_pages(page, tree_page);
		} else if (ret > 0) {
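/*
 * The stable tree is an rbtree of write-protected KSM pages ordered by
 * memcmp of their contents: stable_tree_search() walks left or right on
 * the sign of memcmp_pages() and returns the matching KSM page, if any,
 * with a reference held.
 */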
static struct stable_node *stable_tree_insert(struct page *kpage)
	struct stable_node *stable_node;
		struct page *tree_page;

		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node);

		ret = memcmp_pages(kpage, tree_page);

	stable_node = alloc_stable_node();

	rb_link_node(&stable_node->node, parent, new);

	set_page_stable_node(kpage, stable_node);
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (IS_ERR_OR_NULL(tree_page))
		if (page == tree_page) {

		ret = memcmp_pages(page, tree_page);
		} else if (ret > 0) {
			*tree_pagep = tree_page;
			return tree_rmap_item;

	rb_link_node(&rmap_item->node, parent, new);

	ksm_pages_unshared++;
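/*
 * The unstable tree holds rmap_items for pages that are not (yet)
 * write-protected, so their contents may change under us; it is
 * therefore rebuilt from scratch on every scan pass. A contents match
 * returns the candidate tree_rmap_item/tree_page pair, otherwise the
 * new rmap_item is inserted and counted in ksm_pages_unshared.
 */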
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node)
	rmap_item->head = stable_node;

	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;

	remove_rmap_item_from_tree(rmap_item);

	kpage = stable_tree_search(page);
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));

	checksum = calc_checksum(page);

		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
			remove_rmap_item_from_tree(tree_rmap_item);

			stable_node = stable_tree_insert(kpage);
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);

				break_cow(tree_rmap_item);
				break_cow(rmap_item);
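/*
 * cmp_and_merge_page(), in outline: first search the stable tree and
 * merge into an existing KSM page if one matches; otherwise require the
 * page's checksum to be unchanged since the previous scan, then probe
 * the unstable tree. A match there merges the two pages into a brand
 * new KSM page, which is promoted into the stable tree with both
 * rmap_items appended to its hlist; if the stable-tree insertion fails,
 * both merges are undone via break_cow().
 */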
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		if (rmap_item->address > addr)
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);

	rmap_item = alloc_rmap_item();
		rmap_item->mm = mm_slot->mm;
		*rmap_list = rmap_item;
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
	struct mm_slot *slot;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))

	if (slot == &ksm_mm_head) {
		spin_lock(&ksm_mmlist_lock);
		spin_unlock(&ksm_mmlist_lock);
		if (slot == &ksm_mm_head)

	if (ksm_test_exit(mm))
	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			if (ksm_test_exit(mm))
			if (IS_ERR_OR_NULL(*page)) {
			if (PageAnon(*page) ||
			    page_trans_compound_anon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				rmap_item = get_next_rmap_item(slot,

	if (ksm_test_exit(mm)) {
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
						struct mm_slot, mm_list);
		hlist_del(&slot->link);
		spin_unlock(&ksm_mmlist_lock);
		spin_unlock(&ksm_mmlist_lock);

	if (slot != &ksm_mm_head)
static void ksm_do_scan(unsigned int scan_npages)
	struct rmap_item *rmap_item;

		rmap_item = scan_get_next_rmap_item(&page);
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);

		if (ksmd_should_run()) {
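/*
 * ksmd's main loop, in outline: while KSM_RUN_MERGE is set, scan
 * ksm_thread_pages_to_scan pages per batch and then sleep for
 * ksm_thread_sleep_millisecs; otherwise block on a waitqueue until
 * ksmd_should_run() becomes true again.
 */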
		unsigned long end, int advice, unsigned long *vm_flags)
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
				 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))

		if (*vm_flags & VM_SAO)

		*vm_flags |= VM_MERGEABLE;

		if (!(*vm_flags & VM_MERGEABLE))
			err = unmerge_ksm_pages(vma, start, end);

		*vm_flags &= ~VM_MERGEABLE;
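/*
 * ksm_madvise() backs madvise(MADV_MERGEABLE)/madvise(MADV_UNMERGEABLE)
 * from userspace. An illustrative opt-in (not from this file) looks like:
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_MERGEABLE);
 *
 * Only private anonymous-style mappings qualify; the flag test above
 * rejects shared, I/O, hugetlb and similar vmas.
 */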
	struct mm_slot *mm_slot;

	mm_slot = alloc_mm_slot();

	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	spin_unlock(&ksm_mmlist_lock);
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
			hlist_del(&mm_slot->link);
	spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(mm_slot);
	} else if (mm_slot) {
	struct page *new_page;

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		SetPageSwapBacked(new_page);
		__set_page_locked(new_page);

		if (!mlocked_vma_newpage(vma, new_page))
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	unsigned int mapcount = page_mapcount(page);
	int search_new_forks = 0;

	stable_node = page_stable_node(page);
		anon_vma_lock(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
				rmap_item->address, &mapcount, vm_flags);
			if (!search_new_forks || !mapcount)
		anon_vma_unlock(anon_vma);
	if (!search_new_forks++)
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int search_new_forks = 0;

	stable_node = page_stable_node(page);
		anon_vma_lock(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
			if (ret != SWAP_AGAIN || !page_mapped(page)) {
				anon_vma_unlock(anon_vma);
		anon_vma_unlock(anon_vma);
	if (!search_new_forks++)
#ifdef CONFIG_MIGRATION
int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int search_new_forks = 0;

	stable_node = page_stable_node(page);
		anon_vma_lock(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
			ret = rmap_one(page, vma, rmap_item->address, arg);
				anon_vma_unlock(anon_vma);
		anon_vma_unlock(anon_vma);
	if (!search_new_forks++)
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
	struct stable_node *stable_node;

	stable_node = page_stable_node(newpage);
#ifdef CONFIG_MEMORY_HOTREMOVE
static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
						 unsigned long end_pfn)
		struct stable_node *stable_node;

		stable_node = rb_entry(node, struct stable_node, node);
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			       unsigned long action, void *arg)
	struct stable_node *stable_node;

		while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
			remove_node_from_stable_tree(stable_node);
#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)
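/*
 * These helpers define the kobj_attributes exported under
 * /sys/kernel/mm/ksm/ (sleep_millisecs, pages_to_scan, run, and the
 * read-only statistics below).
 */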
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
	unsigned long msecs;

	ksm_thread_sleep_millisecs = msecs;
KSM_ATTR(sleep_millisecs);
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				 const char *buf, size_t count)
	unsigned long nr_pages;

	ksm_thread_pages_to_scan = nr_pages;
KSM_ATTR(pages_to_scan);
	return sprintf(buf, "%u\n", ksm_run);
			 const char *buf, size_t count)
	unsigned long flags;

	if (ksm_run != flags) {
			err = unmerge_and_remove_all_rmap_items();
	return sprintf(buf, "%lu\n", ksm_pages_shared);
KSM_ATTR_RO(pages_shared);

	return sprintf(buf, "%lu\n", ksm_pages_sharing);
KSM_ATTR_RO(pages_sharing);

	return sprintf(buf, "%lu\n", ksm_pages_unshared);
KSM_ATTR_RO(pages_unshared);
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
KSM_ATTR_RO(pages_volatile);
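/*
 * pages_volatile is not tracked directly: it is derived as
 * rmap_items - pages_shared - pages_sharing - pages_unshared, i.e. the
 * rmap_items covering pages that were scanned but changed too often to
 * enter either tree. The subtraction is done without locking, so a
 * transiently negative result is clamped to 0.
 */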
KSM_ATTR_RO(full_scans);
static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
static int __init ksm_init(void)
	err = ksm_slab_init();

	if (IS_ERR(ksm_thread)) {
		err = PTR_ERR(ksm_thread);

#ifdef CONFIG_MEMORY_HOTREMOVE