Linux Kernel 3.7.1
mlock.c
1 /*
2  * linux/mm/mlock.c
3  *
4  * (C) Copyright 1995 Linus Torvalds
5  * (C) Copyright 2002 Christoph Hellwig
6  */
7 
8 #include <linux/capability.h>
9 #include <linux/mman.h>
10 #include <linux/mm.h>
11 #include <linux/swap.h>
12 #include <linux/swapops.h>
13 #include <linux/pagemap.h>
14 #include <linux/mempolicy.h>
15 #include <linux/syscalls.h>
16 #include <linux/sched.h>
17 #include <linux/export.h>
18 #include <linux/rmap.h>
19 #include <linux/mmzone.h>
20 #include <linux/hugetlb.h>
21 
22 #include "internal.h"
23 
24 int can_do_mlock(void)
25 {
26  if (capable(CAP_IPC_LOCK))
27  return 1;
28  if (rlimit(RLIMIT_MEMLOCK) != 0)
29  return 1;
30  return 0;
31 }
32 EXPORT_SYMBOL(can_do_mlock);
33 
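can_do_mlock() is the kernel-side permission gate: CAP_IPC_LOCK or a non-zero RLIMIT_MEMLOCK. The same policy is visible from userspace through getrlimit() and the errno that mlock(2) returns. A minimal standalone sketch (userspace C, not part of mlock.c; the buffer name and size are illustrative):

/* Userspace sketch (not part of mlock.c): probe RLIMIT_MEMLOCK, then try mlock(). */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/resource.h>

static char buf[4096];		/* illustrative buffer; the kernel locks whole pages */

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0)
		printf("RLIMIT_MEMLOCK: cur=%llu max=%llu\n",
		       (unsigned long long)rl.rlim_cur,
		       (unsigned long long)rl.rlim_max);

	/* Mirrors can_do_mlock(): without CAP_IPC_LOCK, mlock() fails with
	 * EPERM when the limit is 0, or ENOMEM when it would be exceeded. */
	if (mlock(buf, sizeof(buf)) != 0)
		fprintf(stderr, "mlock: %s\n", strerror(errno));
	else
		munlock(buf, sizeof(buf));

	return 0;
}

Running this with `ulimit -l 0` and without CAP_IPC_LOCK exercises the EPERM path; raising the limit or granting the capability exercises the success path.
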
34 /*
35  * Mlocked pages are marked with PageMlocked() flag for efficient testing
36  * in vmscan and, possibly, the fault path; and to support semi-accurate
37  * statistics.
38  *
39  * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
40  * be placed on the LRU "unevictable" list, rather than the [in]active lists.
41  * The unevictable list is an LRU sibling list to the [in]active lists.
42  * PageUnevictable is set to indicate the unevictable state.
43  *
44  * When lazy mlocking via vmscan, it is important to ensure that the
45  * vma's VM_LOCKED status is not concurrently being modified, otherwise we
46  * may have mlocked a page that is being munlocked. So lazy mlock must take
47  * the mmap_sem for read, and verify that the vma really is locked
48  * (see mm/rmap.c).
49  */
50 
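The accounting maintained in this file (the NR_MLOCK counter and the unevictable LRU) is observable from userspace via /proc/meminfo. A hedged sketch that watches the "Mlocked:" line around an mlock()/munlock() pair (standalone, not part of mlock.c; the buffer size is illustrative):

/* Userspace sketch (not part of mlock.c): NR_MLOCK shows up as "Mlocked:" in /proc/meminfo. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void show_mlocked(const char *tag)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "Mlocked:", 8) == 0)
			printf("%s %s", tag, line);
	fclose(f);
}

static char buf[1 << 20];		/* 1 MiB, large enough to see the counter move */

int main(void)
{
	show_mlocked("before:");
	if (mlock(buf, sizeof(buf)) == 0) {
		show_mlocked("locked:");
		munlock(buf, sizeof(buf));
	}
	show_mlocked("after: ");
	return 0;
}
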
51 /*
52  * LRU accounting for clear_page_mlock()
53  */
54 void clear_page_mlock(struct page *page)
55 {
56  if (!TestClearPageMlocked(page))
57  return;
58 
59  mod_zone_page_state(page_zone(page), NR_MLOCK,
60  -hpage_nr_pages(page));
61  count_vm_event(UNEVICTABLE_PGCLEARED);
62  if (!isolate_lru_page(page)) {
63  putback_lru_page(page);
64  } else {
65  /*
66  * We lost the race. The page has already been moved to an evictable list.
67  */
68  if (PageUnevictable(page))
69  count_vm_event(UNEVICTABLE_PGSTRANDED);
70  }
71 }
72 
73 /*
74  * Mark page as mlocked if not already.
75  * If page on LRU, isolate and putback to move to unevictable list.
76  */
77 void mlock_vma_page(struct page *page)
78 {
79  BUG_ON(!PageLocked(page));
80 
81  if (!TestSetPageMlocked(page)) {
82  mod_zone_page_state(page_zone(page), NR_MLOCK,
83  hpage_nr_pages(page));
84  count_vm_event(UNEVICTABLE_PGMLOCKED);
85  if (!isolate_lru_page(page))
86  putback_lru_page(page);
87  }
88 }
89 
105 void munlock_vma_page(struct page *page)
106 {
107  BUG_ON(!PageLocked(page));
108 
109  if (TestClearPageMlocked(page)) {
110  mod_zone_page_state(page_zone(page), NR_MLOCK,
111  -hpage_nr_pages(page));
112  if (!isolate_lru_page(page)) {
113  int ret = SWAP_AGAIN;
114 
115  /*
116  * Optimization: if the page was mapped just once,
117  * that's our mapping and we don't need to check all the
118  * other vmas.
119  */
120  if (page_mapcount(page) > 1)
121  ret = try_to_munlock(page);
122  /*
123  * did try_to_munlock() succeed or punt?
124  */
125  if (ret != SWAP_MLOCK)
126  count_vm_event(UNEVICTABLE_PGMUNLOCKED);
127 
128  putback_lru_page(page);
129  } else {
130  /*
131  * Some other task has removed the page from the LRU.
132  * putback_lru_page() will take care of removing the
133  * page from the unevictable list, if necessary.
134  * vmscan [page_referenced()] will move the page back
135  * to the unevictable list if some other vma has it
136  * mlocked.
137  */
138  if (PageUnevictable(page))
139  count_vm_event(UNEVICTABLE_PGSTRANDED);
140  else
141  count_vm_event(UNEVICTABLE_PGMUNLOCKED);
142  }
143  }
144 }
145 
158 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
159  unsigned long start, unsigned long end,
160  int *nonblocking)
161 {
162  struct mm_struct *mm = vma->vm_mm;
163  unsigned long addr = start;
164  int nr_pages = (end - start) / PAGE_SIZE;
165  int gup_flags;
166 
167  VM_BUG_ON(start & ~PAGE_MASK);
168  VM_BUG_ON(end & ~PAGE_MASK);
169  VM_BUG_ON(start < vma->vm_start);
170  VM_BUG_ON(end > vma->vm_end);
172 
173  gup_flags = FOLL_TOUCH | FOLL_MLOCK;
174  /*
175  * We want to touch writable mappings with a write fault in order
176  * to break COW, except for shared mappings because these don't COW
177  * and we would not want to dirty them for nothing.
178  */
179  if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
180  gup_flags |= FOLL_WRITE;
181 
182  /*
183  * We want mlock to succeed for regions that have any permissions
184  * other than PROT_NONE.
185  */
186  if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
187  gup_flags |= FOLL_FORCE;
188 
189  return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
190  NULL, NULL, nonblocking);
191 }
192 
193 /*
194  * convert get_user_pages() return value to posix mlock() error
195  */
196 static int __mlock_posix_error_return(long retval)
197 {
198  if (retval == -EFAULT)
199  retval = -ENOMEM;
200  else if (retval == -ENOMEM)
201  retval = -EAGAIN;
202  return retval;
203 }
204 
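Because -ENOMEM from the fault-in path is reported to userspace as -EAGAIN, POSIX-conforming callers may retry mlock() when they see EAGAIN. A hedged sketch of that retry idiom (standalone userspace helper, not part of mlock.c; mlock_retry is an illustrative name):

/* Userspace sketch (not part of mlock.c): retry mlock() on EAGAIN, as POSIX permits. */
#include <stddef.h>
#include <errno.h>
#include <sys/mman.h>

static int mlock_retry(const void *addr, size_t len, int attempts)
{
	while (attempts-- > 0) {
		if (mlock(addr, len) == 0)
			return 0;
		if (errno != EAGAIN)
			break;		/* EPERM/ENOMEM/EINVAL: retrying cannot help */
	}
	return -1;
}
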
218 long mlock_vma_pages_range(struct vm_area_struct *vma,
219  unsigned long start, unsigned long end)
220 {
221  int nr_pages = (end - start) / PAGE_SIZE;
222  BUG_ON(!(vma->vm_flags & VM_LOCKED));
223 
224  /*
225  * filter unlockable vmas
226  */
227  if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228  goto no_mlock;
229 
230  if (!((vma->vm_flags & VM_DONTEXPAND) ||
231  is_vm_hugetlb_page(vma) ||
232  vma == get_gate_vma(current->mm))) {
233 
234  __mlock_vma_pages_range(vma, start, end, NULL);
235 
236  /* Hide errors from mmap() and other callers */
237  return 0;
238  }
239 
240  /*
241  * User mapped kernel pages or huge pages:
242  * make these pages present to populate the ptes, but
243  * fall thru' to reset VM_LOCKED--no need to unlock, and
244  * return nr_pages so these don't get counted against task's
245  * locked limit. huge pages are already counted against
246  * locked vm limit.
247  */
248  make_pages_present(start, end);
249 
250 no_mlock:
251  vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
252  return nr_pages; /* error or pages NOT mlocked */
253 }
254 
255 /*
256  * munlock_vma_pages_range() - munlock all pages in the vma range.
257  * @vma - vma containing range to be munlock()ed.
258  * @start - start address in @vma of the range
259  * @end - end of range in @vma.
260  *
261  * For mremap(), munmap() and exit().
262  *
263  * Called with @vma VM_LOCKED.
264  *
265  * Returns with VM_LOCKED cleared. Callers must be prepared to
266  * deal with this.
267  *
268  * We don't save and restore VM_LOCKED here because pages are
269  * still on lru. In unmap path, pages might be scanned by reclaim
270  * and re-mlocked by try_to_{munlock|unmap} before we unmap and
271  * free them. This will result in freeing mlocked pages.
272  */
273 void munlock_vma_pages_range(struct vm_area_struct *vma,
274  unsigned long start, unsigned long end)
275 {
276  unsigned long addr;
277 
278  lru_add_drain();
279  vma->vm_flags &= ~VM_LOCKED;
280 
281  for (addr = start; addr < end; addr += PAGE_SIZE) {
282  struct page *page;
283  /*
284  * Although FOLL_DUMP is intended for get_dump_page(),
285  * it just so happens that its special treatment of the
286  * ZERO_PAGE (returning an error instead of doing get_page)
287  * suits munlock very well (and if somehow an abnormal page
288  * has sneaked into the range, we won't oops here: great).
289  */
290  page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
291  if (page && !IS_ERR(page)) {
292  lock_page(page);
293  munlock_vma_page(page);
294  unlock_page(page);
295  put_page(page);
296  }
297  cond_resched();
298  }
299 }
300 
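Since this function runs on the mremap()/munmap()/exit() paths, userspace never needs to munlock() a region before unmapping it; the pages are unlocked implicitly. A minimal userspace sketch of that behaviour (not part of mlock.c):

/* Userspace sketch (not part of mlock.c): munmap() of an mlock()ed mapping unlocks it. */
#define _DEFAULT_SOURCE
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (mlock(p, page) != 0) {	/* may fail with EPERM/ENOMEM, see above */
		munmap(p, page);
		return 1;
	}
	/* No munlock() needed here: the unmap path calls munlock_vma_pages_range(). */
	return munmap(p, page) ? 1 : 0;
}
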
301 /*
302  * mlock_fixup - handle mlock[all]/munlock[all] requests.
303  *
304  * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
305  * munlock is a no-op. However, for some special vmas, we go ahead and
306  * populate the ptes via make_pages_present().
307  *
308  * For vmas that pass the filters, merge/split as appropriate.
309  */
310 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
311  unsigned long start, unsigned long end, vm_flags_t newflags)
312 {
313  struct mm_struct *mm = vma->vm_mm;
314  pgoff_t pgoff;
315  int nr_pages;
316  int ret = 0;
317  int lock = !!(newflags & VM_LOCKED);
318 
319  if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
320  is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
321  goto out; /* don't set VM_LOCKED, don't count */
322 
323  pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
324  *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
325  vma->vm_file, pgoff, vma_policy(vma));
326  if (*prev) {
327  vma = *prev;
328  goto success;
329  }
330 
331  if (start != vma->vm_start) {
332  ret = split_vma(mm, vma, start, 1);
333  if (ret)
334  goto out;
335  }
336 
337  if (end != vma->vm_end) {
338  ret = split_vma(mm, vma, end, 0);
339  if (ret)
340  goto out;
341  }
342 
343 success:
344  /*
345  * Keep track of amount of locked VM.
346  */
347  nr_pages = (end - start) >> PAGE_SHIFT;
348  if (!lock)
349  nr_pages = -nr_pages;
350  mm->locked_vm += nr_pages;
351 
352  /*
353  * vm_flags is protected by the mmap_sem held in write mode.
354  * It's okay if try_to_unmap_one unmaps a page just after we
355  * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
356  */
357 
358  if (lock)
359  vma->vm_flags = newflags;
360  else
361  munlock_vma_pages_range(vma, start, end);
362 
363 out:
364  *prev = vma;
365  return ret;
366 }
367 
368 static int do_mlock(unsigned long start, size_t len, int on)
369 {
370  unsigned long nstart, end, tmp;
371  struct vm_area_struct * vma, * prev;
372  int error;
373 
374  VM_BUG_ON(start & ~PAGE_MASK);
375  VM_BUG_ON(len != PAGE_ALIGN(len));
376  end = start + len;
377  if (end < start)
378  return -EINVAL;
379  if (end == start)
380  return 0;
381  vma = find_vma(current->mm, start);
382  if (!vma || vma->vm_start > start)
383  return -ENOMEM;
384 
385  prev = vma->vm_prev;
386  if (start > vma->vm_start)
387  prev = vma;
388 
389  for (nstart = start ; ; ) {
390  vm_flags_t newflags;
391 
392  /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
393 
394  newflags = vma->vm_flags | VM_LOCKED;
395  if (!on)
396  newflags &= ~VM_LOCKED;
397 
398  tmp = vma->vm_end;
399  if (tmp > end)
400  tmp = end;
401  error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
402  if (error)
403  break;
404  nstart = tmp;
405  if (nstart < prev->vm_end)
406  nstart = prev->vm_end;
407  if (nstart >= end)
408  break;
409 
410  vma = prev->vm_next;
411  if (!vma || vma->vm_start != nstart) {
412  error = -ENOMEM;
413  break;
414  }
415  }
416  return error;
417 }
418 
419 static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
420 {
421  struct mm_struct *mm = current->mm;
422  unsigned long end, nstart, nend;
423  struct vm_area_struct *vma = NULL;
424  int locked = 0;
425  int ret = 0;
426 
427  VM_BUG_ON(start & ~PAGE_MASK);
428  VM_BUG_ON(len != PAGE_ALIGN(len));
429  end = start + len;
430 
431  for (nstart = start; nstart < end; nstart = nend) {
432  /*
433  * We want to fault in pages for the [nstart; end) address range.
434  * Find first corresponding VMA.
435  */
436  if (!locked) {
437  locked = 1;
438  down_read(&mm->mmap_sem);
439  vma = find_vma(mm, nstart);
440  } else if (nstart >= vma->vm_end)
441  vma = vma->vm_next;
442  if (!vma || vma->vm_start >= end)
443  break;
444  /*
445  * Set [nstart; nend) to intersection of desired address
446  * range with the first VMA. Also, skip undesirable VMA types.
447  */
448  nend = min(end, vma->vm_end);
449  if (vma->vm_flags & (VM_IO | VM_PFNMAP))
450  continue;
451  if (nstart < vma->vm_start)
452  nstart = vma->vm_start;
453  /*
454  * Now fault in a range of pages. __mlock_vma_pages_range()
455  * double checks the vma flags, so that it won't mlock pages
456  * if the vma was already munlocked.
457  */
458  ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
459  if (ret < 0) {
460  if (ignore_errors) {
461  ret = 0;
462  continue; /* continue at next VMA */
463  }
464  ret = __mlock_posix_error_return(ret);
465  break;
466  }
467  nend = nstart + ret * PAGE_SIZE;
468  ret = 0;
469  }
470  if (locked)
471  up_read(&mm->mmap_sem);
472  return ret; /* 0 or negative error code */
473 }
474 
475 SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
476 {
477  unsigned long locked;
478  unsigned long lock_limit;
479  int error = -ENOMEM;
480 
481  if (!can_do_mlock())
482  return -EPERM;
483 
484  lru_add_drain_all(); /* flush pagevec */
485 
486  down_write(&current->mm->mmap_sem);
487  len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
488  start &= PAGE_MASK;
489 
490  locked = len >> PAGE_SHIFT;
491  locked += current->mm->locked_vm;
492 
493  lock_limit = rlimit(RLIMIT_MEMLOCK);
494  lock_limit >>= PAGE_SHIFT;
495 
496  /* check against resource limits */
497  if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
498  error = do_mlock(start, len, 1);
499  up_write(&current->mm->mmap_sem);
500  if (!error)
501  error = do_mlock_pages(start, len, 0);
502  return error;
503 }
504 
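Both mlock() and munlock() widen the request to whole pages: start is rounded down to a page boundary and len grows by the offset that was dropped. A small standalone sketch of that arithmetic, assuming 4 KiB pages for the printed values (not part of mlock.c; the DEMO_ macros are illustrative stand-ins for the kernel's):

/* Standalone sketch (not part of mlock.c) of the page rounding done above. */
#include <stdio.h>

#define DEMO_PAGE_SHIFT	12
#define DEMO_PAGE_SIZE	(1UL << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))
#define DEMO_PAGE_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & DEMO_PAGE_MASK)

int main(void)
{
	unsigned long start = 0x1ff0;	/* unaligned start, 16 bytes before a boundary */
	unsigned long len = 0x20;	/* 32-byte request straddling that boundary */

	len = DEMO_PAGE_ALIGN(len + (start & ~DEMO_PAGE_MASK));
	start &= DEMO_PAGE_MASK;

	/* prints start=0x1000 len=0x2000 pages=2 */
	printf("start=%#lx len=%#lx pages=%lu\n",
	       start, len, len >> DEMO_PAGE_SHIFT);
	return 0;
}
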
505 SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
506 {
507  int ret;
508 
509  down_write(&current->mm->mmap_sem);
510  len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
511  start &= PAGE_MASK;
512  ret = do_mlock(start, len, 0);
513  up_write(&current->mm->mmap_sem);
514  return ret;
515 }
516 
517 static int do_mlockall(int flags)
518 {
519  struct vm_area_struct * vma, * prev = NULL;
520  unsigned int def_flags = 0;
521 
522  if (flags & MCL_FUTURE)
523  def_flags = VM_LOCKED;
524  current->mm->def_flags = def_flags;
525  if (flags == MCL_FUTURE)
526  goto out;
527 
528  for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
529  vm_flags_t newflags;
530 
531  newflags = vma->vm_flags | VM_LOCKED;
532  if (!(flags & MCL_CURRENT))
533  newflags &= ~VM_LOCKED;
534 
535  /* Ignore errors */
536  mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
537  }
538 out:
539  return 0;
540 }
541 
542 SYSCALL_DEFINE1(mlockall, int, flags)
543 {
544  unsigned long lock_limit;
545  int ret = -EINVAL;
546 
547  if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
548  goto out;
549 
550  ret = -EPERM;
551  if (!can_do_mlock())
552  goto out;
553 
554  if (flags & MCL_CURRENT)
555  lru_add_drain_all(); /* flush pagevec */
556 
557  down_write(&current->mm->mmap_sem);
558 
559  lock_limit = rlimit(RLIMIT_MEMLOCK);
560  lock_limit >>= PAGE_SHIFT;
561 
562  ret = -ENOMEM;
563  if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
564  capable(CAP_IPC_LOCK))
565  ret = do_mlockall(flags);
566  up_write(&current->mm->mmap_sem);
567  if (!ret && (flags & MCL_CURRENT)) {
568  /* Ignore errors */
569  do_mlock_pages(0, TASK_SIZE, 1);
570  }
571 out:
572  return ret;
573 }
574 
575 SYSCALL_DEFINE0(munlockall)
576 {
577  int ret;
578 
579  down_write(&current->mm->mmap_sem);
580  ret = do_mlockall(0);
581  up_write(&current->mm->mmap_sem);
582  return ret;
583 }
584 
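Taken together, mlockall(MCL_CURRENT | MCL_FUTURE) pins everything a process has mapped and will map, and munlockall() releases it; this is the usual pattern in latency-sensitive programs. A hedged userspace sketch (standalone, not part of mlock.c):

/* Userspace sketch (not part of mlock.c): pin all current and future mappings. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
		/* EPERM without CAP_IPC_LOCK and with RLIMIT_MEMLOCK == 0,
		 * ENOMEM when the address space is larger than the limit. */
		fprintf(stderr, "mlockall: %s\n", strerror(errno));
		return 1;
	}

	/* ... latency-critical work: already-mapped memory will not be paged out ... */

	return munlockall() ? 1 : 0;
}

Note that MCL_FUTURE alone only sets mm->def_flags, which is why do_mlockall() skips the per-vma loop in that case.
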
585 /*
586  * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
587  * shm segments) get accounted against the user_struct instead.
588  */
589 static DEFINE_SPINLOCK(shmlock_user_lock);
590 
591 int user_shm_lock(size_t size, struct user_struct *user)
592 {
593  unsigned long lock_limit, locked;
594  int allowed = 0;
595 
596  locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
597  lock_limit = rlimit(RLIMIT_MEMLOCK);
598  if (lock_limit == RLIM_INFINITY)
599  allowed = 1;
600  lock_limit >>= PAGE_SHIFT;
601  spin_lock(&shmlock_user_lock);
602  if (!allowed &&
603  locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
604  goto out;
605  get_uid(user);
606  user->locked_shm += locked;
607  allowed = 1;
608 out:
609  spin_unlock(&shmlock_user_lock);
610  return allowed;
611 }
612 
613 void user_shm_unlock(size_t size, struct user_struct *user)
614 {
615  spin_lock(&shmlock_user_lock);
616  user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
617  spin_unlock(&shmlock_user_lock);
618  free_uid(user);
619 }
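user_shm_lock() charges whole pages against user->locked_shm, rounding the segment size up to a page multiple. A standalone sketch of just that rounding arithmetic, assuming 4 KiB pages for the example values (not part of mlock.c; the DEMO_ names are illustrative):

/* Standalone sketch (not part of mlock.c) of the round-up-to-pages accounting
 * used by user_shm_lock() and user_shm_unlock(). */
#include <stdio.h>

#define DEMO_PAGE_SHIFT	12
#define DEMO_PAGE_SIZE	(1UL << DEMO_PAGE_SHIFT)

static unsigned long shm_locked_pages(unsigned long size)
{
	return (size + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT;
}

int main(void)
{
	/* a 1-byte segment still charges one page; 8193 bytes charge three */
	printf("%lu %lu %lu\n",
	       shm_locked_pages(1),
	       shm_locked_pages(DEMO_PAGE_SIZE),
	       shm_locked_pages(2 * DEMO_PAGE_SIZE + 1));
	return 0;
}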