Linux Kernel 3.7.1
mremap.c
/*
 * mm/mremap.c
 *
 * (C) Copyright 1996 Linus Torvalds
 *
 * Address space accounting code <[email protected]>
 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

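/*
 * Walk the existing page tables for @addr in @mm and return the pmd entry
 * covering it, or NULL if any level of the walk (pgd, pud, pmd) is not
 * present.
 */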
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        if (pgd_none_or_clear_bad(pgd))
                return NULL;

        pud = pud_offset(pgd, addr);
        if (pud_none_or_clear_bad(pud))
                return NULL;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return NULL;

        return pmd;
}

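/*
 * Allocate (if necessary) the pud and pmd needed to map @addr in the
 * destination and return the pmd entry, or NULL if allocation fails.
 * The pmd returned here is never transparent-huge (asserted by VM_BUG_ON).
 */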
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                return NULL;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));

        return pmd;
}

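/*
 * Move the pte entries for [old_addr, old_end) under old_pmd over to the
 * corresponding slots under new_pmd, taking the rmap locks when the caller
 * requests it (see the comment inside for when and why that is needed).
 */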
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                      unsigned long old_addr, unsigned long old_end,
                      struct vm_area_struct *new_vma, pmd_t *new_pmd,
                      unsigned long new_addr, bool need_rmap_locks)
{
        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;

        /*
         * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
         * locks to ensure that rmap will always observe either the old or the
         * new ptes. This is the easiest way to avoid races with
         * truncate_pagecache(), page migration, etc...
         *
         * When need_rmap_locks is false, we use other ways to avoid
         * such races:
         *
         * - During exec() shift_arg_pages(), we use a specially tagged vma
         *   which rmap call sites look for using is_vma_temporary_stack().
         *
         * - During mremap(), new_vma is often known to be placed after vma
         *   in rmap traversal order. This ensures rmap will always observe
         *   either the old pte, or the new pte, or both (the page table locks
         *   serialize access to individual ptes, but only rmap traversal
         *   order guarantees that we won't miss both the old and new ptes).
         */
        if (need_rmap_locks) {
                if (vma->vm_file) {
                        mapping = vma->vm_file->f_mapping;
                        mutex_lock(&mapping->i_mmap_mutex);
                }
                if (vma->anon_vma) {
                        anon_vma = vma->anon_vma;
                        anon_vma_lock(anon_vma);
                }
        }

        /*
         * We don't have to worry about the ordering of src and dst
         * pte locks because exclusive mmap_sem prevents deadlock.
         */
        old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
        new_pte = pte_offset_map(new_pmd, new_addr);
        new_ptl = pte_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
        arch_enter_lazy_mmu_mode();

        for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(*old_pte))
                        continue;
                pte = ptep_get_and_clear(mm, old_addr, old_pte);
                pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
                set_pte_at(mm, new_addr, new_pte, pte);
        }

        arch_leave_lazy_mmu_mode();
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
}

#define LATENCY_LIMIT (64 * PAGE_SIZE)

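/*
 * Move the page table entries covering @len bytes from old_addr in @vma to
 * new_addr in @new_vma, one pmd-sized extent at a time (bounded by
 * LATENCY_LIMIT), splitting transparent huge pmds that cannot be moved as
 * a whole.  Returns the number of bytes actually moved.
 */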
unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks)
{
        unsigned long extent, next, old_end;
        pmd_t *old_pmd, *new_pmd;
        bool need_flush = false;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */

        old_end = old_addr + len;
        flush_cache_range(vma, old_addr, old_end);

        mmun_start = old_addr;
        mmun_end = old_end;
        mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

        for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
                next = (old_addr + PMD_SIZE) & PMD_MASK;
                /* even if next overflowed, extent below will be ok */
                extent = next - old_addr;
                if (extent > old_end - old_addr)
                        extent = old_end - old_addr;
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
                new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
                if (!new_pmd)
                        break;
                if (pmd_trans_huge(*old_pmd)) {
                        int err = 0;
                        if (extent == HPAGE_PMD_SIZE)
                                err = move_huge_pmd(vma, new_vma, old_addr,
                                                    new_addr, old_end,
                                                    old_pmd, new_pmd);
                        if (err > 0) {
                                need_flush = true;
                                continue;
                        } else if (!err) {
                                split_huge_page_pmd(vma->vm_mm, old_pmd);
                        }
                        VM_BUG_ON(pmd_trans_huge(*old_pmd));
                }
                if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
                                                      new_pmd, new_addr))
                        break;
                next = (new_addr + PMD_SIZE) & PMD_MASK;
                if (extent > next - new_addr)
                        extent = next - new_addr;
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent,
                          new_vma, new_pmd, new_addr, need_rmap_locks);
                need_flush = true;
        }
        if (likely(need_flush))
                flush_tlb_range(vma, old_end-len, old_addr);

        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

        return len + old_addr - old_end;        /* how much done */
}

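/*
 * Move an entire vma: create the new vma, move the page tables across and
 * unmap the old range, while keeping the VM accounting, locked-memory and
 * high-watermark statistics consistent.  Returns the new address on
 * success or a negative error code.
 */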
static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
                unsigned long new_len, unsigned long new_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
        unsigned long new_pgoff;
        unsigned long moved_len;
        unsigned long excess = 0;
        unsigned long hiwater_vm;
        int split = 0;
        int err;
        bool need_rmap_locks;

        /*
         * We'd prefer to avoid failure later on in do_munmap:
         * which may split one vma into three before unmapping.
         */
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;

        /*
         * Advise KSM to break any KSM pages in the area to be moved:
         * it would be confusing if they were to turn up at the new
         * location, where they happen to coincide with different KSM
         * pages recently unmapped. But leave vma->vm_flags as it was,
         * so KSM can come around to merge on vma and new_vma afterwards.
         */
        err = ksm_madvise(vma, old_addr, old_addr + old_len,
                          MADV_UNMERGEABLE, &vm_flags);
        if (err)
                return err;

        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
                           &need_rmap_locks);
        if (!new_vma)
                return -ENOMEM;

        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);
        if (moved_len < old_len) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                                 true);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
                new_addr = -ENOMEM;
        }

        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
                vma->vm_flags &= ~VM_ACCOUNT;
                excess = vma->vm_end - vma->vm_start - old_len;
                if (old_addr > vma->vm_start &&
                    old_addr + old_len < vma->vm_end)
                        split = 1;
        }

        /*
         * If we failed to move page tables we still do total_vm increment
         * since do_munmap() will decrement it by old_len == new_len.
         *
         * Since total_vm is about to be raised artificially high for a
         * moment, we need to restore high watermark afterwards: if stats
         * are taken meanwhile, total_vm and hiwater_vm appear too high.
         * If this were a serious issue, we'd add a flag to do_munmap().
         */
        hiwater_vm = mm->hiwater_vm;
        vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
        }
        mm->hiwater_vm = hiwater_vm;

        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (excess) {
                vma->vm_flags |= VM_ACCOUNT;
                if (split)
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }

        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                if (new_len > old_len)
                        mlock_vma_pages_range(new_vma, new_addr + old_len,
                                              new_addr + new_len);
        }

        return new_addr;
}

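/*
 * Find and validate the vma at @addr for a resize from old_len to new_len:
 * fail for hugetlb vmas, ranges crossing a vma boundary, mappings that may
 * not expand, mlock and address-space limit violations; when the vma is
 * VM_ACCOUNTed, the grown part is charged and returned through *p.
 */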
static struct vm_area_struct *vma_to_resize(unsigned long addr,
        unsigned long old_len, unsigned long new_len, unsigned long *p)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);

        if (!vma || vma->vm_start > addr)
                goto Efault;

        if (is_vm_hugetlb_page(vma))
                goto Einval;

        /* We can't remap across vm area boundaries */
        if (old_len > vma->vm_end - addr)
                goto Efault;

        /* Need to be careful about a growing mapping */
        if (new_len > old_len) {
                unsigned long pgoff;

                if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
                        goto Efault;
                pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
                pgoff += vma->vm_pgoff;
                if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
                        goto Einval;
        }

        if (vma->vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
                locked = mm->locked_vm << PAGE_SHIFT;
                lock_limit = rlimit(RLIMIT_MEMLOCK);
                locked += new_len - old_len;
                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                        goto Eagain;
        }

        if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
                goto Enomem;

        if (vma->vm_flags & VM_ACCOUNT) {
                unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
                if (security_vm_enough_memory_mm(mm, charged))
                        goto Efault;
                *p = charged;
        }

        return vma;

Efault: /* very odd choice for most of the cases, but... */
        return ERR_PTR(-EFAULT);
Einval:
        return ERR_PTR(-EINVAL);
Enomem:
        return ERR_PTR(-ENOMEM);
Eagain:
        return ERR_PTR(-EAGAIN);
}

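/*
 * MREMAP_FIXED case: move the mapping to the caller-supplied new_addr,
 * after checking that the new range is page-aligned, fits below TASK_SIZE
 * and does not overlap the old range, and after unmapping whatever
 * currently occupies the destination.
 */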
static unsigned long mremap_to(unsigned long addr,
        unsigned long old_len, unsigned long new_addr,
        unsigned long new_len)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        unsigned long map_flags;

        if (new_addr & ~PAGE_MASK)
                goto out;

        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
                goto out;

        /* Check if the location we're moving into overlaps the
         * old location at all, and fail if it does.
         */
        if ((new_addr <= addr) && (new_addr+new_len) > addr)
                goto out;

        if ((addr <= new_addr) && (addr+old_len) > new_addr)
                goto out;

        ret = do_munmap(mm, new_addr, new_len);
        if (ret)
                goto out;

        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                old_len = new_len;
        }

        vma = vma_to_resize(addr, old_len, new_len, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        map_flags = MAP_FIXED;
        if (vma->vm_flags & VM_MAYSHARE)
                map_flags |= MAP_SHARED;

        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                                ((addr - vma->vm_start) >> PAGE_SHIFT),
                                map_flags);
        if (ret & ~PAGE_MASK)
                goto out1;

        ret = move_vma(vma, addr, old_len, new_len, new_addr);
        if (!(ret & ~PAGE_MASK))
                goto out;
out1:
        vm_unacct_memory(charged);

out:
        return ret;
}

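/*
 * Can the vma grow in place by @delta bytes?  Only if the enlarged end
 * does not overflow, does not run into the following vma, and
 * get_unmapped_area() accepts the enlarged range as a MAP_FIXED request.
 */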
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
        unsigned long end = vma->vm_end + delta;
        if (end < vma->vm_end) /* overflow */
                return 0;
        if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
                return 0;
        if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                              0, MAP_FIXED) & ~PAGE_MASK)
                return 0;
        return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;

        down_write(&current->mm->mmap_sem);

        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                goto out;

        if (addr & ~PAGE_MASK)
                goto out;

        old_len = PAGE_ALIGN(old_len);
        new_len = PAGE_ALIGN(new_len);

        /*
         * We allow a zero old-len as a special case
         * for DOS-emu "duplicate shm area" thing. But
         * a zero new-len is nonsensical.
         */
        if (!new_len)
                goto out;

        if (flags & MREMAP_FIXED) {
                if (flags & MREMAP_MAYMOVE)
                        ret = mremap_to(addr, old_len, new_addr, new_len);
                goto out;
        }

        /*
         * Always allow a shrinking remap: that just unmaps
         * the unnecessary pages..
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
                goto out;
        }

        /*
         * Ok, we need to grow..
         */
        vma = vma_to_resize(addr, old_len, new_len, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /* old_len exactly to the end of the area..
         */
        if (old_len == vma->vm_end - addr) {
                /* can we just expand the current mapping? */
                if (vma_expandable(vma, new_len - old_len)) {
                        int pages = (new_len - old_len) >> PAGE_SHIFT;

                        if (vma_adjust(vma, vma->vm_start, addr + new_len,
                                       vma->vm_pgoff, NULL)) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
                                mlock_vma_pages_range(vma, addr + old_len,
                                                      addr + new_len);
                        }
                        ret = addr;
                        goto out;
                }
        }

        /*
         * We weren't able to just expand or shrink the area,
         * we need to create a new one and move it..
         */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
                unsigned long map_flags = 0;
                if (vma->vm_flags & VM_MAYSHARE)
                        map_flags |= MAP_SHARED;

                new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                                             vma->vm_pgoff +
                                             ((addr - vma->vm_start) >> PAGE_SHIFT),
                                             map_flags);
                if (new_addr & ~PAGE_MASK) {
                        ret = new_addr;
                        goto out;
                }

                ret = move_vma(vma, addr, old_len, new_len, new_addr);
        }
out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
        up_write(&current->mm->mmap_sem);
        return ret;
}
557 }