Linux Kernel 3.7.1
gup.c
/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>

#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
        return ACCESS_ONCE(*ptep);
#else
        /*
         * With get_user_pages_fast, we walk down the pagetables without taking
         * any locks. For this we would like to load the pointers atomically,
         * but that is not possible (without expensive cmpxchg8b) on PAE. What
         * we do have is the guarantee that a pte will only either go from not
         * present to present, or present to not present or both -- it will not
         * switch to a completely different present page without a TLB flush in
         * between; something that we are blocking by holding interrupts off.
         *
         * Setting ptes from not present to present goes:
         * ptep->pte_high = h;
         * smp_wmb();
         * ptep->pte_low = l;
         *
         * And present to not present goes:
         * ptep->pte_low = 0;
         * smp_wmb();
         * ptep->pte_high = 0;
         *
         * We must ensure here that the load of pte_low sees l iff pte_high
         * sees h. We load pte_high *after* loading pte_low, which ensures we
         * don't see an older value of pte_high. *Then* we recheck pte_low,
         * which ensures that we haven't picked up a changed pte high. We might
         * have got rubbish values from pte_low and pte_high, but we are
         * guaranteed that pte_low will not have the present bit set *unless*
         * it is 'l'. And get_user_pages_fast only operates on present ptes, so
         * we're safe.
         *
         * gup_get_pte should not be used or copied outside gup.c without being
         * very careful -- it does not atomically load the pte or anything that
         * is likely to be useful for you.
         */
        pte_t pte;

retry:
        pte.pte_low = ptep->pte_low;
        smp_rmb();
        pte.pte_high = ptep->pte_high;
        smp_rmb();
        if (unlikely(pte.pte_low != ptep->pte_low))
                goto retry;

        return pte;
#endif
}
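
/*
 * Illustrative sketch (not part of gup.c): the retry loop above is the
 * reader half of the ordering protocol described in the comment. A
 * minimal sketch of the matching writer half, assuming the same two-word
 * pte layout, would look roughly like this. The example_* names are
 * hypothetical, not kernel APIs:
 *
 *      static inline void example_set_pte(pte_t *ptep, pte_t pte)
 *      {
 *              ptep->pte_high = pte.pte_high;  // publish the high word first
 *              smp_wmb();                      // order the two stores
 *              ptep->pte_low = pte.pte_low;    // present bit becomes visible last
 *      }
 *
 *      static inline void example_pte_clear(pte_t *ptep)
 *      {
 *              ptep->pte_low = 0;              // retract the present bit first
 *              smp_wmb();
 *              ptep->pte_high = 0;
 *      }
 */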

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
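/*
 * gup_pte_range() walks the pte level for [addr, end). For each pte it
 * takes a reference on the mapped page and records it in pages[]; it
 * returns 0 so the caller can fall back to the slow path whenever a pte
 * is not present, lacks the required permissions, or is a "special"
 * mapping with no struct page to pin.
 */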
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t *ptep;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;

        ptep = pte_offset_map(&pmd, addr);
        do {
                pte_t pte = gup_get_pte(ptep);
                struct page *page;

                if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
                        pte_unmap(ptep);
                        return 0;
                }
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
                get_page(page);
                SetPageReferenced(page);
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);
        pte_unmap(ptep - 1);

        return 1;
}

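/*
 * Take @nr references on a compound page's head with one atomic_add
 * rather than @nr separate get_page() calls. The page must already have
 * a non-zero refcount when this is called.
 */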
static inline void get_head_page_multiple(struct page *page, int nr)
{
        VM_BUG_ON(page != compound_head(page));
        VM_BUG_ON(page_count(page) == 0);
        atomic_add(nr, &page->_count);
        SetPageReferenced(page);
}

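/*
 * Handle a pmd that maps a huge page directly. Every small page covered
 * by [addr, end) is recorded in pages[], but the references are taken on
 * the head page in one go via get_head_page_multiple().
 */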
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t pte = *(pte_t *)&pmd;
        struct page *head, *page;
        int refs;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_flags(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                if (PageTail(page))
                        get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = *pmdp;

                next = pmd_addr_end(addr, end);
                /*
                 * The pmd_trans_splitting() check below explains why
                 * pmdp_splitting_flush has to flush the tlb, to stop
                 * this gup-fast code from running while we set the
                 * splitting bit in the pmd. Returning zero will take
                 * the slow path that will call wait_split_huge_page()
                 * if the pmd is still in splitting state. gup-fast
                 * can't because it has irqs disabled and
                 * wait_split_huge_page() would never return as the
                 * tlb flush IPI wouldn't run.
                 */
                if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
                                return 0;
                } else {
                        if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                                return 0;
                }
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

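/*
 * Same as gup_huge_pmd(), but for a pud that maps a huge page directly
 * (e.g. a 1GB page on x86-64).
 */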
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t pte = *(pte_t *)&pud;
        struct page *head, *page;
        int refs;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_flags(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                if (PageTail(page))
                        get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = *pudp;

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (unlikely(pud_large(pud))) {
                        if (!gup_huge_pud(pud, addr, next, write, pages, nr))
                                return 0;
                } else {
                        if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                                return 0;
                }
        } while (pudp++, addr = next, addr != end);

        return 1;
}

/*
 * Like get_user_pages_fast() except it is IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        unsigned long flags;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;
        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        (void __user *)start, len)))
                return 0;

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency
         * needs some instrumenting to determine the common sizes used by
         * important workloads (e.g. DB2), and whether limiting the batch size
         * will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables and pages from being freed on x86.
         *
         * So long as we atomically load page table pointers versus teardown
         * (which we do on x86, with the above PAE exception), we can follow the
         * address down to the page and take a ref on it.
         */
        local_irq_save(flags);
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = *pgdp;

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        break;
                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        break;
        } while (pgdp++, addr = next, addr != end);
        local_irq_restore(flags);

        return nr;
}

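/*
 * Pin up to @nr_pages user pages starting at @start and store them in
 * @pages. The fast walk is tried first with interrupts disabled; if it
 * cannot complete (unmapped or write-protected pages, a pmd being split,
 * kernel addresses), the remainder is handled by falling back to the
 * regular get_user_pages() under mmap_sem. Returns the number of pages
 * pinned, or a negative errno if none could be pinned.
 */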
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;

        end = start + len;
        if (end < start)
                goto slow_irqon;

#ifdef CONFIG_X86_64
        if (end >> __VIRTUAL_MASK_SHIFT)
                goto slow_irqon;
#endif

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency
         * needs some instrumenting to determine the common sizes used by
         * important workloads (e.g. DB2), and whether limiting the batch size
         * will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables and pages from being freed on x86.
         *
         * So long as we atomically load page table pointers versus teardown
         * (which we do on x86, with the above PAE exception), we can follow the
         * address down to the page and take a ref on it.
         */
        local_irq_disable();
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = *pgdp;

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
        local_irq_enable();

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

        {
                int ret;

slow:
                local_irq_enable();
slow_irqon:
                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /* Have to be a bit careful with return values */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}
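
For context, a minimal sketch of how a caller typically uses get_user_pages_fast() to pin a user buffer for I/O and release it afterwards. This is not part of gup.c: example_pin_user_buf is a hypothetical function, the actual I/O step is elided, error handling is abbreviated, and user_addr is assumed to be page aligned.

/* Sketch only: pin a user buffer, use it, then drop the references. */
#include <linux/mm.h>
#include <linux/slab.h>

static int example_pin_user_buf(unsigned long user_addr, size_t size)
{
        int i, nr, nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct page **pages;

        pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        /* write=1: the pinned pages may be stored into (e.g. by a device). */
        nr = get_user_pages_fast(user_addr, nr_pages, 1, pages);
        if (nr <= 0) {
                kfree(pages);
                return nr ? nr : -EFAULT;
        }

        /* ... hand pages[0..nr-1] to the actual I/O here ... */

        for (i = 0; i < nr; i++) {
                set_page_dirty_lock(pages[i]);  /* the pages may have been dirtied */
                put_page(pages[i]);             /* drop the reference taken above */
        }
        kfree(pages);
        return nr == nr_pages ? 0 : -EFAULT;
}

Note that get_user_pages_fast() may pin fewer pages than requested or return a negative errno, so the release loop must only walk the pages that were actually pinned.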