Linux Kernel  3.7.1
pgtable.c
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <[email protected]>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#endif

#ifdef CONFIG_PGSTE

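/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a pointer to the new guest address space structure,
 * or NULL if the allocation failed.
 */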
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

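/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */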
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;


	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

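/**
 * gmap_enable - activate the guest address space on the current cpu
 * @gmap: pointer to the guest address space structure
 */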
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

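/**
 * gmap_disable - deactivate the guest address space on the current cpu
 * @gmap: pointer to the guest address space structure
 */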
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

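/**
 * gmap_unmap_segment - unmap segments from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */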
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

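/**
 * gmap_map_segment - map a segment of the parent address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */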
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		spin_lock(&mm->page_table_lock);
		list_add(&rmap->list, &mp->mapper);
		spin_unlock(&mm->page_table_lock);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}

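/*
 * gmap_fault - resolve a fault on a guest address
 * @address: guest address
 * @gmap: pointer to the guest address space structure
 *
 * Takes the mmap_sem of the parent mm and resolves the address
 * via __gmap_fault().
 */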
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

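/*
 * gmap_discard - release the parent pages backing a guest address range
 * @from: first guest address
 * @to: end of the guest address range (exclusive)
 * @gmap: pointer to the guest address space structure
 *
 * Walks the guest page table and zaps the parent mm page range behind
 * every mapped segment in [from, to).
 */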
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{

	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_mm(mm);
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
				       unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

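/* Atomically flip the given bits in *v and return the new value. */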
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
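/*
 * On 64 bit a 4K page holds two 2K page tables (FRAG_MASK 0x03), on 31 bit
 * it holds four 1K page tables (FRAG_MASK 0x0f).  The lower bits of
 * page->_mapcount track which fragments are in use, the upper bits mark
 * fragments that still have to be freed by the RCU path.
 */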
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

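/*
 * page_table_free_rcu encodes which 1K/2K fragment is to be freed into the
 * unused low bits of the table address passed to tlb_remove_table (pgste
 * page tables are marked with FRAG_MASK itself); __tlb_remove_table decodes
 * this again when the table is finally released.
 */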
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
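/*
 * s390_enable_sie below splits all transparent huge page mappings before
 * switching the mm over to pgste page tables: thp_split_vma forces a split
 * via follow_page(FOLL_SPLIT) and thp_split_mm marks every vma with
 * VM_NOHUGEPAGE.
 */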
void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;
	struct page *page;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		page = follow_page(vma, addr, FOLL_SPLIT);
	}
}

void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma = mm->mmap;

	while (vma != NULL) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
		vma = vma->vm_next;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If no, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	mm->def_flags |= VM_NOHUGEPAGE;
#endif

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

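/*
 * The preallocated page table of a transparent huge pmd is kept on a
 * simple list headed by mm->pmd_huge_pte; deposit adds to the list and
 * withdraw takes one back when the huge pmd is split or removed.
 */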
void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	ptep++;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */