Linux Kernel 3.7.1
mmu.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  * Yaniv Kamay <[email protected]>
14  * Avi Kivity <[email protected]>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2. See
17  * the COPYING file in the top-level directory.
18  *
19  */
20 
21 #include "irq.h"
22 #include "mmu.h"
23 #include "x86.h"
24 #include "kvm_cache_regs.h"
25 
26 #include <linux/kvm_host.h>
27 #include <linux/types.h>
28 #include <linux/string.h>
29 #include <linux/mm.h>
30 #include <linux/highmem.h>
31 #include <linux/module.h>
32 #include <linux/swap.h>
33 #include <linux/hugetlb.h>
34 #include <linux/compiler.h>
35 #include <linux/srcu.h>
36 #include <linux/slab.h>
37 #include <linux/uaccess.h>
38 
39 #include <asm/page.h>
40 #include <asm/cmpxchg.h>
41 #include <asm/io.h>
42 #include <asm/vmx.h>
43 
44 /*
45  * Setting this variable to true enables Two-Dimensional-Paging,
46  * where the hardware walks 2 page tables:
47  * 1. the guest-virtual to guest-physical
48  * 2. while doing 1. it also walks guest-physical to host-physical
49  * If the hardware supports that, we don't need to do shadow paging.
50  */
51 bool tdp_enabled = false;
52 
53 enum {
54  AUDIT_PRE_PAGE_FAULT,
55  AUDIT_POST_PAGE_FAULT,
56  AUDIT_PRE_PTE_WRITE,
57  AUDIT_POST_PTE_WRITE,
58  AUDIT_PRE_SYNC,
59  AUDIT_POST_SYNC
60 };
61 
62 #undef MMU_DEBUG
63 
64 #ifdef MMU_DEBUG
65 
66 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
67 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
68 
69 #else
70 
71 #define pgprintk(x...) do { } while (0)
72 #define rmap_printk(x...) do { } while (0)
73 
74 #endif
75 
76 #ifdef MMU_DEBUG
77 static bool dbg = 0;
78 module_param(dbg, bool, 0644);
79 #endif
80 
81 #ifndef MMU_DEBUG
82 #define ASSERT(x) do { } while (0)
83 #else
84 #define ASSERT(x) \
85  if (!(x)) { \
86  printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
87  __FILE__, __LINE__, #x); \
88  }
89 #endif
90 
91 #define PTE_PREFETCH_NUM 8
92 
93 #define PT_FIRST_AVAIL_BITS_SHIFT 10
94 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
95 
96 #define PT64_LEVEL_BITS 9
97 
98 #define PT64_LEVEL_SHIFT(level) \
99  (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
100 
101 #define PT64_INDEX(address, level)\
102  (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
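/*
 * Example (derived from the macros above, assuming PAGE_SHIFT == 12):
 * PT64_LEVEL_SHIFT(1) == 12, PT64_LEVEL_SHIFT(2) == 21 and
 * PT64_LEVEL_SHIFT(3) == 30, so PT64_INDEX(addr, level) extracts the
 * 9-bit slice of the address that indexes the 512-entry table at that
 * level; e.g. PT64_INDEX(0x40201000, 2) == 1.
 */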
103 
104 
105 #define PT32_LEVEL_BITS 10
106 
107 #define PT32_LEVEL_SHIFT(level) \
108  (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
109 
110 #define PT32_LVL_OFFSET_MASK(level) \
111  (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
112  * PT32_LEVEL_BITS))) - 1))
113 
114 #define PT32_INDEX(address, level)\
115  (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
116 
117 
118 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
119 #define PT64_DIR_BASE_ADDR_MASK \
120  (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
121 #define PT64_LVL_ADDR_MASK(level) \
122  (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
123  * PT64_LEVEL_BITS))) - 1))
124 #define PT64_LVL_OFFSET_MASK(level) \
125  (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
126  * PT64_LEVEL_BITS))) - 1))
127 
128 #define PT32_BASE_ADDR_MASK PAGE_MASK
129 #define PT32_DIR_BASE_ADDR_MASK \
130  (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
131 #define PT32_LVL_ADDR_MASK(level) \
132  (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
133  * PT32_LEVEL_BITS))) - 1))
134 
135 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
136  | PT64_NX_MASK)
137 
138 #define ACC_EXEC_MASK 1
139 #define ACC_WRITE_MASK PT_WRITABLE_MASK
140 #define ACC_USER_MASK PT_USER_MASK
141 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
142 
143 #include <trace/events/kvm.h>
144 
145 #define CREATE_TRACE_POINTS
146 #include "mmutrace.h"
147 
148 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
149 #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
150 
151 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
152 
153 /* make pte_list_desc fit well in cache line */
154 #define PTE_LIST_EXT 3
155 
155 
156 struct pte_list_desc {
157  u64 *sptes[PTE_LIST_EXT];
158  struct pte_list_desc *more;
159 };
160 
161 struct kvm_shadow_walk_iterator {
162  u64 addr;
163  hpa_t shadow_addr;
164  u64 *sptep;
165  int level;
166  unsigned index;
167 };
168 
169 #define for_each_shadow_entry(_vcpu, _addr, _walker) \
170  for (shadow_walk_init(&(_walker), _vcpu, _addr); \
171  shadow_walk_okay(&(_walker)); \
172  shadow_walk_next(&(_walker)))
173 
174 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
175  for (shadow_walk_init(&(_walker), _vcpu, _addr); \
176  shadow_walk_okay(&(_walker)) && \
177  ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
178  __shadow_walk_next(&(_walker), spte))
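/*
 * A minimal usage sketch for the walker macros above; the caller is
 * assumed to hold mmu_lock, or to bracket the walk with
 * walk_shadow_page_lockless_begin/end for the lockless variant:
 *
 *	struct kvm_shadow_walk_iterator it;
 *	u64 spte;
 *
 *	for_each_shadow_entry_lockless(vcpu, addr, it, spte)
 *		if (!is_shadow_present_pte(spte))
 *			break;
 *
 * At each step, it.sptep points at the spte mapping addr at level it.level.
 */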
179 
180 static struct kmem_cache *pte_list_desc_cache;
181 static struct kmem_cache *mmu_page_header_cache;
182 static struct percpu_counter kvm_total_used_mmu_pages;
183 
184 static u64 __read_mostly shadow_nx_mask;
185 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
186 static u64 __read_mostly shadow_user_mask;
187 static u64 __read_mostly shadow_accessed_mask;
188 static u64 __read_mostly shadow_dirty_mask;
189 static u64 __read_mostly shadow_mmio_mask;
190 
191 static void mmu_spte_set(u64 *sptep, u64 spte);
192 static void mmu_free_roots(struct kvm_vcpu *vcpu);
193 
194 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
195 {
196  shadow_mmio_mask = mmio_mask;
197 }
198 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
199 
200 static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
201 {
202  access &= ACC_WRITE_MASK | ACC_USER_MASK;
203 
204  trace_mark_mmio_spte(sptep, gfn, access);
205  mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
206 }
207 
208 static bool is_mmio_spte(u64 spte)
209 {
210  return (spte & shadow_mmio_mask) == shadow_mmio_mask;
211 }
212 
213 static gfn_t get_mmio_spte_gfn(u64 spte)
214 {
215  return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
216 }
217 
218 static unsigned get_mmio_spte_access(u64 spte)
219 {
220  return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
221 }
222 
223 static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
224 {
225  if (unlikely(is_noslot_pfn(pfn))) {
226  mark_mmio_spte(sptep, gfn, access);
227  return true;
228  }
229 
230  return false;
231 }
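/*
 * Taken together, the helpers above encode an MMIO spte as
 *	shadow_mmio_mask | access | (gfn << PAGE_SHIFT)
 * with access reduced to the write/user bits; is_mmio_spte(),
 * get_mmio_spte_gfn() and get_mmio_spte_access() simply test and
 * decode that layout.
 */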
232 
233 static inline u64 rsvd_bits(int s, int e)
234 {
235  return ((1ULL << (e - s + 1)) - 1) << s;
236 }
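/* e.g. rsvd_bits(52, 62) == 0x7ff0000000000000ULL, i.e. bits 52..62 set. */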
237 
238 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
239  u64 dirty_mask, u64 nx_mask, u64 x_mask)
240 {
241  shadow_user_mask = user_mask;
242  shadow_accessed_mask = accessed_mask;
243  shadow_dirty_mask = dirty_mask;
244  shadow_nx_mask = nx_mask;
245  shadow_x_mask = x_mask;
246 }
247 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
248 
249 static int is_cpuid_PSE36(void)
250 {
251  return 1;
252 }
253 
254 static int is_nx(struct kvm_vcpu *vcpu)
255 {
256  return vcpu->arch.efer & EFER_NX;
257 }
258 
259 static int is_shadow_present_pte(u64 pte)
260 {
261  return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
262 }
263 
264 static int is_large_pte(u64 pte)
265 {
266  return pte & PT_PAGE_SIZE_MASK;
267 }
268 
269 static int is_dirty_gpte(unsigned long pte)
270 {
271  return pte & PT_DIRTY_MASK;
272 }
273 
274 static int is_rmap_spte(u64 pte)
275 {
276  return is_shadow_present_pte(pte);
277 }
278 
279 static int is_last_spte(u64 pte, int level)
280 {
281  if (level == PT_PAGE_TABLE_LEVEL)
282  return 1;
283  if (is_large_pte(pte))
284  return 1;
285  return 0;
286 }
287 
288 static pfn_t spte_to_pfn(u64 pte)
289 {
290  return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
291 }
292 
293 static gfn_t pse36_gfn_delta(u32 gpte)
294 {
295  int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
296 
297  return (gpte & PT32_DIR_PSE36_MASK) << shift;
298 }
299 
300 #ifdef CONFIG_X86_64
301 static void __set_spte(u64 *sptep, u64 spte)
302 {
303  *sptep = spte;
304 }
305 
306 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
307 {
308  *sptep = spte;
309 }
310 
311 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
312 {
313  return xchg(sptep, spte);
314 }
315 
316 static u64 __get_spte_lockless(u64 *sptep)
317 {
318  return ACCESS_ONCE(*sptep);
319 }
320 
321 static bool __check_direct_spte_mmio_pf(u64 spte)
322 {
323  /* It is valid if the spte is zapped. */
324  return spte == 0ull;
325 }
326 #else
327 union split_spte {
328  struct {
329  u32 spte_low;
330  u32 spte_high;
331  };
332  u64 spte;
333 };
334 
335 static void count_spte_clear(u64 *sptep, u64 spte)
336 {
337  struct kvm_mmu_page *sp = page_header(__pa(sptep));
338 
339  if (is_shadow_present_pte(spte))
340  return;
341 
342  /* Ensure the spte is completely set before we increase the count */
343  smp_wmb();
344  sp->clear_spte_count++;
345 }
346 
347 static void __set_spte(u64 *sptep, u64 spte)
348 {
349  union split_spte *ssptep, sspte;
350 
351  ssptep = (union split_spte *)sptep;
352  sspte = (union split_spte)spte;
353 
354  ssptep->spte_high = sspte.spte_high;
355 
356  /*
357  * If we map the spte from nonpresent to present, we should store
358  * the high bits first, then set the present bit, so the CPU cannot
359  * fetch this spte while we are still setting it.
360  */
361  smp_wmb();
362 
363  ssptep->spte_low = sspte.spte_low;
364 }
365 
366 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
367 {
368  union split_spte *ssptep, sspte;
369 
370  ssptep = (union split_spte *)sptep;
371  sspte = (union split_spte)spte;
372 
373  ssptep->spte_low = sspte.spte_low;
374 
375  /*
376  * If we map the spte from present to nonpresent, we should clear the
377  * present bit first to avoid the vcpu fetching the old high bits.
378  */
379  smp_wmb();
380 
381  ssptep->spte_high = sspte.spte_high;
382  count_spte_clear(sptep, spte);
383 }
384 
385 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
386 {
387  union split_spte *ssptep, sspte, orig;
388 
389  ssptep = (union split_spte *)sptep;
390  sspte = (union split_spte)spte;
391 
392  /* xchg acts as a barrier before the setting of the high bits */
393  orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
394  orig.spte_high = ssptep->spte_high;
395  ssptep->spte_high = sspte.spte_high;
396  count_spte_clear(sptep, spte);
397 
398  return orig.spte;
399 }
400 
401 /*
402  * The idea of using this lightweight way to get the spte on x86_32 hosts
403  * comes from gup_get_pte (arch/x86/mm/gup.c).
404  * The difference is that we cannot catch the spte tlb flush if we leave
405  * guest mode, so we emulate it by increasing clear_spte_count when the
406  * spte is cleared.
407  */
408 static u64 __get_spte_lockless(u64 *sptep)
409 {
410  struct kvm_mmu_page *sp = page_header(__pa(sptep));
411  union split_spte spte, *orig = (union split_spte *)sptep;
412  int count;
413 
414 retry:
415  count = sp->clear_spte_count;
416  smp_rmb();
417 
418  spte.spte_low = orig->spte_low;
419  smp_rmb();
420 
421  spte.spte_high = orig->spte_high;
422  smp_rmb();
423 
424  if (unlikely(spte.spte_low != orig->spte_low ||
425  count != sp->clear_spte_count))
426  goto retry;
427 
428  return spte.spte;
429 }
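/*
 * The retry loop above works like a seqcount read side: clear_spte_count
 * acts as the sequence number and spte_low is re-checked, so a torn
 * read (mixing halves of an old and a new spte) is never returned.
 */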
430 
431 static bool __check_direct_spte_mmio_pf(u64 spte)
432 {
433  union split_spte sspte = (union split_spte)spte;
434  u32 high_mmio_mask = shadow_mmio_mask >> 32;
435 
436  /* It is valid if the spte is zapped. */
437  if (spte == 0ull)
438  return true;
439 
440  /* It is valid if the spte is being zapped. */
441  if (sspte.spte_low == 0ull &&
442  (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
443  return true;
444 
445  return false;
446 }
447 #endif
448 
449 static bool spte_is_locklessly_modifiable(u64 spte)
450 {
451  return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
452 }
453 
454 static bool spte_has_volatile_bits(u64 spte)
455 {
456  /*
457  * Always atomically update the spte if it can be updated
458  * out of mmu-lock: this ensures the dirty bit is not lost,
459  * and it also helps us get a stable is_writable_pte()
460  * so that a needed tlb flush is not missed.
461  */
462  if (spte_is_locklessly_modifiable(spte))
463  return true;
464 
465  if (!shadow_accessed_mask)
466  return false;
467 
468  if (!is_shadow_present_pte(spte))
469  return false;
470 
471  if ((spte & shadow_accessed_mask) &&
472  (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
473  return false;
474 
475  return true;
476 }
477 
478 static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
479 {
480  return (old_spte & bit_mask) && !(new_spte & bit_mask);
481 }
482 
483 /* Rules for using mmu_spte_set:
484  * Set the sptep from nonpresent to present.
485  * Note: the sptep being assigned *must* be either not present
486  * or in a state where the hardware will not attempt to update
487  * the spte.
488  */
489 static void mmu_spte_set(u64 *sptep, u64 new_spte)
490 {
491  WARN_ON(is_shadow_present_pte(*sptep));
492  __set_spte(sptep, new_spte);
493 }
494 
495 /* Rules for using mmu_spte_update:
496  * Update the state bits; this means the mapped pfn is not changed.
497  *
498  * Whenever we overwrite a writable spte with a read-only one we
499  * should flush remote TLBs. Otherwise rmap_write_protect
500  * will find a read-only spte, even though the writable spte
501  * might be cached on a CPU's TLB; the return value indicates this
502  * case.
503  */
504 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
505 {
506  u64 old_spte = *sptep;
507  bool ret = false;
508 
509  WARN_ON(!is_rmap_spte(new_spte));
510 
511  if (!is_shadow_present_pte(old_spte)) {
512  mmu_spte_set(sptep, new_spte);
513  return ret;
514  }
515 
516  if (!spte_has_volatile_bits(old_spte))
517  __update_clear_spte_fast(sptep, new_spte);
518  else
519  old_spte = __update_clear_spte_slow(sptep, new_spte);
520 
521  /*
522  * Updating the spte out of mmu-lock is safe, since
523  * we always update it atomically; see the comments in
524  * spte_has_volatile_bits().
525  */
526  if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
527  ret = true;
528 
529  if (!shadow_accessed_mask)
530  return ret;
531 
532  if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
533  kvm_set_pfn_accessed(spte_to_pfn(old_spte));
534  if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
535  kvm_set_pfn_dirty(spte_to_pfn(old_spte));
536 
537  return ret;
538 }
539 
540 /*
541  * Rules for using mmu_spte_clear_track_bits:
542  * It sets the sptep from present to nonpresent and tracks the
543  * state bits; it is used to clear the last level sptep.
544  */
545 static int mmu_spte_clear_track_bits(u64 *sptep)
546 {
547  pfn_t pfn;
548  u64 old_spte = *sptep;
549 
550  if (!spte_has_volatile_bits(old_spte))
551  __update_clear_spte_fast(sptep, 0ull);
552  else
553  old_spte = __update_clear_spte_slow(sptep, 0ull);
554 
555  if (!is_rmap_spte(old_spte))
556  return 0;
557 
558  pfn = spte_to_pfn(old_spte);
559 
560  /*
561  * KVM does not hold a refcount on the pages used by the
562  * kvm mmu; before reclaiming a page, we should
563  * unmap it from the mmu first.
564  */
565  WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
566 
567  if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
568  kvm_set_pfn_accessed(pfn);
569  if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
570  kvm_set_pfn_dirty(pfn);
571  return 1;
572 }
573 
574 /*
575  * Rules for using mmu_spte_clear_no_track:
576  * Directly clear the spte without caring about the state bits of the
577  * sptep; it is used when clearing an upper level spte.
578  */
579 static void mmu_spte_clear_no_track(u64 *sptep)
580 {
581  __update_clear_spte_fast(sptep, 0ull);
582 }
583 
584 static u64 mmu_spte_get_lockless(u64 *sptep)
585 {
586  return __get_spte_lockless(sptep);
587 }
588 
589 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
590 {
591  /*
592  * Prevent page table teardown by making any free-er wait during
593  * kvm_flush_remote_tlbs() IPI to all active vcpus.
594  */
595  local_irq_disable();
596  vcpu->mode = READING_SHADOW_PAGE_TABLES;
597  /*
598  * Make sure a following spte read is not reordered ahead of the write
599  * to vcpu->mode.
600  */
601  smp_mb();
602 }
603 
604 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
605 {
606  /*
607  * Make sure the write to vcpu->mode is not reordered in front of
608  * reads to sptes. If it does, kvm_commit_zap_page() can see us
609  * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
610  */
611  smp_mb();
612  vcpu->mode = OUTSIDE_GUEST_MODE;
613  local_irq_enable();
614 }
615 
616 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
617  struct kmem_cache *base_cache, int min)
618 {
619  void *obj;
620 
621  if (cache->nobjs >= min)
622  return 0;
623  while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
624  obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
625  if (!obj)
626  return -ENOMEM;
627  cache->objects[cache->nobjs++] = obj;
628  }
629  return 0;
630 }
631 
632 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
633 {
634  return cache->nobjs;
635 }
636 
637 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
638  struct kmem_cache *cache)
639 {
640  while (mc->nobjs)
641  kmem_cache_free(cache, mc->objects[--mc->nobjs]);
642 }
643 
644 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
645  int min)
646 {
647  void *page;
648 
649  if (cache->nobjs >= min)
650  return 0;
651  while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
652  page = (void *)__get_free_page(GFP_KERNEL);
653  if (!page)
654  return -ENOMEM;
655  cache->objects[cache->nobjs++] = page;
656  }
657  return 0;
658 }
659 
660 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
661 {
662  while (mc->nobjs)
663  free_page((unsigned long)mc->objects[--mc->nobjs]);
664 }
665 
666 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
667 {
668  int r;
669 
670  r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
671  pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
672  if (r)
673  goto out;
674  r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
675  if (r)
676  goto out;
677  r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
678  mmu_page_header_cache, 4);
679 out:
680  return r;
681 }
682 
683 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
684 {
685  mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
686  pte_list_desc_cache);
687  mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
688  mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
689  mmu_page_header_cache);
690 }
691 
692 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
693 {
694  void *p;
695 
696  BUG_ON(!mc->nobjs);
697  p = mc->objects[--mc->nobjs];
698  return p;
699 }
700 
701 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
702 {
703  return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
704 }
705 
706 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
707 {
708  kmem_cache_free(pte_list_desc_cache, pte_list_desc);
709 }
710 
711 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
712 {
713  if (!sp->role.direct)
714  return sp->gfns[index];
715 
716  return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
717 }
718 
719 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
720 {
721  if (sp->role.direct)
722  BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
723  else
724  sp->gfns[index] = gfn;
725 }
726 
727 /*
728  * Return the pointer to the large page information for a given gfn,
729  * handling slots that are not large page aligned.
730  */
731 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
732  struct kvm_memory_slot *slot,
733  int level)
734 {
735  unsigned long idx;
736 
737  idx = gfn_to_index(gfn, slot->base_gfn, level);
738  return &slot->arch.lpage_info[level - 2][idx];
739 }
740 
741 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
742 {
743  struct kvm_memory_slot *slot;
744  struct kvm_lpage_info *linfo;
745  int i;
746 
747  slot = gfn_to_memslot(kvm, gfn);
748  for (i = PT_DIRECTORY_LEVEL;
749  i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
750  linfo = lpage_info_slot(gfn, slot, i);
751  linfo->write_count += 1;
752  }
753  kvm->arch.indirect_shadow_pages++;
754 }
755 
756 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
757 {
758  struct kvm_memory_slot *slot;
759  struct kvm_lpage_info *linfo;
760  int i;
761 
762  slot = gfn_to_memslot(kvm, gfn);
763  for (i = PT_DIRECTORY_LEVEL;
764  i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
765  linfo = lpage_info_slot(gfn, slot, i);
766  linfo->write_count -= 1;
767  WARN_ON(linfo->write_count < 0);
768  }
769  kvm->arch.indirect_shadow_pages--;
770 }
771 
772 static int has_wrprotected_page(struct kvm *kvm,
773  gfn_t gfn,
774  int level)
775 {
776  struct kvm_memory_slot *slot;
777  struct kvm_lpage_info *linfo;
778 
779  slot = gfn_to_memslot(kvm, gfn);
780  if (slot) {
781  linfo = lpage_info_slot(gfn, slot, level);
782  return linfo->write_count;
783  }
784 
785  return 1;
786 }
787 
788 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
789 {
790  unsigned long page_size;
791  int i, ret = 0;
792 
793  page_size = kvm_host_page_size(kvm, gfn);
794 
795  for (i = PT_PAGE_TABLE_LEVEL;
796  i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
797  if (page_size >= KVM_HPAGE_SIZE(i))
798  ret = i;
799  else
800  break;
801  }
802 
803  return ret;
804 }
805 
806 static struct kvm_memory_slot *
807 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
808  bool no_dirty_log)
809 {
810  struct kvm_memory_slot *slot;
811 
812  slot = gfn_to_memslot(vcpu->kvm, gfn);
813  if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
814  (no_dirty_log && slot->dirty_bitmap))
815  slot = NULL;
816 
817  return slot;
818 }
819 
820 static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
821 {
822  return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
823 }
824 
825 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
826 {
827  int host_level, level, max_level;
828 
829  host_level = host_mapping_level(vcpu->kvm, large_gfn);
830 
831  if (host_level == PT_PAGE_TABLE_LEVEL)
832  return host_level;
833 
834  max_level = kvm_x86_ops->get_lpage_level() < host_level ?
835  kvm_x86_ops->get_lpage_level() : host_level;
836 
837  for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
838  if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
839  break;
840 
841  return level - 1;
842 }
843 
844 /*
845  * Pte mapping structures:
846  *
847  * If pte_list bit zero is zero, then pte_list points to the spte.
848  *
849  * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
850  * pte_list_desc containing more mappings.
851  *
852  * Returns the number of pte entries before the spte was added or zero if
853  * the spte was not added.
854  *
855  */
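/*
 * Example of the encoding described above: a gfn mapped by a single spte
 * stores that spte's address directly in pte_list (bit zero clear); once
 * a second spte is added, pte_list becomes (unsigned long)desc | 1, so
 * readers mask with ~1ul to recover the pte_list_desc, which holds up to
 * PTE_LIST_EXT sptes and chains further descriptors via desc->more.
 */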
856 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
857  unsigned long *pte_list)
858 {
859  struct pte_list_desc *desc;
860  int i, count = 0;
861 
862  if (!*pte_list) {
863  rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
864  *pte_list = (unsigned long)spte;
865  } else if (!(*pte_list & 1)) {
866  rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
867  desc = mmu_alloc_pte_list_desc(vcpu);
868  desc->sptes[0] = (u64 *)*pte_list;
869  desc->sptes[1] = spte;
870  *pte_list = (unsigned long)desc | 1;
871  ++count;
872  } else {
873  rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
874  desc = (struct pte_list_desc *)(*pte_list & ~1ul);
875  while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
876  desc = desc->more;
877  count += PTE_LIST_EXT;
878  }
879  if (desc->sptes[PTE_LIST_EXT-1]) {
880  desc->more = mmu_alloc_pte_list_desc(vcpu);
881  desc = desc->more;
882  }
883  for (i = 0; desc->sptes[i]; ++i)
884  ++count;
885  desc->sptes[i] = spte;
886  }
887  return count;
888 }
889 
890 static void
891 pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
892  int i, struct pte_list_desc *prev_desc)
893 {
894  int j;
895 
896  for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
897  ;
898  desc->sptes[i] = desc->sptes[j];
899  desc->sptes[j] = NULL;
900  if (j != 0)
901  return;
902  if (!prev_desc && !desc->more)
903  *pte_list = (unsigned long)desc->sptes[0];
904  else
905  if (prev_desc)
906  prev_desc->more = desc->more;
907  else
908  *pte_list = (unsigned long)desc->more | 1;
909  mmu_free_pte_list_desc(desc);
910 }
911 
912 static void pte_list_remove(u64 *spte, unsigned long *pte_list)
913 {
914  struct pte_list_desc *desc;
915  struct pte_list_desc *prev_desc;
916  int i;
917 
918  if (!*pte_list) {
919  printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
920  BUG();
921  } else if (!(*pte_list & 1)) {
922  rmap_printk("pte_list_remove: %p 1->0\n", spte);
923  if ((u64 *)*pte_list != spte) {
924  printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
925  BUG();
926  }
927  *pte_list = 0;
928  } else {
929  rmap_printk("pte_list_remove: %p many->many\n", spte);
930  desc = (struct pte_list_desc *)(*pte_list & ~1ul);
931  prev_desc = NULL;
932  while (desc) {
933  for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
934  if (desc->sptes[i] == spte) {
935  pte_list_desc_remove_entry(pte_list,
936  desc, i,
937  prev_desc);
938  return;
939  }
940  prev_desc = desc;
941  desc = desc->more;
942  }
943  pr_err("pte_list_remove: %p many->many\n", spte);
944  BUG();
945  }
946 }
947 
948 typedef void (*pte_list_walk_fn) (u64 *spte);
949 static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
950 {
951  struct pte_list_desc *desc;
952  int i;
953 
954  if (!*pte_list)
955  return;
956 
957  if (!(*pte_list & 1))
958  return fn((u64 *)*pte_list);
959 
960  desc = (struct pte_list_desc *)(*pte_list & ~1ul);
961  while (desc) {
962  for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
963  fn(desc->sptes[i]);
964  desc = desc->more;
965  }
966 }
967 
968 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
969  struct kvm_memory_slot *slot)
970 {
971  unsigned long idx;
972 
973  idx = gfn_to_index(gfn, slot->base_gfn, level);
974  return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
975 }
976 
977 /*
978  * Take gfn and return the reverse mapping to it.
979  */
980 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
981 {
982  struct kvm_memory_slot *slot;
983 
984  slot = gfn_to_memslot(kvm, gfn);
985  return __gfn_to_rmap(gfn, level, slot);
986 }
987 
988 static bool rmap_can_add(struct kvm_vcpu *vcpu)
989 {
990  struct kvm_mmu_memory_cache *cache;
991 
992  cache = &vcpu->arch.mmu_pte_list_desc_cache;
993  return mmu_memory_cache_free_objects(cache);
994 }
995 
996 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
997 {
998  struct kvm_mmu_page *sp;
999  unsigned long *rmapp;
1000 
1001  sp = page_header(__pa(spte));
1002  kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1003  rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
1004  return pte_list_add(vcpu, spte, rmapp);
1005 }
1006 
1007 static void rmap_remove(struct kvm *kvm, u64 *spte)
1008 {
1009  struct kvm_mmu_page *sp;
1010  gfn_t gfn;
1011  unsigned long *rmapp;
1012 
1013  sp = page_header(__pa(spte));
1014  gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1015  rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
1016  pte_list_remove(spte, rmapp);
1017 }
1018 
1019 /*
1020  * Used by the following functions to iterate through the sptes linked by a
1021  * rmap. All fields are private and not assumed to be used outside.
1022  */
1023 struct rmap_iterator {
1024  /* private fields */
1025  struct pte_list_desc *desc; /* holds the sptep if not NULL */
1026  int pos; /* index of the sptep */
1027 };
1028 
1029 /*
1030  * Iteration must be started by this function. This should also be used after
1031  * removing/dropping sptes from the rmap link because in such cases the
1032  * information in the iterator may not be valid.
1033  *
1034  * Returns sptep if found, NULL otherwise.
1035  */
1036 static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
1037 {
1038  if (!rmap)
1039  return NULL;
1040 
1041  if (!(rmap & 1)) {
1042  iter->desc = NULL;
1043  return (u64 *)rmap;
1044  }
1045 
1046  iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
1047  iter->pos = 0;
1048  return iter->desc->sptes[iter->pos];
1049 }
1050 
1051 /*
1052  * Must be used with a valid iterator: e.g. after rmap_get_first().
1053  *
1054  * Returns sptep if found, NULL otherwise.
1055  */
1056 static u64 *rmap_get_next(struct rmap_iterator *iter)
1057 {
1058  if (iter->desc) {
1059  if (iter->pos < PTE_LIST_EXT - 1) {
1060  u64 *sptep;
1061 
1062  ++iter->pos;
1063  sptep = iter->desc->sptes[iter->pos];
1064  if (sptep)
1065  return sptep;
1066  }
1067 
1068  iter->desc = iter->desc->more;
1069 
1070  if (iter->desc) {
1071  iter->pos = 0;
1072  /* desc->sptes[0] cannot be NULL */
1073  return iter->desc->sptes[iter->pos];
1074  }
1075  }
1076 
1077  return NULL;
1078 }
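/*
 * The two helpers above are used together, typically as:
 *
 *	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 *	     sptep = rmap_get_next(&iter))
 *		...
 *
 * If an spte is removed from the rmap inside the loop, the walk must be
 * restarted with rmap_get_first(), as __rmap_write_protect() does below.
 */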
1079 
1080 static void drop_spte(struct kvm *kvm, u64 *sptep)
1081 {
1082  if (mmu_spte_clear_track_bits(sptep))
1083  rmap_remove(kvm, sptep);
1084 }
1085 
1086 
1087 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1088 {
1089  if (is_large_pte(*sptep)) {
1090  WARN_ON(page_header(__pa(sptep))->role.level ==
1091  PT_PAGE_TABLE_LEVEL);
1092  drop_spte(kvm, sptep);
1093  --kvm->stat.lpages;
1094  return true;
1095  }
1096 
1097  return false;
1098 }
1099 
1100 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1101 {
1102  if (__drop_large_spte(vcpu->kvm, sptep))
1103  kvm_flush_remote_tlbs(vcpu->kvm);
1104 }
1105 
1106 /*
1107  * Write-protect the specified @sptep. @pt_protect indicates whether the
1108  * spte write-protection is caused by protecting the shadow page table.
1109  * @flush indicates whether the tlb needs to be flushed.
1110  *
1111  * Note: write protection differs between dirty logging and spte
1112  * protection:
1113  * - for dirty logging, the spte can be set to writable at any time if
1114  * its dirty bitmap is properly set.
1115  * - for spte protection, the spte can be writable only after unsync-ing
1116  * the shadow page.
1117  *
1118  * Return true if the spte is dropped.
1119  */
1120 static bool
1121 spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1122 {
1123  u64 spte = *sptep;
1124 
1125  if (!is_writable_pte(spte) &&
1126  !(pt_protect && spte_is_locklessly_modifiable(spte)))
1127  return false;
1128 
1129  rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1130 
1131  if (__drop_large_spte(kvm, sptep)) {
1132  *flush |= true;
1133  return true;
1134  }
1135 
1136  if (pt_protect)
1137  spte &= ~SPTE_MMU_WRITEABLE;
1138  spte = spte & ~PT_WRITABLE_MASK;
1139 
1140  *flush |= mmu_spte_update(sptep, spte);
1141  return false;
1142 }
1143 
1144 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1145  int level, bool pt_protect)
1146 {
1147  u64 *sptep;
1148  struct rmap_iterator iter;
1149  bool flush = false;
1150 
1151  for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1152  BUG_ON(!(*sptep & PT_PRESENT_MASK));
1153  if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
1154  sptep = rmap_get_first(*rmapp, &iter);
1155  continue;
1156  }
1157 
1158  sptep = rmap_get_next(&iter);
1159  }
1160 
1161  return flush;
1162 }
1163 
1174 void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1175  struct kvm_memory_slot *slot,
1176  gfn_t gfn_offset, unsigned long mask)
1177 {
1178  unsigned long *rmapp;
1179 
1180  while (mask) {
1181  rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1182  PT_PAGE_TABLE_LEVEL, slot);
1183  __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1184 
1185  /* clear the first set bit */
1186  mask &= mask - 1;
1187  }
1188 }
1189 
1190 static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1191 {
1192  struct kvm_memory_slot *slot;
1193  unsigned long *rmapp;
1194  int i;
1195  bool write_protected = false;
1196 
1197  slot = gfn_to_memslot(kvm, gfn);
1198 
1199  for (i = PT_PAGE_TABLE_LEVEL;
1200  i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1201  rmapp = __gfn_to_rmap(gfn, i, slot);
1202  write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
1203  }
1204 
1205  return write_protected;
1206 }
1207 
1208 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1209  struct kvm_memory_slot *slot, unsigned long data)
1210 {
1211  u64 *sptep;
1212  struct rmap_iterator iter;
1213  int need_tlb_flush = 0;
1214 
1215  while ((sptep = rmap_get_first(*rmapp, &iter))) {
1216  BUG_ON(!(*sptep & PT_PRESENT_MASK));
1217  rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
1218 
1219  drop_spte(kvm, sptep);
1220  need_tlb_flush = 1;
1221  }
1222 
1223  return need_tlb_flush;
1224 }
1225 
1226 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1227  struct kvm_memory_slot *slot, unsigned long data)
1228 {
1229  u64 *sptep;
1230  struct rmap_iterator iter;
1231  int need_flush = 0;
1232  u64 new_spte;
1233  pte_t *ptep = (pte_t *)data;
1234  pfn_t new_pfn;
1235 
1236  WARN_ON(pte_huge(*ptep));
1237  new_pfn = pte_pfn(*ptep);
1238 
1239  for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1240  BUG_ON(!is_shadow_present_pte(*sptep));
1241  rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
1242 
1243  need_flush = 1;
1244 
1245  if (pte_write(*ptep)) {
1246  drop_spte(kvm, sptep);
1247  sptep = rmap_get_first(*rmapp, &iter);
1248  } else {
1249  new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1250  new_spte |= (u64)new_pfn << PAGE_SHIFT;
1251 
1252  new_spte &= ~PT_WRITABLE_MASK;
1253  new_spte &= ~SPTE_HOST_WRITEABLE;
1254  new_spte &= ~shadow_accessed_mask;
1255 
1256  mmu_spte_clear_track_bits(sptep);
1257  mmu_spte_set(sptep, new_spte);
1258  sptep = rmap_get_next(&iter);
1259  }
1260  }
1261 
1262  if (need_flush)
1263  kvm_flush_remote_tlbs(kvm);
1264 
1265  return 0;
1266 }
1267 
1268 static int kvm_handle_hva_range(struct kvm *kvm,
1269  unsigned long start,
1270  unsigned long end,
1271  unsigned long data,
1272  int (*handler)(struct kvm *kvm,
1273  unsigned long *rmapp,
1274  struct kvm_memory_slot *slot,
1275  unsigned long data))
1276 {
1277  int j;
1278  int ret = 0;
1279  struct kvm_memslots *slots;
1280  struct kvm_memory_slot *memslot;
1281 
1282  slots = kvm_memslots(kvm);
1283 
1284  kvm_for_each_memslot(memslot, slots) {
1285  unsigned long hva_start, hva_end;
1286  gfn_t gfn_start, gfn_end;
1287 
1288  hva_start = max(start, memslot->userspace_addr);
1289  hva_end = min(end, memslot->userspace_addr +
1290  (memslot->npages << PAGE_SHIFT));
1291  if (hva_start >= hva_end)
1292  continue;
1293  /*
1294  * {gfn(page) | page intersects with [hva_start, hva_end)} =
1295  * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1296  */
1297  gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1298  gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1299 
1300  for (j = PT_PAGE_TABLE_LEVEL;
1301  j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
1302  unsigned long idx, idx_end;
1303  unsigned long *rmapp;
1304 
1305  /*
1306  * {idx(page_j) | page_j intersects with
1307  * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
1308  */
1309  idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
1310  idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
1311 
1312  rmapp = __gfn_to_rmap(gfn_start, j, memslot);
1313 
1314  for (; idx <= idx_end; ++idx)
1315  ret |= handler(kvm, rmapp++, memslot, data);
1316  }
1317  }
1318 
1319  return ret;
1320 }
1321 
1322 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1323  unsigned long data,
1324  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
1325  struct kvm_memory_slot *slot,
1326  unsigned long data))
1327 {
1328  return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1329 }
1330 
1331 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1332 {
1333  return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
1334 }
1335 
1336 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1337 {
1338  return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1339 }
1340 
1341 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1342 {
1343  kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1344 }
1345 
1346 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1347  struct kvm_memory_slot *slot, unsigned long data)
1348 {
1349  u64 *sptep;
1350  struct rmap_iterator uninitialized_var(iter);
1351  int young = 0;
1352 
1353  /*
1354  * In the absence of EPT Access and Dirty Bit support,
1355  * emulate the accessed bit for EPT by checking if this page has
1356  * an EPT mapping, and clearing it if it does. On the next access,
1357  * a new EPT mapping will be established.
1358  * This has some overhead, but not as much as the cost of swapping
1359  * out actively used pages or breaking up actively used hugepages.
1360  */
1361  if (!shadow_accessed_mask) {
1362  young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
1363  goto out;
1364  }
1365 
1366  for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1367  sptep = rmap_get_next(&iter)) {
1368  BUG_ON(!is_shadow_present_pte(*sptep));
1369 
1370  if (*sptep & shadow_accessed_mask) {
1371  young = 1;
1372  clear_bit((ffs(shadow_accessed_mask) - 1),
1373  (unsigned long *)sptep);
1374  }
1375  }
1376 out:
1377  /* @data has hva passed to kvm_age_hva(). */
1378  trace_kvm_age_page(data, slot, young);
1379  return young;
1380 }
1381 
1382 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1383  struct kvm_memory_slot *slot, unsigned long data)
1384 {
1385  u64 *sptep;
1386  struct rmap_iterator iter;
1387  int young = 0;
1388 
1389  /*
1390  * If there's no access bit in the secondary pte set by the
1391  * hardware it's up to gup-fast/gup to set the access bit in
1392  * the primary pte or in the page structure.
1393  */
1394  if (!shadow_accessed_mask)
1395  goto out;
1396 
1397  for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1398  sptep = rmap_get_next(&iter)) {
1399  BUG_ON(!is_shadow_present_pte(*sptep));
1400 
1401  if (*sptep & shadow_accessed_mask) {
1402  young = 1;
1403  break;
1404  }
1405  }
1406 out:
1407  return young;
1408 }
1409 
1410 #define RMAP_RECYCLE_THRESHOLD 1000
1411 
1412 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1413 {
1414  unsigned long *rmapp;
1415  struct kvm_mmu_page *sp;
1416 
1417  sp = page_header(__pa(spte));
1418 
1419  rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
1420 
1421  kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
1422  kvm_flush_remote_tlbs(vcpu->kvm);
1423 }
1424 
1425 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1426 {
1427  return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
1428 }
1429 
1430 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1431 {
1432  return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1433 }
1434 
1435 #ifdef MMU_DEBUG
1436 static int is_empty_shadow_page(u64 *spt)
1437 {
1438  u64 *pos;
1439  u64 *end;
1440 
1441  for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1442  if (is_shadow_present_pte(*pos)) {
1443  printk(KERN_ERR "%s: %p %llx\n", __func__,
1444  pos, *pos);
1445  return 0;
1446  }
1447  return 1;
1448 }
1449 #endif
1450 
1451 /*
1452  * This value is the sum of all of the kvm instances'
1453  * kvm->arch.n_used_mmu_pages values. We need a global,
1454  * aggregate version in order to make the slab shrinker
1455  * faster.
1456  */
1457 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1458 {
1459  kvm->arch.n_used_mmu_pages += nr;
1460  percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1461 }
1462 
1463 /*
1464  * Remove the sp from the shadow page cache. After calling it,
1465  * we can no longer find this sp in the cache, but the shadow
1466  * page table is still valid.
1467  * It should be called under the protection of the mmu lock.
1468  */
1469 static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1470 {
1471  ASSERT(is_empty_shadow_page(sp->spt));
1472  hlist_del(&sp->hash_link);
1473  if (!sp->role.direct)
1474  free_page((unsigned long)sp->gfns);
1475 }
1476 
1477 /*
1478  * Free the shadow page table and the sp; we can do it
1479  * outside the protection of the mmu lock.
1480  */
1481 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1482 {
1483  list_del(&sp->link);
1484  free_page((unsigned long)sp->spt);
1485  kmem_cache_free(mmu_page_header_cache, sp);
1486 }
1487 
1488 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1489 {
1490  return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1491 }
1492 
1493 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1494  struct kvm_mmu_page *sp, u64 *parent_pte)
1495 {
1496  if (!parent_pte)
1497  return;
1498 
1499  pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1500 }
1501 
1502 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1503  u64 *parent_pte)
1504 {
1505  pte_list_remove(parent_pte, &sp->parent_ptes);
1506 }
1507 
1508 static void drop_parent_pte(struct kvm_mmu_page *sp,
1509  u64 *parent_pte)
1510 {
1511  mmu_page_remove_parent_pte(sp, parent_pte);
1512  mmu_spte_clear_no_track(parent_pte);
1513 }
1514 
1515 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1516  u64 *parent_pte, int direct)
1517 {
1518  struct kvm_mmu_page *sp;
1519  sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1520  sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1521  if (!direct)
1522  sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1523  set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1524  list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1525  bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
1526  sp->parent_ptes = 0;
1527  mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1528  kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1529  return sp;
1530 }
1531 
1532 static void mark_unsync(u64 *spte);
1533 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1534 {
1535  pte_list_walk(&sp->parent_ptes, mark_unsync);
1536 }
1537 
1538 static void mark_unsync(u64 *spte)
1539 {
1540  struct kvm_mmu_page *sp;
1541  unsigned int index;
1542 
1543  sp = page_header(__pa(spte));
1544  index = spte - sp->spt;
1545  if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1546  return;
1547  if (sp->unsync_children++)
1548  return;
1549  kvm_mmu_mark_parents_unsync(sp);
1550 }
1551 
1552 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1553  struct kvm_mmu_page *sp)
1554 {
1555  return 1;
1556 }
1557 
1558 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1559 {
1560 }
1561 
1562 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1563  struct kvm_mmu_page *sp, u64 *spte,
1564  const void *pte)
1565 {
1566  WARN_ON(1);
1567 }
1568 
1569 #define KVM_PAGE_ARRAY_NR 16
1570 
1571 struct kvm_mmu_pages {
1572  struct mmu_page_and_offset {
1573  struct kvm_mmu_page *sp;
1574  unsigned int idx;
1575  } page[KVM_PAGE_ARRAY_NR];
1576  unsigned int nr;
1577 };
1578 
1579 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1580  int idx)
1581 {
1582  int i;
1583 
1584  if (sp->unsync)
1585  for (i=0; i < pvec->nr; i++)
1586  if (pvec->page[i].sp == sp)
1587  return 0;
1588 
1589  pvec->page[pvec->nr].sp = sp;
1590  pvec->page[pvec->nr].idx = idx;
1591  pvec->nr++;
1592  return (pvec->nr == KVM_PAGE_ARRAY_NR);
1593 }
1594 
1595 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1596  struct kvm_mmu_pages *pvec)
1597 {
1598  int i, ret, nr_unsync_leaf = 0;
1599 
1600  for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1601  struct kvm_mmu_page *child;
1602  u64 ent = sp->spt[i];
1603 
1604  if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1605  goto clear_child_bitmap;
1606 
1607  child = page_header(ent & PT64_BASE_ADDR_MASK);
1608 
1609  if (child->unsync_children) {
1610  if (mmu_pages_add(pvec, child, i))
1611  return -ENOSPC;
1612 
1613  ret = __mmu_unsync_walk(child, pvec);
1614  if (!ret)
1615  goto clear_child_bitmap;
1616  else if (ret > 0)
1617  nr_unsync_leaf += ret;
1618  else
1619  return ret;
1620  } else if (child->unsync) {
1621  nr_unsync_leaf++;
1622  if (mmu_pages_add(pvec, child, i))
1623  return -ENOSPC;
1624  } else
1625  goto clear_child_bitmap;
1626 
1627  continue;
1628 
1629 clear_child_bitmap:
1630  __clear_bit(i, sp->unsync_child_bitmap);
1631  sp->unsync_children--;
1632  WARN_ON((int)sp->unsync_children < 0);
1633  }
1634 
1635 
1636  return nr_unsync_leaf;
1637 }
1638 
1639 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1640  struct kvm_mmu_pages *pvec)
1641 {
1642  if (!sp->unsync_children)
1643  return 0;
1644 
1645  mmu_pages_add(pvec, sp, 0);
1646  return __mmu_unsync_walk(sp, pvec);
1647 }
1648 
1649 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1650 {
1651  WARN_ON(!sp->unsync);
1652  trace_kvm_mmu_sync_page(sp);
1653  sp->unsync = 0;
1654  --kvm->stat.mmu_unsync;
1655 }
1656 
1657 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1658  struct list_head *invalid_list);
1659 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1660  struct list_head *invalid_list);
1661 
1662 #define for_each_gfn_sp(kvm, sp, gfn, pos) \
1663  hlist_for_each_entry(sp, pos, \
1664  &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1665  if ((sp)->gfn != (gfn)) {} else
1666 
1667 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1668  hlist_for_each_entry(sp, pos, \
1669  &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1670  if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1671  (sp)->role.invalid) {} else
1672 
1673 /* @sp->gfn should be write-protected at the call site */
1674 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1675  struct list_head *invalid_list, bool clear_unsync)
1676 {
1677  if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1678  kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1679  return 1;
1680  }
1681 
1682  if (clear_unsync)
1683  kvm_unlink_unsync_page(vcpu->kvm, sp);
1684 
1685  if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1686  kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1687  return 1;
1688  }
1689 
1690  kvm_mmu_flush_tlb(vcpu);
1691  return 0;
1692 }
1693 
1694 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1695  struct kvm_mmu_page *sp)
1696 {
1697  LIST_HEAD(invalid_list);
1698  int ret;
1699 
1700  ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1701  if (ret)
1702  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1703 
1704  return ret;
1705 }
1706 
1707 #ifdef CONFIG_KVM_MMU_AUDIT
1708 #include "mmu_audit.c"
1709 #else
1710 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
1711 static void mmu_audit_disable(void) { }
1712 #endif
1713 
1714 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1715  struct list_head *invalid_list)
1716 {
1717  return __kvm_sync_page(vcpu, sp, invalid_list, true);
1718 }
1719 
1720 /* @gfn should be write-protected at the call site */
1721 static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1722 {
1723  struct kvm_mmu_page *s;
1724  struct hlist_node *node;
1725  LIST_HEAD(invalid_list);
1726  bool flush = false;
1727 
1728  for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1729  if (!s->unsync)
1730  continue;
1731 
1732  WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1733  kvm_unlink_unsync_page(vcpu->kvm, s);
1734  if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1735  (vcpu->arch.mmu.sync_page(vcpu, s))) {
1736  kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1737  continue;
1738  }
1739  flush = true;
1740  }
1741 
1742  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1743  if (flush)
1744  kvm_mmu_flush_tlb(vcpu);
1745 }
1746 
1747 struct mmu_page_path {
1748  struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1749  unsigned int idx[PT64_ROOT_LEVEL-1];
1750 };
1751 
1752 #define for_each_sp(pvec, sp, parents, i) \
1753  for (i = mmu_pages_next(&pvec, &parents, -1), \
1754  sp = pvec.page[i].sp; \
1755  i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1756  i = mmu_pages_next(&pvec, &parents, i))
1757 
1758 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1759  struct mmu_page_path *parents,
1760  int i)
1761 {
1762  int n;
1763 
1764  for (n = i+1; n < pvec->nr; n++) {
1765  struct kvm_mmu_page *sp = pvec->page[n].sp;
1766 
1767  if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1768  parents->idx[0] = pvec->page[n].idx;
1769  return n;
1770  }
1771 
1772  parents->parent[sp->role.level-2] = sp;
1773  parents->idx[sp->role.level-1] = pvec->page[n].idx;
1774  }
1775 
1776  return n;
1777 }
1778 
1779 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1780 {
1781  struct kvm_mmu_page *sp;
1782  unsigned int level = 0;
1783 
1784  do {
1785  unsigned int idx = parents->idx[level];
1786 
1787  sp = parents->parent[level];
1788  if (!sp)
1789  return;
1790 
1791  --sp->unsync_children;
1792  WARN_ON((int)sp->unsync_children < 0);
1793  __clear_bit(idx, sp->unsync_child_bitmap);
1794  level++;
1795  } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1796 }
1797 
1798 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1799  struct mmu_page_path *parents,
1800  struct kvm_mmu_pages *pvec)
1801 {
1802  parents->parent[parent->role.level-1] = NULL;
1803  pvec->nr = 0;
1804 }
1805 
1806 static void mmu_sync_children(struct kvm_vcpu *vcpu,
1807  struct kvm_mmu_page *parent)
1808 {
1809  int i;
1810  struct kvm_mmu_page *sp;
1811  struct mmu_page_path parents;
1812  struct kvm_mmu_pages pages;
1813  LIST_HEAD(invalid_list);
1814 
1815  kvm_mmu_pages_init(parent, &parents, &pages);
1816  while (mmu_unsync_walk(parent, &pages)) {
1817  bool protected = false;
1818 
1819  for_each_sp(pages, sp, parents, i)
1820  protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1821 
1822  if (protected)
1823  kvm_flush_remote_tlbs(vcpu->kvm);
1824 
1825  for_each_sp(pages, sp, parents, i) {
1826  kvm_sync_page(vcpu, sp, &invalid_list);
1827  mmu_pages_clear_parents(&parents);
1828  }
1829  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1830  cond_resched_lock(&vcpu->kvm->mmu_lock);
1831  kvm_mmu_pages_init(parent, &parents, &pages);
1832  }
1833 }
1834 
1835 static void init_shadow_page_table(struct kvm_mmu_page *sp)
1836 {
1837  int i;
1838 
1839  for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1840  sp->spt[i] = 0ull;
1841 }
1842 
1843 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
1844 {
1845  sp->write_flooding_count = 0;
1846 }
1847 
1848 static void clear_sp_write_flooding_count(u64 *spte)
1849 {
1850  struct kvm_mmu_page *sp = page_header(__pa(spte));
1851 
1852  __clear_sp_write_flooding_count(sp);
1853 }
1854 
1855 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1856  gfn_t gfn,
1857  gva_t gaddr,
1858  unsigned level,
1859  int direct,
1860  unsigned access,
1861  u64 *parent_pte)
1862 {
1863  union kvm_mmu_page_role role;
1864  unsigned quadrant;
1865  struct kvm_mmu_page *sp;
1866  struct hlist_node *node;
1867  bool need_sync = false;
1868 
1869  role = vcpu->arch.mmu.base_role;
1870  role.level = level;
1871  role.direct = direct;
1872  if (role.direct)
1873  role.cr4_pae = 0;
1874  role.access = access;
1875  if (!vcpu->arch.mmu.direct_map
1876  && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1877  quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1878  quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1879  role.quadrant = quadrant;
1880  }
1881  for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1882  if (!need_sync && sp->unsync)
1883  need_sync = true;
1884 
1885  if (sp->role.word != role.word)
1886  continue;
1887 
1888  if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1889  break;
1890 
1891  mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1892  if (sp->unsync_children) {
1893  kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1894  kvm_mmu_mark_parents_unsync(sp);
1895  } else if (sp->unsync)
1896  kvm_mmu_mark_parents_unsync(sp);
1897 
1898  __clear_sp_write_flooding_count(sp);
1899  trace_kvm_mmu_get_page(sp, false);
1900  return sp;
1901  }
1902  ++vcpu->kvm->stat.mmu_cache_miss;
1903  sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1904  if (!sp)
1905  return sp;
1906  sp->gfn = gfn;
1907  sp->role = role;
1908  hlist_add_head(&sp->hash_link,
1909  &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1910  if (!direct) {
1911  if (rmap_write_protect(vcpu->kvm, gfn))
1912  kvm_flush_remote_tlbs(vcpu->kvm);
1913  if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1914  kvm_sync_pages(vcpu, gfn);
1915 
1916  account_shadowed(vcpu->kvm, gfn);
1917  }
1918  init_shadow_page_table(sp);
1919  trace_kvm_mmu_get_page(sp, true);
1920  return sp;
1921 }
1922 
1923 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1924  struct kvm_vcpu *vcpu, u64 addr)
1925 {
1926  iterator->addr = addr;
1927  iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1928  iterator->level = vcpu->arch.mmu.shadow_root_level;
1929 
1930  if (iterator->level == PT64_ROOT_LEVEL &&
1931  vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1932  !vcpu->arch.mmu.direct_map)
1933  --iterator->level;
1934 
1935  if (iterator->level == PT32E_ROOT_LEVEL) {
1936  iterator->shadow_addr
1937  = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1938  iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1939  --iterator->level;
1940  if (!iterator->shadow_addr)
1941  iterator->level = 0;
1942  }
1943 }
1944 
1945 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1946 {
1947  if (iterator->level < PT_PAGE_TABLE_LEVEL)
1948  return false;
1949 
1950  iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1951  iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1952  return true;
1953 }
1954 
1955 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
1956  u64 spte)
1957 {
1958  if (is_last_spte(spte, iterator->level)) {
1959  iterator->level = 0;
1960  return;
1961  }
1962 
1963  iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
1964  --iterator->level;
1965 }
1966 
1967 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1968 {
1969  return __shadow_walk_next(iterator, *iterator->sptep);
1970 }
1971 
1972 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1973 {
1974  u64 spte;
1975 
1976  spte = __pa(sp->spt)
1977  | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK
1978  | shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
1979  mmu_spte_set(sptep, spte);
1980 }
1981 
1982 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1983  unsigned direct_access)
1984 {
1985  if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1986  struct kvm_mmu_page *child;
1987 
1988  /*
1989  * For the direct sp, if the guest pte's dirty bit
1990  * changed from clean to dirty, it will corrupt the
1991  * sp's access: allow writable in the read-only sp,
1992  * so we should update the spte at this point to get
1993  * a new sp with the correct access.
1994  */
1995  child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1996  if (child->role.access == direct_access)
1997  return;
1998 
1999  drop_parent_pte(child, sptep);
2000  kvm_flush_remote_tlbs(vcpu->kvm);
2001  }
2002 }
2003 
2004 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2005  u64 *spte)
2006 {
2007  u64 pte;
2008  struct kvm_mmu_page *child;
2009 
2010  pte = *spte;
2011  if (is_shadow_present_pte(pte)) {
2012  if (is_last_spte(pte, sp->role.level)) {
2013  drop_spte(kvm, spte);
2014  if (is_large_pte(pte))
2015  --kvm->stat.lpages;
2016  } else {
2017  child = page_header(pte & PT64_BASE_ADDR_MASK);
2018  drop_parent_pte(child, spte);
2019  }
2020  return true;
2021  }
2022 
2023  if (is_mmio_spte(pte))
2024  mmu_spte_clear_no_track(spte);
2025 
2026  return false;
2027 }
2028 
2029 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2030  struct kvm_mmu_page *sp)
2031 {
2032  unsigned i;
2033 
2034  for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2035  mmu_page_zap_pte(kvm, sp, sp->spt + i);
2036 }
2037 
2038 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
2039 {
2040  mmu_page_remove_parent_pte(sp, parent_pte);
2041 }
2042 
2043 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2044 {
2045  u64 *sptep;
2046  struct rmap_iterator iter;
2047 
2048  while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
2049  drop_parent_pte(sp, sptep);
2050 }
2051 
2052 static int mmu_zap_unsync_children(struct kvm *kvm,
2053  struct kvm_mmu_page *parent,
2054  struct list_head *invalid_list)
2055 {
2056  int i, zapped = 0;
2057  struct mmu_page_path parents;
2058  struct kvm_mmu_pages pages;
2059 
2060  if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2061  return 0;
2062 
2063  kvm_mmu_pages_init(parent, &parents, &pages);
2064  while (mmu_unsync_walk(parent, &pages)) {
2065  struct kvm_mmu_page *sp;
2066 
2067  for_each_sp(pages, sp, parents, i) {
2068  kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2069  mmu_pages_clear_parents(&parents);
2070  zapped++;
2071  }
2072  kvm_mmu_pages_init(parent, &parents, &pages);
2073  }
2074 
2075  return zapped;
2076 }
2077 
2078 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2079  struct list_head *invalid_list)
2080 {
2081  int ret;
2082 
2083  trace_kvm_mmu_prepare_zap_page(sp);
2084  ++kvm->stat.mmu_shadow_zapped;
2085  ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
2086  kvm_mmu_page_unlink_children(kvm, sp);
2087  kvm_mmu_unlink_parents(kvm, sp);
2088  if (!sp->role.invalid && !sp->role.direct)
2089  unaccount_shadowed(kvm, sp->gfn);
2090  if (sp->unsync)
2091  kvm_unlink_unsync_page(kvm, sp);
2092  if (!sp->root_count) {
2093  /* Count self */
2094  ret++;
2095  list_move(&sp->link, invalid_list);
2096  kvm_mod_used_mmu_pages(kvm, -1);
2097  } else {
2098  list_move(&sp->link, &kvm->arch.active_mmu_pages);
2099  kvm_reload_remote_mmus(kvm);
2100  }
2101 
2102  sp->role.invalid = 1;
2103  return ret;
2104 }
2105 
2106 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2107  struct list_head *invalid_list)
2108 {
2109  struct kvm_mmu_page *sp;
2110 
2111  if (list_empty(invalid_list))
2112  return;
2113 
2114  /*
2115  * wmb: make sure everyone sees our modifications to the page tables
2116  * rmb: make sure we see changes to vcpu->mode
2117  */
2118  smp_mb();
2119 
2120  /*
2121  * Wait for all vcpus to exit guest mode and/or lockless shadow
2122  * page table walks.
2123  */
2124  kvm_flush_remote_tlbs(kvm);
2125 
2126  do {
2127  sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
2128  WARN_ON(!sp->role.invalid || sp->root_count);
2129  kvm_mmu_isolate_page(sp);
2130  kvm_mmu_free_page(sp);
2131  } while (!list_empty(invalid_list));
2132 }
2133 
2134 /*
2135  * Changing the number of mmu pages allocated to the vm.
2136  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2137  */
2138 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2139 {
2140  LIST_HEAD(invalid_list);
2141  /*
2142  * If we set the number of mmu pages to be smaller than the
2143  * number of active pages, we must free some mmu pages before we
2144  * change the value.
2145  */
2146 
2147  if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2148  while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
2149  !list_empty(&kvm->arch.active_mmu_pages)) {
2150  struct kvm_mmu_page *page;
2151 
2152  page = container_of(kvm->arch.active_mmu_pages.prev,
2153  struct kvm_mmu_page, link);
2154  kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
2155  }
2156  kvm_mmu_commit_zap_page(kvm, &invalid_list);
2157  goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2158  }
2159 
2160  kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2161 }
2162 
2163 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2164 {
2165  struct kvm_mmu_page *sp;
2166  struct hlist_node *node;
2167  LIST_HEAD(invalid_list);
2168  int r;
2169 
2170  pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2171  r = 0;
2172  spin_lock(&kvm->mmu_lock);
2173  for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
2174  pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2175  sp->role.word);
2176  r = 1;
2177  kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2178  }
2179  kvm_mmu_commit_zap_page(kvm, &invalid_list);
2180  spin_unlock(&kvm->mmu_lock);
2181 
2182  return r;
2183 }
2184 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2185 
2186 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
2187 {
2188  int slot = memslot_id(kvm, gfn);
2189  struct kvm_mmu_page *sp = page_header(__pa(pte));
2190 
2191  __set_bit(slot, sp->slot_bitmap);
2192 }
2193 
2194 /*
2195  * The function is based on mtrr_type_lookup() in
2196  * arch/x86/kernel/cpu/mtrr/generic.c
2197  */
2198 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
2199  u64 start, u64 end)
2200 {
2201  int i;
2202  u64 base, mask;
2203  u8 prev_match, curr_match;
2204  int num_var_ranges = KVM_NR_VAR_MTRR;
2205 
2206  if (!mtrr_state->enabled)
2207  return 0xFF;
2208 
2209  /* Make end inclusive instead of exclusive */
2210  end--;
2211 
2212  /* Look in fixed ranges. Just return the type as per start */
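  /*
   * Fixed-range MTRRs cover 0x00000-0x7FFFF in 64K units,
   * 0x80000-0xBFFFF in 16K units and 0xC0000-0xFFFFF in 4K units,
   * hence the index shifts by 16, 14 and 12 below.
   */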
2213  if (mtrr_state->have_fixed && (start < 0x100000)) {
2214  int idx;
2215 
2216  if (start < 0x80000) {
2217  idx = 0;
2218  idx += (start >> 16);
2219  return mtrr_state->fixed_ranges[idx];
2220  } else if (start < 0xC0000) {
2221  idx = 1 * 8;
2222  idx += ((start - 0x80000) >> 14);
2223  return mtrr_state->fixed_ranges[idx];
2224  } else if (start < 0x1000000) {
2225  idx = 3 * 8;
2226  idx += ((start - 0xC0000) >> 12);
2227  return mtrr_state->fixed_ranges[idx];
2228  }
2229  }
2230 
2231  /*
2232  * Look in variable ranges.
2233  * Look for multiple ranges matching this address and pick the type
2234  * as per MTRR precedence.
2235  */
2236  if (!(mtrr_state->enabled & 2))
2237  return mtrr_state->def_type;
2238 
2239  prev_match = 0xFF;
2240  for (i = 0; i < num_var_ranges; ++i) {
2241  unsigned short start_state, end_state;
2242 
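  /* Bit 11 of mask_lo is the MTRR valid (V) bit; skip disabled ranges. */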
2243  if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
2244  continue;
2245 
2246  base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
2247  (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
2248  mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
2249  (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
2250 
2251  start_state = ((start & mask) == (base & mask));
2252  end_state = ((end & mask) == (base & mask));
2253  if (start_state != end_state)
2254  return 0xFE;
2255 
2256  if ((start & mask) != (base & mask))
2257  continue;
2258 
2259  curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
2260  if (prev_match == 0xFF) {
2261  prev_match = curr_match;
2262  continue;
2263  }
2264 
2265  if (prev_match == MTRR_TYPE_UNCACHABLE ||
2266  curr_match == MTRR_TYPE_UNCACHABLE)
2267  return MTRR_TYPE_UNCACHABLE;
2268 
2269  if ((prev_match == MTRR_TYPE_WRBACK &&
2270  curr_match == MTRR_TYPE_WRTHROUGH) ||
2271  (prev_match == MTRR_TYPE_WRTHROUGH &&
2272  curr_match == MTRR_TYPE_WRBACK)) {
2273  prev_match = MTRR_TYPE_WRTHROUGH;
2274  curr_match = MTRR_TYPE_WRTHROUGH;
2275  }
2276 
2277  if (prev_match != curr_match)
2278  return MTRR_TYPE_UNCACHABLE;
2279  }
2280 
2281  if (prev_match != 0xFF)
2282  return prev_match;
2283 
2284  return mtrr_state->def_type;
2285 }
2286 
2287 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
2288 {
2289  u8 mtrr;
2290 
2291  mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
2292  (gfn << PAGE_SHIFT) + PAGE_SIZE);
2293  if (mtrr == 0xfe || mtrr == 0xff)
2294  mtrr = MTRR_TYPE_WRBACK;
2295  return mtrr;
2296 }
2297 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
2298 
2299 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2300 {
2301  trace_kvm_mmu_unsync_page(sp);
2302  ++vcpu->kvm->stat.mmu_unsync;
2303  sp->unsync = 1;
2304 
2305  kvm_mmu_mark_parents_unsync(sp);
2306 }
2307 
2308 static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
2309 {
2310  struct kvm_mmu_page *s;
2311  struct hlist_node *node;
2312 
2313  for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
2314  if (s->unsync)
2315  continue;
2316  WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2317  __kvm_unsync_page(vcpu, s);
2318  }
2319 }
2320 
2321 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2322  bool can_unsync)
2323 {
2324  struct kvm_mmu_page *s;
2325  struct hlist_node *node;
2326  bool need_unsync = false;
2327 
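  /*
   * Returning 1 means the gfn is shadowed by a page that cannot be
   * unsynced, so the caller must keep the new spte write-protected;
   * returning 0 allows the write after unsyncing any level-1 shadow
   * pages for this gfn.
   */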
2328  for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
2329  if (!can_unsync)
2330  return 1;
2331 
2332  if (s->role.level != PT_PAGE_TABLE_LEVEL)
2333  return 1;
2334 
2335  if (!need_unsync && !s->unsync) {
2336  need_unsync = true;
2337  }
2338  }
2339  if (need_unsync)
2340  kvm_unsync_pages(vcpu, gfn);
2341  return 0;
2342 }
2343 
2344 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2345  unsigned pte_access, int user_fault,
2346  int write_fault, int level,
2347  gfn_t gfn, pfn_t pfn, bool speculative,
2348  bool can_unsync, bool host_writable)
2349 {
2350  u64 spte;
2351  int ret = 0;
2352 
2353  if (set_mmio_spte(sptep, gfn, pfn, pte_access))
2354  return 0;
2355 
2356  spte = PT_PRESENT_MASK;
2357  if (!speculative)
2358  spte |= shadow_accessed_mask;
2359 
2360  if (pte_access & ACC_EXEC_MASK)
2361  spte |= shadow_x_mask;
2362  else
2363  spte |= shadow_nx_mask;
2364 
2365  if (pte_access & ACC_USER_MASK)
2366  spte |= shadow_user_mask;
2367 
2368  if (level > PT_PAGE_TABLE_LEVEL)
2369  spte |= PT_PAGE_SIZE_MASK;
2370  if (tdp_enabled)
2371  spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2372  kvm_is_mmio_pfn(pfn));
2373 
2374  if (host_writable)
2375  spte |= SPTE_HOST_WRITEABLE;
2376  else
2377  pte_access &= ~ACC_WRITE_MASK;
2378 
2379  spte |= (u64)pfn << PAGE_SHIFT;
2380 
2381  if ((pte_access & ACC_WRITE_MASK)
2382  || (!vcpu->arch.mmu.direct_map && write_fault
2383  && !is_write_protection(vcpu) && !user_fault)) {
2384 
2385  if (level > PT_PAGE_TABLE_LEVEL &&
2386  has_wrprotected_page(vcpu->kvm, gfn, level)) {
2387  ret = 1;
2388  drop_spte(vcpu->kvm, sptep);
2389  goto done;
2390  }
2391 
2392  spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2393 
2394  if (!vcpu->arch.mmu.direct_map
2395  && !(pte_access & ACC_WRITE_MASK)) {
2396  spte &= ~PT_USER_MASK;
2397  /*
2398  * If we converted a user page to a kernel page,
2399  * so that the kernel can write to it when cr0.wp=0,
2400  * then we should prevent the kernel from executing it
2401  * if SMEP is enabled.
2402  */
2403  if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2404  spte |= PT64_NX_MASK;
2405  }
2406 
2407  /*
2408  * Optimization: for pte sync, if spte was writable the hash
2409  * lookup is unnecessary (and expensive). Write protection
2410  * is the responsibility of mmu_get_page / kvm_sync_page.
2411  * Same reasoning can be applied to dirty page accounting.
2412  */
2413  if (!can_unsync && is_writable_pte(*sptep))
2414  goto set_pte;
2415 
2416  if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2417  pgprintk("%s: found shadow page for %llx, marking ro\n",
2418  __func__, gfn);
2419  ret = 1;
2420  pte_access &= ~ACC_WRITE_MASK;
2421  spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2422  }
2423  }
2424 
2425  if (pte_access & ACC_WRITE_MASK)
2426  mark_page_dirty(vcpu->kvm, gfn);
2427 
2428 set_pte:
2429  if (mmu_spte_update(sptep, spte))
2430  kvm_flush_remote_tlbs(vcpu->kvm);
2431 done:
2432  return ret;
2433 }
2434 
2435 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2436  unsigned pt_access, unsigned pte_access,
2437  int user_fault, int write_fault,
2438  int *emulate, int level, gfn_t gfn,
2439  pfn_t pfn, bool speculative,
2440  bool host_writable)
2441 {
2442  int was_rmapped = 0;
2443  int rmap_count;
2444 
2445  pgprintk("%s: spte %llx access %x write_fault %d"
2446  " user_fault %d gfn %llx\n",
2447  __func__, *sptep, pt_access,
2448  write_fault, user_fault, gfn);
2449 
2450  if (is_rmap_spte(*sptep)) {
2451  /*
2452  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2453  * the parent of the now unreachable PTE.
2454  */
2455  if (level > PT_PAGE_TABLE_LEVEL &&
2456  !is_large_pte(*sptep)) {
2457  struct kvm_mmu_page *child;
2458  u64 pte = *sptep;
2459 
2460  child = page_header(pte & PT64_BASE_ADDR_MASK);
2461  drop_parent_pte(child, sptep);
2462  kvm_flush_remote_tlbs(vcpu->kvm);
2463  } else if (pfn != spte_to_pfn(*sptep)) {
2464  pgprintk("hfn old %llx new %llx\n",
2465  spte_to_pfn(*sptep), pfn);
2466  drop_spte(vcpu->kvm, sptep);
2467  kvm_flush_remote_tlbs(vcpu->kvm);
2468  } else
2469  was_rmapped = 1;
2470  }
2471 
2472  if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2473  level, gfn, pfn, speculative, true,
2474  host_writable)) {
2475  if (write_fault)
2476  *emulate = 1;
2477  kvm_mmu_flush_tlb(vcpu);
2478  }
2479 
2480  if (unlikely(is_mmio_spte(*sptep) && emulate))
2481  *emulate = 1;
2482 
2483  pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2484  pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2485  is_large_pte(*sptep)? "2MB" : "4kB",
2486  *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2487  *sptep, sptep);
2488  if (!was_rmapped && is_large_pte(*sptep))
2489  ++vcpu->kvm->stat.lpages;
2490 
2491  if (is_shadow_present_pte(*sptep)) {
2492  page_header_update_slot(vcpu->kvm, sptep, gfn);
2493  if (!was_rmapped) {
2494  rmap_count = rmap_add(vcpu, sptep, gfn);
2495  if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2496  rmap_recycle(vcpu, sptep, gfn);
2497  }
2498  }
2499 
2500  kvm_release_pfn_clean(pfn);
2501 }
2502 
2503 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2504 {
2505  mmu_free_roots(vcpu);
2506 }
2507 
2508 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2509  bool no_dirty_log)
2510 {
2511  struct kvm_memory_slot *slot;
2512 
2513  slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2514  if (!slot)
2515  return KVM_PFN_ERR_FAULT;
2516 
2517  return gfn_to_pfn_memslot_atomic(slot, gfn);
2518 }
2519 
2520 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2521  struct kvm_mmu_page *sp,
2522  u64 *start, u64 *end)
2523 {
2524  struct page *pages[PTE_PREFETCH_NUM];
2525  unsigned access = sp->role.access;
2526  int i, ret;
2527  gfn_t gfn;
2528 
2529  gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2530  if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2531  return -1;
2532 
2533  ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2534  if (ret <= 0)
2535  return -1;
2536 
2537  for (i = 0; i < ret; i++, gfn++, start++)
2538  mmu_set_spte(vcpu, start, ACC_ALL,
2539  access, 0, 0, NULL,
2540  sp->role.level, gfn,
2541  page_to_pfn(pages[i]), true, true);
2542 
2543  return 0;
2544 }
2545 
2546 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2547  struct kvm_mmu_page *sp, u64 *sptep)
2548 {
2549  u64 *spte, *start = NULL;
2550  int i;
2551 
2552  WARN_ON(!sp->role.direct);
2553 
2554  i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2555  spte = sp->spt + i;
2556 
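  /*
   * Scan the PTE_PREFETCH_NUM-aligned window around sptep and hand
   * each run of non-present sptes to direct_pte_prefetch_many().
   */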
2557  for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2558  if (is_shadow_present_pte(*spte) || spte == sptep) {
2559  if (!start)
2560  continue;
2561  if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2562  break;
2563  start = NULL;
2564  } else if (!start)
2565  start = spte;
2566  }
2567 }
2568 
2569 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2570 {
2571  struct kvm_mmu_page *sp;
2572 
2573  /*
2574  * Since there is no accessed bit on EPT, there is no way to
2575  * distinguish between actually accessed translations
2576  * and prefetched ones, so disable pte prefetch if EPT is
2577  * enabled.
2578  */
2579  if (!shadow_accessed_mask)
2580  return;
2581 
2582  sp = page_header(__pa(sptep));
2583  if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2584  return;
2585 
2586  __direct_pte_prefetch(vcpu, sp, sptep);
2587 }
2588 
2589 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2590  int map_writable, int level, gfn_t gfn, pfn_t pfn,
2591  bool prefault)
2592 {
2593  struct kvm_shadow_walk_iterator iterator;
2594  struct kvm_mmu_page *sp;
2595  int emulate = 0;
2596  gfn_t pseudo_gfn;
2597 
2598  for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2599  if (iterator.level == level) {
2600  unsigned pte_access = ACC_ALL;
2601 
2602  mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2603  0, write, &emulate,
2604  level, gfn, pfn, prefault, map_writable);
2605  direct_pte_prefetch(vcpu, iterator.sptep);
2606  ++vcpu->stat.pf_fixed;
2607  break;
2608  }
2609 
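  /*
   * The target level has not been reached yet; if the intermediate
   * entry is not present, allocate a shadow page for it and link it
   * into the walk.
   */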
2610  if (!is_shadow_present_pte(*iterator.sptep)) {
2611  u64 base_addr = iterator.addr;
2612 
2613  base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2614  pseudo_gfn = base_addr >> PAGE_SHIFT;
2615  sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2616  iterator.level - 1,
2617  1, ACC_ALL, iterator.sptep);
2618 
2619  mmu_spte_set(iterator.sptep,
2620  __pa(sp->spt)
2621  | PT_PRESENT_MASK | PT_WRITABLE_MASK
2622  | shadow_user_mask | shadow_x_mask
2623  | shadow_accessed_mask);
2624  }
2625  }
2626  return emulate;
2627 }
2628 
2629 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2630 {
2631  siginfo_t info;
2632 
2633  info.si_signo = SIGBUS;
2634  info.si_errno = 0;
2635  info.si_code = BUS_MCEERR_AR;
2636  info.si_addr = (void __user *)address;
2637  info.si_addr_lsb = PAGE_SHIFT;
2638 
2639  send_sig_info(SIGBUS, &info, tsk);
2640 }
2641 
2642 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2643 {
2644  /*
2645  * Do not cache the mmio info caused by writing the readonly gfn
2646  * into the spte; otherwise a read access on the readonly gfn can
2647  * also cause an mmio page fault and be treated as mmio access.
2648  * Return 1 to tell kvm to emulate it.
2649  */
2650  if (pfn == KVM_PFN_ERR_RO_FAULT)
2651  return 1;
2652 
2653  if (pfn == KVM_PFN_ERR_HWPOISON) {
2654  kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2655  return 0;
2656  }
2657 
2658  return -EFAULT;
2659 }
2660 
2661 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2662  gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2663 {
2664  pfn_t pfn = *pfnp;
2665  gfn_t gfn = *gfnp;
2666  int level = *levelp;
2667 
2668  /*
2669  * Check if it's a transparent hugepage. If this were a
2670  * hugetlbfs page, level wouldn't be set to
2671  * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2672  * here.
2673  */
2674  if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2675  level == PT_PAGE_TABLE_LEVEL &&
2676  PageTransCompound(pfn_to_page(pfn)) &&
2677  !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2678  unsigned long mask;
2679  /*
2680  * mmu_notifier_retry was successful and we hold the
2681  * mmu_lock here, so the pmd can't start splitting
2682  * under us, and in turn
2683  * __split_huge_page_refcount() can't run from under
2684  * us and we can safely transfer the refcount from
2685  * PG_tail to PG_head as we switch the pfn to tail to
2686  * head.
2687  */
2688  *levelp = level = PT_DIRECTORY_LEVEL;
2689  mask = KVM_PAGES_PER_HPAGE(level) - 1;
2690  VM_BUG_ON((gfn & mask) != (pfn & mask));
2691  if (pfn & mask) {
2692  gfn &= ~mask;
2693  *gfnp = gfn;
2694  kvm_release_pfn_clean(pfn);
2695  pfn &= ~mask;
2696  kvm_get_pfn(pfn);
2697  *pfnp = pfn;
2698  }
2699  }
2700 }
2701 
2702 static bool mmu_invalid_pfn(pfn_t pfn)
2703 {
2704  return unlikely(is_invalid_pfn(pfn));
2705 }
2706 
2707 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2708  pfn_t pfn, unsigned access, int *ret_val)
2709 {
2710  bool ret = true;
2711 
2712  /* The pfn is invalid, report the error! */
2713  if (unlikely(is_invalid_pfn(pfn))) {
2714  *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2715  goto exit;
2716  }
2717 
2718  if (unlikely(is_noslot_pfn(pfn)))
2719  vcpu_cache_mmio_info(vcpu, gva, gfn, access);
2720 
2721  ret = false;
2722 exit:
2723  return ret;
2724 }
2725 
2726 static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
2727 {
2728  /*
2729  * #PF can be fast only if the shadow page table is present and it
2730  * is caused by write-protect; that means we just need to change the
2731  * W bit of the spte, which can be done outside of mmu-lock.
2732  */
2733  if (!(error_code & PFERR_PRESENT_MASK) ||
2734  !(error_code & PFERR_WRITE_MASK))
2735  return false;
2736 
2737  return true;
2738 }
2739 
2740 static bool
2741 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
2742 {
2743  struct kvm_mmu_page *sp = page_header(__pa(sptep));
2744  gfn_t gfn;
2745 
2746  WARN_ON(!sp->role.direct);
2747 
2748  /*
2749  * The gfn of a direct spte is stable since it is calculated
2750  * from sp->gfn.
2751  */
2752  gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2753 
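  /*
   * Restore the writable bit only if the spte has not been changed
   * under us; if the cmpxchg succeeds, the page is marked dirty.
   */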
2754  if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2755  mark_page_dirty(vcpu->kvm, gfn);
2756 
2757  return true;
2758 }
2759 
2760 /*
2761  * Return value:
2762  * - true: let the vcpu access the same address again.
2763  * - false: let the real page fault path fix it.
2764  */
2765 static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2766  u32 error_code)
2767 {
2768  struct kvm_shadow_walk_iterator iterator;
2769  bool ret = false;
2770  u64 spte = 0ull;
2771 
2772  if (!page_fault_can_be_fast(vcpu, error_code))
2773  return false;
2774 
2775  walk_shadow_page_lockless_begin(vcpu);
2776  for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
2777  if (!is_shadow_present_pte(spte) || iterator.level < level)
2778  break;
2779 
2780  /*
2781  * If the mapping has been changed, let the vcpu fault on the
2782  * same address again.
2783  */
2784  if (!is_rmap_spte(spte)) {
2785  ret = true;
2786  goto exit;
2787  }
2788 
2789  if (!is_last_spte(spte, level))
2790  goto exit;
2791 
2792  /*
2793  * Check if it is a spurious fault caused by a lazily flushed TLB.
2794  *
2795  * No need to check the access of upper level table entries since
2796  * they are always ACC_ALL.
2797  */
2798  if (is_writable_pte(spte)) {
2799  ret = true;
2800  goto exit;
2801  }
2802 
2803  /*
2804  * Currently, to simplify the code, only sptes write-protected
2805  * by dirty logging can be fixed on the fast path.
2806  */
2807  if (!spte_is_locklessly_modifiable(spte))
2808  goto exit;
2809 
2810  /*
2811  * Currently, fast page fault only works for direct mapping since
2812  * the gfn is not stable for indirect shadow pages.
2813  * See Documentation/virtual/kvm/locking.txt for more detail.
2814  */
2815  ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
2816 exit:
2817  trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2818  spte, ret);
2819  walk_shadow_page_lockless_end(vcpu);
2820 
2821  return ret;
2822 }
2823 
2824 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2825  gva_t gva, pfn_t *pfn, bool write, bool *writable);
2826 
2827 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2828  gfn_t gfn, bool prefault)
2829 {
2830  int r;
2831  int level;
2832  int force_pt_level;
2833  pfn_t pfn;
2834  unsigned long mmu_seq;
2835  bool map_writable, write = error_code & PFERR_WRITE_MASK;
2836 
2837  force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2838  if (likely(!force_pt_level)) {
2839  level = mapping_level(vcpu, gfn);
2840  /*
2841  * This path builds a PAE pagetable, so we can map
2842  * 2MB pages at maximum. Therefore check if the level
2843  * is larger than that.
2844  */
2845  if (level > PT_DIRECTORY_LEVEL)
2846  level = PT_DIRECTORY_LEVEL;
2847 
2848  gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2849  } else
2850  level = PT_PAGE_TABLE_LEVEL;
2851 
2852  if (fast_page_fault(vcpu, v, level, error_code))
2853  return 0;
2854 
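  /*
   * Snapshot mmu_notifier_seq before translating the gfn; the read
   * barrier pairs with mmu_notifier_retry() so that an invalidation
   * racing with this fault is noticed once mmu_lock is taken.
   */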
2855  mmu_seq = vcpu->kvm->mmu_notifier_seq;
2856  smp_rmb();
2857 
2858  if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2859  return 0;
2860 
2861  if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
2862  return r;
2863 
2864  spin_lock(&vcpu->kvm->mmu_lock);
2865  if (mmu_notifier_retry(vcpu, mmu_seq))
2866  goto out_unlock;
2867  kvm_mmu_free_some_pages(vcpu);
2868  if (likely(!force_pt_level))
2869  transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2870  r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2871  prefault);
2872  spin_unlock(&vcpu->kvm->mmu_lock);
2873 
2874 
2875  return r;
2876 
2877 out_unlock:
2878  spin_unlock(&vcpu->kvm->mmu_lock);
2879  kvm_release_pfn_clean(pfn);
2880  return 0;
2881 }
2882 
2883 
2884 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2885 {
2886  int i;
2887  struct kvm_mmu_page *sp;
2888  LIST_HEAD(invalid_list);
2889 
2890  if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2891  return;
2892  spin_lock(&vcpu->kvm->mmu_lock);
2893  if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2894  (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2895  vcpu->arch.mmu.direct_map)) {
2896  hpa_t root = vcpu->arch.mmu.root_hpa;
2897 
2898  sp = page_header(root);
2899  --sp->root_count;
2900  if (!sp->root_count && sp->role.invalid) {
2901  kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2902  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2903  }
2904  vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2905  spin_unlock(&vcpu->kvm->mmu_lock);
2906  return;
2907  }
2908  for (i = 0; i < 4; ++i) {
2909  hpa_t root = vcpu->arch.mmu.pae_root[i];
2910 
2911  if (root) {
2912  root &= PT64_BASE_ADDR_MASK;
2913  sp = page_header(root);
2914  --sp->root_count;
2915  if (!sp->root_count && sp->role.invalid)
2916  kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2917  &invalid_list);
2918  }
2919  vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2920  }
2921  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2922  spin_unlock(&vcpu->kvm->mmu_lock);
2923  vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2924 }
2925 
2926 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2927 {
2928  int ret = 0;
2929 
2930  if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2931  kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2932  ret = 1;
2933  }
2934 
2935  return ret;
2936 }
2937 
2938 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2939 {
2940  struct kvm_mmu_page *sp;
2941  unsigned i;
2942 
2943  if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2944  spin_lock(&vcpu->kvm->mmu_lock);
2945  kvm_mmu_free_some_pages(vcpu);
2946  sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2947  1, ACC_ALL, NULL);
2948  ++sp->root_count;
2949  spin_unlock(&vcpu->kvm->mmu_lock);
2950  vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2951  } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2952  for (i = 0; i < 4; ++i) {
2953  hpa_t root = vcpu->arch.mmu.pae_root[i];
2954 
2955  ASSERT(!VALID_PAGE(root));
2956  spin_lock(&vcpu->kvm->mmu_lock);
2957  kvm_mmu_free_some_pages(vcpu);
2958  sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2959  i << 30,
2960  PT32_ROOT_LEVEL, 1, ACC_ALL,
2961  NULL);
2962  root = __pa(sp->spt);
2963  ++sp->root_count;
2964  spin_unlock(&vcpu->kvm->mmu_lock);
2965  vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2966  }
2967  vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2968  } else
2969  BUG();
2970 
2971  return 0;
2972 }
2973 
2974 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2975 {
2976  struct kvm_mmu_page *sp;
2977  u64 pdptr, pm_mask;
2978  gfn_t root_gfn;
2979  int i;
2980 
2981  root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2982 
2983  if (mmu_check_root(vcpu, root_gfn))
2984  return 1;
2985 
2986  /*
2987  * Do we shadow a long mode page table? If so we need to
2988  * write-protect the guest's page table root.
2989  */
2990  if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2991  hpa_t root = vcpu->arch.mmu.root_hpa;
2992 
2993  ASSERT(!VALID_PAGE(root));
2994 
2995  spin_lock(&vcpu->kvm->mmu_lock);
2996  kvm_mmu_free_some_pages(vcpu);
2997  sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2998  0, ACC_ALL, NULL);
2999  root = __pa(sp->spt);
3000  ++sp->root_count;
3001  spin_unlock(&vcpu->kvm->mmu_lock);
3002  vcpu->arch.mmu.root_hpa = root;
3003  return 0;
3004  }
3005 
3006  /*
3007  * We shadow a 32 bit page table. This may be a legacy 2-level
3008  * or a PAE 3-level page table. In either case we need to be aware that
3009  * the shadow page table may be a PAE or a long mode page table.
3010  */
3011  pm_mask = PT_PRESENT_MASK;
3012  if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
3013  pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3014 
3015  for (i = 0; i < 4; ++i) {
3016  hpa_t root = vcpu->arch.mmu.pae_root[i];
3017 
3018  ASSERT(!VALID_PAGE(root));
3019  if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
3020  pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
3021  if (!is_present_gpte(pdptr)) {
3022  vcpu->arch.mmu.pae_root[i] = 0;
3023  continue;
3024  }
3025  root_gfn = pdptr >> PAGE_SHIFT;
3026  if (mmu_check_root(vcpu, root_gfn))
3027  return 1;
3028  }
3029  spin_lock(&vcpu->kvm->mmu_lock);
3030  kvm_mmu_free_some_pages(vcpu);
3031  sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
3032  PT32_ROOT_LEVEL, 0,
3033  ACC_ALL, NULL);
3034  root = __pa(sp->spt);
3035  ++sp->root_count;
3036  spin_unlock(&vcpu->kvm->mmu_lock);
3037 
3038  vcpu->arch.mmu.pae_root[i] = root | pm_mask;
3039  }
3040  vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
3041 
3042  /*
3043  * If we shadow a 32 bit page table with a long mode page
3044  * table we enter this path.
3045  */
3046  if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3047  if (vcpu->arch.mmu.lm_root == NULL) {
3048  /*
3049  * The additional page necessary for this is only
3050  * allocated on demand.
3051  */
3052 
3053  u64 *lm_root;
3054 
3055  lm_root = (void*)get_zeroed_page(GFP_KERNEL);
3056  if (lm_root == NULL)
3057  return 1;
3058 
3059  lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
3060 
3061  vcpu->arch.mmu.lm_root = lm_root;
3062  }
3063 
3064  vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
3065  }
3066 
3067  return 0;
3068 }
3069 
3070 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3071 {
3072  if (vcpu->arch.mmu.direct_map)
3073  return mmu_alloc_direct_roots(vcpu);
3074  else
3075  return mmu_alloc_shadow_roots(vcpu);
3076 }
3077 
3078 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
3079 {
3080  int i;
3081  struct kvm_mmu_page *sp;
3082 
3083  if (vcpu->arch.mmu.direct_map)
3084  return;
3085 
3086  if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3087  return;
3088 
3089  vcpu_clear_mmio_info(vcpu, ~0ul);
3090  kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3091  if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
3092  hpa_t root = vcpu->arch.mmu.root_hpa;
3093  sp = page_header(root);
3094  mmu_sync_children(vcpu, sp);
3095  kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3096  return;
3097  }
3098  for (i = 0; i < 4; ++i) {
3099  hpa_t root = vcpu->arch.mmu.pae_root[i];
3100 
3101  if (root && VALID_PAGE(root)) {
3102  root &= PT64_BASE_ADDR_MASK;
3103  sp = page_header(root);
3104  mmu_sync_children(vcpu, sp);
3105  }
3106  }
3107  kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3108 }
3109 
3110 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3111 {
3112  spin_lock(&vcpu->kvm->mmu_lock);
3113  mmu_sync_roots(vcpu);
3114  spin_unlock(&vcpu->kvm->mmu_lock);
3115 }
3116 
3117 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3118  u32 access, struct x86_exception *exception)
3119 {
3120  if (exception)
3121  exception->error_code = 0;
3122  return vaddr;
3123 }
3124 
3125 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3126  u32 access,
3127  struct x86_exception *exception)
3128 {
3129  if (exception)
3130  exception->error_code = 0;
3131  return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
3132 }
3133 
3134 static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3135 {
3136  if (direct)
3137  return vcpu_match_mmio_gpa(vcpu, addr);
3138 
3139  return vcpu_match_mmio_gva(vcpu, addr);
3140 }
3141 
3142 
3143 /*
3144  * On direct hosts, the last spte only allows two states
3145  * for mmio page fault:
3146  * - It is the mmio spte
3147  * - It is zapped or it is being zapped.
3148  *
3149  * This function completely checks the spte when the last spte
3150  * is not the mmio spte.
3151  */
3152 static bool check_direct_spte_mmio_pf(u64 spte)
3153 {
3154  return __check_direct_spte_mmio_pf(spte);
3155 }
3156 
3157 static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
3158 {
3159  struct kvm_shadow_walk_iterator iterator;
3160  u64 spte = 0ull;
3161 
3162  walk_shadow_page_lockless_begin(vcpu);
3163  for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3164  if (!is_shadow_present_pte(spte))
3165  break;
3166  walk_shadow_page_lockless_end(vcpu);
3167 
3168  return spte;
3169 }
3170 
3171 /*
3172  * If it is a real mmio page fault, return 1 and emulate the instruction
3173  * directly; return 0 to let the CPU fault again on the address; -1 is
3174  * returned if a bug is detected.
3175  */
3176 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3177 {
3178  u64 spte;
3179 
3180  if (quickly_check_mmio_pf(vcpu, addr, direct))
3181  return 1;
3182 
3183  spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
3184 
3185  if (is_mmio_spte(spte)) {
3186  gfn_t gfn = get_mmio_spte_gfn(spte);
3187  unsigned access = get_mmio_spte_access(spte);
3188 
3189  if (direct)
3190  addr = 0;
3191 
3192  trace_handle_mmio_page_fault(addr, gfn, access);
3193  vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3194  return 1;
3195  }
3196 
3197  /*
3198  * It's ok if the gva is remapped by other cpus on a shadowed guest,
3199  * but it's a BUG if the gfn is not an mmio page.
3200  */
3201  if (direct && !check_direct_spte_mmio_pf(spte))
3202  return -1;
3203 
3204  /*
3205  * If the page table is zapped by other cpus, let CPU fault again on
3206  * the address.
3207  */
3208  return 0;
3209 }
3210 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
3211 
3212 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
3213  u32 error_code, bool direct)
3214 {
3215  int ret;
3216 
3217  ret = handle_mmio_page_fault_common(vcpu, addr, direct);
3218  WARN_ON(ret < 0);
3219  return ret;
3220 }
3221 
3222 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3223  u32 error_code, bool prefault)
3224 {
3225  gfn_t gfn;
3226  int r;
3227 
3228  pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3229 
3230  if (unlikely(error_code & PFERR_RSVD_MASK))
3231  return handle_mmio_page_fault(vcpu, gva, error_code, true);
3232 
3233  r = mmu_topup_memory_caches(vcpu);
3234  if (r)
3235  return r;
3236 
3237  ASSERT(vcpu);
3238  ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3239 
3240  gfn = gva >> PAGE_SHIFT;
3241 
3242  return nonpaging_map(vcpu, gva & PAGE_MASK,
3243  error_code, gfn, prefault);
3244 }
3245 
3246 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
3247 {
3248  struct kvm_arch_async_pf arch;
3249 
3250  arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
3251  arch.gfn = gfn;
3252  arch.direct_map = vcpu->arch.mmu.direct_map;
3253  arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
3254 
3255  return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
3256 }
3257 
3258 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
3259 {
3260  if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
3261  kvm_event_needs_reinjection(vcpu)))
3262  return false;
3263 
3264  return kvm_x86_ops->interrupt_allowed(vcpu);
3265 }
3266 
3267 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3268  gva_t gva, pfn_t *pfn, bool write, bool *writable)
3269 {
3270  bool async;
3271 
3272  *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
3273 
3274  if (!async)
3275  return false; /* *pfn has correct page already */
3276 
3277  if (!prefault && can_do_async_pf(vcpu)) {
3278  trace_kvm_try_async_get_page(gva, gfn);
3279  if (kvm_find_async_pf_gfn(vcpu, gfn)) {
3280  trace_kvm_async_pf_doublefault(gva, gfn);
3281  kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3282  return true;
3283  } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
3284  return true;
3285  }
3286 
3287  *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
3288 
3289  return false;
3290 }
3291 
3292 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3293  bool prefault)
3294 {
3295  pfn_t pfn;
3296  int r;
3297  int level;
3298  int force_pt_level;
3299  gfn_t gfn = gpa >> PAGE_SHIFT;
3300  unsigned long mmu_seq;
3301  int write = error_code & PFERR_WRITE_MASK;
3302  bool map_writable;
3303 
3304  ASSERT(vcpu);
3305  ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3306 
3307  if (unlikely(error_code & PFERR_RSVD_MASK))
3308  return handle_mmio_page_fault(vcpu, gpa, error_code, true);
3309 
3310  r = mmu_topup_memory_caches(vcpu);
3311  if (r)
3312  return r;
3313 
3314  force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
3315  if (likely(!force_pt_level)) {
3316  level = mapping_level(vcpu, gfn);
3317  gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3318  } else
3319  level = PT_PAGE_TABLE_LEVEL;
3320 
3321  if (fast_page_fault(vcpu, gpa, level, error_code))
3322  return 0;
3323 
3324  mmu_seq = vcpu->kvm->mmu_notifier_seq;
3325  smp_rmb();
3326 
3327  if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
3328  return 0;
3329 
3330  if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
3331  return r;
3332 
3333  spin_lock(&vcpu->kvm->mmu_lock);
3334  if (mmu_notifier_retry(vcpu, mmu_seq))
3335  goto out_unlock;
3336  kvm_mmu_free_some_pages(vcpu);
3337  if (likely(!force_pt_level))
3338  transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3339  r = __direct_map(vcpu, gpa, write, map_writable,
3340  level, gfn, pfn, prefault);
3341  spin_unlock(&vcpu->kvm->mmu_lock);
3342 
3343  return r;
3344 
3345 out_unlock:
3346  spin_unlock(&vcpu->kvm->mmu_lock);
3347  kvm_release_pfn_clean(pfn);
3348  return 0;
3349 }
3350 
3351 static void nonpaging_free(struct kvm_vcpu *vcpu)
3352 {
3353  mmu_free_roots(vcpu);
3354 }
3355 
3356 static int nonpaging_init_context(struct kvm_vcpu *vcpu,
3357  struct kvm_mmu *context)
3358 {
3359  context->new_cr3 = nonpaging_new_cr3;
3360  context->page_fault = nonpaging_page_fault;
3361  context->gva_to_gpa = nonpaging_gva_to_gpa;
3362  context->free = nonpaging_free;
3363  context->sync_page = nonpaging_sync_page;
3364  context->invlpg = nonpaging_invlpg;
3365  context->update_pte = nonpaging_update_pte;
3366  context->root_level = 0;
3367  context->shadow_root_level = PT32E_ROOT_LEVEL;
3368  context->root_hpa = INVALID_PAGE;
3369  context->direct_map = true;
3370  context->nx = false;
3371  return 0;
3372 }
3373 
3374 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3375 {
3376  ++vcpu->stat.tlb_flush;
3377  kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3378 }
3379 
3380 static void paging_new_cr3(struct kvm_vcpu *vcpu)
3381 {
3382  pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
3383  mmu_free_roots(vcpu);
3384 }
3385 
3386 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
3387 {
3388  return kvm_read_cr3(vcpu);
3389 }
3390 
3391 static void inject_page_fault(struct kvm_vcpu *vcpu,
3392  struct x86_exception *fault)
3393 {
3394  vcpu->arch.mmu.inject_page_fault(vcpu, fault);
3395 }
3396 
3397 static void paging_free(struct kvm_vcpu *vcpu)
3398 {
3399  nonpaging_free(vcpu);
3400 }
3401 
3402 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3403 {
3404  int bit7;
3405 
3406  bit7 = (gpte >> 7) & 1;
3407  return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
3408 }
3409 
3410 static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3411 {
3412  unsigned mask;
3413 
3414  BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3415 
3416  mask = (unsigned)~ACC_WRITE_MASK;
3417  /* Allow write access to dirty gptes */
3418  mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3419  *access &= mask;
3420 }
3421 
3422 static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3423  int *nr_present)
3424 {
3425  if (unlikely(is_mmio_spte(*sptep))) {
3426  if (gfn != get_mmio_spte_gfn(*sptep)) {
3427  mmu_spte_clear_no_track(sptep);
3428  return true;
3429  }
3430 
3431  (*nr_present)++;
3432  mark_mmio_spte(sptep, gfn, access);
3433  return true;
3434  }
3435 
3436  return false;
3437 }
3438 
3439 static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3440 {
3441  unsigned access;
3442 
3443  access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3444  access &= ~(gpte >> PT64_NX_SHIFT);
3445 
3446  return access;
3447 }
3448 
3449 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3450 {
3451  unsigned index;
3452 
3453  index = level - 1;
3454  index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
3455  return mmu->last_pte_bitmap & (1 << index);
3456 }
3457 
3458 #define PTTYPE 64
3459 #include "paging_tmpl.h"
3460 #undef PTTYPE
3461 
3462 #define PTTYPE 32
3463 #include "paging_tmpl.h"
3464 #undef PTTYPE
3465 
3466 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3467  struct kvm_mmu *context)
3468 {
3469  int maxphyaddr = cpuid_maxphyaddr(vcpu);
3470  u64 exb_bit_rsvd = 0;
3471 
3472  if (!context->nx)
3473  exb_bit_rsvd = rsvd_bits(63, 63);
3474  switch (context->root_level) {
3475  case PT32_ROOT_LEVEL:
3476  /* no rsvd bits for 2 level 4K page table entries */
3477  context->rsvd_bits_mask[0][1] = 0;
3478  context->rsvd_bits_mask[0][0] = 0;
3479  context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3480 
3481  if (!is_pse(vcpu)) {
3482  context->rsvd_bits_mask[1][1] = 0;
3483  break;
3484  }
3485 
3486  if (is_cpuid_PSE36())
3487  /* 36bits PSE 4MB page */
3488  context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
3489  else
3490  /* 32 bits PSE 4MB page */
3491  context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
3492  break;
3493  case PT32E_ROOT_LEVEL:
3494  context->rsvd_bits_mask[0][2] =
3495  rsvd_bits(maxphyaddr, 63) |
3496  rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
3497  context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
3498  rsvd_bits(maxphyaddr, 62); /* PDE */
3499  context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
3500  rsvd_bits(maxphyaddr, 62); /* PTE */
3501  context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
3502  rsvd_bits(maxphyaddr, 62) |
3503  rsvd_bits(13, 20); /* large page */
3504  context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3505  break;
3506  case PT64_ROOT_LEVEL:
3507  context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
3508  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
3509  context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
3510  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
3511  context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
3512  rsvd_bits(maxphyaddr, 51);
3513  context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
3514  rsvd_bits(maxphyaddr, 51);
3515  context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3516  context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
3517  rsvd_bits(maxphyaddr, 51) |
3518  rsvd_bits(13, 29);
3519  context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
3520  rsvd_bits(maxphyaddr, 51) |
3521  rsvd_bits(13, 20); /* large page */
3522  context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3523  break;
3524  }
3525 }
3526 
3527 static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3528 {
3529  unsigned bit, byte, pfec;
3530  u8 map;
3531  bool fault, x, w, u, wf, uf, ff, smep;
3532 
3533  smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3534  for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
3535  pfec = byte << 1;
3536  map = 0;
3537  wf = pfec & PFERR_WRITE_MASK;
3538  uf = pfec & PFERR_USER_MASK;
3539  ff = pfec & PFERR_FETCH_MASK;
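  /*
   * Each byte of mmu->permissions[] is indexed by pfec >> 1 and each
   * bit within it by the pte's combined u/w/x access bits; a set bit
   * means that access faults for that fault code.
   */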
3540  for (bit = 0; bit < 8; ++bit) {
3541  x = bit & ACC_EXEC_MASK;
3542  w = bit & ACC_WRITE_MASK;
3543  u = bit & ACC_USER_MASK;
3544 
3545  /* Not really needed: !nx will cause pte.nx to fault */
3546  x |= !mmu->nx;
3547  /* Allow supervisor writes if !cr0.wp */
3548  w |= !is_write_protection(vcpu) && !uf;
3549  /* Disallow supervisor fetches of user code if cr4.smep */
3550  x &= !(smep && u && !uf);
3551 
3552  fault = (ff && !x) || (uf && !u) || (wf && !w);
3553  map |= fault << bit;
3554  }
3555  mmu->permissions[byte] = map;
3556  }
3557 }
3558 
3559 static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3560 {
3561  u8 map;
3562  unsigned level, root_level = mmu->root_level;
3563  const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
3564 
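  /*
   * The bitmap is indexed by (level - 1) with the gpte's PS bit folded
   * into bit 2 (see is_last_gpte()); a set bit means the walk
   * terminates at that level.
   */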
3565  if (root_level == PT32E_ROOT_LEVEL)
3566  --root_level;
3567  /* PT_PAGE_TABLE_LEVEL always terminates */
3568  map = 1 | (1 << ps_set_index);
3569  for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
3570  if (level <= PT_PDPE_LEVEL
3571  && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
3572  map |= 1 << (ps_set_index | (level - 1));
3573  }
3574  mmu->last_pte_bitmap = map;
3575 }
3576 
3577 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3578  struct kvm_mmu *context,
3579  int level)
3580 {
3581  context->nx = is_nx(vcpu);
3582  context->root_level = level;
3583 
3584  reset_rsvds_bits_mask(vcpu, context);
3585  update_permission_bitmask(vcpu, context);
3586  update_last_pte_bitmap(vcpu, context);
3587 
3588  ASSERT(is_pae(vcpu));
3589  context->new_cr3 = paging_new_cr3;
3590  context->page_fault = paging64_page_fault;
3591  context->gva_to_gpa = paging64_gva_to_gpa;
3592  context->sync_page = paging64_sync_page;
3593  context->invlpg = paging64_invlpg;
3594  context->update_pte = paging64_update_pte;
3595  context->free = paging_free;
3596  context->shadow_root_level = level;
3597  context->root_hpa = INVALID_PAGE;
3598  context->direct_map = false;
3599  return 0;
3600 }
3601 
3602 static int paging64_init_context(struct kvm_vcpu *vcpu,
3603  struct kvm_mmu *context)
3604 {
3605  return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
3606 }
3607 
3608 static int paging32_init_context(struct kvm_vcpu *vcpu,
3609  struct kvm_mmu *context)
3610 {
3611  context->nx = false;
3612  context->root_level = PT32_ROOT_LEVEL;
3613 
3614  reset_rsvds_bits_mask(vcpu, context);
3615  update_permission_bitmask(vcpu, context);
3616  update_last_pte_bitmap(vcpu, context);
3617 
3618  context->new_cr3 = paging_new_cr3;
3619  context->page_fault = paging32_page_fault;
3620  context->gva_to_gpa = paging32_gva_to_gpa;
3621  context->free = paging_free;
3622  context->sync_page = paging32_sync_page;
3623  context->invlpg = paging32_invlpg;
3624  context->update_pte = paging32_update_pte;
3625  context->shadow_root_level = PT32E_ROOT_LEVEL;
3626  context->root_hpa = INVALID_PAGE;
3627  context->direct_map = false;
3628  return 0;
3629 }
3630 
3631 static int paging32E_init_context(struct kvm_vcpu *vcpu,
3632  struct kvm_mmu *context)
3633 {
3634  return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
3635 }
3636 
3637 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3638 {
3639  struct kvm_mmu *context = vcpu->arch.walk_mmu;
3640 
3641  context->base_role.word = 0;
3642  context->new_cr3 = nonpaging_new_cr3;
3643  context->page_fault = tdp_page_fault;
3644  context->free = nonpaging_free;
3645  context->sync_page = nonpaging_sync_page;
3646  context->invlpg = nonpaging_invlpg;
3647  context->update_pte = nonpaging_update_pte;
3648  context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3649  context->root_hpa = INVALID_PAGE;
3650  context->direct_map = true;
3651  context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
3652  context->get_cr3 = get_cr3;
3653  context->get_pdptr = kvm_pdptr_read;
3654  context->inject_page_fault = kvm_inject_page_fault;
3655 
3656  if (!is_paging(vcpu)) {
3657  context->nx = false;
3658  context->gva_to_gpa = nonpaging_gva_to_gpa;
3659  context->root_level = 0;
3660  } else if (is_long_mode(vcpu)) {
3661  context->nx = is_nx(vcpu);
3662  context->root_level = PT64_ROOT_LEVEL;
3663  reset_rsvds_bits_mask(vcpu, context);
3664  context->gva_to_gpa = paging64_gva_to_gpa;
3665  } else if (is_pae(vcpu)) {
3666  context->nx = is_nx(vcpu);
3667  context->root_level = PT32E_ROOT_LEVEL;
3668  reset_rsvds_bits_mask(vcpu, context);
3669  context->gva_to_gpa = paging64_gva_to_gpa;
3670  } else {
3671  context->nx = false;
3672  context->root_level = PT32_ROOT_LEVEL;
3673  reset_rsvds_bits_mask(vcpu, context);
3674  context->gva_to_gpa = paging32_gva_to_gpa;
3675  }
3676 
3677  update_permission_bitmask(vcpu, context);
3678  update_last_pte_bitmap(vcpu, context);
3679 
3680  return 0;
3681 }
3682 
3683 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3684 {
3685  int r;
3686  bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3687  ASSERT(vcpu);
3688  ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3689 
3690  if (!is_paging(vcpu))
3691  r = nonpaging_init_context(vcpu, context);
3692  else if (is_long_mode(vcpu))
3693  r = paging64_init_context(vcpu, context);
3694  else if (is_pae(vcpu))
3695  r = paging32E_init_context(vcpu, context);
3696  else
3697  r = paging32_init_context(vcpu, context);
3698 
3699  vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3700  vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3701  vcpu->arch.mmu.base_role.smep_andnot_wp
3702  = smep && !is_write_protection(vcpu);
3703 
3704  return r;
3705 }
3706 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3707 
3708 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3709 {
3710  int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
3711 
3712  vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3713  vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3714  vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
3715  vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3716 
3717  return r;
3718 }
3719 
3720 static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3721 {
3722  struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3723 
3724  g_context->get_cr3 = get_cr3;
3725  g_context->get_pdptr = kvm_pdptr_read;
3726  g_context->inject_page_fault = kvm_inject_page_fault;
3727 
3728  /*
3729  * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3730  * translation of l2_gpa to l1_gpa addresses is done using the
3731  * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3732  * functions between mmu and nested_mmu are swapped.
3733  */
3734  if (!is_paging(vcpu)) {
3735  g_context->nx = false;
3736  g_context->root_level = 0;
3737  g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3738  } else if (is_long_mode(vcpu)) {
3739  g_context->nx = is_nx(vcpu);
3740  g_context->root_level = PT64_ROOT_LEVEL;
3741  reset_rsvds_bits_mask(vcpu, g_context);
3742  g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3743  } else if (is_pae(vcpu)) {
3744  g_context->nx = is_nx(vcpu);
3745  g_context->root_level = PT32E_ROOT_LEVEL;
3746  reset_rsvds_bits_mask(vcpu, g_context);
3747  g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3748  } else {
3749  g_context->nx = false;
3750  g_context->root_level = PT32_ROOT_LEVEL;
3751  reset_rsvds_bits_mask(vcpu, g_context);
3752  g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3753  }
3754 
3755  update_permission_bitmask(vcpu, g_context);
3756  update_last_pte_bitmap(vcpu, g_context);
3757 
3758  return 0;
3759 }
3760 
3761 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3762 {
3763  if (mmu_is_nested(vcpu))
3764  return init_kvm_nested_mmu(vcpu);
3765  else if (tdp_enabled)
3766  return init_kvm_tdp_mmu(vcpu);
3767  else
3768  return init_kvm_softmmu(vcpu);
3769 }
3770 
3771 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
3772 {
3773  ASSERT(vcpu);
3774  if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
3775  /* mmu.free() should set root_hpa = INVALID_PAGE */
3776  vcpu->arch.mmu.free(vcpu);
3777 }
3778 
3779 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3780 {
3781  destroy_kvm_mmu(vcpu);
3782  return init_kvm_mmu(vcpu);
3783 }
3784 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3785 
3786 int kvm_mmu_load(struct kvm_vcpu *vcpu)
3787 {
3788  int r;
3789 
3790  r = mmu_topup_memory_caches(vcpu);
3791  if (r)
3792  goto out;
3793  r = mmu_alloc_roots(vcpu);
3794  spin_lock(&vcpu->kvm->mmu_lock);
3795  mmu_sync_roots(vcpu);
3796  spin_unlock(&vcpu->kvm->mmu_lock);
3797  if (r)
3798  goto out;
3799  /* set_cr3() should ensure TLB has been flushed */
3800  vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
3801 out:
3802  return r;
3803 }
3804 EXPORT_SYMBOL_GPL(kvm_mmu_load);
3805 
3806 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3807 {
3808  mmu_free_roots(vcpu);
3809 }
3810 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3811 
3812 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3813  struct kvm_mmu_page *sp, u64 *spte,
3814  const void *new)
3815 {
3816  if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3817  ++vcpu->kvm->stat.mmu_pde_zapped;
3818  return;
3819  }
3820 
3821  ++vcpu->kvm->stat.mmu_pte_updated;
3822  vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
3823 }
3824 
3825 static bool need_remote_flush(u64 old, u64 new)
3826 {
3827  if (!is_shadow_present_pte(old))
3828  return false;
3829  if (!is_shadow_present_pte(new))
3830  return true;
3831  if ((old ^ new) & PT64_BASE_ADDR_MASK)
3832  return true;
3833  old ^= PT64_NX_MASK;
3834  new ^= PT64_NX_MASK;
3835  return (old & ~new & PT64_PERM_MASK) != 0;
3836 }
3837 
3838 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3839  bool remote_flush, bool local_flush)
3840 {
3841  if (zap_page)
3842  return;
3843 
3844  if (remote_flush)
3845  kvm_flush_remote_tlbs(vcpu->kvm);
3846  else if (local_flush)
3847  kvm_mmu_flush_tlb(vcpu);
3848 }
3849 
3850 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3851  const u8 *new, int *bytes)
3852 {
3853  u64 gentry;
3854  int r;
3855 
3856  /*
3857  * Assume that the pte write is on a page table of the same type
3858  * as the current vcpu paging mode since we update the sptes only
3859  * when they have the same mode.
3860  */
3861  if (is_pae(vcpu) && *bytes == 4) {
3862  /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3863  *gpa &= ~(gpa_t)7;
3864  *bytes = 8;
3865  r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
3866  if (r)
3867  gentry = 0;
3868  new = (const u8 *)&gentry;
3869  }
3870 
3871  switch (*bytes) {
3872  case 4:
3873  gentry = *(const u32 *)new;
3874  break;
3875  case 8:
3876  gentry = *(const u64 *)new;
3877  break;
3878  default:
3879  gentry = 0;
3880  break;
3881  }
3882 
3883  return gentry;
3884 }
3885 
3886 /*
3887  * If we're seeing too many writes to a page, it may no longer be a page table,
3888  * or we may be forking, in which case it is better to unmap the page.
3889  */
3890 static bool detect_write_flooding(struct kvm_mmu_page *sp)
3891 {
3892  /*
3893  * Skip write-flooding detection for sps whose level is 1, because
3894  * they can become unsync and then the guest page is not write-protected.
3895  */
3896  if (sp->role.level == PT_PAGE_TABLE_LEVEL)
3897  return false;
3898 
3899  return ++sp->write_flooding_count >= 3;
3900 }
3901 
3902 /*
3903  * Misaligned accesses are too much trouble to fix up; also, they usually
3904  * indicate a page is not used as a page table.
3905  */
3906 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
3907  int bytes)
3908 {
3909  unsigned offset, pte_size, misaligned;
3910 
3911  pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3912  gpa, bytes, sp->role.word);
3913 
3914  offset = offset_in_page(gpa);
3915  pte_size = sp->role.cr4_pae ? 8 : 4;
3916 
3917  /*
3918  * Sometimes, the OS only writes the last byte to update status
3919  * bits; for example, in Linux the andb instruction is used in clear_bit().
3920  */
3921  if (!(offset & (pte_size - 1)) && bytes == 1)
3922  return false;
3923 
3924  misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3925  misaligned |= bytes < 4;
3926 
3927  return misaligned;
3928 }
3929 
3930 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
3931 {
3932  unsigned page_offset, quadrant;
3933  u64 *spte;
3934  int level;
3935 
3936  page_offset = offset_in_page(gpa);
3937  level = sp->role.level;
3938  *nspte = 1;
3939  if (!sp->role.cr4_pae) {
3940  page_offset <<= 1; /* 32->64 */
3941  /*
3942  * A 32-bit pde maps 4MB while the shadow pdes map
3943  * only 2MB. So we need to double the offset again
3944  * and zap two pdes instead of one.
3945  */
3946  if (level == PT32_ROOT_LEVEL) {
3947  page_offset &= ~7; /* kill rounding error */
3948  page_offset <<= 1;
3949  *nspte = 2;
3950  }
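  /*
   * With 32-bit guest ptes, one guest page table page is shadowed by
   * several shadow pages distinguished by role.quadrant; ignore
   * writes that fall into a quadrant this sp does not shadow.
   */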
3951  quadrant = page_offset >> PAGE_SHIFT;
3952  page_offset &= ~PAGE_MASK;
3953  if (quadrant != sp->role.quadrant)
3954  return NULL;
3955  }
3956 
3957  spte = &sp->spt[page_offset / sizeof(*spte)];
3958  return spte;
3959 }
3960 
3961 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3962  const u8 *new, int bytes)
3963 {
3964  gfn_t gfn = gpa >> PAGE_SHIFT;
3965  union kvm_mmu_page_role mask = { .word = 0 };
3966  struct kvm_mmu_page *sp;
3967  struct hlist_node *node;
3968  LIST_HEAD(invalid_list);
3969  u64 entry, gentry, *spte;
3970  int npte;
3971  bool remote_flush, local_flush, zap_page;
3972 
3973  /*
3974  * If we don't have indirect shadow pages, it means no page is
3975  * write-protected, so we can simply exit.
3976  */
3977  if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
3978  return;
3979 
3980  zap_page = remote_flush = local_flush = false;
3981 
3982  pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3983 
3984  gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
3985 
3986  /*
3987  * No need to care whether the memory allocation is successful
3988  * or not, since pte prefetch is skipped if there are not
3989  * enough objects in the cache.
3990  */
3991  mmu_topup_memory_caches(vcpu);
3992 
3993  spin_lock(&vcpu->kvm->mmu_lock);
3994  ++vcpu->kvm->stat.mmu_pte_write;
3995  kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3996 
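  /*
   * The written spte is always zapped below; the new gpte is only
   * propagated to shadow pages whose role matches the current mmu in
   * the cr0_wp/cr4_pae/nxe bits selected by the mask.
   */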
3997  mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3998  for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3999  if (detect_write_misaligned(sp, gpa, bytes) ||
4000  detect_write_flooding(sp)) {
4001  zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
4002  &invalid_list);
4003  ++vcpu->kvm->stat.mmu_flooded;
4004  continue;
4005  }
4006 
4007  spte = get_written_sptes(sp, gpa, &npte);
4008  if (!spte)
4009  continue;
4010 
4011  local_flush = true;
4012  while (npte--) {
4013  entry = *spte;
4014  mmu_page_zap_pte(vcpu->kvm, sp, spte);
4015  if (gentry &&
4016  !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
4017  & mask.word) && rmap_can_add(vcpu))
4018  mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
4019  if (!remote_flush && need_remote_flush(entry, *spte))
4020  remote_flush = true;
4021  ++spte;
4022  }
4023  }
4024  mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
4025  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
4026  kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
4027  spin_unlock(&vcpu->kvm->mmu_lock);
4028 }
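/*
 * Note on the base_role mask above: only the cr0_wp, cr4_pae and nxe bits
 * of the role are compared, so a gentry fetched under the current paging
 * mode is only written into shadow pages created under a compatible mode.
 * Pages whose role differs keep the zapped entry and are simply refilled
 * on the next fault.
 */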
4029 
4030 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
4031 {
4032  gpa_t gpa;
4033  int r;
4034 
4035  if (vcpu->arch.mmu.direct_map)
4036  return 0;
4037 
4038  gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
4039 
4040  r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4041 
4042  return r;
4043 }
4044 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
4045 
4046 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
4047 {
4048  LIST_HEAD(invalid_list);
4049 
4050  while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
4051  !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
4052  struct kvm_mmu_page *sp;
4053 
4054  sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
4055  struct kvm_mmu_page, link);
4056  kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
4057  ++vcpu->kvm->stat.mmu_recycled;
4058  }
4059  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
4060 }
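/*
 * Note: kvm_mmu_alloc_page() adds new shadow pages at the head of
 * active_mmu_pages, so taking the list tail here recycles roughly the
 * oldest pages first until KVM_REFILL_PAGES are available again.
 */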
4061 
4062 static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
4063 {
4064  if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
4065  return vcpu_match_mmio_gpa(vcpu, addr);
4066 
4067  return vcpu_match_mmio_gva(vcpu, addr);
4068 }
4069 
4070 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
4071  void *insn, int insn_len)
4072 {
4073  int r, emulation_type = EMULTYPE_RETRY;
4074  enum emulation_result er;
4075 
4076  r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
4077  if (r < 0)
4078  goto out;
4079 
4080  if (!r) {
4081  r = 1;
4082  goto out;
4083  }
4084 
4085  if (is_mmio_page_fault(vcpu, cr2))
4086  emulation_type = 0;
4087 
4088  er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
4089 
4090  switch (er) {
4091  case EMULATE_DONE:
4092  return 1;
4093  case EMULATE_DO_MMIO:
4094  ++vcpu->stat.mmio_exits;
4095  /* fall through */
4096  case EMULATE_FAIL:
4097  return 0;
4098  default:
4099  BUG();
4100  }
4101 out:
4102  return r;
4103 }
4104 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
4105 
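/*
 * Return convention (annotation): 1 means the fault was handled or the
 * instruction emulated and the guest can be resumed, 0 hands the exit to
 * userspace (e.g. for MMIO), and a negative value is an error from the
 * low-level handler.  EMULTYPE_RETRY is dropped for MMIO faults because
 * retrying the guest instruction instead of emulating it only helps when
 * a write-protected shadow page can first be unprotected.
 */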
4106 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
4107 {
4108  vcpu->arch.mmu.invlpg(vcpu, gva);
4109  kvm_mmu_flush_tlb(vcpu);
4110  ++vcpu->stat.invlpg;
4111 }
4112 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
4113 
4114 void kvm_enable_tdp(void)
4115 {
4116  tdp_enabled = true;
4117 }
4118 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
4119 
4120 void kvm_disable_tdp(void)
4121 {
4122  tdp_enabled = false;
4123 }
4124 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
4125 
4126 static void free_mmu_pages(struct kvm_vcpu *vcpu)
4127 {
4128  free_page((unsigned long)vcpu->arch.mmu.pae_root);
4129  if (vcpu->arch.mmu.lm_root != NULL)
4130  free_page((unsigned long)vcpu->arch.mmu.lm_root);
4131 }
4132 
4133 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4134 {
4135  struct page *page;
4136  int i;
4137 
4138  ASSERT(vcpu);
4139 
4140  /*
4141  * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
4142  * Therefore we need to allocate shadow page tables in the first
4143  * 4GB of memory, which happens to fit the DMA32 zone.
4144  */
4145  page = alloc_page(GFP_KERNEL | __GFP_DMA32);
4146  if (!page)
4147  return -ENOMEM;
4148 
4149  vcpu->arch.mmu.pae_root = page_address(page);
4150  for (i = 0; i < 4; ++i)
4151  vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
4152 
4153  return 0;
4154 }
4155 
4156 int kvm_mmu_create(struct kvm_vcpu *vcpu)
4157 {
4158  ASSERT(vcpu);
4159 
4160  vcpu->arch.walk_mmu = &vcpu->arch.mmu;
4161  vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4162  vcpu->arch.mmu.translate_gpa = translate_gpa;
4163  vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
4164 
4165  return alloc_mmu_pages(vcpu);
4166 }
4167 
4168 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
4169 {
4170  ASSERT(vcpu);
4171  ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
4172 
4173  return init_kvm_mmu(vcpu);
4174 }
4175 
4176 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4177 {
4178  struct kvm_mmu_page *sp;
4179  bool flush = false;
4180 
4181  list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
4182  int i;
4183  u64 *pt;
4184 
4185  if (!test_bit(slot, sp->slot_bitmap))
4186  continue;
4187 
4188  pt = sp->spt;
4189  for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
4190  if (!is_shadow_present_pte(pt[i]) ||
4191  !is_last_spte(pt[i], sp->role.level))
4192  continue;
4193 
4194  spte_write_protect(kvm, &pt[i], &flush, false);
4195  }
4196  }
4197  kvm_flush_remote_tlbs(kvm);
4198 }
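/*
 * Usage note (annotation): this is the dirty-logging path.  When logging is
 * enabled for a memslot, every present last-level spte that belongs to the
 * slot is made read-only, so the next guest write faults and can be recorded
 * in the dirty bitmap; the remote TLB flush makes the write protection
 * visible to all running vcpus.
 */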
4199 
4200 void kvm_mmu_zap_all(struct kvm *kvm)
4201 {
4202  struct kvm_mmu_page *sp, *node;
4203  LIST_HEAD(invalid_list);
4204 
4205  spin_lock(&kvm->mmu_lock);
4206 restart:
4207  list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
4208  if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
4209  goto restart;
4210 
4211  kvm_mmu_commit_zap_page(kvm, &invalid_list);
4212  spin_unlock(&kvm->mmu_lock);
4213 }
4214 
4215 static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
4216  struct list_head *invalid_list)
4217 {
4218  struct kvm_mmu_page *page;
4219 
4220  if (list_empty(&kvm->arch.active_mmu_pages))
4221  return;
4222 
4223  page = container_of(kvm->arch.active_mmu_pages.prev,
4224  struct kvm_mmu_page, link);
4225  kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
4226 }
4227 
4228 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4229 {
4230  struct kvm *kvm;
4231  int nr_to_scan = sc->nr_to_scan;
4232 
4233  if (nr_to_scan == 0)
4234  goto out;
4235 
4236  raw_spin_lock(&kvm_lock);
4237 
4238  list_for_each_entry(kvm, &vm_list, vm_list) {
4239  int idx;
4240  LIST_HEAD(invalid_list);
4241 
4242  /*
4243  * Never scan more than sc->nr_to_scan VM instances.
4244  * In practice we will never hit this condition, since we do not try
4245  * to shrink more than one VM per call and it is very unlikely to see
4246  * !n_used_mmu_pages that many times.
4247  */
4248  if (!nr_to_scan--)
4249  break;
4250  /*
4251  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
4252  * here. We may skip a VM instance erroneously, but we do not
4253  * want to shrink a VM that has only started to populate its MMU
4254  * anyway.
4255  */
4256  if (!kvm->arch.n_used_mmu_pages)
4257  continue;
4258 
4259  idx = srcu_read_lock(&kvm->srcu);
4260  spin_lock(&kvm->mmu_lock);
4261 
4262  kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
4263  kvm_mmu_commit_zap_page(kvm, &invalid_list);
4264 
4265  spin_unlock(&kvm->mmu_lock);
4266  srcu_read_unlock(&kvm->srcu, idx);
4267 
4268  list_move_tail(&kvm->vm_list, &vm_list);
4269  break;
4270  }
4271 
4272  raw_spin_unlock(&kvm_lock);
4273 
4274 out:
4275  return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
4276 }
4277 
4278 static struct shrinker mmu_shrinker = {
4279  .shrink = mmu_shrink,
4280  .seeks = DEFAULT_SEEKS * 10,
4281 };
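/*
 * Note on the shrinker registration: .seeks is the cost-to-recreate hint,
 * and DEFAULT_SEEKS * 10 tells the VM that shadow pages are expensive to
 * rebuild, so these caches are trimmed far less aggressively than ordinary
 * slabs.  mmu_shrink() frees at most one shadow page from one VM per
 * invocation and reports the global kvm_total_used_mmu_pages count back to
 * the shrinker core.
 */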
4282 
4283 static void mmu_destroy_caches(void)
4284 {
4285  if (pte_list_desc_cache)
4286  kmem_cache_destroy(pte_list_desc_cache);
4287  if (mmu_page_header_cache)
4288  kmem_cache_destroy(mmu_page_header_cache);
4289 }
4290 
4291 int kvm_mmu_module_init(void)
4292 {
4293  pte_list_desc_cache = kmem_cache_create("pte_list_desc",
4294  sizeof(struct pte_list_desc),
4295  0, 0, NULL);
4296  if (!pte_list_desc_cache)
4297  goto nomem;
4298 
4299  mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
4300  sizeof(struct kvm_mmu_page),
4301  0, 0, NULL);
4302  if (!mmu_page_header_cache)
4303  goto nomem;
4304 
4305  if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
4306  goto nomem;
4307 
4308  register_shrinker(&mmu_shrinker);
4309 
4310  return 0;
4311 
4312 nomem:
4313  mmu_destroy_caches();
4314  return -ENOMEM;
4315 }
4316 
4317 /*
4318  * Calculate the number of mmu pages needed for kvm.
4319  */
4320 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
4321 {
4322  unsigned int nr_mmu_pages;
4323  unsigned int nr_pages = 0;
4324  struct kvm_memslots *slots;
4325  struct kvm_memory_slot *memslot;
4326 
4327  slots = kvm_memslots(kvm);
4328 
4329  kvm_for_each_memslot(memslot, slots)
4330  nr_pages += memslot->npages;
4331 
4332  nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
4333  nr_mmu_pages = max(nr_mmu_pages,
4334  (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
4335 
4336  return nr_mmu_pages;
4337 }
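/*
 * A worked example (annotation), assuming KVM_PERMILLE_MMU_PAGES == 20 and
 * KVM_MIN_ALLOC_MMU_PAGES == 64, their values in this kernel's
 * asm/kvm_host.h: a guest with 4GB of memory in 4KB pages gives
 *
 *     nr_pages     = 4GB / 4KB           = 1048576
 *     nr_mmu_pages = 1048576 * 20 / 1000 = 20971 shadow pages allowed,
 *
 * while a tiny guest of, say, 2048 pages would compute 40 and be clamped
 * to the 64-page minimum.
 */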
4338 
4339 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
4340 {
4341  struct kvm_shadow_walk_iterator iterator;
4342  u64 spte;
4343  int nr_sptes = 0;
4344 
4345  walk_shadow_page_lockless_begin(vcpu);
4346  for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4347  sptes[iterator.level-1] = spte;
4348  nr_sptes++;
4349  if (!is_shadow_present_pte(spte))
4350  break;
4351  }
4352  walk_shadow_page_lockless_end(vcpu);
4353 
4354  return nr_sptes;
4355 }
4356 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
4357 
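/*
 * A minimal, hypothetical usage sketch (annotation, not kernel source):
 * the lockless walk fills sptes[level - 1] from the root downwards and
 * returns how many levels were recorded, stopping at the first non-present
 * entry.  A caller such as an EPT-misconfiguration handler could dump the
 * hierarchy like this:
 *
 *     u64 sptes[4];
 *     int nr, level;
 *
 *     nr = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
 *     for (level = PT64_ROOT_LEVEL; level > PT64_ROOT_LEVEL - nr; --level)
 *             pr_err("spte level %d: 0x%llx\n", level, sptes[level - 1]);
 */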
4358 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4359 {
4360  ASSERT(vcpu);
4361 
4362  destroy_kvm_mmu(vcpu);
4363  free_mmu_pages(vcpu);
4364  mmu_free_memory_caches(vcpu);
4365 }
4366 
4367 void kvm_mmu_module_exit(void)
4368 {
4369  mmu_destroy_caches();
4370  percpu_counter_destroy(&kvm_total_used_mmu_pages);
4371  unregister_shrinker(&mmu_shrinker);
4372  mmu_audit_disable();
4373 }