23 #include <linux/module.h>
24 #include <linux/errno.h>
29 #include <linux/reboot.h>
35 #include <linux/sched.h>
40 #include <linux/kvm_para.h>
42 #include <linux/mman.h>
44 #include <linux/bitops.h>
49 #include <linux/slab.h>
53 #include <asm/processor.h>
55 #include <asm/uaccess.h>
56 #include <asm/pgtable.h>
61 #define CREATE_TRACE_POINTS
77 static int kvm_usage_count = 0;
78 static atomic_t hardware_enable_failed;
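/*
 * kvm_usage_count tracks how many VMs currently exist; hardware
 * virtualization (VMX/SVM) is enabled on every online CPU when the
 * first VM is created and disabled again when the last one goes away.
 * hardware_enable_failed counts CPUs on which that enable step failed.
 */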
87 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
90 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
93 static int hardware_enable_all(void);
94 static void hardware_disable_all(void);
101 static bool largepages_enabled = true;
109 reserved = PageReserved(head);
125 return PageReserved(tail);
142 struct pid *oldpid = vcpu->pid;
149 preempt_notifier_register(&vcpu->preempt_notifier);
159 preempt_notifier_unregister(&vcpu->preempt_notifier);
164 static void ack_flush(void *_completed)
168 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
179 kvm_make_request(req, vcpu);
185 if (cpus != NULL && cpu != -1 && cpu != me &&
187 cpumask_set_cpu(cpu, cpus);
191 else if (!cpumask_empty(cpus))
196 free_cpumask_var(cpus);
206 ++kvm->stat.remote_tlb_flush;
235 kvm_vcpu_set_in_spin_loop(vcpu, false);
236 kvm_vcpu_set_dy_eligible(vcpu, false);
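/*
 * MMU notifier integration: when the host kernel unmaps, ages or changes
 * a page that backs guest memory, the callbacks below invalidate or update
 * KVM's own mappings of that memory.  mmu_notifier_seq and
 * mmu_notifier_count let the page-fault path detect that a translation it
 * just computed raced with an invalidation and must be retried.
 */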
258 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
259 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
264 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
268 struct kvm *kvm = mmu_notifier_to_kvm(mn);
269 int need_tlb_flush, idx;
289 idx = srcu_read_lock(&kvm->srcu);
292 kvm->mmu_notifier_seq++;
299 srcu_read_unlock(&kvm->srcu, idx);
302 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
304 unsigned long address,
307 struct kvm *kvm = mmu_notifier_to_kvm(mn);
310 idx = srcu_read_lock(&kvm->srcu);
312 kvm->mmu_notifier_seq++;
315 srcu_read_unlock(&kvm->srcu, idx);
318 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
323 struct kvm *kvm = mmu_notifier_to_kvm(mn);
324 int need_tlb_flush = 0, idx;
326 idx = srcu_read_lock(&kvm->srcu);
333 kvm->mmu_notifier_count++;
341 srcu_read_unlock(&kvm->srcu, idx);
344 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
349 struct kvm *kvm = mmu_notifier_to_kvm(mn);
357 kvm->mmu_notifier_seq++;
364 kvm->mmu_notifier_count--;
367 BUG_ON(kvm->mmu_notifier_count < 0);
370 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
372 unsigned long address)
374 struct kvm *kvm = mmu_notifier_to_kvm(mn);
377 idx = srcu_read_lock(&kvm->srcu);
385 srcu_read_unlock(&kvm->srcu, idx);
390 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
392 unsigned long address)
394 struct kvm *kvm = mmu_notifier_to_kvm(mn);
397 idx = srcu_read_lock(&kvm->srcu);
401 srcu_read_unlock(&kvm->srcu, idx);
406 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
409 struct kvm *kvm = mmu_notifier_to_kvm(mn);
412 idx = srcu_read_lock(&kvm->srcu);
414 srcu_read_unlock(&kvm->srcu, idx);
417 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
418 .invalidate_page = kvm_mmu_notifier_invalidate_page,
419 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
420 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
421 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
422 .test_young = kvm_mmu_notifier_test_young,
423 .change_pte = kvm_mmu_notifier_change_pte,
424 .release = kvm_mmu_notifier_release,
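/*
 * kvm_init_mmu_notifier() registers the ops table above against the VM's
 * address space; the second definition below is the stub used when the
 * kernel is built without MMU-notifier support.
 */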
427 static int kvm_init_mmu_notifier(struct kvm *kvm)
429 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
435 static int kvm_init_mmu_notifier(struct kvm *kvm)
442 static void kvm_init_memslots_id(struct kvm *kvm)
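/*
 * kvm_create_vm(): allocates the struct kvm, takes a hardware-enable
 * reference, initializes memslots, buses and the MMU notifier, and
 * unwinds through hardware_disable_all() on failure.
 */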
451 static struct kvm *kvm_create_vm(unsigned long type)
461 goto out_err_nodisable;
463 r = hardware_enable_all();
465 goto out_err_nodisable;
467 #ifdef CONFIG_HAVE_KVM_IRQCHIP
476 kvm_init_memslots_id(kvm);
495 r = kvm_init_mmu_notifier(kvm);
508 hardware_disable_all();
531 if (is_vmalloc_addr(addr))
553 kvm_destroy_dirty_bitmap(free);
566 kvm_free_physmem_slot(memslot, NULL);
571 static void kvm_destroy_vm(struct kvm *kvm)
582 kvm_io_bus_destroy(kvm->buses[i]);
584 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
593 hardware_disable_all();
628 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
638 static int cmp_memslot(const void *slot1, const void *slot2)
676 if (new->npages != npages)
677 sort_memslots(slots);
687 #ifdef KVM_CAP_READONLY_MEM
691 if (mem->flags & ~valid_flags)
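/*
 * __kvm_set_memory_region(): validates the requested flags and guest
 * frame range, rejects overlaps with other slots, allocates a dirty
 * bitmap for slots that need one, and installs the updated slot array.
 */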
717 r = check_memory_region_flags(mem);
750 new = old = *memslot;
753 new.base_gfn = base_gfn;
755 new.flags = mem->flags;
767 if (s == memslot || !s->npages)
769 if (!((base_gfn + npages <= s->base_gfn) ||
776 new.dirty_bitmap = NULL;
781 if (npages && !old.npages) {
791 if (kvm_create_dirty_bitmap(&new) < 0)
796 if (!npages || base_gfn != old.base_gfn) {
804 slot = id_to_memslot(slots, mem->slot);
843 new.dirty_bitmap = NULL;
854 kvm_free_physmem_slot(&old, &new);
860 kvm_free_physmem_slot(&new, &old);
896 unsigned long any = 0;
907 n = kvm_dirty_bitmap_bytes(memslot);
909 for (i = 0; !any && i < n/sizeof(long); ++i)
926 return largepages_enabled;
931 largepages_enabled = false;
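/*
 * gfn -> hva helpers: translate a guest frame number into the host
 * virtual address of the userspace mapping backing its memslot;
 * read-only slots are refused for write access.
 */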
961 if (kvm_is_error_hva(addr))
988 if (memslot_is_readonly(slot) && write)
994 return __gfn_to_hva_memslot(slot, gfn);
1000 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1006 return gfn_to_hva_many(slot, gfn, NULL);
1020 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1025 static int kvm_read_hva(void *data, void __user *hva, int len)
1030 static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1036 unsigned long start, int write, struct page **page)
1038 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1041 flags |= FOLL_WRITE;
1046 static inline int check_user_page_hwpoison(unsigned long addr)
1048 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
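/*
 * hva -> pfn conversion: hva_to_pfn_fast() tries the lockless
 * get_user_pages fast path (usable from atomic context), while
 * hva_to_pfn_slow() may sleep and fault the page in.
 */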
1059 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1065 if (!(async || atomic))
1073 if (!(write_fault || writable))
1092 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1093 bool *writable, pfn_t *pfn)
1095 struct page *page[1];
1101 *writable = write_fault;
1106 addr, write_fault, page);
1115 if (unlikely(!write_fault) && writable) {
1116 struct page *wpage[1];
1131 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1156 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1157 bool write_fault, bool *writable)
1166 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
1172 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1178 (!async && check_user_page_hwpoison(addr))) {
1183 vma = find_vma_intersection(current->mm, addr, addr + 1);
1187 else if ((vma->vm_flags & VM_PFNMAP)) {
1192 if (async && vma_is_valid(vma, write_fault))
1203 bool *async, bool write_fault, bool *writable)
1205 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1210 if (kvm_is_error_hva(addr))
1214 if (writable && memslot_is_readonly(slot)) {
1219 return hva_to_pfn(addr, atomic, async, write_fault,
1223 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1224 bool write_fault, bool *writable)
1233 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
1239 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1244 bool write_fault, bool *writable)
1246 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1252 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1259 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1265 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1270 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
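/*
 * The wrappers above differ only in the arguments they pass to
 * __gfn_to_pfn(): atomic vs. sleeping lookup, optional async fault
 * notification, and whether a writable mapping is required or merely
 * reported back through *writable.
 */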
1281 if (kvm_is_error_hva(addr))
1284 if (entry < nr_pages)
1291 static struct page *kvm_pfn_to_page(pfn_t pfn)
1293 if (is_error_pfn(pfn))
1310 return kvm_pfn_to_page(pfn);
1355 if (!PageReserved(page))
1375 static int next_segment(unsigned long len, int offset)
1389 addr = gfn_to_hva_read(kvm, gfn);
1390 if (kvm_is_error_hva(addr))
1392 r = kvm_read_hva(data, (void __user *)addr + offset, len);
1406 while ((seg = next_segment(len, offset)) != 0) {
1427 addr = gfn_to_hva_read(kvm, gfn);
1428 if (kvm_is_error_hva(addr))
1430 pagefault_disable();
1431 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
1440 int offset, int len)
1446 if (kvm_is_error_hva(addr))
1464 while ((seg = next_segment(len, offset)) != 0) {
1487 if (!kvm_is_error_hva(ghc->hva))
1497 void *data, unsigned long len)
1505 if (kvm_is_error_hva(ghc->hva))
1518 void *data, unsigned long len)
1526 if (kvm_is_error_hva(ghc->hva))
1551 while ((seg = next_segment(len, offset)) != 0) {
1567 unsigned long rel_gfn = gfn - memslot->base_gfn;
1613 int cpu = vcpu->cpu;
1616 wqp = kvm_arch_vcpu_wq(vcpu);
1617 if (waitqueue_active(wqp)) {
1619 ++vcpu->stat.halt_wakeup;
1632 if (!need_resched())
1651 put_task_struct(task);
1655 put_task_struct(task);
1658 put_task_struct(task);
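/*
 * Directed yield: a vcpu that traps out of a spin loop donates its
 * timeslice to another vcpu of the same VM that is likely to be the lock
 * holder, instead of burning CPU.  The dy_eligible heuristic below skips
 * a candidate that is itself spinning unless it has been marked eligible.
 */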
1663 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1686 bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1690 eligible = !vcpu->spin_loop.in_spin_loop ||
1691 (vcpu->spin_loop.in_spin_loop &&
1692 vcpu->spin_loop.dy_eligible);
1694 if (vcpu->spin_loop.in_spin_loop)
1695 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1702 struct kvm *kvm = me->kvm;
1704 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1709 kvm_vcpu_set_in_spin_loop(me, true);
1717 for (pass = 0; pass < 2 && !yielded; pass++) {
1719 if (!pass && i <= last_boosted_vcpu) {
1720 i = last_boosted_vcpu;
1722 } else if (pass && i > last_boosted_vcpu)
1726 if (waitqueue_active(&vcpu->wq))
1728 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1737 kvm_vcpu_set_in_spin_loop(me, false);
1740 kvm_vcpu_set_dy_eligible(me, false);
1744 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1749 if (vmf->pgoff == 0)
1755 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1766 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1767 .fault = kvm_vcpu_fault,
1772 vma->vm_ops = &kvm_vcpu_vm_ops;
1776 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1785 .release = kvm_vcpu_release,
1786 .unlocked_ioctl = kvm_vcpu_ioctl,
1787 #ifdef CONFIG_COMPAT
1788 .compat_ioctl = kvm_vcpu_compat_ioctl,
1790 .mmap = kvm_vcpu_mmap,
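/*
 * Each vcpu is exposed to userspace as an anonymous-inode file descriptor
 * (create_vcpu_fd()); kvm_vcpu_fops above routes mmap and ioctl on that
 * fd to the handlers in this file.
 */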
1797 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1805 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1812 return PTR_ERR(vcpu);
1814 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1823 goto unlock_vcpu_destroy;
1827 goto unlock_vcpu_destroy;
1833 goto unlock_vcpu_destroy;
1840 r = create_vcpu_fd(vcpu);
1843 goto unlock_vcpu_destroy;
1853 unlock_vcpu_destroy:
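/*
 * kvm_vcpu_ioctl(): dispatcher for the per-vcpu ioctls (KVM_RUN,
 * KVM_GET_REGS/KVM_SET_REGS, KVM_GET_SREGS/KVM_SET_SREGS,
 * KVM_SET_SIGNAL_MASK, ...); anything not handled here is forwarded to
 * kvm_arch_vcpu_ioctl().
 */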
1871 static long kvm_vcpu_ioctl(struct file *filp,
1872 unsigned int ioctl, unsigned long arg)
1883 #if defined(CONFIG_S390) || defined(CONFIG_PPC)
1902 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1908 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1915 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1923 struct kvm_regs *kvm_regs;
1927 if (IS_ERR(kvm_regs)) {
1928 r = PTR_ERR(kvm_regs);
1940 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1948 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1954 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
1955 if (IS_ERR(kvm_sregs)) {
1956 r = PTR_ERR(kvm_sregs);
2025 sizeof kvm_sigmask))
2028 if (kvm_sigmask.len != sizeof sigset)
2036 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2075 #ifdef CONFIG_COMPAT
2076 static long kvm_vcpu_compat_ioctl(struct file *filp,
2077 unsigned int ioctl, unsigned long arg)
2080 void __user *argp = compat_ptr(arg);
2090 compat_sigset_t csigset;
2096 sizeof kvm_sigmask))
2099 if (kvm_sigmask.len != sizeof csigset)
2106 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2108 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
2112 r = kvm_vcpu_ioctl(filp, ioctl, arg);
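/*
 * kvm_vm_ioctl(): VM-level requests such as KVM_CREATE_VCPU,
 * KVM_SET_USER_MEMORY_REGION, dirty-log retrieval and, depending on
 * configuration, coalesced MMIO, MSI signalling and IRQ line control.
 */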
2120 static long kvm_vm_ioctl(struct file *filp,
2121 unsigned int ioctl, unsigned long arg)
2131 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2140 sizeof kvm_userspace_mem))
2159 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2201 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2208 kvm->bsp_vcpu_id = arg;
2212 #ifdef CONFIG_HAVE_KVM_MSI
2223 #ifdef __KVM_HAVE_IRQ_LINE
2255 #ifdef CONFIG_COMPAT
2256 struct compat_kvm_dirty_log {
2265 static long kvm_vm_compat_ioctl(struct file *filp,
2266 unsigned int ioctl, unsigned long arg)
2293 r = kvm_vm_ioctl(filp, ioctl, arg);
2301 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2303 struct page *page[1];
2306 gfn_t gfn = vmf->pgoff;
2307 struct kvm *kvm = vma->vm_file->private_data;
2310 if (kvm_is_error_hva(addr))
2311 return VM_FAULT_SIGBUS;
2316 return VM_FAULT_SIGBUS;
2318 vmf->page = page[0];
2322 static const struct vm_operations_struct kvm_vm_vm_ops = {
2323 .fault = kvm_vm_fault,
2328 vma->vm_ops = &kvm_vm_vm_ops;
2333 .release = kvm_vm_release,
2334 .unlocked_ioctl = kvm_vm_ioctl,
2335 #ifdef CONFIG_COMPAT
2336 .compat_ioctl = kvm_vm_compat_ioctl,
2338 .mmap = kvm_vm_mmap,
2342 static int kvm_dev_ioctl_create_vm(unsigned long type)
2347 kvm = kvm_create_vm(type);
2349 return PTR_ERR(kvm);
2350 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2364 static long kvm_dev_ioctl_check_extension_generic(long arg)
2370 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2374 #ifdef CONFIG_HAVE_KVM_MSI
2378 #ifdef KVM_CAP_IRQ_ROUTING
2379 case KVM_CAP_IRQ_ROUTING:
2380 return KVM_MAX_IRQ_ROUTES;
2388 static long kvm_dev_ioctl(struct file *filp,
2389 unsigned int ioctl, unsigned long arg)
2401 r = kvm_dev_ioctl_create_vm(arg);
2404 r = kvm_dev_ioctl_check_extension_generic(arg);
2414 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2431 .unlocked_ioctl = kvm_dev_ioctl,
2432 .compat_ioctl = kvm_dev_ioctl,
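/*
 * Userspace view (illustrative sketch only, not part of this file;
 * error handling omitted):
 *
 *	int kvm = open("/dev/kvm", O_RDWR);
 *	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);          handled by kvm_dev_ioctl()
 *	int cpu = ioctl(vm, KVM_CREATE_VCPU, 0);         handled by kvm_vm_ioctl()
 *	long sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, cpu, 0);  kvm_vcpu_mmap()
 *	ioctl(cpu, KVM_RUN, 0);                          kvm_vcpu_ioctl()
 */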
2442 static void hardware_enable_nolock(void *junk)
2450 cpumask_set_cpu(cpu, cpus_hardware_enabled);
2455 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2458 "CPU%d failed\n", cpu);
2462 static void hardware_enable(void *junk)
2465 hardware_enable_nolock(junk);
2469 static void hardware_disable_nolock(void *junk)
2475 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2479 static void hardware_disable(void *junk)
2482 hardware_disable_nolock(junk);
2486 static void hardware_disable_all_nolock(void)
2488 BUG_ON(!kvm_usage_count);
2491 if (!kvm_usage_count)
2495 static void hardware_disable_all(void)
2498 hardware_disable_all_nolock();
2502 static int hardware_enable_all(void)
2509 if (kvm_usage_count == 1) {
2514 hardware_disable_all_nolock();
2529 if (!kvm_usage_count)
2537 hardware_disable(NULL);
2542 hardware_enable(NULL);
2556 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2572 .notifier_call = kvm_reboot,
2583 kvm_iodevice_destructor(pos);
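/*
 * kvm_io_bus: registered MMIO/PIO devices are kept sorted by address
 * range so that kvm_io_bus_write()/read() can locate the first matching
 * device with a binary search and then walk the overlapping ranges.
 */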
2601 gpa_t addr, int len)
2616 gpa_t addr, int len)
2631 off = range - bus->range;
2641 int len, const void *val)
2657 while (idx < bus->dev_count &&
2659 if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2685 while (idx < bus->dev_count &&
2687 if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2701 bus = kvm->buses[bus_idx];
2705 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
2726 bus = kvm->buses[bus_idx];
2729 if (bus->range[i].dev == dev) {
2737 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
2754 .notifier_call = kvm_cpu_hotplug,
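/*
 * debugfs statistics: vm_stat_get() sums a counter at the given offset
 * across all VMs, vcpu_stat_get() additionally iterates every vcpu;
 * kvm_init_debug() creates one file per entry under the kvm debugfs
 * directory using the matching stat_fops[] variant.
 */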
2757 static int vm_stat_get(void *_offset, u64 *val)
2759 unsigned offset = (long)_offset;
2772 static int vcpu_stat_get(void *_offset, u64 *val)
2774 unsigned offset = (long)_offset;
2783 *val += *(u32 *)((void *)vcpu + offset);
2796 static int kvm_init_debug(void)
2802 if (kvm_debugfs_dir == NULL)
2808 stat_fops[p->kind]);
2821 static void kvm_exit_debug(void)
2830 static int kvm_suspend(void)
2832 if (kvm_usage_count)
2833 hardware_disable_nolock(NULL);
2837 static void kvm_resume(void)
2839 if (kvm_usage_count) {
2841 hardware_enable_nolock(NULL);
2846 .suspend = kvm_suspend,
2847 .resume = kvm_resume,
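/*
 * Preempt notifiers: kvm_sched_out()/kvm_sched_in() run when the vcpu
 * thread is scheduled out or back in, so architecture code can save and
 * reload guest register state around involuntary preemption.
 */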
2851 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2856 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2858 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2863 static void kvm_sched_out(struct preempt_notifier *pn,
2866 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2871 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2881 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2898 r = register_cpu_notifier(&kvm_cpu_notifier);
2905 vcpu_align = __alignof__(struct kvm_vcpu);
2908 if (!kvm_vcpu_cache) {
2917 kvm_chardev_ops.owner = module;
2918 kvm_vm_fops.owner = module;
2919 kvm_vcpu_fops.owner = module;
2929 kvm_preempt_ops.sched_in = kvm_sched_in;
2930 kvm_preempt_ops.sched_out = kvm_sched_out;
2932 r = kvm_init_debug();
2948 unregister_cpu_notifier(&kvm_cpu_notifier);
2953 free_cpumask_var(cpus_hardware_enabled);
2969 unregister_cpu_notifier(&kvm_cpu_notifier);
2973 free_cpumask_var(cpus_hardware_enabled);