/* virt/kvm/kvm_main.c */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm-generic/bitops/le.h>

#include "coalesced_mmio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

static bool kvm_rebooting;

static bool largepages_enabled = true;

inline int kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		struct page *page = compound_head(pfn_to_page(pfn));
		return PageReserved(page);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

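/*
 * Set @req on every vcpu of @kvm and kick (via IPI) each cpu that is
 * currently running one of this VM's vcpus, so the request is noticed
 * promptly.  Returns true if any remote cpu had to be signalled.
 */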
static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	spin_lock(&kvm->requests_lock);
	me = smp_processor_id();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (test_and_set_bit(req, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpus != NULL && cpu != -1 && cpu != me)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	spin_unlock(&kvm->requests_lock);
	free_cpumask_var(cpus);
	return called;
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns. So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed. If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we've to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we've to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease but both values are read by the kvm
	 * page fault under mmu_lock spinlock so we don't need to add
	 * a smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	kvm_arch_flush_shadow(kvm);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	int r = 0, i;
	struct kvm *kvm = kvm_arch_create_vm();

	if (IS_ERR(kvm))
		goto out;

	r = hardware_enable_all();
	if (r)
		goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = -ENOMEM;
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
		goto out_err;
	if (init_srcu_struct(&kvm->srcu))
		goto out_err;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i]) {
			cleanup_srcu_struct(&kvm->srcu);
			goto out_err;
		}
	}

	r = kvm_init_mmu_notifier(kvm);
	if (r) {
		cleanup_srcu_struct(&kvm->srcu);
		goto out_err;
	}

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	spin_lock_init(&kvm->requests_lock);
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	kvm_coalesced_mmio_init(kvm);
#endif
out:
	return kvm;

out_err:
	hardware_disable_all();
out_err_nodisable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	kfree(kvm->memslots);
	kfree(kvm);
	return ERR_PTR(r);
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	int i;

	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);


	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
			vfree(free->lpage_info[i]);
			free->lpage_info[i] = NULL;
		}
	}

	free->npages = 0;
	free->dirty_bitmap = NULL;
	free->rmap = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;
	struct kvm_memslots *slots = kvm->memslots;

	for (i = 0; i < slots->nmemslots; ++i)
		kvm_free_physmem_slot(&slots->memslots[i], NULL);

	kfree(kvm->memslots);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++)
		kvm_io_bus_destroy(kvm->buses[i]);
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r, flush_shadow = 0;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots, *old_memslots;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];

		if (s == memslot || !s->npages)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;
	}
	if (!npages)
		goto skip_lpage;

	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		unsigned long ugfn;
		unsigned long j;
		int lpages;
		int level = i + 2;

		/* Avoid unused variable warning if no large pages */
		(void)level;

		if (new.lpage_info[i])
			continue;

		lpages = 1 + (base_gfn + npages - 1) /
			     KVM_PAGES_PER_HPAGE(level);
		lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);

		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));

		if (!new.lpage_info[i])
			goto out_free;

		memset(new.lpage_info[i], 0,
		       lpages * sizeof(*new.lpage_info[i]));

		if (base_gfn % KVM_PAGES_PER_HPAGE(level))
			new.lpage_info[i][0].write_count = 1;
		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
			new.lpage_info[i][lpages - 1].write_count = 1;
		ugfn = new.userspace_addr >> PAGE_SHIFT;
		/*
		 * If the gfn and userspace address are not aligned wrt each
		 * other, or if explicitly asked to, disable large page
		 * support for this slot
		 */
		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
		    !largepages_enabled)
			for (j = 0; j < lpages; ++j)
				new.lpage_info[i][j].write_count = 1;
	}

skip_lpage:

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
		/* destroy any largepage mappings for dirty tracking */
		if (old.npages)
			flush_shadow = 1;
	}
#else  /* not defined CONFIG_S390 */
	new.user_alloc = user_alloc;
	if (user_alloc)
		new.userspace_addr = mem->userspace_addr;
#endif /* not defined CONFIG_S390 */

	if (!npages) {
		r = -ENOMEM;
		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
		if (!slots)
			goto out_free;
		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
		if (mem->slot >= slots->nmemslots)
			slots->nmemslots = mem->slot + 1;
		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;

		old_memslots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);
		/* From this point no new shadow pages pointing to a deleted
		 * memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 * 	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow(kvm);
		kfree(old_memslots);
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
	if (r)
		goto out_free;

#ifdef CONFIG_DMAR
	/* map the pages in iommu page table */
	if (npages) {
		r = kvm_iommu_map_pages(kvm, &new);
		if (r)
			goto out_free;
	}
#endif

	r = -ENOMEM;
	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!slots)
		goto out_free;
	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
	if (mem->slot >= slots->nmemslots)
		slots->nmemslots = mem->slot + 1;

	/* actual memory is freed via old in kvm_free_physmem_slot below */
	if (!npages) {
		new.rmap = NULL;
		new.dirty_bitmap = NULL;
		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
			new.lpage_info[i] = NULL;
	}

	slots->memslots[mem->slot] = new;
	old_memslots = kvm->memslots;
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);

	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);

	kvm_free_physmem_slot(&old, &new);
	kfree(old_memslots);

	if (flush_shadow)
		kvm_arch_flush_shadow(kvm);

	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;

}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct
				   kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

int is_error_page(struct page *page)
{
	return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
	return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);

	for (i = 0; i < slots->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return gfn_to_memslot_unaliased(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);

	gfn = unalias_gfn_instantiation(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];

		if (memslot->flags & KVM_MEMSLOT_INVALID)
			continue;

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

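/*
 * Return the index of the memslot containing @gfn; callers are expected
 * to pass a gfn that is known to be backed by a slot.
 */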
int memslot_id(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
	struct kvm_memory_slot *memslot = NULL;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < slots->nmemslots; ++i) {
		memslot = &slots->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			break;
	}

	return memslot - slots->memslots;
}

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn_instantiation(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

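/*
 * Translate a host virtual address into a host page frame number.
 * Normal pages are pinned with get_user_pages_fast(); if that fails,
 * fall back to a VMA lookup to handle VM_PFNMAP (e.g. MMIO) mappings,
 * and return the pfn of bad_page on error.
 */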
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
{
	struct page *page[1];
	int npages;
	pfn_t pfn;

	might_sleep();

	npages = get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, addr);

		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			up_read(&current->mm->mmap_sem);
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		up_read(&current->mm->mmap_sem);
		BUG_ON(!kvm_is_mmio_pfn(pfn));
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	return hva_to_pfn(kvm, addr);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}

pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
			 struct kvm_memory_slot *slot, gfn_t gfn)
{
	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
	return hva_to_pfn(kvm, addr);
}

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (!kvm_is_mmio_pfn(pfn))
		return pfn_to_page(pfn);

	WARN_ON(kvm_is_mmio_pfn(pfn));

	get_page(bad_page);
	return bad_page;
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

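/* Return how many of @len bytes fit in the current page, starting at @offset within it. */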
static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	gfn = unalias_gfn(kvm, gfn);
	memslot = gfn_to_memslot_unaliased(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		/* avoid RMW */
		if (!generic_test_le_bit(rel_gfn, memslot->dirty_bitmap))
			generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
	}
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_arch_vcpu_runnable(vcpu)) {
			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
			break;
		}
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (signal_pending(current))
			break;

		schedule();
	}

	finish_wait(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
{
	ktime_t expires;
	DEFINE_WAIT(wait);

	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

	/* Sleep for 100 us, and hope lock-holder got scheduled */
	expires = ktime_add_ns(ktime_get(), 100000UL);
	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);

	finish_wait(&vcpu->wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

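/*
 * Back the vcpu mmap region: page 0 is the kvm_run structure, followed
 * (on x86) by the PIO data page and, when supported, the coalesced MMIO
 * ring page.
 */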
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl = kvm_vcpu_ioctl,
	.mmap = kvm_vcpu_mmap,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}

/*
 * Creates some virtual cpus. Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
	int r;
	struct kvm_vcpu *vcpu, *v;

	vcpu = kvm_arch_vcpu_create(kvm, id);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		return r;

	mutex_lock(&kvm->lock);
	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
		r = -EINVAL;
		goto vcpu_destroy;
	}

	kvm_for_each_vcpu(r, v, kvm)
		if (v->vcpu_id == id) {
			r = -EEXIST;
			goto vcpu_destroy;
		}

	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0) {
		kvm_put_kvm(kvm);
		goto vcpu_destroy;
	}

	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);

#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	if (kvm->bsp_vcpu_id == id)
		kvm->bsp_vcpu = vcpu;
#endif
	mutex_unlock(&kvm->lock);
	return r;

vcpu_destroy:
	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		if (r)
			goto out_free2;
		r = 0;
out_free2:
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = -EFAULT;
		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
						sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);
		if (atomic_read(&kvm->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvm->bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
		if (r == -ENOTTY)
			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
	}
out:
	return r;
}

#ifdef CONFIG_COMPAT
struct compat_kvm_dirty_log {
	__u32 slot;
	__u32 padding1;
	union {
		compat_uptr_t dirty_bitmap; /* one bit per page */
		__u64 padding2;
	};
};

static long kvm_vm_compat_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_GET_DIRTY_LOG: {
		struct compat_kvm_dirty_log compat_log;
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&compat_log, (void __user *)arg,
				   sizeof(compat_log)))
			goto out;
		log.slot = compat_log.slot;
		log.padding1 = compat_log.padding1;
		log.padding2 = compat_log.padding2;
		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	default:
		r = kvm_vm_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	gfn_t gfn = vmf->pgoff;
	struct kvm *kvm = vma->vm_file->private_data;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return VM_FAULT_SIGBUS;

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);
	if (unlikely(npages != 1))
		return VM_FAULT_SIGBUS;

	vmf->page = page[0];
	return 0;
}

static const struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = kvm_vm_compat_ioctl,
#endif
	.mmap = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
	int fd;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
	if (fd < 0)
		kvm_put_kvm(kvm);

	return fd;
}

static long kvm_dev_ioctl_check_extension_generic(long arg)
{
	switch (arg) {
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_CAP_SET_BOOT_CPU_ID:
#endif
	case KVM_CAP_INTERNAL_ERROR_DATA:
		return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
	default:
		break;
	}
	return kvm_dev_ioctl_check_extension(arg);
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();
	int r;

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;

	cpumask_set_cpu(cpu, cpus_hardware_enabled);

	r = kvm_arch_hardware_enable(NULL);

	if (r) {
		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
		atomic_inc(&hardware_enable_failed);
		printk(KERN_INFO "kvm: enabling virtualization on "
				 "CPU%d failed\n", cpu);
	}
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

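/*
 * kvm_usage_count counts the VMs that exist; virtualization extensions
 * are enabled on all cpus while at least one VM is alive and disabled
 * again when the last VM goes away.
 */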
10474ae8
AG
1846static void hardware_disable_all_nolock(void)
1847{
1848 BUG_ON(!kvm_usage_count);
1849
1850 kvm_usage_count--;
1851 if (!kvm_usage_count)
1852 on_each_cpu(hardware_disable, NULL, 1);
1853}
1854
1855static void hardware_disable_all(void)
1856{
1857 spin_lock(&kvm_lock);
1858 hardware_disable_all_nolock();
1859 spin_unlock(&kvm_lock);
1860}
1861
1862static int hardware_enable_all(void)
1863{
1864 int r = 0;
1865
1866 spin_lock(&kvm_lock);
1867
1868 kvm_usage_count++;
1869 if (kvm_usage_count == 1) {
1870 atomic_set(&hardware_enable_failed, 0);
1871 on_each_cpu(hardware_enable, NULL, 1);
1872
1873 if (atomic_read(&hardware_enable_failed)) {
1874 hardware_disable_all_nolock();
1875 r = -EBUSY;
1876 }
1877 }
1878
1879 spin_unlock(&kvm_lock);
1880
1881 return r;
1882}
1883
774c47f1
AK
1884static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1885 void *v)
1886{
1887 int cpu = (long)v;
1888
10474ae8
AG
1889 if (!kvm_usage_count)
1890 return NOTIFY_OK;
1891
1a6f4d7f 1892 val &= ~CPU_TASKS_FROZEN;
774c47f1 1893 switch (val) {
cec9ad27 1894 case CPU_DYING:
6ec8a856
AK
1895 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1896 cpu);
1897 hardware_disable(NULL);
1898 break;
774c47f1 1899 case CPU_UP_CANCELED:
43934a38
JK
1900 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1901 cpu);
8691e5a8 1902 smp_call_function_single(cpu, hardware_disable, NULL, 1);
774c47f1 1903 break;
43934a38
JK
1904 case CPU_ONLINE:
1905 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1906 cpu);
8691e5a8 1907 smp_call_function_single(cpu, hardware_enable, NULL, 1);
774c47f1
AK
1908 break;
1909 }
1910 return NOTIFY_OK;
1911}
1912
4ecac3fd
AK
1913
1914asmlinkage void kvm_handle_fault_on_reboot(void)
1915{
1916 if (kvm_rebooting)
1917 /* spin while reset goes on */
1918 while (true)
1919 ;
1920 /* Fault while not rebooting. We want the trace. */
1921 BUG();
1922}
1923EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
1924
9a2b85c6 1925static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
d77c26fc 1926 void *v)
9a2b85c6 1927{
8e1c1815
SY
1928 /*
1929 * Some (well, at least mine) BIOSes hang on reboot if
1930 * in vmx root mode.
1931 *
1932 * And Intel TXT required VMX off for all cpu when system shutdown.
1933 */
1934 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1935 kvm_rebooting = true;
1936 on_each_cpu(hardware_disable, NULL, 1);
9a2b85c6
RR
1937 return NOTIFY_OK;
1938}
1939
1940static struct notifier_block kvm_reboot_notifier = {
1941 .notifier_call = kvm_reboot,
1942 .priority = 0,
1943};
1944
e93f8a0f 1945static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2eeb2e94
GH
1946{
1947 int i;
1948
1949 for (i = 0; i < bus->dev_count; i++) {
1950 struct kvm_io_device *pos = bus->devs[i];
1951
1952 kvm_iodevice_destructor(pos);
1953 }
e93f8a0f 1954 kfree(bus);
2eeb2e94
GH
1955}
1956
bda9020e 1957/* kvm_io_bus_write - called under kvm->slots_lock */
e93f8a0f 1958int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
bda9020e 1959 int len, const void *val)
2eeb2e94
GH
1960{
1961 int i;
e93f8a0f 1962 struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
bda9020e
MT
1963 for (i = 0; i < bus->dev_count; i++)
1964 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
1965 return 0;
1966 return -EOPNOTSUPP;
1967}
2eeb2e94 1968
bda9020e 1969/* kvm_io_bus_read - called under kvm->slots_lock */
e93f8a0f
MT
1970int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
1971 int len, void *val)
bda9020e
MT
1972{
1973 int i;
e93f8a0f
MT
1974 struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
1975
bda9020e
MT
1976 for (i = 0; i < bus->dev_count; i++)
1977 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
1978 return 0;
1979 return -EOPNOTSUPP;
2eeb2e94
GH
1980}
1981
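kvm_io_bus_write() and kvm_io_bus_read() simply walk the registered devices and stop at the first one that accepts the access, falling back to -EOPNOTSUPP when nobody claims it. The standalone sketch below shows the same first-match dispatch loop; the handler table and address ranges are made up for illustration.

/*
 * Sketch of the "try each handler until one accepts" dispatch used by
 * kvm_io_bus_write()/kvm_io_bus_read().  The devices and address ranges
 * here are hypothetical.
 */
#include <stdio.h>

#define NOT_HANDLED -1

typedef int (*io_handler)(unsigned long addr, int len, const void *val);

static int serial_write(unsigned long addr, int len, const void *val)
{
        return (addr >= 0x3f8 && addr < 0x400) ? 0 : NOT_HANDLED;
}

static int rtc_write(unsigned long addr, int len, const void *val)
{
        return (addr == 0x70 || addr == 0x71) ? 0 : NOT_HANDLED;
}

static io_handler handlers[] = { serial_write, rtc_write };

static int bus_write(unsigned long addr, int len, const void *val)
{
        unsigned int i;

        for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
                if (!handlers[i](addr, len, val))
                        return 0;       /* a device claimed the access */
        return NOT_HANDLED;             /* -EOPNOTSUPP in the kernel */
}

int main(void)
{
        char byte = 'x';

        printf("0x3f8: %s\n", bus_write(0x3f8, 1, &byte) ? "unhandled" : "handled");
        printf("0x80:  %s\n", bus_write(0x80, 1, &byte) ? "unhandled" : "handled");
        return 0;
}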
79fac95e 1982/* Caller must hold slots_lock. */
e93f8a0f
MT
1983int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
1984 struct kvm_io_device *dev)
6c474694 1985{
e93f8a0f 1986 struct kvm_io_bus *new_bus, *bus;
090b7aff 1987
e93f8a0f 1988 bus = kvm->buses[bus_idx];
090b7aff
GH
1989 if (bus->dev_count > NR_IOBUS_DEVS-1)
1990 return -ENOSPC;
2eeb2e94 1991
e93f8a0f
MT
1992 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
1993 if (!new_bus)
1994 return -ENOMEM;
1995 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
1996 new_bus->devs[new_bus->dev_count++] = dev;
1997 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
1998 synchronize_srcu_expedited(&kvm->srcu);
1999 kfree(bus);
090b7aff
GH
2000
2001 return 0;
2002}
2003
79fac95e 2004/* Caller must hold slots_lock. */
e93f8a0f
MT
2005int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2006 struct kvm_io_device *dev)
090b7aff 2007{
e93f8a0f
MT
2008 int i, r;
2009 struct kvm_io_bus *new_bus, *bus;
090b7aff 2010
e93f8a0f
MT
2011 new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2012 if (!new_bus)
2013 return -ENOMEM;
090b7aff 2014
e93f8a0f
MT
2015 bus = kvm->buses[bus_idx];
2016 memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2017
2018 r = -ENOENT;
2019 for (i = 0; i < new_bus->dev_count; i++)
2020 if (new_bus->devs[i] == dev) {
2021 r = 0;
2022 new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
090b7aff
GH
2023 break;
2024 }
e93f8a0f
MT
2025
2026 if (r) {
2027 kfree(new_bus);
2028 return r;
2029 }
2030
2031 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2032 synchronize_srcu_expedited(&kvm->srcu);
2033 kfree(bus);
2034 return r;
2eeb2e94
GH
2035}
2036
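Both kvm_io_bus_register_dev() and kvm_io_bus_unregister_dev() follow the same read-copy-update sequence: allocate a new bus, copy the old one, apply the change to the copy, publish the new pointer, wait for readers with synchronize_srcu_expedited(), and only then free the old bus. Below is a simplified single-writer sketch of that sequence using C11 atomics as a stand-in; plain userspace C has no SRCU, so the wait-for-readers step appears only as a comment, and the struct layout is invented.

/*
 * Copy / modify / publish / reclaim, as in kvm_io_bus_register_dev().
 * Readers always see either the old or the new snapshot, never a
 * half-updated one.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_DEVS 8

struct bus {
        int ndevs;
        int devs[MAX_DEVS];     /* stand-in for struct kvm_io_device * */
};

static _Atomic(struct bus *) the_bus;

static int bus_add_dev(int dev)
{
        struct bus *old = atomic_load(&the_bus);
        struct bus *new;

        if (old->ndevs >= MAX_DEVS)
                return -1;                      /* -ENOSPC in the kernel */

        new = malloc(sizeof(*new));
        if (!new)
                return -1;                      /* -ENOMEM in the kernel */
        memcpy(new, old, sizeof(*new));         /* copy ... */
        new->devs[new->ndevs++] = dev;          /* ... modify the copy ... */
        atomic_store(&the_bus, new);            /* ... publish atomically */
        /* kernel: synchronize_srcu_expedited() waits for readers here */
        free(old);                              /* ... then reclaim the old copy */
        return 0;
}

int main(void)
{
        the_bus = calloc(1, sizeof(struct bus));
        bus_add_dev(42);
        printf("bus has %d device(s)\n", atomic_load(&the_bus)->ndevs);
        free(atomic_load(&the_bus));
        return 0;
}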
774c47f1
AK
2037static struct notifier_block kvm_cpu_notifier = {
2038 .notifier_call = kvm_cpu_hotplug,
2039 .priority = 20, /* must be > scheduler priority */
2040};
2041
8b88b099 2042static int vm_stat_get(void *_offset, u64 *val)
ba1389b7
AK
2043{
2044 unsigned offset = (long)_offset;
ba1389b7
AK
2045 struct kvm *kvm;
2046
8b88b099 2047 *val = 0;
ba1389b7
AK
2048 spin_lock(&kvm_lock);
2049 list_for_each_entry(kvm, &vm_list, vm_list)
8b88b099 2050 *val += *(u32 *)((void *)kvm + offset);
ba1389b7 2051 spin_unlock(&kvm_lock);
8b88b099 2052 return 0;
ba1389b7
AK
2053}
2054
2055DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2056
8b88b099 2057static int vcpu_stat_get(void *_offset, u64 *val)
1165f5fe
AK
2058{
2059 unsigned offset = (long)_offset;
1165f5fe
AK
2060 struct kvm *kvm;
2061 struct kvm_vcpu *vcpu;
2062 int i;
2063
8b88b099 2064 *val = 0;
1165f5fe
AK
2065 spin_lock(&kvm_lock);
2066 list_for_each_entry(kvm, &vm_list, vm_list)
988a2cae
GN
2067 kvm_for_each_vcpu(i, vcpu, kvm)
2068 *val += *(u32 *)((void *)vcpu + offset);
2069
1165f5fe 2070 spin_unlock(&kvm_lock);
8b88b099 2071 return 0;
1165f5fe
AK
2072}
2073
ba1389b7
AK
2074DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2075
828c0950 2076static const struct file_operations *stat_fops[] = {
ba1389b7
AK
2077 [KVM_STAT_VCPU] = &vcpu_stat_fops,
2078 [KVM_STAT_VM] = &vm_stat_fops,
2079};
1165f5fe 2080
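vm_stat_get() and vcpu_stat_get() treat the debugfs private data as a byte offset into struct kvm or struct kvm_vcpu and sum that u32 counter across every instance on vm_list. Here is a self-contained sketch of the same offset-based aggregation; struct vm and its fields are hypothetical, standing in for the offsets that the kernel takes from debugfs_entries[].

/*
 * Offset-based stat aggregation, as in vm_stat_get()/vcpu_stat_get().
 * "struct vm" and its fields are invented for the example.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct vm {
        uint32_t mmu_pages;
        uint32_t remote_tlb_flush;
};

static uint64_t sum_stat(struct vm *vms, int n, size_t offset)
{
        uint64_t val = 0;
        int i;

        for (i = 0; i < n; i++)
                val += *(uint32_t *)((char *)&vms[i] + offset);
        return val;
}

int main(void)
{
        struct vm vms[2] = { { 10, 3 }, { 7, 5 } };

        printf("remote_tlb_flush total: %llu\n",
               (unsigned long long)sum_stat(vms, 2,
                        offsetof(struct vm, remote_tlb_flush)));
        return 0;
}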
a16b043c 2081static void kvm_init_debug(void)
6aa8b732
AK
2082{
2083 struct kvm_stats_debugfs_item *p;
2084
76f7c879 2085 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6aa8b732 2086 for (p = debugfs_entries; p->name; ++p)
76f7c879 2087 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
1165f5fe 2088 (void *)(long)p->offset,
ba1389b7 2089 stat_fops[p->kind]);
6aa8b732
AK
2090}
2091
2092static void kvm_exit_debug(void)
2093{
2094 struct kvm_stats_debugfs_item *p;
2095
2096 for (p = debugfs_entries; p->name; ++p)
2097 debugfs_remove(p->dentry);
76f7c879 2098 debugfs_remove(kvm_debugfs_dir);
6aa8b732
AK
2099}
2100
59ae6c6b
AK
2101static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2102{
10474ae8
AG
2103 if (kvm_usage_count)
2104 hardware_disable(NULL);
59ae6c6b
AK
2105 return 0;
2106}
2107
2108static int kvm_resume(struct sys_device *dev)
2109{
10474ae8
AG
2110 if (kvm_usage_count)
2111 hardware_enable(NULL);
59ae6c6b
AK
2112 return 0;
2113}
2114
2115static struct sysdev_class kvm_sysdev_class = {
af5ca3f4 2116 .name = "kvm",
59ae6c6b
AK
2117 .suspend = kvm_suspend,
2118 .resume = kvm_resume,
2119};
2120
2121static struct sys_device kvm_sysdev = {
2122 .id = 0,
2123 .cls = &kvm_sysdev_class,
2124};
2125
cea7bb21 2126struct page *bad_page;
35149e21 2127pfn_t bad_pfn;
6aa8b732 2128
15ad7146
AK
2129static inline
2130struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2131{
2132 return container_of(pn, struct kvm_vcpu, preempt_notifier);
2133}
2134
2135static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2136{
2137 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2138
e9b11c17 2139 kvm_arch_vcpu_load(vcpu, cpu);
15ad7146
AK
2140}
2141
2142static void kvm_sched_out(struct preempt_notifier *pn,
2143 struct task_struct *next)
2144{
2145 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2146
e9b11c17 2147 kvm_arch_vcpu_put(vcpu);
15ad7146
AK
2148}
2149
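preempt_notifier_to_vcpu() recovers the enclosing struct kvm_vcpu from a pointer to its embedded preempt_notifier, which is the classic container_of() idiom. A standalone illustration, with invented struct names, is below.

/*
 * container_of(): recover the enclosing object from a pointer to one of
 * its embedded members.  Only the technique mirrors the kernel code;
 * the structs here are made up.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct notifier {
        int dummy;
};

struct vcpu {
        int id;
        struct notifier preempt_notifier;
};

static struct vcpu *notifier_to_vcpu(struct notifier *pn)
{
        return container_of(pn, struct vcpu, preempt_notifier);
}

int main(void)
{
        struct vcpu v = { .id = 7 };
        struct notifier *pn = &v.preempt_notifier;

        printf("vcpu id = %d\n", notifier_to_vcpu(pn)->id);
        return 0;
}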
f8c16bba 2150int kvm_init(void *opaque, unsigned int vcpu_size,
c16f862d 2151 struct module *module)
6aa8b732
AK
2152{
2153 int r;
002c7f7c 2154 int cpu;
6aa8b732 2155
f8c16bba
ZX
2156 r = kvm_arch_init(opaque);
2157 if (r)
d2308784 2158 goto out_fail;
cb498ea2
ZX
2159
2160 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2161
2162 if (bad_page == NULL) {
2163 r = -ENOMEM;
2164 goto out;
2165 }
2166
35149e21
AL
2167 bad_pfn = page_to_pfn(bad_page);
2168
8437a617 2169 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
7f59f492
RR
2170 r = -ENOMEM;
2171 goto out_free_0;
2172 }
2173
e9b11c17 2174 r = kvm_arch_hardware_setup();
6aa8b732 2175 if (r < 0)
7f59f492 2176 goto out_free_0a;
6aa8b732 2177
002c7f7c
YS
2178 for_each_online_cpu(cpu) {
2179 smp_call_function_single(cpu,
e9b11c17 2180 kvm_arch_check_processor_compat,
8691e5a8 2181 &r, 1);
002c7f7c 2182 if (r < 0)
d2308784 2183 goto out_free_1;
002c7f7c
YS
2184 }
2185
774c47f1
AK
2186 r = register_cpu_notifier(&kvm_cpu_notifier);
2187 if (r)
d2308784 2188 goto out_free_2;
6aa8b732
AK
2189 register_reboot_notifier(&kvm_reboot_notifier);
2190
59ae6c6b
AK
2191 r = sysdev_class_register(&kvm_sysdev_class);
2192 if (r)
d2308784 2193 goto out_free_3;
59ae6c6b
AK
2194
2195 r = sysdev_register(&kvm_sysdev);
2196 if (r)
d2308784 2197 goto out_free_4;
59ae6c6b 2198
c16f862d
RR
2199 /* A kmem cache lets us meet the alignment requirements of fx_save. */
2200 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
56919c5c
JP
2201 __alignof__(struct kvm_vcpu),
2202 0, NULL);
c16f862d
RR
2203 if (!kvm_vcpu_cache) {
2204 r = -ENOMEM;
d2308784 2205 goto out_free_5;
c16f862d
RR
2206 }
2207
6aa8b732 2208 kvm_chardev_ops.owner = module;
3d3aab1b
CB
2209 kvm_vm_fops.owner = module;
2210 kvm_vcpu_fops.owner = module;
6aa8b732
AK
2211
2212 r = misc_register(&kvm_dev);
2213 if (r) {
d77c26fc 2214 printk(KERN_ERR "kvm: misc device register failed\n");
6aa8b732
AK
2215 goto out_free;
2216 }
2217
15ad7146
AK
2218 kvm_preempt_ops.sched_in = kvm_sched_in;
2219 kvm_preempt_ops.sched_out = kvm_sched_out;
2220
0ea4ed8e
DW
2221 kvm_init_debug();
2222
c7addb90 2223 return 0;
6aa8b732
AK
2224
2225out_free:
c16f862d 2226 kmem_cache_destroy(kvm_vcpu_cache);
d2308784 2227out_free_5:
59ae6c6b 2228 sysdev_unregister(&kvm_sysdev);
d2308784 2229out_free_4:
59ae6c6b 2230 sysdev_class_unregister(&kvm_sysdev_class);
d2308784 2231out_free_3:
6aa8b732 2232 unregister_reboot_notifier(&kvm_reboot_notifier);
774c47f1 2233 unregister_cpu_notifier(&kvm_cpu_notifier);
d2308784 2234out_free_2:
d2308784 2235out_free_1:
e9b11c17 2236 kvm_arch_hardware_unsetup();
7f59f492
RR
2237out_free_0a:
2238 free_cpumask_var(cpus_hardware_enabled);
d2308784
ZX
2239out_free_0:
2240 __free_page(bad_page);
ca45aaae 2241out:
f8c16bba 2242 kvm_arch_exit();
d2308784 2243out_fail:
6aa8b732
AK
2244 return r;
2245}
cb498ea2 2246EXPORT_SYMBOL_GPL(kvm_init);
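kvm_init() acquires its resources in a fixed order and the out_free_* labels release them in exactly the reverse order, so every failure path undoes only what has already succeeded. The following compact sketch, with placeholder setup/teardown functions, shows the same goto-unwind idiom.

/*
 * Reverse-order goto unwinding, as in kvm_init()'s out_free_* labels.
 * setup_a/b/c and teardown_a/b are placeholders.
 */
#include <stdio.h>

static int setup_a(void) { printf("setup a\n"); return 0; }
static int setup_b(void) { printf("setup b\n"); return 0; }
static int setup_c(void) { printf("setup c\n"); return -1; /* simulate failure */ }
static void teardown_a(void) { printf("teardown a\n"); }
static void teardown_b(void) { printf("teardown b\n"); }

static int init_all(void)
{
        int r;

        r = setup_a();
        if (r)
                goto out;
        r = setup_b();
        if (r)
                goto out_undo_a;
        r = setup_c();
        if (r)
                goto out_undo_b;
        return 0;

out_undo_b:
        teardown_b();
out_undo_a:
        teardown_a();
out:
        return r;
}

int main(void)
{
        return init_all() ? 1 : 0;
}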
6aa8b732 2247
cb498ea2 2248void kvm_exit(void)
6aa8b732 2249{
229456fc 2250 tracepoint_synchronize_unregister();
0ea4ed8e 2251 kvm_exit_debug();
6aa8b732 2252 misc_deregister(&kvm_dev);
c16f862d 2253 kmem_cache_destroy(kvm_vcpu_cache);
59ae6c6b
AK
2254 sysdev_unregister(&kvm_sysdev);
2255 sysdev_class_unregister(&kvm_sysdev_class);
6aa8b732 2256 unregister_reboot_notifier(&kvm_reboot_notifier);
59ae6c6b 2257 unregister_cpu_notifier(&kvm_cpu_notifier);
15c8b6c1 2258 on_each_cpu(hardware_disable, NULL, 1);
e9b11c17 2259 kvm_arch_hardware_unsetup();
f8c16bba 2260 kvm_arch_exit();
7f59f492 2261 free_cpumask_var(cpus_hardware_enabled);
cea7bb21 2262 __free_page(bad_page);
6aa8b732 2263}
cb498ea2 2264EXPORT_SYMBOL_GPL(kvm_exit);