#include "vmx.h"
#include "kvm.h"
+#include "x86.h"
#include <linux/types.h>
#include <linux/string.h>
#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
+ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
#define PT64_LEVEL_MASK(level) \
(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
#define PT32_LEVEL_BITS 10
#define PT32_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
+ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+ | PT64_NX_MASK)
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+/*
+ * Record the two shadow pte values that this file compares against when
+ * deciding whether a shadow pte is present: @trap_pte is stored in
+ * shadow_trap_nonpresent_pte and @notrap_pte in
+ * shadow_notrap_nonpresent_pte.
+ */
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+	shadow_notrap_nonpresent_pte = notrap_pte;
+	shadow_trap_nonpresent_pte = trap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
static int is_write_protection(struct kvm_vcpu *vcpu)
{
- return vcpu->cr0 & CR0_WP_MASK;
+ return vcpu->cr0 & X86_CR0_WP; /* non-zero when the X86_CR0_WP bit is set in vcpu->cr0 */
}
static int is_cpuid_PSE36(void)
return pte & PT_PRESENT_MASK;
}
+/*
+ * Return 1 when @pte, with the PT_SHADOW_IO_MARK bit cleared, differs from
+ * both shadow_trap_nonpresent_pte and shadow_notrap_nonpresent_pte;
+ * return 0 when it equals either of them.
+ */
+static int is_shadow_present_pte(u64 pte)
+{
+	u64 stripped = pte & ~PT_SHADOW_IO_MARK;
+
+	if (stripped == shadow_trap_nonpresent_pte)
+		return 0;
+	if (stripped == shadow_notrap_nonpresent_pte)
+		return 0;
+	return 1;
+}
+
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
+/* Return non-zero when the PT_DIRTY_MASK bit is set in @pte. */
+static int is_dirty_pte(unsigned long pte)
+{
+	unsigned long dirty_bits = pte & PT_DIRTY_MASK;
+
+	return dirty_bits;
+}
+
static int is_io_pte(unsigned long pte)
{
return pte & PT_SHADOW_IO_MARK;
static int is_rmap_pte(u64 pte)
{
- return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
- == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
+ return pte != shadow_trap_nonpresent_pte /* true unless pte equals the trapping non-present value ... */
+ && pte != shadow_notrap_nonpresent_pte; /* ... or the non-trapping one; PT_SHADOW_IO_MARK is not masked off here */
}
static void set_shadow_pte(u64 *sptep, u64 spte)
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min,
- gfp_t gfp_flags)
+ struct kmem_cache *base_cache, int min)
{
void *obj;
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, gfp_flags);
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
if (!obj)
return -ENOMEM;
cache->objects[cache->nobjs++] = obj;
}
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min, gfp_t gfp_flags)
+ int min)
{
struct page *page;
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = alloc_page(gfp_flags);
+ page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
set_page_private(page, 0);
free_page((unsigned long)mc->objects[--mc->nobjs]);
}
-static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) /* refill the vcpu's four mmu object caches; returns 0 or the first non-zero topup result */
{
int r;
+ kvm_mmu_free_some_pages(vcpu); /* runs before any of the caches is refilled */
r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
- pte_chain_cache, 4, gfp_flags);
+ pte_chain_cache, 4);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
- rmap_desc_cache, 1, gfp_flags);
+ rmap_desc_cache, 1);
if (r)
goto out;
- r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4, gfp_flags);
+ r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 8); /* minimum raised from 4 to 8; NOTE(review): presumably because each shadow page now also takes a gfns page from this cache - confirm */
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
- mmu_page_header_cache, 4, gfp_flags);
+ mmu_page_header_cache, 4);
out:
return r;
}
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
- if (r < 0) {
- spin_unlock(&vcpu->kvm->lock);
- kvm_arch_ops->vcpu_put(vcpu);
- r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
- kvm_arch_ops->vcpu_load(vcpu);
- spin_lock(&vcpu->kvm->lock);
- }
- return r;
-}
-
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
kfree(rd);
}
+/*
+ * Take a gfn and return a pointer to its reverse-mapping slot.
+ * Note: gfn must be unaliased before this function is called.
+ * NOTE(review): the memslot returned by gfn_to_memslot() is dereferenced
+ * without a NULL check - confirm that callers only pass gfns that are
+ * backed by a slot.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
+	gfn_t offset = gfn - memslot->base_gfn;
+
+	return memslot->rmap + offset;
+}
+
/*
* Reverse mapping data structures:
*
- * If page->private bit zero is zero, then page->private points to the
- * shadow page table entry that points to page_address(page).
+ * If bit zero of *rmapp is zero, then *rmapp points to the shadow page
+ * table entry that points to page_address(page).
*
- * If page->private bit zero is one, (then page->private & ~1) points
- * to a struct kvm_rmap_desc containing more mappings.
+ * If bit zero of *rmapp is one, then (*rmapp & ~1) points to a struct
+ * kvm_rmap_desc containing more mappings.
*/
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
- struct page *page;
+ struct kvm_mmu_page *page;
struct kvm_rmap_desc *desc;
+ unsigned long *rmapp;
int i;
if (!is_rmap_pte(*spte))
return;
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
+ gfn = unalias_gfn(vcpu->kvm, gfn);
+ page = page_header(__pa(spte));
+ page->gfns[spte - page->spt] = gfn;
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+ if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- set_page_private(page,(unsigned long)spte);
- } else if (!(page_private(page) & 1)) {
+ *rmapp = (unsigned long)spte;
+ } else if (!(*rmapp & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)page_private(page);
+ desc->shadow_ptes[0] = (u64 *)*rmapp;
desc->shadow_ptes[1] = spte;
- set_page_private(page,(unsigned long)desc | 1);
+ *rmapp = (unsigned long)desc | 1;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
if (desc->shadow_ptes[RMAP_EXT-1]) {
}
}
-static void rmap_desc_remove_entry(struct page *page,
+static void rmap_desc_remove_entry(unsigned long *rmapp,
struct kvm_rmap_desc *desc,
int i,
struct kvm_rmap_desc *prev_desc)
if (j != 0)
return;
if (!prev_desc && !desc->more)
- set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
+ *rmapp = (unsigned long)desc->shadow_ptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
- set_page_private(page,(unsigned long)desc->more | 1);
+ *rmapp = (unsigned long)desc->more | 1;
mmu_free_rmap_desc(desc);
}
-static void rmap_remove(u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *spte)
{
- struct page *page;
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
+ struct kvm_mmu_page *page;
+ struct page *release_page;
+ unsigned long *rmapp;
int i;
if (!is_rmap_pte(*spte))
return;
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
+ page = page_header(__pa(spte));
+ release_page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ if (is_writeble_pte(*spte))
+ kvm_release_page_dirty(release_page);
+ else
+ kvm_release_page_clean(release_page);
+ rmapp = gfn_to_rmap(kvm, page->gfns[spte - page->spt]);
+ if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
- } else if (!(page_private(page) & 1)) {
+ } else if (!(*rmapp & 1)) {
rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)page_private(page) != spte) {
+ if ((u64 *)*rmapp != spte) {
printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
spte, *spte);
BUG();
}
- set_page_private(page,0);
+ *rmapp = 0;
} else {
rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(page,
+ rmap_desc_remove_entry(rmapp,
desc, i,
prev_desc);
return;
}
}
-static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+/*
+ * Step through the shadow ptes recorded in the reverse-map slot @rmapp.
+ *
+ * Pass @spte == NULL to obtain the first entry, or a previously returned
+ * entry to obtain the one stored after it.  Returns NULL when the slot is
+ * empty or when no entry follows @spte.  @kvm is not used by the body.
+ */
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
- struct kvm *kvm = vcpu->kvm;
- struct page *page;
struct kvm_rmap_desc *desc;
+	u64 *prev_spte;
+	int i;
+
+	if (!*rmapp)
+		return NULL;
+	else if (!(*rmapp & 1)) {
+		/* bit zero clear: *rmapp itself is the one and only spte */
+		if (!spte)
+			return (u64 *)*rmapp;
+		return NULL;
+	}
+	/* bit zero set: the remaining bits point to a kvm_rmap_desc chain */
+	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+	prev_spte = NULL;
+	while (desc) {
+		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+			/* hand back the entry visited right after @spte */
+			if (prev_spte == spte)
+				return desc->shadow_ptes[i];
+			prev_spte = desc->shadow_ptes[i];
+		}
+		desc = desc->more;
+	}
+	return NULL;
+}
+
+/*
+ * Clear PT_WRITABLE_MASK in every shadow pte recorded in the reverse map
+ * of @gfn (after unaliasing it).  Remote TLBs are flushed once, after the
+ * walk, and only when at least one shadow pte was actually changed,
+ * instead of once per visited entry.
+ */
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	unsigned long *rmapp;
u64 *spte;
+	int write_protected = 0;
- page = gfn_to_page(kvm, gfn);
- BUG_ON(!page);
+	gfn = unalias_gfn(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn);
- while (page_private(page)) {
- if (!(page_private(page) & 1))
- spte = (u64 *)page_private(page);
- else {
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- spte = desc->shadow_ptes[0];
- }
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
BUG_ON(!spte);
- BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
- != page_to_pfn(page));
BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- rmap_remove(spte);
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
- kvm_flush_remote_tlbs(vcpu->kvm);
+		if (is_writeble_pte(*spte)) {
+			set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+			/* remember that a flush is owed; issue it after the walk */
+			write_protected = 1;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
}
+	if (write_protected)
+		kvm_flush_remote_tlbs(kvm);
}
u64 *end;
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (*pos != 0) {
+ if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
pos, *pos);
return 0;
ASSERT(is_empty_shadow_page(page_head->spt));
list_del(&page_head->link);
__free_page(virt_to_page(page_head->spt));
+ __free_page(virt_to_page(page_head->gfns));
kfree(page_head);
++kvm->n_free_mmu_pages;
}
page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
sizeof *page);
page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
+ page->gfns = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(page->spt), (unsigned long)page);
list_add(&page->link, &vcpu->kvm->active_mmu_pages);
ASSERT(is_empty_shadow_page(page->spt));
BUG();
}
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm,
gfn_t gfn)
{
unsigned index;
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
+ bucket = &kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && !page->role.metaphysical) {
pgprintk("%s: found role %x\n",
page->gfn = gfn;
page->role = role;
hlist_add_head(&page->hash_link, bucket);
+ vcpu->mmu.prefetch_page(vcpu, page);
if (!metaphysical)
- rmap_write_protect(vcpu, gfn);
+ rmap_write_protect(vcpu->kvm, gfn);
return page;
}
if (page->role.level == PT_PAGE_TABLE_LEVEL) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] & PT_PRESENT_MASK)
- rmap_remove(&pt[i]);
- pt[i] = 0;
+ if (is_shadow_present_pte(pt[i]))
+ rmap_remove(kvm, &pt[i]);
+ pt[i] = shadow_trap_nonpresent_pte;
}
kvm_flush_remote_tlbs(kvm);
return;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
- pt[i] = 0;
- if (!(ent & PT_PRESENT_MASK))
+ pt[i] = shadow_trap_nonpresent_pte;
+ if (!is_shadow_present_pte(ent))
continue;
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
mmu_page_remove_parent_pte(page, parent_pte);
}
+/* Set last_pte_updated to NULL on every populated entry of kvm->vcpus[]. */
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+	int n;
+
+	for (n = 0; n < KVM_MAX_VCPUS; n++) {
+		if (!kvm->vcpus[n])
+			continue;
+		kvm->vcpus[n]->last_pte_updated = NULL;
+	}
+}
+
static void kvm_mmu_zap_page(struct kvm *kvm,
struct kvm_mmu_page *page)
{
u64 *parent_pte;
+ ++kvm->stat.mmu_shadow_zapped;
while (page->multimapped || page->parent_pte) {
if (!page->multimapped)
parent_pte = page->parent_pte;
}
BUG_ON(!parent_pte);
kvm_mmu_put_page(page, parent_pte);
- set_shadow_pte(parent_pte, 0);
+ set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
}
kvm_mmu_page_unlink_children(kvm, page);
if (!page->root_count) {
kvm_mmu_free_page(kvm, page);
} else
list_move(&page->link, &kvm->active_mmu_pages);
+ kvm_mmu_reset_last_pte_updated(kvm);
}
-static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+/*
+ * Change the number of mmu pages allocated to the vm.
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+{
+	if ((kvm->n_alloc_mmu_pages - kvm->n_free_mmu_pages) <=
+	    kvm_nr_mmu_pages) {
+		/*
+		 * The pages currently in use fit within the new limit:
+		 * adjust the free count by the change in the limit.
+		 */
+		kvm->n_free_mmu_pages += kvm_nr_mmu_pages
+					 - kvm->n_alloc_mmu_pages;
+	} else {
+		/*
+		 * The new limit is smaller than the number of active pages,
+		 * so mmu pages must be zapped before the value is changed.
+		 */
+		int in_use = kvm->n_alloc_mmu_pages - kvm->n_free_mmu_pages;
+
+		for (; in_use > kvm_nr_mmu_pages; in_use--) {
+			struct kvm_mmu_page *victim;
+
+			/* always take the page at the tail of the active list */
+			victim = container_of(kvm->active_mmu_pages.prev,
+					      struct kvm_mmu_page, link);
+			kvm_mmu_zap_page(kvm, victim);
+		}
+		kvm->n_free_mmu_pages = 0;
+	}
+
+	kvm->n_alloc_mmu_pages = kvm_nr_mmu_pages;
+}
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
struct hlist_head *bucket;
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
r = 0;
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
+ bucket = &kvm->mmu_page_hash[index];
hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
if (page->gfn == gfn && !page->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
+ kvm_mmu_zap_page(kvm, page);
r = 1;
}
return r;
}
-static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) /* zap every shadow page that kvm_mmu_lookup_page() returns for gfn */
{
struct kvm_mmu_page *page;
- while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
+ while ((page = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
pgprintk("%s: zap %lx %x\n",
__FUNCTION__, gfn, page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
+ kvm_mmu_zap_page(kvm, page); /* the loop ends once the lookup returns NULL */
}
}
__set_bit(slot, &page_head->slot_bitmap);
}
-hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
- return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
-}
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+hpa_t gpa_to_hpa(struct kvm *kvm, gpa_t gpa) /* translate guest physical gpa to a host physical address through gfn_to_page() */
{
struct page *page;
+ hpa_t hpa;
ASSERT((gpa & HPA_ERR_MASK) == 0);
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!page)
- return gpa | HPA_ERR_MASK;
- return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
- | (gpa & (PAGE_SIZE-1));
+ page = gfn_to_page(kvm, gpa >> PAGE_SHIFT); /* NOTE(review): result is used without a NULL check; presumably gfn_to_page() now returns an error page instead of NULL - confirm */
+ hpa = ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | (gpa & (PAGE_SIZE-1)); /* frame address of the page plus the offset of gpa within its page */
+ if (is_error_page(page))
+ return hpa | HPA_ERR_MASK; /* failures are reported by setting HPA_ERR_MASK in the returned address */
+ return hpa;
}
hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return UNMAPPED_GVA;
- return gpa_to_hpa(vcpu, gpa);
+ return gpa_to_hpa(vcpu->kvm, gpa);
}
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return NULL;
- return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
+ return pfn_to_page(gpa_to_hpa(vcpu->kvm, gpa) >> PAGE_SHIFT);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->mmu.root_hpa;
+ struct page *page;
+ page = pfn_to_page(p >> PAGE_SHIFT);
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
u64 *table;
table = __va(table_addr);
if (level == 1) {
+ int was_rmapped;
+
pte = table[index];
- if (is_present_pte(pte) && is_writeble_pte(pte))
+ was_rmapped = is_rmap_pte(pte);
+ if (is_shadow_present_pte(pte) && is_writeble_pte(pte)) {
+ kvm_release_page_clean(page);
return 0;
+ }
mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
PT_USER_MASK;
- rmap_add(vcpu, &table[index]);
+ if (!was_rmapped)
+ rmap_add(vcpu, &table[index], v >> PAGE_SHIFT);
+ else
+ kvm_release_page_clean(page);
+
return 0;
}
- if (table[index] == 0) {
+ if (table[index] == shadow_trap_nonpresent_pte) {
struct kvm_mmu_page *new_table;
gfn_t pseudo_gfn;
>> PAGE_SHIFT;
new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
v, level - 1,
- 1, 0, &table[index]);
+ 1, 3, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_page_clean(page);
return -ENOMEM;
}
}
}
+/*
+ * Fill all PT64_ENT_PER_PAGE entries of sp->spt with
+ * shadow_trap_nonpresent_pte.  @vcpu is not used by the body.
+ */
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp)
+{
+	u64 *entry = sp->spt;
+	u64 *end = entry + PT64_ENT_PER_PAGE;
+
+	while (entry != end)
+		*entry++ = shadow_trap_nonpresent_pte;
+}
+
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
- paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
+ paddr = gpa_to_hpa(vcpu->kvm, addr & PT64_BASE_ADDR_MASK);
- if (is_error_hpa(paddr))
+ if (is_error_hpa(paddr)) {
+ kvm_release_page_clean(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT));
return 1;
+ }
return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
}
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_arch_ops->tlb_flush(vcpu);
+ kvm_x86_ops->tlb_flush(vcpu); /* the flush itself is delegated to the tlb_flush hook of kvm_x86_ops */
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
u64 addr,
u32 err_code)
{
- kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
+ kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
}
static void paging_free(struct kvm_vcpu *vcpu)
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
+ context->prefetch_page = paging64_prefetch_page;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
+ context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
destroy_kvm_mmu(vcpu);
return init_kvm_mmu(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- spin_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->lock); /* kvm->lock is taken as a mutex here; the topup below allocates with GFP_KERNEL */
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
mmu_alloc_roots(vcpu);
- kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
+ kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); /* hand vcpu->mmu.root_hpa to the set_cr3 hook, right after mmu_alloc_roots() */
kvm_mmu_flush_tlb(vcpu);
out:
- spin_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->lock); /* reached on both the success and the topup-failure path */
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
struct kvm_mmu_page *child;
pte = *spte;
- if (is_present_pte(pte)) {
+ if (is_shadow_present_pte(pte)) {
if (page->role.level == PT_PAGE_TABLE_LEVEL)
- rmap_remove(spte);
+ rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
}
}
- *spte = 0;
- kvm_flush_remote_tlbs(vcpu->kvm);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
u64 *spte,
- const void *new, int bytes)
+ const void *new, int bytes,
+ int offset_in_pte) /* forwarded unchanged to paging32_update_pte() / paging64_update_pte() */
{
- if (page->role.level != PT_PAGE_TABLE_LEVEL)
+ if (page->role.level != PT_PAGE_TABLE_LEVEL) { /* only PT_PAGE_TABLE_LEVEL pages reach the update helpers below */
+ ++vcpu->kvm->stat.mmu_pde_zapped; /* other levels are only counted, then skipped */
return;
+ }
+ ++vcpu->kvm->stat.mmu_pte_updated;
if (page->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, page, spte, new, bytes);
+ paging32_update_pte(vcpu, page, spte, new, bytes,
+ offset_in_pte);
+ else
+ paging64_update_pte(vcpu, page, spte, new, bytes,
+ offset_in_pte);
+}
+
+/*
+ * Decide whether replacing shadow pte @old with @new calls for a remote
+ * TLB flush.
+ *
+ * Returns false when @old was not shadow-present.  Returns true when @new
+ * is not shadow-present, when the two differ in PT64_BASE_ADDR_MASK bits,
+ * or when - after toggling PT64_NX_MASK in both values - some
+ * PT64_PERM_MASK bit is set in @old but clear in @new.
+ */
+static bool need_remote_flush(u64 old, u64 new)
+{
+	u64 dropped;
+
+	if (!is_shadow_present_pte(old))
+		return false;
+	if (!is_shadow_present_pte(new) ||
+	    ((old ^ new) & PT64_BASE_ADDR_MASK))
+		return true;
+
+	dropped = (old ^ PT64_NX_MASK) & ~(new ^ PT64_NX_MASK)
+		  & PT64_PERM_MASK;
+	return dropped != 0;
+}
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) /* pick the flush needed after a shadow pte changed from old to new */
+{
+ if (need_remote_flush(old, new))
+ kvm_flush_remote_tlbs(vcpu->kvm);
else
- paging64_update_pte(vcpu, page, spte, new, bytes);
+ kvm_mmu_flush_tlb(vcpu); /* otherwise only kvm_mmu_flush_tlb() is called, for this vcpu */
+}
+
+/*
+ * Return true when vcpu->last_pte_updated is non-NULL and the shadow pte
+ * it points to has PT_ACCESSED_MASK set; false otherwise.
+ */
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+	u64 *last = vcpu->last_pte_updated;
+
+	if (!last)
+		return false;
+	return (*last & PT_ACCESSED_MASK) != 0;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *old, const u8 *new, int bytes)
+ const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *page;
struct hlist_node *node, *n;
struct hlist_head *bucket;
unsigned index;
+ u64 entry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
unsigned pte_size;
int npte;
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
- if (gfn == vcpu->last_pt_write_gfn) {
+ ++vcpu->kvm->stat.mmu_pte_write;
+ kvm_mmu_audit(vcpu, "pre pte write");
+ if (gfn == vcpu->last_pt_write_gfn
+ && !last_updated_pte_accessed(vcpu)) {
++vcpu->last_pt_write_count;
if (vcpu->last_pt_write_count >= 3)
flooded = 1;
} else {
vcpu->last_pt_write_gfn = gfn;
vcpu->last_pt_write_count = 1;
+ vcpu->last_pte_updated = NULL;
}
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, page->role.word);
kvm_mmu_zap_page(vcpu->kvm, page);
+ ++vcpu->kvm->stat.mmu_flooded;
continue;
}
page_offset = offset;
}
spte = &page->spt[page_offset / sizeof(*spte)];
while (npte--) {
+ entry = *spte;
mmu_pte_write_zap_pte(vcpu, page, spte);
- mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
+ mmu_pte_write_new_pte(vcpu, page, spte, new, bytes,
+ page_offset & (pte_size - 1));
+ mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
}
+ kvm_mmu_audit(vcpu, "post pte write");
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
- return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
+ return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); /* passes the frame number (gpa >> PAGE_SHIFT) of the translated address */
}
-void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) /* zap pages from the tail of active_mmu_pages while n_free_mmu_pages is below KVM_REFILL_PAGES */
{
while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *page;
page = container_of(vcpu->kvm->active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, page);
+ ++vcpu->kvm->stat.mmu_recycled; /* counted once per zapped page */
}
}
-EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
+
+/*
+ * Handle a guest page fault at @cr2.  kvm->lock is held around the mmu
+ * page_fault callback, the cache top-up and the instruction emulation.
+ *
+ * Returns the callback's negative result or a non-zero top-up result
+ * unchanged; 1 when the callback returned 0 or when emulation ended with
+ * EMULATE_DONE or EMULATE_FAIL; 0 when emulation ended with
+ * EMULATE_DO_MMIO.
+ */
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+{
+	enum emulation_result outcome;
+	int ret;
+
+	mutex_lock(&vcpu->kvm->lock);
+
+	ret = vcpu->mmu.page_fault(vcpu, cr2, error_code);
+	if (ret <= 0) {
+		/* zero is reported as 1; negative values pass through */
+		if (ret == 0)
+			ret = 1;
+		goto unlock;
+	}
+
+	/* a positive result from the callback leads to emulation */
+	ret = mmu_topup_memory_caches(vcpu);
+	if (ret != 0)
+		goto unlock;
+
+	outcome = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+	mutex_unlock(&vcpu->kvm->lock);
+
+	if (outcome == EMULATE_DONE)
+		return 1;
+	if (outcome == EMULATE_DO_MMIO) {
+		++vcpu->stat.mmio_exits;
+		return 0;
+	}
+	if (outcome == EMULATE_FAIL) {
+		kvm_report_emulation_failure(vcpu, "pagetable");
+		return 1;
+	}
+	BUG();
+unlock:
+	mutex_unlock(&vcpu->kvm->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
-
+ if (vcpu->kvm->n_requested_mmu_pages)
+ vcpu->kvm->n_free_mmu_pages = vcpu->kvm->n_requested_mmu_pages;
+ else
+ vcpu->kvm->n_free_mmu_pages = vcpu->kvm->n_alloc_mmu_pages;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
pt = page->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK) {
- rmap_remove(&pt[i]);
+ if (pt[i] & PT_WRITABLE_MASK)
pt[i] &= ~PT_WRITABLE_MASK;
- }
}
}
return -ENOMEM;
}
+/*
+ * Calculate the number of mmu pages needed for kvm: KVM_PERMILLE_MMU_PAGES
+ * per thousand of the pages in all memory slots, but never fewer than
+ * KVM_MIN_ALLOC_MMU_PAGES.
+ */
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+	unsigned int guest_pages = 0;
+	unsigned int wanted;
+	int slot;
+
+	for (slot = 0; slot < kvm->nmemslots; slot++)
+		guest_pages += kvm->memslots[slot].npages;
+
+	wanted = guest_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+	if (wanted < (unsigned int) KVM_MIN_ALLOC_MMU_PAGES)
+		wanted = (unsigned int) KVM_MIN_ALLOC_MMU_PAGES;
+
+	return wanted;
+}
+
#ifdef AUDIT
static const char *audit_msg;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
u64 ent = pt[i];
- if (!(ent & PT_PRESENT_MASK))
+ if (ent == shadow_trap_nonpresent_pte)
continue;
va = canonicalize(va);
- if (level > 1)
+ if (level > 1) {
+ if (ent == shadow_notrap_nonpresent_pte)
+ printk(KERN_ERR "audit: (%s) nontrapping pte"
+ " in nonleaf level: levels %d gva %lx"
+ " level %d pte %llx\n", audit_msg,
+ vcpu->mmu.root_level, va, level, ent);
+
audit_mappings_page(vcpu, ent, va, level - 1);
- else {
+ } else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+ struct page *page;
- if ((ent & PT_PRESENT_MASK)
+ if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "audit error: (%s) levels %d"
- " gva %lx gpa %llx hpa %llx ent %llx\n",
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ " gva %lx gpa %llx hpa %llx ent %llx %d\n",
audit_msg, vcpu->mmu.root_level,
- va, gpa, hpa, ent);
+ va, gpa, hpa, ent,
+ is_shadow_present_pte(ent));
+ else if (ent == shadow_notrap_nonpresent_pte
+ && !is_error_hpa(hpa))
+ printk(KERN_ERR "audit: (%s) notrap shadow,"
+ " valid guest gva %lx\n", audit_msg, va);
+ page = pfn_to_page((gpa & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT);
+ kvm_release_page_clean(page);
+
}
}
}
struct kvm_rmap_desc *d;
for (j = 0; j < m->npages; ++j) {
- struct page *page = m->phys_mem[j];
+ unsigned long *rmapp = &m->rmap[j];
- if (!page->private)
+ if (!*rmapp)
continue;
- if (!(page->private & 1)) {
+ if (!(*rmapp & 1)) {
++nmaps;
continue;
}
- d = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (d) {
for (k = 0; k < RMAP_EXT; ++k)
if (d->shadow_ptes[k])
static void audit_write_protection(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *page;
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ gfn_t gfn;
list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
- hfn_t hfn;
- struct page *pg;
-
if (page->role.metaphysical)
continue;
- hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
- >> PAGE_SHIFT;
- pg = pfn_to_page(hfn);
- if (pg->private)
+ slot = gfn_to_memslot(vcpu->kvm, page->gfn);
+ gfn = unalias_gfn(vcpu->kvm, page->gfn);
+ rmapp = &slot->rmap[gfn - slot->base_gfn];
+ if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
" mappings: gfn %lx role %x\n",
__FUNCTION__, audit_msg, page->gfn,