Merge remote-tracking branch 'kvm/linux-next'

author Stephen Rothwell <sfr@canb.auug.org.au>

Tue, 13 Sep 2016 02:21:37 +0000 (12:21 +1000)

committer Stephen Rothwell <sfr@canb.auug.org.au>

Tue, 13 Sep 2016 02:21:37 +0000 (12:21 +1000)
author Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 13 Sep 2016 02:21:37 +0000 (12:21 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 13 Sep 2016 02:21:37 +0000 (12:21 +1000)
diff --combined arch/s390/kvm/kvm-s390.c

index d6e7e527f0bf3963dd8593b50969be0731ad345b,ac6c056df4b9aca4430368d94636e0ae5c51cebc..6b5c7c519e70ffcaca33018ce363f00ac249168d
--- 1/arch/s390/kvm/kvm-s390.c
--- 2/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@@ -245,33 -245,22 +245,33 @@@ static void kvm_s390_cpu_feat_init(void
                      PTFF_QAF);
   
         if (test_facility(17)) { /* MSA */
- -              __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
- -              __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
- -              __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
- -              __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
- -              __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+ +              __cpacf_query(CPACF_KMAC, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.kmac);
+ +              __cpacf_query(CPACF_KMC, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.kmc);
+ +              __cpacf_query(CPACF_KM, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.km);
+ +              __cpacf_query(CPACF_KIMD, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.kimd);
+ +              __cpacf_query(CPACF_KLMD, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.klmd);
         }
         if (test_facility(76)) /* MSA3 */
- -              __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+ +              __cpacf_query(CPACF_PCKMO, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.pckmo);
         if (test_facility(77)) { /* MSA4 */
- -              __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
- -              __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
- -              __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
- -              __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+ +              __cpacf_query(CPACF_KMCTR, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.kmctr);
+ +              __cpacf_query(CPACF_KMF, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.kmf);
+ +              __cpacf_query(CPACF_KMO, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.kmo);
+ +              __cpacf_query(CPACF_PCC, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.pcc);
         }
         if (test_facility(57)) /* MSA5 */
- -              __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+ +              __cpacf_query(CPACF_PPNO, (cpacf_mask_t *)
+ +                            kvm_s390_available_subfunc.ppno);
   
         if (MACHINE_HAS_ESOP)
                 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
@@@ -384,7 -373,9 +384,9 @@@ int kvm_vm_ioctl_check_extension(struc
         case KVM_CAP_NR_VCPUS:
         case KVM_CAP_MAX_VCPUS:
                 r = KVM_S390_BSCA_CPU_SLOTS;
-               if (sclp.has_esca && sclp.has_64bscao)
+               if (!kvm_s390_use_sca_entries())
+                       r = KVM_MAX_VCPUS;
+               else if (sclp.has_esca && sclp.has_64bscao)
                         r = KVM_S390_ESCA_CPU_SLOTS;
                 break;
         case KVM_CAP_NR_MEMSLOTS:
@@@ -1561,6 -1552,8 +1563,8 @@@ static int __kvm_ucontrol_vcpu_init(str
   
   static void sca_del_vcpu(struct kvm_vcpu *vcpu)
   {
+       if (!kvm_s390_use_sca_entries())
+               return;
         read_lock(&vcpu->kvm->arch.sca_lock);
         if (vcpu->kvm->arch.use_esca) {
                 struct esca_block *sca = vcpu->kvm->arch.sca;
@@@ -1578,6 -1571,13 +1582,13 @@@
   
   static void sca_add_vcpu(struct kvm_vcpu *vcpu)
   {
+       if (!kvm_s390_use_sca_entries()) {
+               struct bsca_block *sca = vcpu->kvm->arch.sca;
+ 
+               /* we still need the basic sca for the ipte control */
+               vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
+               vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+       }
         read_lock(&vcpu->kvm->arch.sca_lock);
         if (vcpu->kvm->arch.use_esca) {
                 struct esca_block *sca = vcpu->kvm->arch.sca;
@@@ -1658,6 -1658,11 +1669,11 @@@ static int sca_can_add_vcpu(struct kvm 
   {
         int rc;
   
+       if (!kvm_s390_use_sca_entries()) {
+               if (id < KVM_MAX_VCPUS)
+                       return true;
+               return false;
+       }
         if (id < KVM_S390_BSCA_CPU_SLOTS)
                 return true;
         if (!sclp.has_esca || !sclp.has_64bscao)
@@@ -1946,8 -1951,6 +1962,6 @@@ int kvm_arch_vcpu_setup(struct kvm_vcp
                 vcpu->arch.sie_block->eca |= 1;
         if (sclp.has_sigpif)
                 vcpu->arch.sie_block->eca |= 0x10000000U;
-       if (test_kvm_facility(vcpu->kvm, 64))
-               vcpu->arch.sie_block->ecb3 |= 0x01;
         if (test_kvm_facility(vcpu->kvm, 129)) {
                 vcpu->arch.sie_block->eca |= 0x00020000;
                 vcpu->arch.sie_block->ecd |= 0x20000000;
@@@ -2702,6 -2705,19 +2716,19 @@@ static void sync_regs(struct kvm_vcpu *
                 if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
                         kvm_clear_async_pf_completion_queue(vcpu);
         }
+       /*
+        * If userspace sets the riccb (e.g. after migration) to a valid state,
+        * we should enable RI here instead of doing the lazy enablement.
+        */
+       if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
+           test_kvm_facility(vcpu->kvm, 64)) {
+               struct runtime_instr_cb *riccb =
+                       (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+ 
+               if (riccb->valid)
+                       vcpu->arch.sie_block->ecb3 |= 0x01;
+       }
+ 
         kvm_run->kvm_dirty_regs = 0;
   }
   
@@@ -2845,38 -2861,6 +2872,6 @@@ int kvm_s390_vcpu_store_status(struct k
         return kvm_s390_store_status_unloaded(vcpu, addr);
   }
   
- /*
-  * store additional status at address
-  */
- int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
-                                       unsigned long gpa)
- {
-       /* Only bits 0-53 are used for address formation */
-       if (!(gpa & ~0x3ff))
-               return 0;
- 
-       return write_guest_abs(vcpu, gpa & ~0x3ff,
-                              (void *)&vcpu->run->s.regs.vrs, 512);
- }
- 
- int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
- {
-       if (!test_kvm_facility(vcpu->kvm, 129))
-               return 0;
- 
-       /*
-        * The guest VXRS are in the host VXRs due to the lazy
-        * copying in vcpu load/put. We can simply call save_fpu_regs()
-        * to save the current register state because we are in the
-        * middle of a load/put cycle.
-        *
-        * Let's update our copies before we save it into the save area.
-        */
-       save_fpu_regs();
- 
-       return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
- }
- 
   static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
   {
         kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
diff --combined arch/x86/kvm/svm.c

index 1e6b84b96ea697d7a047f9c6583b42807209e952,db77c1ca9e76accc86453c0288f98dcbd03c89a5..1b66c5afca840ef89ec32081aa2971aee6ccfffc
--- 1/arch/x86/kvm/svm.c
--- 2/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@@ -34,6 -34,8 +34,8 @@@
   #include <linux/sched.h>
   #include <linux/trace_events.h>
   #include <linux/slab.h>
+ #include <linux/amd-iommu.h>
+ #include <linux/hashtable.h>
   
   #include <asm/apic.h>
   #include <asm/perf_event.h>
@@@ -41,6 -43,7 +43,7 @@@
   #include <asm/desc.h>
   #include <asm/debugreg.h>
   #include <asm/kvm_para.h>
+ #include <asm/irq_remapping.h>
   
   #include <asm/virtext.h>
   #include "trace.h"
@@@ -96,6 -99,19 +99,19 @@@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id)
   #define AVIC_UNACCEL_ACCESS_OFFSET_MASK               0xFF0
   #define AVIC_UNACCEL_ACCESS_VECTOR_MASK               0xFFFFFFFF
   
+ /* AVIC GATAG is encoded using VM and VCPU IDs */
+ #define AVIC_VCPU_ID_BITS             8
+ #define AVIC_VCPU_ID_MASK             ((1 << AVIC_VCPU_ID_BITS) - 1)
+ 
+ #define AVIC_VM_ID_BITS                       24
+ #define AVIC_VM_ID_NR                 (1 << AVIC_VM_ID_BITS)
+ #define AVIC_VM_ID_MASK                       ((1 << AVIC_VM_ID_BITS) - 1)
+ 
+ #define AVIC_GATAG(x, y)              (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+                                               (y & AVIC_VCPU_ID_MASK))
+ #define AVIC_GATAG_TO_VMID(x)         ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+ #define AVIC_GATAG_TO_VCPUID(x)               (x & AVIC_VCPU_ID_MASK)
+ 
   static bool erratum_383_found __read_mostly;
   
   static const u32 host_save_user_msrs[] = {
@@@ -185,6 -201,23 +201,23 @@@ struct vcpu_svm 
         struct page *avic_backing_page;
         u64 *avic_physical_id_cache;
         bool avic_is_running;
+ 
+       /*
+        * Per-vcpu list of struct amd_svm_iommu_ir:
+        * This is used mainly to store interrupt remapping information used
+        * when updating the vcpu affinity. This avoids the need to scan for
+        * IRTE and try to match ga_tag in the IOMMU driver.
+        */
+       struct list_head ir_list;
+       spinlock_t ir_list_lock;
+ };
+ 
+ /*
+  * This is a wrapper of struct amd_iommu_ir_data.
+  */
+ struct amd_svm_iommu_ir {
+       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
+       void *data;             /* Storing pointer to struct amd_ir_data */
   };
   
   #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK  (0xFF)
@@@ -242,6 -275,10 +275,10 @@@ static int avic
   module_param(avic, int, S_IRUGO);
   #endif
   
+ /* AVIC VM ID bit masks and lock */
+ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+ static DEFINE_SPINLOCK(avic_vm_id_lock);
+ 
   static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
   static void svm_flush_tlb(struct kvm_vcpu *vcpu);
   static void svm_complete_interrupts(struct vcpu_svm *svm);
@@@ -928,6 -965,55 +965,55 @@@ static void svm_disable_lbrv(struct vcp
         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
   }
   
+ /* Note:
+  * This hash table is used to map VM_ID to a struct kvm_arch,
+  * when handling AMD IOMMU GALOG notification to schedule in
+  * a particular vCPU.
+  */
+ #define SVM_VM_DATA_HASH_BITS 8
+ DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+ static spinlock_t svm_vm_data_hash_lock;
+ 
+ /* Note:
+  * This function is called from IOMMU driver to notify
+  * SVM to schedule in a particular vCPU of a particular VM.
+  */
+ static int avic_ga_log_notifier(u32 ga_tag)
+ {
+       unsigned long flags;
+       struct kvm_arch *ka = NULL;
+       struct kvm_vcpu *vcpu = NULL;
+       u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+       u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+ 
+       pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+ 
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+               struct kvm *kvm = container_of(ka, struct kvm, arch);
+               struct kvm_arch *vm_data = &kvm->arch;
+ 
+               if (vm_data->avic_vm_id != vm_id)
+                       continue;
+               vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               break;
+       }
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+ 
+       if (!vcpu)
+               return 0;
+ 
+       /* Note:
+        * At this point, the IOMMU should have already set the pending
+        * bit in the vAPIC backing page. So, we just need to schedule
+        * in the vcpu.
+        */
+       if (vcpu->mode == OUTSIDE_GUEST_MODE)
+               kvm_vcpu_wake_up(vcpu);
+ 
+       return 0;
+ }
+ 
   static __init int svm_hardware_setup(void)
   {
         int cpu;
@@@ -986,10 -1072,15 +1072,15 @@@
         if (avic) {
                 if (!npt_enabled ||
                     !boot_cpu_has(X86_FEATURE_AVIC) ||
-                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
                         avic = false;
-               else
+               } else {
                         pr_info("AVIC enabled\n");
+ 
+                       hash_init(svm_vm_data_hash);
+                       spin_lock_init(&svm_vm_data_hash_lock);
+                       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+               }
         }
   
         return 0;
@@@ -1280,18 -1371,54 +1371,54 @@@ static int avic_init_backing_page(struc
         return 0;
   }
   
+ static inline int avic_get_next_vm_id(void)
+ {
+       int id;
+ 
+       spin_lock(&avic_vm_id_lock);
+ 
+       /* AVIC VM ID is one-based. */
+       id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+       if (id <= AVIC_VM_ID_MASK)
+               __set_bit(id, avic_vm_id_bitmap);
+       else
+               id = -EAGAIN;
+ 
+       spin_unlock(&avic_vm_id_lock);
+       return id;
+ }
+ 
+ static inline int avic_free_vm_id(int id)
+ {
+       if (id <= 0 || id > AVIC_VM_ID_MASK)
+               return -EINVAL;
+ 
+       spin_lock(&avic_vm_id_lock);
+       __clear_bit(id, avic_vm_id_bitmap);
+       spin_unlock(&avic_vm_id_lock);
+       return 0;
+ }
+ 
   static void avic_vm_destroy(struct kvm *kvm)
   {
+       unsigned long flags;
         struct kvm_arch *vm_data = &kvm->arch;
   
+       avic_free_vm_id(vm_data->avic_vm_id);
+ 
         if (vm_data->avic_logical_id_table_page)
                 __free_page(vm_data->avic_logical_id_table_page);
         if (vm_data->avic_physical_id_table_page)
                 __free_page(vm_data->avic_physical_id_table_page);
+ 
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_del(&vm_data->hnode);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
   }
   
   static int avic_vm_init(struct kvm *kvm)
   {
+       unsigned long flags;
         int err = -ENOMEM;
         struct kvm_arch *vm_data = &kvm->arch;
         struct page *p_page;
@@@ -1300,6 -1427,10 +1427,10 @@@
         if (!avic)
                 return 0;
   
+       vm_data->avic_vm_id = avic_get_next_vm_id();
+       if (vm_data->avic_vm_id < 0)
+               return vm_data->avic_vm_id;
+ 
         /* Allocating physical APIC ID table (4KB) */
         p_page = alloc_page(GFP_KERNEL);
         if (!p_page)
@@@ -1316,6 -1447,10 +1447,10 @@@
         vm_data->avic_logical_id_table_page = l_page;
         clear_page(page_address(l_page));
   
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+ 
         return 0;
   
   free_avic:
@@@ -1323,31 -1458,34 +1458,34 @@@
         return err;
   }
   
- /**
-  * This function is called during VCPU halt/unhalt.
-  */
- static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ static inline int
+ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
   {
-       u64 entry;
-       int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
         struct vcpu_svm *svm = to_svm(vcpu);
   
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
- 
-       svm->avic_is_running = is_run;
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
   
-       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
-       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-               return;
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entry targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
   
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+       if (list_empty(&svm->ir_list))
+               goto out;
   
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (is_run)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               ret = amd_iommu_update_ga(cpu, r, ir->data);
+               if (ret)
+                       break;
+       }
+ out:
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
   }
   
   static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@@ -1374,6 -1512,8 +1512,8 @@@
                 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
   
         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+                                       svm->avic_is_running);
   }
   
   static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@@ -1385,10 -1525,27 +1525,27 @@@
                 return;
   
         entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+ 
         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
   }
   
+ /**
+  * This function is called during VCPU halt/unhalt.
+  */
+ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+ 
+       svm->avic_is_running = is_run;
+       if (is_run)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+ }
+ 
   static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
   {
         struct vcpu_svm *svm = to_svm(vcpu);
@@@ -1450,6 -1607,9 +1607,9 @@@ static struct kvm_vcpu *svm_create_vcpu
                 err = avic_init_backing_page(&svm->vcpu);
                 if (err)
                         goto free_page4;
+ 
+               INIT_LIST_HEAD(&svm->ir_list);
+               spin_lock_init(&svm->ir_list_lock);
         }
   
         /* We initialize this flag to true to make sure that the is_running
@@@ -4246,6 -4406,209 +4406,209 @@@ static void svm_deliver_avic_intr(struc
                 kvm_vcpu_wake_up(vcpu);
   }
   
+ static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+ {
+       unsigned long flags;
+       struct amd_svm_iommu_ir *cur;
+ 
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur->data != pi->ir_data)
+                       continue;
+               list_del(&cur->node);
+               kfree(cur);
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ }
+ 
+ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+ {
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+ 
+       /**
+        * In some cases, the existing irte is updated and re-set,
+        * so we need to check here if it's already been added
+        * to the ir_list.
+        */
+       if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+               struct kvm *kvm = svm->vcpu.kvm;
+               u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+               struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               struct vcpu_svm *prev_svm;
+ 
+               if (!prev_vcpu) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+ 
+               prev_svm = to_svm(prev_vcpu);
+               svm_ir_list_del(prev_svm, pi);
+       }
+ 
+       /**
+        * Allocate a new amd_svm_iommu_ir, which will get
+        * added to the per-vcpu ir_list.
+        */
+       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+       if (!ir) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ir->data = pi->ir_data;
+ 
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_add(&ir->node, &svm->ir_list);
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ out:
+       return ret;
+ }
+ 
+ /**
+  * Note:
+  * The HW cannot support posting multicast/broadcast
+  * interrupts to a vCPU. So, we still use legacy interrupt
+  * remapping for these kinds of interrupts.
+  *
+  * For lowest-priority interrupts, we only support
+  * those with single CPU as the destination, e.g. user
+  * configures the interrupts via /proc/irq or uses
+  * irqbalance to make the interrupts single-CPU.
+  */
+ static int
+ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+                struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+ {
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu = NULL;
+ 
+       kvm_set_msi_irq(kvm, e, &irq);
+ 
+       if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+               pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+                        __func__, irq.vector);
+               return -1;
+       }
+ 
+       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+                irq.vector);
+       *svm = to_svm(vcpu);
+       vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+       vcpu_info->vector = irq.vector;
+ 
+       return 0;
+ }
+ 
+ /*
+  * svm_update_pi_irte - set IRTE for Posted-Interrupts
+  *
+  * @kvm: kvm
+  * @host_irq: host irq of the interrupt
+  * @guest_irq: gsi of the interrupt
+  * @set: set or unset PI
+  * returns 0 on success, < 0 on failure
+  */
+ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                             uint32_t guest_irq, bool set)
+ {
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       int idx, ret = -EINVAL;
+ 
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+ 
+       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+                __func__, host_irq, guest_irq, set);
+ 
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+ 
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               struct vcpu_data vcpu_info;
+               struct vcpu_svm *svm = NULL;
+ 
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+ 
+               /**
+                * Here, we setup with legacy mode in the following cases:
+                * 1. When the interrupt cannot be targeted at a specific vcpu.
+                * 2. Unsetting posted interrupt.
+                * 3. APIC virtualization is disabled for the vcpu.
+                */
+               if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+                   kvm_vcpu_apicv_active(&svm->vcpu)) {
+                       struct amd_iommu_pi_data pi;
+ 
+                       /* Try to enable guest_mode in IRTE */
+                       pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+                       pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+                                                    svm->vcpu.vcpu_id);
+                       pi.is_guest_mode = true;
+                       pi.vcpu_data = &vcpu_info;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+ 
+                       /**
+                        * Here, we have successfully set up vcpu affinity in
+                        * IOMMU guest mode. Now, we need to store the posted
+                        * interrupt information in a per-vcpu ir_list so that
+                        * we can refer to it directly when we update vcpu
+                        * scheduling information in IOMMU irte.
+                        */
+                       if (!ret && pi.is_guest_mode)
+                               svm_ir_list_add(svm, &pi);
+               } else {
+                       /* Use legacy mode in IRTE */
+                       struct amd_iommu_pi_data pi;
+ 
+                       /**
+                        * Here, pi is used to:
+                        * - Tell IOMMU to use legacy mode for this interrupt.
+                        * - Retrieve ga_tag of prior interrupt remapping data.
+                        */
+                       pi.is_guest_mode = false;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+ 
+                       /**
+                        * Check if the posted interrupt was previously
+                                * set up in guest_mode by checking if the ga_tag
+                        * was cached. If so, we need to clean up the per-vcpu
+                        * ir_list.
+                        */
+                       if (!ret && pi.prev_ga_tag) {
+                               int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+                               struct kvm_vcpu *vcpu;
+ 
+                               vcpu = kvm_get_vcpu_by_id(kvm, id);
+                               if (vcpu)
+                                       svm_ir_list_del(to_svm(vcpu), &pi);
+                       }
+               }
+ 
+               if (!ret && svm) {
+                       trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+                                                host_irq, e->gsi,
+                                                vcpu_info.vector,
+                                                vcpu_info.pi_desc_addr, set);
+               }
+ 
+               if (ret < 0) {
+                       pr_err("%s: failed to update PI IRTE\n", __func__);
+                       goto out;
+               }
+       }
+ 
+       ret = 0;
+ out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+ }
+ 
   static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
   {
         struct vcpu_svm *svm = to_svm(vcpu);
@@@ -4961,7 -5324,7 +5324,7 @@@ static inline void avic_post_state_rest
         avic_handle_ldr_update(vcpu);
   }
   
- -static struct kvm_x86_ops svm_x86_ops = {
+ +static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
         .cpu_has_kvm_support = has_svm,
         .disabled_by_bios = is_disabled,
         .hardware_setup = svm_hardware_setup,
@@@ -5078,6 -5441,7 +5441,7 @@@
   
         .pmu_ops = &amd_pmu_ops,
         .deliver_posted_interrupt = svm_deliver_avic_intr,
+       .update_pi_irte = svm_update_pi_irte,
   };
   
   static int __init svm_init(void)
diff --combined arch/x86/kvm/vmx.c

index 121fdf6e9ed09d15ef3c5abb8d00cf77a2ab5e78,f9939d0722b411c8cf0b0cc4e316b423f83e3827..4ee4187fcea3583eb420c3afaa3b77fb61ebfa77
--- 1/arch/x86/kvm/vmx.c
--- 2/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@@ -939,6 -939,7 +939,7 @@@ static DEFINE_SPINLOCK(vmx_vpid_lock)
   static struct vmcs_config {
         int size;
         int order;
+       u32 basic_cap;
         u32 revision_id;
         u32 pin_based_exec_ctrl;
         u32 cpu_based_exec_ctrl;
@@@ -1215,6 -1216,11 +1216,11 @@@ static inline bool cpu_has_vmx_ple(void
                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
   }
   
+ static inline bool cpu_has_vmx_basic_inout(void)
+ {
+       return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+ }
+ 
   static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
   {
         return flexpriority_enabled && lapic_in_kernel(vcpu);
@@@ -2877,6 -2883,8 +2883,8 @@@ static int vmx_get_vmx_msr(struct kvm_v
                 *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
                            ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
                            (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+               if (cpu_has_vmx_basic_inout())
+                       *pdata |= VMX_BASIC_INOUT;
                 break;
         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
         case MSR_IA32_VMX_PINBASED_CTLS:
@@@ -3457,7 -3465,8 +3465,8 @@@ static __init int setup_vmcs_config(str
                 return -EIO;
   
         vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_config.size);
+       vmcs_conf->order = get_order(vmcs_conf->size);
+       vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
         vmcs_conf->revision_id = vmx_msr_low;
   
         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@@ -6109,7 -6118,7 +6118,7 @@@ static int handle_ept_violation(struct 
         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
   
         gla_validity = (exit_qualification >> 7) & 0x3;
-       if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+       if (gla_validity == 0x2) {
                 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
                 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
                         (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@@ -6726,7 -6735,7 +6735,7 @@@ static void nested_vmx_abort(struct kvm
   {
         /* TODO: not to reset guest simply here. */
         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-       pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+       pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
   }
   
   static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@@ -7013,7 -7022,7 +7022,7 @@@ static int handle_vmon(struct kvm_vcpu 
         vmx->nested.vmcs02_num = 0;
   
         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL);
+                    HRTIMER_MODE_REL_PINNED);
         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
   
         vmx->nested.vmxon = true;
@@@ -9598,7 -9607,7 +9607,7 @@@ static int nested_vmx_check_msr_switch(
         maxphyaddr = cpuid_maxphyaddr(vcpu);
         if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
             (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-               pr_warn_ratelimited(
+               pr_debug_ratelimited(
                         "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
                         addr_field, maxphyaddr, count, addr);
                 return -EINVAL;
@@@ -9671,13 -9680,13 +9680,13 @@@ static u32 nested_vmx_load_msr(struct k
         for (i = 0; i < count; i++) {
                 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
                                         &e, sizeof(e))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                 __func__, i, gpa + i * sizeof(e));
                         goto fail;
                 }
                 if (nested_vmx_load_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s check failed (%u, 0x%x, 0x%x)\n",
                                 __func__, i, e.index, e.reserved);
                         goto fail;
@@@ -9685,7 -9694,7 +9694,7 @@@
                 msr.index = e.index;
                 msr.data = e.value;
                 if (kvm_set_msr(vcpu, &msr)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                 __func__, i, e.index, e.value);
                         goto fail;
@@@ -9706,13 -9715,13 +9715,13 @@@ static int nested_vmx_store_msr(struct 
                 if (kvm_vcpu_read_guest(vcpu,
                                         gpa + i * sizeof(e),
                                         &e, 2 * sizeof(u32))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                 __func__, i, gpa + i * sizeof(e));
                         return -EINVAL;
                 }
                 if (nested_vmx_store_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s check failed (%u, 0x%x, 0x%x)\n",
                                 __func__, i, e.index, e.reserved);
                         return -EINVAL;
@@@ -9720,7 -9729,7 +9729,7 @@@
                 msr_info.host_initiated = false;
                 msr_info.index = e.index;
                 if (kvm_get_msr(vcpu, &msr_info)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s cannot read MSR (%u, 0x%x)\n",
                                 __func__, i, e.index);
                         return -EINVAL;
@@@ -9729,7 -9738,7 +9738,7 @@@
                                          gpa + i * sizeof(e) +
                                              offsetof(struct vmx_msr_entry, value),
                                          &msr_info.data, sizeof(msr_info.data))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                 __func__, i, e.index, msr_info.data);
                         return -EINVAL;
@@@ -10500,6 -10509,9 +10509,9 @@@ static void prepare_vmcs12(struct kvm_v
                 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
         }
   
+       if (nested_cpu_has_ept(vmcs12))
+               vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+ 
         if (nested_cpu_has_vid(vmcs12))
                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
   
@@@ -11177,7 -11189,7 +11189,7 @@@ static void vmx_setup_mce(struct kvm_vc
                         ~FEATURE_CONTROL_LMCE;
   }
   
- -static struct kvm_x86_ops vmx_x86_ops = {
+ +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .cpu_has_kvm_support = cpu_has_kvm_support,
         .disabled_by_bios = vmx_disabled_by_bios,
         .hardware_setup = hardware_setup,
author	Stephen Rothwell <sfr@canb.auug.org.au>
	Tue, 13 Sep 2016 02:21:37 +0000 (12:21 +1000)
committer	Stephen Rothwell <sfr@canb.auug.org.au>
	Tue, 13 Sep 2016 02:21:37 +0000 (12:21 +1000)
		1	2
arch/s390/kvm/kvm-s390.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history