Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more kvm updates from Paolo Bonzini:
 "Generic:

   - selftest compilation fix for non-x86

   - KVM: avoid warning on s390 in mark_page_dirty

 x86:

   - fix page write-protection bug and improve comments

   - use binary search to lookup the PMU event filter, add test

   - enable_pmu module parameter support for Intel CPUs

   - switch blocked_vcpu_on_cpu_lock to raw spinlock

   - cleanups of blocked vCPU logic

   - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)

   - various small fixes"
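
  As a quick illustration of the "binary search to lookup the PMU event filter" item above, here is a
  minimal userspace C sketch of the sort-once, bsearch-per-lookup pattern; filter_contains_event and the
  sample event values are illustrative names only, not KVM code, and the real kernel change is in the
  PMU diff further down.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Comparator that avoids truncating a 64-bit difference into an int. */
    static int cmp_u64(const void *a, const void *b)
    {
            uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

            return x < y ? -1 : x > y;
    }

    /* Sorted-array membership test; the array must already be sorted with cmp_u64. */
    static int filter_contains_event(const uint64_t *events, size_t nevents, uint64_t key)
    {
            return bsearch(&key, events, nevents, sizeof(key), cmp_u64) != NULL;
    }

    int main(void)
    {
            uint64_t events[] = { 0xc0, 0x3c, 0x2e41, 0xc4 };
            size_t nevents = sizeof(events) / sizeof(events[0]);

            /* Sort once when the filter is installed, then look up in O(log n). */
            qsort(events, nevents, sizeof(events[0]), cmp_u64);

            printf("0x3c in filter? %d\n", filter_contains_event(events, nevents, 0x3c));
            printf("0xff in filter? %d\n", filter_contains_event(events, nevents, 0xff));
            return 0;
    }

  The kernel version sorts the filter once when it is installed via the PMU event filter ioctl, so each
  counter reprogram does a logarithmic lookup instead of a linear scan.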

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits)
  docs: kvm: fix WARNINGs from api.rst
  selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c
  selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c
  kvm: selftests: Do not indent with spaces
  kvm: selftests: sync uapi/linux/kvm.h with Linux header
  selftests: kvm: add amx_test to .gitignore
  KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled
  KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops
  KVM: SVM: Drop AVIC's intermediate avic_set_running() helper
  KVM: VMX: Don't do full kick when handling posted interrupt wakeup
  KVM: VMX: Fold fallback path into triggering posted IRQ helper
  KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ
  KVM: VMX: Don't do full kick when triggering posted interrupt "fails"
  KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU
  KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption
  KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path
  KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs
  KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode
  KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks
  KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers
  ...
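
For the "partially allow KVM_SET_CPUID{,2} after KVM_RUN" item, the hedged sketch below shows the
relaxed rule from a VMM's point of view: re-sending an identical CPUID table after the vCPU has run
(as some VMMs do when reusing vCPU fds for CPU hotplug) now succeeds, while a modified table is
rejected with EINVAL. Only the KVM_SET_CPUID2 ioctl and struct kvm_cpuid2 are real API here; the
helper name reset_cpuid_after_run is hypothetical and the vcpu_fd/cpuid setup is assumed to exist.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Re-issue KVM_SET_CPUID2 on a vCPU that has already executed KVM_RUN.
     * With this series, an unchanged table is accepted; a changed one fails
     * with EINVAL instead of silently confusing the MMU's cached CPUID state.
     */
    static int reset_cpuid_after_run(int vcpu_fd, struct kvm_cpuid2 *cpuid)
    {
            if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) == 0)
                    return 0;

            if (errno == EINVAL)
                    fprintf(stderr, "CPUID data differs from the running vCPU's\n");
            else
                    fprintf(stderr, "KVM_SET_CPUID2: %s\n", strerror(errno));

            return -errno;
    }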
Commit 636b5284d8 by Linus Torvalds, 2022-01-22 09:40:01 +02:00. 36 changed files with 1428 additions and 635 deletions.

View file

@@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
 The Stats Data block contains an array of 64-bit values in the same order
 as the descriptors in Descriptors block.
 
-4.42 KVM_GET_XSAVE2
-------------------
+4.134 KVM_GET_XSAVE2
+--------------------
 
 :Capability: KVM_CAP_XSAVE2
 :Architectures: x86
@@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
 limit the attack surface on KVM's MSR emulation code.
 
 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
------------------------------
+-------------------------------------
 
 Architectures: x86

View file

@@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
 KVM_X86_OP_NULL(tlb_remote_flush_with_range)
 KVM_X86_OP(tlb_flush_gva)
 KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
 KVM_X86_OP(run)
 KVM_X86_OP_NULL(handle_exit)
 KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_NULL(request_immediate_exit)
 KVM_X86_OP(sched_in)
 KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
 KVM_X86_OP_NULL(vcpu_blocking)
 KVM_X86_OP_NULL(vcpu_unblocking)
 KVM_X86_OP_NULL(update_pi_irte)

View file

@@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
	 */
	void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
 
+	int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
	enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
	int (*handle_exit)(struct kvm_vcpu *vcpu,
		enum exit_fastpath_completion exit_fastpath);
@@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
	const struct kvm_pmu_ops *pmu_ops;
	const struct kvm_x86_nested_ops *nested_ops;
 
-	/*
-	 * Architecture specific hooks for vCPU blocking due to
-	 * HLT instruction.
-	 * Returns for .pre_block():
-	 *    - 0 means continue to block the vCPU.
-	 *    - 1 means we cannot block the vCPU since some event
-	 *        happens during this period, such as, 'ON' bit in
-	 *        posted-interrupts descriptor is set.
-	 */
-	int (*pre_block)(struct kvm_vcpu *vcpu);
-	void (*post_block)(struct kvm_vcpu *vcpu);
-
	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);

View file

@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
} }
/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
{
struct kvm_cpuid_entry2 *orig;
int i;
if (nent != vcpu->arch.cpuid_nent)
return -EINVAL;
for (i = 0; i < nent; i++) {
orig = &vcpu->arch.cpuid_entries[i];
if (e2[i].function != orig->function ||
e2[i].index != orig->index ||
e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
return -EINVAL;
}
return 0;
}
static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{ {
u32 function; u32 function;
@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
} }
} }
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{ {
u32 base = vcpu->arch.kvm_cpuid_base; u32 base = vcpu->arch.kvm_cpuid_base;
if (!base) if (!base)
return NULL; return NULL;
return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
}
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
{
return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent);
} }
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax; vcpu->arch.pv_cpuid.features = best->eax;
} }
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
int nent)
{ {
struct kvm_cpuid_entry2 *best; struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 1, 0); best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) { if (best) {
/* Update OSXSAVE bit */ /* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE)) if (boot_cpu_has(X86_FEATURE_XSAVE))
@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
} }
best = kvm_find_cpuid_entry(vcpu, 7, 0); best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE, cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
best = kvm_find_cpuid_entry(vcpu, 0xD, 0); best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best) if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false); best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
best = kvm_find_cpuid_entry(vcpu, 0xD, 1); best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC))) cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true); best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
best = kvm_find_kvm_cpuid_features(vcpu); best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best && if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT))) (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0); best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best) if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT, cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr & vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT); MSR_IA32_MISC_ENABLE_MWAIT);
} }
} }
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
__kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
{ {
int r; int r;
__kvm_update_cpuid_runtime(vcpu, e2, nent);
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly. It would've been better to forbid any
* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
if (vcpu->arch.last_vmentry_cpu != -1)
return kvm_cpuid_check_equal(vcpu, e2, nent);
r = kvm_check_cpuid(vcpu, e2, nent); r = kvm_check_cpuid(vcpu, e2, nent);
if (r) if (r)
return r; return r;
@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent; vcpu->arch.cpuid_nent = nent;
kvm_update_kvm_cpuid_base(vcpu); kvm_update_kvm_cpuid_base(vcpu);
kvm_update_cpuid_runtime(vcpu);
kvm_vcpu_after_set_cpuid(vcpu); kvm_vcpu_after_set_cpuid(vcpu);
return 0; return 0;
@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
perf_get_x86_pmu_capability(&cap); perf_get_x86_pmu_capability(&cap);
/* /*
* Only support guest architectural pmu on a host * The guest architecture pmu is only supported if the architecture
* with architectural pmu. * pmu exists on the host and the module parameters allow it.
*/ */
if (!cap.version) if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap)); memset(&cap, 0, sizeof(cap));
eax.split.version_id = min(cap.version, 2); eax.split.version_id = min(cap.version, 2);
@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
--array->nent; --array->nent;
continue; continue;
} }
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
entry->ecx &= ~BIT_ULL(2);
entry->edx = 0; entry->edx = 0;
} }
break; break;

View file

@@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
	restart_apic_timer(vcpu->arch.apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 {
@@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
	start_sw_timer(apic);
	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
 {

View file

@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
continue; continue;
flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush); start, end - 1, true, flush);
} }
@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
} }
/* /*
* We can flush all the TLBs out of the mmu lock without TLB * Flush TLBs if any SPTEs had to be write-protected to ensure that
* corruption since we just change the spte from writable to * guest writes are reflected in the dirty bitmap before the memslot
* readonly so that we only need to care the case of changing * update completes, i.e. before enabling dirty logging is visible to
* spte from present to present (changing the spte from present * userspace.
* to nonpresent will flush all the TLBs immediately), in other *
* words, the only case we care is mmu_spte_update() where we * Perform the TLB flush outside the mmu_lock to reduce the amount of
* have checked Host-writable | MMU-writable instead of * time the lock is held. However, this does mean that another CPU can
* PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK * now grab mmu_lock and encounter a write-protected SPTE while CPUs
* anymore. * still have a writable mapping for the associated GFN in their TLB.
*
* This is safe but requires KVM to be careful when making decisions
* based on the write-protection status of an SPTE. Specifically, KVM
* also write-protects SPTEs to monitor changes to guest page tables
* during shadow paging, and must guarantee no CPUs can write to those
* page before the lock is dropped. As mentioned in the previous
* paragraph, a write-protected SPTE is no guarantee that CPU cannot
* perform writes. So to determine if a TLB flush is truly required, KVM
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
* See DEFAULT_SPTE_MMU_WRITEABLE for more details.
*/ */
if (flush) if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

View file

@@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
 
	new_spte &= ~PT_WRITABLE_MASK;
	new_spte &= ~shadow_host_writable_mask;
+	new_spte &= ~shadow_mmu_writable_mask;
 
	new_spte = mark_spte_for_access_track(new_spte);

View file

@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
/* /*
* The mask/shift to use for saving the original R/X bits when marking the PTE * The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the * as not-present for access tracking purposes. We do not save the W bit as the
@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
* *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
* writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
* that map guest pages in read-only memslots and read-only VMAs.
*
* Invariants:
* - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
*
*
* *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
* allows writes to the guest page mapped by the SPTE. This bit is cleared when
* the guest page mapped by the SPTE contains a page table that is being
* monitored for shadow paging. In this case the SPTE can only be made writable
* by unsyncing the shadow page under the mmu_lock.
*
* Invariants:
* - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
* - If MMU-writable is set, Host-writable must be set.
*
* If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
* to track writes for dirty logging. For such SPTEs, KVM will locklessly set
* PT_WRITABLE_MASK upon the next write from the guest and record the write in
* the dirty log (see fast_page_fault()).
*/
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
/* /*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
* to not overlap the A/D type mask or the saved access bits of access-tracked * to not overlap the A/D type mask or the saved access bits of access-tracked
@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
static inline bool spte_can_locklessly_be_made_writable(u64 spte) static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{ {
return (spte & shadow_host_writable_mask) && if (spte & shadow_mmu_writable_mask) {
(spte & shadow_mmu_writable_mask); WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
return true;
}
WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
return false;
} }
static inline u64 get_mmio_spte_generation(u64 spte) static inline u64 get_mmio_spte_generation(u64 spte)

View file

@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
		    !is_last_spte(iter.old_spte, iter.level))
			continue;
 
-		if (!is_writable_pte(iter.old_spte))
-			break;
-
		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
+		if (new_spte == iter.old_spte)
+			break;
+
		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

View file

@ -13,6 +13,8 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h> #include <asm/perf_event.h>
#include "x86.h" #include "x86.h"
#include "cpuid.h" #include "cpuid.h"
@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.config = config, .config = config,
}; };
if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
return;
attr.sample_period = get_sample_period(pmc, pmc->counter); attr.sample_period = get_sample_period(pmc, pmc->counter);
if (in_tx) if (in_tx)
@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true; return true;
} }
static int cmp_u64(const void *a, const void *b)
{
return *(__u64 *)a - *(__u64 *)b;
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{ {
unsigned config, type = PERF_TYPE_RAW; unsigned config, type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm; struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter; struct kvm_pmu_event_filter *filter;
int i;
bool allow_event = true; bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) { if (filter) {
for (i = 0; i < filter->nevents; i++) __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
if (filter->events[i] ==
(eventsel & AMD64_RAW_EVENT_MASK_NB)) if (bsearch(&key, filter->events, filter->nevents,
break; sizeof(__u64), cmp_u64))
if (filter->action == KVM_PMU_EVENT_ALLOW && allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
i == filter->nevents) else
allow_event = false; allow_event = filter->action == KVM_PMU_EVENT_DENY;
if (filter->action == KVM_PMU_EVENT_DENY &&
i < filter->nevents)
allow_event = false;
} }
if (!allow_event) if (!allow_event)
return; return;
@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
/* Ensure nevents can't be changed between the user copies. */ /* Ensure nevents can't be changed between the user copies. */
*filter = tmp; *filter = tmp;
/*
* Sort the in-kernel list so that we can search it with bsearch.
*/
sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
mutex_lock(&kvm->lock); mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock)); mutex_is_locked(&kvm->lock));

View file

@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
unsigned long i; unsigned long i;
/*
* Wake any target vCPUs that are blocking, i.e. waiting for a wake
* event. There's no need to signal doorbells, as hardware has handled
* vCPUs that were in guest at the time of the IPI, and vCPUs that have
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
bool m = kvm_apic_match_dest(vcpu, source, if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh),
GET_APIC_DEST_FIELD(icrh), icrl & APIC_DEST_MASK))
icrl & APIC_DEST_MASK);
if (m && !avic_vcpu_is_running(vcpu))
kvm_vcpu_wake_up(vcpu); kvm_vcpu_wake_up(vcpu);
} }
} }
@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
return -1; return -1;
kvm_lapic_set_irr(vec, vcpu->arch.apic); kvm_lapic_set_irr(vec, vcpu->arch.apic);
/*
* Pairs with the smp_mb_*() after setting vcpu->guest_mode in
* vcpu_enter_guest() to ensure the write to the vIRR is ordered before
* the read of guest_mode, which guarantees that either VMRUN will see
* and process the new vIRR entry, or that the below code will signal
* the doorbell if the vCPU is already running in the guest.
*/
smp_mb__after_atomic(); smp_mb__after_atomic();
if (avic_vcpu_is_running(vcpu)) { /*
* Signal the doorbell to tell hardware to inject the IRQ if the vCPU
* is in the guest. If the vCPU is not in the guest, hardware will
* automatically process AVIC interrupts at VMRUN.
*/
if (vcpu->mode == IN_GUEST_MODE) {
int cpu = READ_ONCE(vcpu->cpu); int cpu = READ_ONCE(vcpu->cpu);
/* /*
@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
if (cpu != get_cpu()) if (cpu != get_cpu())
wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
put_cpu(); put_cpu();
} else } else {
/*
* Wake the vCPU if it was blocking. KVM will then detect the
* pending IRQ when checking if the vCPU has a wake event.
*/
kvm_vcpu_wake_up(vcpu); kvm_vcpu_wake_up(vcpu);
}
return 0; return 0;
} }
@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu); int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
/* /*
* Since the host physical APIC id is 8 bits, * Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255. * we can support host APIC ID upto 255.
@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return; return;
/*
* No need to update anything if the vCPU is blocking, i.e. if the vCPU
* is being scheduled in after being preempted. The CPU entries in the
* Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
* If the vCPU was migrated, its new CPU value will be stuffed when the
* vCPU unblocks.
*/
if (kvm_vcpu_is_blocking(vcpu))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache)); entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
if (svm->avic_is_running)
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry); WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
svm->avic_is_running);
} }
void avic_vcpu_put(struct kvm_vcpu *vcpu) void avic_vcpu_put(struct kvm_vcpu *vcpu)
@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry; u64 entry;
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
entry = READ_ONCE(*(svm->avic_physical_id_cache)); entry = READ_ONCE(*(svm->avic_physical_id_cache));
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
avic_update_iommu_vcpu_affinity(vcpu, -1, 0); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
return;
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry); WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
} }
/* void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
* This function is called during VCPU halt/unhalt.
*/
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{ {
struct vcpu_svm *svm = to_svm(vcpu); if (!kvm_vcpu_apicv_active(vcpu))
int cpu = get_cpu(); return;
preempt_disable();
/*
* Unload the AVIC when the vCPU is about to block, _before_
* the vCPU actually blocks.
*
* Any IRQs that arrive before IsRunning=0 will not cause an
* incomplete IPI vmexit on the source, therefore vIRR will also
* be checked by kvm_vcpu_check_block() before blocking. The
* memory barrier implicit in set_current_state orders writing
* IsRunning=0 before reading the vIRR. The processor needs a
* matching memory barrier on interrupt delivery between writing
* IRR and reading IsRunning; the lack of this barrier might be
* the cause of errata #1235).
*/
avic_vcpu_put(vcpu);
preempt_enable();
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
int cpu;
if (!kvm_vcpu_apicv_active(vcpu))
return;
cpu = get_cpu();
WARN_ON(cpu != vcpu->cpu); WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
if (kvm_vcpu_apicv_active(vcpu)) { avic_vcpu_load(vcpu, cpu);
if (is_run)
avic_vcpu_load(vcpu, cpu);
else
avic_vcpu_put(vcpu);
}
put_cpu(); put_cpu();
} }
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
{
avic_set_running(vcpu, false);
}
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
kvm_vcpu_update_apicv(vcpu);
avic_set_running(vcpu, true);
}

View file

@@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
 {
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
 
-	if (!pmu)
+	if (!enable_pmu)
		return NULL;
 
	switch (msr) {

View file

@ -192,10 +192,6 @@ module_param(vgif, int, 0444);
static int lbrv = true; static int lbrv = true;
module_param(lbrv, int, 0444); module_param(lbrv, int, 0444);
/* enable/disable PMU virtualization */
bool pmu = true;
module_param(pmu, bool, 0444);
static int tsc_scaling = true; static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444); module_param(tsc_scaling, int, 0444);
@ -873,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
} }
} }
/*
* The default MMIO mask is a single bit (excluding the present bit),
* which could conflict with the memory encryption bit. Check for
* memory encryption support and override the default MMIO mask if
* memory encryption is enabled.
*/
static __init void svm_adjust_mmio_mask(void)
{
unsigned int enc_bit, mask_bit;
u64 msr, mask;
/* If there is no memory encryption support, use existing mask */
if (cpuid_eax(0x80000000) < 0x8000001f)
return;
/* If memory encryption is not enabled, use existing mask */
rdmsrl(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
mask_bit = boot_cpu_data.x86_phys_bits;
/* Increment the mask bit if it is the same as the encryption bit */
if (enc_bit == mask_bit)
mask_bit++;
/*
* If the mask bit location is below 52, then some bits above the
* physical addressing limit will always be reserved, so use the
* rsvd_bits() function to generate the mask. This mask, along with
* the present bit, will be used to generate a page fault with
* PFER.RSV = 1.
*
* If the mask bit location is 52 (or above), then clear the mask.
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static void svm_hardware_teardown(void) static void svm_hardware_teardown(void)
{ {
int cpu; int cpu;
@ -928,198 +883,6 @@ static void svm_hardware_teardown(void)
iopm_base = 0; iopm_base = 0;
} }
static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* AMD PMU PERFCTR_CORE CPUID */
if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
unsigned int order = get_order(IOPM_SIZE);
/*
* NX is required for shadow paging and for NPT if the NX huge pages
* mitigation is enabled.
*/
if (!boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
kvm_enable_efer_bits(EFER_NX);
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (tsc_scaling) {
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
pause_filter_thresh = 0;
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
pause_filter_thresh = 0;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
svm_hv_hardware_setup();
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
}
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
if (enable_apicv) {
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
svm_gp_erratum_intercept = false;
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
else
pr_info("Virtual GIF supported\n");
}
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
else
pr_info("LBR virtualization supported\n");
}
if (!pmu)
pr_info("PMU virtualization is disabled\n");
svm_set_cpu_caps();
/*
* It seems that on AMD processors PTE's accessed bit is
* being set by the CPU hardware before the NPF vmexit.
* This is not expected behaviour and our tests fail because
* of it.
* A workaround here is to disable support for
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
* In this case userspace can know if there is support using
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
* it
* If future AMD CPU models change the behaviour described above,
* this variable can be changed accordingly
*/
allow_smaller_maxphyaddr = !npt_enabled;
return 0;
err:
svm_hardware_teardown();
return r;
}
static void init_seg(struct vmcb_seg *seg) static void init_seg(struct vmcb_seg *seg)
{ {
seg->selector = 0; seg->selector = 0;
@ -1444,12 +1207,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (err) if (err)
goto error_free_vmsa_page; goto error_free_vmsa_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
*/
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;
svm->msrpm = svm_vcpu_alloc_msrpm(); svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) { if (!svm->msrpm) {
err = -ENOMEM; err = -ENOMEM;
@ -3833,6 +3590,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu); svm_complete_interrupts(vcpu);
} }
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
return 1;
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{ {
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@ -4629,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.prepare_guest_switch = svm_prepare_guest_switch, .prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load, .vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put, .vcpu_put = svm_vcpu_put,
.vcpu_blocking = svm_vcpu_blocking, .vcpu_blocking = avic_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking, .vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap, .update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature, .get_msr_feature = svm_get_msr_feature,
@ -4662,6 +4424,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_gva = svm_flush_tlb_gva,
.tlb_flush_guest = svm_flush_tlb, .tlb_flush_guest = svm_flush_tlb,
.vcpu_pre_run = svm_vcpu_pre_run,
.run = svm_vcpu_run, .run = svm_vcpu_run,
.handle_exit = handle_exit, .handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction, .skip_emulated_instruction = skip_emulated_instruction,
@ -4742,6 +4505,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
}; };
/*
* The default MMIO mask is a single bit (excluding the present bit),
* which could conflict with the memory encryption bit. Check for
* memory encryption support and override the default MMIO mask if
* memory encryption is enabled.
*/
static __init void svm_adjust_mmio_mask(void)
{
unsigned int enc_bit, mask_bit;
u64 msr, mask;
/* If there is no memory encryption support, use existing mask */
if (cpuid_eax(0x80000000) < 0x8000001f)
return;
/* If memory encryption is not enabled, use existing mask */
rdmsrl(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
mask_bit = boot_cpu_data.x86_phys_bits;
/* Increment the mask bit if it is the same as the encryption bit */
if (enc_bit == mask_bit)
mask_bit++;
/*
* If the mask bit location is below 52, then some bits above the
* physical addressing limit will always be reserved, so use the
* rsvd_bits() function to generate the mask. This mask, along with
* the present bit, will be used to generate a page fault with
* PFER.RSV = 1.
*
* If the mask bit location is 52 (or above), then clear the mask.
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* AMD PMU PERFCTR_CORE CPUID */
if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
unsigned int order = get_order(IOPM_SIZE);
/*
* NX is required for shadow paging and for NPT if the NX huge pages
* mitigation is enabled.
*/
if (!boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
kvm_enable_efer_bits(EFER_NX);
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (tsc_scaling) {
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
pause_filter_thresh = 0;
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
pause_filter_thresh = 0;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
svm_hv_hardware_setup();
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
}
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
if (enable_apicv) {
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
} else {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
svm_gp_erratum_intercept = false;
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
else
pr_info("Virtual GIF supported\n");
}
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
else
pr_info("LBR virtualization supported\n");
}
if (!enable_pmu)
pr_info("PMU virtualization is disabled\n");
svm_set_cpu_caps();
/*
* It seems that on AMD processors PTE's accessed bit is
* being set by the CPU hardware before the NPF vmexit.
* This is not expected behaviour and our tests fail because
* of it.
* A workaround here is to disable support for
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
* In this case userspace can know if there is support using
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
* it
* If future AMD CPU models change the behaviour described above,
* this variable can be changed accordingly
*/
allow_smaller_maxphyaddr = !npt_enabled;
return 0;
err:
svm_hardware_teardown();
return r;
}
static struct kvm_x86_init_ops svm_init_ops __initdata = { static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm, .cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled, .disabled_by_bios = is_disabled,

View file

@ -32,7 +32,6 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled; extern bool npt_enabled;
extern bool intercept_smi; extern bool intercept_smi;
extern bool pmu;
/* /*
* Clean bits in VMCB. * Clean bits in VMCB.
@ -226,7 +225,6 @@ struct vcpu_svm {
u32 dfr_reg; u32 dfr_reg;
struct page *avic_backing_page; struct page *avic_backing_page;
u64 *avic_physical_id_cache; u64 *avic_physical_id_cache;
bool avic_is_running;
/* /*
* Per-vcpu list of struct amd_svm_iommu_ir: * Per-vcpu list of struct amd_svm_iommu_ir:
@ -574,17 +572,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 *entry = svm->avic_physical_id_cache;
if (!entry)
return false;
return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
}
int avic_ga_log_notifier(u32 ga_tag); int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm); void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm); int avic_vm_init(struct kvm *kvm);
@ -605,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set); uint32_t guest_irq, bool set);
void svm_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */ /* sev.c */

View file

@@ -5,6 +5,7 @@
 #include <asm/vmx.h>
 
 #include "lapic.h"
+#include "x86.h"
 
 extern bool __read_mostly enable_vpid;
 extern bool __read_mostly flexpriority_enabled;
@@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void)
 {
	u64 perf_cap = 0;
 
+	if (!enable_pmu)
+		return perf_cap;
+
	if (boot_cpu_has(X86_FEATURE_PDCM))
		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);

View file

@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = { static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
/* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
}; };
@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i; int i;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
if (intel_arch_events[i].eventsel == event_select && if (intel_arch_events[i].eventsel != event_select ||
intel_arch_events[i].unit_mask == unit_mask && intel_arch_events[i].unit_mask != unit_mask)
(pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) continue;
break;
/* disable event that reported as not present by cpuid */
if ((i < 7) && !(pmu->available_event_types & (1 << i)))
return PERF_COUNT_HW_MAX + 1;
break;
}
if (i == ARRAY_SIZE(intel_arch_events)) if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX; return PERF_COUNT_HW_MAX;
@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull; pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry) if (!entry || !enable_pmu)
return; return;
eax.full = entry->eax; eax.full = entry->eax;
edx.full = entry->edx; edx.full = entry->edx;

View file

@ -19,7 +19,7 @@
* wake the target vCPUs. vCPUs are removed from the list and the notification * wake the target vCPUs. vCPUs are removed from the list and the notification
* vector is reset when the vCPU is scheduled in. * vector is reset when the vCPU is scheduled in.
*/ */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
/* /*
* Protect the per-CPU list with a per-CPU spinlock to handle task migration. * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
* When a blocking vCPU is awakened _and_ migrated to a different pCPU, the * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
* CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
* occur if a wakeup IRQ arrives and attempts to acquire the lock. * occur if a wakeup IRQ arrives and attempts to acquire the lock.
*/ */
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{ {
@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{ {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new; struct pi_desc old, new;
unsigned long flags;
unsigned int dest; unsigned int dest;
/* /*
@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!enable_apicv || !lapic_in_kernel(vcpu)) if (!enable_apicv || !lapic_in_kernel(vcpu))
return; return;
/* Nothing to do if PI.SN and PI.NDST both have the desired value. */
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
/* /*
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* PI.NDST: pi_post_block is the one expected to change PID.NDST and the * full update can be skipped as neither the vector nor the destination
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that * needs to be changed.
* matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
* correctly.
*/ */
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
pi_clear_sn(pi_desc); /*
goto after_clear_sn; * Clear SN if it was set due to being preempted. Again, do
* this even if there is no assigned device for simplicity.
*/
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
}
local_irq_save(flags);
/*
* If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
* list of the _previous_ pCPU, which will not be the same as the
* current pCPU if the task was migrated.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
list_del(&vmx->pi_wakeup_list);
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
} }
/* The full case. Set the new destination and clear SN. */
dest = cpu_physical_id(cpu); dest = cpu_physical_id(cpu);
if (!x2apic_mode) if (!x2apic_mode)
dest = (dest << 8) & 0xFF00; dest = (dest << 8) & 0xFF00;
@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
do { do {
old.control = new.control = READ_ONCE(pi_desc->control); old.control = new.control = READ_ONCE(pi_desc->control);
/*
* Clear SN (as above) and refresh the destination APIC ID to
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest; new.ndst = dest;
new.sn = 0; new.sn = 0;
/*
* Restore the notification vector; in the blocking case, the
* descriptor was modified on "put" to use the wakeup vector.
*/
new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control)); } while (pi_try_set_control(pi_desc, old.control, new.control));
local_irq_restore(flags);
after_clear_sn: after_clear_sn:
/* /*
@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
irq_remapping_cap(IRQ_POSTING_CAP); irq_remapping_cap(IRQ_POSTING_CAP);
} }
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
if (vcpu->preempted)
pi_set_sn(pi_desc);
}
static void __pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
unsigned int dest;
/*
* Remove the vCPU from the wakeup list of the _previous_ pCPU, which
* will not be the same as the current pCPU if the task was migrated.
*/
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
list_del(&vcpu->blocked_vcpu_list);
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
dest = cpu_physical_id(vcpu->cpu);
if (!x2apic_mode)
dest = (dest << 8) & 0xFF00;
WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
"Wakeup handler not enabled while the vCPU was blocking");
do {
old.control = new.control = READ_ONCE(pi_desc->control);
new.ndst = dest;
/* set 'NV' to 'notification vector' */
new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
vcpu->pre_pcpu = -1;
}
/* /*
* This routine does the following things for vCPU which is going * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
* to be blocked if VT-d PI is enabled. * WAKEUP as the notification vector in the PI descriptor.
* - Store the vCPU to the wakeup list, so when interrupts happen
* we can find the right vCPU to wake up.
* - Change the Posted-interrupt descriptor as below:
* 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
* - If 'ON' is set during this process, which means at least one
* interrupt is posted for this vCPU, we cannot block it, in
* this case, return 1, otherwise, return 0.
*
*/ */
int pi_pre_block(struct kvm_vcpu *vcpu) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{ {
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
unsigned long flags; unsigned long flags;
if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
vmx_interrupt_blocked(vcpu))
return 0;
local_irq_save(flags); local_irq_save(flags);
vcpu->pre_pcpu = vcpu->cpu; raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); list_add_tail(&vmx->pi_wakeup_list,
list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
&per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
WARN(pi_desc->sn == 1, WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
"Posted Interrupt Suppress Notification set before blocking");
do { do {
old.control = new.control = READ_ONCE(pi_desc->control); old.control = new.control = READ_ONCE(pi_desc->control);
@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
new.nv = POSTED_INTR_WAKEUP_VECTOR; new.nv = POSTED_INTR_WAKEUP_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control)); } while (pi_try_set_control(pi_desc, old.control, new.control));
/* We should not block the vCPU if an interrupt is posted for it. */ /*
if (pi_test_on(pi_desc)) * Send a wakeup IPI to this CPU if an interrupt may have been posted
__pi_post_block(vcpu); * before the notification vector was updated, in which case the IRQ
* will arrive on the non-wakeup vector. An IPI is needed as calling
* try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
* enabled until it is safe to call try_to_wake_up() on the task being
* scheduled out).
*/
if (pi_test_on(&new))
apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
local_irq_restore(flags); local_irq_restore(flags);
return (vcpu->pre_pcpu == -1);
} }
void pi_post_block(struct kvm_vcpu *vcpu) void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{ {
unsigned long flags; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (vcpu->pre_pcpu == -1) if (!vmx_can_use_vtd_pi(vcpu->kvm))
return; return;
local_irq_save(flags); if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
__pi_post_block(vcpu); pi_enable_wakeup_handler(vcpu);
local_irq_restore(flags);
/*
* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
* as blocking and preempted, e.g. if it's preempted between setting
* its wait state and manually scheduling out.
*/
if (vcpu->preempted)
pi_set_sn(pi_desc);
} }
/* /*
@@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/ */
void pi_wakeup_handler(void) void pi_wakeup_handler(void)
{ {
struct kvm_vcpu *vcpu;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct vcpu_vmx *vmx;
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
blocked_vcpu_list) { pi_wakeup_list) {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (pi_test_on(pi_desc)) if (pi_test_on(&vmx->pi_desc))
kvm_vcpu_kick(vcpu); kvm_vcpu_wake_up(&vmx->vcpu);
} }
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
} }
void __init pi_init_cpu(int cpu) void __init pi_init_cpu(int cpu)
{ {
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
} }
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* Bail out of the block loop if the VM has an assigned * Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the * device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device * PI.NV to the wakeup vector, i.e. the assigned device
* came along after the initial check in pi_pre_block(). * came along after the initial check in vmx_vcpu_pi_put().
*/ */
void vmx_pi_start_assignment(struct kvm *kvm) void vmx_pi_start_assignment(struct kvm *kvm)
{ {


@@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control); (unsigned long *)&pi_desc->control);
} }
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{ {
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
int pi_pre_block(struct kvm_vcpu *vcpu);
void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void); void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu); void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);


@@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu); pt_update_intercept_for_msr(vcpu);
} }
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested) int pi_vec)
{ {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
if (vcpu->mode == IN_GUEST_MODE) { if (vcpu->mode == IN_GUEST_MODE) {
/* /*
* The vector of interrupt to be delivered to vcpu had * The vector of interrupt to be delivered to vcpu had
@@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/ */
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return true; return;
} }
#endif #endif
return false; /*
* The vCPU isn't in the guest; wake the vCPU in case it is blocking,
* otherwise do nothing as KVM will grab the highest priority pending
* IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
*/
kvm_vcpu_wake_up(vcpu);
} }
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
smp_mb__after_atomic(); smp_mb__after_atomic();
/* the PIR and ON have been set by L1. */ /* the PIR and ON have been set by L1. */
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
kvm_vcpu_kick(vcpu);
return 0; return 0;
} }
return -1; return -1;
@@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
* guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
* posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
*/ */
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
kvm_vcpu_kick(vcpu);
return 0; return 0;
} }
@@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1; return 1;
} }
static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending;
}
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0)) if (!kvm_emulate_instruction(vcpu, 0))
return 0; return 0;
if (vmx->emulation_required && !vmx->rmode.vm86_active && if (vmx_emulation_required_with_pending_exception(vcpu)) {
vcpu->arch.exception.pending) {
kvm_prepare_emulation_failure_exit(vcpu); kvm_prepare_emulation_failure_exit(vcpu);
return 0; return 0;
} }
@@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1; return 1;
} }
static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
return 1;
}
static void grow_ple_window(struct kvm_vcpu *vcpu) static void grow_ple_window(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu); vmx = to_vmx(vcpu);
INIT_LIST_HEAD(&vmx->pi_wakeup_list);
err = -ENOMEM; err = -ENOMEM;
vmx->vpid = allocate_vpid(); vmx->vpid = allocate_vpid();
@@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
} }
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
if (pi_pre_block(vcpu))
return 1;
if (kvm_lapic_hv_timer_in_use(vcpu))
kvm_lapic_switch_to_sw_timer(vcpu);
return 0;
}
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
if (kvm_x86_ops.set_hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
pi_post_block(vcpu);
}
static void vmx_setup_mce(struct kvm_vcpu *vcpu) static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{ {
if (vcpu->arch.mcg_cap & MCG_LMCE_P) if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_gva = vmx_flush_tlb_gva,
.tlb_flush_guest = vmx_flush_tlb_guest, .tlb_flush_guest = vmx_flush_tlb_guest,
.vcpu_pre_run = vmx_vcpu_pre_run,
.run = vmx_vcpu_run, .run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit, .handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction, .skip_emulated_instruction = vmx_skip_emulated_instruction,
@@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM, .cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
.pmu_ops = &intel_pmu_ops, .pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops, .nested_ops = &vmx_nested_ops,


@@ -317,6 +317,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */ /* Posted interrupt descriptor */
struct pi_desc pi_desc; struct pi_desc pi_desc;
/* Used if this vCPU is waiting for PI notification wakeup. */
struct list_head pi_wakeup_list;
/* Support for a guest hypervisor (nested VMX) */ /* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested; struct nested_vmx nested;


@@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1; int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
/* /*
* Restoring the host value for MSRs that are only consumed when running in * Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -5230,17 +5235,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid; struct kvm_cpuid cpuid;
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly, so fail.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out; goto out;
@@ -5251,14 +5245,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid; struct kvm_cpuid2 cpuid;
/*
* KVM_SET_CPUID{,2} after KVM_RUN is forbidden, see the comment in
* KVM_SET_CPUID case above.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out; goto out;
@@ -9945,10 +9931,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_mb__after_srcu_read_unlock(); smp_mb__after_srcu_read_unlock();
/* /*
* This handles the case where a posted interrupt was * Process pending posted interrupts to handle the case where the
* notified with kvm_vcpu_kick. Assigned devices can * notification IRQ arrived in the host, or was never sent (because the
* use the POSTED_INTR_VECTOR even if APICv is disabled, * target vCPU wasn't running). Do this regardless of the vCPU's APICv
* so do it even if APICv is disabled on this vCPU. * status, KVM doesn't update assigned devices when APICv is inhibited,
* i.e. they can post interrupts even if APICv is temporarily disabled.
*/ */
if (kvm_lapic_enabled(vcpu)) if (kvm_lapic_enabled(vcpu))
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
@@ -10113,8 +10100,20 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{ {
if (!kvm_arch_vcpu_runnable(vcpu) && bool hv_timer;
(!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
if (!kvm_arch_vcpu_runnable(vcpu)) {
/*
* Switch to the software timer before halt-polling/blocking as
* the guest's timer may be a break event for the vCPU, and the
* hypervisor timer runs only when the CPU is in guest mode.
* Switch before halt-polling so that KVM recognizes an expired
* timer before blocking.
*/
hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
if (hv_timer)
kvm_lapic_switch_to_sw_timer(vcpu);
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu); kvm_vcpu_halt(vcpu);
@@ -10122,8 +10121,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
kvm_vcpu_block(vcpu); kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block) if (hv_timer)
static_call(kvm_x86_post_block)(vcpu); kvm_lapic_switch_to_hv_timer(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1; return 1;
@@ -10316,6 +10315,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = -EINTR; r = -EINTR;
goto out; goto out;
} }
/*
* It should be impossible for the hypervisor timer to be in
* use before KVM has ever run the vCPU.
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
kvm_vcpu_block(vcpu); kvm_vcpu_block(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) { if (kvm_apic_accept_events(vcpu) < 0) {
r = 0; r = 0;
@@ -10360,10 +10364,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
} else } else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
if (kvm_run->immediate_exit) if (kvm_run->immediate_exit) {
r = -EINTR; r = -EINTR;
else goto out;
r = vcpu_run(vcpu); }
r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
if (r <= 0)
goto out;
r = vcpu_run(vcpu);
out: out:
kvm_put_guest_fpu(vcpu); kvm_put_guest_fpu(vcpu);


@@ -336,6 +336,7 @@ extern u64 host_xcr0;
extern u64 supported_xcr0; extern u64 supported_xcr0;
extern u64 host_xss; extern u64 host_xss;
extern u64 supported_xss; extern u64 supported_xss;
extern bool enable_pmu;
static inline bool kvm_mpx_supported(void) static inline bool kvm_mpx_supported(void)
{ {


@@ -309,9 +309,6 @@ struct kvm_vcpu {
u64 requests; u64 requests;
unsigned long guest_debug; unsigned long guest_debug;
int pre_pcpu;
struct list_head blocked_vcpu_list;
struct mutex mutex; struct mutex mutex;
struct kvm_run *run; struct kvm_run *run;


@@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
#define KVM_CAP_ARM_MTE 205 #define KVM_CAP_ARM_MTE 205
#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
#define KVM_CAP_XSAVE2 207 #define KVM_CAP_VM_GPA_BITS 207
#define KVM_CAP_XSAVE2 208
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
@@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
__u32 sint; __u32 sint;
}; };
struct kvm_irq_routing_xen_evtchn {
__u32 port;
__u32 vcpu;
__u32 priority;
};
#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
/* gsi routing entry types */ /* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_MSI 2
#define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
#define KVM_IRQ_ROUTING_HV_SINT 4 #define KVM_IRQ_ROUTING_HV_SINT 4
#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
struct kvm_irq_routing_entry { struct kvm_irq_routing_entry {
__u32 gsi; __u32 gsi;
@@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry {
struct kvm_irq_routing_msi msi; struct kvm_irq_routing_msi msi;
struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_s390_adapter adapter;
struct kvm_irq_routing_hv_sint hv_sint; struct kvm_irq_routing_hv_sint hv_sint;
struct kvm_irq_routing_xen_evtchn xen_evtchn;
__u32 pad[8]; __u32 pad[8];
} u; } u;
}; };
@@ -1209,6 +1220,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
struct kvm_xen_hvm_config { struct kvm_xen_hvm_config {
__u32 flags; __u32 flags;
@@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_XSAVE */ /* Available with KVM_CAP_XSAVE */
#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
/* Available with KVM_CAP_XCRS */ /* Available with KVM_CAP_XCRS */
#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
@@ -1613,6 +1623,9 @@ struct kvm_enc_region {
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
struct kvm_s390_pv_sec_parm { struct kvm_s390_pv_sec_parm {
__u64 origin; __u64 origin;
__u64 length; __u64 length;


@@ -8,11 +8,12 @@
/s390x/memop /s390x/memop
/s390x/resets /s390x/resets
/s390x/sync_regs_test /s390x/sync_regs_test
/x86_64/amx_test
/x86_64/cpuid_test
/x86_64/cr4_cpuid_sync_test /x86_64/cr4_cpuid_sync_test
/x86_64/debug_regs /x86_64/debug_regs
/x86_64/evmcs_test /x86_64/evmcs_test
/x86_64/emulator_error_test /x86_64/emulator_error_test
/x86_64/get_cpuid_test
/x86_64/get_msr_index_features /x86_64/get_msr_index_features
/x86_64/kvm_clock_test /x86_64/kvm_clock_test
/x86_64/kvm_pv_test /x86_64/kvm_pv_test
@@ -22,6 +23,7 @@
/x86_64/mmio_warning_test /x86_64/mmio_warning_test
/x86_64/mmu_role_test /x86_64/mmu_role_test
/x86_64/platform_info_test /x86_64/platform_info_test
/x86_64/pmu_event_filter_test
/x86_64/set_boot_cpu_id /x86_64/set_boot_cpu_id
/x86_64/set_sregs_test /x86_64/set_sregs_test
/x86_64/sev_migrate_tests /x86_64/sev_migrate_tests
@@ -36,6 +38,7 @@
/x86_64/vmx_apic_access_test /x86_64/vmx_apic_access_test
/x86_64/vmx_close_while_nested_test /x86_64/vmx_close_while_nested_test
/x86_64/vmx_dirty_log_test /x86_64/vmx_dirty_log_test
/x86_64/vmx_exception_with_invalid_guest_state
/x86_64/vmx_invalid_nested_guest_state /x86_64/vmx_invalid_nested_guest_state
/x86_64/vmx_preemption_timer_test /x86_64/vmx_preemption_timer_test
/x86_64/vmx_set_nested_state_test /x86_64/vmx_set_nested_state_test


@@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler
LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test
@@ -69,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test


@@ -364,6 +364,24 @@ static inline unsigned long get_xmm(int n)
} }
bool is_intel_cpu(void); bool is_intel_cpu(void);
bool is_amd_cpu(void);
static inline unsigned int x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
static inline unsigned int x86_model(unsigned int eax)
{
return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
}
struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
@@ -375,6 +393,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index);
struct kvm_cpuid2 *kvm_get_supported_cpuid(void); struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid);
void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid); struct kvm_cpuid2 *cpuid);
@@ -418,6 +438,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr);
void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
uint64_t pte); uint64_t pte);
/*
* get_cpuid() - find matching CPUID entry and return pointer to it.
*/
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index);
/* /*
* set_cpuid() - overwrites a matching cpuid entry with the provided value. * set_cpuid() - overwrites a matching cpuid entry with the provided value.
* matches based on ent->function && ent->index. returns true * matches based on ent->function && ent->index. returns true


@@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
struct kvm_vm *vm; struct kvm_vm *vm;
int i; int i;
#ifdef __x86_64__
/* /*
* Permission needs to be requested before KVM_SET_CPUID2. * Permission needs to be requested before KVM_SET_CPUID2.
*/ */
vm_xsave_req_perm(); vm_xsave_req_perm();
#endif
/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
@@ -497,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
uint64_t first_page, uint32_t num_pages) uint64_t first_page, uint32_t num_pages)
{ {
struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, struct kvm_clear_dirty_log args = {
.first_page = first_page, .dirty_bitmap = log, .slot = slot,
.num_pages = num_pages }; .first_page = first_page,
.num_pages = num_pages
};
int ret; int ret;
ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);


@@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
return entry; return entry;
} }
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
}
/* /*
* VM VCPU CPUID Set * VM VCPU CPUID Set
* *
@@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
void vcpu_set_cpuid(struct kvm_vm *vm, void vcpu_set_cpuid(struct kvm_vm *vm,
uint32_t vcpuid, struct kvm_cpuid2 *cpuid) uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
{ {
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
int rc; int rc;
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
rc, errno); rc, errno);
@@ -1136,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
list->nmsrs = nmsrs; list->nmsrs = nmsrs;
r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
r); r);
state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
r); r);
r = vcpu_save_xsave_state(vm, vcpu, state); r = vcpu_save_xsave_state(vm, vcpu, state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
r); r);
if (kvm_check_cap(KVM_CAP_XCRS)) { if (kvm_check_cap(KVM_CAP_XCRS)) {
r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
@@ -1163,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
} }
r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
r); r);
if (nested_size) { if (nested_size) {
state->nested.size = sizeof(state->nested_); state->nested.size = sizeof(state->nested_);
r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
r); r);
TEST_ASSERT(state->nested.size <= nested_size, TEST_ASSERT(state->nested.size <= nested_size,
"Nested state size too big, %i (KVM_CHECK_CAP gave %i)", "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
state->nested.size, nested_size); state->nested.size, nested_size);
} else } else
state->nested.size = 0; state->nested.size = 0;
@@ -1181,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
for (i = 0; i < nmsrs; i++) for (i = 0; i < nmsrs; i++)
state->msrs.entries[i].index = list->indices[i]; state->msrs.entries[i].index = list->indices[i];
r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
r, r == nmsrs ? -1 : list->indices[r]); r, r == nmsrs ? -1 : list->indices[r]);
r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
r); r);
free(list); free(list);
return state; return state;
@@ -1199,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
TEST_ASSERT(r == state->msrs.nmsrs, TEST_ASSERT(r == state->msrs.nmsrs,
@@ -1214,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
r); r);
if (state->nested.size) { if (state->nested.size) {
r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
r); r);
} }
} }
@@ -1245,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
free(state); free(state);
} }
bool is_intel_cpu(void) static bool cpu_vendor_string_is(const char *vendor)
{ {
const uint32_t *chunk = (const uint32_t *)vendor;
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
const uint32_t *chunk;
const int leaf = 0; const int leaf = 0;
__asm__ __volatile__( __asm__ __volatile__(
@@ -1257,10 +1265,22 @@ bool is_intel_cpu(void)
"=c"(ecx), "=d"(edx) "=c"(ecx), "=d"(edx)
: /* input */ "0"(leaf), "2"(0)); : /* input */ "0"(leaf), "2"(0));
chunk = (const uint32_t *)("GenuineIntel");
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
} }
bool is_intel_cpu(void)
{
return cpu_vendor_string_is("GenuineIntel");
}
/*
* Exclude early K5 samples with a vendor string of "AMDisbetter!"
*/
bool is_amd_cpu(void)
{
return cpu_vendor_string_is("AuthenticAMD");
}
uint32_t kvm_get_cpuid_max_basic(void) uint32_t kvm_get_cpuid_max_basic(void)
{ {
return kvm_get_supported_cpuid_entry(0)->eax; return kvm_get_supported_cpuid_entry(0)->eax;
@@ -1384,6 +1404,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
} }
} }
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index)
{
int i;
for (i = 0; i < cpuid->nent; i++) {
struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
if (cur->function == function && cur->index == index)
return cur;
}
TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
return NULL;
}
bool set_cpuid(struct kvm_cpuid2 *cpuid, bool set_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 *ent) struct kvm_cpuid_entry2 *ent)
{ {
@@ -1479,22 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
return cpuid; return cpuid;
} }
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
static inline unsigned x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
unsigned long vm_compute_max_gfn(struct kvm_vm *vm) unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{ {
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
@@ -1504,11 +1525,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
/* Avoid reserved HyperTransport region on AMD processors. */ /* Avoid reserved HyperTransport region on AMD processors. */
eax = ecx = 0; if (!is_amd_cpu())
cpuid(&eax, &ebx, &ecx, &edx);
if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
return max_gfn; return max_gfn;
/* On parts with <40 physical address bits, the area is fully hidden */ /* On parts with <40 physical address bits, the area is fully hidden */
@@ -1518,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
/* Before family 17h, the HyperTransport area is just below 1T. */ /* Before family 17h, the HyperTransport area is just below 1T. */
ht_gfn = (1 << 28) - num_ht_pages; ht_gfn = (1 << 28) - num_ht_pages;
eax = 1; eax = 1;
ecx = 0;
cpuid(&eax, &ebx, &ecx, &edx); cpuid(&eax, &ebx, &ecx, &edx);
if (x86_family(eax) < 0x17) if (x86_family(eax) < 0x17)
goto done; goto done;


@@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
return guest_cpuids; return guest_cpuids;
} }
static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid)
{
struct kvm_cpuid_entry2 *ent;
int rc;
u32 eax, ebx, x;
/* Setting unmodified CPUID is allowed */
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
/* Changing CPU features is forbidden */
ent = get_cpuid(cpuid, 0x7, 0);
ebx = ent->ebx;
ent->ebx--;
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(rc, "Changing CPU features should fail");
ent->ebx = ebx;
/* Changing MAXPHYADDR is forbidden */
ent = get_cpuid(cpuid, 0x80000008, 0);
eax = ent->eax;
x = eax & 0xff;
ent->eax = (eax & ~0xffu) | (x - 1);
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
ent->eax = eax;
}
int main(void) int main(void)
{ {
struct kvm_cpuid2 *supp_cpuid, *cpuid2; struct kvm_cpuid2 *supp_cpuid, *cpuid2;
@@ -175,5 +203,7 @@ int main(void)
for (stage = 0; stage < 3; stage++) for (stage = 0; stage < 3; stage++)
run_vcpu(vm, VCPU_ID, stage); run_vcpu(vm, VCPU_ID, stage);
set_cpuid_after_run(vm, cpuid2);
kvm_vm_free(vm); kvm_vm_free(vm);
} }


@@ -0,0 +1,434 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Test for x86 KVM_SET_PMU_EVENT_FILTER.
*
* Copyright (C) 2022, Google LLC.
*
* This work is licensed under the terms of the GNU GPL, version 2.
*
* Verifies the expected behavior of allow lists and deny lists for
* virtual PMU events.
*/
#define _GNU_SOURCE /* for program_invocation_short_name */
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
/*
* In lieu of copying perf_event.h into tools...
*/
#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
union cpuid10_eax {
struct {
unsigned int version_id:8;
unsigned int num_counters:8;
unsigned int bit_width:8;
unsigned int mask_length:8;
} split;
unsigned int full;
};
union cpuid10_ebx {
struct {
unsigned int no_unhalted_core_cycles:1;
unsigned int no_instructions_retired:1;
unsigned int no_unhalted_reference_cycles:1;
unsigned int no_llc_reference:1;
unsigned int no_llc_misses:1;
unsigned int no_branch_instruction_retired:1;
unsigned int no_branch_misses_retired:1;
} split;
unsigned int full;
};
/* End of stuff taken from perf_event.h. */
/* Oddly, this isn't in perf_event.h. */
#define ARCH_PERFMON_BRANCHES_RETIRED 5
#define VCPU_ID 0
#define NUM_BRANCHES 42
/*
* This is how the event selector and unit mask are stored in an AMD
* core performance event-select register. Intel's format is similar,
* but the event selector is only 8 bits.
*/
#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
(umask & 0xff) << 8)
/*
* "Branch instructions retired", from the Intel SDM, volume 3,
* "Pre-defined Architectural Performance Events."
*/
#define INTEL_BR_RETIRED EVENT(0xc4, 0)
/*
* "Retired branch instructions", from Processor Programming Reference
* (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
* Preliminary Processor Programming Reference (PPR) for AMD Family
* 17h Model 31h, Revision B0 Processors, and Preliminary Processor
* Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
* B1 Processors Volume 1 of 2.
*/
#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
/*
* This event list comprises Intel's eight architectural events plus
* AMD's "retired branch instructions" for Zen[123] (and possibly
* other AMD CPUs).
*/
static const uint64_t event_list[] = {
EVENT(0x3c, 0),
EVENT(0xc0, 0),
EVENT(0x3c, 1),
EVENT(0x2e, 0x4f),
EVENT(0x2e, 0x41),
EVENT(0xc4, 0),
EVENT(0xc5, 0),
EVENT(0xa4, 1),
AMD_ZEN_BR_RETIRED,
};
/*
* If we encounter a #GP during the guest PMU sanity check, then the guest
* PMU is not functional. Inform the hypervisor via GUEST_SYNC(0).
*/
static void guest_gp_handler(struct ex_regs *regs)
{
GUEST_SYNC(0);
}
/*
* Check that we can write a new value to the given MSR and read it back.
* The caller should provide a non-empty set of bits that are safe to flip.
*
* Return on success. GUEST_SYNC(0) on error.
*/
static void check_msr(uint32_t msr, uint64_t bits_to_flip)
{
uint64_t v = rdmsr(msr) ^ bits_to_flip;
wrmsr(msr, v);
if (rdmsr(msr) != v)
GUEST_SYNC(0);
v ^= bits_to_flip;
wrmsr(msr, v);
if (rdmsr(msr) != v)
GUEST_SYNC(0);
}
static void intel_guest_code(void)
{
check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
check_msr(MSR_P6_EVNTSEL0, 0xffff);
check_msr(MSR_IA32_PMC0, 0xffff);
GUEST_SYNC(1);
for (;;) {
uint64_t br0, br1;
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
br0 = rdmsr(MSR_IA32_PMC0);
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
br1 = rdmsr(MSR_IA32_PMC0);
GUEST_SYNC(br1 - br0);
}
}
/*
* To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
* this code uses the always-available, legacy K7 PMU MSRs, which alias to
* the first four of the six extended core PMU MSRs.
*/
static void amd_guest_code(void)
{
check_msr(MSR_K7_EVNTSEL0, 0xffff);
check_msr(MSR_K7_PERFCTR0, 0xffff);
GUEST_SYNC(1);
for (;;) {
uint64_t br0, br1;
wrmsr(MSR_K7_EVNTSEL0, 0);
wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
br0 = rdmsr(MSR_K7_PERFCTR0);
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
br1 = rdmsr(MSR_K7_PERFCTR0);
GUEST_SYNC(br1 - br0);
}
}
/*
* Run the VM to the next GUEST_SYNC(value), and return the value passed
* to the sync. Any other exit from the guest is fatal.
*/
static uint64_t run_vm_to_sync(struct kvm_vm *vm)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
struct ucall uc;
vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
"Exit_reason other than KVM_EXIT_IO: %u (%s)\n",
run->exit_reason,
exit_reason_str(run->exit_reason));
get_ucall(vm, VCPU_ID, &uc);
TEST_ASSERT(uc.cmd == UCALL_SYNC,
"Received ucall other than UCALL_SYNC: %lu", uc.cmd);
return uc.args[1];
}
/*
* In a nested environment or if the vPMU is disabled, the guest PMU
* might not work as architected (accessing the PMU MSRs may raise
* #GP, or writes could simply be discarded). In those situations,
* there is no point in running these tests. The guest code will perform
* a sanity check and then GUEST_SYNC(success). In the case of failure,
* the behavior of the guest on resumption is undefined.
*/
static bool sanity_check_pmu(struct kvm_vm *vm)
{
bool success;
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
success = run_vm_to_sync(vm);
vm_install_exception_handler(vm, GP_VECTOR, NULL);
return success;
}
static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents)
{
struct kvm_pmu_event_filter *f;
int size = sizeof(*f) + nevents * sizeof(f->events[0]);
f = malloc(size);
TEST_ASSERT(f, "Out of memory");
memset(f, 0, size);
f->nevents = nevents;
return f;
}
static struct kvm_pmu_event_filter *event_filter(uint32_t action)
{
struct kvm_pmu_event_filter *f;
int i;
f = make_pmu_event_filter(ARRAY_SIZE(event_list));
f->action = action;
for (i = 0; i < ARRAY_SIZE(event_list); i++)
f->events[i] = event_list[i];
return f;
}
/*
* Remove the first occurrence of 'event' (if any) from the filter's
* event list.
*/
static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f,
uint64_t event)
{
bool found = false;
int i;
for (i = 0; i < f->nevents; i++) {
if (found)
f->events[i - 1] = f->events[i];
else
found = f->events[i] == event;
}
if (found)
f->nevents--;
return f;
}
static void test_without_filter(struct kvm_vm *vm)
{
uint64_t count = run_vm_to_sync(vm);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static uint64_t test_with_filter(struct kvm_vm *vm,
struct kvm_pmu_event_filter *f)
{
vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f);
return run_vm_to_sync(vm);
}
static void test_member_deny_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
uint64_t count = test_with_filter(vm, f);
free(f);
if (count)
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
__func__, count);
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
}
static void test_member_allow_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
uint64_t count = test_with_filter(vm, f);
free(f);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static void test_not_member_deny_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
uint64_t count;
remove_event(f, INTEL_BR_RETIRED);
remove_event(f, AMD_ZEN_BR_RETIRED);
count = test_with_filter(vm, f);
free(f);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static void test_not_member_allow_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
uint64_t count;
remove_event(f, INTEL_BR_RETIRED);
remove_event(f, AMD_ZEN_BR_RETIRED);
count = test_with_filter(vm, f);
free(f);
if (count)
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
__func__, count);
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
}
/*
* Check for a non-zero PMU version, at least one general-purpose
* counter per logical processor, an EBX bit vector of length greater
* than 5, and EBX[5] clear.
*/
static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
{
union cpuid10_eax eax = { .full = entry->eax };
union cpuid10_ebx ebx = { .full = entry->ebx };
return eax.split.version_id && eax.split.num_counters > 0 &&
eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
!ebx.split.no_branch_instruction_retired;
}
/*
* Note that CPUID leaf 0xa is Intel-specific. This leaf should be
* clear on AMD hardware.
*/
static bool use_intel_pmu(void)
{
struct kvm_cpuid_entry2 *entry;
entry = kvm_get_supported_cpuid_index(0xa, 0);
return is_intel_cpu() && entry && check_intel_pmu_leaf(entry);
}
static bool is_zen1(uint32_t eax)
{
return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f;
}
static bool is_zen2(uint32_t eax)
{
return x86_family(eax) == 0x17 &&
x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f;
}
static bool is_zen3(uint32_t eax)
{
return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f;
}
/*
* Determining AMD support for a PMU event requires consulting the AMD
* PPR for the CPU or reference material derived therefrom. The AMD
* test code herein has been verified to work on Zen1, Zen2, and Zen3.
*
* Feel free to add more AMD CPUs that are documented to support event
* select 0xc2 umask 0 as "retired branch instructions."
*/
static bool use_amd_pmu(void)
{
struct kvm_cpuid_entry2 *entry;
entry = kvm_get_supported_cpuid_index(1, 0);
return is_amd_cpu() && entry &&
(is_zen1(entry->eax) ||
is_zen2(entry->eax) ||
is_zen3(entry->eax));
}
int main(int argc, char *argv[])
{
void (*guest_code)(void) = NULL;
struct kvm_vm *vm;
int r;
/* Tell stdout not to buffer its content */
setbuf(stdout, NULL);
r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER);
if (!r) {
print_skip("KVM_CAP_PMU_EVENT_FILTER not supported");
exit(KSFT_SKIP);
}
if (use_intel_pmu())
guest_code = intel_guest_code;
else if (use_amd_pmu())
guest_code = amd_guest_code;
if (!guest_code) {
print_skip("Don't know how to test this guest PMU");
exit(KSFT_SKIP);
}
vm = vm_create_default(VCPU_ID, 0, guest_code);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
if (!sanity_check_pmu(vm)) {
print_skip("Guest PMU is not functional");
exit(KSFT_SKIP);
}
test_without_filter(vm);
test_member_deny_list(vm);
test_member_allow_list(vm);
test_not_member_deny_list(vm);
test_not_member_allow_list(vm);
kvm_vm_free(vm);
return 0;
}


@@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
switch (get_ucall(vm, vcpuid, &uc)) { switch (get_ucall(vm, vcpuid, &uc)) {
case UCALL_SYNC: case UCALL_SYNC:
TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
stage + 1, (ulong)uc.args[1]); stage + 1, (ulong)uc.args[1]);
return; return;
case UCALL_DONE: case UCALL_DONE:
return; return;


@@ -30,8 +30,8 @@ static struct kvm_vm *vm;
static void l2_guest_code(void) static void l2_guest_code(void)
{ {
/* Exit to L0 */ /* Exit to L0 */
asm volatile("inb %%dx, %%al" asm volatile("inb %%dx, %%al"
: : [port] "d" (PORT_L0_EXIT) : "rax"); : : [port] "d" (PORT_L0_EXIT) : "rax");
} }
static void l1_guest_code(struct vmx_pages *vmx_pages) static void l1_guest_code(struct vmx_pages *vmx_pages)


@@ -0,0 +1,139 @@
// SPDX-License-Identifier: GPL-2.0-only
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include <signal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include "kselftest.h"
#define VCPU_ID 0
static struct kvm_vm *vm;
static void guest_ud_handler(struct ex_regs *regs)
{
/* Loop on the ud2 until guest state is made invalid. */
}
static void guest_code(void)
{
asm volatile("ud2");
}
static void __run_vcpu_with_invalid_state(void)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
"Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n",
run->exit_reason, exit_reason_str(run->exit_reason));
TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
"Expected emulation failure, got %d\n",
run->emulation_failure.suberror);
}
static void run_vcpu_with_invalid_state(void)
{
/*
* Always run twice to verify KVM handles the case where _KVM_ queues
* an exception with invalid state and then exits to userspace, i.e.
* that KVM doesn't explode if userspace ignores the initial error.
*/
__run_vcpu_with_invalid_state();
__run_vcpu_with_invalid_state();
}
static void set_timer(void)
{
struct itimerval timer;
timer.it_value.tv_sec = 0;
timer.it_value.tv_usec = 200;
timer.it_interval = timer.it_value;
ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
}
static void set_or_clear_invalid_guest_state(bool set)
{
static struct kvm_sregs sregs;
if (!sregs.cr0)
vcpu_sregs_get(vm, VCPU_ID, &sregs);
sregs.tr.unusable = !!set;
vcpu_sregs_set(vm, VCPU_ID, &sregs);
}
static void set_invalid_guest_state(void)
{
set_or_clear_invalid_guest_state(true);
}
static void clear_invalid_guest_state(void)
{
set_or_clear_invalid_guest_state(false);
}
static void sigalrm_handler(int sig)
{
struct kvm_vcpu_events events;
TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
vcpu_events_get(vm, VCPU_ID, &events);
/*
* If an exception is pending, attempt KVM_RUN with invalid guest,
* otherwise rearm the timer and keep doing so until the timer fires
* between KVM queueing an exception and re-entering the guest.
*/
if (events.exception.pending) {
set_invalid_guest_state();
run_vcpu_with_invalid_state();
} else {
set_timer();
}
}
int main(int argc, char *argv[])
{
if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) {
print_skip("Must be run with kvm_intel.unrestricted_guest=0");
exit(KSFT_SKIP);
}
vm = vm_create_default(VCPU_ID, 0, (void *)guest_code);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
/*
* Stuff invalid guest state for L2 by making TR unusable. The next
* KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
* emulating invalid guest state for L2.
*/
set_invalid_guest_state();
run_vcpu_with_invalid_state();
/*
* Verify KVM also handles the case where userspace gains control while
* an exception is pending and stuffs invalid state. Run with valid
* guest state and a timer firing every 200us, and attempt to enter the
* guest with invalid state when the handler interrupts KVM with an
* exception pending.
*/
clear_invalid_guest_state();
TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
"Failed to register SIGALRM handler, errno = %d (%s)",
errno, strerror(errno));
set_timer();
run_vcpu_with_invalid_state();
}


@@ -46,20 +46,20 @@ static struct kvm_vm *vm;
#define MIN_STEAL_TIME 50000 #define MIN_STEAL_TIME 50000
struct pvclock_vcpu_time_info { struct pvclock_vcpu_time_info {
u32 version; u32 version;
u32 pad0; u32 pad0;
u64 tsc_timestamp; u64 tsc_timestamp;
u64 system_time; u64 system_time;
u32 tsc_to_system_mul; u32 tsc_to_system_mul;
s8 tsc_shift; s8 tsc_shift;
u8 flags; u8 flags;
u8 pad[2]; u8 pad[2];
} __attribute__((__packed__)); /* 32 bytes */ } __attribute__((__packed__)); /* 32 bytes */
struct pvclock_wall_clock { struct pvclock_wall_clock {
u32 version; u32 version;
u32 sec; u32 sec;
u32 nsec; u32 nsec;
} __attribute__((__packed__)); } __attribute__((__packed__));
struct vcpu_runstate_info { struct vcpu_runstate_info {
@@ -74,11 +74,11 @@ struct arch_vcpu_info {
}; };
struct vcpu_info { struct vcpu_info {
uint8_t evtchn_upcall_pending; uint8_t evtchn_upcall_pending;
uint8_t evtchn_upcall_mask; uint8_t evtchn_upcall_mask;
unsigned long evtchn_pending_sel; unsigned long evtchn_pending_sel;
struct arch_vcpu_info arch; struct arch_vcpu_info arch;
struct pvclock_vcpu_time_info time; struct pvclock_vcpu_time_info time;
}; /* 64 bytes (x86) */ }; /* 64 bytes (x86) */
struct shared_info { struct shared_info {
@@ -493,7 +493,7 @@ int main(int argc, char *argv[])
vm_ts.tv_sec = wc->sec; vm_ts.tv_sec = wc->sec;
vm_ts.tv_nsec = wc->nsec; vm_ts.tv_nsec = wc->nsec;
TEST_ASSERT(wc->version && !(wc->version & 1), TEST_ASSERT(wc->version && !(wc->version & 1),
"Bad wallclock version %x", wc->version); "Bad wallclock version %x", wc->version);
TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");


@@ -427,9 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
#endif #endif
kvm_async_pf_vcpu_init(vcpu); kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1;
INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false; vcpu->preempted = false;
@@ -3163,8 +3160,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
{ {
struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
return; return;
#endif
if (memslot && kvm_slot_dirty_track_enabled(memslot)) { if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
unsigned long rel_gfn = gfn - memslot->base_gfn; unsigned long rel_gfn = gfn - memslot->base_gfn;