Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more kvm updates from Paolo Bonzini:
 "Generic:

   - selftest compilation fix for non-x86

   - KVM: avoid warning on s390 in mark_page_dirty

 x86:

   - fix page write-protection bug and improve comments

   - use binary search to lookup the PMU event filter, add test

   - enable_pmu module parameter support for Intel CPUs

   - switch blocked_vcpu_on_cpu_lock to raw spinlock

   - cleanups of blocked vCPU logic

   - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)

   - various small fixes"
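
  As a quick illustration of the "binary search to lookup the PMU event filter" item above, here is a
  minimal userspace C sketch of the sort-once, bsearch-per-lookup pattern; filter_contains_event and the
  sample event values are illustrative names only, not KVM code, and the real kernel change is in the
  PMU diff further down.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Comparator that avoids truncating a 64-bit difference into an int. */
    static int cmp_u64(const void *a, const void *b)
    {
            uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

            return x < y ? -1 : x > y;
    }

    /* Sorted-array membership test; the array must already be sorted with cmp_u64. */
    static int filter_contains_event(const uint64_t *events, size_t nevents, uint64_t key)
    {
            return bsearch(&key, events, nevents, sizeof(key), cmp_u64) != NULL;
    }

    int main(void)
    {
            uint64_t events[] = { 0xc0, 0x3c, 0x2e41, 0xc4 };
            size_t nevents = sizeof(events) / sizeof(events[0]);

            /* Sort once when the filter is installed, then look up in O(log n). */
            qsort(events, nevents, sizeof(events[0]), cmp_u64);

            printf("0x3c in filter? %d\n", filter_contains_event(events, nevents, 0x3c));
            printf("0xff in filter? %d\n", filter_contains_event(events, nevents, 0xff));
            return 0;
    }

  The kernel version sorts the filter once when it is installed via the PMU event filter ioctl, so each
  counter reprogram does a logarithmic lookup instead of a linear scan.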

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits)
  docs: kvm: fix WARNINGs from api.rst
  selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c
  selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c
  kvm: selftests: Do not indent with spaces
  kvm: selftests: sync uapi/linux/kvm.h with Linux header
  selftests: kvm: add amx_test to .gitignore
  KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled
  KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops
  KVM: SVM: Drop AVIC's intermediate avic_set_running() helper
  KVM: VMX: Don't do full kick when handling posted interrupt wakeup
  KVM: VMX: Fold fallback path into triggering posted IRQ helper
  KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ
  KVM: VMX: Don't do full kick when triggering posted interrupt "fails"
  KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU
  KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption
  KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path
  KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs
  KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode
  KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks
  KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers
  ...
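
For the "partially allow KVM_SET_CPUID{,2} after KVM_RUN" item, the hedged sketch below shows the
relaxed rule from a VMM's point of view: re-sending an identical CPUID table after the vCPU has run
(as some VMMs do when reusing vCPU fds for CPU hotplug) now succeeds, while a modified table is
rejected with EINVAL. Only the KVM_SET_CPUID2 ioctl and struct kvm_cpuid2 are real API here; the
helper name reset_cpuid_after_run is hypothetical and the vcpu_fd/cpuid setup is assumed to exist.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Re-issue KVM_SET_CPUID2 on a vCPU that has already executed KVM_RUN.
     * With this series, an unchanged table is accepted; a changed one fails
     * with EINVAL instead of silently confusing the MMU's cached CPUID state.
     */
    static int reset_cpuid_after_run(int vcpu_fd, struct kvm_cpuid2 *cpuid)
    {
            if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) == 0)
                    return 0;

            if (errno == EINVAL)
                    fprintf(stderr, "CPUID data differs from the running vCPU's\n");
            else
                    fprintf(stderr, "KVM_SET_CPUID2: %s\n", strerror(errno));

            return -errno;
    }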
Commit 636b5284d8 by Linus Torvalds, 2022-01-22 09:40:01 +02:00. 36 changed files with 1428 additions and 635 deletions.

View file

@@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
 The Stats Data block contains an array of 64-bit values in the same order
 as the descriptors in Descriptors block.
 
-4.42 KVM_GET_XSAVE2
-------------------
+4.134 KVM_GET_XSAVE2
+--------------------
 
 :Capability: KVM_CAP_XSAVE2
 :Architectures: x86
@@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
 limit the attack surface on KVM's MSR emulation code.
 
 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
------------------------------
+-------------------------------------
 
 Architectures: x86

View file

@@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
 KVM_X86_OP_NULL(tlb_remote_flush_with_range)
 KVM_X86_OP(tlb_flush_gva)
 KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
 KVM_X86_OP(run)
 KVM_X86_OP_NULL(handle_exit)
 KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_NULL(request_immediate_exit)
 KVM_X86_OP(sched_in)
 KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
 KVM_X86_OP_NULL(vcpu_blocking)
 KVM_X86_OP_NULL(vcpu_unblocking)
 KVM_X86_OP_NULL(update_pi_irte)

View file

@@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
	 */
	void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
 
+	int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
	enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
	int (*handle_exit)(struct kvm_vcpu *vcpu,
		enum exit_fastpath_completion exit_fastpath);
@@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
	const struct kvm_pmu_ops *pmu_ops;
	const struct kvm_x86_nested_ops *nested_ops;
 
-	/*
-	 * Architecture specific hooks for vCPU blocking due to
-	 * HLT instruction.
-	 * Returns for .pre_block():
-	 *    - 0 means continue to block the vCPU.
-	 *    - 1 means we cannot block the vCPU since some event
-	 *        happens during this period, such as, 'ON' bit in
-	 *        posted-interrupts descriptor is set.
-	 */
-	int (*pre_block)(struct kvm_vcpu *vcpu);
-	void (*post_block)(struct kvm_vcpu *vcpu);
-
	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);

View file

@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
} }
/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
{
struct kvm_cpuid_entry2 *orig;
int i;
if (nent != vcpu->arch.cpuid_nent)
return -EINVAL;
for (i = 0; i < nent; i++) {
orig = &vcpu->arch.cpuid_entries[i];
if (e2[i].function != orig->function ||
e2[i].index != orig->index ||
e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
return -EINVAL;
}
return 0;
}
static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{ {
u32 function; u32 function;
@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
} }
} }
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{ {
u32 base = vcpu->arch.kvm_cpuid_base; u32 base = vcpu->arch.kvm_cpuid_base;
if (!base) if (!base)
return NULL; return NULL;
return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
}
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
{
return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent);
} }
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax; vcpu->arch.pv_cpuid.features = best->eax;
} }
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
int nent)
{ {
struct kvm_cpuid_entry2 *best; struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 1, 0); best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) { if (best) {
/* Update OSXSAVE bit */ /* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE)) if (boot_cpu_has(X86_FEATURE_XSAVE))
@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
} }
best = kvm_find_cpuid_entry(vcpu, 7, 0); best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE, cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
best = kvm_find_cpuid_entry(vcpu, 0xD, 0); best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best) if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false); best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
best = kvm_find_cpuid_entry(vcpu, 0xD, 1); best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC))) cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true); best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
best = kvm_find_kvm_cpuid_features(vcpu); best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best && if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT))) (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0); best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best) if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT, cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr & vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT); MSR_IA32_MISC_ENABLE_MWAIT);
} }
} }
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
__kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
{ {
int r; int r;
__kvm_update_cpuid_runtime(vcpu, e2, nent);
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly. It would've been better to forbid any
* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
if (vcpu->arch.last_vmentry_cpu != -1)
return kvm_cpuid_check_equal(vcpu, e2, nent);
r = kvm_check_cpuid(vcpu, e2, nent); r = kvm_check_cpuid(vcpu, e2, nent);
if (r) if (r)
return r; return r;
@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent; vcpu->arch.cpuid_nent = nent;
kvm_update_kvm_cpuid_base(vcpu); kvm_update_kvm_cpuid_base(vcpu);
kvm_update_cpuid_runtime(vcpu);
kvm_vcpu_after_set_cpuid(vcpu); kvm_vcpu_after_set_cpuid(vcpu);
return 0; return 0;
@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
perf_get_x86_pmu_capability(&cap); perf_get_x86_pmu_capability(&cap);
/* /*
* Only support guest architectural pmu on a host * The guest architecture pmu is only supported if the architecture
* with architectural pmu. * pmu exists on the host and the module parameters allow it.
*/ */
if (!cap.version) if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap)); memset(&cap, 0, sizeof(cap));
eax.split.version_id = min(cap.version, 2); eax.split.version_id = min(cap.version, 2);
@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
--array->nent; --array->nent;
continue; continue;
} }
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
entry->ecx &= ~BIT_ULL(2);
entry->edx = 0; entry->edx = 0;
} }
break; break;

View file

@@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
	restart_apic_timer(vcpu->arch.apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 {
@@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
	start_sw_timer(apic);
	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
 {

View file

@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
continue; continue;
flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush); start, end - 1, true, flush);
} }
@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
} }
/* /*
* We can flush all the TLBs out of the mmu lock without TLB * Flush TLBs if any SPTEs had to be write-protected to ensure that
* corruption since we just change the spte from writable to * guest writes are reflected in the dirty bitmap before the memslot
* readonly so that we only need to care the case of changing * update completes, i.e. before enabling dirty logging is visible to
* spte from present to present (changing the spte from present * userspace.
* to nonpresent will flush all the TLBs immediately), in other *
* words, the only case we care is mmu_spte_update() where we * Perform the TLB flush outside the mmu_lock to reduce the amount of
* have checked Host-writable | MMU-writable instead of * time the lock is held. However, this does mean that another CPU can
* PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK * now grab mmu_lock and encounter a write-protected SPTE while CPUs
* anymore. * still have a writable mapping for the associated GFN in their TLB.
*
* This is safe but requires KVM to be careful when making decisions
* based on the write-protection status of an SPTE. Specifically, KVM
* also write-protects SPTEs to monitor changes to guest page tables
* during shadow paging, and must guarantee no CPUs can write to those
* page before the lock is dropped. As mentioned in the previous
* paragraph, a write-protected SPTE is no guarantee that CPU cannot
* perform writes. So to determine if a TLB flush is truly required, KVM
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
* See DEFAULT_SPTE_MMU_WRITEABLE for more details.
*/ */
if (flush) if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

View file

@@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
 
	new_spte &= ~PT_WRITABLE_MASK;
	new_spte &= ~shadow_host_writable_mask;
+	new_spte &= ~shadow_mmu_writable_mask;
 
	new_spte = mark_spte_for_access_track(new_spte);

View file

@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
/* /*
* The mask/shift to use for saving the original R/X bits when marking the PTE * The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the * as not-present for access tracking purposes. We do not save the W bit as the
@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
* *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
* writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
* that map guest pages in read-only memslots and read-only VMAs.
*
* Invariants:
* - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
*
*
* *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
* allows writes to the guest page mapped by the SPTE. This bit is cleared when
* the guest page mapped by the SPTE contains a page table that is being
* monitored for shadow paging. In this case the SPTE can only be made writable
* by unsyncing the shadow page under the mmu_lock.
*
* Invariants:
* - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
* - If MMU-writable is set, Host-writable must be set.
*
* If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
* to track writes for dirty logging. For such SPTEs, KVM will locklessly set
* PT_WRITABLE_MASK upon the next write from the guest and record the write in
* the dirty log (see fast_page_fault()).
*/
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
/* /*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
* to not overlap the A/D type mask or the saved access bits of access-tracked * to not overlap the A/D type mask or the saved access bits of access-tracked
@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
static inline bool spte_can_locklessly_be_made_writable(u64 spte) static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{ {
return (spte & shadow_host_writable_mask) && if (spte & shadow_mmu_writable_mask) {
(spte & shadow_mmu_writable_mask); WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
return true;
}
WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
return false;
} }
static inline u64 get_mmio_spte_generation(u64 spte) static inline u64 get_mmio_spte_generation(u64 spte)

View file

@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
		    !is_last_spte(iter.old_spte, iter.level))
			continue;
 
-		if (!is_writable_pte(iter.old_spte))
-			break;
-
		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
+		if (new_spte == iter.old_spte)
+			break;
+
		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

View file

@ -13,6 +13,8 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h> #include <asm/perf_event.h>
#include "x86.h" #include "x86.h"
#include "cpuid.h" #include "cpuid.h"
@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.config = config, .config = config,
}; };
if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
return;
attr.sample_period = get_sample_period(pmc, pmc->counter); attr.sample_period = get_sample_period(pmc, pmc->counter);
if (in_tx) if (in_tx)
@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true; return true;
} }
static int cmp_u64(const void *a, const void *b)
{
return *(__u64 *)a - *(__u64 *)b;
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{ {
unsigned config, type = PERF_TYPE_RAW; unsigned config, type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm; struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter; struct kvm_pmu_event_filter *filter;
int i;
bool allow_event = true; bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) { if (filter) {
for (i = 0; i < filter->nevents; i++) __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
if (filter->events[i] ==
(eventsel & AMD64_RAW_EVENT_MASK_NB)) if (bsearch(&key, filter->events, filter->nevents,
break; sizeof(__u64), cmp_u64))
if (filter->action == KVM_PMU_EVENT_ALLOW && allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
i == filter->nevents) else
allow_event = false; allow_event = filter->action == KVM_PMU_EVENT_DENY;
if (filter->action == KVM_PMU_EVENT_DENY &&
i < filter->nevents)
allow_event = false;
} }
if (!allow_event) if (!allow_event)
return; return;
@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
/* Ensure nevents can't be changed between the user copies. */ /* Ensure nevents can't be changed between the user copies. */
*filter = tmp; *filter = tmp;
/*
* Sort the in-kernel list so that we can search it with bsearch.
*/
sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
mutex_lock(&kvm->lock); mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock)); mutex_is_locked(&kvm->lock));

View file

@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
unsigned long i; unsigned long i;
/*
* Wake any target vCPUs that are blocking, i.e. waiting for a wake
* event. There's no need to signal doorbells, as hardware has handled
* vCPUs that were in guest at the time of the IPI, and vCPUs that have
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
bool m = kvm_apic_match_dest(vcpu, source, if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh),
GET_APIC_DEST_FIELD(icrh), icrl & APIC_DEST_MASK))
icrl & APIC_DEST_MASK);
if (m && !avic_vcpu_is_running(vcpu))
kvm_vcpu_wake_up(vcpu); kvm_vcpu_wake_up(vcpu);
} }
} }
@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
return -1; return -1;
kvm_lapic_set_irr(vec, vcpu->arch.apic); kvm_lapic_set_irr(vec, vcpu->arch.apic);
/*
* Pairs with the smp_mb_*() after setting vcpu->guest_mode in
* vcpu_enter_guest() to ensure the write to the vIRR is ordered before
* the read of guest_mode, which guarantees that either VMRUN will see
* and process the new vIRR entry, or that the below code will signal
* the doorbell if the vCPU is already running in the guest.
*/
smp_mb__after_atomic(); smp_mb__after_atomic();
if (avic_vcpu_is_running(vcpu)) { /*
* Signal the doorbell to tell hardware to inject the IRQ if the vCPU
* is in the guest. If the vCPU is not in the guest, hardware will
* automatically process AVIC interrupts at VMRUN.
*/
if (vcpu->mode == IN_GUEST_MODE) {
int cpu = READ_ONCE(vcpu->cpu); int cpu = READ_ONCE(vcpu->cpu);
/* /*
@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
if (cpu != get_cpu()) if (cpu != get_cpu())
wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
put_cpu(); put_cpu();
} else } else {
/*
* Wake the vCPU if it was blocking. KVM will then detect the
* pending IRQ when checking if the vCPU has a wake event.
*/
kvm_vcpu_wake_up(vcpu); kvm_vcpu_wake_up(vcpu);
}
return 0; return 0;
} }
@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu); int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
/* /*
* Since the host physical APIC id is 8 bits, * Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255. * we can support host APIC ID upto 255.
@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return; return;
/*
* No need to update anything if the vCPU is blocking, i.e. if the vCPU
* is being scheduled in after being preempted. The CPU entries in the
* Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
* If the vCPU was migrated, its new CPU value will be stuffed when the
* vCPU unblocks.
*/
if (kvm_vcpu_is_blocking(vcpu))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache)); entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
if (svm->avic_is_running)
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry); WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
svm->avic_is_running);
} }
void avic_vcpu_put(struct kvm_vcpu *vcpu) void avic_vcpu_put(struct kvm_vcpu *vcpu)
@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry; u64 entry;
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
entry = READ_ONCE(*(svm->avic_physical_id_cache)); entry = READ_ONCE(*(svm->avic_physical_id_cache));
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
avic_update_iommu_vcpu_affinity(vcpu, -1, 0); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
return;
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry); WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
} }
/* void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
* This function is called during VCPU halt/unhalt.
*/
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{ {
struct vcpu_svm *svm = to_svm(vcpu); if (!kvm_vcpu_apicv_active(vcpu))
int cpu = get_cpu(); return;
preempt_disable();
/*
* Unload the AVIC when the vCPU is about to block, _before_
* the vCPU actually blocks.
*
* Any IRQs that arrive before IsRunning=0 will not cause an
* incomplete IPI vmexit on the source, therefore vIRR will also
* be checked by kvm_vcpu_check_block() before blocking. The
* memory barrier implicit in set_current_state orders writing
* IsRunning=0 before reading the vIRR. The processor needs a
* matching memory barrier on interrupt delivery between writing
* IRR and reading IsRunning; the lack of this barrier might be
* the cause of errata #1235).
*/
avic_vcpu_put(vcpu);
preempt_enable();
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
int cpu;
if (!kvm_vcpu_apicv_active(vcpu))
return;
cpu = get_cpu();
WARN_ON(cpu != vcpu->cpu); WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
if (kvm_vcpu_apicv_active(vcpu)) { avic_vcpu_load(vcpu, cpu);
if (is_run)
avic_vcpu_load(vcpu, cpu);
else
avic_vcpu_put(vcpu);
}
put_cpu(); put_cpu();
} }
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
{
avic_set_running(vcpu, false);
}
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
kvm_vcpu_update_apicv(vcpu);
avic_set_running(vcpu, true);
}

View file

@@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
 {
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
 
-	if (!pmu)
+	if (!enable_pmu)
		return NULL;
 
	switch (msr) {

View file

@ -192,10 +192,6 @@ module_param(vgif, int, 0444);
static int lbrv = true; static int lbrv = true;
module_param(lbrv, int, 0444); module_param(lbrv, int, 0444);
/* enable/disable PMU virtualization */
bool pmu = true;
module_param(pmu, bool, 0444);
static int tsc_scaling = true; static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444); module_param(tsc_scaling, int, 0444);
@ -873,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
} }
} }
/*
* The default MMIO mask is a single bit (excluding the present bit),
* which could conflict with the memory encryption bit. Check for
* memory encryption support and override the default MMIO mask if
* memory encryption is enabled.
*/
static __init void svm_adjust_mmio_mask(void)
{
unsigned int enc_bit, mask_bit;
u64 msr, mask;
/* If there is no memory encryption support, use existing mask */
if (cpuid_eax(0x80000000) < 0x8000001f)
return;
/* If memory encryption is not enabled, use existing mask */
rdmsrl(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
mask_bit = boot_cpu_data.x86_phys_bits;
/* Increment the mask bit if it is the same as the encryption bit */
if (enc_bit == mask_bit)
mask_bit++;
/*
* If the mask bit location is below 52, then some bits above the
* physical addressing limit will always be reserved, so use the
* rsvd_bits() function to generate the mask. This mask, along with
* the present bit, will be used to generate a page fault with
* PFER.RSV = 1.
*
* If the mask bit location is 52 (or above), then clear the mask.
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static void svm_hardware_teardown(void) static void svm_hardware_teardown(void)
{ {
int cpu; int cpu;
@ -928,198 +883,6 @@ static void svm_hardware_teardown(void)
iopm_base = 0; iopm_base = 0;
} }
static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* AMD PMU PERFCTR_CORE CPUID */
if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
unsigned int order = get_order(IOPM_SIZE);
/*
* NX is required for shadow paging and for NPT if the NX huge pages
* mitigation is enabled.
*/
if (!boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
kvm_enable_efer_bits(EFER_NX);
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (tsc_scaling) {
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
pause_filter_thresh = 0;
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
pause_filter_thresh = 0;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
svm_hv_hardware_setup();
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
}
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
if (enable_apicv) {
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
svm_gp_erratum_intercept = false;
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
else
pr_info("Virtual GIF supported\n");
}
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
else
pr_info("LBR virtualization supported\n");
}
if (!pmu)
pr_info("PMU virtualization is disabled\n");
svm_set_cpu_caps();
/*
* It seems that on AMD processors PTE's accessed bit is
* being set by the CPU hardware before the NPF vmexit.
* This is not expected behaviour and our tests fail because
* of it.
* A workaround here is to disable support for
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
* In this case userspace can know if there is support using
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
* it
* If future AMD CPU models change the behaviour described above,
* this variable can be changed accordingly
*/
allow_smaller_maxphyaddr = !npt_enabled;
return 0;
err:
svm_hardware_teardown();
return r;
}
static void init_seg(struct vmcb_seg *seg) static void init_seg(struct vmcb_seg *seg)
{ {
seg->selector = 0; seg->selector = 0;
@ -1444,12 +1207,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (err) if (err)
goto error_free_vmsa_page; goto error_free_vmsa_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
*/
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;
svm->msrpm = svm_vcpu_alloc_msrpm(); svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) { if (!svm->msrpm) {
err = -ENOMEM; err = -ENOMEM;
@ -3833,6 +3590,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu); svm_complete_interrupts(vcpu);
} }
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
return 1;
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{ {
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@ -4629,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.prepare_guest_switch = svm_prepare_guest_switch, .prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load, .vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put, .vcpu_put = svm_vcpu_put,
.vcpu_blocking = svm_vcpu_blocking, .vcpu_blocking = avic_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking, .vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap, .update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature, .get_msr_feature = svm_get_msr_feature,
@ -4662,6 +4424,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_gva = svm_flush_tlb_gva,
.tlb_flush_guest = svm_flush_tlb, .tlb_flush_guest = svm_flush_tlb,
.vcpu_pre_run = svm_vcpu_pre_run,
.run = svm_vcpu_run, .run = svm_vcpu_run,
.handle_exit = handle_exit, .handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction, .skip_emulated_instruction = skip_emulated_instruction,
@ -4742,6 +4505,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
}; };
/*
* The default MMIO mask is a single bit (excluding the present bit),
* which could conflict with the memory encryption bit. Check for
* memory encryption support and override the default MMIO mask if
* memory encryption is enabled.
*/
static __init void svm_adjust_mmio_mask(void)
{
unsigned int enc_bit, mask_bit;
u64 msr, mask;
/* If there is no memory encryption support, use existing mask */
if (cpuid_eax(0x80000000) < 0x8000001f)
return;
/* If memory encryption is not enabled, use existing mask */
rdmsrl(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
mask_bit = boot_cpu_data.x86_phys_bits;
/* Increment the mask bit if it is the same as the encryption bit */
if (enc_bit == mask_bit)
mask_bit++;
/*
* If the mask bit location is below 52, then some bits above the
* physical addressing limit will always be reserved, so use the
* rsvd_bits() function to generate the mask. This mask, along with
* the present bit, will be used to generate a page fault with
* PFER.RSV = 1.
*
* If the mask bit location is 52 (or above), then clear the mask.
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* AMD PMU PERFCTR_CORE CPUID */
if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
unsigned int order = get_order(IOPM_SIZE);
/*
* NX is required for shadow paging and for NPT if the NX huge pages
* mitigation is enabled.
*/
if (!boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
kvm_enable_efer_bits(EFER_NX);
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (tsc_scaling) {
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
pause_filter_thresh = 0;
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
pause_filter_thresh = 0;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
svm_hv_hardware_setup();
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
}
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
if (enable_apicv) {
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
} else {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
svm_gp_erratum_intercept = false;
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
else
pr_info("Virtual GIF supported\n");
}
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
else
pr_info("LBR virtualization supported\n");
}
if (!enable_pmu)
pr_info("PMU virtualization is disabled\n");
svm_set_cpu_caps();
/*
* It seems that on AMD processors PTE's accessed bit is
* being set by the CPU hardware before the NPF vmexit.
* This is not expected behaviour and our tests fail because
* of it.
* A workaround here is to disable support for
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
* In this case userspace can know if there is support using
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
* it
* If future AMD CPU models change the behaviour described above,
* this variable can be changed accordingly
*/
allow_smaller_maxphyaddr = !npt_enabled;
return 0;
err:
svm_hardware_teardown();
return r;
}
static struct kvm_x86_init_ops svm_init_ops __initdata = { static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm, .cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled, .disabled_by_bios = is_disabled,

View file

@ -32,7 +32,6 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled; extern bool npt_enabled;
extern bool intercept_smi; extern bool intercept_smi;
extern bool pmu;
/* /*
* Clean bits in VMCB. * Clean bits in VMCB.
@ -226,7 +225,6 @@ struct vcpu_svm {
u32 dfr_reg; u32 dfr_reg;
struct page *avic_backing_page; struct page *avic_backing_page;
u64 *avic_physical_id_cache; u64 *avic_physical_id_cache;
bool avic_is_running;
/* /*
* Per-vcpu list of struct amd_svm_iommu_ir: * Per-vcpu list of struct amd_svm_iommu_ir:
@ -574,17 +572,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 *entry = svm->avic_physical_id_cache;
if (!entry)
return false;
return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
}
int avic_ga_log_notifier(u32 ga_tag); int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm); void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm); int avic_vm_init(struct kvm *kvm);
@ -605,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set); uint32_t guest_irq, bool set);
void svm_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */ /* sev.c */

View file

@@ -5,6 +5,7 @@
 #include <asm/vmx.h>
 
 #include "lapic.h"
+#include "x86.h"
 
 extern bool __read_mostly enable_vpid;
 extern bool __read_mostly flexpriority_enabled;
@@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void)
 {
	u64 perf_cap = 0;
 
+	if (!enable_pmu)
+		return perf_cap;
+
	if (boot_cpu_has(X86_FEATURE_PDCM))
		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);

View file

@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = { static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
/* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
}; };
@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i; int i;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
if (intel_arch_events[i].eventsel == event_select && if (intel_arch_events[i].eventsel != event_select ||
intel_arch_events[i].unit_mask == unit_mask && intel_arch_events[i].unit_mask != unit_mask)
(pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) continue;
break;
/* disable event that reported as not present by cpuid */
if ((i < 7) && !(pmu->available_event_types & (1 << i)))
return PERF_COUNT_HW_MAX + 1;
break;
}
if (i == ARRAY_SIZE(intel_arch_events)) if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX; return PERF_COUNT_HW_MAX;
@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull; pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry) if (!entry || !enable_pmu)
return; return;
eax.full = entry->eax; eax.full = entry->eax;
edx.full = entry->edx; edx.full = entry->edx;

View file

@ -19,7 +19,7 @@
* wake the target vCPUs. vCPUs are removed from the list and the notification * wake the target vCPUs. vCPUs are removed from the list and the notification
* vector is reset when the vCPU is scheduled in. * vector is reset when the vCPU is scheduled in.
*/ */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
/* /*
* Protect the per-CPU list with a per-CPU spinlock to handle task migration. * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
* When a blocking vCPU is awakened _and_ migrated to a different pCPU, the * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
* CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
* occur if a wakeup IRQ arrives and attempts to acquire the lock. * occur if a wakeup IRQ arrives and attempts to acquire the lock.
*/ */
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{ {
@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{ {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new; struct pi_desc old, new;
unsigned long flags;
unsigned int dest; unsigned int dest;
/* /*
@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!enable_apicv || !lapic_in_kernel(vcpu)) if (!enable_apicv || !lapic_in_kernel(vcpu))
return; return;
/* Nothing to do if PI.SN and PI.NDST both have the desired value. */
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
/* /*
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* PI.NDST: pi_post_block is the one expected to change PID.NDST and the * full update can be skipped as neither the vector nor the destination
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that * needs to be changed.
* matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
* correctly.
*/ */
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
pi_clear_sn(pi_desc); /*
goto after_clear_sn; * Clear SN if it was set due to being preempted. Again, do
* this even if there is no assigned device for simplicity.
*/
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
}
local_irq_save(flags);
/*
* If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
* list of the _previous_ pCPU, which will not be the same as the
* current pCPU if the task was migrated.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
list_del(&vmx->pi_wakeup_list);
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
} }
/* The full case. Set the new destination and clear SN. */
dest = cpu_physical_id(cpu); dest = cpu_physical_id(cpu);
if (!x2apic_mode) if (!x2apic_mode)
dest = (dest << 8) & 0xFF00; dest = (dest << 8) & 0xFF00;
@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
do { do {
old.control = new.control = READ_ONCE(pi_desc->control); old.control = new.control = READ_ONCE(pi_desc->control);
/*
* Clear SN (as above) and refresh the destination APIC ID to
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest; new.ndst = dest;
new.sn = 0; new.sn = 0;
/*
* Restore the notification vector; in the blocking case, the
* descriptor was modified on "put" to use the wakeup vector.
*/
new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control)); } while (pi_try_set_control(pi_desc, old.control, new.control));
local_irq_restore(flags);
after_clear_sn: after_clear_sn:
/* /*
@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
irq_remapping_cap(IRQ_POSTING_CAP); irq_remapping_cap(IRQ_POSTING_CAP);
} }
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
if (vcpu->preempted)
pi_set_sn(pi_desc);
}
static void __pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
unsigned int dest;
/*
* Remove the vCPU from the wakeup list of the _previous_ pCPU, which
* will not be the same as the current pCPU if the task was migrated.
*/
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
list_del(&vcpu->blocked_vcpu_list);
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
dest = cpu_physical_id(vcpu->cpu);
if (!x2apic_mode)
dest = (dest << 8) & 0xFF00;
WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
"Wakeup handler not enabled while the vCPU was blocking");
do {
old.control = new.control = READ_ONCE(pi_desc->control);
new.ndst = dest;
/* set 'NV' to 'notification vector' */
new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
vcpu->pre_pcpu = -1;
}
/* /*
* This routine does the following things for vCPU which is going * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
* to be blocked if VT-d PI is enabled. * WAKEUP as the notification vector in the PI descriptor.
* - Store the vCPU to the wakeup list, so when interrupts happen
* we can find the right vCPU to wake up.
* - Change the Posted-interrupt descriptor as below:
* 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
* - If 'ON' is set during this process, which means at least one
* interrupt is posted for this vCPU, we cannot block it, in
* this case, return 1, otherwise, return 0.
*
*/ */
int pi_pre_block(struct kvm_vcpu *vcpu) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{ {
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
unsigned long flags; unsigned long flags;
if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
vmx_interrupt_blocked(vcpu))
return 0;
local_irq_save(flags); local_irq_save(flags);
vcpu->pre_pcpu = vcpu->cpu; raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); list_add_tail(&vmx->pi_wakeup_list,
list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
&per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
WARN(pi_desc->sn == 1, WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
"Posted Interrupt Suppress Notification set before blocking");
do { do {
old.control = new.control = READ_ONCE(pi_desc->control); old.control = new.control = READ_ONCE(pi_desc->control);
@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
new.nv = POSTED_INTR_WAKEUP_VECTOR; new.nv = POSTED_INTR_WAKEUP_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control)); } while (pi_try_set_control(pi_desc, old.control, new.control));
/* We should not block the vCPU if an interrupt is posted for it. */ /*
if (pi_test_on(pi_desc)) * Send a wakeup IPI to this CPU if an interrupt may have been posted
__pi_post_block(vcpu); * before the notification vector was updated, in which case the IRQ
* will arrive on the non-wakeup vector. An IPI is needed as calling
* try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
* enabled until it is safe to call try_to_wake_up() on the task being
* scheduled out).
*/
if (pi_test_on(&new))
apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
local_irq_restore(flags); local_irq_restore(flags);
return (vcpu->pre_pcpu == -1);
} }
void pi_post_block(struct kvm_vcpu *vcpu) void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{ {
unsigned long flags; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (vcpu->pre_pcpu == -1) if (!vmx_can_use_vtd_pi(vcpu->kvm))
return; return;
local_irq_save(flags); if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
__pi_post_block(vcpu); pi_enable_wakeup_handler(vcpu);
local_irq_restore(flags);
/*
* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
* as blocking and preempted, e.g. if it's preempted between setting
* its wait state and manually scheduling out.
*/
if (vcpu->preempted)
pi_set_sn(pi_desc);
} }
/* /*
@@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/ */
void pi_wakeup_handler(void) void pi_wakeup_handler(void)
{ {
struct kvm_vcpu *vcpu;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct vcpu_vmx *vmx;
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
blocked_vcpu_list) { pi_wakeup_list) {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (pi_test_on(pi_desc)) if (pi_test_on(&vmx->pi_desc))
kvm_vcpu_kick(vcpu); kvm_vcpu_wake_up(&vmx->vcpu);
} }
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
} }
void __init pi_init_cpu(int cpu) void __init pi_init_cpu(int cpu)
{ {
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
} }
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* Bail out of the block loop if the VM has an assigned * Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the * device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device * PI.NV to the wakeup vector, i.e. the assigned device
* came along after the initial check in pi_pre_block(). * came along after the initial check in vmx_vcpu_pi_put().
*/ */
void vmx_pi_start_assignment(struct kvm *kvm) void vmx_pi_start_assignment(struct kvm *kvm)
{ {


@@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control); (unsigned long *)&pi_desc->control);
} }
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{ {
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
int pi_pre_block(struct kvm_vcpu *vcpu);
void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void); void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu); void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);


@@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu); pt_update_intercept_for_msr(vcpu);
} }
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested) int pi_vec)
{ {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
if (vcpu->mode == IN_GUEST_MODE) { if (vcpu->mode == IN_GUEST_MODE) {
/* /*
* The vector of interrupt to be delivered to vcpu had * The vector of interrupt to be delivered to vcpu had
@@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/ */
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return true; return;
} }
#endif #endif
return false; /*
* The vCPU isn't in the guest; wake the vCPU in case it is blocking,
* otherwise do nothing as KVM will grab the highest priority pending
* IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
*/
kvm_vcpu_wake_up(vcpu);
} }
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
smp_mb__after_atomic(); smp_mb__after_atomic();
/* the PIR and ON have been set by L1. */ /* the PIR and ON have been set by L1. */
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
kvm_vcpu_kick(vcpu);
return 0; return 0;
} }
return -1; return -1;
@@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
* guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
* posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
*/ */
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
kvm_vcpu_kick(vcpu);
return 0; return 0;
} }
@@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1; return 1;
} }
static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending;
}
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0)) if (!kvm_emulate_instruction(vcpu, 0))
return 0; return 0;
if (vmx->emulation_required && !vmx->rmode.vm86_active && if (vmx_emulation_required_with_pending_exception(vcpu)) {
vcpu->arch.exception.pending) {
kvm_prepare_emulation_failure_exit(vcpu); kvm_prepare_emulation_failure_exit(vcpu);
return 0; return 0;
} }
@@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1; return 1;
} }
static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
return 1;
}
static void grow_ple_window(struct kvm_vcpu *vcpu) static void grow_ple_window(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu); vmx = to_vmx(vcpu);
INIT_LIST_HEAD(&vmx->pi_wakeup_list);
err = -ENOMEM; err = -ENOMEM;
vmx->vpid = allocate_vpid(); vmx->vpid = allocate_vpid();
@@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
} }
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
if (pi_pre_block(vcpu))
return 1;
if (kvm_lapic_hv_timer_in_use(vcpu))
kvm_lapic_switch_to_sw_timer(vcpu);
return 0;
}
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
if (kvm_x86_ops.set_hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
pi_post_block(vcpu);
}
static void vmx_setup_mce(struct kvm_vcpu *vcpu) static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{ {
if (vcpu->arch.mcg_cap & MCG_LMCE_P) if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_gva = vmx_flush_tlb_gva,
.tlb_flush_guest = vmx_flush_tlb_guest, .tlb_flush_guest = vmx_flush_tlb_guest,
.vcpu_pre_run = vmx_vcpu_pre_run,
.run = vmx_vcpu_run, .run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit, .handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction, .skip_emulated_instruction = vmx_skip_emulated_instruction,
@@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM, .cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
.pmu_ops = &intel_pmu_ops, .pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops, .nested_ops = &vmx_nested_ops,


@@ -317,6 +317,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */ /* Posted interrupt descriptor */
struct pi_desc pi_desc; struct pi_desc pi_desc;
/* Used if this vCPU is waiting for PI notification wakeup. */
struct list_head pi_wakeup_list;
/* Support for a guest hypervisor (nested VMX) */ /* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested; struct nested_vmx nested;


@@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1; int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
/* /*
* Restoring the host value for MSRs that are only consumed when running in * Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -5230,17 +5235,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid; struct kvm_cpuid cpuid;
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly, so fail.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out; goto out;
@@ -5251,14 +5245,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid; struct kvm_cpuid2 cpuid;
/*
* KVM_SET_CPUID{,2} after KVM_RUN is forbidden, see the comment in
* KVM_SET_CPUID case above.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT; r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out; goto out;
@@ -9945,10 +9931,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_mb__after_srcu_read_unlock(); smp_mb__after_srcu_read_unlock();
/* /*
* This handles the case where a posted interrupt was * Process pending posted interrupts to handle the case where the
* notified with kvm_vcpu_kick. Assigned devices can * notification IRQ arrived in the host, or was never sent (because the
* use the POSTED_INTR_VECTOR even if APICv is disabled, * target vCPU wasn't running). Do this regardless of the vCPU's APICv
* so do it even if APICv is disabled on this vCPU. * status, KVM doesn't update assigned devices when APICv is inhibited,
* i.e. they can post interrupts even if APICv is temporarily disabled.
*/ */
if (kvm_lapic_enabled(vcpu)) if (kvm_lapic_enabled(vcpu))
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
@@ -10113,8 +10100,20 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{ {
if (!kvm_arch_vcpu_runnable(vcpu) && bool hv_timer;
(!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
if (!kvm_arch_vcpu_runnable(vcpu)) {
/*
* Switch to the software timer before halt-polling/blocking as
* the guest's timer may be a break event for the vCPU, and the
* hypervisor timer runs only when the CPU is in guest mode.
* Switch before halt-polling so that KVM recognizes an expired
* timer before blocking.
*/
hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
if (hv_timer)
kvm_lapic_switch_to_sw_timer(vcpu);
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu); kvm_vcpu_halt(vcpu);
@@ -10122,8 +10121,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
kvm_vcpu_block(vcpu); kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block) if (hv_timer)
static_call(kvm_x86_post_block)(vcpu); kvm_lapic_switch_to_hv_timer(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1; return 1;
@@ -10316,6 +10315,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = -EINTR; r = -EINTR;
goto out; goto out;
} }
/*
* It should be impossible for the hypervisor timer to be in
* use before KVM has ever run the vCPU.
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
kvm_vcpu_block(vcpu); kvm_vcpu_block(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) { if (kvm_apic_accept_events(vcpu) < 0) {
r = 0; r = 0;
@@ -10360,10 +10364,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
} else } else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
if (kvm_run->immediate_exit) if (kvm_run->immediate_exit) {
r = -EINTR; r = -EINTR;
else goto out;
r = vcpu_run(vcpu); }
r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
if (r <= 0)
goto out;
r = vcpu_run(vcpu);
out: out:
kvm_put_guest_fpu(vcpu); kvm_put_guest_fpu(vcpu);


@@ -336,6 +336,7 @@ extern u64 host_xcr0;
extern u64 supported_xcr0; extern u64 supported_xcr0;
extern u64 host_xss; extern u64 host_xss;
extern u64 supported_xss; extern u64 supported_xss;
extern bool enable_pmu;
static inline bool kvm_mpx_supported(void) static inline bool kvm_mpx_supported(void)
{ {


@@ -309,9 +309,6 @@ struct kvm_vcpu {
u64 requests; u64 requests;
unsigned long guest_debug; unsigned long guest_debug;
int pre_pcpu;
struct list_head blocked_vcpu_list;
struct mutex mutex; struct mutex mutex;
struct kvm_run *run; struct kvm_run *run;


@@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
#define KVM_CAP_ARM_MTE 205 #define KVM_CAP_ARM_MTE 205
#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
#define KVM_CAP_XSAVE2 207 #define KVM_CAP_VM_GPA_BITS 207
#define KVM_CAP_XSAVE2 208
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
@@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
__u32 sint; __u32 sint;
}; };
struct kvm_irq_routing_xen_evtchn {
__u32 port;
__u32 vcpu;
__u32 priority;
};
#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
/* gsi routing entry types */ /* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_MSI 2
#define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
#define KVM_IRQ_ROUTING_HV_SINT 4 #define KVM_IRQ_ROUTING_HV_SINT 4
#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
struct kvm_irq_routing_entry { struct kvm_irq_routing_entry {
__u32 gsi; __u32 gsi;
@@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry {
struct kvm_irq_routing_msi msi; struct kvm_irq_routing_msi msi;
struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_s390_adapter adapter;
struct kvm_irq_routing_hv_sint hv_sint; struct kvm_irq_routing_hv_sint hv_sint;
struct kvm_irq_routing_xen_evtchn xen_evtchn;
__u32 pad[8]; __u32 pad[8];
} u; } u;
}; };
@@ -1209,6 +1220,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
struct kvm_xen_hvm_config { struct kvm_xen_hvm_config {
__u32 flags; __u32 flags;
@@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_XSAVE */ /* Available with KVM_CAP_XSAVE */
#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
/* Available with KVM_CAP_XCRS */ /* Available with KVM_CAP_XCRS */
#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
@@ -1613,6 +1623,9 @@ struct kvm_enc_region {
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
struct kvm_s390_pv_sec_parm { struct kvm_s390_pv_sec_parm {
__u64 origin; __u64 origin;
__u64 length; __u64 length;


@@ -8,11 +8,12 @@
/s390x/memop /s390x/memop
/s390x/resets /s390x/resets
/s390x/sync_regs_test /s390x/sync_regs_test
/x86_64/amx_test
/x86_64/cpuid_test
/x86_64/cr4_cpuid_sync_test /x86_64/cr4_cpuid_sync_test
/x86_64/debug_regs /x86_64/debug_regs
/x86_64/evmcs_test /x86_64/evmcs_test
/x86_64/emulator_error_test /x86_64/emulator_error_test
/x86_64/get_cpuid_test
/x86_64/get_msr_index_features /x86_64/get_msr_index_features
/x86_64/kvm_clock_test /x86_64/kvm_clock_test
/x86_64/kvm_pv_test /x86_64/kvm_pv_test
@@ -22,6 +23,7 @@
/x86_64/mmio_warning_test /x86_64/mmio_warning_test
/x86_64/mmu_role_test /x86_64/mmu_role_test
/x86_64/platform_info_test /x86_64/platform_info_test
/x86_64/pmu_event_filter_test
/x86_64/set_boot_cpu_id /x86_64/set_boot_cpu_id
/x86_64/set_sregs_test /x86_64/set_sregs_test
/x86_64/sev_migrate_tests /x86_64/sev_migrate_tests
@@ -36,6 +38,7 @@
/x86_64/vmx_apic_access_test /x86_64/vmx_apic_access_test
/x86_64/vmx_close_while_nested_test /x86_64/vmx_close_while_nested_test
/x86_64/vmx_dirty_log_test /x86_64/vmx_dirty_log_test
/x86_64/vmx_exception_with_invalid_guest_state
/x86_64/vmx_invalid_nested_guest_state /x86_64/vmx_invalid_nested_guest_state
/x86_64/vmx_preemption_timer_test /x86_64/vmx_preemption_timer_test
/x86_64/vmx_set_nested_state_test /x86_64/vmx_set_nested_state_test


@@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler
LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test
@@ -69,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test


@@ -364,6 +364,24 @@ static inline unsigned long get_xmm(int n)
} }
bool is_intel_cpu(void); bool is_intel_cpu(void);
bool is_amd_cpu(void);
static inline unsigned int x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
static inline unsigned int x86_model(unsigned int eax)
{
return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
}
struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
@@ -375,6 +393,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index);
struct kvm_cpuid2 *kvm_get_supported_cpuid(void); struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid);
void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid); struct kvm_cpuid2 *cpuid);
@@ -418,6 +438,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr);
void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
uint64_t pte); uint64_t pte);
/*
* get_cpuid() - find matching CPUID entry and return pointer to it.
*/
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index);
/* /*
* set_cpuid() - overwrites a matching cpuid entry with the provided value. * set_cpuid() - overwrites a matching cpuid entry with the provided value.
* matches based on ent->function && ent->index. returns true * matches based on ent->function && ent->index. returns true


@@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
struct kvm_vm *vm; struct kvm_vm *vm;
int i; int i;
#ifdef __x86_64__
/* /*
* Permission needs to be requested before KVM_SET_CPUID2. * Permission needs to be requested before KVM_SET_CPUID2.
*/ */
vm_xsave_req_perm(); vm_xsave_req_perm();
#endif
/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
@@ -497,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
uint64_t first_page, uint32_t num_pages) uint64_t first_page, uint32_t num_pages)
{ {
struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, struct kvm_clear_dirty_log args = {
.first_page = first_page, .dirty_bitmap = log, .slot = slot,
.num_pages = num_pages }; .first_page = first_page,
.num_pages = num_pages
};
int ret; int ret;
ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);


@@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
return entry; return entry;
} }
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
}
/* /*
* VM VCPU CPUID Set * VM VCPU CPUID Set
* *
@@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
void vcpu_set_cpuid(struct kvm_vm *vm, void vcpu_set_cpuid(struct kvm_vm *vm,
uint32_t vcpuid, struct kvm_cpuid2 *cpuid) uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
{ {
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
int rc; int rc;
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
rc, errno); rc, errno);
@@ -1136,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
list->nmsrs = nmsrs; list->nmsrs = nmsrs;
r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
r); r);
state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
r); r);
r = vcpu_save_xsave_state(vm, vcpu, state); r = vcpu_save_xsave_state(vm, vcpu, state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
r); r);
if (kvm_check_cap(KVM_CAP_XCRS)) { if (kvm_check_cap(KVM_CAP_XCRS)) {
r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
@@ -1163,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
} }
r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
r); r);
if (nested_size) { if (nested_size) {
state->nested.size = sizeof(state->nested_); state->nested.size = sizeof(state->nested_);
r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
r); r);
TEST_ASSERT(state->nested.size <= nested_size, TEST_ASSERT(state->nested.size <= nested_size,
"Nested state size too big, %i (KVM_CHECK_CAP gave %i)", "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
state->nested.size, nested_size); state->nested.size, nested_size);
} else } else
state->nested.size = 0; state->nested.size = 0;
@@ -1181,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
for (i = 0; i < nmsrs; i++) for (i = 0; i < nmsrs; i++)
state->msrs.entries[i].index = list->indices[i]; state->msrs.entries[i].index = list->indices[i];
r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
r, r == nmsrs ? -1 : list->indices[r]); r, r == nmsrs ? -1 : list->indices[r]);
r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
r); r);
free(list); free(list);
return state; return state;
@@ -1199,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
TEST_ASSERT(r == state->msrs.nmsrs, TEST_ASSERT(r == state->msrs.nmsrs,
@@ -1214,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
r); r);
r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
r); r);
if (state->nested.size) { if (state->nested.size) {
r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
r); r);
} }
} }
@@ -1245,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
free(state); free(state);
} }
bool is_intel_cpu(void) static bool cpu_vendor_string_is(const char *vendor)
{ {
const uint32_t *chunk = (const uint32_t *)vendor;
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
const uint32_t *chunk;
const int leaf = 0; const int leaf = 0;
__asm__ __volatile__( __asm__ __volatile__(
@@ -1257,10 +1265,22 @@ bool is_intel_cpu(void)
"=c"(ecx), "=d"(edx) "=c"(ecx), "=d"(edx)
: /* input */ "0"(leaf), "2"(0)); : /* input */ "0"(leaf), "2"(0));
chunk = (const uint32_t *)("GenuineIntel");
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
} }
bool is_intel_cpu(void)
{
return cpu_vendor_string_is("GenuineIntel");
}
/*
* Exclude early K5 samples with a vendor string of "AMDisbetter!"
*/
bool is_amd_cpu(void)
{
return cpu_vendor_string_is("AuthenticAMD");
}
uint32_t kvm_get_cpuid_max_basic(void) uint32_t kvm_get_cpuid_max_basic(void)
{ {
return kvm_get_supported_cpuid_entry(0)->eax; return kvm_get_supported_cpuid_entry(0)->eax;
@@ -1384,6 +1404,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
} }
} }
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index)
{
int i;
for (i = 0; i < cpuid->nent; i++) {
struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
if (cur->function == function && cur->index == index)
return cur;
}
TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
return NULL;
}
bool set_cpuid(struct kvm_cpuid2 *cpuid, bool set_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 *ent) struct kvm_cpuid_entry2 *ent)
{ {
@@ -1479,22 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
return cpuid; return cpuid;
} }
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
static inline unsigned x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
unsigned long vm_compute_max_gfn(struct kvm_vm *vm) unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{ {
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
@@ -1504,11 +1525,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
/* Avoid reserved HyperTransport region on AMD processors. */ /* Avoid reserved HyperTransport region on AMD processors. */
eax = ecx = 0; if (!is_amd_cpu())
cpuid(&eax, &ebx, &ecx, &edx);
if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
return max_gfn; return max_gfn;
/* On parts with <40 physical address bits, the area is fully hidden */ /* On parts with <40 physical address bits, the area is fully hidden */
@@ -1518,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
/* Before family 17h, the HyperTransport area is just below 1T. */ /* Before family 17h, the HyperTransport area is just below 1T. */
ht_gfn = (1 << 28) - num_ht_pages; ht_gfn = (1 << 28) - num_ht_pages;
eax = 1; eax = 1;
ecx = 0;
cpuid(&eax, &ebx, &ecx, &edx); cpuid(&eax, &ebx, &ecx, &edx);
if (x86_family(eax) < 0x17) if (x86_family(eax) < 0x17)
goto done; goto done;


@@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
return guest_cpuids; return guest_cpuids;
} }
static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid)
{
struct kvm_cpuid_entry2 *ent;
int rc;
u32 eax, ebx, x;
/* Setting unmodified CPUID is allowed */
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
/* Changing CPU features is forbidden */
ent = get_cpuid(cpuid, 0x7, 0);
ebx = ent->ebx;
ent->ebx--;
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(rc, "Changing CPU features should fail");
ent->ebx = ebx;
/* Changing MAXPHYADDR is forbidden */
ent = get_cpuid(cpuid, 0x80000008, 0);
eax = ent->eax;
x = eax & 0xff;
ent->eax = (eax & ~0xffu) | (x - 1);
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
ent->eax = eax;
}
int main(void) int main(void)
{ {
struct kvm_cpuid2 *supp_cpuid, *cpuid2; struct kvm_cpuid2 *supp_cpuid, *cpuid2;
@@ -175,5 +203,7 @@ int main(void)
for (stage = 0; stage < 3; stage++) for (stage = 0; stage < 3; stage++)
run_vcpu(vm, VCPU_ID, stage); run_vcpu(vm, VCPU_ID, stage);
set_cpuid_after_run(vm, cpuid2);
kvm_vm_free(vm); kvm_vm_free(vm);
} }


@@ -0,0 +1,434 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Test for x86 KVM_SET_PMU_EVENT_FILTER.
*
* Copyright (C) 2022, Google LLC.
*
* This work is licensed under the terms of the GNU GPL, version 2.
*
* Verifies the expected behavior of allow lists and deny lists for
* virtual PMU events.
*/
#define _GNU_SOURCE /* for program_invocation_short_name */
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
/*
* In lieu of copying perf_event.h into tools...
*/
#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
union cpuid10_eax {
struct {
unsigned int version_id:8;
unsigned int num_counters:8;
unsigned int bit_width:8;
unsigned int mask_length:8;
} split;
unsigned int full;
};
union cpuid10_ebx {
struct {
unsigned int no_unhalted_core_cycles:1;
unsigned int no_instructions_retired:1;
unsigned int no_unhalted_reference_cycles:1;
unsigned int no_llc_reference:1;
unsigned int no_llc_misses:1;
unsigned int no_branch_instruction_retired:1;
unsigned int no_branch_misses_retired:1;
} split;
unsigned int full;
};
/* End of stuff taken from perf_event.h. */
/* Oddly, this isn't in perf_event.h. */
#define ARCH_PERFMON_BRANCHES_RETIRED 5
#define VCPU_ID 0
#define NUM_BRANCHES 42
/*
* This is how the event selector and unit mask are stored in an AMD
* core performance event-select register. Intel's format is similar,
* but the event selector is only 8 bits.
*/
#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
(umask & 0xff) << 8)
/*
* "Branch instructions retired", from the Intel SDM, volume 3,
* "Pre-defined Architectural Performance Events."
*/
#define INTEL_BR_RETIRED EVENT(0xc4, 0)
/*
* "Retired branch instructions", from Processor Programming Reference
* (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
* Preliminary Processor Programming Reference (PPR) for AMD Family
* 17h Model 31h, Revision B0 Processors, and Preliminary Processor
* Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
* B1 Processors Volume 1 of 2.
*/
#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
/*
* This event list comprises Intel's eight architectural events plus
* AMD's "retired branch instructions" for Zen[123] (and possibly
* other AMD CPUs).
*/
static const uint64_t event_list[] = {
EVENT(0x3c, 0),
EVENT(0xc0, 0),
EVENT(0x3c, 1),
EVENT(0x2e, 0x4f),
EVENT(0x2e, 0x41),
EVENT(0xc4, 0),
EVENT(0xc5, 0),
EVENT(0xa4, 1),
AMD_ZEN_BR_RETIRED,
};
/*
* If we encounter a #GP during the guest PMU sanity check, then the guest
* PMU is not functional. Inform the hypervisor via GUEST_SYNC(0).
*/
static void guest_gp_handler(struct ex_regs *regs)
{
GUEST_SYNC(0);
}
/*
* Check that we can write a new value to the given MSR and read it back.
* The caller should provide a non-empty set of bits that are safe to flip.
*
* Return on success. GUEST_SYNC(0) on error.
*/
static void check_msr(uint32_t msr, uint64_t bits_to_flip)
{
uint64_t v = rdmsr(msr) ^ bits_to_flip;
wrmsr(msr, v);
if (rdmsr(msr) != v)
GUEST_SYNC(0);
v ^= bits_to_flip;
wrmsr(msr, v);
if (rdmsr(msr) != v)
GUEST_SYNC(0);
}
static void intel_guest_code(void)
{
check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
check_msr(MSR_P6_EVNTSEL0, 0xffff);
check_msr(MSR_IA32_PMC0, 0xffff);
GUEST_SYNC(1);
for (;;) {
uint64_t br0, br1;
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
br0 = rdmsr(MSR_IA32_PMC0);
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
br1 = rdmsr(MSR_IA32_PMC0);
GUEST_SYNC(br1 - br0);
}
}
/*
* To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
* this code uses the always-available, legacy K7 PMU MSRs, which alias to
* the first four of the six extended core PMU MSRs.
*/
static void amd_guest_code(void)
{
check_msr(MSR_K7_EVNTSEL0, 0xffff);
check_msr(MSR_K7_PERFCTR0, 0xffff);
GUEST_SYNC(1);
for (;;) {
uint64_t br0, br1;
wrmsr(MSR_K7_EVNTSEL0, 0);
wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
br0 = rdmsr(MSR_K7_PERFCTR0);
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
br1 = rdmsr(MSR_K7_PERFCTR0);
GUEST_SYNC(br1 - br0);
}
}
/*
* Run the VM to the next GUEST_SYNC(value), and return the value passed
* to the sync. Any other exit from the guest is fatal.
*/
static uint64_t run_vm_to_sync(struct kvm_vm *vm)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
struct ucall uc;
vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
"Exit_reason other than KVM_EXIT_IO: %u (%s)\n",
run->exit_reason,
exit_reason_str(run->exit_reason));
get_ucall(vm, VCPU_ID, &uc);
TEST_ASSERT(uc.cmd == UCALL_SYNC,
"Received ucall other than UCALL_SYNC: %lu", uc.cmd);
return uc.args[1];
}
/*
* In a nested environment or if the vPMU is disabled, the guest PMU
* might not work as architected (accessing the PMU MSRs may raise
* #GP, or writes could simply be discarded). In those situations,
* there is no point in running these tests. The guest code will perform
* a sanity check and then GUEST_SYNC(success). In the case of failure,
* the behavior of the guest on resumption is undefined.
*/
static bool sanity_check_pmu(struct kvm_vm *vm)
{
bool success;
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
success = run_vm_to_sync(vm);
vm_install_exception_handler(vm, GP_VECTOR, NULL);
return success;
}
static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents)
{
struct kvm_pmu_event_filter *f;
int size = sizeof(*f) + nevents * sizeof(f->events[0]);
f = malloc(size);
TEST_ASSERT(f, "Out of memory");
memset(f, 0, size);
f->nevents = nevents;
return f;
}
static struct kvm_pmu_event_filter *event_filter(uint32_t action)
{
struct kvm_pmu_event_filter *f;
int i;
f = make_pmu_event_filter(ARRAY_SIZE(event_list));
f->action = action;
for (i = 0; i < ARRAY_SIZE(event_list); i++)
f->events[i] = event_list[i];
return f;
}
/*
* Remove the first occurrence of 'event' (if any) from the filter's
* event list.
*/
static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f,
uint64_t event)
{
bool found = false;
int i;
for (i = 0; i < f->nevents; i++) {
if (found)
f->events[i - 1] = f->events[i];
else
found = f->events[i] == event;
}
if (found)
f->nevents--;
return f;
}
static void test_without_filter(struct kvm_vm *vm)
{
uint64_t count = run_vm_to_sync(vm);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static uint64_t test_with_filter(struct kvm_vm *vm,
struct kvm_pmu_event_filter *f)
{
vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f);
return run_vm_to_sync(vm);
}
static void test_member_deny_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
uint64_t count = test_with_filter(vm, f);
free(f);
if (count)
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
__func__, count);
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
}
static void test_member_allow_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
uint64_t count = test_with_filter(vm, f);
free(f);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static void test_not_member_deny_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
uint64_t count;
remove_event(f, INTEL_BR_RETIRED);
remove_event(f, AMD_ZEN_BR_RETIRED);
count = test_with_filter(vm, f);
free(f);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static void test_not_member_allow_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
uint64_t count;
remove_event(f, INTEL_BR_RETIRED);
remove_event(f, AMD_ZEN_BR_RETIRED);
count = test_with_filter(vm, f);
free(f);
if (count)
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
__func__, count);
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
}
/*
* Check for a non-zero PMU version, at least one general-purpose
* counter per logical processor, an EBX bit vector of length greater
* than 5, and EBX[5] clear.
*/
static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
{
union cpuid10_eax eax = { .full = entry->eax };
union cpuid10_ebx ebx = { .full = entry->ebx };
return eax.split.version_id && eax.split.num_counters > 0 &&
eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
!ebx.split.no_branch_instruction_retired;
}
/*
* Note that CPUID leaf 0xa is Intel-specific. This leaf should be
* clear on AMD hardware.
*/
static bool use_intel_pmu(void)
{
struct kvm_cpuid_entry2 *entry;
entry = kvm_get_supported_cpuid_index(0xa, 0);
return is_intel_cpu() && entry && check_intel_pmu_leaf(entry);
}
static bool is_zen1(uint32_t eax)
{
return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f;
}
static bool is_zen2(uint32_t eax)
{
return x86_family(eax) == 0x17 &&
x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f;
}
static bool is_zen3(uint32_t eax)
{
return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f;
}
/*
* Determining AMD support for a PMU event requires consulting the AMD
* PPR for the CPU or reference material derived therefrom. The AMD
* test code herein has been verified to work on Zen1, Zen2, and Zen3.
*
* Feel free to add more AMD CPUs that are documented to support event
* select 0xc2 umask 0 as "retired branch instructions."
*/
static bool use_amd_pmu(void)
{
struct kvm_cpuid_entry2 *entry;
entry = kvm_get_supported_cpuid_index(1, 0);
return is_amd_cpu() && entry &&
(is_zen1(entry->eax) ||
is_zen2(entry->eax) ||
is_zen3(entry->eax));
}
int main(int argc, char *argv[])
{
void (*guest_code)(void) = NULL;
struct kvm_vm *vm;
int r;
/* Tell stdout not to buffer its content */
setbuf(stdout, NULL);
r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER);
if (!r) {
print_skip("KVM_CAP_PMU_EVENT_FILTER not supported");
exit(KSFT_SKIP);
}
if (use_intel_pmu())
guest_code = intel_guest_code;
else if (use_amd_pmu())
guest_code = amd_guest_code;
if (!guest_code) {
print_skip("Don't know how to test this guest PMU");
exit(KSFT_SKIP);
}
vm = vm_create_default(VCPU_ID, 0, guest_code);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
if (!sanity_check_pmu(vm)) {
print_skip("Guest PMU is not functional");
exit(KSFT_SKIP);
}
test_without_filter(vm);
test_member_deny_list(vm);
test_member_allow_list(vm);
test_not_member_deny_list(vm);
test_not_member_allow_list(vm);
kvm_vm_free(vm);
return 0;
}


@@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
switch (get_ucall(vm, vcpuid, &uc)) { switch (get_ucall(vm, vcpuid, &uc)) {
case UCALL_SYNC: case UCALL_SYNC:
TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
stage + 1, (ulong)uc.args[1]); stage + 1, (ulong)uc.args[1]);
return; return;
case UCALL_DONE: case UCALL_DONE:
return; return;


@@ -30,8 +30,8 @@ static struct kvm_vm *vm;
static void l2_guest_code(void) static void l2_guest_code(void)
{ {
/* Exit to L0 */ /* Exit to L0 */
asm volatile("inb %%dx, %%al" asm volatile("inb %%dx, %%al"
: : [port] "d" (PORT_L0_EXIT) : "rax"); : : [port] "d" (PORT_L0_EXIT) : "rax");
} }
static void l1_guest_code(struct vmx_pages *vmx_pages) static void l1_guest_code(struct vmx_pages *vmx_pages)


@@ -0,0 +1,139 @@
// SPDX-License-Identifier: GPL-2.0-only
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include <signal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include "kselftest.h"
#define VCPU_ID 0
static struct kvm_vm *vm;
static void guest_ud_handler(struct ex_regs *regs)
{
/* Loop on the ud2 until guest state is made invalid. */
}
static void guest_code(void)
{
asm volatile("ud2");
}
static void __run_vcpu_with_invalid_state(void)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
"Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n",
run->exit_reason, exit_reason_str(run->exit_reason));
TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
"Expected emulation failure, got %d\n",
run->emulation_failure.suberror);
}
static void run_vcpu_with_invalid_state(void)
{
/*
* Always run twice to verify KVM handles the case where _KVM_ queues
* an exception with invalid state and then exits to userspace, i.e.
* that KVM doesn't explode if userspace ignores the initial error.
*/
__run_vcpu_with_invalid_state();
__run_vcpu_with_invalid_state();
}
static void set_timer(void)
{
struct itimerval timer;
timer.it_value.tv_sec = 0;
timer.it_value.tv_usec = 200;
timer.it_interval = timer.it_value;
ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
}
static void set_or_clear_invalid_guest_state(bool set)
{
static struct kvm_sregs sregs;
if (!sregs.cr0)
vcpu_sregs_get(vm, VCPU_ID, &sregs);
sregs.tr.unusable = !!set;
vcpu_sregs_set(vm, VCPU_ID, &sregs);
}
static void set_invalid_guest_state(void)
{
set_or_clear_invalid_guest_state(true);
}
static void clear_invalid_guest_state(void)
{
set_or_clear_invalid_guest_state(false);
}
static void sigalrm_handler(int sig)
{
struct kvm_vcpu_events events;
TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
vcpu_events_get(vm, VCPU_ID, &events);
/*
* If an exception is pending, attempt KVM_RUN with invalid guest,
* otherwise rearm the timer and keep doing so until the timer fires
* between KVM queueing an exception and re-entering the guest.
*/
if (events.exception.pending) {
set_invalid_guest_state();
run_vcpu_with_invalid_state();
} else {
set_timer();
}
}
int main(int argc, char *argv[])
{
if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) {
print_skip("Must be run with kvm_intel.unrestricted_guest=0");
exit(KSFT_SKIP);
}
vm = vm_create_default(VCPU_ID, 0, (void *)guest_code);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
/*
* Stuff invalid guest state for L2 by making TR unusable. The next
* KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
* emulating invalid guest state for L2.
*/
set_invalid_guest_state();
run_vcpu_with_invalid_state();
/*
* Verify KVM also handles the case where userspace gains control while
* an exception is pending and stuffs invalid state. Run with valid
* guest state and a timer firing every 200us, and attempt to enter the
* guest with invalid state when the handler interrupts KVM with an
* exception pending.
*/
clear_invalid_guest_state();
TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
"Failed to register SIGALRM handler, errno = %d (%s)",
errno, strerror(errno));
set_timer();
run_vcpu_with_invalid_state();
}


@@ -46,20 +46,20 @@ static struct kvm_vm *vm;
#define MIN_STEAL_TIME 50000 #define MIN_STEAL_TIME 50000
struct pvclock_vcpu_time_info { struct pvclock_vcpu_time_info {
u32 version; u32 version;
u32 pad0; u32 pad0;
u64 tsc_timestamp; u64 tsc_timestamp;
u64 system_time; u64 system_time;
u32 tsc_to_system_mul; u32 tsc_to_system_mul;
s8 tsc_shift; s8 tsc_shift;
u8 flags; u8 flags;
u8 pad[2]; u8 pad[2];
} __attribute__((__packed__)); /* 32 bytes */ } __attribute__((__packed__)); /* 32 bytes */
struct pvclock_wall_clock { struct pvclock_wall_clock {
u32 version; u32 version;
u32 sec; u32 sec;
u32 nsec; u32 nsec;
} __attribute__((__packed__)); } __attribute__((__packed__));
struct vcpu_runstate_info { struct vcpu_runstate_info {
@@ -74,11 +74,11 @@ struct arch_vcpu_info {
}; };
struct vcpu_info { struct vcpu_info {
uint8_t evtchn_upcall_pending; uint8_t evtchn_upcall_pending;
uint8_t evtchn_upcall_mask; uint8_t evtchn_upcall_mask;
unsigned long evtchn_pending_sel; unsigned long evtchn_pending_sel;
struct arch_vcpu_info arch; struct arch_vcpu_info arch;
struct pvclock_vcpu_time_info time; struct pvclock_vcpu_time_info time;
}; /* 64 bytes (x86) */ }; /* 64 bytes (x86) */
struct shared_info { struct shared_info {
@@ -493,7 +493,7 @@ int main(int argc, char *argv[])
vm_ts.tv_sec = wc->sec; vm_ts.tv_sec = wc->sec;
vm_ts.tv_nsec = wc->nsec; vm_ts.tv_nsec = wc->nsec;
TEST_ASSERT(wc->version && !(wc->version & 1), TEST_ASSERT(wc->version && !(wc->version & 1),
"Bad wallclock version %x", wc->version); "Bad wallclock version %x", wc->version);
TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");


@@ -427,9 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
#endif #endif
kvm_async_pf_vcpu_init(vcpu); kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1;
INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false; vcpu->preempted = false;
@@ -3163,8 +3160,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
{ {
struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
return; return;
#endif
if (memslot && kvm_slot_dirty_track_enabled(memslot)) { if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
unsigned long rel_gfn = gfn - memslot->base_gfn; unsigned long rel_gfn = gfn - memslot->base_gfn;