From 66200bbcde697dea7f48071d325ea4dec6f66b11 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 12 Apr 2022 19:49:00 -0700 Subject: [PATCH 01/28] Drivers: hv: vmbus: Add VMbus IMC device to unsupported list Hyper-V may offer an Initial Machine Configuration (IMC) synthetic device to guest VMs. The device may be used by Windows guests to get specialization information, such as the hostname. But the device is not used in Linux and there is no Linux driver, so it is unsupported. Currently, the IMC device GUID is not recognized by the VMbus driver, which results in an "Unknown GUID" error message during boot. Add the GUID to the list of known but unsupported devices so that the error message is not generated. Other than avoiding the error message, there is no change in guest behavior. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1649818140-100953-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 1 + include/linux/hyperv.h | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 85a2142c9384..50fea31f019f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -152,6 +152,7 @@ static const struct { { HV_AVMA1_GUID }, { HV_AVMA2_GUID }, { HV_RDV_GUID }, + { HV_IMC_GUID }, }; /* diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index fe2e0179ed51..2d085d951ee4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1451,12 +1451,14 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size); 0x80, 0x2e, 0x27, 0xed, 0xe1, 0x9f) /* - * Linux doesn't support the 3 devices: the first two are for - * Automatic Virtual Machine Activation, and the third is for - * Remote Desktop Virtualization. + * Linux doesn't support these 4 devices: the first two are for + * Automatic Virtual Machine Activation, the third is for + * Remote Desktop Virtualization, and the fourth is Initial + * Machine Configuration (IMC) used only by Windows guests. * {f8e65716-3cb3-4a06-9a60-1889c5cccab5} * {3375baf4-9e15-4b30-b765-67acb10d607b} * {276aacf4-ac15-426c-98dd-7521ad3f01fe} + * {c376c1c3-d276-48d2-90a9-c04748072c60} */ #define HV_AVMA1_GUID \ @@ -1471,6 +1473,10 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size); .guid = GUID_INIT(0x276aacf4, 0xac15, 0x426c, 0x98, 0xdd, \ 0x75, 0x21, 0xad, 0x3f, 0x01, 0xfe) +#define HV_IMC_GUID \ + .guid = GUID_INIT(0xc376c1c3, 0xd276, 0x48d2, 0x90, 0xa9, \ + 0xc0, 0x47, 0x48, 0x07, 0x2c, 0x60) + /* * Common header for Hyper-V ICs */ From 08e61e861a0e47e5e1a3fb78406afd6b0cea6b6d Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 13 Apr 2022 07:36:21 -0600 Subject: [PATCH 02/28] PCI: hv: Fix multi-MSI to allow more than one MSI vector If the allocation of multiple MSI vectors for multi-MSI fails in the core PCI framework, the framework will retry the allocation as a single MSI vector, assuming that meets the min_vecs specified by the requesting driver. Hyper-V advertises that multi-MSI is supported, but reuses the VECTOR domain to implement that for x86. The VECTOR domain does not support multi-MSI, so the alloc will always fail and fallback to a single MSI allocation. In short, Hyper-V advertises a capability it does not implement. Hyper-V can support multi-MSI because it coordinates with the hypervisor to map the MSIs in the IOMMU's interrupt remapper, which is something the VECTOR domain does not have. Therefore the fix is simple - copy what the x86 IOMMU drivers (AMD/Intel-IR) do by removing X86_IRQ_ALLOC_CONTIGUOUS_VECTORS after calling the VECTOR domain's pci_msi_prepare(). Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Jeffrey Hugo Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1649856981-14649-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index d270a204324e..1cbe24b92a38 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -614,7 +614,16 @@ static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *info) { - return pci_msi_prepare(domain, dev, nvec, info); + int ret = pci_msi_prepare(domain, dev, nvec, info); + + /* + * By using the interrupt remapper in the hypervisor IOMMU, contiguous + * CPU vectors is not needed for multi-MSI + */ + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + + return ret; } /** From 82cd4bacff88a11e36f143e2cb950174b09c86c3 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:20 +0200 Subject: [PATCH 03/28] Drivers: hv: vmbus: Fix handling of messages with transaction ID of zero vmbus_request_addr() returns 0 (zero) if the transaction ID passed to as argument is 0. This is unfortunate for two reasons: first, netvsc_send_completion() does not check for a NULL cmd_rqst (before dereferencing the corresponding NVSP message); second, 0 is a *valid* value of cmd_rqst in netvsc_send_tx_complete(), cf. the call of vmbus_sendpacket() in netvsc_send_pkt(). vmbus_request_addr() has included the code in question since its introduction with commit e8b7db38449ac ("Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening"); such code was motivated by the early use of vmbus_requestor by hv_storvsc. Since hv_storvsc moved to a tag-based mechanism to generate and retrieve transaction IDs with commit bf5fd8cae3c8f ("scsi: storvsc: Use blk_mq_unique_tag() to generate requestIDs"), vmbus_request_addr() can be modified to return VMBUS_RQST_ERROR if the ID is 0. This change solves the issues in hv_netvsc (and makes the handling of messages with transaction ID of 0 consistent with the semantics "the ID is not contained in the requestor/invalid ID"). vmbus_next_request_id(), vmbus_request_addr() should still reserve the ID of 0 for Hyper-V, because Hyper-V will "ignore" (not respond to) VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED packets/requests with transaction ID of 0 from the guest. Fixes: bf5fd8cae3c8f ("scsi: storvsc: Use blk_mq_unique_tag() to generate requestIDs") Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index dc5c35210c16..20fc8d50a039 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1245,7 +1245,9 @@ u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr) /* * Cannot return an ID of 0, which is reserved for an unsolicited - * message from Hyper-V. + * message from Hyper-V; Hyper-V does not acknowledge (respond to) + * VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED requests with ID of + * 0 sent by the guest. */ return current_id + 1; } @@ -1270,7 +1272,7 @@ u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id) /* Hyper-V can send an unsolicited message with ID of 0 */ if (!trans_id) - return trans_id; + return VMBUS_RQST_ERROR; spin_lock_irqsave(&rqstor->req_lock, flags); From de5ddb7d44347ad8b00533c1850a4e2e636a1ce9 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:21 +0200 Subject: [PATCH 04/28] PCI: hv: Use vmbus_requestor to generate transaction IDs for VMbus hardening Currently, pointers to guest memory are passed to Hyper-V as transaction IDs in hv_pci. In the face of errors or malicious behavior in Hyper-V, hv_pci should not expose or trust the transaction IDs returned by Hyper-V to be valid guest memory addresses. Instead, use small integers generated by vmbus_requestor as request (transaction) IDs. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 39 +++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 1cbe24b92a38..c18e9b608bd6 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -91,6 +91,13 @@ static enum pci_protocol_version_t pci_protocol_versions[] = { /* space for 32bit serial number as string */ #define SLOT_NAME_SIZE 11 +/* + * Size of requestor for VMbus; the value is based on the observation + * that having more than one request outstanding is 'rare', and so 64 + * should be generous in ensuring that we don't ever run out. + */ +#define HV_PCI_RQSTOR_SIZE 64 + /* * Message Types */ @@ -1529,7 +1536,7 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev, int_pkt->wslot.slot = hpdev->desc.win_slot.slot; int_pkt->int_desc = *int_desc; vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), - (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); + 0, VM_PKT_DATA_INBAND, 0); kfree(int_desc); } @@ -2661,7 +2668,7 @@ static void hv_eject_device_work(struct work_struct *work) ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, - sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt, + sizeof(*ejct_pkt), 0, VM_PKT_DATA_INBAND, 0); /* For the get_pcichild() in hv_pci_eject_device() */ @@ -2708,8 +2715,9 @@ static void hv_pci_onchannelcallback(void *context) const int packet_size = 0x100; int ret; struct hv_pcibus_device *hbus = context; + struct vmbus_channel *chan = hbus->hdev->channel; u32 bytes_recvd; - u64 req_id; + u64 req_id, req_addr; struct vmpacket_descriptor *desc; unsigned char *buffer; int bufferlen = packet_size; @@ -2727,8 +2735,8 @@ static void hv_pci_onchannelcallback(void *context) return; while (1) { - ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer, - bufferlen, &bytes_recvd, &req_id); + ret = vmbus_recvpacket_raw(chan, buffer, bufferlen, + &bytes_recvd, &req_id); if (ret == -ENOBUFS) { kfree(buffer); @@ -2755,11 +2763,14 @@ static void hv_pci_onchannelcallback(void *context) switch (desc->type) { case VM_PKT_COMP: - /* - * The host is trusted, and thus it's safe to interpret - * this transaction ID as a pointer. - */ - comp_packet = (struct pci_packet *)req_id; + req_addr = chan->request_addr_callback(chan, req_id); + if (req_addr == VMBUS_RQST_ERROR) { + dev_err(&hbus->hdev->device, + "Invalid transaction ID %llx\n", + req_id); + break; + } + comp_packet = (struct pci_packet *)req_addr; response = (struct pci_response *)buffer; comp_packet->completion_func(comp_packet->compl_ctxt, response, @@ -3440,6 +3451,10 @@ static int hv_pci_probe(struct hv_device *hdev, goto free_dom; } + hdev->channel->next_request_id_callback = vmbus_next_request_id; + hdev->channel->request_addr_callback = vmbus_request_addr; + hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; + ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, hv_pci_onchannelcallback, hbus); if (ret) @@ -3770,6 +3785,10 @@ static int hv_pci_resume(struct hv_device *hdev) hbus->state = hv_pcibus_init; + hdev->channel->next_request_id_callback = vmbus_next_request_id; + hdev->channel->request_addr_callback = vmbus_request_addr; + hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; + ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, hv_pci_onchannelcallback, hbus); if (ret) From b03afa57c65e1e045e02df49777e953742745f4c Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:22 +0200 Subject: [PATCH 05/28] Drivers: hv: vmbus: Introduce vmbus_sendpacket_getid() The function can be used to send a VMbus packet and retrieve the corresponding transaction ID. It will be used by hv_pci. No functional change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 38 ++++++++++++++++++++++++++++++++------ drivers/hv/hyperv_vmbus.h | 2 +- drivers/hv/ring_buffer.c | 14 +++++++++++--- include/linux/hyperv.h | 7 +++++++ 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 20fc8d50a039..585a8084848b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1022,11 +1022,13 @@ void vmbus_close(struct vmbus_channel *channel) EXPORT_SYMBOL_GPL(vmbus_close); /** - * vmbus_sendpacket() - Send the specified buffer on the given channel + * vmbus_sendpacket_getid() - Send the specified buffer on the given channel * @channel: Pointer to vmbus_channel structure * @buffer: Pointer to the buffer you want to send the data from. * @bufferlen: Maximum size of what the buffer holds. * @requestid: Identifier of the request + * @trans_id: Identifier of the transaction associated to this request, if + * the send is successful; undefined, otherwise. * @type: Type of packet that is being sent e.g. negotiate, time * packet etc. * @flags: 0 or VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED @@ -1036,8 +1038,8 @@ EXPORT_SYMBOL_GPL(vmbus_close); * * Mainly used by Hyper-V drivers. */ -int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, - u32 bufferlen, u64 requestid, +int vmbus_sendpacket_getid(struct vmbus_channel *channel, void *buffer, + u32 bufferlen, u64 requestid, u64 *trans_id, enum vmbus_packet_type type, u32 flags) { struct vmpacket_descriptor desc; @@ -1063,7 +1065,31 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, num_vecs, requestid); + return hv_ringbuffer_write(channel, bufferlist, num_vecs, requestid, trans_id); +} +EXPORT_SYMBOL(vmbus_sendpacket_getid); + +/** + * vmbus_sendpacket() - Send the specified buffer on the given channel + * @channel: Pointer to vmbus_channel structure + * @buffer: Pointer to the buffer you want to send the data from. + * @bufferlen: Maximum size of what the buffer holds. + * @requestid: Identifier of the request + * @type: Type of packet that is being sent e.g. negotiate, time + * packet etc. + * @flags: 0 or VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED + * + * Sends data in @buffer directly to Hyper-V via the vmbus. + * This will send the data unparsed to Hyper-V. + * + * Mainly used by Hyper-V drivers. + */ +int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, + u32 bufferlen, u64 requestid, + enum vmbus_packet_type type, u32 flags) +{ + return vmbus_sendpacket_getid(channel, buffer, bufferlen, + requestid, NULL, type, flags); } EXPORT_SYMBOL(vmbus_sendpacket); @@ -1122,7 +1148,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, 3, requestid); + return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); } EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); @@ -1160,7 +1186,7 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, 3, requestid); + return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); } EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 6b45c22bb717..4f5b824b16cf 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -181,7 +181,7 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); int hv_ringbuffer_write(struct vmbus_channel *channel, const struct kvec *kv_list, u32 kv_count, - u64 requestid); + u64 requestid, u64 *trans_id); int hv_ringbuffer_read(struct vmbus_channel *channel, void *buffer, u32 buflen, u32 *buffer_actual_len, diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 3d215d9dec43..e101b11f95e5 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -283,7 +283,7 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info) /* Write to the ring buffer. */ int hv_ringbuffer_write(struct vmbus_channel *channel, const struct kvec *kv_list, u32 kv_count, - u64 requestid) + u64 requestid, u64 *trans_id) { int i; u32 bytes_avail_towrite; @@ -294,7 +294,7 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, unsigned long flags; struct hv_ring_buffer_info *outring_info = &channel->outbound; struct vmpacket_descriptor *desc = kv_list[0].iov_base; - u64 rqst_id = VMBUS_NO_RQSTOR; + u64 __trans_id, rqst_id = VMBUS_NO_RQSTOR; if (channel->rescind) return -ENODEV; @@ -353,7 +353,15 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, } } desc = hv_get_ring_buffer(outring_info) + old_write; - desc->trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? requestid : rqst_id; + __trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? requestid : rqst_id; + /* + * Ensure the compiler doesn't generate code that reads the value of + * the transaction ID from the ring buffer, which is shared with the + * Hyper-V host and subject to being changed at any time. + */ + WRITE_ONCE(desc->trans_id, __trans_id); + if (trans_id) + *trans_id = __trans_id; /* Set previous packet start */ prev_indices = hv_get_ring_bufferindices(outring_info); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2d085d951ee4..7cb8718825ee 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1161,6 +1161,13 @@ extern int vmbus_open(struct vmbus_channel *channel, extern void vmbus_close(struct vmbus_channel *channel); +extern int vmbus_sendpacket_getid(struct vmbus_channel *channel, + void *buffer, + u32 bufferLen, + u64 requestid, + u64 *trans_id, + enum vmbus_packet_type type, + u32 flags); extern int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, u32 bufferLen, From 0aadb6a7bb811554cf39318b5d18e8ec50dd9f02 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:23 +0200 Subject: [PATCH 06/28] Drivers: hv: vmbus: Introduce vmbus_request_addr_match() The function can be used to retrieve and clear/remove a transation ID from a channel requestor, provided the memory address corresponding to the ID equals a specified address. The function, and its 'lockless' variant __vmbus_request_addr_match(), will be used by hv_pci. Refactor vmbus_request_addr() to reuse the 'newly' introduced code. No functional change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-5-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 67 ++++++++++++++++++++++++++++++------------ include/linux/hyperv.h | 5 ++++ 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 585a8084848b..49f10a603a09 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1279,17 +1279,11 @@ u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr) } EXPORT_SYMBOL_GPL(vmbus_next_request_id); -/* - * vmbus_request_addr - Returns the memory address stored at @trans_id - * in @rqstor. Uses a spin lock to avoid race conditions. - * @channel: Pointer to the VMbus channel struct - * @trans_id: Request id sent back from Hyper-V. Becomes the requestor's - * next request id. - */ -u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id) +/* As in vmbus_request_addr_match() but without the requestor lock */ +u64 __vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, + u64 rqst_addr) { struct vmbus_requestor *rqstor = &channel->requestor; - unsigned long flags; u64 req_addr; /* Check rqstor has been initialized */ @@ -1300,25 +1294,60 @@ u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id) if (!trans_id) return VMBUS_RQST_ERROR; - spin_lock_irqsave(&rqstor->req_lock, flags); - /* Data corresponding to trans_id is stored at trans_id - 1 */ trans_id--; /* Invalid trans_id */ - if (trans_id >= rqstor->size || !test_bit(trans_id, rqstor->req_bitmap)) { - spin_unlock_irqrestore(&rqstor->req_lock, flags); + if (trans_id >= rqstor->size || !test_bit(trans_id, rqstor->req_bitmap)) return VMBUS_RQST_ERROR; - } req_addr = rqstor->req_arr[trans_id]; - rqstor->req_arr[trans_id] = rqstor->next_request_id; - rqstor->next_request_id = trans_id; + if (rqst_addr == VMBUS_RQST_ADDR_ANY || req_addr == rqst_addr) { + rqstor->req_arr[trans_id] = rqstor->next_request_id; + rqstor->next_request_id = trans_id; - /* The already held spin lock provides atomicity */ - bitmap_clear(rqstor->req_bitmap, trans_id, 1); + /* The already held spin lock provides atomicity */ + bitmap_clear(rqstor->req_bitmap, trans_id, 1); + } - spin_unlock_irqrestore(&rqstor->req_lock, flags); return req_addr; } +EXPORT_SYMBOL_GPL(__vmbus_request_addr_match); + +/* + * vmbus_request_addr_match - Clears/removes @trans_id from the @channel's + * requestor, provided the memory address stored at @trans_id equals @rqst_addr + * (or provided @rqst_addr matches the sentinel value VMBUS_RQST_ADDR_ANY). + * + * Returns the memory address stored at @trans_id, or VMBUS_RQST_ERROR if + * @trans_id is not contained in the requestor. + * + * Acquires and releases the requestor spin lock. + */ +u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, + u64 rqst_addr) +{ + struct vmbus_requestor *rqstor = &channel->requestor; + unsigned long flags; + u64 req_addr; + + spin_lock_irqsave(&rqstor->req_lock, flags); + req_addr = __vmbus_request_addr_match(channel, trans_id, rqst_addr); + spin_unlock_irqrestore(&rqstor->req_lock, flags); + + return req_addr; +} +EXPORT_SYMBOL_GPL(vmbus_request_addr_match); + +/* + * vmbus_request_addr - Returns the memory address stored at @trans_id + * in @rqstor. Uses a spin lock to avoid race conditions. + * @channel: Pointer to the VMbus channel struct + * @trans_id: Request id sent back from Hyper-V. Becomes the requestor's + * next request id. + */ +u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id) +{ + return vmbus_request_addr_match(channel, trans_id, VMBUS_RQST_ADDR_ANY); +} EXPORT_SYMBOL_GPL(vmbus_request_addr); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 7cb8718825ee..2e4962a2fbec 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -788,6 +788,7 @@ struct vmbus_requestor { #define VMBUS_NO_RQSTOR U64_MAX #define VMBUS_RQST_ERROR (U64_MAX - 1) +#define VMBUS_RQST_ADDR_ANY U64_MAX /* NetVSC-specific */ #define VMBUS_RQST_ID_NO_RESPONSE (U64_MAX - 2) /* StorVSC-specific */ @@ -1042,6 +1043,10 @@ struct vmbus_channel { }; u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr); +u64 __vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, + u64 rqst_addr); +u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, + u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); static inline bool is_hvsock_channel(const struct vmbus_channel *c) From b91eaf7267cf7aec0a4e087decf7770dfb694d78 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:24 +0200 Subject: [PATCH 07/28] Drivers: hv: vmbus: Introduce {lock,unlock}_requestor() To abtract the lock and unlock operations on the requestor spin lock. The helpers will come in handy in hv_pci. No functional change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-6-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 11 +++++------ include/linux/hyperv.h | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 49f10a603a09..56f7e06c673e 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1252,12 +1252,12 @@ u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr) if (!channel->rqstor_size) return VMBUS_NO_RQSTOR; - spin_lock_irqsave(&rqstor->req_lock, flags); + lock_requestor(channel, flags); current_id = rqstor->next_request_id; /* Requestor array is full */ if (current_id >= rqstor->size) { - spin_unlock_irqrestore(&rqstor->req_lock, flags); + unlock_requestor(channel, flags); return VMBUS_RQST_ERROR; } @@ -1267,7 +1267,7 @@ u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr) /* The already held spin lock provides atomicity */ bitmap_set(rqstor->req_bitmap, current_id, 1); - spin_unlock_irqrestore(&rqstor->req_lock, flags); + unlock_requestor(channel, flags); /* * Cannot return an ID of 0, which is reserved for an unsolicited @@ -1327,13 +1327,12 @@ EXPORT_SYMBOL_GPL(__vmbus_request_addr_match); u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr) { - struct vmbus_requestor *rqstor = &channel->requestor; unsigned long flags; u64 req_addr; - spin_lock_irqsave(&rqstor->req_lock, flags); + lock_requestor(channel, flags); req_addr = __vmbus_request_addr_match(channel, trans_id, rqst_addr); - spin_unlock_irqrestore(&rqstor->req_lock, flags); + unlock_requestor(channel, flags); return req_addr; } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2e4962a2fbec..460a716f4748 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1042,6 +1042,21 @@ struct vmbus_channel { u32 max_pkt_size; }; +#define lock_requestor(channel, flags) \ +do { \ + struct vmbus_requestor *rqstor = &(channel)->requestor; \ + \ + spin_lock_irqsave(&rqstor->req_lock, flags); \ +} while (0) + +static __always_inline void unlock_requestor(struct vmbus_channel *channel, + unsigned long flags) +{ + struct vmbus_requestor *rqstor = &channel->requestor; + + spin_unlock_irqrestore(&rqstor->req_lock, flags); +} + u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr); u64 __vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); From a765ed47e45166451680ee9af2b9e435c82ec3ba Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:25 +0200 Subject: [PATCH 08/28] PCI: hv: Fix synchronization between channel callback and hv_compose_msi_msg() Dexuan wrote: "[...] when we disable AccelNet, the host PCI VSP driver sends a PCI_EJECT message first, and the channel callback may set hpdev->state to hv_pcichild_ejecting on a different CPU. This can cause hv_compose_msi_msg() to exit from the loop and 'return', and the on-stack variable 'ctxt' is invalid. Now, if the response message from the host arrives, the channel callback will try to access the invalid 'ctxt' variable, and this may cause a crash." Schematically: Hyper-V sends PCI_EJECT msg hv_pci_onchannelcallback() state = hv_pcichild_ejecting hv_compose_msi_msg() alloc and init comp_pkt state == hv_pcichild_ejecting Hyper-V sends VM_PKT_COMP msg hv_pci_onchannelcallback() retrieve address of comp_pkt 'free' comp_pkt and return comp_pkt->completion_func() Dexuan also showed how the crash can be triggered after introducing suitable delays in the driver code, thus validating the 'assumption' that the host can still normally respond to the guest's compose_msi request after the host has started to eject the PCI device. Fix the synchronization by leveraging the requestor lock as follows: - Before 'return'-ing in hv_compose_msi_msg(), remove the ID (while holding the requestor lock) associated to the completion packet. - Retrieve the address *and call ->completion_func() within a same (requestor) critical section in hv_pci_onchannelcallback(). Reported-by: Wei Hu Reported-by: Dexuan Cui Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-7-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 33 +++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index c18e9b608bd6..5800ecfcc517 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1707,7 +1707,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct pci_create_interrupt3 v3; } int_pkts; } __packed ctxt; - + u64 trans_id; u32 size; int ret; @@ -1769,10 +1769,10 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) goto free_int_desc; } - ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts, - size, (unsigned long)&ctxt.pci_pkt, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts, + size, (unsigned long)&ctxt.pci_pkt, + &trans_id, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret) { dev_err(&hbus->hdev->device, "Sending request for interrupt failed: 0x%x", @@ -1851,6 +1851,15 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) enable_tasklet: tasklet_enable(&channel->callback_event); + /* + * The completion packet on the stack becomes invalid after 'return'; + * remove the ID from the VMbus requestor if the identifier is still + * mapped to/associated with the packet. (The identifier could have + * been 're-used', i.e., already removed and (re-)mapped.) + * + * Cf. hv_pci_onchannelcallback(). + */ + vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt); free_int_desc: kfree(int_desc); drop_reference: @@ -2729,6 +2738,7 @@ static void hv_pci_onchannelcallback(void *context) struct pci_dev_inval_block *inval; struct pci_dev_incoming *dev_message; struct hv_pci_dev *hpdev; + unsigned long flags; buffer = kmalloc(bufferlen, GFP_ATOMIC); if (!buffer) @@ -2763,8 +2773,11 @@ static void hv_pci_onchannelcallback(void *context) switch (desc->type) { case VM_PKT_COMP: - req_addr = chan->request_addr_callback(chan, req_id); + lock_requestor(chan, flags); + req_addr = __vmbus_request_addr_match(chan, req_id, + VMBUS_RQST_ADDR_ANY); if (req_addr == VMBUS_RQST_ERROR) { + unlock_requestor(chan, flags); dev_err(&hbus->hdev->device, "Invalid transaction ID %llx\n", req_id); @@ -2772,9 +2785,17 @@ static void hv_pci_onchannelcallback(void *context) } comp_packet = (struct pci_packet *)req_addr; response = (struct pci_response *)buffer; + /* + * Call ->completion_func() within the critical section to make + * sure that the packet pointer is still valid during the call: + * here 'valid' means that there's a task still waiting for the + * completion, and that the packet data is still on the waiting + * task's stack. Cf. hv_compose_msi_msg(). + */ comp_packet->completion_func(comp_packet->compl_ctxt, response, bytes_recvd); + unlock_requestor(chan, flags); break; case VM_PKT_DATA_INBAND: From 71abb94ff63063f0cb303ac7860568639c10f42e Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:03 +0200 Subject: [PATCH 09/28] hv_sock: Check hv_pkt_iter_first_raw()'s return value The function returns NULL if the ring buffer doesn't contain enough readable bytes to constitute a packet descriptor. The ring buffer's write_index is in memory which is shared with the Hyper-V host, an erroneous or malicious host could thus change its value and overturn the result of hvs_stream_has_data(). Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220428145107.7878-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- net/vmw_vsock/hyperv_transport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index e111e13b6660..943352530936 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -603,6 +603,8 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, if (need_refill) { hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan); + if (!hvs->recv_desc) + return -ENOBUFS; ret = hvs_update_recv_data(hvs); if (ret) return ret; From 066f3377fb663f839a961cd66e71fc1a36437657 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:04 +0200 Subject: [PATCH 10/28] hv_sock: Copy packets sent by Hyper-V out of the ring buffer Pointers to VMbus packets sent by Hyper-V are used by the hv_sock driver within the guest VM. Hyper-V can send packets with erroneous values or modify packet fields after they are processed by the guest. To defend against these scenarios, copy the incoming packet after validating its length and offset fields using hv_pkt_iter_{first,next}(). Use HVS_PKT_LEN(HVS_MTU_SIZE) to initialize the buffer which holds the copies of the incoming packets. In this way, the packet can no longer be modified by the host. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220428145107.7878-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- net/vmw_vsock/hyperv_transport.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 943352530936..8c37d07017fc 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -78,6 +78,9 @@ struct hvs_send_buf { ALIGN((payload_len), 8) + \ VMBUS_PKT_TRAILER_SIZE) +/* Upper bound on the size of a VMbus packet for hv_sock */ +#define HVS_MAX_PKT_SIZE HVS_PKT_LEN(HVS_MTU_SIZE) + union hvs_service_id { guid_t srv_id; @@ -378,6 +381,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); } + chan->max_pkt_size = HVS_MAX_PKT_SIZE; + ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, conn_from_host ? new : sk); if (ret != 0) { @@ -602,7 +607,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, return -EOPNOTSUPP; if (need_refill) { - hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan); + hvs->recv_desc = hv_pkt_iter_first(hvs->chan); if (!hvs->recv_desc) return -ENOBUFS; ret = hvs_update_recv_data(hvs); @@ -618,7 +623,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, hvs->recv_data_len -= to_read; if (hvs->recv_data_len == 0) { - hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc); + hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc); if (hvs->recv_desc) { ret = hvs_update_recv_data(hvs); if (ret) From dbde6d0c7a5a462a1767a07c28eadd2c3dd08fb2 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:05 +0200 Subject: [PATCH 11/28] hv_sock: Add validation for untrusted Hyper-V values For additional robustness in the face of Hyper-V errors or malicious behavior, validate all values that originate from packets that Hyper-V has sent to the guest in the host-to-guest ring buffer. Ensure that invalid values cannot cause data being copied out of the bounds of the source buffer in hvs_stream_dequeue(). Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220428145107.7878-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 5 +++++ net/vmw_vsock/hyperv_transport.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 460a716f4748..a01c9fd0a334 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1696,6 +1696,11 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc) return (desc->len8 << 3) - (desc->offset8 << 3); } +/* Get packet length associated with descriptor */ +static inline u32 hv_pkt_len(const struct vmpacket_descriptor *desc) +{ + return desc->len8 << 3; +} struct vmpacket_descriptor * hv_pkt_iter_first_raw(struct vmbus_channel *channel); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 8c37d07017fc..fd98229e3db3 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -577,12 +577,18 @@ static bool hvs_dgram_allow(u32 cid, u32 port) static int hvs_update_recv_data(struct hvsock *hvs) { struct hvs_recv_buf *recv_buf; - u32 payload_len; + u32 pkt_len, payload_len; + + pkt_len = hv_pkt_len(hvs->recv_desc); + + if (pkt_len < HVS_HEADER_LEN) + return -EIO; recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1); payload_len = recv_buf->hdr.data_size; - if (payload_len > HVS_MTU_SIZE) + if (payload_len > pkt_len - HVS_HEADER_LEN || + payload_len > HVS_MTU_SIZE) return -EIO; if (payload_len == 0) From da795eb239d9e9496812e22cb582d38a71e0789a Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:06 +0200 Subject: [PATCH 12/28] Drivers: hv: vmbus: Accept hv_sock offers in isolated guests So that isolated guests can communicate with the host via hv_sock channels. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220428145107.7878-5-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 8 ++++++-- include/linux/hyperv.h | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 50fea31f019f..0db1b775e013 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -976,13 +976,17 @@ find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer) return channel; } -static bool vmbus_is_valid_device(const guid_t *guid) +static bool vmbus_is_valid_offer(const struct vmbus_channel_offer_channel *offer) { + const guid_t *guid = &offer->offer.if_type; u16 i; if (!hv_is_isolation_supported()) return true; + if (is_hvsock_offer(offer)) + return true; + for (i = 0; i < ARRAY_SIZE(vmbus_devs); i++) { if (guid_equal(guid, &vmbus_devs[i].guid)) return vmbus_devs[i].allowed_in_isolated; @@ -1004,7 +1008,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) trace_vmbus_onoffer(offer); - if (!vmbus_is_valid_device(&offer->offer.if_type)) { + if (!vmbus_is_valid_offer(offer)) { pr_err_ratelimited("Invalid offer %d from the host supporting isolation\n", offer->child_relid); atomic_dec(&vmbus_connection.offer_in_progress); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a01c9fd0a334..b028905d8334 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1064,10 +1064,14 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); +} + static inline bool is_hvsock_channel(const struct vmbus_channel *c) { - return !!(c->offermsg.offer.chn_flags & - VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); + return is_hvsock_offer(&c->offermsg); } static inline bool is_sub_channel(const struct vmbus_channel *c) From 1c9de08f7f952b4101f092802581344033d84429 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:07 +0200 Subject: [PATCH 13/28] Drivers: hv: vmbus: Refactor the ring-buffer iterator functions With no users of hv_pkt_iter_next_raw() and no "external" users of hv_pkt_iter_first_raw(), the iterator functions can be refactored and simplified to remove some indirection/code. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220428145107.7878-6-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/ring_buffer.c | 34 ++++++++++------------------------ include/linux/hyperv.h | 35 ++++------------------------------- 2 files changed, 14 insertions(+), 55 deletions(-) diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index e101b11f95e5..59a4aa86d1f3 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -429,7 +429,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, memcpy(buffer, (const char *)desc + offset, packetlen); /* Advance ring index to next packet descriptor */ - __hv_pkt_iter_next(channel, desc, true); + __hv_pkt_iter_next(channel, desc); /* Notify host of update */ hv_pkt_iter_close(channel); @@ -464,22 +464,6 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) return (rbi->ring_datasize - priv_read_loc) + write_loc; } -/* - * Get first vmbus packet without copying it out of the ring buffer - */ -struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *rbi = &channel->inbound; - - hv_debug_delay_test(channel, MESSAGE_DELAY); - - if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) - return NULL; - - return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index); -} -EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw); - /* * Get first vmbus packet from ring buffer after read_index * @@ -491,11 +475,14 @@ struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) struct vmpacket_descriptor *desc, *desc_copy; u32 bytes_avail, pkt_len, pkt_offset; - desc = hv_pkt_iter_first_raw(channel); - if (!desc) - return NULL; + hv_debug_delay_test(channel, MESSAGE_DELAY); - bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi)); + bytes_avail = hv_pkt_iter_avail(rbi); + if (bytes_avail < sizeof(struct vmpacket_descriptor)) + return NULL; + bytes_avail = min(rbi->pkt_buffer_size, bytes_avail); + + desc = (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index); /* * Ensure the compiler does not use references to incoming Hyper-V values (which @@ -542,8 +529,7 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first); */ struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *desc, - bool copy) + const struct vmpacket_descriptor *desc) { struct hv_ring_buffer_info *rbi = &channel->inbound; u32 packetlen = desc->len8 << 3; @@ -556,7 +542,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel, rbi->priv_read_index -= dsize; /* more data? */ - return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel); + return hv_pkt_iter_first(channel); } EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b028905d8334..c440c45887c2 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1706,55 +1706,28 @@ static inline u32 hv_pkt_len(const struct vmpacket_descriptor *desc) return desc->len8 << 3; } -struct vmpacket_descriptor * -hv_pkt_iter_first_raw(struct vmbus_channel *channel); - struct vmpacket_descriptor * hv_pkt_iter_first(struct vmbus_channel *channel); struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt, - bool copy); + const struct vmpacket_descriptor *pkt); void hv_pkt_iter_close(struct vmbus_channel *channel); static inline struct vmpacket_descriptor * -hv_pkt_iter_next_pkt(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt, - bool copy) +hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt) { struct vmpacket_descriptor *nxt; - nxt = __hv_pkt_iter_next(channel, pkt, copy); + nxt = __hv_pkt_iter_next(channel, pkt); if (!nxt) hv_pkt_iter_close(channel); return nxt; } -/* - * Get next packet descriptor without copying it out of the ring buffer - * If at end of list, return NULL and update host. - */ -static inline struct vmpacket_descriptor * -hv_pkt_iter_next_raw(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt) -{ - return hv_pkt_iter_next_pkt(channel, pkt, false); -} - -/* - * Get next packet descriptor from iterator - * If at end of list, return NULL and update host. - */ -static inline struct vmpacket_descriptor * -hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt) -{ - return hv_pkt_iter_next_pkt(channel, pkt, true); -} - #define foreach_vmbus_pkt(pkt, channel) \ for (pkt = hv_pkt_iter_first(channel); pkt; \ pkt = hv_pkt_iter_next(channel, pkt)) From 455880dfe292a2bdd3b4ad6a107299fce610e64b Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 27 Apr 2022 08:07:33 -0600 Subject: [PATCH 14/28] PCI: hv: Fix hv_arch_irq_unmask() for multi-MSI In the multi-MSI case, hv_arch_irq_unmask() will only operate on the first MSI of the N allocated. This is because only the first msi_desc is cached and it is shared by all the MSIs of the multi-MSI block. This means that hv_arch_irq_unmask() gets the correct address, but the wrong data (always 0). This can break MSIs. Lets assume MSI0 is vector 34 on CPU0, and MSI1 is vector 33 on CPU0. hv_arch_irq_unmask() is called on MSI0. It uses a hypercall to configure the MSI address and data (0) to vector 34 of CPU0. This is correct. Then hv_arch_irq_unmask is called on MSI1. It uses another hypercall to configure the MSI address and data (0) to vector 33 of CPU0. This is wrong, and results in both MSI0 and MSI1 being routed to vector 33. Linux will observe extra instances of MSI1 and no instances of MSI0 despite the endpoint device behaving correctly. For the multi-MSI case, we need unique address and data info for each MSI, but the cached msi_desc does not provide that. However, that information can be gotten from the int_desc cached in the chip_data by compose_msi_msg(). Fix the multi-MSI case to use that cached information instead. Since hv_set_msi_entry_from_desc() is no longer applicable, remove it. Signed-off-by: Jeffrey Hugo Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1651068453-29588-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 5800ecfcc517..7aea0b7a994b 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -611,13 +611,6 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *data) return cfg->vector; } -static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, - struct msi_desc *msi_desc) -{ - msi_entry->address.as_uint32 = msi_desc->msg.address_lo; - msi_entry->data.as_uint32 = msi_desc->msg.data; -} - static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *info) { @@ -647,6 +640,7 @@ static void hv_arch_irq_unmask(struct irq_data *data) { struct msi_desc *msi_desc = irq_data_get_msi_desc(data); struct hv_retarget_device_interrupt *params; + struct tran_int_desc *int_desc; struct hv_pcibus_device *hbus; struct cpumask *dest; cpumask_var_t tmp; @@ -661,6 +655,7 @@ static void hv_arch_irq_unmask(struct irq_data *data) pdev = msi_desc_to_pci_dev(msi_desc); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + int_desc = data->chip_data; spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); @@ -668,7 +663,8 @@ static void hv_arch_irq_unmask(struct irq_data *data) memset(params, 0, sizeof(*params)); params->partition_id = HV_PARTITION_ID_SELF; params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; - hv_set_msi_entry_from_desc(¶ms->int_entry.msi_entry, msi_desc); + params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff; + params->int_entry.msi_entry.data.as_uint32 = int_desc->data; params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | (hbus->hdev->dev_instance.b[4] << 16) | (hbus->hdev->dev_instance.b[7] << 8) | From 23e118a48acf7be223e57d98e98da8ac5a4071ac Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 2 May 2022 00:42:55 -0700 Subject: [PATCH 15/28] PCI: hv: Do not set PCI_COMMAND_MEMORY to reduce VM boot time Currently when the pci-hyperv driver finishes probing and initializing the PCI device, it sets the PCI_COMMAND_MEMORY bit; later when the PCI device is registered to the core PCI subsystem, the core PCI driver's BAR detection and initialization code toggles the bit multiple times, and each toggling of the bit causes the hypervisor to unmap/map the virtual BARs from/to the physical BARs, which can be slow if the BAR sizes are huge, e.g., a Linux VM with 14 GPU devices has to spend more than 3 minutes on BAR detection and initialization, causing a long boot time. Reduce the boot time by not setting the PCI_COMMAND_MEMORY bit when we register the PCI device (there is no need to have it set in the first place). The bit stays off till the PCI device driver calls pci_enable_device(). With this change, the boot time of such a 14-GPU VM is reduced by almost 3 minutes. Link: https://lore.kernel.org/lkml/20220419220007.26550-1-decui@microsoft.com/ Tested-by: Boqun Feng (Microsoft) Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: Jake Oshins Link: https://lore.kernel.org/r/20220502074255.16901-1-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 7aea0b7a994b..cf2fe5754fde 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2103,12 +2103,17 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus) } } if (high_size <= 1 && low_size <= 1) { - /* Set the memory enable bit. */ - _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, - &command); - command |= PCI_COMMAND_MEMORY; - _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, - command); + /* + * No need to set the PCI_COMMAND_MEMORY bit as + * the core PCI driver doesn't require the bit + * to be pre-set. Actually here we intentionally + * keep the bit off so that the PCI BAR probing + * in the core PCI driver doesn't cause Hyper-V + * to unnecessarily unmap/map the virtual BARs + * from/to the physical BARs multiple times. + * This reduces the VM boot time significantly + * if the BAR sizes are huge. + */ break; } } From 6733dd4af7818559114e2a4771363dd6239297f6 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Mon, 11 Apr 2022 21:28:59 -0700 Subject: [PATCH 16/28] drm/hyperv: Add error message for fb size greater than allocated Add error message when the size of requested framebuffer is more than the allocated size by vmbus mmio region for framebuffer. Signed-off-by: Saurabh Sengar Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1649737739-10113-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Wei Liu --- drivers/gpu/drm/hyperv/hyperv_drm_modeset.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c index e82b815f83a6..27f4fcb058f9 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c @@ -123,8 +123,11 @@ static int hyperv_pipe_check(struct drm_simple_display_pipe *pipe, if (fb->format->format != DRM_FORMAT_XRGB8888) return -EINVAL; - if (fb->pitches[0] * fb->height > hv->fb_size) + if (fb->pitches[0] * fb->height > hv->fb_size) { + drm_err(&hv->dev, "fb size requested by %s for %dX%d (pitch %d) greater than %ld\n", + current->comm, fb->width, fb->height, fb->pitches[0], hv->fb_size); return -EINVAL; + } return 0; } From f1f8288d19d03af9d03db219c23d07a6e8ecd51b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 9 May 2022 08:44:23 -0700 Subject: [PATCH 17/28] x86/hyperv: Disable hardlockup detector by default in Hyper-V guests In newer versions of Hyper-V, the x86/x64 PMU can be virtualized into guest VMs by explicitly enabling it. Linux kernels are typically built to automatically enable the hardlockup detector if the PMU is found. To prevent the possibility of false positives due to the vagaries of VM scheduling, disable the PMU-based hardlockup detector by default in a VM on Hyper-V. The hardlockup detector can still be enabled by overriding the default with the nmi_watchdog=1 option on the kernel boot line or via sysctl at runtime. This change mimics the approach taken with KVM guests in commit 692297d8f968 ("watchdog: introduce the hardlockup_detector_disable() function"). Linux on ARM64 does not provide a PMU-based hardlockup detector, so there's no corresponding disable in the Hyper-V init code on ARM64. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1652111063-6535-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 4b67094215bb..16dad62c8155 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -465,6 +465,8 @@ static void __init ms_hyperv_init_platform(void) */ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) mark_tsc_unstable("running on Hyper-V"); + + hardlockup_detector_disable(); } static bool __init ms_hyperv_x2apic_available(void) From a6b94c6b49198266eaf78095a632df7245ef5196 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 2 May 2022 09:36:28 -0700 Subject: [PATCH 18/28] Drivers: hv: vmbus: Remove support for Hyper-V 2008 and Hyper-V 2008R2/Win7 The VMbus driver has special case code for running on the first released versions of Hyper-V: 2008 and 2008 R2/Windows 7. These versions are now out of support (except for extended security updates) and lack the performance features needed for effective production usage of Linux guests. Simplify the code by removing the negotiation of the VMbus protocol versions required for these releases of Hyper-V, and by removing the special case code for handling these VMbus protocol versions. Signed-off-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1651509391-2058-2-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 29 +++++++------------ drivers/hv/connection.c | 6 ++-- drivers/hv/vmbus_drv.c | 60 ++++++++------------------------------- include/linux/hyperv.h | 10 +++++-- 4 files changed, 33 insertions(+), 72 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 0db1b775e013..97d8f5646778 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -714,15 +714,13 @@ static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn) static int next_numa_node_id; /* - * Starting with Win8, we can statically distribute the incoming - * channel interrupt load by binding a channel to VCPU. + * We can statically distribute the incoming channel interrupt load + * by binding a channel to VCPU. * - * For pre-win8 hosts or non-performance critical channels we assign the - * VMBUS_CONNECT_CPU. - * - * Starting with win8, performance critical channels will be distributed - * evenly among all the available NUMA nodes. Once the node is assigned, - * we will assign the CPU based on a simple round robin scheme. + * For non-performance critical channels we assign the VMBUS_CONNECT_CPU. + * Performance critical channels will be distributed evenly among all + * the available NUMA nodes. Once the node is assigned, we will assign + * the CPU based on a simple round robin scheme. */ static void init_vp_index(struct vmbus_channel *channel) { @@ -733,13 +731,10 @@ static void init_vp_index(struct vmbus_channel *channel) u32 target_cpu; int numa_node; - if ((vmbus_proto_version == VERSION_WS2008) || - (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) || + if (!perf_chn || !alloc_cpumask_var(&available_mask, GFP_KERNEL)) { /* - * Prior to win8, all channel interrupts are - * delivered on VMBUS_CONNECT_CPU. - * Also if the channel is not a performance critical + * If the channel is not a performance critical * channel, bind it to VMBUS_CONNECT_CPU. * In case alloc_cpumask_var() fails, bind it to * VMBUS_CONNECT_CPU. @@ -932,11 +927,9 @@ static void vmbus_setup_channel_state(struct vmbus_channel *channel, */ channel->sig_event = VMBUS_EVENT_CONNECTION_ID; - if (vmbus_proto_version != VERSION_WS2008) { - channel->is_dedicated_interrupt = - (offer->is_dedicated_interrupt != 0); - channel->sig_event = offer->connection_id; - } + channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + channel->sig_event = offer->connection_id; memcpy(&channel->offermsg, offer, sizeof(struct vmbus_channel_offer_channel)); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index a3d8be8d6cfb..6218bbf6863a 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -47,6 +47,8 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); /* * Table of VMBus versions listed from newest to oldest. + * VERSION_WIN7 and VERSION_WS2008 are no longer supported in + * Linux guests and are not listed. */ static __u32 vmbus_versions[] = { VERSION_WIN10_V5_3, @@ -56,9 +58,7 @@ static __u32 vmbus_versions[] = { VERSION_WIN10_V4_1, VERSION_WIN10, VERSION_WIN8_1, - VERSION_WIN8, - VERSION_WIN7, - VERSION_WS2008 + VERSION_WIN8 }; /* diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 14de17087864..9c1b3620775c 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1263,23 +1263,17 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) unsigned long *recv_int_page; u32 maxbits, relid; - if (vmbus_proto_version < VERSION_WIN8) { - maxbits = MAX_NUM_CHANNELS_SUPPORTED; - recv_int_page = vmbus_connection.recv_int_page; - } else { - /* - * When the host is win8 and beyond, the event page - * can be directly checked to get the id of the channel - * that has the interrupt pending. - */ - void *page_addr = hv_cpu->synic_event_page; - union hv_synic_event_flags *event - = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; + /* + * The event page can be directly checked to get the id of + * the channel that has the interrupt pending. + */ + void *page_addr = hv_cpu->synic_event_page; + union hv_synic_event_flags *event + = (union hv_synic_event_flags *)page_addr + + VMBUS_MESSAGE_SINT; - maxbits = HV_EVENT_FLAGS_COUNT; - recv_int_page = event->flags; - } + maxbits = HV_EVENT_FLAGS_COUNT; + recv_int_page = event->flags; if (unlikely(!recv_int_page)) return; @@ -1351,40 +1345,10 @@ static void vmbus_isr(void) { struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - void *page_addr = hv_cpu->synic_event_page; + void *page_addr; struct hv_message *msg; - union hv_synic_event_flags *event; - bool handled = false; - if (unlikely(page_addr == NULL)) - return; - - event = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; - /* - * Check for events before checking for messages. This is the order - * in which events and messages are checked in Windows guests on - * Hyper-V, and the Windows team suggested we do the same. - */ - - if ((vmbus_proto_version == VERSION_WS2008) || - (vmbus_proto_version == VERSION_WIN7)) { - - /* Since we are a child, we only need to check bit 0 */ - if (sync_test_and_clear_bit(0, event->flags)) - handled = true; - } else { - /* - * Our host is win8 or above. The signaling mechanism - * has changed and we can directly look at the event page. - * If bit n is set then we have an interrup on the channel - * whose id is n. - */ - handled = true; - } - - if (handled) - vmbus_chan_sched(hv_cpu); + vmbus_chan_sched(hv_cpu); page_addr = hv_cpu->synic_message_page; msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index c440c45887c2..a2464295c14a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -230,15 +230,19 @@ static inline u32 hv_get_avail_to_write_percent( * two 16 bit quantities: major_number. minor_number. * * 0 . 13 (Windows Server 2008) - * 1 . 1 (Windows 7) - * 2 . 4 (Windows 8) - * 3 . 0 (Windows 8 R2) + * 1 . 1 (Windows 7, WS2008 R2) + * 2 . 4 (Windows 8, WS2012) + * 3 . 0 (Windows 8.1, WS2012 R2) * 4 . 0 (Windows 10) * 4 . 1 (Windows 10 RS3) * 5 . 0 (Newer Windows 10) * 5 . 1 (Windows 10 RS4) * 5 . 2 (Windows Server 2019, RS5) * 5 . 3 (Windows Server 2022) + * + * The WS2008 and WIN7 versions are listed here for + * completeness but are no longer supported in the + * Linux kernel. */ #define VERSION_WS2008 ((0 << 16) | (13)) From 106b98a5181c1a5831f1fe31d33d17dd1f0e7ae1 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 2 May 2022 09:36:29 -0700 Subject: [PATCH 19/28] scsi: storvsc: Remove support for Hyper-V 2008 and 2008R2/Win7 The storvsc driver has special case code for running on the first released versions of Hyper-V: 2008 and 2008 R2/Windows 7. These versions are now out of support (except for extended security updates) and lack support for performance features like multiple VMbus channels that are needed for effective production usage of Linux guests. The negotiation of the VMbus protocol versions required by these old Hyper-V versions has been removed from the VMbus driver. So now remove the handling of these VMbus protocol versions from the storvsc driver. Signed-off-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1651509391-2058-3-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/scsi/storvsc_drv.c | 36 +++++++++--------------------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 9a0bba5a51a7..5585e9d30bbf 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1966,34 +1966,16 @@ static int storvsc_probe(struct hv_device *device, bool is_fc = ((dev_id->driver_data == SFC_GUID) ? true : false); int target = 0; struct storvsc_device *stor_device; - int max_luns_per_target; - int max_targets; - int max_channels; int max_sub_channels = 0; /* - * Based on the windows host we are running on, - * set state to properly communicate with the host. + * We support sub-channels for storage on SCSI and FC controllers. + * The number of sub-channels offerred is based on the number of + * VCPUs in the guest. */ - - if (vmbus_proto_version < VERSION_WIN8) { - max_luns_per_target = STORVSC_IDE_MAX_LUNS_PER_TARGET; - max_targets = STORVSC_IDE_MAX_TARGETS; - max_channels = STORVSC_IDE_MAX_CHANNELS; - } else { - max_luns_per_target = STORVSC_MAX_LUNS_PER_TARGET; - max_targets = STORVSC_MAX_TARGETS; - max_channels = STORVSC_MAX_CHANNELS; - /* - * On Windows8 and above, we support sub-channels for storage - * on SCSI and FC controllers. - * The number of sub-channels offerred is based on the number of - * VCPUs in the guest. - */ - if (!dev_is_ide) - max_sub_channels = - (num_cpus - 1) / storvsc_vcpus_per_sub_channel; - } + if (!dev_is_ide) + max_sub_channels = + (num_cpus - 1) / storvsc_vcpus_per_sub_channel; scsi_driver.can_queue = max_outstanding_req_per_channel * (max_sub_channels + 1) * @@ -2046,9 +2028,9 @@ static int storvsc_probe(struct hv_device *device, break; case SCSI_GUID: - host->max_lun = max_luns_per_target; - host->max_id = max_targets; - host->max_channel = max_channels - 1; + host->max_lun = STORVSC_MAX_LUNS_PER_TARGET; + host->max_id = STORVSC_MAX_TARGETS; + host->max_channel = STORVSC_MAX_CHANNELS - 1; break; default: From b0cce4f6fe6633546bbe5ca04f965f76948e2f34 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 2 May 2022 09:36:30 -0700 Subject: [PATCH 20/28] video: hyperv_fb: Remove support for Hyper-V 2008 and 2008R2/Win7 The hyperv_fb driver has special case code for running on the first released versions of Hyper-V: 2008 and 2008 R2/Windows 7. These versions are now out of support (except for extended security updates) and lack support for performance features that are needed for effective production usage of Linux guests. The negotiation of the VMbus protocol versions required by these old Hyper-V versions has been removed from the VMbus driver. So now remove the handling of these VMbus protocol versions from the hyperv_fb driver. Signed-off-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1651509391-2058-4-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/video/fbdev/hyperv_fb.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index c8e0ea27caf1..7563d54a9d41 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -63,6 +63,7 @@ #define MAX_VMBUS_PKT_SIZE 0x4000 #define SYNTHVID_VERSION(major, minor) ((minor) << 16 | (major)) +/* Support for VERSION_WIN7 is removed. #define is retained for reference. */ #define SYNTHVID_VERSION_WIN7 SYNTHVID_VERSION(3, 0) #define SYNTHVID_VERSION_WIN8 SYNTHVID_VERSION(3, 2) #define SYNTHVID_VERSION_WIN10 SYNTHVID_VERSION(3, 5) @@ -70,13 +71,7 @@ #define SYNTHVID_VER_GET_MAJOR(ver) (ver & 0x0000ffff) #define SYNTHVID_VER_GET_MINOR(ver) ((ver & 0xffff0000) >> 16) -#define SYNTHVID_DEPTH_WIN7 16 #define SYNTHVID_DEPTH_WIN8 32 - -#define SYNTHVID_FB_SIZE_WIN7 (4 * 1024 * 1024) -#define SYNTHVID_WIDTH_MAX_WIN7 1600 -#define SYNTHVID_HEIGHT_MAX_WIN7 1200 - #define SYNTHVID_FB_SIZE_WIN8 (8 * 1024 * 1024) #define PCI_VENDOR_ID_MICROSOFT 0x1414 @@ -644,12 +639,6 @@ static int synthvid_connect_vsp(struct hv_device *hdev) case VERSION_WIN8: case VERSION_WIN8_1: ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN8); - if (!ret) - break; - fallthrough; - case VERSION_WS2008: - case VERSION_WIN7: - ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN7); break; default: ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN10); @@ -661,11 +650,7 @@ static int synthvid_connect_vsp(struct hv_device *hdev) goto error; } - if (par->synthvid_version == SYNTHVID_VERSION_WIN7) - screen_depth = SYNTHVID_DEPTH_WIN7; - else - screen_depth = SYNTHVID_DEPTH_WIN8; - + screen_depth = SYNTHVID_DEPTH_WIN8; if (synthvid_ver_ge(par->synthvid_version, SYNTHVID_VERSION_WIN10)) { ret = synthvid_get_supported_resolution(hdev); if (ret) @@ -933,9 +918,7 @@ static void hvfb_get_option(struct fb_info *info) (synthvid_ver_ge(par->synthvid_version, SYNTHVID_VERSION_WIN10) && (x * y * screen_depth / 8 > screen_fb_size)) || (par->synthvid_version == SYNTHVID_VERSION_WIN8 && - x * y * screen_depth / 8 > SYNTHVID_FB_SIZE_WIN8) || - (par->synthvid_version == SYNTHVID_VERSION_WIN7 && - (x > SYNTHVID_WIDTH_MAX_WIN7 || y > SYNTHVID_HEIGHT_MAX_WIN7))) { + x * y * screen_depth / 8 > SYNTHVID_FB_SIZE_WIN8)) { pr_err("Screen resolution option is out of range: skipped\n"); return; } From ac6811a9b36f3ceb549d8b84bd8aeedf6026df02 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 2 May 2022 09:36:31 -0700 Subject: [PATCH 21/28] drm/hyperv: Remove support for Hyper-V 2008 and 2008R2/Win7 The DRM Hyper-V driver has special case code for running on the first released versions of Hyper-V: 2008 and 2008 R2/Windows 7. These versions are now out of support (except for extended security updates) and lack support for performance features that are needed for effective production usage of Linux guests. The negotiation of the VMbus protocol versions required by these old Hyper-V versions has been removed from the VMbus driver. So now remove the handling of these VMbus protocol versions from the DRM Hyper-V driver. Signed-off-by: Michael Kelley Reviewed-by: Deepak Rawat Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1651509391-2058-5-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/gpu/drm/hyperv/hyperv_drm_proto.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_proto.c b/drivers/gpu/drm/hyperv/hyperv_drm_proto.c index c0155c6271bf..76a182a9a765 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_proto.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_proto.c @@ -18,16 +18,16 @@ #define SYNTHVID_VERSION(major, minor) ((minor) << 16 | (major)) #define SYNTHVID_VER_GET_MAJOR(ver) (ver & 0x0000ffff) #define SYNTHVID_VER_GET_MINOR(ver) ((ver & 0xffff0000) >> 16) + +/* Support for VERSION_WIN7 is removed. #define is retained for reference. */ #define SYNTHVID_VERSION_WIN7 SYNTHVID_VERSION(3, 0) #define SYNTHVID_VERSION_WIN8 SYNTHVID_VERSION(3, 2) #define SYNTHVID_VERSION_WIN10 SYNTHVID_VERSION(3, 5) -#define SYNTHVID_DEPTH_WIN7 16 #define SYNTHVID_DEPTH_WIN8 32 -#define SYNTHVID_FB_SIZE_WIN7 (4 * 1024 * 1024) +#define SYNTHVID_WIDTH_WIN8 1600 +#define SYNTHVID_HEIGHT_WIN8 1200 #define SYNTHVID_FB_SIZE_WIN8 (8 * 1024 * 1024) -#define SYNTHVID_WIDTH_MAX_WIN7 1600 -#define SYNTHVID_HEIGHT_MAX_WIN7 1200 enum pipe_msg_type { PIPE_MSG_INVALID, @@ -496,12 +496,6 @@ int hyperv_connect_vsp(struct hv_device *hdev) case VERSION_WIN8: case VERSION_WIN8_1: ret = hyperv_negotiate_version(hdev, SYNTHVID_VERSION_WIN8); - if (!ret) - break; - fallthrough; - case VERSION_WS2008: - case VERSION_WIN7: - ret = hyperv_negotiate_version(hdev, SYNTHVID_VERSION_WIN7); break; default: ret = hyperv_negotiate_version(hdev, SYNTHVID_VERSION_WIN10); @@ -513,18 +507,15 @@ int hyperv_connect_vsp(struct hv_device *hdev) goto error; } - if (hv->synthvid_version == SYNTHVID_VERSION_WIN7) - hv->screen_depth = SYNTHVID_DEPTH_WIN7; - else - hv->screen_depth = SYNTHVID_DEPTH_WIN8; + hv->screen_depth = SYNTHVID_DEPTH_WIN8; if (hyperv_version_ge(hv->synthvid_version, SYNTHVID_VERSION_WIN10)) { ret = hyperv_get_supported_resolution(hdev); if (ret) drm_err(dev, "Failed to get supported resolution from host, use default\n"); } else { - hv->screen_width_max = SYNTHVID_WIDTH_MAX_WIN7; - hv->screen_height_max = SYNTHVID_HEIGHT_MAX_WIN7; + hv->screen_width_max = SYNTHVID_WIDTH_WIN8; + hv->screen_height_max = SYNTHVID_HEIGHT_WIN8; } hv->mmio_megabytes = hdev->channel->offermsg.offer.mmio_megabytes; From b4b77778ecc5bfbd4e77de1b2fd5c1dd3c655f1f Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 11 May 2022 09:23:02 -0600 Subject: [PATCH 22/28] PCI: hv: Reuse existing IRTE allocation in compose_msi_msg() Currently if compose_msi_msg() is called multiple times, it will free any previous IRTE allocation, and generate a new allocation. While nothing prevents this from occurring, it is extraneous when Linux could just reuse the existing allocation and avoid a bunch of overhead. However, when future IRTE allocations operate on blocks of MSIs instead of a single line, freeing the allocation will impact all of the lines. This could cause an issue where an allocation of N MSIs occurs, then some of the lines are retargeted, and finally the allocation is freed/reallocated. The freeing of the allocation removes all of the configuration for the entire block, which requires all the lines to be retargeted, which might not happen since some lines might already be unmasked/active. Signed-off-by: Jeffrey Hugo Reviewed-by: Dexuan Cui Tested-by: Dexuan Cui Tested-by: Michael Kelley Link: https://lore.kernel.org/r/1652282582-21595-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index cf2fe5754fde..5e2e637ad344 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1707,6 +1707,15 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) u32 size; int ret; + /* Reuse the previous allocation */ + if (data->chip_data) { + int_desc = data->chip_data; + msg->address_hi = int_desc->address >> 32; + msg->address_lo = int_desc->address & 0xffffffff; + msg->data = int_desc->data; + return; + } + pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; @@ -1716,13 +1725,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (!hpdev) goto return_null_message; - /* Free any previous message that might have already been composed. */ - if (data->chip_data) { - int_desc = data->chip_data; - data->chip_data = NULL; - hv_int_desc_free(hpdev, int_desc); - } - int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); if (!int_desc) goto drop_reference; From a2bad844a67b1c7740bda63e87453baf63c3a7f7 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 11 May 2022 09:23:19 -0600 Subject: [PATCH 23/28] PCI: hv: Fix interrupt mapping for multi-MSI According to Dexuan, the hypervisor folks beleive that multi-msi allocations are not correct. compose_msi_msg() will allocate multi-msi one by one. However, multi-msi is a block of related MSIs, with alignment requirements. In order for the hypervisor to allocate properly aligned and consecutive entries in the IOMMU Interrupt Remapping Table, there should be a single mapping request that requests all of the multi-msi vectors in one shot. Dexuan suggests detecting the multi-msi case and composing a single request related to the first MSI. Then for the other MSIs in the same block, use the cached information. This appears to be viable, so do it. Suggested-by: Dexuan Cui Signed-off-by: Jeffrey Hugo Reviewed-by: Dexuan Cui Tested-by: Michael Kelley Link: https://lore.kernel.org/r/1652282599-21643-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 60 ++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 5e2e637ad344..e439b810f974 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1525,6 +1525,10 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev, u8 buffer[sizeof(struct pci_delete_interrupt)]; } ctxt; + if (!int_desc->vector_count) { + kfree(int_desc); + return; + } memset(&ctxt, 0, sizeof(ctxt)); int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = @@ -1609,12 +1613,12 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp, static u32 hv_compose_msi_req_v1( struct pci_create_interrupt *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = DELIVERY_MODE; /* @@ -1637,14 +1641,14 @@ static int hv_compose_msi_req_get_cpu(struct cpumask *affinity) static u32 hv_compose_msi_req_v2( struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int cpu; int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = DELIVERY_MODE; cpu = hv_compose_msi_req_get_cpu(affinity); int_pkt->int_desc.processor_array[0] = @@ -1656,7 +1660,7 @@ static u32 hv_compose_msi_req_v2( static u32 hv_compose_msi_req_v3( struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity, - u32 slot, u32 vector) + u32 slot, u32 vector, u8 vector_count) { int cpu; @@ -1664,7 +1668,7 @@ static u32 hv_compose_msi_req_v3( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.reserved = 0; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = DELIVERY_MODE; cpu = hv_compose_msi_req_get_cpu(affinity); int_pkt->int_desc.processor_array[0] = @@ -1695,6 +1699,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct cpumask *dest; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; + struct msi_desc *msi_desc; + u8 vector, vector_count; struct { struct pci_packet pci_pkt; union { @@ -1716,7 +1722,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return; } - pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); + msi_desc = irq_data_get_msi_desc(data); + pdev = msi_desc_to_pci_dev(msi_desc); dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); @@ -1729,6 +1736,36 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (!int_desc) goto drop_reference; + if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) { + /* + * If this is not the first MSI of Multi MSI, we already have + * a mapping. Can exit early. + */ + if (msi_desc->irq != data->irq) { + data->chip_data = int_desc; + int_desc->address = msi_desc->msg.address_lo | + (u64)msi_desc->msg.address_hi << 32; + int_desc->data = msi_desc->msg.data + + (data->irq - msi_desc->irq); + msg->address_hi = msi_desc->msg.address_hi; + msg->address_lo = msi_desc->msg.address_lo; + msg->data = int_desc->data; + put_pcichild(hpdev); + return; + } + /* + * The vector we select here is a dummy value. The correct + * value gets sent to the hypervisor in unmask(). This needs + * to be aligned with the count, and also not zero. Multi-msi + * is powers of 2 up to 32, so 32 will always work here. + */ + vector = 32; + vector_count = msi_desc->nvec_used; + } else { + vector = hv_msi_get_int_vector(data); + vector_count = 1; + } + memset(&ctxt, 0, sizeof(ctxt)); init_completion(&comp.comp_pkt.host_event); ctxt.pci_pkt.completion_func = hv_pci_compose_compl; @@ -1739,7 +1776,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, dest, hpdev->desc.win_slot.slot, - hv_msi_get_int_vector(data)); + vector, + vector_count); break; case PCI_PROTOCOL_VERSION_1_2: @@ -1747,14 +1785,16 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, dest, hpdev->desc.win_slot.slot, - hv_msi_get_int_vector(data)); + vector, + vector_count); break; case PCI_PROTOCOL_VERSION_1_4: size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3, dest, hpdev->desc.win_slot.slot, - hv_msi_get_int_vector(data)); + vector, + vector_count); break; default: From 9937fa6d1eb6fac95586970e17617a718919c858 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 12 May 2022 00:32:06 +0200 Subject: [PATCH 24/28] PCI: hv: Add validation for untrusted Hyper-V values For additional robustness in the face of Hyper-V errors or malicious behavior, validate all values that originate from packets that Hyper-V has sent to the guest in the host-to-guest ring buffer. Ensure that invalid values cannot cause data being copied out of the bounds of the source buffer in hv_pci_onchannelcallback(). While at it, remove a redundant validation in hv_pci_generic_compl(): hv_pci_onchannelcallback() already ensures that all processed incoming packets are "at least as large as [in fact larger than] a response". Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220511223207.3386-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 33 +++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e439b810f974..a06e2cf94658 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -981,11 +981,7 @@ static void hv_pci_generic_compl(void *context, struct pci_response *resp, { struct hv_pci_compl *comp_pkt = context; - if (resp_packet_size >= offsetofend(struct pci_response, status)) - comp_pkt->completion_status = resp->status; - else - comp_pkt->completion_status = -1; - + comp_pkt->completion_status = resp->status; complete(&comp_pkt->host_event); } @@ -1606,8 +1602,13 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp, struct pci_create_int_response *int_resp = (struct pci_create_int_response *)resp; + if (resp_packet_size < sizeof(*int_resp)) { + comp_pkt->comp_pkt.completion_status = -1; + goto out; + } comp_pkt->comp_pkt.completion_status = resp->status; comp_pkt->int_desc = int_resp->int_desc; +out: complete(&comp_pkt->comp_pkt.host_event); } @@ -2291,12 +2292,14 @@ static void q_resource_requirements(void *context, struct pci_response *resp, struct q_res_req_compl *completion = context; struct pci_q_res_req_response *q_res_req = (struct pci_q_res_req_response *)resp; + s32 status; int i; - if (resp->status < 0) { + status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status; + if (status < 0) { dev_err(&completion->hpdev->hbus->hdev->device, "query resource requirements failed: %x\n", - resp->status); + status); } else { for (i = 0; i < PCI_STD_NUM_BARS; i++) { completion->hpdev->probed_bar[i] = @@ -2848,7 +2851,8 @@ static void hv_pci_onchannelcallback(void *context) case PCI_BUS_RELATIONS: bus_rel = (struct pci_bus_relations *)buffer; - if (bytes_recvd < + if (bytes_recvd < sizeof(*bus_rel) || + bytes_recvd < struct_size(bus_rel, func, bus_rel->device_count)) { dev_err(&hbus->hdev->device, @@ -2862,7 +2866,8 @@ static void hv_pci_onchannelcallback(void *context) case PCI_BUS_RELATIONS2: bus_rel2 = (struct pci_bus_relations2 *)buffer; - if (bytes_recvd < + if (bytes_recvd < sizeof(*bus_rel2) || + bytes_recvd < struct_size(bus_rel2, func, bus_rel2->device_count)) { dev_err(&hbus->hdev->device, @@ -2876,6 +2881,11 @@ static void hv_pci_onchannelcallback(void *context) case PCI_EJECT: dev_message = (struct pci_dev_incoming *)buffer; + if (bytes_recvd < sizeof(*dev_message)) { + dev_err(&hbus->hdev->device, + "eject message too small\n"); + break; + } hpdev = get_pcichild_wslot(hbus, dev_message->wslot.slot); if (hpdev) { @@ -2887,6 +2897,11 @@ static void hv_pci_onchannelcallback(void *context) case PCI_INVALIDATE_BLOCK: inval = (struct pci_dev_inval_block *)buffer; + if (bytes_recvd < sizeof(*inval)) { + dev_err(&hbus->hdev->device, + "invalidate message too small\n"); + break; + } hpdev = get_pcichild_wslot(hbus, inval->wslot.slot); if (hpdev) { From b4927bd272623694314f37823302f9d67aa5964c Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 12 May 2022 00:32:07 +0200 Subject: [PATCH 25/28] PCI: hv: Fix synchronization between channel callback and hv_pci_bus_exit() [ Similarly to commit a765ed47e4516 ("PCI: hv: Fix synchronization between channel callback and hv_compose_msi_msg()"): ] The (on-stack) teardown packet becomes invalid once the completion timeout in hv_pci_bus_exit() has expired and hv_pci_bus_exit() has returned. Prevent the channel callback from accessing the invalid packet by removing the ID associated to such packet from the VMbus requestor in hv_pci_bus_exit(). Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220511223207.3386-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index a06e2cf94658..db814f7b93ba 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3664,6 +3664,7 @@ free_bus: static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); + struct vmbus_channel *chan = hdev->channel; struct { struct pci_packet teardown_packet; u8 buffer[sizeof(struct pci_message)]; @@ -3671,13 +3672,14 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) struct hv_pci_compl comp_pkt; struct hv_pci_dev *hpdev, *tmp; unsigned long flags; + u64 trans_id; int ret; /* * After the host sends the RESCIND_CHANNEL message, it doesn't * access the per-channel ringbuffer any longer. */ - if (hdev->channel->rescind) + if (chan->rescind) return 0; if (!keep_devs) { @@ -3714,16 +3716,26 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) pkt.teardown_packet.compl_ctxt = &comp_pkt; pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; - ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message, - sizeof(struct pci_message), - (unsigned long)&pkt.teardown_packet, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message, + sizeof(struct pci_message), + (unsigned long)&pkt.teardown_packet, + &trans_id, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret) return ret; - if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) + if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) { + /* + * The completion packet on the stack becomes invalid after + * 'return'; remove the ID from the VMbus requestor if the + * identifier is still mapped to/associated with the packet. + * + * Cf. hv_pci_onchannelcallback(). + */ + vmbus_request_addr_match(chan, trans_id, + (unsigned long)&pkt.teardown_packet); return -ETIMEDOUT; + } return 0; } From 1940f9f81d4523b261617cf10e926b4a2485168c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:10 +0200 Subject: [PATCH 26/28] Drivers: hv: vmbus: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20220521111145.81697-60-Julia.Lawall@inria.fr Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 97d8f5646778..b60f13481bdc 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -443,7 +443,7 @@ void hv_process_channel_removal(struct vmbus_channel *channel) /* * Upon suspend, an in-use hv_sock channel is removed from the array of * channels and the relid is invalidated. After hibernation, when the - * user-space appplication destroys the channel, it's unnecessary and + * user-space application destroys the channel, it's unnecessary and * unsafe to remove the channel from the array of channels. See also * the inline comments before the call of vmbus_release_relid() below. */ From 86c8fb4d228ed8dbe17b1abd664888bc7ee0052a Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Wed, 25 May 2022 04:27:02 -0700 Subject: [PATCH 27/28] scsi: storvsc: Removing Pre Win8 related logic The latest storvsc code has already removed the support for windows 7 and earlier. There is still some code logic remaining which is there to support pre Windows 8 OS. This patch removes these stale logic. This patch majorly does three things : 1. Removes vmscsi_size_delta and its logic, as the vmscsi_request struct is same for all the OS post windows 8 there is no need of delta. 2. Simplify sense_buffer_size logic, as there is single buffer size for all the post windows 8 OS. 3. Embed the vmscsi_win8_extension structure inside the vmscsi_request, as there is no separate handling required for different OS. Signed-off-by: Saurabh Sengar Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1653478022-26621-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Wei Liu --- drivers/scsi/storvsc_drv.c | 155 ++++++++++--------------------------- 1 file changed, 39 insertions(+), 116 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 5585e9d30bbf..08ed059a738b 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -54,7 +54,6 @@ #define VMSTOR_PROTO_VERSION(MAJOR_, MINOR_) ((((MAJOR_) & 0xff) << 8) | \ (((MINOR_) & 0xff))) - #define VMSTOR_PROTO_VERSION_WIN6 VMSTOR_PROTO_VERSION(2, 0) #define VMSTOR_PROTO_VERSION_WIN7 VMSTOR_PROTO_VERSION(4, 2) #define VMSTOR_PROTO_VERSION_WIN8 VMSTOR_PROTO_VERSION(5, 1) @@ -136,20 +135,10 @@ struct hv_fc_wwn_packet { */ #define STORVSC_MAX_CMD_LEN 0x10 -#define POST_WIN7_STORVSC_SENSE_BUFFER_SIZE 0x14 -#define PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE 0x12 - +/* Sense buffer size is the same for all versions since Windows 8 */ #define STORVSC_SENSE_BUFFER_SIZE 0x14 #define STORVSC_MAX_BUF_LEN_WITH_PADDING 0x14 -/* - * Sense buffer size changed in win8; have a run-time - * variable to track the size we should use. This value will - * likely change during protocol negotiation but it is valid - * to start by assuming pre-Win8. - */ -static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE; - /* * The storage protocol version is determined during the * initial exchange with the host. It will indicate which @@ -177,18 +166,6 @@ do { \ dev_warn(&(dev)->device, fmt, ##__VA_ARGS__); \ } while (0) -struct vmscsi_win8_extension { - /* - * The following were added in Windows 8 - */ - u16 reserve; - u8 queue_tag; - u8 queue_action; - u32 srb_flags; - u32 time_out_value; - u32 queue_sort_ey; -} __packed; - struct vmscsi_request { u16 length; u8 srb_status; @@ -214,46 +191,23 @@ struct vmscsi_request { /* * The following was added in win8. */ - struct vmscsi_win8_extension win8_extension; + u16 reserve; + u8 queue_tag; + u8 queue_action; + u32 srb_flags; + u32 time_out_value; + u32 queue_sort_ey; } __attribute((packed)); /* - * The list of storage protocols in order of preference. + * The list of windows version in order of preference. */ -struct vmstor_protocol { - int protocol_version; - int sense_buffer_size; - int vmscsi_size_delta; -}; - -static const struct vmstor_protocol vmstor_protocols[] = { - { +static const int protocol_version[] = { VMSTOR_PROTO_VERSION_WIN10, - POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, - 0 - }, - { VMSTOR_PROTO_VERSION_WIN8_1, - POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, - 0 - }, - { VMSTOR_PROTO_VERSION_WIN8, - POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, - 0 - }, - { - VMSTOR_PROTO_VERSION_WIN7, - PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, - sizeof(struct vmscsi_win8_extension), - }, - { - VMSTOR_PROTO_VERSION_WIN6, - PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, - sizeof(struct vmscsi_win8_extension), - } }; @@ -409,9 +363,7 @@ static void storvsc_on_channel_callback(void *context); #define STORVSC_IDE_MAX_CHANNELS 1 /* - * Upper bound on the size of a storvsc packet. vmscsi_size_delta is not - * included in the calculation because it is set after STORVSC_MAX_PKT_SIZE - * is used in storvsc_connect_to_vsp + * Upper bound on the size of a storvsc packet. */ #define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\ sizeof(struct vstor_packet)) @@ -452,17 +404,6 @@ struct storvsc_device { unsigned char path_id; unsigned char target_id; - /* - * The size of the vmscsi_request has changed in win8. The - * additional size is because of new elements added to the - * structure. These elements are valid only when we are talking - * to a win8 host. - * Track the correction to size we need to apply. This value - * will likely change during protocol negotiation but it is - * valid to start by assuming pre-Win8. - */ - int vmscsi_size_delta; - /* * Max I/O, the device can support. */ @@ -795,8 +736,7 @@ static void handle_multichannel_storage(struct hv_device *device, int max_chns) vstor_packet->sub_channel_count = num_sc; ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta), + sizeof(struct vstor_packet), VMBUS_RQST_INIT, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); @@ -864,8 +804,7 @@ static int storvsc_execute_vstor_op(struct hv_device *device, vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta), + sizeof(struct vstor_packet), VMBUS_RQST_INIT, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); @@ -915,14 +854,13 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) * Query host supported protocol version. */ - for (i = 0; i < ARRAY_SIZE(vmstor_protocols); i++) { + for (i = 0; i < ARRAY_SIZE(protocol_version); i++) { /* reuse the packet for version range supported */ memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_QUERY_PROTOCOL_VERSION; - vstor_packet->version.major_minor = - vmstor_protocols[i].protocol_version; + vstor_packet->version.major_minor = protocol_version[i]; /* * The revision number is only used in Windows; set it to 0. @@ -936,21 +874,16 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) return -EINVAL; if (vstor_packet->status == 0) { - vmstor_proto_version = - vmstor_protocols[i].protocol_version; - - sense_buffer_size = - vmstor_protocols[i].sense_buffer_size; - - stor_device->vmscsi_size_delta = - vmstor_protocols[i].vmscsi_size_delta; + vmstor_proto_version = protocol_version[i]; break; } } - if (vstor_packet->status != 0) + if (vstor_packet->status != 0) { + dev_err(&device->device, "Obsolete Hyper-V version\n"); return -EINVAL; + } memset(vstor_packet, 0, sizeof(struct vstor_packet)); @@ -986,11 +919,10 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) cpumask_set_cpu(device->channel->target_cpu, &stor_device->alloced_cpus); - if (vmstor_proto_version >= VMSTOR_PROTO_VERSION_WIN8) { - if (vstor_packet->storage_channel_properties.flags & - STORAGE_CHANNEL_SUPPORTS_MULTI_CHANNEL) - process_sub_channels = true; - } + if (vstor_packet->storage_channel_properties.flags & + STORAGE_CHANNEL_SUPPORTS_MULTI_CHANNEL) + process_sub_channels = true; + stor_device->max_transfer_bytes = vstor_packet->storage_channel_properties.max_transfer_bytes; @@ -1197,7 +1129,7 @@ static void storvsc_on_io_completion(struct storvsc_device *stor_device, * Copy over the sense_info_length, but limit to the known max * size if Hyper-V returns a bad value. */ - stor_pkt->vm_srb.sense_info_length = min_t(u8, sense_buffer_size, + stor_pkt->vm_srb.sense_info_length = min_t(u8, STORVSC_SENSE_BUFFER_SIZE, vstor_packet->vm_srb.sense_info_length); if (vstor_packet->vm_srb.scsi_status != 0 || @@ -1289,8 +1221,8 @@ static void storvsc_on_channel_callback(void *context) struct storvsc_cmd_request *request = NULL; u32 pktlen = hv_pkt_datalen(desc); u64 rqst_id = desc->trans_id; - u32 minlen = rqst_id ? sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta : sizeof(enum vstor_packet_operation); + u32 minlen = rqst_id ? sizeof(struct vstor_packet) : + sizeof(enum vstor_packet_operation); if (pktlen < minlen) { dev_err(&device->device, @@ -1346,7 +1278,7 @@ static void storvsc_on_channel_callback(void *context) } memcpy(&request->vstor_packet, packet, - (sizeof(struct vstor_packet) - stor_device->vmscsi_size_delta)); + sizeof(struct vstor_packet)); complete(&request->wait_event); } } @@ -1557,11 +1489,10 @@ static int storvsc_do_io(struct hv_device *device, found_channel: vstor_packet->flags |= REQUEST_COMPLETION_FLAG; - vstor_packet->vm_srb.length = (sizeof(struct vmscsi_request) - - stor_device->vmscsi_size_delta); + vstor_packet->vm_srb.length = sizeof(struct vmscsi_request); - vstor_packet->vm_srb.sense_info_length = sense_buffer_size; + vstor_packet->vm_srb.sense_info_length = STORVSC_SENSE_BUFFER_SIZE; vstor_packet->vm_srb.data_transfer_length = @@ -1574,13 +1505,11 @@ found_channel: ret = vmbus_sendpacket_mpb_desc(outgoing_channel, request->payload, request->payload_sz, vstor_packet, - (sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta), + sizeof(struct vstor_packet), (unsigned long)request); } else { ret = vmbus_sendpacket(outgoing_channel, vstor_packet, - (sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta), + sizeof(struct vstor_packet), (unsigned long)request, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); @@ -1684,8 +1613,7 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) vstor_packet->vm_srb.path_id = stor_device->path_id; ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta), + sizeof(struct vstor_packet), VMBUS_RQST_RESET, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); @@ -1778,31 +1706,31 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) memset(&cmd_request->vstor_packet, 0, sizeof(struct vstor_packet)); vm_srb = &cmd_request->vstor_packet.vm_srb; - vm_srb->win8_extension.time_out_value = 60; + vm_srb->time_out_value = 60; - vm_srb->win8_extension.srb_flags |= + vm_srb->srb_flags |= SRB_FLAGS_DISABLE_SYNCH_TRANSFER; if (scmnd->device->tagged_supported) { - vm_srb->win8_extension.srb_flags |= + vm_srb->srb_flags |= (SRB_FLAGS_QUEUE_ACTION_ENABLE | SRB_FLAGS_NO_QUEUE_FREEZE); - vm_srb->win8_extension.queue_tag = SP_UNTAGGED; - vm_srb->win8_extension.queue_action = SRB_SIMPLE_TAG_REQUEST; + vm_srb->queue_tag = SP_UNTAGGED; + vm_srb->queue_action = SRB_SIMPLE_TAG_REQUEST; } /* Build the SRB */ switch (scmnd->sc_data_direction) { case DMA_TO_DEVICE: vm_srb->data_in = WRITE_TYPE; - vm_srb->win8_extension.srb_flags |= SRB_FLAGS_DATA_OUT; + vm_srb->srb_flags |= SRB_FLAGS_DATA_OUT; break; case DMA_FROM_DEVICE: vm_srb->data_in = READ_TYPE; - vm_srb->win8_extension.srb_flags |= SRB_FLAGS_DATA_IN; + vm_srb->srb_flags |= SRB_FLAGS_DATA_IN; break; case DMA_NONE: vm_srb->data_in = UNKNOWN_TYPE; - vm_srb->win8_extension.srb_flags |= SRB_FLAGS_NO_DATA_TRANSFER; + vm_srb->srb_flags |= SRB_FLAGS_NO_DATA_TRANSFER; break; default: /* @@ -2004,7 +1932,6 @@ static int storvsc_probe(struct hv_device *device, init_waitqueue_head(&stor_device->waiting_to_drain); stor_device->device = device; stor_device->host = host; - stor_device->vmscsi_size_delta = sizeof(struct vmscsi_win8_extension); spin_lock_init(&stor_device->lock); hv_set_drvdata(device, stor_device); dma_set_min_align_mask(&device->device, HV_HYP_PAGE_SIZE - 1); @@ -2217,10 +2144,6 @@ static int __init storvsc_drv_init(void) * than the ring buffer size since that page is reserved for * the ring buffer indices) by the max request size (which is * vmbus_channel_packet_multipage_buffer + struct vstor_packet + u64) - * - * The computation underestimates max_outstanding_req_per_channel - * for Win7 and older hosts because it does not take into account - * the vmscsi_size_delta correction to the max request size. */ max_outstanding_req_per_channel = ((storvsc_ringbuffer_size - PAGE_SIZE) / From d27423bf048dcb5e15f04286d001c66685e30c29 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 15 May 2022 21:50:58 -0700 Subject: [PATCH 28/28] hv_balloon: Fix balloon_probe() and balloon_remove() error handling Add missing cleanup in balloon_probe() if the call to balloon_connect_vsp() fails. Also correctly handle cleanup in balloon_remove() when dm_state is DM_INIT_ERROR because balloon_resume() failed. Signed-off-by: Shradha Gupta Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220516045058.GA7933@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 3248b48f37f6..91e8a72eee14 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1842,7 +1842,7 @@ static int balloon_probe(struct hv_device *dev, ret = balloon_connect_vsp(dev); if (ret != 0) - return ret; + goto connect_error; enable_page_reporting(); dm_device.state = DM_INITIALIZED; @@ -1861,6 +1861,7 @@ probe_error: dm_device.thread = NULL; disable_page_reporting(); vmbus_close(dev->channel); +connect_error: #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&hv_memory_nb); restore_online_page_callback(&hv_online_page); @@ -1882,12 +1883,21 @@ static int balloon_remove(struct hv_device *dev) cancel_work_sync(&dm->ha_wrk.wrk); kthread_stop(dm->thread); - disable_page_reporting(); - vmbus_close(dev->channel); + + /* + * This is to handle the case when balloon_resume() + * call has failed and some cleanup has been done as + * a part of the error handling. + */ + if (dm_device.state != DM_INIT_ERROR) { + disable_page_reporting(); + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG - unregister_memory_notifier(&hv_memory_nb); - restore_online_page_callback(&hv_online_page); + unregister_memory_notifier(&hv_memory_nb); + restore_online_page_callback(&hv_online_page); #endif + } + spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { @@ -1948,6 +1958,7 @@ close_channel: vmbus_close(dev->channel); out: dm_device.state = DM_INIT_ERROR; + disable_page_reporting(); #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&hv_memory_nb); restore_online_page_callback(&hv_online_page);