drm/amdkfd: Update interrupt handling for GFX9.4.3

Update interrupt handling in CPX mode for GFX9.4.3 by using the
VMID space instead of SDMA client id to determine if an interrupt
should be processed by a KFD node. This is especially needed for
handling retry faults from MMHUB.
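
As background, a minimal standalone sketch of the routing predicate this
commit introduces (mirroring the kfd_priv.h hunk below; plain C with
illustrative names, not the kernel code itself):

#include <stdbool.h>
#include <stdint.h>

/* Sketch: an interrupt belongs to a KFD node when the node accepts the
 * IH node_id and the faulting VMID falls in the node's compute-VMID
 * partition. */
static bool irq_is_from_node(uint32_t interrupt_bitmap,
                             uint32_t compute_vmid_bitmap,
                             uint32_t node_id, uint32_t vmid)
{
        return (interrupt_bitmap & (1u << node_id)) &&
               (compute_vmid_bitmap & (1u << vmid));
}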

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
commit f5fe7edfd6
parent cb30544e3c
Author: Mukul Joshi <mukul.joshi@amd.com>
Date:   2022-09-30 09:16:21 -04:00
Committed-by: Alex Deucher <alexander.deucher@amd.com>

 6 files changed, 19 insertions(+), 20 deletions(-)

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2434,6 +2434,9 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * amdgpu_vm_handle_fault - graceful handling of VM faults.
  * @adev: amdgpu device pointer
  * @pasid: PASID of the VM
+ * @vmid: VMID, only used for GFX 9.4.3.
+ * @node_id: Node_id received in IH cookie. Only applicable for
+ * GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
  *
@@ -2441,7 +2444,7 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-                            u32 client_id, u32 node_id, uint64_t addr,
+                            u32 vmid, u32 node_id, uint64_t addr,
                             bool write_fault)
 {
         bool is_compute_context = false;
@@ -2466,7 +2469,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
         addr /= AMDGPU_GPU_PAGE_SIZE;
 
-        if (is_compute_context && !svm_range_restore_pages(adev, pasid, client_id,
+        if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
             node_id, addr, write_fault)) {
                 amdgpu_bo_unref(&root);
                 return true;
         }

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -455,7 +455,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
                              struct amdgpu_task_info *task_info);
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-                            u32 client_id, u32 node_id, uint64_t addr,
+                            u32 vmid, u32 node_id, uint64_t addr,
                             bool write_fault);
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -587,7 +587,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
                 cam_index = entry->src_data[2] & 0x3ff;
 
-                ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
+                ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
                                              addr, write_fault);
                 WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
                 if (ret)
@@ -610,7 +610,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
                 /* Try to handle the recoverable page faults by filling page
                  * tables
                  */
-                if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
+                if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
                                            addr, write_fault))
                         return 1;
         }

--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1073,18 +1073,14 @@ struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_node *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev);
 struct kfd_node *kfd_device_by_adev(const struct amdgpu_device *adev);
-static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t client_id,
-                                        uint32_t node_id)
+static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id,
+                                        uint32_t vmid)
 {
-        if ((node->interrupt_bitmap & (0x1U << node_id)) ||
-            ((node_id % 4) == 0 &&
-            (node->interrupt_bitmap >> 16) & (0x1U << client_id)))
-                return true;
-
-        return false;
+        return (node->interrupt_bitmap & (1 << node_id)) != 0 &&
+               (node->compute_vmid_bitmap & (1 << vmid)) != 0;
 }
 
 static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
-                                        uint32_t client_id, uint32_t node_id) {
+                                        uint32_t node_id, uint32_t vmid) {
         struct kfd_dev *dev = adev->kfd.dev;
         uint32_t i;
@@ -1092,7 +1088,7 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
                 return dev->nodes[0];
 
         for (i = 0; i < dev->num_nodes; i++)
-                if (kfd_irq_is_from_node(dev->nodes[i], client_id, node_id))
+                if (kfd_irq_is_from_node(dev->nodes[i], node_id, vmid))
                         return dev->nodes[i];
 
         return NULL;

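A self-contained toy example of how the lookup above resolves a fault to a
node. The bitmap values and the two-partition CPX-style layout are assumptions
made up for illustration, not values taken from the driver:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the two struct kfd_node fields used above. */
struct node_bits {
        uint32_t interrupt_bitmap;    /* node_ids this node accepts */
        uint32_t compute_vmid_bitmap; /* compute VMIDs this node owns */
};

static bool irq_is_from_node(const struct node_bits *n,
                             uint32_t node_id, uint32_t vmid)
{
        return (n->interrupt_bitmap & (1u << node_id)) != 0 &&
               (n->compute_vmid_bitmap & (1u << vmid)) != 0;
}

int main(void)
{
        /* Hypothetical two-node split of compute VMIDs 8-15. */
        const struct node_bits nodes[] = {
                { .interrupt_bitmap = 1u << 0, .compute_vmid_bitmap = 0x0f00 }, /* VMIDs 8-11  */
                { .interrupt_bitmap = 1u << 1, .compute_vmid_bitmap = 0xf000 }, /* VMIDs 12-15 */
        };
        const uint32_t node_id = 1, vmid = 13; /* as if decoded from an IH entry */

        for (unsigned int i = 0; i < 2; i++)
                if (irq_is_from_node(&nodes[i], node_id, vmid))
                        printf("fault routed to node %u\n", i); /* prints node 1 */
        return 0;
}

Because each node owns a disjoint VMID range in this layout, at most one node
can match, which is why kfd_node_by_irq_ids can simply return the first hit.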

--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2799,7 +2799,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
 
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-                        uint32_t client_id, uint32_t node_id,
+                        uint32_t vmid, uint32_t node_id,
                         uint64_t addr, bool write_fault)
 {
         struct mm_struct *mm = NULL;
@@ -2851,10 +2851,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                 goto out;
         }
 
-        node = kfd_node_by_irq_ids(adev, node_id, client_id);
+        node = kfd_node_by_irq_ids(adev, node_id, vmid);
         if (!node) {
-                pr_debug("kfd node does not exist node_id: %d, client_id: %d\n", node_id,
-                         client_id);
+                pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
+                         vmid);
                 r = -EFAULT;
                 goto out;
         }

--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -173,7 +173,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
                         unsigned long addr, struct svm_range *parent,
                         struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-                            uint32_t client_id, uint32_t node_id, uint64_t addr,
+                            uint32_t vmid, uint32_t node_id, uint64_t addr,
                             bool write_fault);
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,