drm/amdgpu: change reset lock from mutex to rw_semaphore
Clients don't need the reset lock for synchronization when no GPU recovery is in progress.

v2: return the return value of down_read_killable.
v3: if GPU recovery has begun, the VF ignores the FLR notification.

Reviewed-by: Monk Liu <monk.liu@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent 66b8a9c0a7
commit 6049db43d6

5 changed files with 32 additions and 35 deletions
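In short: the old lock_reset mutex serialized every client against every other client even when no recovery was running; an rw_semaphore lets those clients share the lock as readers and reserves exclusive (write) ownership for GPU recovery. Below is a minimal standalone sketch of the resulting model; the wrapper names are hypothetical, only reset_sem and the rwsem calls themselves come from the patch.

#include <linux/rwsem.h>

static DECLARE_RWSEM(reset_sem);	/* adev->reset_sem in the patch */

/* Client paths (debugfs etc.): readers never block each other,
 * only an in-flight reset (the writer). */
static int client_enter(void)		/* hypothetical wrapper */
{
	return down_read_killable(&reset_sem);
}

static void client_exit(void)		/* hypothetical wrapper */
{
	up_read(&reset_sem);
}

/* GPU recovery: the writer waits for all readers to drain, then
 * excludes them until the reset finishes. */
static void gpu_reset_begin(void)	/* hypothetical wrapper */
{
	down_write(&reset_sem);
}

static void gpu_reset_end(void)		/* hypothetical wrapper */
{
	up_write(&reset_sem);
}

Note that the read side uses the killable variant, so a task blocked behind a long reset can still be killed; handling its return value is exactly what v2 of the patch changed.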
drivers/gpu/drm/amd/amdgpu/amdgpu.h

@@ -951,7 +951,7 @@ struct amdgpu_device {
 	atomic_t			in_gpu_reset;
 	enum pp_mp1_state		mp1_state;
-	struct mutex			lock_reset;
+	struct rw_semaphore		reset_sem;
 	struct amdgpu_doorbell_index	doorbell_index;

 	struct mutex			notifier_lock;
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
 	file->private_data = adev;

-	mutex_lock(&adev->lock_reset);
+	ret = down_read_killable(&adev->reset_sem);
+	if (ret)
+		return ret;
+
 	if (adev->autodump.dumping.done) {
 		reinit_completion(&adev->autodump.dumping);
 		ret = 0;
 	} else {
 		ret = -EBUSY;
 	}
-	mutex_unlock(&adev->lock_reset);
+
+	up_read(&adev->reset_sem);

 	return ret;
 }
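Note the pattern this hunk introduces: down_read_killable() can fail (with -EINTR) when a fatal signal arrives while a reset holds the write side, so the caller must check the result, propagate it, and must not release a lock it never acquired. A standalone sketch of the idiom; do_protected_work() is a hypothetical stand-in for the real debugfs body:

#include <linux/rwsem.h>

static int do_protected_work(void)
{
	return 0;	/* placeholder for the real work under the lock */
}

static int read_locked_op(struct rw_semaphore *sem)
{
	/* May return -EINTR if a fatal signal interrupts the wait. */
	int ret = down_read_killable(sem);

	if (ret)
		return ret;	/* the semaphore is NOT held on failure */

	ret = do_protected_work();
	up_read(sem);
	return ret;
}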
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c (continued)

@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 	}

 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	r = down_read_killable(&adev->reset_sem);
+	if (r)
+		return r;

 	/* hold on the scheduler */
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {

@@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 		kthread_unpark(ring->sched.thread);
 	}

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	pm_runtime_mark_last_busy(dev->dev);
 	pm_runtime_put_autosuspend(dev->dev);

@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 		return -ENOMEM;

 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	r = down_read_killable(&adev->reset_sem);
+	if (r)
+		goto pro_end;

 	/* stop the scheduler */
 	kthread_park(ring->sched.thread);

@@ -1500,13 +1508,14 @@ failure:
 	/* restart the scheduler */
 	kthread_unpark(ring->sched.thread);

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

+pro_end:
 	kfree(fences);

-	return 0;
+	return r;
 }

 static int amdgpu_debugfs_sclk_set(void *data, u64 val)
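Besides the lock conversion, the last hunk fixes the return value: amdgpu_debugfs_ib_preempt() used to return 0 unconditionally, swallowing errors, and the new pro_end label lets a failed lock acquisition still free the fences array allocated before the lock. A reduced sketch of that single-exit idiom, with illustrative names:

#include <linux/rwsem.h>
#include <linux/slab.h>

static int preempt_like_op(struct rw_semaphore *sem)
{
	void **fences;
	int r;

	fences = kcalloc(16, sizeof(void *), GFP_KERNEL);
	if (!fences)
		return -ENOMEM;

	r = down_read_killable(sem);
	if (r)
		goto out;	/* "pro_end" in the patch: still frees fences */

	/* ... park the scheduler, preempt, collect fences ... */

	up_read(sem);
out:
	kfree(fences);
	return r;		/* previously "return 0", which hid errors */
}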
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_init(&adev->lock_reset);
+	init_rwsem(&adev->reset_sem);
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);

@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
 		return false;

-	mutex_lock(&adev->lock_reset);
+	down_write(&adev->reset_sem);

 	atomic_inc(&adev->gpu_reset_counter);
 	switch (amdgpu_asic_reset_method(adev)) {

@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_unlock(&adev->lock_reset);
+	up_write(&adev->reset_sem);
 }

 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
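The reset (writer) side pairs the new semaphore with the existing in_gpu_reset flag: atomic_cmpxchg() makes reset entry single-shot, returning false immediately instead of queueing behind a reset that is already running, while down_write() drains every in-flight reader before the hardware is touched. A reduced sketch of that pairing, outside the amdgpu structures:

#include <linux/atomic.h>
#include <linux/rwsem.h>

static atomic_t in_reset = ATOMIC_INIT(0);
static DECLARE_RWSEM(reset_sem);

static bool reset_lock(void)	/* amdgpu_device_lock_adev() shape */
{
	if (atomic_cmpxchg(&in_reset, 0, 1) != 0)
		return false;	/* another reset is already running */

	down_write(&reset_sem);	/* wait for all readers to drain */
	return true;
}

static void reset_unlock(void)	/* amdgpu_device_unlock_adev() shape */
{
	atomic_set(&in_reset, 0);
	up_write(&reset_sem);
}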
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c

@@ -238,19 +238,15 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;

 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);

 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)

@@ -261,10 +257,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);

 flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);

 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
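Both mailbox FLR handlers (AI here, NV below) implement the v3 behaviour from the commit message: if GPU recovery already owns reset_sem, the handler drops the FLR notification instead of sleeping in the worker thread. A reduced sketch of that pattern, with an illustrative function name:

#include <linux/atomic.h>
#include <linux/rwsem.h>

static void flr_work_like(struct rw_semaphore *sem, atomic_t *in_gpu_reset)
{
	/* Non-blocking: if recovery (the writer) holds the lock,
	 * ignore this FLR notification and let gpu_recover() run. */
	if (!down_read_trylock(sem))
		return;

	atomic_set(in_gpu_reset, 1);

	/* ... poll the mailbox until IDH_FLR_NOTIFICATION_CMPL ... */

	atomic_set(in_gpu_reset, 0);
	up_read(sem);
}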
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c

@@ -259,19 +259,15 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;

 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);

 	do {
 		if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)

@@ -282,10 +278,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);

 flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);

 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)