summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Kconfig2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c123
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c11
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h5
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c8
9 files changed, 115 insertions, 40 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index 62e88e5362e9..16e12c9913f9 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -5,7 +5,7 @@
config HSA_AMD
bool "HSA kernel driver for AMD GPU devices"
- depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64 || (RISCV && 64BIT))
+ depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64 || (RISCV && 64BIT) || (LOONGARCH && 64BIT))
select HMM_MIRROR
select MMU_NOTIFIER
select DRM_AMDGPU_USERPTR
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a2149afa5803..828a9ceef1e7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -22,7 +22,6 @@
*/
#include <linux/device.h>
-#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/file.h>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index bf0854bd5555..7e749f9b6d69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -971,7 +971,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd,
kfd_smi_event_update_gpu_reset(node, false, reset_context);
}
- kgd2kfd_suspend(kfd, false);
+ kgd2kfd_suspend(kfd, true);
for (i = 0; i < kfd->num_nodes; i++)
kfd_signal_reset_event(kfd->nodes[i]);
@@ -1013,13 +1013,33 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
return 0;
}
-bool kfd_is_locked(void)
+bool kfd_is_locked(struct kfd_dev *kfd)
{
+ uint8_t id = 0;
+ struct kfd_node *dev;
+
lockdep_assert_held(&kfd_processes_mutex);
- return (kfd_locked > 0);
+
+ /* check reset/suspend lock */
+ if (kfd_locked > 0)
+ return true;
+
+ if (kfd)
+ return kfd->kfd_dev_lock > 0;
+
+ /* check lock on all cgroup accessible devices */
+ while (kfd_topology_enum_kfd_devices(id++, &dev) == 0) {
+ if (!dev || kfd_devcgroup_check_permission(dev))
+ continue;
+
+ if (dev->kfd->kfd_dev_lock > 0)
+ return true;
+ }
+
+ return false;
}
-void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
+void kgd2kfd_suspend(struct kfd_dev *kfd, bool suspend_proc)
{
struct kfd_node *node;
int i;
@@ -1027,14 +1047,8 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
if (!kfd->init_complete)
return;
- /* for runtime suspend, skip locking kfd */
- if (!run_pm) {
- mutex_lock(&kfd_processes_mutex);
- /* For first KFD device suspend all the KFD processes */
- if (++kfd_locked == 1)
- kfd_suspend_all_processes();
- mutex_unlock(&kfd_processes_mutex);
- }
+ if (suspend_proc)
+ kgd2kfd_suspend_process(kfd);
for (i = 0; i < kfd->num_nodes; i++) {
node = kfd->nodes[i];
@@ -1042,7 +1056,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
}
}
-int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
+int kgd2kfd_resume(struct kfd_dev *kfd, bool resume_proc)
{
int ret, i;
@@ -1055,14 +1069,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
return ret;
}
- /* for runtime resume, skip unlocking kfd */
- if (!run_pm) {
- mutex_lock(&kfd_processes_mutex);
- if (--kfd_locked == 0)
- ret = kfd_resume_all_processes();
- WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
- mutex_unlock(&kfd_processes_mutex);
- }
+ if (resume_proc)
+ ret = kgd2kfd_resume_process(kfd);
+
+ return ret;
+}
+
+void kgd2kfd_suspend_process(struct kfd_dev *kfd)
+{
+ if (!kfd->init_complete)
+ return;
+
+ mutex_lock(&kfd_processes_mutex);
+ /* For first KFD device suspend all the KFD processes */
+ if (++kfd_locked == 1)
+ kfd_suspend_all_processes();
+ mutex_unlock(&kfd_processes_mutex);
+}
+
+int kgd2kfd_resume_process(struct kfd_dev *kfd)
+{
+ int ret = 0;
+
+ if (!kfd->init_complete)
+ return 0;
+
+ mutex_lock(&kfd_processes_mutex);
+ if (--kfd_locked == 0)
+ ret = kfd_resume_all_processes();
+ WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
+ mutex_unlock(&kfd_processes_mutex);
return ret;
}
@@ -1442,24 +1478,53 @@ unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
kfd_get_num_sdma_engines(node);
}
-int kgd2kfd_check_and_lock_kfd(void)
+int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
{
+ struct kfd_process *p;
+ int r = 0, temp, idx;
+
mutex_lock(&kfd_processes_mutex);
- if (!hash_empty(kfd_processes_table) || kfd_is_locked()) {
- mutex_unlock(&kfd_processes_mutex);
- return -EBUSY;
+
+ if (hash_empty(kfd_processes_table) && !kfd_is_locked(kfd))
+ goto out;
+
+ /* fail under system reset/resume or kfd device is partition switching. */
+ if (kfd_is_locked(kfd)) {
+ r = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * ensure all running processes are cgroup excluded from device before mode switch.
+ * i.e. no pdd was created on the process socket.
+ */
+ idx = srcu_read_lock(&kfd_processes_srcu);
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ int i;
+
+ for (i = 0; i < p->n_pdds; i++) {
+ if (p->pdds[i]->dev->kfd != kfd)
+ continue;
+
+ r = -EBUSY;
+ goto proc_check_unlock;
+ }
}
- ++kfd_locked;
+proc_check_unlock:
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+out:
+ if (!r)
+ ++kfd->kfd_dev_lock;
mutex_unlock(&kfd_processes_mutex);
- return 0;
+ return r;
}
-void kgd2kfd_unlock_kfd(void)
+void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
{
mutex_lock(&kfd_processes_mutex);
- --kfd_locked;
+ --kfd->kfd_dev_lock;
mutex_unlock(&kfd_processes_mutex);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 76359c6a3f3a..2d91027e2a74 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2312,7 +2312,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
continue;
/* Reset engine and check. */
- if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) ||
+ if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
!set_sdma_queue_as_reset(dqm, doorbell_off)) {
r = -ENOTRECOVERABLE;
@@ -2339,9 +2339,18 @@ reset_fail:
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma)
{
+ struct amdgpu_device *adev = dqm->dev->adev;
+
while (halt_if_hws_hang)
schedule();
+ if (adev->debug_disable_gpu_ring_reset) {
+ dev_info_once(adev->dev,
+ "%s queue hung, but ring reset disabled",
+ is_sdma ? "sdma" : "compute");
+
+ return -EPERM;
+ }
if (!amdgpu_gpu_recovery)
return -ENOTRECOVERABLE;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 2b294ada3ec0..82905f3e54dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1302,7 +1302,7 @@ void kfd_signal_reset_event(struct kfd_node *dev)
if (ti) {
dev_err(dev->adev->dev,
"Queues reset on process %s tid %d thread %s pid %d\n",
- ti->process_name, ti->tgid, ti->task_name, ti->pid);
+ ti->process_name, ti->tgid, ti->task.comm, ti->task.pid);
amdgpu_vm_put_task_info(ti);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index dbcb60eb54b2..1d170dc50df3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -23,7 +23,6 @@
*/
#include <linux/device.h>
-#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/sched.h>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d221c58dccc3..67694bcd9464 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -372,6 +372,9 @@ struct kfd_dev {
/* bitmap for dynamic doorbell allocation from doorbell object */
unsigned long *doorbell_bitmap;
+
+ /* for dynamic partitioning */
+ int kfd_dev_lock;
};
enum kfd_mempool {
@@ -1536,7 +1539,7 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
int kfd_send_exception_to_runtime(struct kfd_process *p,
unsigned int queue_id,
uint64_t error_reason);
-bool kfd_is_locked(void);
+bool kfd_is_locked(struct kfd_dev *kfd);
/* Compute profile */
void kfd_inc_compute_active(struct kfd_node *dev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 722ac1662bdc..5be28c6c4f6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -854,7 +854,7 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
*/
mutex_lock(&kfd_processes_mutex);
- if (kfd_is_locked()) {
+ if (kfd_is_locked(NULL)) {
pr_debug("KFD is locked! Cannot create process");
process = ERR_PTR(-EINVAL);
goto out;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 83d9384ac815..a499449fcb06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -253,9 +253,9 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
if (task_info) {
/* Report VM faults from user applications, not retry from kernel */
- if (task_info->pid)
+ if (task_info->task.pid)
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
- task_info->pid, task_info->task_name));
+ task_info->task.pid, task_info->task.comm));
amdgpu_vm_put_task_info(task_info);
}
}
@@ -359,8 +359,8 @@ void kfd_smi_event_process(struct kfd_process_device *pdd, bool start)
kfd_smi_event_add(0, pdd->dev,
start ? KFD_SMI_EVENT_PROCESS_START :
KFD_SMI_EVENT_PROCESS_END,
- KFD_EVENT_FMT_PROCESS(task_info->pid,
- task_info->task_name));
+ KFD_EVENT_FMT_PROCESS(task_info->task.pid,
+ task_info->task.comm));
amdgpu_vm_put_task_info(task_info);
}
}