summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/aldebaran.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/aldebaran.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/aldebaran.c139
1 files changed, 93 insertions, 46 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 2b97b8a96fb4..daa7b23bc775 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -35,7 +35,7 @@ static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
{
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
- if ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
+ if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
adev->gmc.xgmi.connected_to_cpu))
return true;
@@ -48,59 +48,64 @@ aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
{
struct amdgpu_reset_handler *handler;
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+ int i;
+
+ if (reset_context->method == AMD_RESET_METHOD_NONE) {
+ if (aldebaran_is_mode2_default(reset_ctl))
+ reset_context->method = AMD_RESET_METHOD_MODE2;
+ else
+ reset_context->method = amdgpu_asic_reset_method(adev);
+ }
if (reset_context->method != AMD_RESET_METHOD_NONE) {
dev_dbg(adev->dev, "Getting reset handler for method %d\n",
reset_context->method);
- list_for_each_entry(handler, &reset_ctl->reset_handlers,
- handler_list) {
+ for_each_handler(i, handler, reset_ctl) {
if (handler->reset_method == reset_context->method)
return handler;
}
}
- if (aldebaran_is_mode2_default(reset_ctl)) {
- list_for_each_entry(handler, &reset_ctl->reset_handlers,
- handler_list) {
- if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
- reset_context->method = AMD_RESET_METHOD_MODE2;
- return handler;
- }
- }
- }
-
dev_dbg(adev->dev, "Reset handler not found!\n");
return NULL;
}
+static inline uint32_t aldebaran_get_ip_block_mask(struct amdgpu_device *adev)
+{
+ uint32_t ip_block_mask = BIT(AMD_IP_BLOCK_TYPE_GFX) |
+ BIT(AMD_IP_BLOCK_TYPE_SDMA);
+
+ if (adev->aid_mask)
+ ip_block_mask |= BIT(AMD_IP_BLOCK_TYPE_IH);
+
+ return ip_block_mask;
+}
+
static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
{
+ uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);
+ uint32_t ip_block;
int r, i;
+ /* Skip suspend of SDMA IP versions >= 4.4.2. They are multi-aid */
+ if (adev->aid_mask)
+ ip_block_mask &= ~BIT(AMD_IP_BLOCK_TYPE_SDMA);
+
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ ip_block = BIT(adev->ip_blocks[i].version->type);
+ if (!(ip_block_mask & ip_block))
continue;
- r = adev->ip_blocks[i].version->funcs->suspend(adev);
-
- if (r) {
- dev_err(adev->dev,
- "suspend of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
+ if (r)
return r;
- }
-
- adev->ip_blocks[i].status.hw = false;
}
- return r;
+ return 0;
}
static int
@@ -124,9 +129,9 @@ static void aldebaran_async_reset(struct work_struct *work)
struct amdgpu_reset_control *reset_ctl =
container_of(work, struct amdgpu_reset_control, reset_work);
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+ int i;
- list_for_each_entry(handler, &reset_ctl->reset_handlers,
- handler_list) {
+ for_each_handler(i, handler, reset_ctl) {
if (handler->reset_method == reset_ctl->active_reset) {
dev_dbg(adev->dev, "Resetting device\n");
handler->do_reset(adev);
@@ -157,7 +162,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
if (reset_device_list == NULL)
return -EINVAL;
- if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
+ if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
reset_context->hive == NULL) {
/* Wrong context, return error */
return -EINVAL;
@@ -210,8 +215,10 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
{
struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
+ uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);
struct amdgpu_firmware_info *ucode;
struct amdgpu_ip_block *cmn_block;
+ struct amdgpu_ip_block *ih_block;
int ucode_count = 0;
int i, r;
@@ -249,10 +256,22 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
dev_err(adev->dev, "Failed to get BIF handle\n");
return -EINVAL;
}
- r = cmn_block->version->funcs->resume(adev);
+ r = amdgpu_ip_block_resume(cmn_block);
if (r)
return r;
+ if (ip_block_mask & BIT(AMD_IP_BLOCK_TYPE_IH)) {
+ ih_block = amdgpu_device_ip_get_ip_block(adev,
+ AMD_IP_BLOCK_TYPE_IH);
+ if (unlikely(!ih_block)) {
+ dev_err(adev->dev, "Failed to get IH handle\n");
+ return -EINVAL;
+ }
+ r = amdgpu_ip_block_resume(ih_block);
+ if (r)
+ return r;
+ }
+
/* Reinit GFXHUB */
adev->gfxhub.funcs->init(adev);
r = adev->gfxhub.funcs->gart_enable(adev);
@@ -285,15 +304,10 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
adev->ip_blocks[i].version->type ==
AMD_IP_BLOCK_TYPE_SDMA))
continue;
- r = adev->ip_blocks[i].version->funcs->resume(adev);
- if (r) {
- dev_err(adev->dev,
- "resume of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
- return r;
- }
- adev->ip_blocks[i].status.hw = true;
+ r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ if (r)
+ return r;
}
for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -307,7 +321,7 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->funcs->late_init) {
r = adev->ip_blocks[i].version->funcs->late_init(
- (void *)adev);
+ &adev->ip_blocks[i]);
if (r) {
dev_err(adev->dev,
"late_init of IP block <%s> failed %d after reset\n",
@@ -319,8 +333,6 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
adev->ip_blocks[i].status.late_initialized = true;
}
- amdgpu_ras_set_error_query_ready(adev, true);
-
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
@@ -333,12 +345,13 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
{
struct list_head *reset_device_list = reset_context->reset_device_list;
struct amdgpu_device *tmp_adev = NULL;
+ struct amdgpu_ras *con;
int r;
if (reset_device_list == NULL)
return -EINVAL;
- if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
+ if (amdgpu_ip_version(reset_context->reset_req_dev, MP1_HWIP, 0) ==
IP_VERSION(13, 0, 2) &&
reset_context->hive == NULL) {
/* Wrong context, return error */
@@ -346,8 +359,12 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
}
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
+ amdgpu_set_init_level(tmp_adev,
+ AMDGPU_INIT_LEVEL_RESET_RECOVERY);
dev_info(tmp_adev->dev,
"GPU reset succeeded, trying to resume\n");
+ /*TBD: Ideally should clear only GFX, SDMA blocks*/
+ amdgpu_ras_clear_err_state(tmp_adev);
r = aldebaran_mode2_restore_ip(tmp_adev);
if (r)
goto end;
@@ -358,7 +375,30 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
*/
amdgpu_register_gpu_instance(tmp_adev);
- /* Resume RAS */
+ /* Resume RAS, ecc_irq */
+ con = amdgpu_ras_get_context(tmp_adev);
+ if (!amdgpu_sriov_vf(tmp_adev) && con) {
+ if (tmp_adev->sdma.ras &&
+ tmp_adev->sdma.ras->ras_block.ras_late_init) {
+ r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
+ &tmp_adev->sdma.ras->ras_block.ras_comm);
+ if (r) {
+ dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
+ goto end;
+ }
+ }
+
+ if (tmp_adev->gfx.ras &&
+ tmp_adev->gfx.ras->ras_block.ras_late_init) {
+ r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
+ &tmp_adev->gfx.ras->ras_block.ras_comm);
+ if (r) {
+ dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
+ goto end;
+ }
+ }
+ }
+
amdgpu_ras_resume(tmp_adev);
/* Update PSP FW topology after reset */
@@ -368,6 +408,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
tmp_adev);
if (!r) {
+ amdgpu_set_init_level(tmp_adev,
+ AMDGPU_INIT_LEVEL_DEFAULT);
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
r = amdgpu_ib_ring_tests(tmp_adev);
@@ -395,6 +437,12 @@ static struct amdgpu_reset_handler aldebaran_mode2_handler = {
.do_reset = aldebaran_mode2_reset,
};
+static struct amdgpu_reset_handler
+ *aldebaran_rst_handlers[AMDGPU_RESET_MAX_HANDLERS] = {
+ &aldebaran_mode2_handler,
+ &xgmi_reset_on_init_handler,
+ };
+
int aldebaran_reset_init(struct amdgpu_device *adev)
{
struct amdgpu_reset_control *reset_ctl;
@@ -408,10 +456,9 @@ int aldebaran_reset_init(struct amdgpu_device *adev)
reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
reset_ctl->get_reset_handler = aldebaran_get_reset_handler;
- INIT_LIST_HEAD(&reset_ctl->reset_handlers);
INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
/* Only mode2 is handled through reset control now */
- amdgpu_reset_add_handler(reset_ctl, &aldebaran_mode2_handler);
+ reset_ctl->reset_handlers = &aldebaran_rst_handlers;
adev->reset_cntl = reset_ctl;