1 files changed, 440 insertions, 223 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index b692ad402252..a7d8f1ce6ac2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -27,35 +27,46 @@
 
 #include <linux/firmware.h>
 #include <linux/module.h>
-#include <drm/drmP.h>
+
 #include <drm/drm.h>
+#include <drm/drm_drv.h>
 
 #include "amdgpu.h"
 #include "amdgpu_pm.h"
 #include "amdgpu_vce.h"
+#include "amdgpu_cs.h"
 #include "cikd.h"
 
 /* 1 second timeout */
 #define VCE_IDLE_TIMEOUT	msecs_to_jiffies(1000)
 
 /* Firmware Names */
+#ifdef CONFIG_DRM_AMDGPU_SI
+#define FIRMWARE_VCE_V1_0	"amdgpu/vce_1_0_0.bin"
+#endif
 #ifdef CONFIG_DRM_AMDGPU_CIK
-#define FIRMWARE_BONAIRE	"radeon/bonaire_vce.bin"
-#define FIRMWARE_KABINI	"radeon/kabini_vce.bin"
-#define FIRMWARE_KAVERI	"radeon/kaveri_vce.bin"
-#define FIRMWARE_HAWAII	"radeon/hawaii_vce.bin"
-#define FIRMWARE_MULLINS	"radeon/mullins_vce.bin"
+#define FIRMWARE_BONAIRE	"amdgpu/bonaire_vce.bin"
+#define FIRMWARE_KABINI	"amdgpu/kabini_vce.bin"
+#define FIRMWARE_KAVERI	"amdgpu/kaveri_vce.bin"
+#define FIRMWARE_HAWAII	"amdgpu/hawaii_vce.bin"
+#define FIRMWARE_MULLINS	"amdgpu/mullins_vce.bin"
 #endif
 #define FIRMWARE_TONGA		"amdgpu/tonga_vce.bin"
 #define FIRMWARE_CARRIZO	"amdgpu/carrizo_vce.bin"
 #define FIRMWARE_FIJI		"amdgpu/fiji_vce.bin"
 #define FIRMWARE_STONEY		"amdgpu/stoney_vce.bin"
 #define FIRMWARE_POLARIS10	"amdgpu/polaris10_vce.bin"
-#define FIRMWARE_POLARIS11         "amdgpu/polaris11_vce.bin"
-#define FIRMWARE_POLARIS12         "amdgpu/polaris12_vce.bin"
+#define FIRMWARE_POLARIS11	"amdgpu/polaris11_vce.bin"
+#define FIRMWARE_POLARIS12	"amdgpu/polaris12_vce.bin"
+#define FIRMWARE_VEGAM		"amdgpu/vegam_vce.bin"
 
 #define FIRMWARE_VEGA10		"amdgpu/vega10_vce.bin"
+#define FIRMWARE_VEGA12		"amdgpu/vega12_vce.bin"
+#define FIRMWARE_VEGA20		"amdgpu/vega20_vce.bin"
 
+#ifdef CONFIG_DRM_AMDGPU_SI
+MODULE_FIRMWARE(FIRMWARE_VCE_V1_0);
+#endif
 #ifdef CONFIG_DRM_AMDGPU_CIK
 MODULE_FIRMWARE(FIRMWARE_BONAIRE);
 MODULE_FIRMWARE(FIRMWARE_KABINI);
@@ -70,88 +81,106 @@ MODULE_FIRMWARE(FIRMWARE_STONEY);
 MODULE_FIRMWARE(FIRMWARE_POLARIS10);
 MODULE_FIRMWARE(FIRMWARE_POLARIS11);
 MODULE_FIRMWARE(FIRMWARE_POLARIS12);
+MODULE_FIRMWARE(FIRMWARE_VEGAM);
 
 MODULE_FIRMWARE(FIRMWARE_VEGA10);
+MODULE_FIRMWARE(FIRMWARE_VEGA12);
+MODULE_FIRMWARE(FIRMWARE_VEGA20);
 
 static void amdgpu_vce_idle_work_handler(struct work_struct *work);
+static int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
+				     struct dma_fence **fence);
+static int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
+				      bool direct, struct dma_fence **fence);
 
 /**
- * amdgpu_vce_init - allocate memory, load vce firmware
+ * amdgpu_vce_firmware_name() - determine the firmware file name for VCE
  *
  * @adev: amdgpu_device pointer
  *
- * First step to get VCE online, allocate memory and load the firmware
+ * Each chip that has VCE IP may need a different firmware.
+ * This function returns the name of the VCE firmware file
+ * appropriate for the current chip.
  */
-int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size)
+static const char *amdgpu_vce_firmware_name(struct amdgpu_device *adev)
 {
-	struct amdgpu_ring *ring;
-	struct amd_sched_rq *rq;
-	const char *fw_name;
-	const struct common_firmware_header *hdr;
-	unsigned ucode_version, version_major, version_minor, binary_id;
-	int i, r;
-
 	switch (adev->asic_type) {
+#ifdef CONFIG_DRM_AMDGPU_SI
+	case CHIP_PITCAIRN:
+	case CHIP_TAHITI:
+	case CHIP_VERDE:
+		return FIRMWARE_VCE_V1_0;
+#endif
 #ifdef CONFIG_DRM_AMDGPU_CIK
 	case CHIP_BONAIRE:
-		fw_name = FIRMWARE_BONAIRE;
-		break;
+		return FIRMWARE_BONAIRE;
 	case CHIP_KAVERI:
-		fw_name = FIRMWARE_KAVERI;
-		break;
+		return FIRMWARE_KAVERI;
 	case CHIP_KABINI:
-		fw_name = FIRMWARE_KABINI;
-		break;
+		return FIRMWARE_KABINI;
 	case CHIP_HAWAII:
-		fw_name = FIRMWARE_HAWAII;
-		break;
+		return FIRMWARE_HAWAII;
 	case CHIP_MULLINS:
-		fw_name = FIRMWARE_MULLINS;
-		break;
+		return FIRMWARE_MULLINS;
 #endif
 	case CHIP_TONGA:
-		fw_name = FIRMWARE_TONGA;
-		break;
+		return  FIRMWARE_TONGA;
 	case CHIP_CARRIZO:
-		fw_name = FIRMWARE_CARRIZO;
-		break;
+		return  FIRMWARE_CARRIZO;
 	case CHIP_FIJI:
-		fw_name = FIRMWARE_FIJI;
-		break;
+		return  FIRMWARE_FIJI;
 	case CHIP_STONEY:
-		fw_name = FIRMWARE_STONEY;
-		break;
+		return  FIRMWARE_STONEY;
 	case CHIP_POLARIS10:
-		fw_name = FIRMWARE_POLARIS10;
-		break;
+		return  FIRMWARE_POLARIS10;
 	case CHIP_POLARIS11:
-		fw_name = FIRMWARE_POLARIS11;
-		break;
-	case CHIP_VEGA10:
-		fw_name = FIRMWARE_VEGA10;
-		break;
+		return  FIRMWARE_POLARIS11;
 	case CHIP_POLARIS12:
-		fw_name = FIRMWARE_POLARIS12;
-		break;
+		return  FIRMWARE_POLARIS12;
+	case CHIP_VEGAM:
+		return  FIRMWARE_VEGAM;
+	case CHIP_VEGA10:
+		return  FIRMWARE_VEGA10;
+	case CHIP_VEGA12:
+		return  FIRMWARE_VEGA12;
+	case CHIP_VEGA20:
+		return  FIRMWARE_VEGA20;
 
 	default:
-		return -EINVAL;
+		return NULL;
 	}
+}
 
-	r = request_firmware(&adev->vce.fw, fw_name, adev->dev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_vce: Can't load firmware \"%s\"\n",
-			fw_name);
-		return r;
-	}
+/**
+ * amdgpu_vce_early_init() - try to load VCE firmware
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Tries to load the VCE firmware.
+ *
+ * When not found, returns ENOENT so that the driver can
+ * still load and initialize the rest of the IP blocks.
+ * The GPU can function just fine without VCE, they will just
+ * not support video encoding.
+ */
+int amdgpu_vce_early_init(struct amdgpu_device *adev)
+{
+	const char *fw_name = amdgpu_vce_firmware_name(adev);
+	const struct common_firmware_header *hdr;
+	unsigned int ucode_version, version_major, version_minor, binary_id;
+	int r;
 
-	r = amdgpu_ucode_validate(adev->vce.fw);
+	if (!fw_name)
+		return -ENOENT;
+
+	r = amdgpu_ucode_request(adev, &adev->vce.fw, AMDGPU_UCODE_REQUIRED, "%s", fw_name);
 	if (r) {
-		dev_err(adev->dev, "amdgpu_vce: Can't validate firmware \"%s\"\n",
-			fw_name);
-		release_firmware(adev->vce.fw);
-		adev->vce.fw = NULL;
-		return r;
+		dev_err(adev->dev,
+			"amdgpu_vce: Firmware \"%s\" not found or failed to validate (%d)\n",
+			fw_name, r);
+
+		amdgpu_ucode_release(&adev->vce.fw);
+		return -ENOENT;
 	}
 
 	hdr = (const struct common_firmware_header *)adev->vce.fw->data;
@@ -160,28 +189,45 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size)
 	version_major = (ucode_version >> 20) & 0xfff;
 	version_minor = (ucode_version >> 8) & 0xfff;
 	binary_id = ucode_version & 0xff;
-	DRM_INFO("Found VCE firmware Version: %hhd.%hhd Binary ID: %hhd\n",
+	dev_info(adev->dev, "Found VCE firmware Version: %d.%d Binary ID: %d\n",
 		version_major, version_minor, binary_id);
 	adev->vce.fw_version = ((version_major << 24) | (version_minor << 16) |
 				(binary_id << 8));
 
+	return 0;
+}
+
+/**
+ * amdgpu_vce_sw_init() - allocate memory for VCE BO
+ *
+ * @adev: amdgpu_device pointer
+ * @size: size for the new BO
+ *
+ * First step to get VCE online: allocate memory for VCE BO.
+ * The VCE firmware binary is copied into the VCE BO later,
+ * in amdgpu_vce_resume. The VCE executes its code from the
+ * VCE BO and also uses the space in this BO for its stack and data.
+ *
+ * Ideally this BO should be placed in VRAM for optimal performance,
+ * although technically it also runs from system RAM (albeit slowly).
+ */
+int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size)
+{
+	int i, r;
+
+	if (!adev->vce.fw)
+		return -ENOENT;
+
 	r = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE,
-				    AMDGPU_GEM_DOMAIN_VRAM, &adev->vce.vcpu_bo,
+				    AMDGPU_GEM_DOMAIN_VRAM |
+				    AMDGPU_GEM_DOMAIN_GTT,
+				    &adev->vce.vcpu_bo,
 				    &adev->vce.gpu_addr, &adev->vce.cpu_addr);
 	if (r) {
 		dev_err(adev->dev, "(%d) failed to allocate VCE bo\n", r);
 		return r;
 	}
 
-	ring = &adev->vce.ring[0];
-	rq = &ring->sched.sched_rq[AMD_SCHED_PRIORITY_NORMAL];
-	r = amd_sched_entity_init(&ring->sched, &adev->vce.entity,
-				  rq, amdgpu_sched_jobs);
-	if (r != 0) {
-		DRM_ERROR("Failed setting up VCE run queue.\n");
-		return r;
-	}
-
 	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
 		atomic_set(&adev->vce.handles[i], 0);
 		adev->vce.filp[i] = NULL;
@@ -194,7 +240,7 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size)
 }
 
 /**
- * amdgpu_vce_fini - free memory
+ * amdgpu_vce_sw_fini - free memory
  *
  * @adev: amdgpu_device pointer
  *
@@ -202,22 +248,47 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size)
  */
 int amdgpu_vce_sw_fini(struct amdgpu_device *adev)
 {
-	unsigned i;
+	unsigned int i;
 
 	if (adev->vce.vcpu_bo == NULL)
 		return 0;
 
-	amd_sched_entity_fini(&adev->vce.ring[0].sched, &adev->vce.entity);
-
-	amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
-		(void **)&adev->vce.cpu_addr);
+	drm_sched_entity_destroy(&adev->vce.entity);
 
 	for (i = 0; i < adev->vce.num_rings; i++)
 		amdgpu_ring_fini(&adev->vce.ring[i]);
 
-	release_firmware(adev->vce.fw);
+	amdgpu_ucode_release(&adev->vce.fw);
 	mutex_destroy(&adev->vce.idle_mutex);
 
+	amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
+		(void **)&adev->vce.cpu_addr);
+
+	return 0;
+}
+
+/**
+ * amdgpu_vce_entity_init - init entity
+ *
+ * @adev: amdgpu_device pointer
+ * @ring: amdgpu_ring pointer to check
+ *
+ * Initialize the entity used for handle management in the kernel driver.
+ */
+int amdgpu_vce_entity_init(struct amdgpu_device *adev, struct amdgpu_ring *ring)
+{
+	if (ring == &adev->vce.ring[0]) {
+		struct drm_gpu_scheduler *sched = &ring->sched;
+		int r;
+
+		r = drm_sched_entity_init(&adev->vce.entity, DRM_SCHED_PRIORITY_NORMAL,
+					  &sched, 1, NULL);
+		if (r != 0) {
+			DRM_ERROR("Failed setting up VCE run queue.\n");
+			return r;
+		}
+	}
+
 	return 0;
 }
 
@@ -231,6 +302,8 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev)
 {
 	int i;
 
+	cancel_delayed_work_sync(&adev->vce.idle_work);
+
 	if (adev->vce.vcpu_bo == NULL)
 		return 0;
 
@@ -241,7 +314,6 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev)
 	if (i == AMDGPU_MAX_VCE_HANDLES)
 		return 0;
 
-	cancel_delayed_work_sync(&adev->vce.idle_work);
 	/* TODO: suspending running encoding sessions isn't supported */
 	return -EINVAL;
 }
@@ -254,35 +326,22 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev)
  */
 int amdgpu_vce_resume(struct amdgpu_device *adev)
 {
-	void *cpu_addr;
 	const struct common_firmware_header *hdr;
-	unsigned offset;
-	int r;
+	unsigned int offset;
+	int idx;
 
 	if (adev->vce.vcpu_bo == NULL)
 		return -EINVAL;
 
-	r = amdgpu_bo_reserve(adev->vce.vcpu_bo, false);
-	if (r) {
-		dev_err(adev->dev, "(%d) failed to reserve VCE bo\n", r);
-		return r;
-	}
-
-	r = amdgpu_bo_kmap(adev->vce.vcpu_bo, &cpu_addr);
-	if (r) {
-		amdgpu_bo_unreserve(adev->vce.vcpu_bo);
-		dev_err(adev->dev, "(%d) VCE map failed\n", r);
-		return r;
-	}
-
 	hdr = (const struct common_firmware_header *)adev->vce.fw->data;
 	offset = le32_to_cpu(hdr->ucode_array_offset_bytes);
-	memcpy_toio(cpu_addr, adev->vce.fw->data + offset,
-		    adev->vce.fw->size - offset);
-
-	amdgpu_bo_kunmap(adev->vce.vcpu_bo);
 
-	amdgpu_bo_unreserve(adev->vce.vcpu_bo);
+	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+		memset_io(adev->vce.cpu_addr, 0, amdgpu_bo_size(adev->vce.vcpu_bo));
+		memcpy_toio(adev->vce.cpu_addr, adev->vce.fw->data + offset,
+			    adev->vce.fw->size - offset);
+		drm_dev_exit(idx);
+	}
 
 	return 0;
 }
@@ -298,10 +357,7 @@ static void amdgpu_vce_idle_work_handler(struct work_struct *work)
 {
 	struct amdgpu_device *adev =
 		container_of(work, struct amdgpu_device, vce.idle_work.work);
-	unsigned i, count = 0;
-
-	if (amdgpu_sriov_vf(adev))
-		return;
+	unsigned int i, count = 0;
 
 	for (i = 0; i < adev->vce.num_rings; i++)
 		count += amdgpu_fence_count_emitted(&adev->vce.ring[i]);
@@ -311,10 +367,10 @@ static void amdgpu_vce_idle_work_handler(struct work_struct *work)
 			amdgpu_dpm_enable_vce(adev, false);
 		} else {
 			amdgpu_asic_set_vce_clocks(adev, 0, 0);
-			amdgpu_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-							    AMD_PG_STATE_GATE);
-			amdgpu_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-							    AMD_CG_STATE_GATE);
+			amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+							       AMD_PG_STATE_GATE);
+			amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+							       AMD_CG_STATE_GATE);
 		}
 	} else {
 		schedule_delayed_work(&adev->vce.idle_work, VCE_IDLE_TIMEOUT);
@@ -343,10 +399,10 @@ void amdgpu_vce_ring_begin_use(struct amdgpu_ring *ring)
 			amdgpu_dpm_enable_vce(adev, true);
 		} else {
 			amdgpu_asic_set_vce_clocks(adev, 53300, 40000);
-			amdgpu_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-							    AMD_CG_STATE_UNGATE);
-			amdgpu_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-							    AMD_PG_STATE_UNGATE);
+			amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+							       AMD_CG_STATE_UNGATE);
+			amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+							       AMD_PG_STATE_UNGATE);
 
 		}
 	}
@@ -362,7 +418,8 @@ void amdgpu_vce_ring_begin_use(struct amdgpu_ring *ring)
  */
 void amdgpu_vce_ring_end_use(struct amdgpu_ring *ring)
 {
-	schedule_delayed_work(&ring->adev->vce.idle_work, VCE_IDLE_TIMEOUT);
+	if (!amdgpu_sriov_vf(ring->adev))
+		schedule_delayed_work(&ring->adev->vce.idle_work, VCE_IDLE_TIMEOUT);
 }
 
 /**
@@ -377,6 +434,7 @@ void amdgpu_vce_free_handles(struct amdgpu_device *adev, struct drm_file *filp)
 {
 	struct amdgpu_ring *ring = &adev->vce.ring[0];
 	int i, r;
+
 	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
 		uint32_t handle = atomic_read(&adev->vce.handles[i]);
 
@@ -393,32 +451,61 @@ void amdgpu_vce_free_handles(struct amdgpu_device *adev, struct drm_file *filp)
 }
 
 /**
- * amdgpu_vce_get_create_msg - generate a VCE create msg
+ * amdgpu_vce_required_gart_pages() - gets number of GART pages required by VCE
  *
  * @adev: amdgpu_device pointer
+ *
+ * Returns how many GART pages we need before GTT for the VCE IP block.
+ * For VCE1, see vce_v1_0_ensure_vcpu_bo_32bit_addr for details.
+ * For VCE2+, this is not needed so return zero.
+ */
+u32 amdgpu_vce_required_gart_pages(struct amdgpu_device *adev)
+{
+	/* VCE IP block not added yet, so can't use amdgpu_ip_version */
+	if (adev->family == AMDGPU_FAMILY_SI)
+		return 512;
+
+	return 0;
+}
+
+/**
+ * amdgpu_vce_get_create_msg - generate a VCE create msg
+ *
  * @ring: ring we should submit the msg to
  * @handle: VCE session handle to use
  * @fence: optional fence to return
  *
  * Open up a stream for HW test
  */
-int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
-			      struct dma_fence **fence)
+static int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
+				     struct dma_fence **fence)
 {
-	const unsigned ib_size_dw = 1024;
+	const unsigned int ib_size_dw = 1024;
 	struct amdgpu_job *job;
 	struct amdgpu_ib *ib;
+	struct amdgpu_ib ib_msg;
 	struct dma_fence *f = NULL;
-	uint64_t dummy;
+	uint64_t addr;
 	int i, r;
 
-	r = amdgpu_job_alloc_with_ib(ring->adev, ib_size_dw * 4, &job);
+	r = amdgpu_job_alloc_with_ib(ring->adev, &ring->adev->vce.entity,
+				     AMDGPU_FENCE_OWNER_UNDEFINED,
+				     ib_size_dw * 4, AMDGPU_IB_POOL_DIRECT,
+				     &job, AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST);
 	if (r)
 		return r;
 
-	ib = &job->ibs[0];
+	memset(&ib_msg, 0, sizeof(ib_msg));
+	/* only one gpu page is needed, alloc +1 page to make addr aligned. */
+	r = amdgpu_ib_get(ring->adev, NULL, AMDGPU_GPU_PAGE_SIZE * 2,
+			  AMDGPU_IB_POOL_DIRECT,
+			  &ib_msg);
+	if (r)
+		goto err;
 
-	dummy = ib->gpu_addr + 1024;
+	ib = &job->ibs[0];
+	/* let addr point to page boundary */
+	addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg.gpu_addr);
 
 	/* stitch together an VCE create msg */
 	ib->length_dw = 0;
@@ -450,19 +537,18 @@ int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
 
 	ib->ptr[ib->length_dw++] = 0x00000014; /* len */
 	ib->ptr[ib->length_dw++] = 0x05000005; /* feedback buffer */
-	ib->ptr[ib->length_dw++] = upper_32_bits(dummy);
-	ib->ptr[ib->length_dw++] = dummy;
+	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
+	ib->ptr[ib->length_dw++] = addr;
 	ib->ptr[ib->length_dw++] = 0x00000001;
 
 	for (i = ib->length_dw; i < ib_size_dw; ++i)
 		ib->ptr[i] = 0x0;
 
-	r = amdgpu_ib_schedule(ring, 1, ib, NULL, &f);
-	job->fence = dma_fence_get(f);
+	r = amdgpu_job_submit_direct(job, ring, &f);
+	amdgpu_ib_free(&ib_msg, f);
 	if (r)
 		goto err;
 
-	amdgpu_job_free(job);
 	if (fence)
 		*fence = dma_fence_get(f);
 	dma_fence_put(f);
@@ -476,23 +562,28 @@ err:
 /**
  * amdgpu_vce_get_destroy_msg - generate a VCE destroy msg
  *
- * @adev: amdgpu_device pointer
  * @ring: ring we should submit the msg to
  * @handle: VCE session handle to use
+ * @direct: direct or delayed pool
  * @fence: optional fence to return
  *
  * Close up a stream for HW test or if userspace failed to do so
  */
-int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
-			       bool direct, struct dma_fence **fence)
+static int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
+				      bool direct, struct dma_fence **fence)
 {
-	const unsigned ib_size_dw = 1024;
+	const unsigned int ib_size_dw = 1024;
 	struct amdgpu_job *job;
 	struct amdgpu_ib *ib;
 	struct dma_fence *f = NULL;
 	int i, r;
 
-	r = amdgpu_job_alloc_with_ib(ring->adev, ib_size_dw * 4, &job);
+	r = amdgpu_job_alloc_with_ib(ring->adev, &ring->adev->vce.entity,
+				     AMDGPU_FENCE_OWNER_UNDEFINED,
+				     ib_size_dw * 4,
+				     direct ? AMDGPU_IB_POOL_DIRECT :
+				     AMDGPU_IB_POOL_DELAYED, &job,
+				     AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST);
 	if (r)
 		return r;
 
@@ -519,19 +610,12 @@ int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
 	for (i = ib->length_dw; i < ib_size_dw; ++i)
 		ib->ptr[i] = 0x0;
 
-	if (direct) {
-		r = amdgpu_ib_schedule(ring, 1, ib, NULL, &f);
-		job->fence = dma_fence_get(f);
-		if (r)
-			goto err;
-
-		amdgpu_job_free(job);
-	} else {
-		r = amdgpu_job_submit(job, ring, &ring->adev->vce.entity,
-				      AMDGPU_FENCE_OWNER_UNDEFINED, &f);
-		if (r)
-			goto err;
-	}
+	if (direct)
+		r = amdgpu_job_submit_direct(job, ring, &f);
+	else
+		f = amdgpu_job_submit(job);
+	if (r)
+		goto err;
 
 	if (fence)
 		*fence = dma_fence_get(f);
@@ -544,39 +628,93 @@ err:
 }
 
 /**
+ * amdgpu_vce_validate_bo - make sure not to cross 4GB boundary
+ *
+ * @p: cs parser
+ * @ib: indirect buffer to use
+ * @lo: address of lower dword
+ * @hi: address of higher dword
+ * @size: minimum size
+ * @index: bs/fb index
+ *
+ * Make sure that no BO cross a 4GB boundary.
+ */
+static int amdgpu_vce_validate_bo(struct amdgpu_cs_parser *p,
+				  struct amdgpu_ib *ib, int lo, int hi,
+				  unsigned int size, int32_t index)
+{
+	int64_t offset = ((uint64_t)size) * ((int64_t)index);
+	struct ttm_operation_ctx ctx = { false, false };
+	struct amdgpu_bo_va_mapping *mapping;
+	unsigned int i, fpfn, lpfn;
+	struct amdgpu_bo *bo;
+	uint64_t addr;
+	int r;
+
+	addr = ((uint64_t)amdgpu_ib_get_value(ib, lo)) |
+	       ((uint64_t)amdgpu_ib_get_value(ib, hi)) << 32;
+	if (index >= 0) {
+		addr += offset;
+		fpfn = PAGE_ALIGN(offset) >> PAGE_SHIFT;
+		lpfn = 0x100000000ULL >> PAGE_SHIFT;
+	} else {
+		fpfn = 0;
+		lpfn = (0x100000000ULL - PAGE_ALIGN(offset)) >> PAGE_SHIFT;
+	}
+
+	r = amdgpu_cs_find_mapping(p, addr, &bo, &mapping);
+	if (r) {
+		DRM_ERROR("Can't find BO for addr 0x%010llx %d %d %d %d\n",
+			  addr, lo, hi, size, index);
+		return r;
+	}
+
+	for (i = 0; i < bo->placement.num_placement; ++i) {
+		bo->placements[i].fpfn = max(bo->placements[i].fpfn, fpfn);
+		bo->placements[i].lpfn = bo->placements[i].lpfn ?
+			min(bo->placements[i].lpfn, lpfn) : lpfn;
+	}
+	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+}
+
+
+/**
  * amdgpu_vce_cs_reloc - command submission relocation
  *
  * @p: parser context
+ * @ib: indirect buffer to use
  * @lo: address of lower dword
  * @hi: address of higher dword
  * @size: minimum size
+ * @index: bs/fb index
  *
  * Patch relocation inside command stream with real buffer address
  */
-static int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx,
-			       int lo, int hi, unsigned size, uint32_t index)
+static int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, struct amdgpu_ib *ib,
+			       int lo, int hi, unsigned int size, uint32_t index)
 {
 	struct amdgpu_bo_va_mapping *mapping;
 	struct amdgpu_bo *bo;
 	uint64_t addr;
+	int r;
 
 	if (index == 0xffffffff)
 		index = 0;
 
-	addr = ((uint64_t)amdgpu_get_ib_value(p, ib_idx, lo)) |
-	       ((uint64_t)amdgpu_get_ib_value(p, ib_idx, hi)) << 32;
+	addr = ((uint64_t)amdgpu_ib_get_value(ib, lo)) |
+	       ((uint64_t)amdgpu_ib_get_value(ib, hi)) << 32;
 	addr += ((uint64_t)size) * ((uint64_t)index);
 
-	mapping = amdgpu_cs_find_mapping(p, addr, &bo);
-	if (mapping == NULL) {
-		DRM_ERROR("Can't find BO for addr 0x%010Lx %d %d %d %d\n",
+	r = amdgpu_cs_find_mapping(p, addr, &bo, &mapping);
+	if (r) {
+		DRM_ERROR("Can't find BO for addr 0x%010llx %d %d %d %d\n",
 			  addr, lo, hi, size, index);
-		return -EINVAL;
+		return r;
 	}
 
 	if ((addr + (uint64_t)size) >
 	    (mapping->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
-		DRM_ERROR("BO to small for addr 0x%010Lx %d %d\n",
+		DRM_ERROR("BO too small for addr 0x%010llx %d %d\n",
 			  addr, lo, hi);
 		return -EINVAL;
 	}
@@ -585,8 +723,8 @@ static int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx,
 	addr += amdgpu_bo_gpu_offset(bo);
 	addr -= ((uint64_t)size) * ((uint64_t)index);
 
-	amdgpu_set_ib_value(p, ib_idx, lo, lower_32_bits(addr));
-	amdgpu_set_ib_value(p, ib_idx, hi, upper_32_bits(addr));
+	amdgpu_ib_set_value(ib, lo, lower_32_bits(addr));
+	amdgpu_ib_set_value(ib, hi, upper_32_bits(addr));
 
 	return 0;
 }
@@ -599,12 +737,12 @@ static int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx,
  * @allocated: allocated a new handle?
  *
  * Validates the handle and return the found session index or -EINVAL
- * we we don't have another free session index.
+ * we don't have another free session index.
  */
 static int amdgpu_vce_validate_handle(struct amdgpu_cs_parser *p,
 				      uint32_t handle, uint32_t *allocated)
 {
-	unsigned i;
+	unsigned int i;
 
 	/* validate the handle */
 	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
@@ -632,33 +770,32 @@ static int amdgpu_vce_validate_handle(struct amdgpu_cs_parser *p,
 }
 
 /**
- * amdgpu_vce_cs_parse - parse and validate the command stream
+ * amdgpu_vce_ring_parse_cs - parse and validate the command stream
  *
  * @p: parser context
- *
+ * @job: the job to parse
+ * @ib: the IB to patch
  */
-int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
+int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p,
+			     struct amdgpu_job *job,
+			     struct amdgpu_ib *ib)
 {
-	struct amdgpu_ib *ib = &p->job->ibs[ib_idx];
-	unsigned fb_idx = 0, bs_idx = 0;
+	unsigned int fb_idx = 0, bs_idx = 0;
 	int session_idx = -1;
 	uint32_t destroyed = 0;
 	uint32_t created = 0;
 	uint32_t allocated = 0;
 	uint32_t tmp, handle = 0;
-	uint32_t *size = &tmp;
-	int i, r, idx = 0;
+	uint32_t dummy = 0xffffffff;
+	uint32_t *size = &dummy;
+	unsigned int idx;
+	int i, r = 0;
 
-	p->job->vm = NULL;
-	ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo);
+	job->vm = NULL;
 
-	r = amdgpu_cs_sysvm_access_required(p);
-	if (r)
-		return r;
-
-	while (idx < ib->length_dw) {
-		uint32_t len = amdgpu_get_ib_value(p, ib_idx, idx);
-		uint32_t cmd = amdgpu_get_ib_value(p, ib_idx, idx + 1);
+	for (idx = 0; idx < ib->length_dw;) {
+		uint32_t len = amdgpu_ib_get_value(ib, idx);
+		uint32_t cmd = amdgpu_ib_get_value(ib, idx + 1);
 
 		if ((len < 8) || (len & 3)) {
 			DRM_ERROR("invalid VCE command length (%d)!\n", len);
@@ -667,8 +804,68 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 		}
 
 		switch (cmd) {
+		case 0x00000002: /* task info */
+			fb_idx = amdgpu_ib_get_value(ib, idx + 6);
+			bs_idx = amdgpu_ib_get_value(ib, idx + 7);
+			break;
+
+		case 0x03000001: /* encode */
+			r = amdgpu_vce_validate_bo(p, ib, idx + 10, idx + 9,
+						   0, 0);
+			if (r)
+				goto out;
+
+			r = amdgpu_vce_validate_bo(p, ib, idx + 12, idx + 11,
+						   0, 0);
+			if (r)
+				goto out;
+			break;
+
+		case 0x05000001: /* context buffer */
+			r = amdgpu_vce_validate_bo(p, ib, idx + 3, idx + 2,
+						   0, 0);
+			if (r)
+				goto out;
+			break;
+
+		case 0x05000004: /* video bitstream buffer */
+			tmp = amdgpu_ib_get_value(ib, idx + 4);
+			r = amdgpu_vce_validate_bo(p, ib, idx + 3, idx + 2,
+						   tmp, bs_idx);
+			if (r)
+				goto out;
+			break;
+
+		case 0x05000005: /* feedback buffer */
+			r = amdgpu_vce_validate_bo(p, ib, idx + 3, idx + 2,
+						   4096, fb_idx);
+			if (r)
+				goto out;
+			break;
+
+		case 0x0500000d: /* MV buffer */
+			r = amdgpu_vce_validate_bo(p, ib, idx + 3, idx + 2,
+						   0, 0);
+			if (r)
+				goto out;
+
+			r = amdgpu_vce_validate_bo(p, ib, idx + 8, idx + 7,
+						   0, 0);
+			if (r)
+				goto out;
+			break;
+		}
+
+		idx += len / 4;
+	}
+
+	for (idx = 0; idx < ib->length_dw;) {
+		uint32_t len = amdgpu_ib_get_value(ib, idx);
+		uint32_t cmd = amdgpu_ib_get_value(ib, idx + 1);
+
+		switch (cmd) {
 		case 0x00000001: /* session */
-			handle = amdgpu_get_ib_value(p, ib_idx, idx + 2);
+			handle = amdgpu_ib_get_value(ib, idx + 2);
 			session_idx = amdgpu_vce_validate_handle(p, handle,
 								 &allocated);
 			if (session_idx < 0) {
@@ -679,8 +876,8 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 			break;
 
 		case 0x00000002: /* task info */
-			fb_idx = amdgpu_get_ib_value(p, ib_idx, idx + 6);
-			bs_idx = amdgpu_get_ib_value(p, ib_idx, idx + 7);
+			fb_idx = amdgpu_ib_get_value(ib, idx + 6);
+			bs_idx = amdgpu_ib_get_value(ib, idx + 7);
 			break;
 
 		case 0x01000001: /* create */
@@ -695,8 +892,8 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 				goto out;
 			}
 
-			*size = amdgpu_get_ib_value(p, ib_idx, idx + 8) *
-				amdgpu_get_ib_value(p, ib_idx, idx + 10) *
+			*size = amdgpu_ib_get_value(ib, idx + 8) *
+				amdgpu_ib_get_value(ib, idx + 10) *
 				8 * 3 / 2;
 			break;
 
@@ -725,12 +922,12 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 			break;
 
 		case 0x03000001: /* encode */
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 10, idx + 9,
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 10, idx + 9,
 						*size, 0);
 			if (r)
 				goto out;
 
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 12, idx + 11,
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 12, idx + 11,
 						*size / 3, 0);
 			if (r)
 				goto out;
@@ -741,27 +938,39 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 			break;
 
 		case 0x05000001: /* context buffer */
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 3, idx + 2,
 						*size * 2, 0);
 			if (r)
 				goto out;
 			break;
 
 		case 0x05000004: /* video bitstream buffer */
-			tmp = amdgpu_get_ib_value(p, ib_idx, idx + 4);
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+			tmp = amdgpu_ib_get_value(ib, idx + 4);
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 3, idx + 2,
 						tmp, bs_idx);
 			if (r)
 				goto out;
 			break;
 
 		case 0x05000005: /* feedback buffer */
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 3, idx + 2,
 						4096, fb_idx);
 			if (r)
 				goto out;
 			break;
 
+		case 0x0500000d: /* MV buffer */
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 3,
+						idx + 2, *size, 0);
+			if (r)
+				goto out;
+
+			r = amdgpu_vce_cs_reloc(p, ib, idx + 8,
+						idx + 7, *size / 12, 0);
+			if (r)
+				goto out;
+			break;
+
 		default:
 			DRM_ERROR("invalid VCE command (0x%x)!\n", cmd);
 			r = -EINVAL;
@@ -799,14 +1008,16 @@ out:
 }
 
 /**
- * amdgpu_vce_cs_parse_vm - parse the command stream in VM mode
+ * amdgpu_vce_ring_parse_cs_vm - parse the command stream in VM mode
  *
  * @p: parser context
- *
+ * @job: the job to parse
+ * @ib: the IB to patch
  */
-int amdgpu_vce_ring_parse_cs_vm(struct amdgpu_cs_parser *p, uint32_t ib_idx)
+int amdgpu_vce_ring_parse_cs_vm(struct amdgpu_cs_parser *p,
+				struct amdgpu_job *job,
+				struct amdgpu_ib *ib)
 {
-	struct amdgpu_ib *ib = &p->job->ibs[ib_idx];
 	int session_idx = -1;
 	uint32_t destroyed = 0;
 	uint32_t created = 0;
@@ -815,8 +1026,8 @@ int amdgpu_vce_ring_parse_cs_vm(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 	int i, r = 0, idx = 0;
 
 	while (idx < ib->length_dw) {
-		uint32_t len = amdgpu_get_ib_value(p, ib_idx, idx);
-		uint32_t cmd = amdgpu_get_ib_value(p, ib_idx, idx + 1);
+		uint32_t len = amdgpu_ib_get_value(ib, idx);
+		uint32_t cmd = amdgpu_ib_get_value(ib, idx + 1);
 
 		if ((len < 8) || (len & 3)) {
 			DRM_ERROR("invalid VCE command length (%d)!\n", len);
@@ -826,7 +1037,7 @@ int amdgpu_vce_ring_parse_cs_vm(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 
 		switch (cmd) {
 		case 0x00000001: /* session */
-			handle = amdgpu_get_ib_value(p, ib_idx, idx + 2);
+			handle = amdgpu_ib_get_value(ib, idx + 2);
 			session_idx = amdgpu_vce_validate_handle(p, handle,
 								 &allocated);
 			if (session_idx < 0) {
@@ -875,7 +1086,6 @@ out:
 	if (!r) {
 		/* No error, free all destroyed handle slots */
 		tmp = destroyed;
-		amdgpu_ib_free(p->adev, ib, NULL);
 	} else {
 		/* Error during parsing, free all allocated handle slots */
 		tmp = allocated;
@@ -892,11 +1102,15 @@ out:
  * amdgpu_vce_ring_emit_ib - execute indirect buffer
  *
  * @ring: engine to use
+ * @job: job to retrieve vmid from
  * @ib: the IB to execute
+ * @flags: unused
  *
  */
-void amdgpu_vce_ring_emit_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib,
-			     unsigned vm_id, bool ctx_switch)
+void amdgpu_vce_ring_emit_ib(struct amdgpu_ring *ring,
+				struct amdgpu_job *job,
+				struct amdgpu_ib *ib,
+				uint32_t flags)
 {
 	amdgpu_ring_write(ring, VCE_CMD_IB);
 	amdgpu_ring_write(ring, lower_32_bits(ib->gpu_addr));
@@ -908,11 +1122,13 @@ void amdgpu_vce_ring_emit_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib,
  * amdgpu_vce_ring_emit_fence - add a fence command to the ring
  *
  * @ring: engine to use
- * @fence: the fence
+ * @addr: address
+ * @seq: sequence number
+ * @flags: fence related flags
  *
  */
 void amdgpu_vce_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq,
-				unsigned flags)
+				unsigned int flags)
 {
 	WARN_ON(flags & AMDGPU_FENCE_FLAG_64BIT);
 
@@ -933,37 +1149,31 @@ void amdgpu_vce_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq,
 int amdgpu_vce_ring_test_ring(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
-	uint32_t rptr = amdgpu_ring_get_rptr(ring);
-	unsigned i;
+	uint32_t rptr;
+	unsigned int i;
 	int r, timeout = adev->usec_timeout;
 
-	/* workaround VCE ring test slow issue for sriov*/
+	/* skip ring test for sriov*/
 	if (amdgpu_sriov_vf(adev))
-		timeout *= 10;
+		return 0;
 
 	r = amdgpu_ring_alloc(ring, 16);
-	if (r) {
-		DRM_ERROR("amdgpu: vce failed to lock ring %d (%d).\n",
-			  ring->idx, r);
+	if (r)
 		return r;
-	}
+
+	rptr = amdgpu_ring_get_rptr(ring);
+
 	amdgpu_ring_write(ring, VCE_CMD_END);
 	amdgpu_ring_commit(ring);
 
 	for (i = 0; i < timeout; i++) {
 		if (amdgpu_ring_get_rptr(ring) != rptr)
 			break;
-		DRM_UDELAY(1);
+		udelay(1);
 	}
 
-	if (i < timeout) {
-		DRM_INFO("ring test on %d succeeded in %d usecs\n",
-			 ring->idx, i);
-	} else {
-		DRM_ERROR("amdgpu: ring %d test failed\n",
-			  ring->idx);
+	if (i >= timeout)
 		r = -ETIMEDOUT;
-	}
 
 	return r;
 }
@@ -972,6 +1182,7 @@ int amdgpu_vce_ring_test_ring(struct amdgpu_ring *ring)
  * amdgpu_vce_ring_test_ib - test if VCE IBs are working
  *
  * @ring: the engine to test on
+ * @timeout: timeout value in jiffies, or MAX_SCHEDULE_TIMEOUT
  *
  */
 int amdgpu_vce_ring_test_ib(struct amdgpu_ring *ring, long timeout)
@@ -984,28 +1195,34 @@ int amdgpu_vce_ring_test_ib(struct amdgpu_ring *ring, long timeout)
 		return 0;
 
 	r = amdgpu_vce_get_create_msg(ring, 1, NULL);
-	if (r) {
-		DRM_ERROR("amdgpu: failed to get create msg (%ld).\n", r);
+	if (r)
 		goto error;
-	}
 
 	r = amdgpu_vce_get_destroy_msg(ring, 1, true, &fence);
-	if (r) {
-		DRM_ERROR("amdgpu: failed to get destroy ib (%ld).\n", r);
+	if (r)
 		goto error;
-	}
 
 	r = dma_fence_wait_timeout(fence, false, timeout);
-	if (r == 0) {
-		DRM_ERROR("amdgpu: IB test timed out.\n");
+	if (r == 0)
 		r = -ETIMEDOUT;
-	} else if (r < 0) {
-		DRM_ERROR("amdgpu: fence wait failed (%ld).\n", r);
-	} else {
-		DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
+	else if (r > 0)
 		r = 0;
-	}
+
 error:
 	dma_fence_put(fence);
 	return r;
 }
+
+enum amdgpu_ring_priority_level amdgpu_vce_get_ring_prio(int ring)
+{
+	switch (ring) {
+	case 0:
+		return AMDGPU_RING_PRIO_0;
+	case 1:
+		return AMDGPU_RING_PRIO_1;
+	case 2:
+		return AMDGPU_RING_PRIO_2;
+	default:
+		return AMDGPU_RING_PRIO_0;
+	}
+}