drm/i915: Micro-optimise hotpath through intel_ring_begin()

Typically, there is space available within the ring and if not we have to wait (by definition a slow path). Rearrange the code to reduce the number of branches and stack size for the hotpath, accomodating a slight growth for the wait. v2: Fix the new assert that packets are not larger than the actual ring. v3: Make the parameters unsigned as well to make usage. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170504130846.4807-3-chris@chris-wilson.co.uk
author: Chris Wilson <chris@chris-wilson.co.uk> 2017-05-04 14:08:46 +0100
committer: Chris Wilson <chris@chris-wilson.co.uk> 2017-05-04 15:40:38 +0100
commit: 5e5655c32de83a0151de0c4993d7783c22b6f9b4 (patch)
tree: 3436e783ba56065ce254fc3ac3c4de9582fab0be /drivers/gpu/drm/i915
parent: 95aebcb2da73079f9ecb7f4e353af71ff1f04c05 (diff)
2 files changed, 38 insertions, 32 deletions
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b308e73fcfae..acd1da9b62a3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1656,7 +1656,8 @@ static int ring_request_alloc(struct drm_i915_gem_request *request)
 	return 0;
 }
 
-static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
+static noinline int wait_for_space(struct drm_i915_gem_request *req,
+				   unsigned int bytes)
 {
 	struct intel_ring *ring = req->ring;
 	struct drm_i915_gem_request *target;
@@ -1701,52 +1702,56 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
 	return 0;
 }
 
-u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
+u32 *intel_ring_begin(struct drm_i915_gem_request *req,
+		      unsigned int num_dwords)
 {
 	struct intel_ring *ring = req->ring;
-	int remain_actual = ring->size - ring->emit;
-	int remain_usable = ring->effective_size - ring->emit;
-	int bytes = num_dwords * sizeof(u32);
-	int total_bytes, wait_bytes;
-	bool need_wrap = false;
+	const unsigned int remain_usable = ring->effective_size - ring->emit;
+	const unsigned int bytes = num_dwords * sizeof(u32);
+	unsigned int need_wrap = 0;
+	unsigned int total_bytes;
 	u32 *cs;
 
 	total_bytes = bytes + req->reserved_space;
+	GEM_BUG_ON(total_bytes > ring->effective_size);
 
-	if (unlikely(bytes > remain_usable)) {
-		/*
-		 * Not enough space for the basic request. So need to flush
-		 * out the remainder and then wait for base + reserved.
-		 */
-		wait_bytes = remain_actual + total_bytes;
-		need_wrap = true;
-	} else if (unlikely(total_bytes > remain_usable)) {
-		/*
-		 * The base request will fit but the reserved space
-		 * falls off the end. So we don't need an immediate wrap
-		 * and only need to effectively wait for the reserved
-		 * size space from the start of ringbuffer.
-		 */
-		wait_bytes = remain_actual + req->reserved_space;
-	} else {
-		/* No wrapping required, just waiting. */
-		wait_bytes = total_bytes;
+	if (unlikely(total_bytes > remain_usable)) {
+		const int remain_actual = ring->size - ring->emit;
+
+		if (bytes > remain_usable) {
+			/*
+			 * Not enough space for the basic request. So need to
+			 * flush out the remainder and then wait for
+			 * base + reserved.
+			 */
+			total_bytes += remain_actual;
+			need_wrap = remain_actual | 1;
+		} else  {
+			/*
+			 * The base request will fit but the reserved space
+			 * falls off the end. So we don't need an immediate
+			 * wrap and only need to effectively wait for the
+			 * reserved size from the start of ringbuffer.
+			 */
+			total_bytes = req->reserved_space + remain_actual;
+		}
 	}
 
-	if (wait_bytes > ring->space) {
-		int ret = wait_for_space(req, wait_bytes);
+	if (unlikely(total_bytes > ring->space)) {
+		int ret = wait_for_space(req, total_bytes);
 		if (unlikely(ret))
 			return ERR_PTR(ret);
 	}
 
 	if (unlikely(need_wrap)) {
-		GEM_BUG_ON(remain_actual > ring->space);
-		GEM_BUG_ON(ring->emit + remain_actual > ring->size);
+		need_wrap &= ~1;
+		GEM_BUG_ON(need_wrap > ring->space);
+		GEM_BUG_ON(ring->emit + need_wrap > ring->size);
 
 		/* Fill the tail with MI_NOOP */
-		memset(ring->vaddr + ring->emit, 0, remain_actual);
+		memset(ring->vaddr + ring->emit, 0, need_wrap);
 		ring->emit = 0;
-		ring->space -= remain_actual;
+		ring->space -= need_wrap;
 	}
 
 	GEM_BUG_ON(ring->emit > ring->size - bytes);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3e343b09eeb6..ec16fb6fde62 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -497,7 +497,8 @@ void intel_legacy_submission_resume(struct drm_i915_private *dev_priv);
 
 int __must_check intel_ring_cacheline_align(struct drm_i915_gem_request *req);
 
-u32 __must_check *intel_ring_begin(struct drm_i915_gem_request *req, int n);
+u32 __must_check *intel_ring_begin(struct drm_i915_gem_request *req,
+				   unsigned int n);
 
 static inline void
 intel_ring_advance(struct drm_i915_gem_request *req, u32 *cs)
author	Chris Wilson <chris@chris-wilson.co.uk>	2017-05-04 14:08:46 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	2017-05-04 15:40:38 +0100
commit	5e5655c32de83a0151de0c4993d7783c22b6f9b4 (patch)
tree	3436e783ba56065ce254fc3ac3c4de9582fab0be /drivers/gpu/drm/i915
parent	95aebcb2da73079f9ecb7f4e353af71ff1f04c05 (diff)