summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/i915_request.c
diff options
context:
space:
mode:
authorMaxime Ripard <maxime@cerno.tech>2021-10-25 15:27:56 +0200
committerMaxime Ripard <maxime@cerno.tech>2021-10-25 15:27:56 +0200
commit736638246ec215f999dd132334d2d7c49bcb85c7 (patch)
tree6c79e96f8dccb0514d918cf279915695ffb3aeea /drivers/gpu/drm/i915/i915_request.c
parent525bbf72dbe0004a009dc39b239dec74e8007f6f (diff)
parent6f2f7c83303d2227f47551423e507d77d9ea01c7 (diff)
Merge drm/drm-next into drm-misc-next
drm-misc-next hasn't been updated in a while and I need a post -rc2 state to merge some vc4 patches. Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Diffstat (limited to 'drivers/gpu/drm/i915/i915_request.c')
-rw-r--r--drivers/gpu/drm/i915/i915_request.c168
1 files changed, 127 insertions, 41 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 3839712ebd23..820a1f38b271 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -829,8 +829,6 @@ static void __i915_request_ctor(void *arg)
i915_sw_fence_init(&rq->submit, submit_notify);
i915_sw_fence_init(&rq->semaphore, semaphore_notify);
- dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0);
-
rq->capture_list = NULL;
init_llist_head(&rq->execute_cb);
@@ -905,17 +903,12 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
rq->ring = ce->ring;
rq->execution_mask = ce->engine->mask;
- kref_init(&rq->fence.refcount);
- rq->fence.flags = 0;
- rq->fence.error = 0;
- INIT_LIST_HEAD(&rq->fence.cb_list);
-
ret = intel_timeline_get_seqno(tl, rq, &seqno);
if (ret)
goto err_free;
- rq->fence.context = tl->fence_context;
- rq->fence.seqno = seqno;
+ dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
+ tl->fence_context, seqno);
RCU_INIT_POINTER(rq->timeline, tl);
rq->hwsp_seqno = tl->hwsp_seqno;
@@ -1152,6 +1145,12 @@ __emit_semaphore_wait(struct i915_request *to,
return 0;
}
+static bool
+can_use_semaphore_wait(struct i915_request *to, struct i915_request *from)
+{
+ return to->engine->gt->ggtt == from->engine->gt->ggtt;
+}
+
static int
emit_semaphore_wait(struct i915_request *to,
struct i915_request *from,
@@ -1160,6 +1159,9 @@ emit_semaphore_wait(struct i915_request *to,
const intel_engine_mask_t mask = READ_ONCE(from->engine)->mask;
struct i915_sw_fence *wait = &to->submit;
+ if (!can_use_semaphore_wait(to, from))
+ goto await_fence;
+
if (!intel_context_use_semaphores(to->context))
goto await_fence;
@@ -1263,7 +1265,8 @@ __i915_request_await_execution(struct i915_request *to,
* immediate execution, and so we must wait until it reaches the
* active slot.
*/
- if (intel_engine_has_semaphores(to->engine) &&
+ if (can_use_semaphore_wait(to, from) &&
+ intel_engine_has_semaphores(to->engine) &&
!i915_request_has_initial_breadcrumb(to)) {
err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
if (err < 0)
@@ -1332,6 +1335,25 @@ i915_request_await_external(struct i915_request *rq, struct dma_fence *fence)
return err;
}
+static inline bool is_parallel_rq(struct i915_request *rq)
+{
+ return intel_context_is_parallel(rq->context);
+}
+
+static inline struct intel_context *request_to_parent(struct i915_request *rq)
+{
+ return intel_context_to_parent(rq->context);
+}
+
+static bool is_same_parallel_context(struct i915_request *to,
+ struct i915_request *from)
+{
+ if (is_parallel_rq(to))
+ return request_to_parent(to) == request_to_parent(from);
+
+ return false;
+}
+
int
i915_request_await_execution(struct i915_request *rq,
struct dma_fence *fence)
@@ -1363,11 +1385,14 @@ i915_request_await_execution(struct i915_request *rq,
* want to run our callback in all cases.
*/
- if (dma_fence_is_i915(fence))
+ if (dma_fence_is_i915(fence)) {
+ if (is_same_parallel_context(rq, to_request(fence)))
+ continue;
ret = __i915_request_await_execution(rq,
to_request(fence));
- else
+ } else {
ret = i915_request_await_external(rq, fence);
+ }
if (ret < 0)
return ret;
} while (--nchild);
@@ -1468,10 +1493,13 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
fence))
continue;
- if (dma_fence_is_i915(fence))
+ if (dma_fence_is_i915(fence)) {
+ if (is_same_parallel_context(rq, to_request(fence)))
+ continue;
ret = i915_request_await_request(rq, to_request(fence));
- else
+ } else {
ret = i915_request_await_external(rq, fence);
+ }
if (ret < 0)
return ret;
@@ -1523,35 +1551,51 @@ i915_request_await_object(struct i915_request *to,
}
static struct i915_request *
-__i915_request_add_to_timeline(struct i915_request *rq)
+__i915_request_ensure_parallel_ordering(struct i915_request *rq,
+ struct intel_timeline *timeline)
{
- struct intel_timeline *timeline = i915_request_timeline(rq);
struct i915_request *prev;
- /*
- * Dependency tracking and request ordering along the timeline
- * is special cased so that we can eliminate redundant ordering
- * operations while building the request (we know that the timeline
- * itself is ordered, and here we guarantee it).
- *
- * As we know we will need to emit tracking along the timeline,
- * we embed the hooks into our request struct -- at the cost of
- * having to have specialised no-allocation interfaces (which will
- * be beneficial elsewhere).
- *
- * A second benefit to open-coding i915_request_await_request is
- * that we can apply a slight variant of the rules specialised
- * for timelines that jump between engines (such as virtual engines).
- * If we consider the case of virtual engine, we must emit a dma-fence
- * to prevent scheduling of the second request until the first is
- * complete (to maximise our greedy late load balancing) and this
- * precludes optimising to use semaphores serialisation of a single
- * timeline across engines.
- */
+ GEM_BUG_ON(!is_parallel_rq(rq));
+
+ prev = request_to_parent(rq)->parallel.last_rq;
+ if (prev) {
+ if (!__i915_request_is_complete(prev)) {
+ i915_sw_fence_await_sw_fence(&rq->submit,
+ &prev->submit,
+ &rq->submitq);
+
+ if (rq->engine->sched_engine->schedule)
+ __i915_sched_node_add_dependency(&rq->sched,
+ &prev->sched,
+ &rq->dep,
+ 0);
+ }
+ i915_request_put(prev);
+ }
+
+ request_to_parent(rq)->parallel.last_rq = i915_request_get(rq);
+
+ return to_request(__i915_active_fence_set(&timeline->last_request,
+ &rq->fence));
+}
+
+static struct i915_request *
+__i915_request_ensure_ordering(struct i915_request *rq,
+ struct intel_timeline *timeline)
+{
+ struct i915_request *prev;
+
+ GEM_BUG_ON(is_parallel_rq(rq));
+
prev = to_request(__i915_active_fence_set(&timeline->last_request,
&rq->fence));
+
if (prev && !__i915_request_is_complete(prev)) {
bool uses_guc = intel_engine_uses_guc(rq->engine);
+ bool pow2 = is_power_of_2(READ_ONCE(prev->engine)->mask |
+ rq->engine->mask);
+ bool same_context = prev->context == rq->context;
/*
* The requests are supposed to be kept in order. However,
@@ -1559,13 +1603,11 @@ __i915_request_add_to_timeline(struct i915_request *rq)
* is used as a barrier for external modification to this
* context.
*/
- GEM_BUG_ON(prev->context == rq->context &&
+ GEM_BUG_ON(same_context &&
i915_seqno_passed(prev->fence.seqno,
rq->fence.seqno));
- if ((!uses_guc &&
- is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) ||
- (uses_guc && prev->context == rq->context))
+ if ((same_context && uses_guc) || (!uses_guc && pow2))
i915_sw_fence_await_sw_fence(&rq->submit,
&prev->submit,
&rq->submitq);
@@ -1580,6 +1622,50 @@ __i915_request_add_to_timeline(struct i915_request *rq)
0);
}
+ return prev;
+}
+
+static struct i915_request *
+__i915_request_add_to_timeline(struct i915_request *rq)
+{
+ struct intel_timeline *timeline = i915_request_timeline(rq);
+ struct i915_request *prev;
+
+ /*
+ * Dependency tracking and request ordering along the timeline
+ * is special cased so that we can eliminate redundant ordering
+ * operations while building the request (we know that the timeline
+ * itself is ordered, and here we guarantee it).
+ *
+ * As we know we will need to emit tracking along the timeline,
+ * we embed the hooks into our request struct -- at the cost of
+ * having to have specialised no-allocation interfaces (which will
+ * be beneficial elsewhere).
+ *
+ * A second benefit to open-coding i915_request_await_request is
+ * that we can apply a slight variant of the rules specialised
+ * for timelines that jump between engines (such as virtual engines).
+ * If we consider the case of virtual engine, we must emit a dma-fence
+ * to prevent scheduling of the second request until the first is
+ * complete (to maximise our greedy late load balancing) and this
+ * precludes optimising to use semaphores serialisation of a single
+ * timeline across engines.
+ *
+ * We do not order parallel submission requests on the timeline as each
+ * parallel submission context has its own timeline and the ordering
+ * rules for parallel requests are that they must be submitted in the
+ * order received from the execbuf IOCTL. So rather than using the
+ * timeline we store a pointer to last request submitted in the
+ * relationship in the gem context and insert a submission fence
+ * between that request and request passed into this function or
+ * alternatively we use completion fence if gem context has a single
+ * timeline and this is the first submission of an execbuf IOCTL.
+ */
+ if (likely(!is_parallel_rq(rq)))
+ prev = __i915_request_ensure_ordering(rq, timeline);
+ else
+ prev = __i915_request_ensure_parallel_ordering(rq, timeline);
+
/*
* Make sure that no request gazumped us - if it was allocated after
* our i915_request_alloc() and called __i915_request_add() before
@@ -1835,7 +1921,7 @@ long i915_request_wait(struct i915_request *rq,
* completion. That requires having a good predictor for the request
* duration, which we currently lack.
*/
- if (IS_ACTIVE(CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT) &&
+ if (CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT &&
__i915_spin_request(rq, state))
goto out;