diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2019-10-04 14:40:00 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2019-10-04 15:39:12 +0100 |
commit | b1e3177bd1d8f41e2a9cc847e56a96cdc0eefe62 (patch) | |
tree | 9af22565533f12868a015e18e51406d54773e08a /drivers/gpu/drm/i915/i915_request.c | |
parent | 274cbf20fd108fa26d0497282b102e00371210fd (diff) |
drm/i915: Coordinate i915_active with its own mutex
Forgo the struct_mutex serialisation for i915_active, and interpose its
own mutex handling for active/retire.
This is a multi-layered sleight-of-hand. First, we had to ensure that no
active/retire callbacks accidentally inverted the mutex ordering rules,
nor assumed that they were themselves serialised by struct_mutex. More
challenging though, is the rule over updating elements of the active
rbtree. Instead of the whole i915_active now being serialised by
struct_mutex, allocations/rotations of the tree are serialised by the
i915_active.mutex and individual nodes are serialised by the caller
using the i915_timeline.mutex (we need to use nested spinlocks to
interact with the dma_fence callback lists).
The pain point here is that instead of a single mutex around execbuf, we
now have to take a mutex for active tracker (one for each vma, context,
etc) and a couple of spinlocks for each fence update. The improvement in
fine grained locking allowing for multiple concurrent clients
(eventually!) should be worth it in typical loads.
v2: Add some comments that barely elucidate anything :(
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191004134015.13204-6-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/i915_request.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_request.c | 38 |
1 files changed, 2 insertions, 36 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index a8916412759b..4ffe62a42186 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -218,8 +218,6 @@ static void remove_from_engine(struct i915_request *rq) static bool i915_request_retire(struct i915_request *rq) { - struct i915_active_request *active, *next; - if (!i915_request_completed(rq)) return false; @@ -244,35 +242,6 @@ static bool i915_request_retire(struct i915_request *rq) &i915_request_timeline(rq)->requests)); rq->ring->head = rq->postfix; - /* - * Walk through the active list, calling retire on each. This allows - * objects to track their GPU activity and mark themselves as idle - * when their *last* active request is completed (updating state - * tracking lists for eviction, active references for GEM, etc). - * - * As the ->retire() may free the node, we decouple it first and - * pass along the auxiliary information (to avoid dereferencing - * the node after the callback). - */ - list_for_each_entry_safe(active, next, &rq->active_list, link) { - /* - * In microbenchmarks or focusing upon time inside the kernel, - * we may spend an inordinate amount of time simply handling - * the retirement of requests and processing their callbacks. - * Of which, this loop itself is particularly hot due to the - * cache misses when jumping around the list of - * i915_active_request. So we try to keep this loop as - * streamlined as possible and also prefetch the next - * i915_active_request to try and hide the likely cache miss. - */ - prefetchw(next); - - INIT_LIST_HEAD(&active->link); - RCU_INIT_POINTER(active->request, NULL); - - active->retire(active, rq); - } - local_irq_disable(); /* @@ -704,7 +673,6 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) rq->flags = 0; rq->execution_mask = ALL_ENGINES; - INIT_LIST_HEAD(&rq->active_list); INIT_LIST_HEAD(&rq->execute_cb); /* @@ -743,7 +711,6 @@ err_unwind: ce->ring->emit = rq->head; /* Make sure we didn't add ourselves to external state before freeing */ - GEM_BUG_ON(!list_empty(&rq->active_list)); GEM_BUG_ON(!list_empty(&rq->sched.signalers_list)); GEM_BUG_ON(!list_empty(&rq->sched.waiters_list)); @@ -1174,8 +1141,8 @@ __i915_request_add_to_timeline(struct i915_request *rq) * precludes optimising to use semaphores serialisation of a single * timeline across engines. */ - prev = rcu_dereference_protected(timeline->last_request.request, - lockdep_is_held(&timeline->mutex)); + prev = to_request(__i915_active_fence_set(&timeline->last_request, + &rq->fence)); if (prev && !i915_request_completed(prev)) { if (is_power_of_2(prev->engine->mask | rq->engine->mask)) i915_sw_fence_await_sw_fence(&rq->submit, @@ -1200,7 +1167,6 @@ __i915_request_add_to_timeline(struct i915_request *rq) * us, the timeline will hold its seqno which is later than ours. */ GEM_BUG_ON(timeline->seqno != rq->fence.seqno); - __i915_active_request_set(&timeline->last_request, rq); return prev; } |