diff options
Diffstat (limited to 'drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c')
-rw-r--r-- | drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2298 |
1 files changed, 1797 insertions, 501 deletions
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 87d8dc8f51b9..d7710debcd47 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -11,6 +11,7 @@ #include "gt/intel_context.h" #include "gt/intel_engine_pm.h" #include "gt/intel_engine_heartbeat.h" +#include "gt/intel_gpu_commands.h" #include "gt/intel_gt.h" #include "gt/intel_gt_irq.h" #include "gt/intel_gt_pm.h" @@ -28,21 +29,6 @@ /** * DOC: GuC-based command submission * - * IMPORTANT NOTE: GuC submission is currently not supported in i915. The GuC - * firmware is moving to an updated submission interface and we plan to - * turn submission back on when that lands. The below documentation (and related - * code) matches the old submission model and will be updated as part of the - * upgrade to the new flow. - * - * GuC stage descriptor: - * During initialization, the driver allocates a static pool of 1024 such - * descriptors, and shares them with the GuC. Currently, we only use one - * descriptor. This stage descriptor lets the GuC know about the workqueue and - * process descriptor. Theoretically, it also lets the GuC know about our HW - * contexts (context ID, etc...), but we actually employ a kind of submission - * where the GuC uses the LRCA sent via the work item instead. This is called - * a "proxy" submission. - * * The Scratch registers: * There are 16 MMIO-based registers start from 0xC180. The kernel driver writes * a value to the action register (SOFT_SCRATCH_0) along with any data. It then @@ -51,14 +37,85 @@ * processes the request. The kernel driver polls waiting for this update and * then proceeds. * - * Work Items: - * There are several types of work items that the host may place into a - * workqueue, each with its own requirements and limitations. 
Currently only - * WQ_TYPE_INORDER is needed to support legacy submission via GuC, which - * represents in-order queue. The kernel driver packs ring tail pointer and an - * ELSP context descriptor dword into Work Item. - * See guc_add_request() + * Command Transport buffers (CTBs): + * Covered in detail in other sections but CTBs (Host to GuC - H2G, GuC to Host + * - G2H) are a message interface between the i915 and GuC. + * + * Context registration: + * Before a context can be submitted it must be registered with the GuC via a + * H2G. A unique guc_id is associated with each context. The context is either + * registered at request creation time (normal operation) or at submission time + * (abnormal operation, e.g. after a reset). + * + * Context submission: + * The i915 updates the LRC tail value in memory. The i915 must enable the + * scheduling of the context within the GuC for the GuC to actually consider it. + * Therefore, the first time a disabled context is submitted we use a schedule + * enable H2G, while follow up submissions are done via the context submit H2G, + * which informs the GuC that a previously enabled context has new work + * available. + * + * Context unpin: + * To unpin a context a H2G is used to disable scheduling. When the + * corresponding G2H returns indicating the scheduling disable operation has + * completed it is safe to unpin the context. While a disable is in flight it + * isn't safe to resubmit the context so a fence is used to stall all future + * requests of that context until the G2H is returned. + * + * Context deregistration: + * Before a context can be destroyed or if we steal its guc_id we must + * deregister the context with the GuC via H2G. If stealing the guc_id it isn't + * safe to submit anything to this guc_id until the deregister completes so a + * fence is used to stall all requests associated with this guc_id until the + * corresponding G2H returns indicating the guc_id has been deregistered. 
+ * + * submission_state.guc_ids: + * Unique number associated with private GuC context data passed in during + * context registration / submission / deregistration. 64k available. Simple ida + * is used for allocation. + * + * Stealing guc_ids: + * If no guc_ids are available they can be stolen from another context at + * request creation time if that context is unpinned. If a guc_id can't be found + * we punt this problem to the user as we believe this is near impossible to hit + * during normal use cases. + * + * Locking: + * In the GuC submission code we have 3 basic spin locks which protect + * everything. Details about each below. + * + * sched_engine->lock + * This is the submission lock for all contexts that share an i915 schedule + * engine (sched_engine), thus only one of the contexts which share a + * sched_engine can be submitting at a time. Currently only one sched_engine is + * used for all of GuC submission but that could change in the future. + * + * guc->submission_state.lock + * Global lock for GuC submission state. Protects guc_ids and destroyed contexts + * list. + * + * ce->guc_state.lock + * Protects everything under ce->guc_state. Ensures that a context is in the + * correct state before issuing a H2G. e.g. We don't issue a schedule disable + * on a disabled context (bad idea), we don't issue a schedule enable when a + * schedule disable is in flight, etc... Also protects list of inflight requests + * on the context and the priority management state. Lock is individual to each + * context. + * + * Lock ordering rules: + * sched_engine->lock -> ce->guc_state.lock + * guc->submission_state.lock -> ce->guc_state.lock * + * Reset races: + * When a full GT reset is triggered it is assumed that some G2H responses to + * H2Gs can be lost as the GuC is also reset. Losing these G2H can prove to be + * fatal as we do certain operations upon receiving a G2H (e.g. destroy + * contexts, release guc_ids, etc...). 
When this occurs we can scrub the + * context state and clean up appropriately, however this is quite racy. + * To avoid races, the reset code must disable submission before scrubbing for + * the missing G2H, while the submission code must check for submission being + * disabled and skip sending H2Gs and updating context states when it is. Both + * sides must also make sure to hold the relevant locks. */ /* GuC Virtual Engine */ @@ -68,91 +125,56 @@ struct guc_virtual_engine { }; static struct intel_context * -guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count); +guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags); + +static struct intel_context * +guc_create_parallel(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width); #define GUC_REQUEST_SIZE 64 /* bytes */ /* - * Below is a set of functions which control the GuC scheduling state which do - * not require a lock as all state transitions are mutually exclusive. i.e. It - * is not possible for the context pinning code and submission, for the same - * context, to be executing simultaneously. We still need an atomic as it is - * possible for some of the bits to changing at the same time though. + * We reserve 1/16 of the guc_ids for multi-lrc as these need to be contiguous + * per the GuC submission interface. A different allocation algorithm is used + * (bitmap vs. ida) between multi-lrc and single-lrc hence the reason to + * partition the guc_id space. We believe the number of multi-lrc contexts in + * use should be low and 1/16 should be sufficient. Minimum of 32 guc_ids for + * multi-lrc.
*/ -#define SCHED_STATE_NO_LOCK_ENABLED BIT(0) -#define SCHED_STATE_NO_LOCK_PENDING_ENABLE BIT(1) -#define SCHED_STATE_NO_LOCK_REGISTERED BIT(2) -static inline bool context_enabled(struct intel_context *ce) -{ - return (atomic_read(&ce->guc_sched_state_no_lock) & - SCHED_STATE_NO_LOCK_ENABLED); -} - -static inline void set_context_enabled(struct intel_context *ce) -{ - atomic_or(SCHED_STATE_NO_LOCK_ENABLED, &ce->guc_sched_state_no_lock); -} - -static inline void clr_context_enabled(struct intel_context *ce) -{ - atomic_and((u32)~SCHED_STATE_NO_LOCK_ENABLED, - &ce->guc_sched_state_no_lock); -} - -static inline bool context_pending_enable(struct intel_context *ce) -{ - return (atomic_read(&ce->guc_sched_state_no_lock) & - SCHED_STATE_NO_LOCK_PENDING_ENABLE); -} - -static inline void set_context_pending_enable(struct intel_context *ce) -{ - atomic_or(SCHED_STATE_NO_LOCK_PENDING_ENABLE, - &ce->guc_sched_state_no_lock); -} - -static inline void clr_context_pending_enable(struct intel_context *ce) -{ - atomic_and((u32)~SCHED_STATE_NO_LOCK_PENDING_ENABLE, - &ce->guc_sched_state_no_lock); -} - -static inline bool context_registered(struct intel_context *ce) -{ - return (atomic_read(&ce->guc_sched_state_no_lock) & - SCHED_STATE_NO_LOCK_REGISTERED); -} - -static inline void set_context_registered(struct intel_context *ce) -{ - atomic_or(SCHED_STATE_NO_LOCK_REGISTERED, - &ce->guc_sched_state_no_lock); -} - -static inline void clr_context_registered(struct intel_context *ce) -{ - atomic_and((u32)~SCHED_STATE_NO_LOCK_REGISTERED, - &ce->guc_sched_state_no_lock); -} +#define NUMBER_MULTI_LRC_GUC_ID (GUC_MAX_LRC_DESCRIPTORS / 16) /* * Below is a set of functions which control the GuC scheduling state which - * require a lock, aside from the special case where the functions are called - * from guc_lrc_desc_pin(). In that case it isn't possible for any other code - * path to be executing on the context. + * require a lock. 
*/ #define SCHED_STATE_WAIT_FOR_DEREGISTER_TO_REGISTER BIT(0) #define SCHED_STATE_DESTROYED BIT(1) #define SCHED_STATE_PENDING_DISABLE BIT(2) #define SCHED_STATE_BANNED BIT(3) -#define SCHED_STATE_BLOCKED_SHIFT 4 +#define SCHED_STATE_ENABLED BIT(4) +#define SCHED_STATE_PENDING_ENABLE BIT(5) +#define SCHED_STATE_REGISTERED BIT(6) +#define SCHED_STATE_BLOCKED_SHIFT 7 #define SCHED_STATE_BLOCKED BIT(SCHED_STATE_BLOCKED_SHIFT) #define SCHED_STATE_BLOCKED_MASK (0xfff << SCHED_STATE_BLOCKED_SHIFT) + static inline void init_sched_state(struct intel_context *ce) { - /* Only should be called from guc_lrc_desc_pin() */ - atomic_set(&ce->guc_sched_state_no_lock, 0); - ce->guc_state.sched_state = 0; + lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state &= SCHED_STATE_BLOCKED_MASK; +} + +__maybe_unused +static bool sched_state_is_init(struct intel_context *ce) +{ + /* + * Read-only predicate: returns true when no sched_state bits are set + * other than the blocked count and SCHED_STATE_REGISTERED. It must test + * with '&' rather than '&=' so that checking the state never modifies + * it. + * + * XXX: Kernel contexts can have SCHED_STATE_REGISTERED after + * suspend. + */ + return !(ce->guc_state.sched_state & + ~(SCHED_STATE_BLOCKED_MASK | SCHED_STATE_REGISTERED)); } static inline bool @@ -165,7 +187,7 @@ context_wait_for_deregister_to_register(struct intel_context *ce) static inline void set_context_wait_for_deregister_to_register(struct intel_context *ce) { - /* Only should be called from guc_lrc_desc_pin() without lock */ + lockdep_assert_held(&ce->guc_state.lock); ce->guc_state.sched_state |= SCHED_STATE_WAIT_FOR_DEREGISTER_TO_REGISTER; } @@ -225,6 +247,57 @@ static inline void clr_context_banned(struct intel_context *ce) ce->guc_state.sched_state &= ~SCHED_STATE_BANNED; } +static inline bool context_enabled(struct intel_context *ce) +{ + return ce->guc_state.sched_state & SCHED_STATE_ENABLED; +} + +static inline void set_context_enabled(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state |= SCHED_STATE_ENABLED; +} + +static inline void clr_context_enabled(struct intel_context *ce) +{ +
lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state &= ~SCHED_STATE_ENABLED; +} + +static inline bool context_pending_enable(struct intel_context *ce) +{ + return ce->guc_state.sched_state & SCHED_STATE_PENDING_ENABLE; +} + +static inline void set_context_pending_enable(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state |= SCHED_STATE_PENDING_ENABLE; +} + +static inline void clr_context_pending_enable(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state &= ~SCHED_STATE_PENDING_ENABLE; +} + +static inline bool context_registered(struct intel_context *ce) +{ + return ce->guc_state.sched_state & SCHED_STATE_REGISTERED; +} + +static inline void set_context_registered(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state |= SCHED_STATE_REGISTERED; +} + +static inline void clr_context_registered(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + ce->guc_state.sched_state &= ~SCHED_STATE_REGISTERED; +} + static inline u32 context_blocked(struct intel_context *ce) { return (ce->guc_state.sched_state & SCHED_STATE_BLOCKED_MASK) >> @@ -233,7 +306,6 @@ static inline u32 context_blocked(struct intel_context *ce) static inline void incr_context_blocked(struct intel_context *ce) { - lockdep_assert_held(&ce->engine->sched_engine->lock); lockdep_assert_held(&ce->guc_state.lock); ce->guc_state.sched_state += SCHED_STATE_BLOCKED; @@ -243,7 +315,6 @@ static inline void incr_context_blocked(struct intel_context *ce) static inline void decr_context_blocked(struct intel_context *ce) { - lockdep_assert_held(&ce->engine->sched_engine->lock); lockdep_assert_held(&ce->guc_state.lock); GEM_BUG_ON(!context_blocked(ce)); /* Underflow check */ @@ -251,14 +322,39 @@ static inline void decr_context_blocked(struct intel_context *ce) ce->guc_state.sched_state -= SCHED_STATE_BLOCKED; } +static inline bool 
context_has_committed_requests(struct intel_context *ce) +{ + return !!ce->guc_state.number_committed_requests; +} + +static inline void incr_context_committed_requests(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + ++ce->guc_state.number_committed_requests; + GEM_BUG_ON(ce->guc_state.number_committed_requests < 0); +} + +static inline void decr_context_committed_requests(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + --ce->guc_state.number_committed_requests; + GEM_BUG_ON(ce->guc_state.number_committed_requests < 0); +} + +static struct intel_context * +request_to_scheduling_context(struct i915_request *rq) +{ + return intel_context_to_parent(rq->context); +} + static inline bool context_guc_id_invalid(struct intel_context *ce) { - return ce->guc_id == GUC_INVALID_LRC_ID; + return ce->guc_id.id == GUC_INVALID_LRC_ID; } static inline void set_context_guc_id_invalid(struct intel_context *ce) { - ce->guc_id = GUC_INVALID_LRC_ID; + ce->guc_id.id = GUC_INVALID_LRC_ID; } static inline struct intel_guc *ce_to_guc(struct intel_context *ce) @@ -271,6 +367,104 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) return rb_entry(rb, struct i915_priolist, node); } +/* + * When using multi-lrc submission a scratch memory area is reserved in the + * parent's context state for the process descriptor, work queue, and handshake + * between the parent + children contexts to insert safe preemption points + * between each of the BBs. Currently the scratch area is sized to a page. + * + * The layout of this scratch area is below: + * 0 guc_process_desc + * + sizeof(struct guc_process_desc) child go + * + CACHELINE_BYTES child join[0] + * ... + * + CACHELINE_BYTES child join[n - 1] + * ... unused + * PARENT_SCRATCH_SIZE / 2 work queue start + * ... 
work queue + * PARENT_SCRATCH_SIZE - 1 work queue end + */ +#define WQ_SIZE (PARENT_SCRATCH_SIZE / 2) +#define WQ_OFFSET (PARENT_SCRATCH_SIZE - WQ_SIZE) + +struct sync_semaphore { + u32 semaphore; + u8 unused[CACHELINE_BYTES - sizeof(u32)]; +}; + +struct parent_scratch { + struct guc_process_desc pdesc; + + struct sync_semaphore go; + struct sync_semaphore join[MAX_ENGINE_INSTANCE + 1]; + + u8 unused[WQ_OFFSET - sizeof(struct guc_process_desc) - + sizeof(struct sync_semaphore) * (MAX_ENGINE_INSTANCE + 2)]; + + u32 wq[WQ_SIZE / sizeof(u32)]; +}; + +static u32 __get_parent_scratch_offset(struct intel_context *ce) +{ + GEM_BUG_ON(!ce->parallel.guc.parent_page); + + return ce->parallel.guc.parent_page * PAGE_SIZE; +} + +static u32 __get_wq_offset(struct intel_context *ce) +{ + BUILD_BUG_ON(offsetof(struct parent_scratch, wq) != WQ_OFFSET); + + return __get_parent_scratch_offset(ce) + WQ_OFFSET; +} + +static struct parent_scratch * +__get_parent_scratch(struct intel_context *ce) +{ + BUILD_BUG_ON(sizeof(struct parent_scratch) != PARENT_SCRATCH_SIZE); + BUILD_BUG_ON(sizeof(struct sync_semaphore) != CACHELINE_BYTES); + + /* + * Need to subtract LRC_STATE_OFFSET here as the + * parallel.guc.parent_page is the offset into ce->state while + * ce->lrc_reg_state is ce->state + LRC_STATE_OFFSET. + */ + return (struct parent_scratch *) + (ce->lrc_reg_state + + ((__get_parent_scratch_offset(ce) - + LRC_STATE_OFFSET) / sizeof(u32))); +} + +static struct guc_process_desc * +__get_process_desc(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + + return &ps->pdesc; +} + +static u32 *get_wq_pointer(struct guc_process_desc *desc, + struct intel_context *ce, + u32 wqi_size) +{ + /* + * Check for space in work queue. A copy of the head pointer is cached + * in the intel_context structure in order to reduce the number of + * accesses to shared GPU memory which may be across a PCIe bus.
+ */ +#define AVAILABLE_SPACE \ + CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE) + if (wqi_size > AVAILABLE_SPACE) { + ce->parallel.guc.wqi_head = READ_ONCE(desc->head); + + if (wqi_size > AVAILABLE_SPACE) + return NULL; + } +#undef AVAILABLE_SPACE + + return &__get_parent_scratch(ce)->wq[ce->parallel.guc.wqi_tail / sizeof(u32)]; +} + static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index) { struct guc_lrc_desc *base = guc->lrc_desc_pool_vaddr; @@ -352,20 +546,29 @@ static inline void set_lrc_desc_registered(struct intel_guc *guc, u32 id, xa_unlock_irqrestore(&guc->context_lookup, flags); } +static void decr_outstanding_submission_g2h(struct intel_guc *guc) +{ + if (atomic_dec_and_test(&guc->outstanding_submission_g2h)) + wake_up_all(&guc->ct.wq); +} + static int guc_submission_send_busy_loop(struct intel_guc *guc, const u32 *action, u32 len, u32 g2h_len_dw, bool loop) { - int err; - - err = intel_guc_send_busy_loop(guc, action, len, g2h_len_dw, loop); + /* + * We always loop when a send requires a reply (i.e. g2h_len_dw > 0), + * so we don't handle the case where we don't get a reply because we + * aborted the send due to the channel being busy. 
+ */ + GEM_BUG_ON(g2h_len_dw && !loop); - if (!err && g2h_len_dw) + if (g2h_len_dw) atomic_inc(&guc->outstanding_submission_g2h); - return err; + return intel_guc_send_busy_loop(guc, action, len, g2h_len_dw, loop); } int intel_guc_wait_for_pending_msg(struct intel_guc *guc, @@ -421,15 +624,17 @@ int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout) static int guc_lrc_desc_pin(struct intel_context *ce, bool loop); -static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) +static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq) { int err = 0; - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); u32 action[3]; int len = 0; u32 g2h_len_dw = 0; bool enabled; + lockdep_assert_held(&rq->engine->sched_engine->lock); + /* * Corner case where requests were sitting in the priority list or a * request resubmitted after the context was banned. @@ -437,41 +642,34 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) if (unlikely(intel_context_is_banned(ce))) { i915_request_put(i915_request_mark_eio(rq)); intel_engine_signal_breadcrumbs(ce->engine); - goto out; + return 0; } - GEM_BUG_ON(!atomic_read(&ce->guc_id_ref)); + GEM_BUG_ON(!atomic_read(&ce->guc_id.ref)); GEM_BUG_ON(context_guc_id_invalid(ce)); - /* - * Corner case where the GuC firmware was blown away and reloaded while - * this context was pinned. - */ - if (unlikely(!lrc_desc_registered(guc, ce->guc_id))) { - err = guc_lrc_desc_pin(ce, false); - if (unlikely(err)) - goto out; - } + spin_lock(&ce->guc_state.lock); /* * The request / context will be run on the hardware when scheduling - * gets enabled in the unblock. + * gets enabled in the unblock. For multi-lrc we still submit the + * context to move the LRC tails. 
*/ - if (unlikely(context_blocked(ce))) + if (unlikely(context_blocked(ce) && !intel_context_is_parent(ce))) goto out; - enabled = context_enabled(ce); + enabled = context_enabled(ce) || context_blocked(ce); if (!enabled) { action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET; - action[len++] = ce->guc_id; + action[len++] = ce->guc_id.id; action[len++] = GUC_CONTEXT_ENABLE; set_context_pending_enable(ce); intel_context_get(ce); g2h_len_dw = G2H_LEN_DW_SCHED_CONTEXT_MODE_SET; } else { action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT; - action[len++] = ce->guc_id; + action[len++] = ce->guc_id.id; } err = intel_guc_send_nb(guc, action, len, g2h_len_dw); @@ -479,6 +677,18 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) trace_intel_context_sched_enable(ce); atomic_inc(&guc->outstanding_submission_g2h); set_context_enabled(ce); + + /* + * Without multi-lrc KMD does the submission step (moving the + * lrc tail) so enabling scheduling is sufficient to submit the + * context. This isn't the case in multi-lrc submission as the + * GuC needs to move the tails, hence the need for another H2G + * to submit a multi-lrc context after enabling scheduling. 
+ */ + if (intel_context_is_parent(ce)) { + action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT; + err = intel_guc_send_nb(guc, action, len - 1, 0); + } } else if (!enabled) { clr_context_pending_enable(ce); intel_context_put(ce); @@ -487,9 +697,22 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) trace_i915_request_guc_submit(rq); out: + spin_unlock(&ce->guc_state.lock); return err; } +static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) +{ + int ret = __guc_add_request(guc, rq); + + if (unlikely(ret == -EBUSY)) { + guc->stalled_request = rq; + guc->submission_stall_reason = STALL_ADD_REQUEST; + } + + return ret; +} + static inline void guc_set_lrc_tail(struct i915_request *rq) { rq->context->lrc_reg_state[CTX_RING_TAIL] = @@ -501,6 +724,135 @@ static inline int rq_prio(const struct i915_request *rq) return rq->sched.attr.priority; } +static bool is_multi_lrc_rq(struct i915_request *rq) +{ + return intel_context_is_parallel(rq->context); +} + +static bool can_merge_rq(struct i915_request *rq, + struct i915_request *last) +{ + return request_to_scheduling_context(rq) == + request_to_scheduling_context(last); +} + +static u32 wq_space_until_wrap(struct intel_context *ce) +{ + return (WQ_SIZE - ce->parallel.guc.wqi_tail); +} + +static void write_wqi(struct guc_process_desc *desc, + struct intel_context *ce, + u32 wqi_size) +{ + BUILD_BUG_ON(!is_power_of_2(WQ_SIZE)); + + /* + * Ensure WQI are visible before updating tail + */ + intel_guc_write_barrier(ce_to_guc(ce)); + + ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) & + (WQ_SIZE - 1); + WRITE_ONCE(desc->tail, ce->parallel.guc.wqi_tail); +} + +static int guc_wq_noop_append(struct intel_context *ce) +{ + struct guc_process_desc *desc = __get_process_desc(ce); + u32 *wqi = get_wq_pointer(desc, ce, wq_space_until_wrap(ce)); + u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1; + + if (!wqi) + return -EBUSY; + + GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw)); + 
+ *wqi = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) | + FIELD_PREP(WQ_LEN_MASK, len_dw); + ce->parallel.guc.wqi_tail = 0; + + return 0; +} + +static int __guc_wq_item_append(struct i915_request *rq) +{ + struct intel_context *ce = request_to_scheduling_context(rq); + struct intel_context *child; + struct guc_process_desc *desc = __get_process_desc(ce); + unsigned int wqi_size = (ce->parallel.number_children + 4) * + sizeof(u32); + u32 *wqi; + u32 len_dw = (wqi_size / sizeof(u32)) - 1; + int ret; + + /* Ensure context is in correct state updating work queue */ + GEM_BUG_ON(!atomic_read(&ce->guc_id.ref)); + GEM_BUG_ON(context_guc_id_invalid(ce)); + GEM_BUG_ON(context_wait_for_deregister_to_register(ce)); + GEM_BUG_ON(!lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id)); + + /* Insert NOOP if this work queue item will wrap the tail pointer. */ + if (wqi_size > wq_space_until_wrap(ce)) { + ret = guc_wq_noop_append(ce); + if (ret) + return ret; + } + + wqi = get_wq_pointer(desc, ce, wqi_size); + if (!wqi) + return -EBUSY; + + GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw)); + + *wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) | + FIELD_PREP(WQ_LEN_MASK, len_dw); + *wqi++ = ce->lrc.lrca; + *wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) | + FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64)); + *wqi++ = 0; /* fence_id */ + for_each_child(ce, child) + *wqi++ = child->ring->tail / sizeof(u64); + + write_wqi(desc, ce, wqi_size); + + return 0; +} + +static int guc_wq_item_append(struct intel_guc *guc, + struct i915_request *rq) +{ + struct intel_context *ce = request_to_scheduling_context(rq); + int ret = 0; + + if (likely(!intel_context_is_banned(ce))) { + ret = __guc_wq_item_append(rq); + + if (unlikely(ret == -EBUSY)) { + guc->stalled_request = rq; + guc->submission_stall_reason = STALL_MOVE_LRC_TAIL; + } + } + + return ret; +} + +static bool multi_lrc_submit(struct i915_request *rq) +{ + struct intel_context *ce = request_to_scheduling_context(rq); + + 
intel_ring_set_tail(rq->ring, rq->tail); + + /* + * We expect the front end (execbuf IOCTL) to set this flag on the last + * request generated from a multi-BB submission. This indicates to the + * backend (GuC interface) that we should submit this context thus + * submitting all the requests generated in parallel. + */ + return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) || + intel_context_is_banned(ce); +} + static int guc_dequeue_one_context(struct intel_guc *guc) { struct i915_sched_engine * const sched_engine = guc->sched_engine; @@ -514,7 +866,17 @@ static int guc_dequeue_one_context(struct intel_guc *guc) if (guc->stalled_request) { submit = true; last = guc->stalled_request; - goto resubmit; + + switch (guc->submission_stall_reason) { + case STALL_REGISTER_CONTEXT: + goto register_context; + case STALL_MOVE_LRC_TAIL: + goto move_lrc_tail; + case STALL_ADD_REQUEST: + goto add_request; + default: + MISSING_CASE(guc->submission_stall_reason); + } } while ((rb = rb_first_cached(&sched_engine->queue))) { @@ -522,8 +884,8 @@ static int guc_dequeue_one_context(struct intel_guc *guc) struct i915_request *rq, *rn; priolist_for_each_request_consume(rq, rn, p) { - if (last && rq->context != last->context) - goto done; + if (last && !can_merge_rq(rq, last)) + goto register_context; list_del_init(&rq->sched.link); @@ -531,33 +893,84 @@ static int guc_dequeue_one_context(struct intel_guc *guc) trace_i915_request_in(rq, 0); last = rq; - submit = true; + + if (is_multi_lrc_rq(rq)) { + /* + * We need to coalesce all multi-lrc requests in + * a relationship into a single H2G. We are + * guaranteed that all of these requests will be + * submitted sequentially. 
+ */ + if (multi_lrc_submit(rq)) { + submit = true; + goto register_context; + } + } else { + submit = true; + } } rb_erase_cached(&p->node, &sched_engine->queue); i915_priolist_free(p); } -done: + +register_context: if (submit) { - guc_set_lrc_tail(last); -resubmit: + struct intel_context *ce = request_to_scheduling_context(last); + + if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id) && + !intel_context_is_banned(ce))) { + ret = guc_lrc_desc_pin(ce, false); + if (unlikely(ret == -EPIPE)) { + goto deadlk; + } else if (ret == -EBUSY) { + guc->stalled_request = last; + guc->submission_stall_reason = + STALL_REGISTER_CONTEXT; + goto schedule_tasklet; + } else if (ret != 0) { + GEM_WARN_ON(ret); /* Unexpected */ + goto deadlk; + } + } + +move_lrc_tail: + if (is_multi_lrc_rq(last)) { + ret = guc_wq_item_append(guc, last); + if (ret == -EBUSY) { + goto schedule_tasklet; + } else if (ret != 0) { + GEM_WARN_ON(ret); /* Unexpected */ + goto deadlk; + } + } else { + guc_set_lrc_tail(last); + } + +add_request: ret = guc_add_request(guc, last); - if (unlikely(ret == -EPIPE)) + if (unlikely(ret == -EPIPE)) { + goto deadlk; + } else if (ret == -EBUSY) { + goto schedule_tasklet; + } else if (ret != 0) { + GEM_WARN_ON(ret); /* Unexpected */ goto deadlk; - else if (ret == -EBUSY) { - tasklet_schedule(&sched_engine->tasklet); - guc->stalled_request = last; - return false; } } guc->stalled_request = NULL; + guc->submission_stall_reason = STALL_NONE; return submit; deadlk: sched_engine->tasklet.callback = NULL; tasklet_disable_nosync(&sched_engine->tasklet); return false; + +schedule_tasklet: + tasklet_schedule(&sched_engine->tasklet); + return false; } static void guc_submission_tasklet(struct tasklet_struct *t) @@ -596,10 +1009,18 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) unsigned long index, flags; bool pending_disable, pending_enable, deregister, destroyed, banned; + xa_lock_irqsave(&guc->context_lookup, flags); xa_for_each(&guc->context_lookup, 
index, ce) { - /* Flush context */ - spin_lock_irqsave(&ce->guc_state.lock, flags); - spin_unlock_irqrestore(&ce->guc_state.lock, flags); + /* + * Corner case where the ref count on the object is zero but the + * deregister G2H was lost. In this case we don't touch the ref + * count and finish the destroy of the context. + */ + bool do_put = kref_get_unless_zero(&ce->ref); + + xa_unlock(&guc->context_lookup); + + spin_lock(&ce->guc_state.lock); /* * Once we are at this point submission_disabled() is guaranteed @@ -615,11 +1036,16 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) banned = context_banned(ce); init_sched_state(ce); + spin_unlock(&ce->guc_state.lock); + + GEM_BUG_ON(!do_put && !destroyed); + if (pending_enable || destroyed || deregister) { - atomic_dec(&guc->outstanding_submission_g2h); + decr_outstanding_submission_g2h(guc); if (deregister) guc_signal_context_fence(ce); if (destroyed) { + intel_gt_pm_put_async(guc_to_gt(guc)); release_guc_id(guc, ce); __guc_context_destroy(ce); } @@ -635,14 +1061,20 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) intel_engine_signal_breadcrumbs(ce->engine); } intel_context_sched_disable_unpin(ce); - atomic_dec(&guc->outstanding_submission_g2h); - spin_lock_irqsave(&ce->guc_state.lock, flags); + decr_outstanding_submission_g2h(guc); + + spin_lock(&ce->guc_state.lock); guc_blocked_fence_complete(ce); - spin_unlock_irqrestore(&ce->guc_state.lock, flags); + spin_unlock(&ce->guc_state.lock); intel_context_put(ce); } + + if (do_put) + intel_context_put(ce); + xa_lock(&guc->context_lookup); } + xa_unlock_irqrestore(&guc->context_lookup, flags); } static inline bool @@ -692,6 +1124,8 @@ static void guc_flush_submissions(struct intel_guc *guc) spin_unlock_irqrestore(&sched_engine->lock, flags); } +static void guc_flush_destroyed_contexts(struct intel_guc *guc); + void intel_guc_submission_reset_prepare(struct intel_guc *guc) { int i; @@ -710,6 +1144,7 @@ void
intel_guc_submission_reset_prepare(struct intel_guc *guc) spin_unlock_irq(&guc_to_gt(guc)->irq_lock); guc_flush_submissions(guc); + guc_flush_destroyed_contexts(guc); /* * Handle any outstanding G2Hs before reset. Call IRQ handler directly @@ -725,6 +1160,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc) wait_for_reset(guc, &guc->outstanding_submission_g2h); } while (!list_empty(&guc->ct.requests.incoming)); } + scrub_guc_desc_for_outstanding_g2h(guc); } @@ -796,16 +1232,14 @@ __unwind_incomplete_requests(struct intel_context *ce) unsigned long flags; spin_lock_irqsave(&sched_engine->lock, flags); - spin_lock(&ce->guc_active.lock); - list_for_each_entry_safe(rq, rn, - &ce->guc_active.requests, - sched.link) { + spin_lock(&ce->guc_state.lock); + list_for_each_entry_safe_reverse(rq, rn, + &ce->guc_state.requests, + sched.link) { if (i915_request_completed(rq)) continue; list_del_init(&rq->sched.link); - spin_unlock(&ce->guc_active.lock); - __i915_request_unsubmit(rq); /* Push the request back into the queue for later resubmission. */ @@ -816,64 +1250,111 @@ __unwind_incomplete_requests(struct intel_context *ce) } GEM_BUG_ON(i915_sched_engine_is_empty(sched_engine)); - list_add_tail(&rq->sched.link, pl); + list_add(&rq->sched.link, pl); set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); - - spin_lock(&ce->guc_active.lock); } - spin_unlock(&ce->guc_active.lock); + spin_unlock(&ce->guc_state.lock); spin_unlock_irqrestore(&sched_engine->lock, flags); } static void __guc_reset_context(struct intel_context *ce, bool stalled) { + bool local_stalled; struct i915_request *rq; + unsigned long flags; u32 head; + int i, number_children = ce->parallel.number_children; + bool skip = false; + struct intel_context *parent = ce; + + GEM_BUG_ON(intel_context_is_child(ce)); intel_context_get(ce); /* - * GuC will implicitly mark the context as non-schedulable - * when it sends the reset notification. Make sure our state - * reflects this change. 
The context will be marked enabled - * on resubmission. + * GuC will implicitly mark the context as non-schedulable when it sends + * the reset notification. Make sure our state reflects this change. The + * context will be marked enabled on resubmission. + * + * XXX: If the context is reset as a result of the request cancellation + * this G2H is received after the schedule disable complete G2H which is + * wrong as this creates a race between the request cancellation code + * re-submitting the context and this G2H handler. This is a bug in the + * GuC but can be worked around in the meantime but converting this to a + * NOP if a pending enable is in flight as this indicates that a request + * cancellation has occurred. */ - clr_context_enabled(ce); + spin_lock_irqsave(&ce->guc_state.lock, flags); + if (likely(!context_pending_enable(ce))) + clr_context_enabled(ce); + else + skip = true; + spin_unlock_irqrestore(&ce->guc_state.lock, flags); + if (unlikely(skip)) + goto out_put; - rq = intel_context_find_active_request(ce); - if (!rq) { - head = ce->ring->tail; - stalled = false; - goto out_replay; - } + /* + * For each context in the relationship find the hanging request + * resetting each context / request as needed + */ + for (i = 0; i < number_children + 1; ++i) { + if (!intel_context_is_pinned(ce)) + goto next_context; + + local_stalled = false; + rq = intel_context_find_active_request(ce); + if (!rq) { + head = ce->ring->tail; + goto out_replay; + } - if (!i915_request_started(rq)) - stalled = false; + if (i915_request_started(rq)) + local_stalled = true; - GEM_BUG_ON(i915_active_is_idle(&ce->active)); - head = intel_ring_wrap(ce->ring, rq->head); - __i915_request_reset(rq, stalled); + GEM_BUG_ON(i915_active_is_idle(&ce->active)); + head = intel_ring_wrap(ce->ring, rq->head); + __i915_request_reset(rq, local_stalled && stalled); out_replay: - guc_reset_state(ce, head, stalled); - __unwind_incomplete_requests(ce); - intel_context_put(ce); + guc_reset_state(ce, 
head, local_stalled && stalled); +next_context: + if (i != number_children) + ce = list_next_entry(ce, parallel.child_link); + } + + __unwind_incomplete_requests(parent); +out_put: + intel_context_put(parent); } void intel_guc_submission_reset(struct intel_guc *guc, bool stalled) { struct intel_context *ce; unsigned long index; + unsigned long flags; if (unlikely(!guc_submission_initialized(guc))) { /* Reset called during driver load? GuC not yet initialised! */ return; } - xa_for_each(&guc->context_lookup, index, ce) - if (intel_context_is_pinned(ce)) + xa_lock_irqsave(&guc->context_lookup, flags); + xa_for_each(&guc->context_lookup, index, ce) { + if (!kref_get_unless_zero(&ce->ref)) + continue; + + xa_unlock(&guc->context_lookup); + + if (intel_context_is_pinned(ce) && + !intel_context_is_child(ce)) __guc_reset_context(ce, stalled); + intel_context_put(ce); + + xa_lock(&guc->context_lookup); + } + xa_unlock_irqrestore(&guc->context_lookup, flags); + /* GuC is blown away, drop all references to contexts */ xa_destroy(&guc->context_lookup); } @@ -886,10 +1367,10 @@ static void guc_cancel_context_requests(struct intel_context *ce) /* Mark all executing requests as skipped. 
*/ spin_lock_irqsave(&sched_engine->lock, flags); - spin_lock(&ce->guc_active.lock); - list_for_each_entry(rq, &ce->guc_active.requests, sched.link) + spin_lock(&ce->guc_state.lock); + list_for_each_entry(rq, &ce->guc_state.requests, sched.link) i915_request_put(i915_request_mark_eio(rq)); - spin_unlock(&ce->guc_active.lock); + spin_unlock(&ce->guc_state.lock); spin_unlock_irqrestore(&sched_engine->lock, flags); } @@ -948,11 +1429,25 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc) { struct intel_context *ce; unsigned long index; + unsigned long flags; + + xa_lock_irqsave(&guc->context_lookup, flags); + xa_for_each(&guc->context_lookup, index, ce) { + if (!kref_get_unless_zero(&ce->ref)) + continue; - xa_for_each(&guc->context_lookup, index, ce) - if (intel_context_is_pinned(ce)) + xa_unlock(&guc->context_lookup); + + if (intel_context_is_pinned(ce) && + !intel_context_is_child(ce)) guc_cancel_context_requests(ce); + intel_context_put(ce); + + xa_lock(&guc->context_lookup); + } + xa_unlock_irqrestore(&guc->context_lookup, flags); + guc_cancel_sched_engine_requests(guc->sched_engine); /* GuC is blown away, drop all references to contexts */ @@ -981,6 +1476,8 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc) intel_gt_unpark_heartbeats(guc_to_gt(guc)); } +static void destroyed_worker_func(struct work_struct *w); + /* * Set up the memory resources to be shared with the GuC (via the GGTT) * at firmware loading time. 
@@ -1003,9 +1500,17 @@ int intel_guc_submission_init(struct intel_guc *guc) xa_init_flags(&guc->context_lookup, XA_FLAGS_LOCK_IRQ); - spin_lock_init(&guc->contexts_lock); - INIT_LIST_HEAD(&guc->guc_id_list); - ida_init(&guc->guc_ids); + spin_lock_init(&guc->submission_state.lock); + INIT_LIST_HEAD(&guc->submission_state.guc_id_list); + ida_init(&guc->submission_state.guc_ids); + INIT_LIST_HEAD(&guc->submission_state.destroyed_contexts); + INIT_WORK(&guc->submission_state.destroyed_worker, + destroyed_worker_func); + + guc->submission_state.guc_ids_bitmap = + bitmap_zalloc(NUMBER_MULTI_LRC_GUC_ID, GFP_KERNEL); + if (!guc->submission_state.guc_ids_bitmap) + return -ENOMEM; return 0; } @@ -1015,8 +1520,10 @@ void intel_guc_submission_fini(struct intel_guc *guc) if (!guc->lrc_desc_pool) return; + guc_flush_destroyed_contexts(guc); guc_lrc_desc_pool_destroy(guc); i915_sched_engine_put(guc->sched_engine); + bitmap_free(guc->submission_state.guc_ids_bitmap); } static inline void queue_request(struct i915_sched_engine *sched_engine, @@ -1027,21 +1534,28 @@ static inline void queue_request(struct i915_sched_engine *sched_engine, list_add_tail(&rq->sched.link, i915_sched_lookup_priolist(sched_engine, prio)); set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + tasklet_hi_schedule(&sched_engine->tasklet); } static int guc_bypass_tasklet_submit(struct intel_guc *guc, struct i915_request *rq) { - int ret; + int ret = 0; __i915_request_submit(rq); trace_i915_request_in(rq, 0); - guc_set_lrc_tail(rq); - ret = guc_add_request(guc, rq); - if (ret == -EBUSY) - guc->stalled_request = rq; + if (is_multi_lrc_rq(rq)) { + if (multi_lrc_submit(rq)) { + ret = guc_wq_item_append(guc, rq); + if (!ret) + ret = guc_add_request(guc, rq); + } + } else { + guc_set_lrc_tail(rq); + ret = guc_add_request(guc, rq); + } if (unlikely(ret == -EPIPE)) disable_submission(guc); @@ -1049,6 +1563,16 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc, return ret; } +static bool need_tasklet(struct 
intel_guc *guc, struct i915_request *rq) +{ + struct i915_sched_engine *sched_engine = rq->engine->sched_engine; + struct intel_context *ce = request_to_scheduling_context(rq); + + return submission_disabled(guc) || guc->stalled_request || + !i915_sched_engine_is_empty(sched_engine) || + !lrc_desc_registered(guc, ce->guc_id.id); +} + static void guc_submit_request(struct i915_request *rq) { struct i915_sched_engine *sched_engine = rq->engine->sched_engine; @@ -1058,8 +1582,7 @@ static void guc_submit_request(struct i915_request *rq) /* Will be called from irq-context when using foreign fences. */ spin_lock_irqsave(&sched_engine->lock, flags); - if (submission_disabled(guc) || guc->stalled_request || - !i915_sched_engine_is_empty(sched_engine)) + if (need_tasklet(guc, rq)) queue_request(sched_engine, rq, rq_prio(rq)); else if (guc_bypass_tasklet_submit(guc, rq) == -EBUSY) tasklet_hi_schedule(&sched_engine->tasklet); @@ -1067,72 +1590,117 @@ static void guc_submit_request(struct i915_request *rq) spin_unlock_irqrestore(&sched_engine->lock, flags); } -static int new_guc_id(struct intel_guc *guc) +static int new_guc_id(struct intel_guc *guc, struct intel_context *ce) { - return ida_simple_get(&guc->guc_ids, 0, - GUC_MAX_LRC_DESCRIPTORS, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | __GFP_NOWARN); + int ret; + + GEM_BUG_ON(intel_context_is_child(ce)); + + if (intel_context_is_parent(ce)) + ret = bitmap_find_free_region(guc->submission_state.guc_ids_bitmap, + NUMBER_MULTI_LRC_GUC_ID, + order_base_2(ce->parallel.number_children + + 1)); + else + ret = ida_simple_get(&guc->submission_state.guc_ids, + NUMBER_MULTI_LRC_GUC_ID, + GUC_MAX_LRC_DESCRIPTORS, + GFP_KERNEL | __GFP_RETRY_MAYFAIL | + __GFP_NOWARN); + if (unlikely(ret < 0)) + return ret; + + ce->guc_id.id = ret; + return 0; } static void __release_guc_id(struct intel_guc *guc, struct intel_context *ce) { + GEM_BUG_ON(intel_context_is_child(ce)); + if (!context_guc_id_invalid(ce)) { - ida_simple_remove(&guc->guc_ids, 
ce->guc_id); - reset_lrc_desc(guc, ce->guc_id); + if (intel_context_is_parent(ce)) + bitmap_release_region(guc->submission_state.guc_ids_bitmap, + ce->guc_id.id, + order_base_2(ce->parallel.number_children + + 1)); + else + ida_simple_remove(&guc->submission_state.guc_ids, + ce->guc_id.id); + reset_lrc_desc(guc, ce->guc_id.id); set_context_guc_id_invalid(ce); } - if (!list_empty(&ce->guc_id_link)) - list_del_init(&ce->guc_id_link); + if (!list_empty(&ce->guc_id.link)) + list_del_init(&ce->guc_id.link); } static void release_guc_id(struct intel_guc *guc, struct intel_context *ce) { unsigned long flags; - spin_lock_irqsave(&guc->contexts_lock, flags); + spin_lock_irqsave(&guc->submission_state.lock, flags); __release_guc_id(guc, ce); - spin_unlock_irqrestore(&guc->contexts_lock, flags); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); } -static int steal_guc_id(struct intel_guc *guc) +static int steal_guc_id(struct intel_guc *guc, struct intel_context *ce) { - struct intel_context *ce; - int guc_id; + struct intel_context *cn; - lockdep_assert_held(&guc->contexts_lock); + lockdep_assert_held(&guc->submission_state.lock); + GEM_BUG_ON(intel_context_is_child(ce)); + GEM_BUG_ON(intel_context_is_parent(ce)); - if (!list_empty(&guc->guc_id_list)) { - ce = list_first_entry(&guc->guc_id_list, + if (!list_empty(&guc->submission_state.guc_id_list)) { + cn = list_first_entry(&guc->submission_state.guc_id_list, struct intel_context, - guc_id_link); + guc_id.link); - GEM_BUG_ON(atomic_read(&ce->guc_id_ref)); - GEM_BUG_ON(context_guc_id_invalid(ce)); + GEM_BUG_ON(atomic_read(&cn->guc_id.ref)); + GEM_BUG_ON(context_guc_id_invalid(cn)); + GEM_BUG_ON(intel_context_is_child(cn)); + GEM_BUG_ON(intel_context_is_parent(cn)); - list_del_init(&ce->guc_id_link); - guc_id = ce->guc_id; - clr_context_registered(ce); - set_context_guc_id_invalid(ce); - return guc_id; + list_del_init(&cn->guc_id.link); + ce->guc_id = cn->guc_id; + + spin_lock(&ce->guc_state.lock); + 
clr_context_registered(cn); + spin_unlock(&ce->guc_state.lock); + + set_context_guc_id_invalid(cn); + + return 0; } else { return -EAGAIN; } } -static int assign_guc_id(struct intel_guc *guc, u16 *out) +static int assign_guc_id(struct intel_guc *guc, struct intel_context *ce) { int ret; - lockdep_assert_held(&guc->contexts_lock); + lockdep_assert_held(&guc->submission_state.lock); + GEM_BUG_ON(intel_context_is_child(ce)); - ret = new_guc_id(guc); + ret = new_guc_id(guc, ce); if (unlikely(ret < 0)) { - ret = steal_guc_id(guc); + if (intel_context_is_parent(ce)) + return -ENOSPC; + + ret = steal_guc_id(guc, ce); if (ret < 0) return ret; } - *out = ret; + if (intel_context_is_parent(ce)) { + struct intel_context *child; + int i = 1; + + for_each_child(ce, child) + child->guc_id.id = ce->guc_id.id + i++; + } + return 0; } @@ -1142,26 +1710,28 @@ static int pin_guc_id(struct intel_guc *guc, struct intel_context *ce) int ret = 0; unsigned long flags, tries = PIN_GUC_ID_TRIES; - GEM_BUG_ON(atomic_read(&ce->guc_id_ref)); + GEM_BUG_ON(atomic_read(&ce->guc_id.ref)); try_again: - spin_lock_irqsave(&guc->contexts_lock, flags); + spin_lock_irqsave(&guc->submission_state.lock, flags); + + might_lock(&ce->guc_state.lock); if (context_guc_id_invalid(ce)) { - ret = assign_guc_id(guc, &ce->guc_id); + ret = assign_guc_id(guc, ce); if (ret) goto out_unlock; ret = 1; /* Indidcates newly assigned guc_id */ } - if (!list_empty(&ce->guc_id_link)) - list_del_init(&ce->guc_id_link); - atomic_inc(&ce->guc_id_ref); + if (!list_empty(&ce->guc_id.link)) + list_del_init(&ce->guc_id.link); + atomic_inc(&ce->guc_id.ref); out_unlock: - spin_unlock_irqrestore(&guc->contexts_lock, flags); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); /* - * -EAGAIN indicates no guc_ids are available, let's retire any + * -EAGAIN indicates no guc_id are available, let's retire any * outstanding requests to see if that frees up a guc_id. 
If the first * retire didn't help, insert a sleep with the timeslice duration before * attempting to retire more requests. Double the sleep period each @@ -1189,16 +1759,43 @@ static void unpin_guc_id(struct intel_guc *guc, struct intel_context *ce) { unsigned long flags; - GEM_BUG_ON(atomic_read(&ce->guc_id_ref) < 0); + GEM_BUG_ON(atomic_read(&ce->guc_id.ref) < 0); + GEM_BUG_ON(intel_context_is_child(ce)); - if (unlikely(context_guc_id_invalid(ce))) + if (unlikely(context_guc_id_invalid(ce) || + intel_context_is_parent(ce))) return; - spin_lock_irqsave(&guc->contexts_lock, flags); - if (!context_guc_id_invalid(ce) && list_empty(&ce->guc_id_link) && - !atomic_read(&ce->guc_id_ref)) - list_add_tail(&ce->guc_id_link, &guc->guc_id_list); - spin_unlock_irqrestore(&guc->contexts_lock, flags); + spin_lock_irqsave(&guc->submission_state.lock, flags); + if (!context_guc_id_invalid(ce) && list_empty(&ce->guc_id.link) && + !atomic_read(&ce->guc_id.ref)) + list_add_tail(&ce->guc_id.link, + &guc->submission_state.guc_id_list); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static int __guc_action_register_multi_lrc(struct intel_guc *guc, + struct intel_context *ce, + u32 guc_id, + u32 offset, + bool loop) +{ + struct intel_context *child; + u32 action[4 + MAX_ENGINE_INSTANCE]; + int len = 0; + + GEM_BUG_ON(ce->parallel.number_children > MAX_ENGINE_INSTANCE); + + action[len++] = INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC; + action[len++] = guc_id; + action[len++] = ce->parallel.number_children + 1; + action[len++] = offset; + for_each_child(ce, child) { + offset += sizeof(struct guc_lrc_desc); + action[len++] = offset; + } + + return guc_submission_send_busy_loop(guc, action, len, 0, loop); } static int __guc_action_register_context(struct intel_guc *guc, @@ -1220,21 +1817,31 @@ static int register_context(struct intel_context *ce, bool loop) { struct intel_guc *guc = ce_to_guc(ce); u32 offset = intel_guc_ggtt_offset(guc, guc->lrc_desc_pool) + - ce->guc_id 
* sizeof(struct guc_lrc_desc); + ce->guc_id.id * sizeof(struct guc_lrc_desc); int ret; + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_register(ce); - ret = __guc_action_register_context(guc, ce->guc_id, offset, loop); - if (likely(!ret)) + if (intel_context_is_parent(ce)) + ret = __guc_action_register_multi_lrc(guc, ce, ce->guc_id.id, + offset, loop); + else + ret = __guc_action_register_context(guc, ce->guc_id.id, offset, + loop); + if (likely(!ret)) { + unsigned long flags; + + spin_lock_irqsave(&ce->guc_state.lock, flags); set_context_registered(ce); + spin_unlock_irqrestore(&ce->guc_state.lock, flags); + } return ret; } static int __guc_action_deregister_context(struct intel_guc *guc, - u32 guc_id, - bool loop) + u32 guc_id) { u32 action[] = { INTEL_GUC_ACTION_DEREGISTER_CONTEXT, @@ -1243,33 +1850,38 @@ static int __guc_action_deregister_context(struct intel_guc *guc, return guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), G2H_LEN_DW_DEREGISTER_CONTEXT, - loop); + true); } -static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop) +static int deregister_context(struct intel_context *ce, u32 guc_id) { struct intel_guc *guc = ce_to_guc(ce); + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_deregister(ce); - return __guc_action_deregister_context(guc, guc_id, loop); + return __guc_action_deregister_context(guc, guc_id); } -static intel_engine_mask_t adjust_engine_mask(u8 class, intel_engine_mask_t mask) +static inline void clear_children_join_go_memory(struct intel_context *ce) { - switch (class) { - case RENDER_CLASS: - return mask >> RCS0; - case VIDEO_ENHANCEMENT_CLASS: - return mask >> VECS0; - case VIDEO_DECODE_CLASS: - return mask >> VCS0; - case COPY_ENGINE_CLASS: - return mask >> BCS0; - default: - MISSING_CASE(class); - return 0; - } + struct parent_scratch *ps = __get_parent_scratch(ce); + int i; + + ps->go.semaphore = 0; + for (i = 0; i < ce->parallel.number_children + 1; ++i) + 
ps->join[i].semaphore = 0; +} + +static inline u32 get_children_go_value(struct intel_context *ce) +{ + return __get_parent_scratch(ce)->go.semaphore; +} + +static inline u32 get_children_join_value(struct intel_context *ce, + u8 child_index) +{ + return __get_parent_scratch(ce)->join[child_index].semaphore; } static void guc_context_policy_init(struct intel_engine_cs *engine, @@ -1285,22 +1897,20 @@ static void guc_context_policy_init(struct intel_engine_cs *engine, desc->preemption_timeout = engine->props.preempt_timeout_ms * 1000; } -static inline u8 map_i915_prio_to_guc_prio(int prio); - static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) { struct intel_engine_cs *engine = ce->engine; struct intel_runtime_pm *runtime_pm = engine->uncore->rpm; struct intel_guc *guc = &engine->gt->uc.guc; - u32 desc_idx = ce->guc_id; + u32 desc_idx = ce->guc_id.id; struct guc_lrc_desc *desc; - const struct i915_gem_context *ctx; - int prio = I915_CONTEXT_DEFAULT_PRIORITY; bool context_registered; intel_wakeref_t wakeref; + struct intel_context *child; int ret = 0; GEM_BUG_ON(!engine->mask); + GEM_BUG_ON(!sched_state_is_init(ce)); /* * Ensure LRC + CT vmas are is same region as write barrier is done @@ -1311,25 +1921,53 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) context_registered = lrc_desc_registered(guc, desc_idx); - rcu_read_lock(); - ctx = rcu_dereference(ce->gem_context); - if (ctx) - prio = ctx->sched.priority; - rcu_read_unlock(); - reset_lrc_desc(guc, desc_idx); set_lrc_desc_registered(guc, desc_idx, ce); desc = __get_lrc_desc(guc, desc_idx); desc->engine_class = engine_class_to_guc_class(engine->class); - desc->engine_submit_mask = adjust_engine_mask(engine->class, - engine->mask); + desc->engine_submit_mask = engine->logical_mask; desc->hw_context_desc = ce->lrc.lrca; - ce->guc_prio = map_i915_prio_to_guc_prio(prio); - desc->priority = ce->guc_prio; + desc->priority = ce->guc_state.prio; desc->context_flags = 
CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); - init_sched_state(ce); + + /* + * If context is a parent, we need to register a process descriptor + * describing a work queue and register all child contexts. + */ + if (intel_context_is_parent(ce)) { + struct guc_process_desc *pdesc; + + ce->parallel.guc.wqi_tail = 0; + ce->parallel.guc.wqi_head = 0; + + desc->process_desc = i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce); + desc->wq_addr = i915_ggtt_offset(ce->state) + + __get_wq_offset(ce); + desc->wq_size = WQ_SIZE; + + pdesc = __get_process_desc(ce); + memset(pdesc, 0, sizeof(*(pdesc))); + pdesc->stage_id = ce->guc_id.id; + pdesc->wq_base_addr = desc->wq_addr; + pdesc->wq_size_bytes = desc->wq_size; + pdesc->wq_status = WQ_STATUS_ACTIVE; + + for_each_child(ce, child) { + desc = __get_lrc_desc(guc, child->guc_id.id); + + desc->engine_class = + engine_class_to_guc_class(engine->class); + desc->hw_context_desc = child->lrc.lrca; + desc->priority = ce->guc_state.prio; + desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; + guc_context_policy_init(engine, desc); + } + + clear_children_join_go_memory(ce); + } /* * The context_lookup xarray is used to determine if the hardware @@ -1340,26 +1978,23 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) * registering this context. 
*/ if (context_registered) { + bool disabled; + unsigned long flags; + trace_intel_context_steal_guc_id(ce); - if (!loop) { + GEM_BUG_ON(!loop); + + /* Seal race with Reset */ + spin_lock_irqsave(&ce->guc_state.lock, flags); + disabled = submission_disabled(guc); + if (likely(!disabled)) { set_context_wait_for_deregister_to_register(ce); intel_context_get(ce); - } else { - bool disabled; - unsigned long flags; - - /* Seal race with Reset */ - spin_lock_irqsave(&ce->guc_state.lock, flags); - disabled = submission_disabled(guc); - if (likely(!disabled)) { - set_context_wait_for_deregister_to_register(ce); - intel_context_get(ce); - } - spin_unlock_irqrestore(&ce->guc_state.lock, flags); - if (unlikely(disabled)) { - reset_lrc_desc(guc, desc_idx); - return 0; /* Will get registered later */ - } + } + spin_unlock_irqrestore(&ce->guc_state.lock, flags); + if (unlikely(disabled)) { + reset_lrc_desc(guc, desc_idx); + return 0; /* Will get registered later */ } /* @@ -1367,20 +2002,18 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) * context whose guc_id was stolen. 
*/ with_intel_runtime_pm(runtime_pm, wakeref) - ret = deregister_context(ce, ce->guc_id, loop); - if (unlikely(ret == -EBUSY)) { - clr_context_wait_for_deregister_to_register(ce); - intel_context_put(ce); - } else if (unlikely(ret == -ENODEV)) { + ret = deregister_context(ce, ce->guc_id.id); + if (unlikely(ret == -ENODEV)) ret = 0; /* Will get registered later */ - } } else { with_intel_runtime_pm(runtime_pm, wakeref) ret = register_context(ce, loop); - if (unlikely(ret == -EBUSY)) + if (unlikely(ret == -EBUSY)) { + reset_lrc_desc(guc, desc_idx); + } else if (unlikely(ret == -ENODEV)) { reset_lrc_desc(guc, desc_idx); - else if (unlikely(ret == -ENODEV)) ret = 0; /* Will get registered later */ + } } return ret; @@ -1419,7 +2052,12 @@ static int guc_context_pre_pin(struct intel_context *ce, static int guc_context_pin(struct intel_context *ce, void *vaddr) { - return __guc_context_pin(ce, ce->engine, vaddr); + int ret = __guc_context_pin(ce, ce->engine, vaddr); + + if (likely(!ret && !intel_context_is_barrier(ce))) + intel_engine_pm_get(ce->engine); + + return ret; } static void guc_context_unpin(struct intel_context *ce) @@ -1428,6 +2066,9 @@ static void guc_context_unpin(struct intel_context *ce) unpin_guc_id(guc, ce); lrc_unpin(ce); + + if (likely(!intel_context_is_barrier(ce))) + intel_engine_pm_put_async(ce->engine); } static void guc_context_post_unpin(struct intel_context *ce) @@ -1440,7 +2081,7 @@ static void __guc_context_sched_enable(struct intel_guc *guc, { u32 action[] = { INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET, - ce->guc_id, + ce->guc_id.id, GUC_CONTEXT_ENABLE }; @@ -1456,12 +2097,13 @@ static void __guc_context_sched_disable(struct intel_guc *guc, { u32 action[] = { INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET, - guc_id, /* ce->guc_id not stable */ + guc_id, /* ce->guc_id.id not stable */ GUC_CONTEXT_DISABLE }; GEM_BUG_ON(guc_id == GUC_INVALID_LRC_ID); + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_sched_disable(ce); 
guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), @@ -1472,24 +2114,24 @@ static void guc_blocked_fence_complete(struct intel_context *ce) { lockdep_assert_held(&ce->guc_state.lock); - if (!i915_sw_fence_done(&ce->guc_blocked)) - i915_sw_fence_complete(&ce->guc_blocked); + if (!i915_sw_fence_done(&ce->guc_state.blocked)) + i915_sw_fence_complete(&ce->guc_state.blocked); } static void guc_blocked_fence_reinit(struct intel_context *ce) { lockdep_assert_held(&ce->guc_state.lock); - GEM_BUG_ON(!i915_sw_fence_done(&ce->guc_blocked)); + GEM_BUG_ON(!i915_sw_fence_done(&ce->guc_state.blocked)); /* * This fence is always complete unless a pending schedule disable is * outstanding. We arm the fence here and complete it when we receive * the pending schedule disable complete message. */ - i915_sw_fence_fini(&ce->guc_blocked); - i915_sw_fence_reinit(&ce->guc_blocked); - i915_sw_fence_await(&ce->guc_blocked); - i915_sw_fence_commit(&ce->guc_blocked); + i915_sw_fence_fini(&ce->guc_state.blocked); + i915_sw_fence_reinit(&ce->guc_state.blocked); + i915_sw_fence_await(&ce->guc_state.blocked); + i915_sw_fence_commit(&ce->guc_state.blocked); } static u16 prep_context_pending_disable(struct intel_context *ce) @@ -1501,35 +2143,30 @@ static u16 prep_context_pending_disable(struct intel_context *ce) guc_blocked_fence_reinit(ce); intel_context_get(ce); - return ce->guc_id; + return ce->guc_id.id; } static struct i915_sw_fence *guc_context_block(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); - struct i915_sched_engine *sched_engine = ce->engine->sched_engine; unsigned long flags; struct intel_runtime_pm *runtime_pm = ce->engine->uncore->rpm; intel_wakeref_t wakeref; u16 guc_id; bool enabled; + GEM_BUG_ON(intel_context_is_child(ce)); + spin_lock_irqsave(&ce->guc_state.lock, flags); - /* - * Sync with submission path, increment before below changes to context - * state. 
- */ - spin_lock(&sched_engine->lock); incr_context_blocked(ce); - spin_unlock(&sched_engine->lock); enabled = context_enabled(ce); if (unlikely(!enabled || submission_disabled(guc))) { if (enabled) clr_context_enabled(ce); spin_unlock_irqrestore(&ce->guc_state.lock, flags); - return &ce->guc_blocked; + return &ce->guc_state.blocked; } /* @@ -1545,26 +2182,41 @@ static struct i915_sw_fence *guc_context_block(struct intel_context *ce) with_intel_runtime_pm(runtime_pm, wakeref) __guc_context_sched_disable(guc, ce, guc_id); - return &ce->guc_blocked; + return &ce->guc_state.blocked; +} + +#define SCHED_STATE_MULTI_BLOCKED_MASK \ + (SCHED_STATE_BLOCKED_MASK & ~SCHED_STATE_BLOCKED) +#define SCHED_STATE_NO_UNBLOCK \ + (SCHED_STATE_MULTI_BLOCKED_MASK | \ + SCHED_STATE_PENDING_DISABLE | \ + SCHED_STATE_BANNED) + +static bool context_cant_unblock(struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_state.lock); + + return (ce->guc_state.sched_state & SCHED_STATE_NO_UNBLOCK) || + context_guc_id_invalid(ce) || + !lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id) || + !intel_context_is_pinned(ce); } static void guc_context_unblock(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); - struct i915_sched_engine *sched_engine = ce->engine->sched_engine; unsigned long flags; struct intel_runtime_pm *runtime_pm = ce->engine->uncore->rpm; intel_wakeref_t wakeref; bool enable; GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_child(ce)); spin_lock_irqsave(&ce->guc_state.lock, flags); if (unlikely(submission_disabled(guc) || - !intel_context_is_pinned(ce) || - context_pending_disable(ce) || - context_blocked(ce) > 1)) { + context_cant_unblock(ce))) { enable = false; } else { enable = true; @@ -1573,13 +2225,7 @@ static void guc_context_unblock(struct intel_context *ce) intel_context_get(ce); } - /* - * Sync with submission path, decrement after above changes to context - * state. 
- */ - spin_lock(&sched_engine->lock); decr_context_blocked(ce); - spin_unlock(&sched_engine->lock); spin_unlock_irqrestore(&ce->guc_state.lock, flags); @@ -1592,16 +2238,29 @@ static void guc_context_unblock(struct intel_context *ce) static void guc_context_cancel_request(struct intel_context *ce, struct i915_request *rq) { + struct intel_context *block_context = + request_to_scheduling_context(rq); + if (i915_sw_fence_signaled(&rq->submit)) { - struct i915_sw_fence *fence = guc_context_block(ce); + struct i915_sw_fence *fence; + intel_context_get(ce); + fence = guc_context_block(block_context); i915_sw_fence_wait(fence); if (!i915_request_completed(rq)) { __i915_request_skip(rq); guc_reset_state(ce, intel_ring_wrap(ce->ring, rq->head), true); } - guc_context_unblock(ce); + + /* + * XXX: Racey if context is reset, see comment in + * __guc_reset_context(). + */ + flush_work(&ce_to_guc(ce)->ct.requests.worker); + + guc_context_unblock(block_context); + intel_context_put(ce); } } @@ -1626,6 +2285,8 @@ static void guc_context_ban(struct intel_context *ce, struct i915_request *rq) intel_wakeref_t wakeref; unsigned long flags; + GEM_BUG_ON(intel_context_is_child(ce)); + guc_flush_submissions(guc); spin_lock_irqsave(&ce->guc_state.lock, flags); @@ -1662,7 +2323,7 @@ static void guc_context_ban(struct intel_context *ce, struct i915_request *rq) if (!context_guc_id_invalid(ce)) with_intel_runtime_pm(runtime_pm, wakeref) __guc_context_set_preemption_timeout(guc, - ce->guc_id, + ce->guc_id.id, 1); spin_unlock_irqrestore(&ce->guc_state.lock, flags); } @@ -1675,40 +2336,24 @@ static void guc_context_sched_disable(struct intel_context *ce) struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm; intel_wakeref_t wakeref; u16 guc_id; - bool enabled; - if (submission_disabled(guc) || context_guc_id_invalid(ce) || - !lrc_desc_registered(guc, ce->guc_id)) { - clr_context_enabled(ce); - goto unpin; - } - - if (!context_enabled(ce)) - goto unpin; + 
GEM_BUG_ON(intel_context_is_child(ce)); spin_lock_irqsave(&ce->guc_state.lock, flags); /* - * We have to check if the context has been disabled by another thread. - * We also have to check if the context has been pinned again as another - * pin operation is allowed to pass this function. Checking the pin - * count, within ce->guc_state.lock, synchronizes this function with - * guc_request_alloc ensuring a request doesn't slip through the - * 'context_pending_disable' fence. Checking within the spin lock (can't - * sleep) ensures another process doesn't pin this context and generate - * a request before we set the 'context_pending_disable' flag here. + * We have to check if the context has been disabled by another thread, + * check if submssion has been disabled to seal a race with reset and + * finally check if any more requests have been committed to the + * context ensursing that a request doesn't slip through the + * 'context_pending_disable' fence. */ - enabled = context_enabled(ce); - if (unlikely(!enabled || submission_disabled(guc))) { - if (enabled) - clr_context_enabled(ce); + if (unlikely(!context_enabled(ce) || submission_disabled(guc) || + context_has_committed_requests(ce))) { + clr_context_enabled(ce); spin_unlock_irqrestore(&ce->guc_state.lock, flags); goto unpin; } - if (unlikely(atomic_add_unless(&ce->pin_count, -2, 2))) { - spin_unlock_irqrestore(&ce->guc_state.lock, flags); - return; - } guc_id = prep_context_pending_disable(ce); spin_unlock_irqrestore(&ce->guc_state.lock, flags); @@ -1724,21 +2369,40 @@ unpin: static inline void guc_lrc_desc_unpin(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); + struct intel_gt *gt = guc_to_gt(guc); + unsigned long flags; + bool disabled; - GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id)); - GEM_BUG_ON(ce != __get_context(guc, ce->guc_id)); + GEM_BUG_ON(!intel_gt_pm_is_awake(gt)); + GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id)); + GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id)); 
GEM_BUG_ON(context_enabled(ce)); - clr_context_registered(ce); - deregister_context(ce, ce->guc_id, true); + /* Seal race with Reset */ + spin_lock_irqsave(&ce->guc_state.lock, flags); + disabled = submission_disabled(guc); + if (likely(!disabled)) { + __intel_gt_pm_get(gt); + set_context_destroyed(ce); + clr_context_registered(ce); + } + spin_unlock_irqrestore(&ce->guc_state.lock, flags); + if (unlikely(disabled)) { + release_guc_id(guc, ce); + __guc_context_destroy(ce); + return; + } + + deregister_context(ce, ce->guc_id.id); } static void __guc_context_destroy(struct intel_context *ce) { - GEM_BUG_ON(ce->guc_prio_count[GUC_CLIENT_PRIORITY_KMD_HIGH] || - ce->guc_prio_count[GUC_CLIENT_PRIORITY_HIGH] || - ce->guc_prio_count[GUC_CLIENT_PRIORITY_KMD_NORMAL] || - ce->guc_prio_count[GUC_CLIENT_PRIORITY_NORMAL]); + GEM_BUG_ON(ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_KMD_HIGH] || + ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_HIGH] || + ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_KMD_NORMAL] || + ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_NORMAL]); + GEM_BUG_ON(ce->guc_state.number_committed_requests); lrc_fini(ce); intel_context_fini(ce); @@ -1756,76 +2420,86 @@ static void __guc_context_destroy(struct intel_context *ce) } } +static void guc_flush_destroyed_contexts(struct intel_guc *guc) +{ + struct intel_context *ce, *cn; + unsigned long flags; + + GEM_BUG_ON(!submission_disabled(guc) && + guc_submission_initialized(guc)); + + spin_lock_irqsave(&guc->submission_state.lock, flags); + list_for_each_entry_safe(ce, cn, + &guc->submission_state.destroyed_contexts, + destroyed_link) { + list_del_init(&ce->destroyed_link); + __release_guc_id(guc, ce); + __guc_context_destroy(ce); + } + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static void deregister_destroyed_contexts(struct intel_guc *guc) +{ + struct intel_context *ce, *cn; + unsigned long flags; + + spin_lock_irqsave(&guc->submission_state.lock, flags); + list_for_each_entry_safe(ce, cn, + 
&guc->submission_state.destroyed_contexts, + destroyed_link) { + list_del_init(&ce->destroyed_link); + guc_lrc_desc_unpin(ce); + } + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static void destroyed_worker_func(struct work_struct *w) +{ + struct intel_guc *guc = container_of(w, struct intel_guc, + submission_state.destroyed_worker); + struct intel_gt *gt = guc_to_gt(guc); + int tmp; + + with_intel_gt_pm(gt, tmp) + deregister_destroyed_contexts(guc); +} + static void guc_context_destroy(struct kref *kref) { struct intel_context *ce = container_of(kref, typeof(*ce), ref); - struct intel_runtime_pm *runtime_pm = ce->engine->uncore->rpm; struct intel_guc *guc = ce_to_guc(ce); - intel_wakeref_t wakeref; unsigned long flags; - bool disabled; + bool destroy; /* * If the guc_id is invalid this context has been stolen and we can free * it immediately. Also can be freed immediately if the context is not * registered with the GuC or the GuC is in the middle of a reset. */ - if (context_guc_id_invalid(ce)) { - __guc_context_destroy(ce); - return; - } else if (submission_disabled(guc) || - !lrc_desc_registered(guc, ce->guc_id)) { - release_guc_id(guc, ce); - __guc_context_destroy(ce); - return; - } - - /* - * We have to acquire the context spinlock and check guc_id again, if it - * is valid it hasn't been stolen and needs to be deregistered. We - * delete this context from the list of unpinned guc_ids available to - * steal to seal a race with guc_lrc_desc_pin(). When the G2H CTB - * returns indicating this context has been deregistered the guc_id is - * returned to the pool of available guc_ids. 
- */ - spin_lock_irqsave(&guc->contexts_lock, flags); - if (context_guc_id_invalid(ce)) { - spin_unlock_irqrestore(&guc->contexts_lock, flags); - __guc_context_destroy(ce); - return; + spin_lock_irqsave(&guc->submission_state.lock, flags); + destroy = submission_disabled(guc) || context_guc_id_invalid(ce) || + !lrc_desc_registered(guc, ce->guc_id.id); + if (likely(!destroy)) { + if (!list_empty(&ce->guc_id.link)) + list_del_init(&ce->guc_id.link); + list_add_tail(&ce->destroyed_link, + &guc->submission_state.destroyed_contexts); + } else { + __release_guc_id(guc, ce); } - - if (!list_empty(&ce->guc_id_link)) - list_del_init(&ce->guc_id_link); - spin_unlock_irqrestore(&guc->contexts_lock, flags); - - /* Seal race with Reset */ - spin_lock_irqsave(&ce->guc_state.lock, flags); - disabled = submission_disabled(guc); - if (likely(!disabled)) - set_context_destroyed(ce); - spin_unlock_irqrestore(&ce->guc_state.lock, flags); - if (unlikely(disabled)) { - release_guc_id(guc, ce); + spin_unlock_irqrestore(&guc->submission_state.lock, flags); + if (unlikely(destroy)) { __guc_context_destroy(ce); return; } /* - * We defer GuC context deregistration until the context is destroyed - * in order to save on CTBs. With this optimization ideally we only need - * 1 CTB to register the context during the first pin and 1 CTB to - * deregister the context when the context is destroyed. Without this - * optimization, a CTB would be needed every pin & unpin. - * - * XXX: Need to acqiure the runtime wakeref as this can be triggered - * from context_free_worker when runtime wakeref is not held. - * guc_lrc_desc_unpin requires the runtime as a GuC register is written - * in H2G CTB to deregister the context. A future patch may defer this - * H2G CTB if the runtime wakeref is zero. + * We use a worker to issue the H2G to deregister the context as we can + * take the GT PM for the first time which isn't allowed from an atomic + * context. 
*/ - with_intel_runtime_pm(runtime_pm, wakeref) - guc_lrc_desc_unpin(ce); + queue_work(system_unbound_wq, &guc->submission_state.destroyed_worker); } static int guc_context_alloc(struct intel_context *ce) @@ -1839,20 +2513,23 @@ static void guc_context_set_prio(struct intel_guc *guc, { u32 action[] = { INTEL_GUC_ACTION_SET_CONTEXT_PRIORITY, - ce->guc_id, + ce->guc_id.id, prio, }; GEM_BUG_ON(prio < GUC_CLIENT_PRIORITY_KMD_HIGH || prio > GUC_CLIENT_PRIORITY_NORMAL); + lockdep_assert_held(&ce->guc_state.lock); - if (ce->guc_prio == prio || submission_disabled(guc) || - !context_registered(ce)) + if (ce->guc_state.prio == prio || submission_disabled(guc) || + !context_registered(ce)) { + ce->guc_state.prio = prio; return; + } guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true); - ce->guc_prio = prio; + ce->guc_state.prio = prio; trace_intel_context_set_prio(ce); } @@ -1871,25 +2548,25 @@ static inline u8 map_i915_prio_to_guc_prio(int prio) static inline void add_context_inflight_prio(struct intel_context *ce, u8 guc_prio) { - lockdep_assert_held(&ce->guc_active.lock); - GEM_BUG_ON(guc_prio >= ARRAY_SIZE(ce->guc_prio_count)); + lockdep_assert_held(&ce->guc_state.lock); + GEM_BUG_ON(guc_prio >= ARRAY_SIZE(ce->guc_state.prio_count)); - ++ce->guc_prio_count[guc_prio]; + ++ce->guc_state.prio_count[guc_prio]; /* Overflow protection */ - GEM_WARN_ON(!ce->guc_prio_count[guc_prio]); + GEM_WARN_ON(!ce->guc_state.prio_count[guc_prio]); } static inline void sub_context_inflight_prio(struct intel_context *ce, u8 guc_prio) { - lockdep_assert_held(&ce->guc_active.lock); - GEM_BUG_ON(guc_prio >= ARRAY_SIZE(ce->guc_prio_count)); + lockdep_assert_held(&ce->guc_state.lock); + GEM_BUG_ON(guc_prio >= ARRAY_SIZE(ce->guc_state.prio_count)); /* Underflow protection */ - GEM_WARN_ON(!ce->guc_prio_count[guc_prio]); + GEM_WARN_ON(!ce->guc_state.prio_count[guc_prio]); - --ce->guc_prio_count[guc_prio]; + --ce->guc_state.prio_count[guc_prio]; } static inline void 
update_context_prio(struct intel_context *ce) @@ -1900,10 +2577,10 @@ static inline void update_context_prio(struct intel_context *ce) BUILD_BUG_ON(GUC_CLIENT_PRIORITY_KMD_HIGH != 0); BUILD_BUG_ON(GUC_CLIENT_PRIORITY_KMD_HIGH > GUC_CLIENT_PRIORITY_NORMAL); - lockdep_assert_held(&ce->guc_active.lock); + lockdep_assert_held(&ce->guc_state.lock); - for (i = 0; i < ARRAY_SIZE(ce->guc_prio_count); ++i) { - if (ce->guc_prio_count[i]) { + for (i = 0; i < ARRAY_SIZE(ce->guc_state.prio_count); ++i) { + if (ce->guc_state.prio_count[i]) { guc_context_set_prio(guc, ce, i); break; } @@ -1918,13 +2595,14 @@ static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio) static void add_to_context(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq)); + GEM_BUG_ON(intel_context_is_child(ce)); GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI); - spin_lock(&ce->guc_active.lock); - list_move_tail(&rq->sched.link, &ce->guc_active.requests); + spin_lock(&ce->guc_state.lock); + list_move_tail(&rq->sched.link, &ce->guc_state.requests); if (rq->guc_prio == GUC_PRIO_INIT) { rq->guc_prio = new_guc_prio; @@ -1936,12 +2614,12 @@ static void add_to_context(struct i915_request *rq) } update_context_prio(ce); - spin_unlock(&ce->guc_active.lock); + spin_unlock(&ce->guc_state.lock); } static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce) { - lockdep_assert_held(&ce->guc_active.lock); + lockdep_assert_held(&ce->guc_state.lock); if (rq->guc_prio != GUC_PRIO_INIT && rq->guc_prio != GUC_PRIO_FINI) { @@ -1953,9 +2631,11 @@ static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce) static void remove_from_context(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); - spin_lock_irq(&ce->guc_active.lock); + GEM_BUG_ON(intel_context_is_child(ce)); + + 
spin_lock_irq(&ce->guc_state.lock); list_del_init(&rq->sched.link); clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); @@ -1965,9 +2645,11 @@ static void remove_from_context(struct i915_request *rq) guc_prio_fini(rq, ce); - spin_unlock_irq(&ce->guc_active.lock); + decr_context_committed_requests(ce); + + spin_unlock_irq(&ce->guc_state.lock); - atomic_dec(&ce->guc_id_ref); + atomic_dec(&ce->guc_id.ref); i915_request_notify_execute_cb_imm(rq); } @@ -1992,19 +2674,35 @@ static const struct intel_context_ops guc_context_ops = { .destroy = guc_context_destroy, .create_virtual = guc_create_virtual, + .create_parallel = guc_create_parallel, }; +static void submit_work_cb(struct irq_work *wrk) +{ + struct i915_request *rq = container_of(wrk, typeof(*rq), submit_work); + + might_lock(&rq->engine->sched_engine->lock); + i915_sw_fence_complete(&rq->submit); +} + static void __guc_signal_context_fence(struct intel_context *ce) { - struct i915_request *rq; + struct i915_request *rq, *rn; lockdep_assert_held(&ce->guc_state.lock); if (!list_empty(&ce->guc_state.fences)) trace_intel_context_fence_release(ce); - list_for_each_entry(rq, &ce->guc_state.fences, guc_fence_link) - i915_sw_fence_complete(&rq->submit); + /* + * Use an IRQ to ensure locking order of sched_engine->lock -> + * ce->guc_state.lock is preserved. 
+ */ + list_for_each_entry_safe(rq, rn, &ce->guc_state.fences, + guc_fence_link) { + list_del(&rq->guc_fence_link); + irq_work_queue(&rq->submit_work); + } INIT_LIST_HEAD(&ce->guc_state.fences); } @@ -2013,6 +2711,8 @@ static void guc_signal_context_fence(struct intel_context *ce) { unsigned long flags; + GEM_BUG_ON(intel_context_is_child(ce)); + spin_lock_irqsave(&ce->guc_state.lock, flags); clr_context_wait_for_deregister_to_register(ce); __guc_signal_context_fence(ce); @@ -2022,13 +2722,28 @@ static void guc_signal_context_fence(struct intel_context *ce) static bool context_needs_register(struct intel_context *ce, bool new_guc_id) { return (new_guc_id || test_bit(CONTEXT_LRCA_DIRTY, &ce->flags) || - !lrc_desc_registered(ce_to_guc(ce), ce->guc_id)) && + !lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id)) && !submission_disabled(ce_to_guc(ce)); } +static void guc_context_init(struct intel_context *ce) +{ + const struct i915_gem_context *ctx; + int prio = I915_CONTEXT_DEFAULT_PRIORITY; + + rcu_read_lock(); + ctx = rcu_dereference(ce->gem_context); + if (ctx) + prio = ctx->sched.priority; + rcu_read_unlock(); + + ce->guc_state.prio = map_i915_prio_to_guc_prio(prio); + set_bit(CONTEXT_GUC_INIT, &ce->flags); +} + static int guc_request_alloc(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); struct intel_guc *guc = ce_to_guc(ce); unsigned long flags; int ret; @@ -2057,14 +2772,17 @@ static int guc_request_alloc(struct i915_request *rq) rq->reserved_space -= GUC_REQUEST_SIZE; + if (unlikely(!test_bit(CONTEXT_GUC_INIT, &ce->flags))) + guc_context_init(ce); + /* * Call pin_guc_id here rather than in the pinning step as with * dma_resv, contexts can be repeatedly pinned / unpinned trashing the - * guc_ids and creating horrible race conditions. This is especially bad - * when guc_ids are being stolen due to over subscription. By the time + * guc_id and creating horrible race conditions. 
This is especially bad + * when guc_id are being stolen due to over subscription. By the time * this function is reached, it is guaranteed that the guc_id will be * persistent until the generated request is retired. Thus, sealing these - * race conditions. It is still safe to fail here if guc_ids are + * race conditions. It is still safe to fail here if guc_id are * exhausted and return -EAGAIN to the user indicating that they can try * again in the future. * @@ -2074,7 +2792,7 @@ static int guc_request_alloc(struct i915_request *rq) * decremented on each retire. When it is zero, a lock around the * increment (in pin_guc_id) is needed to seal a race with unpin_guc_id. */ - if (atomic_add_unless(&ce->guc_id_ref, 1, 0)) + if (atomic_add_unless(&ce->guc_id.ref, 1, 0)) goto out; ret = pin_guc_id(guc, ce); /* returns 1 if new guc_id assigned */ @@ -2087,7 +2805,7 @@ static int guc_request_alloc(struct i915_request *rq) disable_submission(guc); goto out; /* GPU will be reset */ } - atomic_dec(&ce->guc_id_ref); + atomic_dec(&ce->guc_id.ref); unpin_guc_id(guc, ce); return ret; } @@ -2102,22 +2820,16 @@ out: * schedule enable or context registration if either G2H is pending * respectively. Once a G2H returns, the fence is released that is * blocking these requests (see guc_signal_context_fence). - * - * We can safely check the below fields outside of the lock as it isn't - * possible for these fields to transition from being clear to set but - * converse is possible, hence the need for the check within the lock. 
*/ - if (likely(!context_wait_for_deregister_to_register(ce) && - !context_pending_disable(ce))) - return 0; - spin_lock_irqsave(&ce->guc_state.lock, flags); if (context_wait_for_deregister_to_register(ce) || context_pending_disable(ce)) { + init_irq_work(&rq->submit_work, submit_work_cb); i915_sw_fence_await(&rq->submit); list_add_tail(&rq->guc_fence_link, &ce->guc_state.fences); } + incr_context_committed_requests(ce); spin_unlock_irqrestore(&ce->guc_state.lock, flags); return 0; @@ -2135,8 +2847,30 @@ static int guc_virtual_context_pre_pin(struct intel_context *ce, static int guc_virtual_context_pin(struct intel_context *ce, void *vaddr) { struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); + int ret = __guc_context_pin(ce, engine, vaddr); + intel_engine_mask_t tmp, mask = ce->engine->mask; - return __guc_context_pin(ce, engine, vaddr); + if (likely(!ret)) + for_each_engine_masked(engine, ce->engine->gt, mask, tmp) + intel_engine_pm_get(engine); + + return ret; +} + +static void guc_virtual_context_unpin(struct intel_context *ce) +{ + intel_engine_mask_t tmp, mask = ce->engine->mask; + struct intel_engine_cs *engine; + struct intel_guc *guc = ce_to_guc(ce); + + GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_barrier(ce)); + + unpin_guc_id(guc, ce); + lrc_unpin(ce); + + for_each_engine_masked(engine, ce->engine->gt, mask, tmp) + intel_engine_pm_put_async(engine); } static void guc_virtual_context_enter(struct intel_context *ce) @@ -2173,7 +2907,98 @@ static const struct intel_context_ops virtual_guc_context_ops = { .pre_pin = guc_virtual_context_pre_pin, .pin = guc_virtual_context_pin, - .unpin = guc_context_unpin, + .unpin = guc_virtual_context_unpin, + .post_unpin = guc_context_post_unpin, + + .ban = guc_context_ban, + + .cancel_request = guc_context_cancel_request, + + .enter = guc_virtual_context_enter, + .exit = guc_virtual_context_exit, + + .sched_disable = guc_context_sched_disable, + + .destroy = guc_context_destroy, + 
+ .get_sibling = guc_virtual_get_sibling, +}; + +static int guc_parent_context_pin(struct intel_context *ce, void *vaddr) +{ + struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); + struct intel_guc *guc = ce_to_guc(ce); + int ret; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + ret = pin_guc_id(guc, ce); + if (unlikely(ret < 0)) + return ret; + + return __guc_context_pin(ce, engine, vaddr); +} + +static int guc_child_context_pin(struct intel_context *ce, void *vaddr) +{ + struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); + + GEM_BUG_ON(!intel_context_is_child(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + __intel_context_pin(ce->parallel.parent); + return __guc_context_pin(ce, engine, vaddr); +} + +static void guc_parent_context_unpin(struct intel_context *ce) +{ + struct intel_guc *guc = ce_to_guc(ce); + + GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_barrier(ce)); + GEM_BUG_ON(!intel_context_is_parent(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + if (ce->parallel.last_rq) + i915_request_put(ce->parallel.last_rq); + unpin_guc_id(guc, ce); + lrc_unpin(ce); +} + +static void guc_child_context_unpin(struct intel_context *ce) +{ + GEM_BUG_ON(context_enabled(ce)); + GEM_BUG_ON(intel_context_is_barrier(ce)); + GEM_BUG_ON(!intel_context_is_child(ce)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + lrc_unpin(ce); +} + +static void guc_child_context_post_unpin(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_child(ce)); + GEM_BUG_ON(!intel_context_is_pinned(ce->parallel.parent)); + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + + lrc_post_unpin(ce); + intel_context_unpin(ce->parallel.parent); +} + +static void guc_child_context_destroy(struct kref *kref) +{ + struct intel_context *ce = container_of(kref, typeof(*ce), ref); + + __guc_context_destroy(ce); +} + +static const struct intel_context_ops 
virtual_parent_context_ops = { + .alloc = guc_virtual_context_alloc, + + .pre_pin = guc_context_pre_pin, + .pin = guc_parent_context_pin, + .unpin = guc_parent_context_unpin, .post_unpin = guc_context_post_unpin, .ban = guc_context_ban, @@ -2190,6 +3015,110 @@ static const struct intel_context_ops virtual_guc_context_ops = { .get_sibling = guc_virtual_get_sibling, }; +static const struct intel_context_ops virtual_child_context_ops = { + .alloc = guc_virtual_context_alloc, + + .pre_pin = guc_context_pre_pin, + .pin = guc_child_context_pin, + .unpin = guc_child_context_unpin, + .post_unpin = guc_child_context_post_unpin, + + .cancel_request = guc_context_cancel_request, + + .enter = guc_virtual_context_enter, + .exit = guc_virtual_context_exit, + + .destroy = guc_child_context_destroy, + + .get_sibling = guc_virtual_get_sibling, +}; + +/* + * The below override of the breadcrumbs is enabled when the user configures a + * context for parallel submission (multi-lrc, parent-child). + * + * The overridden breadcrumbs implements an algorithm which allows the GuC to + * safely preempt all the hw contexts configured for parallel submission + * between each BB. The contract between the i915 and GuC is if the parent + * context can be preempted, all the children can be preempted, and the GuC will + * always try to preempt the parent before the children. A handshake between the + * parent / children breadcrumbs ensures the i915 holds up its end of the deal + * creating a window to preempt between each set of BBs. 
+ */ +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); + +static struct intel_context * +guc_create_parallel(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width) +{ + struct intel_engine_cs **siblings = NULL; + struct intel_context *parent = NULL, *ce, *err; + int i, j; + + siblings = kmalloc_array(num_siblings, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < width; ++i) { + for (j = 0; j < num_siblings; ++j) + siblings[j] = engines[i * num_siblings + j]; + + ce = intel_engine_create_virtual(siblings, num_siblings, + FORCE_VIRTUAL); + if (!ce) { + err = ERR_PTR(-ENOMEM); + goto unwind; + } + + if (i == 0) { + parent = ce; + parent->ops = &virtual_parent_context_ops; + } else { + ce->ops = &virtual_child_context_ops; + intel_context_bind_parent_child(parent, ce); + } + } + + parent->parallel.fence_context = dma_fence_context_alloc(1); + + parent->engine->emit_bb_start = + emit_bb_start_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb_dw = + 12 + 4 * parent->parallel.number_children; + for_each_child(parent, ce) { + ce->engine->emit_bb_start = + emit_bb_start_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb_dw = 16; + } + + kfree(siblings); + return parent; + +unwind: + if (parent) + intel_context_put(parent); + kfree(siblings); + return err; +} + static bool 
guc_irq_enable_breadcrumbs(struct intel_breadcrumbs *b) { @@ -2249,7 +3178,7 @@ static void guc_init_breadcrumbs(struct intel_engine_cs *engine) static void guc_bump_inflight_request_prio(struct i915_request *rq, int prio) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); u8 new_guc_prio = map_i915_prio_to_guc_prio(prio); /* Short circuit function */ @@ -2259,7 +3188,7 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq, !new_guc_prio_higher(rq->guc_prio, new_guc_prio))) return; - spin_lock(&ce->guc_active.lock); + spin_lock(&ce->guc_state.lock); if (rq->guc_prio != GUC_PRIO_FINI) { if (rq->guc_prio != GUC_PRIO_INIT) sub_context_inflight_prio(ce, rq->guc_prio); @@ -2267,16 +3196,16 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq, add_context_inflight_prio(ce, rq->guc_prio); update_context_prio(ce); } - spin_unlock(&ce->guc_active.lock); + spin_unlock(&ce->guc_state.lock); } static void guc_retire_inflight_request_prio(struct i915_request *rq) { - struct intel_context *ce = rq->context; + struct intel_context *ce = request_to_scheduling_context(rq); - spin_lock(&ce->guc_active.lock); + spin_lock(&ce->guc_state.lock); guc_prio_fini(rq, ce); - spin_unlock(&ce->guc_active.lock); + spin_unlock(&ce->guc_state.lock); } static void sanitize_hwsp(struct intel_engine_cs *engine) @@ -2310,6 +3239,8 @@ static void guc_sanitize(struct intel_engine_cs *engine) /* And scrub the dirty cachelines for the HWSP */ clflush_cache_range(engine->status_page.addr, PAGE_SIZE); + + intel_engine_reset_pinned_contexts(engine); } static void setup_hwsp(struct intel_engine_cs *engine) @@ -2385,9 +3316,13 @@ static inline void guc_init_lrc_mapping(struct intel_guc *guc) * and even it did this code would be run again. 
*/ - for_each_engine(engine, gt, id) - if (engine->kernel_context) - guc_kernel_context_pin(guc, engine->kernel_context); + for_each_engine(engine, gt, id) { + struct intel_context *ce; + + list_for_each_entry(ce, &engine->pinned_contexts_list, + pinned_contexts_link) + guc_kernel_context_pin(guc, ce); + } } static void guc_release(struct intel_engine_cs *engine) @@ -2580,13 +3515,13 @@ g2h_context_lookup(struct intel_guc *guc, u32 desc_idx) return NULL; } - return ce; -} + if (unlikely(intel_context_is_child(ce))) { + drm_err(&guc_to_gt(guc)->i915->drm, + "Context is child, desc_idx %u", desc_idx); + return NULL; + } -static void decr_outstanding_submission_g2h(struct intel_guc *guc) -{ - if (atomic_dec_and_test(&guc->outstanding_submission_g2h)) - wake_up_all(&guc->ct.wq); + return ce; } int intel_guc_deregister_done_process_msg(struct intel_guc *guc, @@ -2607,6 +3542,13 @@ int intel_guc_deregister_done_process_msg(struct intel_guc *guc, trace_intel_context_deregister_done(ce); +#ifdef CONFIG_DRM_I915_SELFTEST + if (unlikely(ce->drop_deregister)) { + ce->drop_deregister = false; + return 0; + } +#endif + if (context_wait_for_deregister_to_register(ce)) { struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm; @@ -2622,6 +3564,7 @@ int intel_guc_deregister_done_process_msg(struct intel_guc *guc, intel_context_put(ce); } else if (context_destroyed(ce)) { /* Context has been destroyed */ + intel_gt_pm_put_async(guc_to_gt(guc)); release_guc_id(guc, ce); __guc_context_destroy(ce); } @@ -2652,8 +3595,7 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc, (!context_pending_enable(ce) && !context_pending_disable(ce)))) { drm_err(&guc_to_gt(guc)->i915->drm, - "Bad context sched_state 0x%x, 0x%x, desc_idx %u", - atomic_read(&ce->guc_sched_state_no_lock), + "Bad context sched_state 0x%x, desc_idx %u", ce->guc_state.sched_state, desc_idx); return -EPROTO; } @@ -2661,10 +3603,26 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc, 
trace_intel_context_sched_done(ce); if (context_pending_enable(ce)) { +#ifdef CONFIG_DRM_I915_SELFTEST + if (unlikely(ce->drop_schedule_enable)) { + ce->drop_schedule_enable = false; + return 0; + } +#endif + + spin_lock_irqsave(&ce->guc_state.lock, flags); clr_context_pending_enable(ce); + spin_unlock_irqrestore(&ce->guc_state.lock, flags); } else if (context_pending_disable(ce)) { bool banned; +#ifdef CONFIG_DRM_I915_SELFTEST + if (unlikely(ce->drop_schedule_disable)) { + ce->drop_schedule_disable = false; + return 0; + } +#endif + /* * Unpin must be done before __guc_signal_context_fence, * otherwise a race exists between the requests getting @@ -2721,7 +3679,12 @@ static void guc_handle_context_reset(struct intel_guc *guc, { trace_intel_context_reset(ce); - if (likely(!intel_context_is_banned(ce))) { + /* + * XXX: Racy if request cancellation has occurred, see comment in + * __guc_reset_context(). + */ + if (likely(!intel_context_is_banned(ce) && + !context_blocked(ce))) { capture_error_state(guc, ce); guc_context_replay(ce); } @@ -2797,33 +3760,47 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine) struct intel_context *ce; struct i915_request *rq; unsigned long index; + unsigned long flags; /* Reset called during driver load? GuC not yet initialised! 
*/ if (unlikely(!guc_submission_initialized(guc))) return; + xa_lock_irqsave(&guc->context_lookup, flags); xa_for_each(&guc->context_lookup, index, ce) { - if (!intel_context_is_pinned(ce)) + if (!kref_get_unless_zero(&ce->ref)) continue; + xa_unlock(&guc->context_lookup); + + if (!intel_context_is_pinned(ce)) + goto next; + if (intel_engine_is_virtual(ce->engine)) { if (!(ce->engine->mask & engine->mask)) - continue; + goto next; } else { if (ce->engine != engine) - continue; + goto next; } - list_for_each_entry(rq, &ce->guc_active.requests, sched.link) { + list_for_each_entry(rq, &ce->guc_state.requests, sched.link) { if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE) continue; intel_engine_set_hung_context(engine, ce); /* Can only cope with one hang at a time... */ - return; + intel_context_put(ce); + xa_lock(&guc->context_lookup); + goto done; } +next: + intel_context_put(ce); + xa_lock(&guc->context_lookup); } +done: + xa_unlock_irqrestore(&guc->context_lookup, flags); } void intel_guc_dump_active_requests(struct intel_engine_cs *engine, @@ -2839,23 +3816,34 @@ void intel_guc_dump_active_requests(struct intel_engine_cs *engine, if (unlikely(!guc_submission_initialized(guc))) return; + xa_lock_irqsave(&guc->context_lookup, flags); xa_for_each(&guc->context_lookup, index, ce) { - if (!intel_context_is_pinned(ce)) + if (!kref_get_unless_zero(&ce->ref)) continue; + xa_unlock(&guc->context_lookup); + + if (!intel_context_is_pinned(ce)) + goto next; + if (intel_engine_is_virtual(ce->engine)) { if (!(ce->engine->mask & engine->mask)) - continue; + goto next; } else { if (ce->engine != engine) - continue; + goto next; } - spin_lock_irqsave(&ce->guc_active.lock, flags); - intel_engine_dump_active_requests(&ce->guc_active.requests, + spin_lock(&ce->guc_state.lock); + intel_engine_dump_active_requests(&ce->guc_state.requests, hung_rq, m); - spin_unlock_irqrestore(&ce->guc_active.lock, flags); + spin_unlock(&ce->guc_state.lock); + +next: + intel_context_put(ce); + 
xa_lock(&guc->context_lookup); } + xa_unlock_irqrestore(&guc->context_lookup, flags); } void intel_guc_submission_print_info(struct intel_guc *guc, @@ -2881,7 +3869,7 @@ void intel_guc_submission_print_info(struct intel_guc *guc, priolist_for_each_request(rq, pl) drm_printf(p, "guc_id=%u, seqno=%llu\n", - rq->context->guc_id, + rq->context->guc_id.id, rq->fence.seqno); } spin_unlock_irqrestore(&sched_engine->lock, flags); @@ -2893,46 +3881,348 @@ static inline void guc_log_context_priority(struct drm_printer *p, { int i; - drm_printf(p, "\t\tPriority: %d\n", - ce->guc_prio); + drm_printf(p, "\t\tPriority: %d\n", ce->guc_state.prio); drm_printf(p, "\t\tNumber Requests (lower index == higher priority)\n"); for (i = GUC_CLIENT_PRIORITY_KMD_HIGH; i < GUC_CLIENT_PRIORITY_NUM; ++i) { drm_printf(p, "\t\tNumber requests in priority band[%d]: %d\n", - i, ce->guc_prio_count[i]); + i, ce->guc_state.prio_count[i]); } drm_printf(p, "\n"); } +static inline void guc_log_context(struct drm_printer *p, + struct intel_context *ce) +{ + drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); + drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); + drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", + ce->ring->head, + ce->lrc_reg_state[CTX_RING_HEAD]); + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", + ce->ring->tail, + ce->lrc_reg_state[CTX_RING_TAIL]); + drm_printf(p, "\t\tContext Pin Count: %u\n", + atomic_read(&ce->pin_count)); + drm_printf(p, "\t\tGuC ID Ref Count: %u\n", + atomic_read(&ce->guc_id.ref)); + drm_printf(p, "\t\tSchedule State: 0x%x\n\n", + ce->guc_state.sched_state); +} + void intel_guc_submission_print_context_info(struct intel_guc *guc, struct drm_printer *p) { struct intel_context *ce; unsigned long index; + unsigned long flags; + xa_lock_irqsave(&guc->context_lookup, flags); xa_for_each(&guc->context_lookup, index, ce) { - drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id); - drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); - 
drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", - ce->ring->head, - ce->lrc_reg_state[CTX_RING_HEAD]); - drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", - ce->ring->tail, - ce->lrc_reg_state[CTX_RING_TAIL]); - drm_printf(p, "\t\tContext Pin Count: %u\n", - atomic_read(&ce->pin_count)); - drm_printf(p, "\t\tGuC ID Ref Count: %u\n", - atomic_read(&ce->guc_id_ref)); - drm_printf(p, "\t\tSchedule State: 0x%x, 0x%x\n\n", - ce->guc_state.sched_state, - atomic_read(&ce->guc_sched_state_no_lock)); + GEM_BUG_ON(intel_context_is_child(ce)); + guc_log_context(p, ce); guc_log_context_priority(p, ce); + + if (intel_context_is_parent(ce)) { + struct guc_process_desc *desc = __get_process_desc(ce); + struct intel_context *child; + + drm_printf(p, "\t\tNumber children: %u\n", + ce->parallel.number_children); + drm_printf(p, "\t\tWQI Head: %u\n", + READ_ONCE(desc->head)); + drm_printf(p, "\t\tWQI Tail: %u\n", + READ_ONCE(desc->tail)); + drm_printf(p, "\t\tWQI Status: %u\n\n", + READ_ONCE(desc->wq_status)); + + if (ce->engine->emit_bb_start == + emit_bb_start_parent_no_preempt_mid_batch) { + u8 i; + + drm_printf(p, "\t\tChildren Go: %u\n\n", + get_children_go_value(ce)); + for (i = 0; i < ce->parallel.number_children; ++i) + drm_printf(p, "\t\tChildren Join: %u\n", + get_children_join_value(ce, i)); + } + + for_each_child(ce, child) + guc_log_context(p, child); + } + } + xa_unlock_irqrestore(&guc->context_lookup, flags); +} + +static inline u32 get_children_go_addr(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, go.semaphore); +} + +static inline u32 get_children_join_addr(struct intel_context *ce, + u8 child_index) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, join[child_index].semaphore); +} + +#define PARENT_GO_BB 1 
+#define PARENT_GO_FINI_BREADCRUMB 0 +#define CHILD_GO_BB 1 +#define CHILD_GO_FINI_BREADCRUMB 0 +/* Parent-side batch start for a parent/child (parallel) submission. Emits, in order: one semaphore wait per child on that child's join address with data PARENT_GO_BB, an arbitration disable, a write of CHILD_GO_BB to the go address, and the batch buffer start for offset. len is unused. Returns 0, or PTR_ERR() of a failed intel_ring_begin(). */ static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + u32 *cs; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + /* 4 dwords per child wait; the fixed 10 covers the remaining emits: 2 (arb off + noop), 4 (bb start + noop) and presumably 4 from gen8_emit_ggtt_write() - TODO confirm */ cs = intel_ring_begin(rq, 10 + 4 * ce->parallel.number_children); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_BB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_BB, + get_children_go_addr(ce), + 0); + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +/* Child-side batch start: writes PARENT_GO_BB to this child's join address, waits on the parent's go address with data CHILD_GO_BB, disables arbitration and starts the batch at offset. len is unused. Returns 0, or PTR_ERR() of a failed intel_ring_begin(). */ static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + u32 *cs; + + GEM_BUG_ON(!intel_context_is_child(ce)); + + /* 12 = 4 (semaphore wait) + 1 (arb off) + 3 (bb start) + presumably 4 from gen8_emit_ggtt_write() - TODO confirm */ cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_BB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait on parent for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_BB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + intel_ring_advance(rq, cs); + + return 0; +} + +/* Parent half of the end-of-batch handshake: waits on every child's join address with data PARENT_GO_FINI_BREADCRUMB, re-enables arbitration, then writes CHILD_GO_FINI_BREADCRUMB to the go address. Returns the advanced cs. */ static u32 * +__emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_FINI_BREADCRUMB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_FINI_BREADCRUMB, + get_children_go_addr(ce), + 0); + + return cs; +} + +/* + * If this is true, a submission of multi-lrc requests had an error and the + * requests need to be skipped. 
The front end (execbuf IOCTL) should've called + * i915_request_skip which squashes the BB but we still need to emit the fini + * breadcrumb seqno write. At this point we don't know how many of the + * requests in the multi-lrc submission were generated so we can't do the + * handshake between the parent and children (e.g. if 4 requests should be + * generated but 2nd hit an error only 1 would be seen by the GuC backend). + * Simply skip the handshake, but still emit the breadcrumb seqno, if an error + * has occurred on any of the requests in submission / relationship. + */ +static inline bool skip_handshake(struct i915_request *rq) +{ + return test_bit(I915_FENCE_FLAG_SKIP_PARALLEL, &rq->fence.flags); +} + +/* Emits the parent's final breadcrumb: the parent handshake, or zeroed dwords in its place when skip_handshake() is true, followed by the seqno write to the active timeline's hwsp_offset and MI_USER_INTERRUPT. Also records rq->tail. Returns the advanced cs. */ static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + if (unlikely(skip_handshake(rq))) { + /* + * NOP everything in __emit_fini_breadcrumb_parent_no_preempt_mid_batch, + * the -6 comes from the length of the emits below. 
+ */ + memset(cs, 0, sizeof(u32) * + (ce->engine->emit_fini_breadcrumb_dw - 6)); + cs += ce->engine->emit_fini_breadcrumb_dw - 6; + } else { + cs = __emit_fini_breadcrumb_parent_no_preempt_mid_batch(rq, cs); + } + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + +/* Child half of the end-of-batch handshake: re-enables arbitration, writes PARENT_GO_FINI_BREADCRUMB to this child's join address, then waits on the parent's go address with data CHILD_GO_FINI_BREADCRUMB. Returns the advanced cs. */ static u32 * +__emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + + GEM_BUG_ON(!intel_context_is_child(ce)); + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_FINI_BREADCRUMB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait on parent for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_FINI_BREADCRUMB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + return cs; +} + +/* Child counterpart of emit_fini_breadcrumb_parent_no_preempt_mid_batch(): same handshake-or-zeroed-dwords choice, then the seqno write, MI_USER_INTERRUPT and rq->tail update. Returns the advanced cs. */ static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + + GEM_BUG_ON(!intel_context_is_child(ce)); + + if (unlikely(skip_handshake(rq))) { + /* + * NOP everything in __emit_fini_breadcrumb_child_no_preempt_mid_batch, + * the -6 comes from the length of the emits below. 
+ */ + memset(cs, 0, sizeof(u32) * + (ce->engine->emit_fini_breadcrumb_dw - 6)); + cs += ce->engine->emit_fini_breadcrumb_dw - 6; + } else { + cs = __emit_fini_breadcrumb_child_no_preempt_mid_batch(rq, cs); } + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; } static struct intel_context * -guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count) +guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags) { struct guc_virtual_engine *ve; struct intel_guc *guc; @@ -2981,6 +4271,7 @@ guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count) } ve->base.mask |= sibling->mask; + ve->base.logical_mask |= sibling->logical_mask; if (n != 0 && ve->base.class != sibling->class) { DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", @@ -3036,3 +4327,8 @@ bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve) return false; } + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_guc.c" +#include "selftest_guc_multi_lrc.c" +#endif |