From 4a6eccbcb9ea88cf0408115e9d130ce7062ee6fd Mon Sep 17 00:00:00 2001 From: Xiong Zhang Date: Wed, 10 Apr 2019 12:16:34 +0800 Subject: drm/i915/gvt: Change fb_info->size from pages to bytes fb_info->size is in pages, but some function need bytes when it is as a parameter. Such as: a. intel_gvt_ggtt_validate_range(), according to function definition b. vifio_device_gfx_plane_info->size, according to the comment of its definition So change fb_info->size into bytes. v2: Keep fb_info->size in real size instead of assinging casted page size(zhenyu) v3: obj->size should be page aligned and delete redundant check(zhenyu) Signed-off-by: Xiong Zhang Reviewed-by: Zhenyu Wang Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/dmabuf.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/dmabuf.c b/drivers/gpu/drm/i915/gvt/dmabuf.c index 4e1e425189ba..c104f041d0f4 100644 --- a/drivers/gpu/drm/i915/gvt/dmabuf.c +++ b/drivers/gpu/drm/i915/gvt/dmabuf.c @@ -45,6 +45,7 @@ static int vgpu_gem_get_pages( int i, ret; gen8_pte_t __iomem *gtt_entries; struct intel_vgpu_fb_info *fb_info; + u32 page_num; fb_info = (struct intel_vgpu_fb_info *)obj->gvt_info; if (WARN_ON(!fb_info)) @@ -54,14 +55,15 @@ static int vgpu_gem_get_pages( if (unlikely(!st)) return -ENOMEM; - ret = sg_alloc_table(st, fb_info->size, GFP_KERNEL); + page_num = obj->base.size >> PAGE_SHIFT; + ret = sg_alloc_table(st, page_num, GFP_KERNEL); if (ret) { kfree(st); return ret; } gtt_entries = (gen8_pte_t __iomem *)dev_priv->ggtt.gsm + (fb_info->start >> PAGE_SHIFT); - for_each_sg(st->sgl, sg, fb_info->size, i) { + for_each_sg(st->sgl, sg, page_num, i) { sg->offset = 0; sg->length = PAGE_SIZE; sg_dma_address(sg) = @@ -158,7 +160,7 @@ static struct drm_i915_gem_object *vgpu_create_gem(struct drm_device *dev, return NULL; drm_gem_private_object_init(dev, &obj->base, - info->size << PAGE_SHIFT); + roundup(info->size, PAGE_SIZE)); i915_gem_object_init(obj, &intel_vgpu_gem_ops); obj->read_domains = I915_GEM_DOMAIN_GTT; @@ -206,7 +208,6 @@ static int vgpu_get_plane_info(struct drm_device *dev, struct intel_vgpu_fb_info *info, int plane_id) { - struct drm_i915_private *dev_priv = to_i915(dev); struct intel_vgpu_primary_plane_format p; struct intel_vgpu_cursor_plane_format c; int ret, tile_height = 1; @@ -267,8 +268,7 @@ static int vgpu_get_plane_info(struct drm_device *dev, return -EINVAL; } - info->size = (info->stride * roundup(info->height, tile_height) - + PAGE_SIZE - 1) >> PAGE_SHIFT; + info->size = info->stride * roundup(info->height, tile_height); if (info->size == 0) { gvt_vgpu_err("fb size is zero\n"); return -EINVAL; @@ -278,11 +278,6 @@ static int vgpu_get_plane_info(struct drm_device *dev, gvt_vgpu_err("Not aligned fb address:0x%llx\n", info->start); return -EFAULT; } - if (((info->start >> PAGE_SHIFT) + info->size) > - ggtt_total_entries(&dev_priv->ggtt)) { - gvt_vgpu_err("Invalid GTT offset or size\n"); - return -EFAULT; - } if (!intel_gvt_ggtt_validate_range(vgpu, info->start, info->size)) { gvt_vgpu_err("invalid gma addr\n"); -- cgit From 0cf8f58d0a340a2ac744f6e0e3402a89780ecf8b Mon Sep 17 00:00:00 2001 From: Aleksei Gimbitskii Date: Tue, 23 Apr 2019 15:04:08 +0300 Subject: drm/i915/gvt: Remove typedef and let the enumeration starts from zero Typedef is not recommended in the Linux kernel.The klocwork static code analyzer takes the enumeration as the full range of intel_gvt_gtt_type_t. But the intel_gvt_gtt_type_t will never be used in full range. For example, the GTT_TYPE_INVALID will never be used as an index of an array. Remove the typedef and let the enumeration starts from zero to pass klocwork analysis. This patch fixed the critial issues #483, #551, #665 reported by klockwork. v3: - Remove the typedef and let the enumeration starts from zero. Signed-off-by: Aleksei Gimbitskii Cc: Zhenyu Wang Cc: Zhi Wang CC: Colin Xu Reviewed-by: Colin Xu Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/gtt.c | 12 ++++++------ drivers/gpu/drm/i915/gvt/gtt.h | 16 ++++++++-------- drivers/gpu/drm/i915/gvt/handlers.c | 2 +- drivers/gpu/drm/i915/gvt/scheduler.c | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index c2f7d20f6346..7600416db908 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -811,7 +811,7 @@ static int reclaim_one_ppgtt_mm(struct intel_gvt *gvt); /* Allocate shadow page table without guest page. */ static struct intel_vgpu_ppgtt_spt *ppgtt_alloc_spt( - struct intel_vgpu *vgpu, intel_gvt_gtt_type_t type) + struct intel_vgpu *vgpu, enum intel_gvt_gtt_type type) { struct device *kdev = &vgpu->gvt->dev_priv->drm.pdev->dev; struct intel_vgpu_ppgtt_spt *spt = NULL; @@ -861,7 +861,7 @@ err_free_spt: /* Allocate shadow page table associated with specific gfn. */ static struct intel_vgpu_ppgtt_spt *ppgtt_alloc_spt_gfn( - struct intel_vgpu *vgpu, intel_gvt_gtt_type_t type, + struct intel_vgpu *vgpu, enum intel_gvt_gtt_type type, unsigned long gfn, bool guest_pde_ips) { struct intel_vgpu_ppgtt_spt *spt; @@ -936,7 +936,7 @@ static int ppgtt_invalidate_spt_by_shadow_entry(struct intel_vgpu *vgpu, { struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops; struct intel_vgpu_ppgtt_spt *s; - intel_gvt_gtt_type_t cur_pt_type; + enum intel_gvt_gtt_type cur_pt_type; GEM_BUG_ON(!gtt_type_is_pt(get_next_pt_type(e->type))); @@ -1855,7 +1855,7 @@ static void vgpu_free_mm(struct intel_vgpu_mm *mm) * Zero on success, negative error code in pointer if failed. */ struct intel_vgpu_mm *intel_vgpu_create_ppgtt_mm(struct intel_vgpu *vgpu, - intel_gvt_gtt_type_t root_entry_type, u64 pdps[]) + enum intel_gvt_gtt_type root_entry_type, u64 pdps[]) { struct intel_gvt *gvt = vgpu->gvt; struct intel_vgpu_mm *mm; @@ -2309,7 +2309,7 @@ int intel_vgpu_emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, } static int alloc_scratch_pages(struct intel_vgpu *vgpu, - intel_gvt_gtt_type_t type) + enum intel_gvt_gtt_type type) { struct intel_vgpu_gtt *gtt = &vgpu->gtt; struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops; @@ -2594,7 +2594,7 @@ struct intel_vgpu_mm *intel_vgpu_find_ppgtt_mm(struct intel_vgpu *vgpu, * Zero on success, negative error code if failed. */ struct intel_vgpu_mm *intel_vgpu_get_ppgtt_mm(struct intel_vgpu *vgpu, - intel_gvt_gtt_type_t root_entry_type, u64 pdps[]) + enum intel_gvt_gtt_type root_entry_type, u64 pdps[]) { struct intel_vgpu_mm *mm; diff --git a/drivers/gpu/drm/i915/gvt/gtt.h b/drivers/gpu/drm/i915/gvt/gtt.h index 32c573aea494..42d0394f0de2 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.h +++ b/drivers/gpu/drm/i915/gvt/gtt.h @@ -95,8 +95,8 @@ struct intel_gvt_gtt { unsigned long scratch_mfn; }; -typedef enum { - GTT_TYPE_INVALID = -1, +enum intel_gvt_gtt_type { + GTT_TYPE_INVALID = 0, GTT_TYPE_GGTT_PTE, @@ -124,7 +124,7 @@ typedef enum { GTT_TYPE_PPGTT_PML4_PT, GTT_TYPE_MAX, -} intel_gvt_gtt_type_t; +}; enum intel_gvt_mm_type { INTEL_GVT_MM_GGTT, @@ -148,7 +148,7 @@ struct intel_vgpu_mm { union { struct { - intel_gvt_gtt_type_t root_entry_type; + enum intel_gvt_gtt_type root_entry_type; /* * The 4 PDPs in ring context. For 48bit addressing, * only PDP0 is valid and point to PML4. For 32it @@ -169,7 +169,7 @@ struct intel_vgpu_mm { }; struct intel_vgpu_mm *intel_vgpu_create_ppgtt_mm(struct intel_vgpu *vgpu, - intel_gvt_gtt_type_t root_entry_type, u64 pdps[]); + enum intel_gvt_gtt_type root_entry_type, u64 pdps[]); static inline void intel_vgpu_mm_get(struct intel_vgpu_mm *mm) { @@ -233,7 +233,7 @@ struct intel_vgpu_ppgtt_spt { struct intel_vgpu *vgpu; struct { - intel_gvt_gtt_type_t type; + enum intel_gvt_gtt_type type; bool pde_ips; /* for 64KB PTEs */ void *vaddr; struct page *page; @@ -241,7 +241,7 @@ struct intel_vgpu_ppgtt_spt { } shadow_page; struct { - intel_gvt_gtt_type_t type; + enum intel_gvt_gtt_type type; bool pde_ips; /* for 64KB PTEs */ unsigned long gfn; unsigned long write_cnt; @@ -267,7 +267,7 @@ struct intel_vgpu_mm *intel_vgpu_find_ppgtt_mm(struct intel_vgpu *vgpu, u64 pdps[]); struct intel_vgpu_mm *intel_vgpu_get_ppgtt_mm(struct intel_vgpu *vgpu, - intel_gvt_gtt_type_t root_entry_type, u64 pdps[]); + enum intel_gvt_gtt_type root_entry_type, u64 pdps[]); int intel_vgpu_put_ppgtt_mm(struct intel_vgpu *vgpu, u64 pdps[]); diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 18f01eeb2510..34129eacfd22 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -1206,7 +1206,7 @@ static int pvinfo_mmio_read(struct intel_vgpu *vgpu, unsigned int offset, static int handle_g2v_notification(struct intel_vgpu *vgpu, int notification) { - intel_gvt_gtt_type_t root_entry_type = GTT_TYPE_PPGTT_ROOT_L4_ENTRY; + enum intel_gvt_gtt_type root_entry_type = GTT_TYPE_PPGTT_ROOT_L4_ENTRY; struct intel_vgpu_mm *mm; u64 *pdps; diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 8998fa5ab198..7c99bbc3e2b8 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -1343,7 +1343,7 @@ static int prepare_mm(struct intel_vgpu_workload *workload) struct execlist_ctx_descriptor_format *desc = &workload->ctx_desc; struct intel_vgpu_mm *mm; struct intel_vgpu *vgpu = workload->vgpu; - intel_gvt_gtt_type_t root_entry_type; + enum intel_gvt_gtt_type root_entry_type; u64 pdps[GVT_RING_CTX_NR_PDPS]; switch (desc->addressing_mode) { -- cgit From d9420241d09bac6ba930d95963bd237ec9629db6 Mon Sep 17 00:00:00 2001 From: Aleksei Gimbitskii Date: Tue, 23 Apr 2019 15:04:09 +0300 Subject: drm/i915/gvt: Do not copy the uninitialized pointer from fb_info In the code the memcpy() function copied uninitialized pointer in fb_info to dmabuf_obj->info. Later the pointer in dmabuf_obj->info will be initialized. To make the code aligned with requirements of the klocwork static code analyzer, the uninitialized pointer should be initialized before memcpy(). v2: - Initialize fb_info.obj in vgpu_get_plane_info(). (Colin Xu) This patch fixed the critical issue #632 reported by klockwork. Signed-off-by: Aleksei Gimbitskii Cc: Zhenyu Wang Cc: Zhi Wang Cc: Colin Xu Reviewed-by: Colin Xu Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/dmabuf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/dmabuf.c b/drivers/gpu/drm/i915/gvt/dmabuf.c index c104f041d0f4..41c8ebc60c63 100644 --- a/drivers/gpu/drm/i915/gvt/dmabuf.c +++ b/drivers/gpu/drm/i915/gvt/dmabuf.c @@ -212,6 +212,8 @@ static int vgpu_get_plane_info(struct drm_device *dev, struct intel_vgpu_cursor_plane_format c; int ret, tile_height = 1; + memset(info, 0, sizeof(*info)); + if (plane_id == DRM_PLANE_TYPE_PRIMARY) { ret = intel_vgpu_decode_primary_plane(vgpu, &p); if (ret) -- cgit From 4feeea1d8d7713c5838d99c1fdfcc2e90c0f977d Mon Sep 17 00:00:00 2001 From: Aleksei Gimbitskii Date: Tue, 23 Apr 2019 15:04:10 +0300 Subject: drm/i915/gvt: Use snprintf() to prevent possible buffer overflow. For printing the intel_vgpu->id, a buffer with fixed length is allocated on the stack. But if vgpu->id is greater than 6 characters, the buffer overflow will happen. Even the string of the amount of max vgpu is less that the length buffer right now, it's better to replace sprintf() with snprintf(). v2: - Increase the size of the buffer. (Colin Xu) This patch fixed the critical issue #673 reported by klocwork. Signed-off-by: Aleksei Gimbitskii Cc: Zhenyu Wang Cc: Zhi Wang Cc: Colin Xu Reviewed-by: Colin Xu Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/debugfs.c b/drivers/gpu/drm/i915/gvt/debugfs.c index 2ec89bcb59f1..8a9606f91e68 100644 --- a/drivers/gpu/drm/i915/gvt/debugfs.c +++ b/drivers/gpu/drm/i915/gvt/debugfs.c @@ -196,9 +196,9 @@ DEFINE_SIMPLE_ATTRIBUTE(vgpu_scan_nonprivbb_fops, int intel_gvt_debugfs_add_vgpu(struct intel_vgpu *vgpu) { struct dentry *ent; - char name[10] = ""; + char name[16] = ""; - sprintf(name, "vgpu%d", vgpu->id); + snprintf(name, 16, "vgpu%d", vgpu->id); vgpu->debugfs = debugfs_create_dir(name, vgpu->gvt->debugfs_root); if (!vgpu->debugfs) return -ENOMEM; -- cgit From 930c8dfea4b8f320bd7f6a3e5e721ac20e444e03 Mon Sep 17 00:00:00 2001 From: Aleksei Gimbitskii Date: Tue, 23 Apr 2019 15:04:13 +0300 Subject: drm/i915/gvt: Check if get_next_pt_type() always returns a valid value According to gtt_type_table[] function get_next_pt_type() may returns GTT_TYPE_INVALID in some cases. To prevent driver to try to create memory page with invalid data type, additional check is added. Signed-off-by: Aleksei Gimbitskii Cc: Zhenyu Wang Cc: Zhi Wang Cc: Colin Xu Reviewed-by: Colin Xu Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/gtt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 7600416db908..08c74e65836b 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1076,6 +1076,9 @@ static struct intel_vgpu_ppgtt_spt *ppgtt_populate_spt_by_guest_entry( } else { int type = get_next_pt_type(we->type); + if (!gtt_type_is_pt(type)) + goto err; + spt = ppgtt_alloc_spt_gfn(vgpu, type, ops->get_pfn(we), ips); if (IS_ERR(spt)) { ret = PTR_ERR(spt); -- cgit From 8631fef7f2037281efca685c9e717eaed33ee37c Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Thu, 25 Apr 2019 17:04:54 +0800 Subject: drm/i915/gvt: Revert "drm/i915/gvt: Refine the snapshort range of I915 MCHBAR to optimize gvt-g boot time" This reverts commit f74a6d9a2c427b6656bc93eacfa6d329ba54d611. BXT needs to access 0x141000-0x1417ff register to obtain the dram info. But after the snapshot range of I915_MCHBAR is refined in f74a6d9a2c, it only initializes the range of 0x144000-0x147fff for VGPU and then causes that the guest GPU can't get the initialized value for dram detection on BXT. Signed-off-by: Zhao Yakui Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/handlers.c | 2 +- drivers/gpu/drm/i915/gvt/reg.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 34129eacfd22..90673fca792f 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -3303,7 +3303,7 @@ void intel_gvt_clean_mmio_info(struct intel_gvt *gvt) /* Special MMIO blocks. */ static struct gvt_mmio_block mmio_blocks[] = { {D_SKL_PLUS, _MMIO(CSR_MMIO_START_RANGE), 0x3000, NULL, NULL}, - {D_ALL, MCHBAR_MIRROR_REG_BASE, 0x4000, NULL, NULL}, + {D_ALL, _MMIO(MCHBAR_MIRROR_BASE_SNB), 0x40000, NULL, NULL}, {D_ALL, _MMIO(VGT_PVINFO_PAGE), VGT_PVINFO_SIZE, pvinfo_mmio_read, pvinfo_mmio_write}, {D_ALL, LGC_PALETTE(PIPE_A, 0), 1024, NULL, NULL}, diff --git a/drivers/gpu/drm/i915/gvt/reg.h b/drivers/gpu/drm/i915/gvt/reg.h index 3de5b643b266..33aaa14bfdde 100644 --- a/drivers/gpu/drm/i915/gvt/reg.h +++ b/drivers/gpu/drm/i915/gvt/reg.h @@ -126,7 +126,4 @@ #define RING_GFX_MODE(base) _MMIO((base) + 0x29c) #define VF_GUARDBAND _MMIO(0x83a4) -/* define the effective range of MCHBAR register on Sandybridge+ */ -#define MCHBAR_MIRROR_REG_BASE _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x4000) - #endif -- cgit From 75fdb811d93c8aa4a9f73b63db032b1e6a8668ef Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Fri, 22 Feb 2019 14:13:42 +0800 Subject: drm/i915/gvt: Add in context mmio 0x20D8 to gen9 mmio list Depends on GEN family and I915_PARAM_HAS_CONTEXT_ISOLATION, Mesa driver will decide whether constant buffer 0 address is relative or absolute, and load GPU initial state by lri to context mmio INSTPM (GEN8) or 0x20D8 (>=GEN9). Mesa Commit fa8a764b62 ("i965: Use absolute addressing for constant buffer 0 on Kernel 4.16+.") INSTPM is already added to gen8_engine_mmio_list, but 0x20D8 is missed in gen9_engine_mmio_list. From GVT point of view, different guest could have different context so should switch those mmio accordingly. v2: Update fixes commit ID. Fixes: 178657139307 ("drm/i915/gvt: vGPU context switch") Reviewed-by: Zhenyu Wang Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang (cherry picked from commit 1e8b15a1988ed3c7429402017d589422628cdf47) --- drivers/gpu/drm/i915/gvt/mmio_context.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c index e7e14c842be4..edf6d646eb25 100644 --- a/drivers/gpu/drm/i915/gvt/mmio_context.c +++ b/drivers/gpu/drm/i915/gvt/mmio_context.c @@ -132,6 +132,7 @@ static struct engine_mmio gen9_engine_mmio_list[] __cacheline_aligned = { {RCS0, GEN9_GAMT_ECO_REG_RW_IA, 0x0, false}, /* 0x4ab0 */ {RCS0, GEN9_CSFE_CHICKEN1_RCS, 0xffff, false}, /* 0x20d4 */ + {RCS0, _MMIO(0x20D8), 0xffff, true}, /* 0x20d8 */ {RCS0, GEN8_GARBCNTL, 0x0, false}, /* 0xb004 */ {RCS0, GEN7_FF_THREAD_MODE, 0x0, false}, /* 0x20a0 */ -- cgit From e766fde6511e2be83acbca1d603035e08de23f3b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 1 May 2019 12:45:36 +0100 Subject: drm/i915: Delay semaphore submission until the start of the signaler Currently we submit the semaphore busywait as soon as the signaler is submitted to HW. However, we may submit the signaler as the tail of a batch of requests, and even not as the first context in the HW list, i.e. the busywait may start spinning far in advance of the signaler even starting. If we wait until the request before the signaler is completed before submitting the busywait, we prevent the busywait from starting too early, if the signaler is not first in submission port. To handle the case where the signaler is at the start of the second (or later) submission port, we will need to delay the execution callback until we know the context is promoted to port0. A challenge for later. Fixes: e88619646971 ("drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190501114541.10077-9-chris@chris-wilson.co.uk (cherry picked from commit 0d90ccb70211cbf55140e91bd39db684aa4c16e9) [Joonas: edited Fixes: tag into single line.] Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_request.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index b836721d3b13..20c7c77b3768 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -798,6 +798,21 @@ err_unreserve: return ERR_PTR(ret); } +static int +i915_request_await_start(struct i915_request *rq, struct i915_request *signal) +{ + if (list_is_first(&signal->ring_link, &signal->ring->request_list)) + return 0; + + signal = list_prev_entry(signal, ring_link); + if (i915_timeline_sync_is_later(rq->timeline, &signal->fence)) + return 0; + + return i915_sw_fence_await_dma_fence(&rq->submit, + &signal->fence, 0, + I915_FENCE_GFP); +} + static int emit_semaphore_wait(struct i915_request *to, struct i915_request *from, @@ -816,6 +831,10 @@ emit_semaphore_wait(struct i915_request *to, &from->fence, 0, I915_FENCE_GFP); + err = i915_request_await_start(to, from); + if (err < 0) + return err; + err = i915_sw_fence_await_dma_fence(&to->semaphore, &from->fence, 0, I915_FENCE_GFP); -- cgit From 2564fe708b580c1ef12b2b527ab6e8afe11ad444 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 4 May 2019 08:07:07 +0100 Subject: drm/i915: Disable semaphore busywaits on saturated systems Asking the GPU to busywait on a memory address, perhaps not unexpectedly in hindsight for a shared system, leads to bus contention that affects CPU programs trying to concurrently access memory. This can manifest as a drop in transcode throughput on highly over-saturated workloads. The only clue offered by perf, is that the bus-cycles (perf stat -e bus-cycles) jumped by 50% when enabling semaphores. This corresponds with extra CPU active cycles being attributed to intel_idle's mwait. This patch introduces a heuristic to try and detect when more than one client is submitting to the GPU pushing it into an oversaturated state. As we already keep track of when the semaphores are signaled, we can inspect their state on submitting the busywait batch and if we planned to use a semaphore but were too late, conclude that the GPU is overloaded and not try to use semaphores in future requests. In practice, this means we optimistically try to use semaphores for the first frame of a transcode job split over multiple engines, and fail if there are multiple clients active and continue not to use semaphores for the subsequent frames in the sequence. Periodically, we try to optimistically switch semaphores back on whenever the client waits to catch up with the transcode results. With 1 client, on Broxton J3455, with the relative fps normalized by %cpu: x no semaphores + drm-tip * patched +------------------------------------------------------------------------+ | * | | *+ | | **+ | | **+ x | | x * +**+ x | | x x * * +***x xx | | x x * * *+***x *x | | x x* + * * *****x *x x | | + x xx+x* + *** * ********* x * | | + x xx+x* * *** +** ********* xx * | | * + ++++* + x*x****+*+* ***+*************+x* * | |*+ +** *+ + +* + *++****** *xxx**********x***+*****************+*++ *| | |__________A_____M_____| | | |_______________A____M_________| | | |____________A___M________| | +------------------------------------------------------------------------+ N Min Max Median Avg Stddev x 120 2.60475 3.50941 3.31123 3.2143953 0.21117399 + 120 2.3826 3.57077 3.25101 3.1414161 0.28146407 Difference at 95.0% confidence -0.0729792 +/- 0.0629585 -2.27039% +/- 1.95864% (Student's t, pooled s = 0.248814) * 120 2.35536 3.66713 3.2849 3.2059917 0.24618565 No difference proven at 95.0% confidence With 10 clients over-saturating the pipeline: x no semaphores + drm-tip * patched +------------------------------------------------------------------------+ | ++ ** | | ++ ** | | ++ ** | | ++ ** | | ++ xx *** | | ++ xx *** | | ++ xxx*** | | ++ xxx*** | | +++ xxx*** | | +++ xx**** | | +++ xx**** | | +++ xx**** | | +++ xx**** | | ++++ xx**** | | +++++ xx**** | | +++++ x x****** | | ++++++ xxx******* | | ++++++ xxx******* | | ++++++ xxx******* | | ++++++ xx******** | | ++++++ xxxx******** | | ++++++ xxxx******** | | ++++++++ xxxxx********* | |+ + + + ++++++++ xxx*xx**********x* *| | |__A__| | | |__AM__| | | |__A_| | +------------------------------------------------------------------------+ N Min Max Median Avg Stddev x 120 2.47855 2.8972 2.72376 2.7193402 0.074604933 + 120 1.17367 1.77459 1.71977 1.6966782 0.085850697 Difference at 95.0% confidence -1.02266 +/- 0.0203502 -37.607% +/- 0.748352% (Student's t, pooled s = 0.0804246) * 120 2.57868 3.00821 2.80142 2.7923878 0.058646477 Difference at 95.0% confidence 0.0730476 +/- 0.0169791 2.68622% +/- 0.624383% (Student's t, pooled s = 0.0671018) Indicating that we've recovered the regression from enabling semaphores on this saturated setup, with a hint towards an overall improvement. Very similar, but of smaller magnitude, results are observed on both Skylake(gt2) and Kabylake(gt4). This may be due to the reduced impact of bus-cycles, where we see a 50% hit on Broxton, it is only 10% on the big core, in this particular test. One observation to make here is that for a greedy client trying to maximise its own throughput, using semaphores is the right choice. It is only the holistic system-wide view that semaphores of one client impacts another and reduces the overall throughput where we would choose to disable semaphores. The most noticeable negactive impact this has is on the no-op microbenchmarks, which are also very notable for having no cpu bus load. In particular, this increases the runtime and energy consumption of gem_exec_whisper. Fixes: e88619646971 ("drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Dmitry Rogozhkin Cc: Dmitry Ermilov Cc: Joonas Lahtinen Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190504070707.30902-1-chris@chris-wilson.co.uk (cherry picked from commit ca6e56f654e7b241256ffba78cd2abb22aa3bc97) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_request.c | 40 +++++++++++++++++++++++++++++- drivers/gpu/drm/i915/intel_context.c | 1 + drivers/gpu/drm/i915/intel_context_types.h | 3 +++ 3 files changed, 43 insertions(+), 1 deletion(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 20c7c77b3768..ce342f7f7ddb 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -425,6 +425,26 @@ void __i915_request_submit(struct i915_request *request) if (i915_gem_context_is_banned(request->gem_context)) i915_request_skip(request, -EIO); + /* + * Are we using semaphores when the gpu is already saturated? + * + * Using semaphores incurs a cost in having the GPU poll a + * memory location, busywaiting for it to change. The continual + * memory reads can have a noticeable impact on the rest of the + * system with the extra bus traffic, stalling the cpu as it too + * tries to access memory across the bus (perf stat -e bus-cycles). + * + * If we installed a semaphore on this request and we only submit + * the request after the signaler completed, that indicates the + * system is overloaded and using semaphores at this time only + * increases the amount of work we are doing. If so, we disable + * further use of semaphores until we are idle again, whence we + * optimistically try again. + */ + if (request->sched.semaphores && + i915_sw_fence_signaled(&request->semaphore)) + request->hw_context->saturated |= request->sched.semaphores; + /* We may be recursing from the signal callback of another i915 fence */ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); @@ -813,6 +833,24 @@ i915_request_await_start(struct i915_request *rq, struct i915_request *signal) I915_FENCE_GFP); } +static intel_engine_mask_t +already_busywaiting(struct i915_request *rq) +{ + /* + * Polling a semaphore causes bus traffic, delaying other users of + * both the GPU and CPU. We want to limit the impact on others, + * while taking advantage of early submission to reduce GPU + * latency. Therefore we restrict ourselves to not using more + * than one semaphore from each source, and not using a semaphore + * if we have detected the engine is saturated (i.e. would not be + * submitted early and cause bus traffic reading an already passed + * semaphore). + * + * See the are-we-too-late? check in __i915_request_submit(). + */ + return rq->sched.semaphores | rq->hw_context->saturated; +} + static int emit_semaphore_wait(struct i915_request *to, struct i915_request *from, @@ -826,7 +864,7 @@ emit_semaphore_wait(struct i915_request *to, GEM_BUG_ON(INTEL_GEN(to->i915) < 8); /* Just emit the first semaphore we see as request space is limited. */ - if (to->sched.semaphores & from->engine->mask) + if (already_busywaiting(to) & from->engine->mask) return i915_sw_fence_await_dma_fence(&to->submit, &from->fence, 0, I915_FENCE_GFP); diff --git a/drivers/gpu/drm/i915/intel_context.c b/drivers/gpu/drm/i915/intel_context.c index 8931e0fee873..924cc556223a 100644 --- a/drivers/gpu/drm/i915/intel_context.c +++ b/drivers/gpu/drm/i915/intel_context.c @@ -230,6 +230,7 @@ intel_context_init(struct intel_context *ce, ce->gem_context = ctx; ce->engine = engine; ce->ops = engine->cops; + ce->saturated = 0; INIT_LIST_HEAD(&ce->signal_link); INIT_LIST_HEAD(&ce->signals); diff --git a/drivers/gpu/drm/i915/intel_context_types.h b/drivers/gpu/drm/i915/intel_context_types.h index 68b4ca1611e0..339c7437fe82 100644 --- a/drivers/gpu/drm/i915/intel_context_types.h +++ b/drivers/gpu/drm/i915/intel_context_types.h @@ -14,6 +14,7 @@ #include #include "i915_active_types.h" +#include "intel_engine_types.h" struct i915_gem_context; struct i915_vma; @@ -58,6 +59,8 @@ struct intel_context { atomic_t pin_count; struct mutex pin_mutex; /* guards pinning and associated on-gpuing */ + intel_engine_mask_t saturated; /* submitting semaphores too late? */ + /** * active_tracker: Active tracker for the external rq activity * on this intel_context object. -- cgit