diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_topology.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 504 |
1 files changed, 279 insertions, 225 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 61fc62f3e003..811636af14ea 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -31,13 +31,14 @@ #include <linux/log2.h> #include <linux/dmi.h> #include <linux/atomic.h> +#include <linux/crc16.h> #include "kfd_priv.h" #include "kfd_crat.h" #include "kfd_topology.h" #include "kfd_device_queue_manager.h" -#include "kfd_iommu.h" #include "kfd_svm.h" +#include "kfd_debug.h" #include "amdgpu_amdkfd.h" #include "amdgpu_ras.h" #include "amdgpu.h" @@ -107,24 +108,6 @@ struct kfd_node *kfd_device_by_id(uint32_t gpu_id) return top_dev->gpu; } -struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev) -{ - struct kfd_topology_device *top_dev; - struct kfd_node *device = NULL; - - down_read(&topology_lock); - - list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu && top_dev->gpu->adev->pdev == pdev) { - device = top_dev->gpu; - break; - } - - up_read(&topology_lock); - - return device; -} - /* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { @@ -291,6 +274,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, iolink->max_bandwidth); sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size", iolink->rec_transfer_size); + sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask", + iolink->rec_sdma_eng_id_mask); sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags); return offs; @@ -450,8 +435,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count", dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, offs, "simd_count", - dev->gpu ? (dev->node_props.simd_count * - NUM_XCC(dev->gpu->xcc_mask)) : 0); + dev->gpu ? dev->node_props.simd_count : 0); sysfs_show_32bit_prop(buffer, offs, "mem_banks_count", dev->node_props.mem_banks_count); sysfs_show_32bit_prop(buffer, offs, "caches_count", @@ -526,6 +510,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.capability |= HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0) && + (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)) + dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED; + sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute", dev->node_props.max_engine_clk_fcompute); @@ -535,11 +523,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->gpu->kfd->mec_fw_version); sysfs_show_32bit_prop(buffer, offs, "capability", dev->node_props.capability); + sysfs_show_32bit_prop(buffer, offs, "capability2", + dev->node_props.capability2); sysfs_show_64bit_prop(buffer, offs, "debug_prop", dev->node_props.debug_prop); sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version", dev->gpu->kfd->sdma_fw_version); sysfs_show_64bit_prop(buffer, offs, "unique_id", + dev->gpu->xcp && + (dev->gpu->xcp->xcp_mgr->mode != + AMDGPU_SPX_PARTITION_MODE) ? + dev->gpu->xcp->unique_id : dev->gpu->adev->unique_id); sysfs_show_32bit_prop(buffer, offs, "num_xcc", NUM_XCC(dev->gpu->xcc_mask)); @@ -959,44 +953,31 @@ static void kfd_update_system_properties(void) dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); if (dev) { - sys_props.platform_id = - (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_id = dev->oem_id64; sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); sys_props.platform_rev = dev->oem_revision; } up_read(&topology_lock); } -static void find_system_memory(const struct dmi_header *dm, - void *private) +static void find_system_memory(const struct dmi_header *dm, void *private) { + struct dmi_mem_device *memdev = container_of(dm, struct dmi_mem_device, header); struct kfd_mem_properties *mem; - u16 mem_width, mem_clock; struct kfd_topology_device *kdev = (struct kfd_topology_device *)private; - const u8 *dmi_data = (const u8 *)(dm + 1); - - if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { - mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); - mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); - list_for_each_entry(mem, &kdev->mem_props, list) { - if (mem_width != 0xFFFF && mem_width != 0) - mem->width = mem_width; - if (mem_clock != 0) - mem->mem_clk_max = mem_clock; - } - } -} -/* - * Performance counters information is not part of CRAT but we would like to - * put them in the sysfs under topology directory for Thunk to get the data. - * This function is called before updating the sysfs. - */ -static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) -{ - /* These are the only counters supported so far */ - return kfd_iommu_add_perf_counters(kdev); + if (memdev->header.type != DMI_ENTRY_MEM_DEVICE) + return; + if (memdev->header.length < sizeof(struct dmi_mem_device)) + return; + + list_for_each_entry(mem, &kdev->mem_props, list) { + if (memdev->total_width != 0xFFFF && memdev->total_width != 0) + mem->width = memdev->total_width; + if (memdev->speed != 0) + mem->mem_clk_max = memdev->speed; + } } /* kfd_add_non_crat_information - Add information that is not currently @@ -1013,25 +994,6 @@ static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ } -/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. - * Ignore CRAT for all other devices. AMD APU is identified if both CPU - * and GPU cores are present. - * @device_list - topology device list created by parsing ACPI CRAT table. - * @return - TRUE if invalid, FALSE is valid. - */ -static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) -{ - struct kfd_topology_device *dev; - - list_for_each_entry(dev, device_list, list) { - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count) - return false; - } - pr_info("Ignoring ACPI CRAT on non-APU system\n"); - return true; -} - int kfd_topology_init(void) { void *crat_image = NULL; @@ -1062,48 +1024,25 @@ int kfd_topology_init(void) */ proximity_domain = 0; - /* - * Get the CRAT image from the ACPI. If ACPI doesn't have one - * or if ACPI CRAT is invalid create a virtual CRAT. - * NOTE: The current implementation expects all AMD APUs to have - * CRAT. If no CRAT is available, it is assumed to be a CPU - */ - ret = kfd_create_crat_image_acpi(&crat_image, &image_size); - if (!ret) { - ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); - if (ret || - kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { - kfd_release_topology_device_list( - &temp_topology_device_list); - kfd_destroy_crat_image(crat_image); - crat_image = NULL; - } + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_CPU, NULL, + proximity_domain); + cpu_only_node = 1; + if (ret) { + pr_err("Error creating VCRAT table for CPU\n"); + return ret; } - if (!crat_image) { - ret = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_CPU, NULL, - proximity_domain); - cpu_only_node = 1; - if (ret) { - pr_err("Error creating VCRAT table for CPU\n"); - return ret; - } - - ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); - if (ret) { - pr_err("Error parsing VCRAT table for CPU\n"); - goto err; - } + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret) { + pr_err("Error parsing VCRAT table for CPU\n"); + goto err; } kdev = list_first_entry(&temp_topology_device_list, struct kfd_topology_device, list); - kfd_add_perf_to_topology(kdev); down_write(&topology_lock); kfd_topology_update_device_list(&temp_topology_device_list, @@ -1146,14 +1085,17 @@ void kfd_topology_shutdown(void) static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) { - uint32_t hashout; + uint32_t gpu_id; uint32_t buf[8]; uint64_t local_mem_size; - int i; + struct kfd_topology_device *dev; + bool is_unique; + uint8_t *crc_buf; if (!gpu) return 0; + crc_buf = (uint8_t *)&buf; local_mem_size = gpu->local_mem_info.local_mem_size_private + gpu->local_mem_info.local_mem_size_public; buf[0] = gpu->adev->pdev->devfn; @@ -1166,10 +1108,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) buf[6] = upper_32_bits(local_mem_size); buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16); - for (i = 0, hashout = 0; i < 8; i++) - hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + gpu_id = crc16(0, crc_buf, sizeof(buf)) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + + /* There is a very small possibility when generating a + * 16 (KFD_GPU_ID_HASH_WIDTH) bit value from 8 word buffer + * that the value could be 0 or non-unique. So, check if + * it is unique and non-zero. If not unique increment till + * unique one is found. In case of overflow, restart from 1 + */ - return hashout; + down_read(&topology_lock); + do { + is_unique = true; + if (!gpu_id) + gpu_id = 1; + list_for_each_entry(dev, &topology_device_list, list) { + if (dev->gpu && dev->gpu_id == gpu_id) { + is_unique = false; + break; + } + } + if (unlikely(!is_unique)) + gpu_id = (gpu_id + 1) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + } while (!is_unique); + up_read(&topology_lock); + + return gpu_id; } /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If * the GPU device is not already present in the topology device @@ -1189,8 +1155,7 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_node *gpu) /* Discrete GPUs need their own topology device list * entries. Don't assign them to CPU/APU nodes. */ - if (!gpu->kfd->use_iommu_v2 && - dev->node_props.cpu_cores_count) + if (dev->node_props.cpu_cores_count) continue; if (!dev->gpu && (dev->node_props.simd_count > 0)) { @@ -1293,6 +1258,61 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev, } } +#define REC_SDMA_NUM_GPU 8 +static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = { + { -1, 14, 12, 2, 4, 8, 10, 6 }, + { 14, -1, 2, 10, 8, 4, 6, 12 }, + { 10, 2, -1, 12, 14, 6, 4, 8 }, + { 2, 12, 10, -1, 6, 14, 8, 4 }, + { 4, 8, 14, 6, -1, 10, 12, 2 }, + { 8, 4, 6, 14, 12, -1, 2, 10 }, + { 10, 6, 4, 8, 12, 2, -1, 14 }, + { 6, 12, 8, 4, 2, 10, 14, -1 }}; + +static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev, + struct kfd_iolink_properties *outbound_link, + struct kfd_iolink_properties *inbound_link) +{ + struct kfd_node *gpu = outbound_link->gpu; + struct amdgpu_device *adev = gpu->adev; + unsigned int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes; + unsigned int num_xgmi_sdma_engines = kfd_get_num_xgmi_sdma_engines(gpu); + unsigned int num_sdma_engines = kfd_get_num_sdma_engines(gpu); + uint32_t sdma_eng_id_mask = (1 << num_sdma_engines) - 1; + uint32_t xgmi_sdma_eng_id_mask = + ((1 << num_xgmi_sdma_engines) - 1) << num_sdma_engines; + + bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu && + adev->aid_mask && num_xgmi_nodes && gpu->kfd->num_nodes == 1 && + num_xgmi_sdma_engines >= 6 && (!(adev->flags & AMD_IS_APU) && + num_xgmi_nodes == 8); + + if (support_rec_eng) { + int src_socket_id = adev->gmc.xgmi.physical_node_id; + int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id; + unsigned int reshift = num_xgmi_sdma_engines == 6 ? 1 : 0; + + outbound_link->rec_sdma_eng_id_mask = + 1 << (rec_sdma_eng_map[src_socket_id][dst_socket_id] >> reshift); + inbound_link->rec_sdma_eng_id_mask = + 1 << (rec_sdma_eng_map[dst_socket_id][src_socket_id] >> reshift); + + /* If recommended engine is out of range, need to reset the mask */ + if (outbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask) + outbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask; + if (inbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask) + inbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask; + + } else { + uint32_t engine_mask = (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI && + num_xgmi_sdma_engines && to_dev->gpu) ? xgmi_sdma_eng_id_mask : + sdma_eng_id_mask; + + outbound_link->rec_sdma_eng_id_mask = engine_mask; + inbound_link->rec_sdma_eng_id_mask = engine_mask; + } +} + static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) { struct kfd_iolink_properties *link, *inbound_link; @@ -1331,6 +1351,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED; kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link); kfd_set_iolink_non_coherent(peer_dev, link, inbound_link); + kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link); } } @@ -1397,10 +1418,11 @@ static int kfd_create_indirect_link_prop(struct kfd_topology_device *kdev, int g num_cpu++; } + if (list_empty(&kdev->io_link_props)) + return -ENODATA; + gpu_link = list_first_entry(&kdev->io_link_props, - struct kfd_iolink_properties, list); - if (!gpu_link) - return -ENOMEM; + struct kfd_iolink_properties, list); for (i = 0; i < num_cpu; i++) { /* CPU <--> GPU */ @@ -1478,15 +1500,17 @@ static int kfd_add_peer_prop(struct kfd_topology_device *kdev, peer->gpu->adev)) return ret; + if (list_empty(&kdev->io_link_props)) + return -ENODATA; + iolink1 = list_first_entry(&kdev->io_link_props, - struct kfd_iolink_properties, list); - if (!iolink1) - return -ENOMEM; + struct kfd_iolink_properties, list); + + if (list_empty(&peer->io_link_props)) + return -ENODATA; iolink2 = list_first_entry(&peer->io_link_props, - struct kfd_iolink_properties, list); - if (!iolink2) - return -ENOMEM; + struct kfd_iolink_properties, list); props = kfd_alloc_struct(props); if (!props) @@ -1504,17 +1528,19 @@ static int kfd_add_peer_prop(struct kfd_topology_device *kdev, /* CPU->CPU link*/ cpu_dev = kfd_topology_device_by_proximity_domain(iolink1->node_to); if (cpu_dev) { - list_for_each_entry(iolink3, &cpu_dev->io_link_props, list) - if (iolink3->node_to == iolink2->node_to) - break; - - props->weight += iolink3->weight; - props->min_latency += iolink3->min_latency; - props->max_latency += iolink3->max_latency; - props->min_bandwidth = min(props->min_bandwidth, - iolink3->min_bandwidth); - props->max_bandwidth = min(props->max_bandwidth, - iolink3->max_bandwidth); + list_for_each_entry(iolink3, &cpu_dev->io_link_props, list) { + if (iolink3->node_to != iolink2->node_to) + continue; + + props->weight += iolink3->weight; + props->min_latency += iolink3->min_latency; + props->max_latency += iolink3->max_latency; + props->min_bandwidth = min(props->min_bandwidth, + iolink3->min_bandwidth); + props->max_bandwidth = min(props->max_bandwidth, + iolink3->max_bandwidth); + break; + } } else { WARN(1, "CPU node not found"); } @@ -1565,7 +1591,8 @@ static int kfd_dev_create_p2p_links(void) break; if (!dev->gpu || !dev->gpu->adev || (dev->gpu->kfd->hive_id && - dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id)) + dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id && + amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev))) goto next; /* check if node(s) is/are peer accessible in one direction or bi-direction */ @@ -1588,7 +1615,6 @@ out: /* Helper function. See kfd_fill_gpu_cache_info for parameter description */ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext, struct kfd_gpu_cache_info *pcache_info, - struct kfd_cu_info *cu_info, int cu_bitmask, int cache_type, unsigned int cu_processor_id, int cu_block) @@ -1615,6 +1641,7 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext, pcache->processor_id_low = cu_processor_id + (first_active_cu - 1); pcache->cache_level = pcache_info[cache_type].cache_level; pcache->cache_size = pcache_info[cache_type].cache_size; + pcache->cacheline_size = pcache_info[cache_type].cache_line_size; if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE) pcache->cache_type |= HSA_CACHE_TYPE_DATA; @@ -1650,15 +1677,37 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext, /* Helper function. See kfd_fill_gpu_cache_info for parameter description */ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, struct kfd_gpu_cache_info *pcache_info, - struct kfd_cu_info *cu_info, - int cache_type, unsigned int cu_processor_id) + struct amdgpu_cu_info *cu_info, + struct amdgpu_gfx_config *gfx_info, + int cache_type, unsigned int cu_processor_id, + struct kfd_node *knode) { - unsigned int cu_sibling_map_mask; + unsigned int cu_sibling_map_mask = 0; int first_active_cu; - int i, j, k; + int i, j, k, xcc, start, end; + int num_xcc = NUM_XCC(knode->xcc_mask); struct kfd_cache_properties *pcache = NULL; + enum amdgpu_memory_partition mode; + struct amdgpu_device *adev = knode->adev; + bool found = false; + + start = ffs(knode->xcc_mask) - 1; + end = start + num_xcc; + + /* To find the bitmap in the first active cu in the first + * xcc, it is based on the assumption that evrey xcc must + * have at least one active cu. + */ + for (i = 0; i < gfx_info->max_shader_engines && !found; i++) { + for (j = 0; j < gfx_info->max_sh_per_se && !found; j++) { + if (cu_info->bitmap[start][i % 4][j % 4]) { + cu_sibling_map_mask = + cu_info->bitmap[start][i % 4][j % 4]; + found = true; + } + } + } - cu_sibling_map_mask = cu_info->cu_bitmap[0][0]; cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); first_active_cu = ffs(cu_sibling_map_mask); @@ -1676,7 +1725,19 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, pcache->processor_id_low = cu_processor_id + (first_active_cu - 1); pcache->cache_level = pcache_info[cache_type].cache_level; + pcache->cacheline_size = pcache_info[cache_type].cache_line_size; + + if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(knode) == IP_VERSION(9, 5, 0)) + mode = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); + else + mode = UNKNOWN_MEMORY_PARTITION_MODE; + pcache->cache_size = pcache_info[cache_type].cache_size; + /* Partition mode only affects L3 cache size */ + if (mode && pcache->cache_level == 3) + pcache->cache_size /= mode; if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE) pcache->cache_type |= HSA_CACHE_TYPE_DATA; @@ -1693,16 +1754,18 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1); k = 0; - for (i = 0; i < cu_info->num_shader_engines; i++) { - for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) { - pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF); - pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); - pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); - pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); - k += 4; - - cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4]; - cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); + for (xcc = start; xcc < end; xcc++) { + for (i = 0; i < gfx_info->max_shader_engines; i++) { + for (j = 0; j < gfx_info->max_sh_per_se; j++) { + pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF); + pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); + pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); + pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); + k += 4; + + cu_sibling_map_mask = cu_info->bitmap[xcc][i % 4][j + i / 4]; + cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); + } } } pcache->sibling_map_size = k; @@ -1720,24 +1783,23 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev) { struct kfd_gpu_cache_info *pcache_info = NULL; - int i, j, k; + int i, j, k, xcc, start, end; int ct = 0; unsigned int cu_processor_id; int ret; unsigned int num_cu_shared; - struct kfd_cu_info cu_info; - struct kfd_cu_info *pcu_info; + struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info; + struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config; int gpu_processor_id; - struct kfd_cache_properties *props_ext; + struct kfd_cache_properties *props_ext = NULL; int num_of_entries = 0; int num_of_cache_types = 0; struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES]; - amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info); - pcu_info = &cu_info; gpu_processor_id = dev->node_props.simd_id_base; + memset(cache_info, 0, sizeof(cache_info)); pcache_info = cache_info; num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info); if (!num_of_cache_types) { @@ -1754,37 +1816,42 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct * then it will consider only one CU from * the shared unit */ + start = ffs(kdev->xcc_mask) - 1; + end = start + NUM_XCC(kdev->xcc_mask); + for (ct = 0; ct < num_of_cache_types; ct++) { cu_processor_id = gpu_processor_id; if (pcache_info[ct].cache_level == 1) { - for (i = 0; i < pcu_info->num_shader_engines; i++) { - for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) { - for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) { + for (xcc = start; xcc < end; xcc++) { + for (i = 0; i < gfx_info->max_shader_engines; i++) { + for (j = 0; j < gfx_info->max_sh_per_se; j++) { + for (k = 0; k < gfx_info->max_cu_per_sh; k += pcache_info[ct].num_cu_shared) { - ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info, - pcu_info->cu_bitmap[i % 4][j + i / 4], ct, + ret = fill_in_l1_pcache(&props_ext, pcache_info, + cu_info->bitmap[xcc][i % 4][j + i / 4], ct, cu_processor_id, k); - if (ret < 0) - break; + if (ret < 0) + break; - if (!ret) { - num_of_entries++; - list_add_tail(&props_ext->list, &dev->cache_props); - } + if (!ret) { + num_of_entries++; + list_add_tail(&props_ext->list, &dev->cache_props); + } - /* Move to next CU block */ - num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <= - pcu_info->num_cu_per_sh) ? - pcache_info[ct].num_cu_shared : - (pcu_info->num_cu_per_sh - k); - cu_processor_id += num_cu_shared; + /* Move to next CU block */ + num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <= + gfx_info->max_cu_per_sh) ? + pcache_info[ct].num_cu_shared : + (gfx_info->max_cu_per_sh - k); + cu_processor_id += num_cu_shared; + } } } } } else { ret = fill_in_l2_l3_pcache(&props_ext, pcache_info, - pcu_info, ct, cu_processor_id); + cu_info, gfx_info, ct, cu_processor_id, kdev); if (ret < 0) break; @@ -1799,7 +1866,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct pr_debug("Added [%d] GPU cache entries\n", num_of_entries); } -static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, +static int kfd_topology_add_device_locked(struct kfd_node *gpu, struct kfd_topology_device **dev) { int proximity_domain = ++topology_crat_proximity_domain; @@ -1812,8 +1879,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, COMPUTE_UNIT_GPU, gpu, proximity_domain); if (res) { - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + dev_err(gpu->adev->dev, "Error creating VCRAT\n"); topology_crat_proximity_domain--; goto err; } @@ -1824,8 +1890,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, &temp_topology_device_list, proximity_domain); if (res) { - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + dev_err(gpu->adev->dev, "Error parsing VCRAT\n"); topology_crat_proximity_domain--; goto err; } @@ -1851,8 +1916,8 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, if (!res) sys_props.generation_count++; else - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", - gpu_id, res); + dev_err(gpu->adev->dev, "Failed to update GPU to sysfs topology. res=%d\n", + res); err: kfd_destroy_crat_image(crat_image); @@ -1931,25 +1996,34 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED | HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED; - if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) { - dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 | - HSA_DBG_WATCH_ADDR_MASK_HI_BIT; + if (kfd_dbg_has_ttmps_always_setup(dev->gpu)) + dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; - if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2)) + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) { + if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 4)) dev->node_props.debug_prop |= - HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; + HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9_4_3 | + HSA_DBG_WATCH_ADDR_MASK_HI_BIT_GFX9_4_3; else + dev->node_props.debug_prop |= + HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 | + HSA_DBG_WATCH_ADDR_MASK_HI_BIT; + + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 4, 2)) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + + if (!amdgpu_sriov_vf(dev->gpu->adev)) + dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED; + } else { dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 | HSA_DBG_WATCH_ADDR_MASK_HI_BIT; - if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(11, 0, 0)) - dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; - else + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0)) dev->node_props.capability |= - HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED; } kfd_topology_set_dbg_firmware_support(dev); @@ -1959,13 +2033,19 @@ int kfd_topology_add_device(struct kfd_node *gpu) { uint32_t gpu_id; struct kfd_topology_device *dev; - struct kfd_cu_info cu_info; int res = 0; int i; const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type]; + struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config; + struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info; - gpu_id = kfd_generate_gpu_id(gpu); - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + if (gpu->xcp && !gpu->xcp->ddev) { + dev_warn(gpu->adev->dev, + "Won't add GPU to topology since it has no drm node assigned."); + return 0; + } else { + dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n"); + } /* Check to see if this gpu device exists in the topology_device_list. * If so, assign the gpu to that device, @@ -1976,11 +2056,12 @@ int kfd_topology_add_device(struct kfd_node *gpu) down_write(&topology_lock); dev = kfd_assign_gpu(gpu); if (!dev) - res = kfd_topology_add_device_locked(gpu, gpu_id, &dev); + res = kfd_topology_add_device_locked(gpu, &dev); up_write(&topology_lock); if (res) return res; + gpu_id = kfd_generate_gpu_id(gpu); dev->gpu_id = gpu_id; gpu->id = gpu_id; @@ -1993,9 +2074,6 @@ int kfd_topology_add_device(struct kfd_node *gpu) /* Fill-in additional information that is not available in CRAT but * needed for the topology */ - - amdgpu_amdkfd_get_cu_info(dev->gpu->adev, &cu_info); - for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1; i++) { dev->node_props.name[i] = __tolower(asic_name[i]); if (asic_name[i] == '\0') @@ -2004,7 +2082,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) dev->node_props.name[i] = '\0'; dev->node_props.simd_arrays_per_engine = - cu_info.num_shader_arrays_per_engine; + gfx_info->max_sh_per_se; dev->node_props.gfx_target_version = gpu->kfd->device_info.gfx_target_version; @@ -2015,7 +2093,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) HSA_CAP_ASIC_REVISION_MASK); dev->node_props.location_id = pci_dev_id(gpu->adev->pdev); - if (KFD_GC_VERSION(dev->gpu->kfd) == IP_VERSION(9, 4, 3)) + if (gpu->kfd->num_nodes > 1) dev->node_props.location_id |= dev->gpu->node_id; dev->node_props.domain = pci_domain_nr(gpu->adev->pdev->bus); @@ -2076,10 +2154,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) * Overwrite ATS capability according to needs_iommu_device to fix * potential missing corresponding bit in CRAT of BIOS. */ - if (dev->gpu->kfd->use_iommu_v2) - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; - else - dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; /* Fix errors in CZ CRAT. * simd_count: Carrizo CRAT reports wrong simd_count, probably @@ -2088,7 +2163,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) */ if (dev->gpu->adev->asic_type == CHIP_CARRIZO) { dev->node_props.simd_count = - cu_info.simd_per_cu * cu_info.cu_active_number; + cu_info->simd_per_cu * cu_info->number; dev->node_props.max_waves_per_simd = 10; } @@ -2111,6 +2186,8 @@ int kfd_topology_add_device(struct kfd_node *gpu) dev->gpu->adev->gmc.xgmi.connected_to_cpu) dev->node_props.capability |= HSA_CAP_FLAGS_COHERENTHOSTACCESS; + kfd_queue_ctx_save_restore_size(dev); + kfd_debug_print_topology(); kfd_notify_gpu_change(gpu_id, 1); @@ -2255,7 +2332,7 @@ static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) if (first_cpu_of_numa_node >= nr_cpu_ids) return -1; #ifdef CONFIG_X86_64 - return cpu_data(first_cpu_of_numa_node).apicid; + return cpu_data(first_cpu_of_numa_node).topo.apicid; #else return first_cpu_of_numa_node; #endif @@ -2274,29 +2351,6 @@ int kfd_numa_node_to_apic_id(int numa_node_id) return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); } -void kfd_double_confirm_iommu_support(struct kfd_dev *gpu) -{ - struct kfd_topology_device *dev; - - gpu->use_iommu_v2 = false; - - if (!gpu->device_info.needs_iommu_device) - return; - - down_read(&topology_lock); - - /* Only use IOMMUv2 if there is an APU topology node with no GPU - * assigned yet. This GPU will be assigned to it. - */ - list_for_each_entry(dev, &topology_device_list, list) - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count && - !dev->gpu) - gpu->use_iommu_v2 = true; - - up_read(&topology_lock); -} - #if defined(CONFIG_DEBUG_FS) int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) |
